[Groonga-commit] ranguba/chupa-text at 9deefbe [master] Use one data per page

Back to archive index
Kouhei Sutou null+****@clear*****
Tue Feb 26 06:36:00 JST 2019


Kouhei Sutou	2019-02-26 06:36:00 +0900 (Tue, 26 Feb 2019)

  Revision: 9deefbe9df42b18d7406338168d1efb3add9b263
  https://github.com/ranguba/chupa-text/commit/9deefbe9df42b18d7406338168d1efb3add9b263

  Message:
    Use one data per page

  Added files:
    lib/chupa-text/decomposers/open-document-presentation.rb
  Modified files:
    lib/chupa-text/decomposers/open-document.rb
    test/decomposers/test-open-document-presentation.rb

  Added: lib/chupa-text/decomposers/open-document-presentation.rb (+135 -0) 100644
===================================================================
--- /dev/null
+++ lib/chupa-text/decomposers/open-document-presentation.rb    2019-02-26 06:36:00 +0900 (0a7cf94)
@@ -0,0 +1,135 @@
+# Copyright (C) 2019  Kouhei Sutou <kou****@clear*****>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+require "chupa-text/decomposers/open-document"
+
+module ChupaText
+  module Decomposers
+    class OpenDocumentPresentation < OpenDocument
+      registry.register("open-document-presentation", self)
+
+      def initialize(options={})
+        super
+        @extension = "odp"
+        @mime_type = "application/vnd.oasis.opendocument.presentation"
+      end
+
+      def target?(data)
+        data.extension == @extension or
+          data.mime_type == @mime_type
+      end
+
+      def target_score(data)
+        if target?(data)
+          -1
+        else
+          nil
+        end
+      end
+
+      def decompose(data)
+        slides = []
+        data.open do |input|
+          Archive::Zip.open(input) do |zip|
+            zip.each do |entry|
+              next unless entry.file?
+              case entry.zip_path
+              when "content.xml"
+                listener = SlidesListener.new(slides)
+                parse(entry.file_data, listener)
+              when "meta.xml"
+                attributes = {}
+                listener = AttributesListener.new(attributes)
+                parse(entry.file_data, listener)
+                metadata = TextData.new("", source_data: data)
+                attributes.each do |name, value|
+                  metadata[name] = value
+                end
+                yield(metadata)
+              end
+            end
+          end
+        end
+        slides.each_with_index do |slide, i|
+          text = slide[:text]
+          text_data = TextData.new(text, source_data: data)
+          text_data["nth_slide"] = i
+          yield(text_data)
+        end
+      end
+
+      private
+      def parse(io, listener)
+        source = REXML::Source.new(io.read)
+        parser = REXML::Parsers::SAX2Parser.new(source)
+        parser.listen(listener)
+        parser.parse
+      end
+
+      class SlidesListener
+        include REXML::SAX2Listener
+
+        TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
+        DRAW_URI = "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0"
+
+        def initialize(slides)
+          @slides = slides
+          @in_p = false
+        end
+
+        def start_element(uri, local_name, qname, attributes)
+          case uri
+          when TEXT_URI
+            case local_name
+            when "p"
+              @in_p = true
+            end
+          when DRAW_URI
+            case local_name
+            when "page"
+              @slides << {text: ""}
+            end
+          end
+        end
+
+        def end_element(uri, local_name, qname)
+          @in_p = false
+          case uri
+          when TEXT_URI
+            case local_name
+            when "p"
+              @slides.last[:text] << "\n"
+            end
+          end
+        end
+
+        def characters(text)
+          add_text(text)
+        end
+
+        def cdata(content)
+          add_text(content)
+        end
+
+        private
+        def add_text(text)
+          return unless @in_p
+          @slides.last[:text] << CGI.unescapeHTML(text)
+        end
+      end
+    end
+  end
+end

  Modified: lib/chupa-text/decomposers/open-document.rb (+2 -2)
===================================================================
--- lib/chupa-text/decomposers/open-document.rb    2019-02-25 17:54:38 +0900 (400024c)
+++ lib/chupa-text/decomposers/open-document.rb    2019-02-26 06:36:00 +0900 (294a2b8)
@@ -28,12 +28,10 @@ module ChupaText
       EXTENSIONS = [
         "odt",
         "ods",
-        "odp",
       ]
       MIME_TYPES = [
         "application/vnd.oasis.opendocument.text",
         "application/vnd.oasis.opendocument.spreadsheet",
-        "application/vnd.oasis.opendocument.presentation",
       ]
       def target?(data)
         EXTENSIONS.include?(data.extension) or
@@ -102,6 +100,8 @@ module ChupaText
         end
 
         def end_element(uri, local_name, qname)
+          @in_p = false
+
           return unless uri == TEXT_URI
           case local_name
           when "p"

  Modified: test/decomposers/test-open-document-presentation.rb (+27 -41)
===================================================================
--- test/decomposers/test-open-document-presentation.rb    2019-02-25 17:54:38 +0900 (3ae7819)
+++ test/decomposers/test-open-document-presentation.rb    2019-02-26 06:36:00 +0900 (70e6b9e)
@@ -18,7 +18,7 @@ class TestDecomposersOpenDocumentPresentation < Test::Unit::TestCase
   include Helper
 
   def setup
-    @decomposer = ChupaText::Decomposers::OpenDocument.new({})
+    @decomposer = ChupaText::Decomposers::OpenDocumentPresentation.new({})
   end
 
   def decompose(path)
@@ -48,46 +48,34 @@ class TestDecomposersOpenDocumentPresentation < Test::Unit::TestCase
   sub_test_case("#decompose") do
     sub_test_case("attributes") do
       def decompose(attribute_name)
-        super(fixture_path("odp", "attributes.odp")).collect do |data|
-          data[attribute_name]
-        end
+        super(fixture_path("odp", "attributes.odp")).first[attribute_name]
       end
 
       def test_title
-        assert_equal(["Title"], decompose("title"))
-      end
-
-      def test_author
-        assert_equal([nil], decompose("author"))
+        assert_equal("Title", decompose("title"))
       end
 
       def test_subject
-        assert_equal(["Subject"], decompose("subject"))
+        assert_equal("Subject", decompose("subject"))
       end
 
       def test_keywords
-        assert_equal([["Keyword1", "Keyword2"]], decompose("keywords"))
+        assert_equal(["Keyword1", "Keyword2"], decompose("keywords"))
       end
 
       def test_created_time
-        assert_equal([Time],
-                     decompose("created_time").collect(&:class))
+        assert_equal(Time,
+                     decompose("created_time").class)
       end
 
       def test_modified_time
-        assert_equal([Time],
-                     decompose("modified_time").collect(&:class))
+        assert_equal(Time,
+                     decompose("modified_time").class)
       end
 
       def test_generator
-        assert_equal(["LibreOffice"],
-                     normalize_generators(decompose("generator")))
-      end
-
-      def normalize_generators(generators)
-        generators.collect do |generator|
-          normalize_generator(generator)
-        end
+        assert_equal("LibreOffice",
+                     normalize_generator(decompose("generator")))
       end
 
       def normalize_generator(generator)
@@ -97,10 +85,6 @@ class TestDecomposersOpenDocumentPresentation < Test::Unit::TestCase
           generator
         end
       end
-
-      def test_creation_date
-        assert_equal([nil], decompose("creation_date"))
-      end
     end
 
     sub_test_case("one slide") do
@@ -109,10 +93,12 @@ class TestDecomposersOpenDocumentPresentation < Test::Unit::TestCase
       end
 
       def test_body
-        assert_equal([<<-BODY], decompose.collect(&:body))
-Slide1 title
-Slide1 content
-        BODY
+        assert_equal([
+                       "",
+                       "Slide1 title\n" +
+                       "Slide1 content\n",
+                     ],
+                     decompose.collect(&:body))
       end
     end
 
@@ -122,16 +108,16 @@ Slide1 content
       end
 
       def test_body
-        assert_equal([<<-BODY], decompose.collect(&:body))
-Slide1 title
-Slide1 content
-
-Slide2 title
-Slide2 content
-
-Slide3 title
-Slide3 content
-        BODY
+        assert_equal([
+                       "",
+                       "Slide1 title\n" +
+                       "Slide1 content\n",
+                       "Slide2 title\n" +
+                       "Slide2 content\n",
+                       "Slide3 title\n" +
+                       "Slide3 content\n",
+                     ],
+                     decompose.collect(&:body))
       end
     end
   end
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190226/078e049e/attachment-0001.html>


More information about the Groonga-commit mailing list
Back to archive index