[Groonga-commit] ranguba/chupa-text at a0d133a [master] Add support for PowerPoint

Back to archive index
Kouhei Sutou null+****@clear*****
Mon Feb 25 16:29:17 JST 2019


Kouhei Sutou	2019-02-25 16:29:17 +0900 (Mon, 25 Feb 2019)

  Revision: a0d133abcc55a982eef82256c84b03ec4a2ba7bd
  https://github.com/ranguba/chupa-text/commit/a0d133abcc55a982eef82256c84b03ec4a2ba7bd

  Message:
    Add support for PowerPoint

  Added files:
    lib/chupa-text/decomposers/office-open-xml-presentation.rb
    test/decomposers/test-office-open-xml-presentation.rb
    test/fixture/pptx/attributes.pptx
    test/fixture/pptx/multi-slides.pptx
    test/fixture/pptx/one-slide.pptx
  Modified files:
    lib/chupa-text/decomposers/office-open-xml-document.rb
    lib/chupa-text/decomposers/office-open-xml.rb

  Modified: lib/chupa-text/decomposers/office-open-xml-document.rb (+14 -3)
===================================================================
--- lib/chupa-text/decomposers/office-open-xml-document.rb    2019-02-25 16:08:02 +0900 (31266ba)
+++ lib/chupa-text/decomposers/office-open-xml-document.rb    2019-02-25 16:29:17 +0900 (c0e3fef)
@@ -23,10 +23,21 @@ module ChupaText
 
       def initialize(options={})
         super
-        @extension = "docx"
-        @mime_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+        @extensions = [
+          "docx",
+          "docm",
+          "dotx",
+          "dotm",
+        ]
+        @mime_types = [
+          "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+          "application/vnd.ms-word.document.macroEnabled.12",
+          "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
+          "application/vnd.ms-word.template.macroEnabled.12",
+        ]
         @path = "word/document.xml"
-        @namespace_uri = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
+        @namespace_uri =
+          "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
       end
     end
   end

  Added: lib/chupa-text/decomposers/office-open-xml-presentation.rb (+64 -0) 100644
===================================================================
--- /dev/null
+++ lib/chupa-text/decomposers/office-open-xml-presentation.rb    2019-02-25 16:29:17 +0900 (63c17fc)
@@ -0,0 +1,64 @@
+# Copyright (C) 2019  Kouhei Sutou <kou****@clear*****>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+require "chupa-text/decomposers/office-open-xml"
+
+module ChupaText
+  module Decomposers
+    class OfficeOpenXMLPresentation < OfficeOpenXML
+      registry.register("office-open-xml-presentation", self)
+
+      def initialize(options={})
+        super
+        @extensions = [
+          "pptx",
+          "pptm",
+          "ppsx",
+          "ppsm",
+          "potx",
+          "potm",
+          "sldx",
+          "sldm",
+        ]
+        @mime_types = [
+          "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+          "application/vnd.ms-powerpoint.presentation.macroEnabled.12",
+          "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
+          "application/vnd.ms-powerpoint.slideshow.macroEnabled.12",
+          "application/vnd.openxmlformats-officedocument.presentationml.template",
+          "application/vnd.ms-powerpoint.template.macroEnabled.12",
+          "application/vnd.openxmlformats-officedocument.presentationml.slide",
+          "application/vnd.ms-powerpoint.slide.macroEnabled.12",
+        ]
+        @path = /\Appt\/slides\/slide\d+\.xml/
+        @namespace_uri =
+          "http://schemas.openxmlformats.org/drawingml/2006/main"
+      end
+
+      private
+      def extract_text(entry, texts)
+        text = ""
+        super(entry, text)
+        nth_slide = Integer(entry.zip_path.scan(/(\d+)\.xml\z/)[0][0], 10)
+        texts << [nth_slide, text]
+      end
+
+      def accumulate_texts(texts)
+        texts.sort_by(&:first).collect(&:last).join("\n")
+      end
+    end
+  end
+end

  Modified: lib/chupa-text/decomposers/office-open-xml.rb (+14 -5)
===================================================================
--- lib/chupa-text/decomposers/office-open-xml.rb    2019-02-25 16:08:02 +0900 (4438557)
+++ lib/chupa-text/decomposers/office-open-xml.rb    2019-02-25 16:29:17 +0900 (4f111a1)
@@ -24,7 +24,8 @@ module ChupaText
   module Decomposers
     class OfficeOpenXML < Decomposer
       def target?(data)
-        data.extension == @extension or data.mime_type == @mime_type
+        @extensions.include?(data.extension) or
+          @mime_types.include?(data.mime_type)
       end
 
       def target_score(data)
@@ -36,7 +37,7 @@ module ChupaText
       end
 
       def decompose(data)
-        text = nil
+        texts = []
         attributes = {}
         data.open do |input|
           Archive::Zip.open(input) do |zip|
@@ -44,9 +45,7 @@ module ChupaText
               next unless entry.file?
               case entry.zip_path
               when @path
-                text = ""
-                listener = TextListener.new(text, @namespace_uri)
-                parse(entry.file_data, listener)
+                extract_text(entry, texts)
               when "docProps/app.xml"
                 listener = AttributesListener.new(attributes)
                 parse(entry.file_data, listener)
@@ -57,6 +56,7 @@ module ChupaText
             end
           end
         end
+        text = accumulate_texts(texts)
         text_data = TextData.new(text, source_data: data)
         attributes.each do |name, value|
           text_data[name] = value
@@ -72,6 +72,15 @@ module ChupaText
         parser.parse
       end
 
+      def extract_text(entry, texts)
+        listener = TextListener.new(texts, @namespace_uri)
+        parse(entry.file_data, listener)
+      end
+
+      def accumulate_texts(texts)
+        texts.join("")
+      end
+
       class TextListener
         include REXML::SAX2Listener
 

  Added: test/decomposers/test-office-open-xml-presentation.rb (+133 -0) 100644
===================================================================
--- /dev/null
+++ test/decomposers/test-office-open-xml-presentation.rb    2019-02-25 16:29:17 +0900 (6cff112)
@@ -0,0 +1,133 @@
+# Copyright (C) 2019  Kouhei Sutou <kou****@clear*****>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+class TestDecomposersOfficeOpenXMLPresentation < Test::Unit::TestCase
+  include Helper
+
+  def setup
+    @decomposer = ChupaText::Decomposers::OfficeOpenXMLPresentation.new({})
+  end
+
+  def decompose(path)
+    data = ChupaText::InputData.new(path)
+    decomposed = []
+    @decomposer.decompose(data) do |decomposed_data|
+      decomposed << decomposed_data
+    end
+    decomposed
+  end
+
+  sub_test_case("#target_score") do
+    def test_extension
+      data = ChupaText::Data.new
+      data.body = ""
+      data.uri = "presentation.pptx"
+      assert_equal(-1, @decomposer.target_score(data))
+    end
+
+    def test_mime_type
+      data = ChupaText::Data.new
+      data.mime_type = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+      assert_equal(-1, @decomposer.target_score(data))
+    end
+  end
+
+  sub_test_case("#decompose") do
+    sub_test_case("attributes") do
+      def decompose(attribute_name)
+        super(fixture_path("pptx", "attributes.pptx")).collect do |data|
+          data[attribute_name]
+        end
+      end
+
+      def test_title
+        assert_equal(["Title"], decompose("title"))
+      end
+
+      def test_author
+        assert_equal([nil], decompose("author"))
+      end
+
+      def test_subject
+        assert_equal(["Subject"], decompose("subject"))
+      end
+
+      def test_keywords
+        assert_equal(["Keyword1 Keyword2"], decompose("keywords"))
+      end
+
+      def test_modified_time
+        assert_equal([Time],
+                     decompose("modified_time").collect(&:class))
+      end
+
+      def test_application
+        assert_equal(["LibreOffice"],
+                     normalize_applications(decompose("application")))
+      end
+
+      def normalize_applications(applications)
+        applications.collect do |application|
+          normalize_application(application)
+        end
+      end
+
+      def normalize_application(application)
+        if application.start_with?("LibreOffice")
+          "LibreOffice"
+        else
+          application
+        end
+      end
+
+      def test_creation_date
+        assert_equal([nil], decompose("creation_date"))
+      end
+    end
+
+    sub_test_case("one slide") do
+      def decompose
+        super(fixture_path("pptx", "one-slide.pptx"))
+      end
+
+      def test_body
+        assert_equal([<<-BODY], decompose.collect(&:body))
+Slide1 title
+Slide1 content
+        BODY
+      end
+    end
+
+    sub_test_case("multi slides") do
+      def decompose
+        super(fixture_path("pptx", "multi-slides.pptx"))
+      end
+
+      def test_body
+        assert_equal([<<-BODY], decompose.collect(&:body))
+Slide1 title
+Slide1 content
+
+Slide2 title
+Slide2 content
+
+Slide3 title
+Slide3 content
+        BODY
+      end
+    end
+  end
+end

  Added: test/fixture/pptx/attributes.pptx (+23 -0) 100644
===================================================================
--- /dev/null
+++ test/fixture/pptx/attributes.pptx    2019-02-25 16:29:17 +0900 (db82ab6)
@@ -0,0 +1,23 @@
+MIME-Version: 1.0
+mime-type: application/vnd.openxmlformats-officedocument.presentationml.presentation
+uri: file:/tmp/mWit2k_attributes.pptx
+path: /tmp/mWit2k_attributes.pptx
+size: 20658
+Content-Type: multipart/mixed; boundary=086ab5ce9ffe83fe209b6535c62460ca64f2e0de
+
+--086ab5ce9ffe83fe209b6535c62460ca64f2e0de
+mime-type: text/plain
+uri: file:/tmp/mWit2k_attributes.txt
+path: /tmp/mWit2k_attributes.txt
+size: 6
+title: Title
+created_time: 2019-02-25 07:29:39 UTC
+source-mime-types: ["application/pdf", "application/vnd.openxmlformats-officedocument.presentationml.presentation"]
+subject: Subject
+keywords: Keyword1, Keyword2
+creator: Impress
+producer: LibreOffice 5.2
+
+Title
+
+--086ab5ce9ffe83fe209b6535c62460ca64f2e0de--

  Added: test/fixture/pptx/multi-slides.pptx (+25 -0) 100644
===================================================================
--- /dev/null
+++ test/fixture/pptx/multi-slides.pptx    2019-02-25 16:29:17 +0900 (ad7fc0b)
@@ -0,0 +1,25 @@
+MIME-Version: 1.0
+mime-type: application/vnd.openxmlformats-officedocument.presentationml.presentation
+uri: file:/tmp/x5sjhk_multi-slides.pptx
+path: /tmp/x5sjhk_multi-slides.pptx
+size: 64545
+Content-Type: multipart/mixed; boundary=e91334c544f9aa5cb9be01fa2efee72507e9c749
+
+--e91334c544f9aa5cb9be01fa2efee72507e9c749
+mime-type: text/plain
+uri: file:/tmp/x5sjhk_multi-slides.txt
+path: /tmp/x5sjhk_multi-slides.txt
+size: 84
+created_time: 2019-02-25 07:29:41 UTC
+source-mime-types: ["application/pdf", "application/vnd.openxmlformats-officedocument.presentationml.presentation"]
+creator: Impress
+producer: LibreOffice 5.2
+
+Slide1 title
+Slide1 content
+Slide2 title
+Slide2 content
+Slide3 title
+Slide3 content
+
+--e91334c544f9aa5cb9be01fa2efee72507e9c749--

  Added: test/fixture/pptx/one-slide.pptx (+21 -0) 100644
===================================================================
--- /dev/null
+++ test/fixture/pptx/one-slide.pptx    2019-02-25 16:29:17 +0900 (8ad2c0a)
@@ -0,0 +1,21 @@
+MIME-Version: 1.0
+mime-type: application/vnd.openxmlformats-officedocument.presentationml.presentation
+uri: file:/tmp/BoZPzY_one-slide.pptx
+path: /tmp/BoZPzY_one-slide.pptx
+size: 32488
+Content-Type: multipart/mixed; boundary=cb1251febb85ffd23fef39b59757cac95cfa933f
+
+--cb1251febb85ffd23fef39b59757cac95cfa933f
+mime-type: text/plain
+uri: file:/tmp/BoZPzY_one-slide.txt
+path: /tmp/BoZPzY_one-slide.txt
+size: 28
+created_time: 2019-02-25 07:29:43 UTC
+source-mime-types: ["application/pdf", "application/vnd.openxmlformats-officedocument.presentationml.presentation"]
+creator: Impress
+producer: LibreOffice 5.2
+
+Slide1 title
+Slide1 content
+
+--cb1251febb85ffd23fef39b59757cac95cfa933f--
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190225/f7ab3d16/attachment-0001.html>


More information about the Groonga-commit mailing list
Back to archive index