[Groonga-commit] ranguba/chupa-text at 3ac5d14 [master] Add support Excel

Back to archive index
Kouhei Sutou null+****@clear*****
Mon Feb 25 17:02:55 JST 2019


Kouhei Sutou	2019-02-25 17:02:55 +0900 (Mon, 25 Feb 2019)

  Revision: 3ac5d1455ac8d14d3c153d6b6242e7703a48746c
  https://github.com/ranguba/chupa-text/commit/3ac5d1455ac8d14d3c153d6b6242e7703a48746c

  Message:
    Add support Excel

  Added files:
    lib/chupa-text/decomposers/office-open-xml-workbook.rb
    test/decomposers/test-office-open-xml-workbook.rb
    test/fixture/xlsx/attributes.xlsx
    test/fixture/xlsx/multi-sheets.xlsx
    test/fixture/xlsx/one-sheet.xlsx
  Modified files:
    lib/chupa-text/decomposers/office-open-xml-document.rb
    lib/chupa-text/decomposers/office-open-xml-presentation.rb
    lib/chupa-text/decomposers/office-open-xml.rb

  Modified: lib/chupa-text/decomposers/office-open-xml-document.rb (+8 -1)
===================================================================
--- lib/chupa-text/decomposers/office-open-xml-document.rb    2019-02-25 16:29:17 +0900 (c0e3fef)
+++ lib/chupa-text/decomposers/office-open-xml-document.rb    2019-02-25 17:02:55 +0900 (48118e5)
@@ -35,10 +35,17 @@ module ChupaText
           "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
           "application/vnd.ms-word.template.macroEnabled.12",
         ]
-        @path = "word/document.xml"
         @namespace_uri =
           "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
       end
+
+      private
+      def process_entry(entry, context)
+        case entry.zip_path
+        when "word/document.xml"
+          extract_text(entry, context[:text])
+        end
+      end
     end
   end
 end

  Modified: lib/chupa-text/decomposers/office-open-xml-presentation.rb (+11 -8)
===================================================================
--- lib/chupa-text/decomposers/office-open-xml-presentation.rb    2019-02-25 16:29:17 +0900 (63c17fc)
+++ lib/chupa-text/decomposers/office-open-xml-presentation.rb    2019-02-25 17:02:55 +0900 (1ce5759)
@@ -43,21 +43,24 @@ module ChupaText
           "application/vnd.openxmlformats-officedocument.presentationml.slide",
           "application/vnd.ms-powerpoint.slide.macroEnabled.12",
         ]
-        @path = /\Appt\/slides\/slide\d+\.xml/
         @namespace_uri =
           "http://schemas.openxmlformats.org/drawingml/2006/main"
       end
 
       private
-      def extract_text(entry, texts)
-        text = ""
-        super(entry, text)
-        nth_slide = Integer(entry.zip_path.scan(/(\d+)\.xml\z/)[0][0], 10)
-        texts << [nth_slide, text]
+      def process_entry(entry, context)
+        case entry.zip_path
+        when /\Appt\/slides\/slide(\d+)\.xml/
+          nth_slide = Integer($1, 10)
+          slide_text = ""
+          extract_text(entry, slide_text)
+          context[:slides] ||= []
+          context[:slides] << [nth_slide, slide_text]
+        end
       end
 
-      def accumulate_texts(texts)
-        texts.sort_by(&:first).collect(&:last).join("\n")
+      def accumulate_text(context)
+        context[:slides].sort_by(&:first).collect(&:last).join("\n")
       end
     end
   end

  Added: lib/chupa-text/decomposers/office-open-xml-workbook.rb (+114 -0) 100644
===================================================================
--- /dev/null
+++ lib/chupa-text/decomposers/office-open-xml-workbook.rb    2019-02-25 17:02:55 +0900 (a75d351)
@@ -0,0 +1,114 @@
+# Copyright (C) 2019  Kouhei Sutou <kou****@clear*****>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+require "chupa-text/decomposers/office-open-xml"
+
+module ChupaText
+  module Decomposers
+    class OfficeOpenXMLWorkbook < OfficeOpenXML
+      registry.register("office-open-xml-workbook", self)
+
+      def initialize(options={})
+        super
+        @extensions = [
+          "xlsx",
+          "xlsm",
+          "xltx",
+          "xltm",
+        ]
+        @mime_types = [
+          "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+          "application/vnd.ms-excel.sheet.macroEnabled.12",
+          "application/vnd.openxmlformats-officedocument.spreadsheetml.template",
+          "application/vnd.ms-excel.template.macroEnabled.12",
+        ]
+        @namespace_uri =
+          "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
+      end
+
+      private
+      def process_entry(entry, context)
+        case entry.zip_path
+        when "xl/sharedStrings.xml"
+          context[:shared_strings] = []
+          extract_text(entry, context[:shared_strings])
+        when /\Axl\/worksheets\/sheet(\d+)\.xml\z/
+          nth_sheet = Integer($1, 10)
+          sheet = []
+          listener = SheetListener.new(sheet)
+          parse(entry.file_data, listener)
+          context[:sheets] ||= []
+          context[:sheets] << [nth_sheet, sheet]
+        end
+      end
+
+      def accumulate_text(context)
+        shared_strings = context[:shared_strings]
+        sheets = context[:sheets].sort_by(&:first).collect(&:last)
+        sheet_texts = sheets.collect do |sheet|
+          sheet_text = ""
+          sheet.each do |row|
+            row_texts = row.collect do |index|
+              shared_strings[index]
+            end
+            sheet_text << row_texts.join("\t") << "\n"
+          end
+          sheet_text
+        end
+        sheet_texts.join("\n")
+      end
+
+      class SheetListener
+        include REXML::SAX2Listener
+
+        URI = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
+
+        def initialize(sheet)
+          @sheet = sheet
+          @in_v = false
+        end
+
+        def start_element(uri, local_name, qname, attributes)
+          return unless uri == URI
+          case local_name
+          when "row"
+            @sheet << []
+          when "v"
+            @in_v = true
+          end
+        end
+
+        def end_element(uri, local_name, qname)
+          @in_v = false
+        end
+
+        def characters(text)
+          add_column(text)
+        end
+
+        def cdata(content)
+          add_column(content)
+        end
+
+        private
+        def add_column(text)
+          return unless @in_v
+          @sheet.last << Integer(text, 10)
+        end
+      end
+    end
+  end
+end

  Modified: lib/chupa-text/decomposers/office-open-xml.rb (+14 -12)
===================================================================
--- lib/chupa-text/decomposers/office-open-xml.rb    2019-02-25 16:29:17 +0900 (4f111a1)
+++ lib/chupa-text/decomposers/office-open-xml.rb    2019-02-25 17:02:55 +0900 (a891383)
@@ -37,28 +37,30 @@ module ChupaText
       end
 
       def decompose(data)
-        texts = []
-        attributes = {}
+        context = {
+          text: "",
+          attributes: {},
+        }
         data.open do |input|
           Archive::Zip.open(input) do |zip|
             zip.each do |entry|
               next unless entry.file?
               case entry.zip_path
-              when @path
-                extract_text(entry, texts)
               when "docProps/app.xml"
-                listener = AttributesListener.new(attributes)
+                listener = AttributesListener.new(context[:attributes])
                 parse(entry.file_data, listener)
               when "docProps/core.xml"
-                listener = AttributesListener.new(attributes)
+                listener = AttributesListener.new(context[:attributes])
                 parse(entry.file_data, listener)
+              else
+                process_entry(entry, context)
               end
             end
           end
         end
-        text = accumulate_texts(texts)
+        text = accumulate_text(context)
         text_data = TextData.new(text, source_data: data)
-        attributes.each do |name, value|
+        context[:attributes].each do |name, value|
           text_data[name] = value
         end
         yield(text_data)
@@ -77,8 +79,8 @@ module ChupaText
         parse(entry.file_data, listener)
       end
 
-      def accumulate_texts(texts)
-        texts.join("")
+      def accumulate_text(context)
+        context[:text]
       end
 
       class TextListener
@@ -99,10 +101,10 @@ module ChupaText
         end
 
         def end_element(uri, local_name, qname)
+          @in_target = false
+
           return unless uri == @target_uri
           case local_name
-          when "t"
-            @in_target = false
           when "p", "br"
             @output << "\n"
           end

  Added: test/decomposers/test-office-open-xml-workbook.rb (+138 -0) 100644
===================================================================
--- /dev/null
+++ test/decomposers/test-office-open-xml-workbook.rb    2019-02-25 17:02:55 +0900 (66154a5)
@@ -0,0 +1,138 @@
+# Copyright (C) 2019  Kouhei Sutou <kou****@clear*****>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+class TestDecomposersOfficeOpenXMLWorkbook < Test::Unit::TestCase
+  include Helper
+
+  def setup
+    @decomposer = ChupaText::Decomposers::OfficeOpenXMLWorkbook.new({})
+  end
+
+  def decompose(path)
+    data = ChupaText::InputData.new(path)
+    decomposed = []
+    @decomposer.decompose(data) do |decomposed_data|
+      decomposed << decomposed_data
+    end
+    decomposed
+  end
+
+  sub_test_case("#target_score") do
+    def test_extension
+      data = ChupaText::Data.new
+      data.body = ""
+      data.uri = "workbook.xlsx"
+      assert_equal(-1, @decomposer.target_score(data))
+    end
+
+    def test_mime_type
+      data = ChupaText::Data.new
+      data.mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+      assert_equal(-1, @decomposer.target_score(data))
+    end
+  end
+
+  sub_test_case("#decompose") do
+    sub_test_case("attributes") do
+      def decompose(attribute_name)
+        super(fixture_path("xlsx", "attributes.xlsx")).collect do |data|
+          data[attribute_name]
+        end
+      end
+
+      def test_title
+        assert_equal(["Title"], decompose("title"))
+      end
+
+      def test_author
+        assert_equal([nil], decompose("author"))
+      end
+
+      def test_subject
+        assert_equal(["Subject"], decompose("subject"))
+      end
+
+      def test_keywords
+        assert_equal(["Keyword1 Keyword2"], decompose("keywords"))
+      end
+
+      def test_created_time
+        assert_equal([Time],
+                     decompose("created_time").collect(&:class))
+      end
+
+      def test_modified_time
+        assert_equal([Time],
+                     decompose("modified_time").collect(&:class))
+      end
+
+      def test_application
+        assert_equal(["LibreOffice"],
+                     normalize_applications(decompose("application")))
+      end
+
+      def normalize_applications(applications)
+        applications.collect do |application|
+          normalize_application(application)
+        end
+      end
+
+      def normalize_application(application)
+        if application.start_with?("LibreOffice")
+          "LibreOffice"
+        else
+          application
+        end
+      end
+
+      def test_creation_date
+        assert_equal([nil], decompose("creation_date"))
+      end
+    end
+
+    sub_test_case("one sheet") do
+      def decompose
+        super(fixture_path("xlsx", "one-sheet.xlsx"))
+      end
+
+      def test_body
+        assert_equal([<<-BODY], decompose.collect(&:body))
+Sheet1 - A1\tSheet1 - B1
+Sheet1 - A2\tSheet1 - B2
+        BODY
+      end
+    end
+
+    sub_test_case("multi sheets") do
+      def decompose
+        super(fixture_path("xlsx", "multi-sheets.xlsx"))
+      end
+
+      def test_body
+        assert_equal([<<-BODY], decompose.collect(&:body))
+Sheet1 - A1\tSheet1 - B1
+Sheet1 - A2\tSheet1 - B2
+
+Sheet2 - A1\tSheet2 - B1
+Sheet2 - A2\tSheet2 - B2
+
+Sheet3 - A1\tSheet3 - B1
+Sheet3 - A2\tSheet3 - B2
+        BODY
+      end
+    end
+  end
+end

  Added: test/fixture/xlsx/attributes.xlsx (+24 -0) 100644
===================================================================
--- /dev/null
+++ test/fixture/xlsx/attributes.xlsx    2019-02-25 17:02:55 +0900 (067cecc)
@@ -0,0 +1,24 @@
+MIME-Version: 1.0
+mime-type: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
+uri: file:/tmp/lz0OgQ_attributes.xlsx
+path: /tmp/lz0OgQ_attributes.xlsx
+size: 4610
+Content-Type: multipart/mixed; boundary=0648060d16debd465e4a4f8e0e1955f4681a9889
+
+--0648060d16debd465e4a4f8e0e1955f4681a9889
+mime-type: text/plain
+uri: file:/tmp/lz0OgQ_attributes.txt
+path: /tmp/lz0OgQ_attributes.txt
+size: 12
+source-mime-types: ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]
+name: Sheet1
+digest: b6906a8292c58517cf73036ad0ace729518f79b4
+size: 12
+first-row: 1
+last-row: 1
+first-column: A
+last-column: A
+
+"Sheet1 A1"
+
+--0648060d16debd465e4a4f8e0e1955f4681a9889--

  Added: test/fixture/xlsx/multi-sheets.xlsx (+59 -0) 100644
===================================================================
--- /dev/null
+++ test/fixture/xlsx/multi-sheets.xlsx    2019-02-25 17:02:55 +0900 (d4161ba)
@@ -0,0 +1,59 @@
+MIME-Version: 1.0
+mime-type: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
+uri: file:/tmp/20tCV7_multi-sheets.xlsx
+path: /tmp/20tCV7_multi-sheets.xlsx
+size: 6643
+Content-Type: multipart/mixed; boundary=14bf74dbb0768dd2878a3b236390cd2e928a7b39
+
+--14bf74dbb0768dd2878a3b236390cd2e928a7b39
+mime-type: text/plain
+uri: file:/tmp/20tCV7_multi-sheets.txt
+path: /tmp/20tCV7_multi-sheets.txt
+size: 56
+source-mime-types: ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]
+name: Sheet1
+digest: 9011cb9443faf652afa3746acfae5c623ec5c7c9
+size: 56
+first-row: 1
+last-row: 2
+first-column: A
+last-column: B
+
+"Sheet1 - A1","Sheet1 - B1"
+"Sheet1 - A2","Sheet1 - B2"
+
+--14bf74dbb0768dd2878a3b236390cd2e928a7b39
+mime-type: text/plain
+uri: file:/tmp/20tCV7_multi-sheets.txt
+path: /tmp/20tCV7_multi-sheets.txt
+size: 56
+source-mime-types: ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]
+name: Sheet2
+digest: be99be632ea81d909f2d44b26e3dde28214cb126
+size: 56
+first-row: 1
+last-row: 2
+first-column: A
+last-column: B
+
+"Sheet2 - A1","Sheet2 - B1"
+"Sheet2 - A2","Sheet2 - B2"
+
+--14bf74dbb0768dd2878a3b236390cd2e928a7b39
+mime-type: text/plain
+uri: file:/tmp/20tCV7_multi-sheets.txt
+path: /tmp/20tCV7_multi-sheets.txt
+size: 56
+source-mime-types: ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]
+name: Sheet3
+digest: c03c013aba6e69703ba402967fc2a85cebbe9a28
+size: 56
+first-row: 1
+last-row: 2
+first-column: A
+last-column: B
+
+"Sheet3 - A1","Sheet3 - B1"
+"Sheet3 - A2","Sheet3 - B2"
+
+--14bf74dbb0768dd2878a3b236390cd2e928a7b39--

  Added: test/fixture/xlsx/one-sheet.xlsx (+25 -0) 100644
===================================================================
--- /dev/null
+++ test/fixture/xlsx/one-sheet.xlsx    2019-02-25 17:02:55 +0900 (9a94623)
@@ -0,0 +1,25 @@
+MIME-Version: 1.0
+mime-type: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
+uri: file:/tmp/6DPYAc_one-sheet.xlsx
+path: /tmp/6DPYAc_one-sheet.xlsx
+size: 4593
+Content-Type: multipart/mixed; boundary=17909a3c30dfcb7626b7854f1bdecf6bc1e288f4
+
+--17909a3c30dfcb7626b7854f1bdecf6bc1e288f4
+mime-type: text/plain
+uri: file:/tmp/6DPYAc_one-sheet.txt
+path: /tmp/6DPYAc_one-sheet.txt
+size: 56
+source-mime-types: ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]
+name: Sheet1
+digest: 9011cb9443faf652afa3746acfae5c623ec5c7c9
+size: 56
+first-row: 1
+last-row: 2
+first-column: A
+last-column: B
+
+"Sheet1 - A1","Sheet1 - B1"
+"Sheet1 - A2","Sheet1 - B2"
+
+--17909a3c30dfcb7626b7854f1bdecf6bc1e288f4--
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190225/b1718798/attachment-0001.html>


More information about the Groonga-commit mailing list
Back to archive index