[Groonga-commit] ranguba/chupa-text at 2de171d [master] office-open-xml-presentation, workbook: emit multiple data

Back to archive index
Kouhei Sutou null+****@clear*****
Fri Mar 1 11:24:08 JST 2019


Kouhei Sutou	2019-03-01 11:24:08 +0900 (Fri, 01 Mar 2019)

  Revision: 2de171d610017dfa074ae7ec77717e4d54e71304
  https://github.com/ranguba/chupa-text/commit/2de171d610017dfa074ae7ec77717e4d54e71304

  Message:
    office-open-xml-presentation,workbook: emit multiple data

  Modified files:
    lib/chupa-text/decomposers/office-open-xml-document.rb
    lib/chupa-text/decomposers/office-open-xml-presentation.rb
    lib/chupa-text/decomposers/office-open-xml-workbook.rb
    lib/chupa-text/decomposers/office-open-xml.rb
    test/decomposers/test-office-open-xml-presentation.rb
    test/decomposers/test-office-open-xml-workbook.rb

  Modified: lib/chupa-text/decomposers/office-open-xml-document.rb (+12 -0)
===================================================================
--- lib/chupa-text/decomposers/office-open-xml-document.rb    2019-03-01 10:59:44 +0900 (48118e5)
+++ lib/chupa-text/decomposers/office-open-xml-document.rb    2019-03-01 11:24:08 +0900 (d6cdaac)
@@ -40,12 +40,24 @@ module ChupaText
       end
 
       private
+      def start_decompose(context)
+        context[:text] = ""
+      end
+
       def process_entry(entry, context)
         case entry.zip_path
         when "word/document.xml"
           extract_text(entry, context[:text])
         end
       end
+
+      def finish_decompose(context, &block)
+        text_data = TextData.new(context[:text], source_data: context[:data])
+        context[:attributes].each do |name, value|
+          text_data[name] = value
+        end
+        yield(text_data)
+      end
     end
   end
 end

  Modified: lib/chupa-text/decomposers/office-open-xml-presentation.rb (+17 -3)
===================================================================
--- lib/chupa-text/decomposers/office-open-xml-presentation.rb    2019-03-01 10:59:44 +0900 (1ce5759)
+++ lib/chupa-text/decomposers/office-open-xml-presentation.rb    2019-03-01 11:24:08 +0900 (7126eec)
@@ -48,19 +48,33 @@ module ChupaText
       end
 
       private
+      def start_decompose(context)
+        context[:slides] = []
+      end
+
       def process_entry(entry, context)
         case entry.zip_path
         when /\Appt\/slides\/slide(\d+)\.xml/
           nth_slide = Integer($1, 10)
           slide_text = ""
           extract_text(entry, slide_text)
-          context[:slides] ||= []
           context[:slides] << [nth_slide, slide_text]
         end
       end
 
-      def accumulate_text(context)
-        context[:slides].sort_by(&:first).collect(&:last).join("\n")
+      def finish_decompose(context, &block)
+        metadata = TextData.new("", source_data: context[:data])
+        context[:attributes].each do |name, value|
+          metadata[name] = value
+        end
+        yield(metadata)
+
+        slide_texts = context[:slides].sort_by(&:first).collect(&:last)
+        slide_texts.each_with_index do |slide_text, i|
+          text_data = TextData.new(slide_text, source_data: context[:data])
+          text_data["index"] = i
+          yield(text_data)
+        end
       end
     end
   end

  Modified: lib/chupa-text/decomposers/office-open-xml-workbook.rb (+39 -6)
===================================================================
--- lib/chupa-text/decomposers/office-open-xml-workbook.rb    2019-03-01 10:59:44 +0900 (cdc09d1)
+++ lib/chupa-text/decomposers/office-open-xml-workbook.rb    2019-03-01 11:24:08 +0900 (90bb5d3)
@@ -40,25 +40,39 @@ module ChupaText
       end
 
       private
+      def start_decompose(context)
+        context[:shared_strings] = []
+        context[:sheet_names] = []
+        context[:sheets] = []
+      end
+
       def process_entry(entry, context)
         case entry.zip_path
         when "xl/sharedStrings.xml"
-          context[:shared_strings] = []
           extract_text(entry, context[:shared_strings])
+        when "xl/workbook.xml"
+          listener = WorkbookListener.new(context[:sheet_names])
+          parse(entry.file_data, listener)
         when /\Axl\/worksheets\/sheet(\d+)\.xml\z/
           nth_sheet = Integer($1, 10)
           sheet = []
           listener = SheetListener.new(sheet)
           parse(entry.file_data, listener)
-          context[:sheets] ||= []
           context[:sheets] << [nth_sheet, sheet]
         end
       end
 
-      def accumulate_text(context)
+      def finish_decompose(context, &block)
+        metadata = TextData.new("", source_data: context[:data])
+        context[:attributes].each do |name, value|
+          metadata[name] = value
+        end
+        yield(metadata)
+
         shared_strings = context[:shared_strings]
         sheets = context[:sheets].sort_by(&:first).collect(&:last)
-        sheet_texts = sheets.collect do |sheet|
+        sheet_names = context[:sheet_names]
+        sheets.each_with_index do |sheet, i|
           sheet_text = ""
           sheet.each do |row|
             row_texts = row.collect do |cell|
@@ -71,9 +85,28 @@ module ChupaText
             end
             sheet_text << row_texts.join("\t") << "\n"
           end
-          sheet_text
+          text_data = TextData.new(sheet_text, source_data: context[:data])
+          text_data["index"] = i
+          name = sheet_names[i]
+          text_data["name"] = name if name
+          yield(text_data)
+        end
+      end
+
+      class WorkbookListener < SAXListener
+        URI = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
+
+        def initialize(sheet_names)
+          @sheet_names = sheet_names
+        end
+
+        def start_element(uri, local_name, qname, attributes)
+          return unless uri == URI
+          case local_name
+          when "sheet"
+            @sheet_names << attributes["name"]
+          end
         end
-        sheet_texts.join("\n")
       end
 
       class SheetListener < SAXListener

  Modified: lib/chupa-text/decomposers/office-open-xml.rb (+4 -8)
===================================================================
--- lib/chupa-text/decomposers/office-open-xml.rb    2019-03-01 10:59:44 +0900 (f2e2483)
+++ lib/chupa-text/decomposers/office-open-xml.rb    2019-03-01 11:24:08 +0900 (20543a8)
@@ -34,11 +34,12 @@ module ChupaText
         end
       end
 
-      def decompose(data)
+      def decompose(data, &block)
         context = {
-          text: "",
+          data: data,
           attributes: {},
         }
+        start_decompose(context)
         data.open do |input|
           Archive::Zip.open(input) do |zip|
             zip.each do |entry|
@@ -56,12 +57,7 @@ module ChupaText
             end
           end
         end
-        text = accumulate_text(context)
-        text_data = TextData.new(text, source_data: data)
-        context[:attributes].each do |name, value|
-          text_data[name] = value
-        end
-        yield(text_data)
+        finish_decompose(context, &block)
       end
 
       private

  Modified: test/decomposers/test-office-open-xml-presentation.rb (+46 -46)
===================================================================
--- test/decomposers/test-office-open-xml-presentation.rb    2019-03-01 10:59:44 +0900 (6cff112)
+++ test/decomposers/test-office-open-xml-presentation.rb    2019-03-01 11:24:08 +0900 (a72fb13)
@@ -48,41 +48,32 @@ class TestDecomposersOfficeOpenXMLPresentation < Test::Unit::TestCase
   sub_test_case("#decompose") do
     sub_test_case("attributes") do
       def decompose(attribute_name)
-        super(fixture_path("pptx", "attributes.pptx")).collect do |data|
-          data[attribute_name]
-        end
+        super(fixture_path("pptx", "attributes.pptx")).first[attribute_name]
       end
 
       def test_title
-        assert_equal(["Title"], decompose("title"))
+        assert_equal("Title", decompose("title"))
       end
 
       def test_author
-        assert_equal([nil], decompose("author"))
+        assert_equal(nil, decompose("author"))
       end
 
       def test_subject
-        assert_equal(["Subject"], decompose("subject"))
+        assert_equal("Subject", decompose("subject"))
       end
 
       def test_keywords
-        assert_equal(["Keyword1 Keyword2"], decompose("keywords"))
+        assert_equal("Keyword1 Keyword2", decompose("keywords"))
       end
 
       def test_modified_time
-        assert_equal([Time],
-                     decompose("modified_time").collect(&:class))
+        assert_equal(Time, decompose("modified_time").class)
       end
 
       def test_application
-        assert_equal(["LibreOffice"],
-                     normalize_applications(decompose("application")))
-      end
-
-      def normalize_applications(applications)
-        applications.collect do |application|
-          normalize_application(application)
-        end
+        assert_equal("LibreOffice",
+                     normalize_application(decompose("application")))
       end
 
       def normalize_application(application)
@@ -92,41 +83,50 @@ class TestDecomposersOfficeOpenXMLPresentation < Test::Unit::TestCase
           application
         end
       end
-
-      def test_creation_date
-        assert_equal([nil], decompose("creation_date"))
-      end
     end
 
-    sub_test_case("one slide") do
-      def decompose
-        super(fixture_path("pptx", "one-slide.pptx"))
-      end
-
-      def test_body
-        assert_equal([<<-BODY], decompose.collect(&:body))
-Slide1 title
-Slide1 content
-        BODY
+    sub_test_case("slides") do
+      def decompose(path)
+        super(path).collect do |data|
+          [
+            data["index"],
+            data.body,
+          ]
+        end
       end
-    end
 
-    sub_test_case("multi slides") do
-      def decompose
-        super(fixture_path("pptx", "multi-slides.pptx"))
+      def test_one_slide
+        assert_equal([
+                       [nil, ""],
+                       [
+                         0,
+                         "Slide1 title\n" +
+                         "Slide1 content\n",
+                       ],
+                     ],
+                     decompose(fixture_path("pptx", "one-slide.pptx")))
       end
 
-      def test_body
-        assert_equal([<<-BODY], decompose.collect(&:body))
-Slide1 title
-Slide1 content
-
-Slide2 title
-Slide2 content
-
-Slide3 title
-Slide3 content
-        BODY
+      def test_multi_slides
+        assert_equal([
+                       [nil, ""],
+                       [
+                         0,
+                         "Slide1 title\n" +
+                         "Slide1 content\n",
+                       ],
+                       [
+                         1,
+                         "Slide2 title\n" +
+                         "Slide2 content\n",
+                       ],
+                       [
+                         2,
+                         "Slide3 title\n" +
+                         "Slide3 content\n",
+                       ],
+                     ],
+                     decompose(fixture_path("pptx", "multi-slides.pptx")))
       end
     end
   end

  Modified: test/decomposers/test-office-open-xml-workbook.rb (+68 -64)
===================================================================
--- test/decomposers/test-office-open-xml-workbook.rb    2019-03-01 10:59:44 +0900 (16970b3)
+++ test/decomposers/test-office-open-xml-workbook.rb    2019-03-01 11:24:08 +0900 (704f599)
@@ -48,46 +48,36 @@ class TestDecomposersOfficeOpenXMLWorkbook < Test::Unit::TestCase
   sub_test_case("#decompose") do
     sub_test_case("attributes") do
       def decompose(attribute_name)
-        super(fixture_path("xlsx", "attributes.xlsx")).collect do |data|
-          data[attribute_name]
-        end
+        super(fixture_path("xlsx", "attributes.xlsx")).first[attribute_name]
       end
 
       def test_title
-        assert_equal(["Title"], decompose("title"))
+        assert_equal("Title", decompose("title"))
       end
 
       def test_author
-        assert_equal([nil], decompose("author"))
+        assert_equal(nil, decompose("author"))
       end
 
       def test_subject
-        assert_equal(["Subject"], decompose("subject"))
+        assert_equal("Subject", decompose("subject"))
       end
 
       def test_keywords
-        assert_equal(["Keyword1 Keyword2"], decompose("keywords"))
+        assert_equal("Keyword1 Keyword2", decompose("keywords"))
       end
 
       def test_created_time
-        assert_equal([Time],
-                     decompose("created_time").collect(&:class))
+        assert_equal(Time, decompose("created_time").class)
       end
 
       def test_modified_time
-        assert_equal([Time],
-                     decompose("modified_time").collect(&:class))
+        assert_equal(Time, decompose("modified_time").class)
       end
 
       def test_application
-        assert_equal(["LibreOffice"],
-                     normalize_applications(decompose("application")))
-      end
-
-      def normalize_applications(applications)
-        applications.collect do |application|
-          normalize_application(application)
-        end
+        assert_equal("LibreOffice",
+                     normalize_application(decompose("application")))
       end
 
       def normalize_application(application)
@@ -97,55 +87,69 @@ class TestDecomposersOfficeOpenXMLWorkbook < Test::Unit::TestCase
           application
         end
       end
-
-      def test_creation_date
-        assert_equal([nil], decompose("creation_date"))
-      end
     end
 
-    sub_test_case("one sheet") do
-      def decompose
-        super(fixture_path("xlsx", "one-sheet.xlsx"))
-      end
-
-      def test_body
-        assert_equal([<<-BODY], decompose.collect(&:body))
-Sheet1 - A1\tSheet1 - B1
-Sheet1 - A2\tSheet1 - B2
-        BODY
-      end
-    end
-
-    sub_test_case("not shared cell") do
-      def decompose
-        super(fixture_path("xlsx", "not-shared-cell.xlsx"))
-      end
-
-      def test_body
-        assert_equal([<<-BODY], decompose.collect(&:body))
-Sheet1 - A1\tSheet1 - B1
-Sheet1 - A2\tSheet1 - B2
-0.5\t0.5
-        BODY
-      end
-    end
-
-    sub_test_case("multi sheets") do
-      def decompose
-        super(fixture_path("xlsx", "multi-sheets.xlsx"))
+    sub_test_case("sheets") do
+      def decompose(path)
+        super(path).collect do |data|
+          [
+            data["index"],
+            data["name"],
+            data.body,
+          ]
+        end
       end
 
-      def test_body
-        assert_equal([<<-BODY], decompose.collect(&:body))
-Sheet1 - A1\tSheet1 - B1
-Sheet1 - A2\tSheet1 - B2
-
-Sheet2 - A1\tSheet2 - B1
-Sheet2 - A2\tSheet2 - B2
-
-Sheet3 - A1\tSheet3 - B1
-Sheet3 - A2\tSheet3 - B2
-        BODY
+      def test_one_sheet
+        assert_equal([
+                       [nil, nil, ""],
+                       [
+                         0,
+                         "Sheet1",
+                         "Sheet1 - A1\tSheet1 - B1\n" +
+                         "Sheet1 - A2\tSheet1 - B2\n",
+                       ],
+                     ],
+                     decompose(fixture_path("xlsx", "one-sheet.xlsx")))
+      end
+
+      def test_no_shared_cell
+        assert_equal([
+                       [nil, nil, ""],
+                       [
+                         0,
+                         "Sheet1",
+                         "Sheet1 - A1\tSheet1 - B1\n" +
+                         "Sheet1 - A2\tSheet1 - B2\n" +
+                         "0.5\t0.5\n",
+                       ],
+                     ],
+                     decompose(fixture_path("xlsx", "not-shared-cell.xlsx")))
+      end
+
+      def test_multi_sheets
+        assert_equal([
+                       [nil, nil, ""],
+                       [
+                         0,
+                         "Sheet1",
+                         "Sheet1 - A1\tSheet1 - B1\n" +
+                         "Sheet1 - A2\tSheet1 - B2\n",
+                       ],
+                       [
+                         1,
+                         "Sheet2",
+                         "Sheet2 - A1\tSheet2 - B1\n" +
+                         "Sheet2 - A2\tSheet2 - B2\n",
+                       ],
+                       [
+                         2,
+                         "Sheet3",
+                         "Sheet3 - A1\tSheet3 - B1\n" +
+                         "Sheet3 - A2\tSheet3 - B2\n",
+                       ],
+                     ],
+                     decompose(fixture_path("xlsx", "multi-sheets.xlsx")))
       end
     end
   end
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190301/16654e58/attachment-0001.html>


More information about the Groonga-commit mailing list
Back to archive index