[Groonga-commit] ranguba/chupa-text at cc182ed [master] Use one data per sheet

Back to archive index
Kouhei Sutou null+****@clear*****
Tue Feb 26 06:58:03 JST 2019


Kouhei Sutou	2019-02-26 06:58:03 +0900 (Tue, 26 Feb 2019)

  Revision: cc182ed795389091ea30c4b00c92377f8c119015
  https://github.com/ranguba/chupa-text/commit/cc182ed795389091ea30c4b00c92377f8c119015

  Message:
    Use one data per sheet

  Added files:
    lib/chupa-text/decomposers/open-document-spreadsheet.rb
  Modified files:
    lib/chupa-text/decomposers/open-document.rb
    test/decomposers/test-open-document-spreadsheet.rb

  Added: lib/chupa-text/decomposers/open-document-spreadsheet.rb (+164 -0) 100644
===================================================================
--- /dev/null
+++ lib/chupa-text/decomposers/open-document-spreadsheet.rb    2019-02-26 06:58:03 +0900 (a4c7eff)
@@ -0,0 +1,164 @@
+# Copyright (C) 2019  Kouhei Sutou <kou****@clear*****>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+require "chupa-text/decomposers/open-document"
+
+module ChupaText
+  module Decomposers
+    class OpenDocumentSpreadsheet < OpenDocument
+      registry.register("open-document-spreadsheet", self)
+
+      def initialize(options={})
+        super
+        @extension = "ods"
+        @mime_type = "application/vnd.oasis.opendocument.spreadsheet"
+      end
+
+      def target?(data)
+        data.extension == @extension or
+          data.mime_type == @mime_type
+      end
+
+      def target_score(data)
+        if target?(data)
+          -1
+        else
+          nil
+        end
+      end
+
+      def decompose(data)
+        sheets = []
+        data.open do |input|
+          Archive::Zip.open(input) do |zip|
+            zip.each do |entry|
+              next unless entry.file?
+              case entry.zip_path
+              when "content.xml"
+                listener = SheetsListener.new(sheets)
+                parse(entry.file_data, listener)
+              when "meta.xml"
+                attributes = {}
+                listener = AttributesListener.new(attributes)
+                parse(entry.file_data, listener)
+                metadata = TextData.new("", source_data: data)
+                attributes.each do |name, value|
+                  metadata[name] = value
+                end
+                yield(metadata)
+              end
+            end
+          end
+        end
+        sheets.each_with_index do |sheet, i|
+          text = sheet[:text]
+          text_data = TextData.new(text, source_data: data)
+          text_data["index"] = i
+          name = sheet[:name]
+          text_data["name"] = name if name
+          yield(text_data)
+        end
+      end
+
+      private
+      def parse(io, listener)
+        source = REXML::Source.new(io.read)
+        parser = REXML::Parsers::SAX2Parser.new(source)
+        parser.listen(listener)
+        parser.parse
+      end
+
+      class SheetsListener
+        include REXML::SAX2Listener
+
+        TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
+        TABLE_URI = "urn:oasis:names:tc:opendocument:xmlns:table:1.0"
+
+        def initialize(sheets)
+          @sheets = sheets
+          @prefix_to_uri = {}
+          @uri_to_prefix = {}
+          @in_p = false
+        end
+
+        def start_prefix_mapping(prefix, uri)
+          @prefix_to_uri[prefix] = uri
+          @uri_to_prefix[uri] = prefix
+        end
+
+        def end_prefix_mapping(prefix)
+          uri = @prefix_to_uri.delete(prefix)
+          @uri_to_prefix.delete(uri)
+        end
+
+        def start_element(uri, local_name, qname, attributes)
+          case uri
+          when TEXT_URI
+            case local_name
+            when "p"
+              @in_p = true
+            end
+          when TABLE_URI
+            table_prefix = @uri_to_prefix[TABLE_URI]
+            case local_name
+            when "table"
+              @sheets << {
+                name: attributes["#{table_prefix}:name"],
+                rows: [],
+              }
+            when "table-row"
+              @sheets.last[:rows] << []
+            when "table-cell"
+              @sheets.last[:rows].last << {text: ""}
+            end
+          end
+        end
+
+        def end_element(uri, local_name, qname)
+          @in_p = false
+          case uri
+          when TABLE_URI
+            case local_name
+            when "table"
+              sheet = @sheets.last
+              text = ""
+              sheet[:rows].each do |row|
+                cell_texts = row.collect {|cell| cell[:text]}
+                next if cell_texts.all?(&:empty?)
+                text << cell_texts.join("\t") << "\n"
+              end
+              sheet[:text] = text
+            end
+          end
+        end
+
+        def characters(text)
+          add_text(text)
+        end
+
+        def cdata(content)
+          add_text(content)
+        end
+
+        private
+        def add_text(text)
+          return unless @in_p
+          @sheets.last[:rows].last.last[:text] << CGI.unescapeHTML(text)
+        end
+      end
+    end
+  end
+end

  Modified: lib/chupa-text/decomposers/open-document.rb (+0 -2)
===================================================================
--- lib/chupa-text/decomposers/open-document.rb    2019-02-26 06:38:03 +0900 (294a2b8)
+++ lib/chupa-text/decomposers/open-document.rb    2019-02-26 06:58:03 +0900 (635c800)
@@ -27,11 +27,9 @@ module ChupaText
 
       EXTENSIONS = [
         "odt",
-        "ods",
       ]
       MIME_TYPES = [
         "application/vnd.oasis.opendocument.text",
-        "application/vnd.oasis.opendocument.spreadsheet",
       ]
       def target?(data)
         EXTENSIONS.include?(data.extension) or

  Modified: test/decomposers/test-open-document-spreadsheet.rb (+57 -43)
===================================================================
--- test/decomposers/test-open-document-spreadsheet.rb    2019-02-26 06:38:03 +0900 (b83e18a)
+++ test/decomposers/test-open-document-spreadsheet.rb    2019-02-26 06:58:03 +0900 (32039e7)
@@ -18,7 +18,7 @@ class TestDecomposersOpenDocumentSpreadsheet < Test::Unit::TestCase
   include Helper
 
   def setup
-    @decomposer = ChupaText::Decomposers::OpenDocument.new({})
+    @decomposer = ChupaText::Decomposers::OpenDocumentSpreadsheet.new({})
   end
 
   def decompose(path)
@@ -48,46 +48,34 @@ class TestDecomposersOpenDocumentSpreadsheet < Test::Unit::TestCase
   sub_test_case("#decompose") do
     sub_test_case("attributes") do
       def decompose(attribute_name)
-        super(fixture_path("ods", "attributes.ods")).collect do |data|
-          data[attribute_name]
-        end
+        super(fixture_path("ods", "attributes.ods")).first[attribute_name]
       end
 
       def test_title
-        assert_equal(["Title"], decompose("title"))
-      end
-
-      def test_author
-        assert_equal([nil], decompose("author"))
+        assert_equal("Title", decompose("title"))
       end
 
       def test_subject
-        assert_equal(["Subject"], decompose("subject"))
+        assert_equal("Subject", decompose("subject"))
       end
 
       def test_keywords
-        assert_equal([["Keyword1", "Keyword2"]], decompose("keywords"))
+        assert_equal(["Keyword1", "Keyword2"], decompose("keywords"))
       end
 
       def test_created_time
-        assert_equal([Time],
-                     decompose("created_time").collect(&:class))
+        assert_equal(Time,
+                     decompose("created_time").class)
       end
 
       def test_modified_time
-        assert_equal([Time],
-                     decompose("modified_time").collect(&:class))
+        assert_equal(Time,
+                     decompose("modified_time").class)
       end
 
       def test_generator
-        assert_equal(["LibreOffice"],
-                     normalize_generators(decompose("generator")))
-      end
-
-      def normalize_generators(generators)
-        generators.collect do |generator|
-          normalize_generator(generator)
-        end
+        assert_equal("LibreOffice",
+                     normalize_generator(decompose("generator")))
       end
 
       def normalize_generator(generator)
@@ -97,41 +85,67 @@ class TestDecomposersOpenDocumentSpreadsheet < Test::Unit::TestCase
           generator
         end
       end
-
-      def test_creation_date
-        assert_equal([nil], decompose("creation_date"))
-      end
     end
 
     sub_test_case("one sheet") do
       def decompose
-        super(fixture_path("ods", "one-sheet.ods"))
+        super(fixture_path("ods", "one-sheet.ods")).collect do |data|
+          [
+            data["index"],
+            data["name"],
+            data.body,
+          ]
+        end
       end
 
       def test_body
-        assert_equal([<<-BODY], decompose.collect(&:body))
-Sheet1 - A1\tSheet1 - B1
-Sheet1 - A2\tSheet1 - B2
-        BODY
+        assert_equal([
+                       [nil, nil, ""],
+                       [
+                         0,
+                         "Sheet1",
+                         "Sheet1 - A1\tSheet1 - B1\n" +
+                         "Sheet1 - A2\tSheet1 - B2\n",
+                       ],
+                     ],
+                     decompose)
       end
     end
 
     sub_test_case("multi sheets") do
       def decompose
-        super(fixture_path("ods", "multi-sheets.ods"))
+        super(fixture_path("ods", "multi-sheets.ods")).collect do |data|
+          [
+            data["index"],
+            data["name"],
+            data.body,
+          ]
+        end
       end
 
       def test_body
-        assert_equal([<<-BODY], decompose.collect(&:body))
-Sheet1 - A1\tSheet1 - B1
-Sheet1 - A2\tSheet1 - B2
-
-Sheet2 - A1\tSheet2 - B1
-Sheet2 - A2\tSheet2 - B2
-
-Sheet3 - A1\tSheet3 - B1
-Sheet3 - A2\tSheet3 - B2
-        BODY
+        assert_equal([
+                       [nil, nil, ""],
+                       [
+                         0,
+                         "Sheet1",
+                         "Sheet1 - A1\tSheet1 - B1\n" +
+                         "Sheet1 - A2\tSheet1 - B2\n",
+                       ],
+                       [
+                         1,
+                         "Sheet2",
+                         "Sheet2 - A1\tSheet2 - B1\n" +
+                         "Sheet2 - A2\tSheet2 - B2\n",
+                       ],
+                       [
+                         2,
+                         "Sheet3",
+                         "Sheet3 - A1\tSheet3 - B1\n" +
+                         "Sheet3 - A2\tSheet3 - B2\n",
+                       ],
+                     ],
+                     decompose)
       end
     end
   end
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190226/9a5b4d90/attachment-0001.html>


More information about the Groonga-commit mailing list
Back to archive index