Kouhei Sutou 2019-02-26 06:58:03 +0900 (Tue, 26 Feb 2019) Revision: cc182ed795389091ea30c4b00c92377f8c119015 https://github.com/ranguba/chupa-text/commit/cc182ed795389091ea30c4b00c92377f8c119015 Message: Use one data per sheet Added files: lib/chupa-text/decomposers/open-document-spreadsheet.rb Modified files: lib/chupa-text/decomposers/open-document.rb test/decomposers/test-open-document-spreadsheet.rb Added: lib/chupa-text/decomposers/open-document-spreadsheet.rb (+164 -0) 100644 =================================================================== --- /dev/null +++ lib/chupa-text/decomposers/open-document-spreadsheet.rb 2019-02-26 06:58:03 +0900 (a4c7eff) @@ -0,0 +1,164 @@ +# Copyright (C) 2019 Kouhei Sutou <kou****@clear*****> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +require "chupa-text/decomposers/open-document" + +module ChupaText + module Decomposers + class OpenDocumentSpreadsheet < OpenDocument + registry.register("open-document-spreadsheet", self) + + def initialize(options={}) + super + @extension = "ods" + @mime_type = "application/vnd.oasis.opendocument.spreadsheet" + end + + def target?(data) + data.extension == @extension or + data.mime_type == @mime_type + end + + def target_score(data) + if target?(data) + -1 + else + nil + end + end + + def decompose(data) + sheets = [] + data.open do |input| + Archive::Zip.open(input) do |zip| + zip.each do |entry| + next unless entry.file? + case entry.zip_path + when "content.xml" + listener = SheetsListener.new(sheets) + parse(entry.file_data, listener) + when "meta.xml" + attributes = {} + listener = AttributesListener.new(attributes) + parse(entry.file_data, listener) + metadata = TextData.new("", source_data: data) + attributes.each do |name, value| + metadata[name] = value + end + yield(metadata) + end + end + end + end + sheets.each_with_index do |sheet, i| + text = sheet[:text] + text_data = TextData.new(text, source_data: data) + text_data["index"] = i + name = sheet[:name] + text_data["name"] = name if name + yield(text_data) + end + end + + private + def parse(io, listener) + source = REXML::Source.new(io.read) + parser = REXML::Parsers::SAX2Parser.new(source) + parser.listen(listener) + parser.parse + end + + class SheetsListener + include REXML::SAX2Listener + + TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0" + TABLE_URI = "urn:oasis:names:tc:opendocument:xmlns:table:1.0" + + def initialize(sheets) + @sheets = sheets + @prefix_to_uri = {} + @uri_to_prefix = {} + @in_p = false + end + + def start_prefix_mapping(prefix, uri) + 
@prefix_to_uri[prefix] = uri + @uri_to_prefix[uri] = prefix + end + + def end_prefix_mapping(prefix) + uri = @prefix_to_uri.delete(prefix) + @uri_to_prefix.delete(uri) + end + + def start_element(uri, local_name, qname, attributes) + case uri + when TEXT_URI + case local_name + when "p" + @in_p = true + end + when TABLE_URI + table_prefix = @uri_to_prefix[TABLE_URI] + case local_name + when "table" + @sheets << { + name: attributes["#{table_prefix}:name"], + rows: [], + } + when "table-row" + @sheets.last[:rows] << [] + when "table-cell" + @sheets.last[:rows].last << {text: ""} + end + end + end + + def end_element(uri, local_name, qname) + @in_p = false + case uri + when TABLE_URI + case local_name + when "table" + sheet = @sheets.last + text = "" + sheet[:rows].each do |row| + cell_texts = row.collect {|cell| cell[:text]} + next if cell_texts.all?(&:empty?) + text << cell_texts.join("\t") << "\n" + end + sheet[:text] = text + end + end + end + + def characters(text) + add_text(text) + end + + def cdata(content) + add_text(content) + end + + private + def add_text(text) + return unless @in_p + @sheets.last[:rows].last.last[:text] << CGI.unescapeHTML(text) + end + end + end + end +end Modified: lib/chupa-text/decomposers/open-document.rb (+0 -2) =================================================================== --- lib/chupa-text/decomposers/open-document.rb 2019-02-26 06:38:03 +0900 (294a2b8) +++ lib/chupa-text/decomposers/open-document.rb 2019-02-26 06:58:03 +0900 (635c800) @@ -27,11 +27,9 @@ module ChupaText EXTENSIONS = [ "odt", - "ods", ] MIME_TYPES = [ "application/vnd.oasis.opendocument.text", - "application/vnd.oasis.opendocument.spreadsheet", ] def target?(data) EXTENSIONS.include?(data.extension) or Modified: test/decomposers/test-open-document-spreadsheet.rb (+57 -43) =================================================================== --- test/decomposers/test-open-document-spreadsheet.rb 2019-02-26 06:38:03 +0900 (b83e18a) +++ 
test/decomposers/test-open-document-spreadsheet.rb 2019-02-26 06:58:03 +0900 (32039e7) @@ -18,7 +18,7 @@ class TestDecomposersOpenDocumentSpreadsheet < Test::Unit::TestCase include Helper def setup - @decomposer = ChupaText::Decomposers::OpenDocument.new({}) + @decomposer = ChupaText::Decomposers::OpenDocumentSpreadsheet.new({}) end def decompose(path) @@ -48,46 +48,34 @@ class TestDecomposersOpenDocumentSpreadsheet < Test::Unit::TestCase sub_test_case("#decompose") do sub_test_case("attributes") do def decompose(attribute_name) - super(fixture_path("ods", "attributes.ods")).collect do |data| - data[attribute_name] - end + super(fixture_path("ods", "attributes.ods")).first[attribute_name] end def test_title - assert_equal(["Title"], decompose("title")) - end - - def test_author - assert_equal([nil], decompose("author")) + assert_equal("Title", decompose("title")) end def test_subject - assert_equal(["Subject"], decompose("subject")) + assert_equal("Subject", decompose("subject")) end def test_keywords - assert_equal([["Keyword1", "Keyword2"]], decompose("keywords")) + assert_equal(["Keyword1", "Keyword2"], decompose("keywords")) end def test_created_time - assert_equal([Time], - decompose("created_time").collect(&:class)) + assert_equal(Time, + decompose("created_time").class) end def test_modified_time - assert_equal([Time], - decompose("modified_time").collect(&:class)) + assert_equal(Time, + decompose("modified_time").class) end def test_generator - assert_equal(["LibreOffice"], - normalize_generators(decompose("generator"))) - end - - def normalize_generators(generators) - generators.collect do |generator| - normalize_generator(generator) - end + assert_equal("LibreOffice", + normalize_generator(decompose("generator"))) end def normalize_generator(generator) @@ -97,41 +85,67 @@ class TestDecomposersOpenDocumentSpreadsheet < Test::Unit::TestCase generator end end - - def test_creation_date - assert_equal([nil], decompose("creation_date")) - end end 
sub_test_case("one sheet") do def decompose - super(fixture_path("ods", "one-sheet.ods")) + super(fixture_path("ods", "one-sheet.ods")).collect do |data| + [ + data["index"], + data["name"], + data.body, + ] + end end def test_body - assert_equal([<<-BODY], decompose.collect(&:body)) -Sheet1 - A1\tSheet1 - B1 -Sheet1 - A2\tSheet1 - B2 - BODY + assert_equal([ + [nil, nil, ""], + [ + 0, + "Sheet1", + "Sheet1 - A1\tSheet1 - B1\n" + + "Sheet1 - A2\tSheet1 - B2\n", + ], + ], + decompose) end end sub_test_case("multi sheets") do def decompose - super(fixture_path("ods", "multi-sheets.ods")) + super(fixture_path("ods", "multi-sheets.ods")).collect do |data| + [ + data["index"], + data["name"], + data.body, + ] + end end def test_body - assert_equal([<<-BODY], decompose.collect(&:body)) -Sheet1 - A1\tSheet1 - B1 -Sheet1 - A2\tSheet1 - B2 - -Sheet2 - A1\tSheet2 - B1 -Sheet2 - A2\tSheet2 - B2 - -Sheet3 - A1\tSheet3 - B1 -Sheet3 - A2\tSheet3 - B2 - BODY + assert_equal([ + [nil, nil, ""], + [ + 0, + "Sheet1", + "Sheet1 - A1\tSheet1 - B1\n" + + "Sheet1 - A2\tSheet1 - B2\n", + ], + [ + 1, + "Sheet2", + "Sheet2 - A1\tSheet2 - B1\n" + + "Sheet2 - A2\tSheet2 - B2\n", + ], + [ + 2, + "Sheet3", + "Sheet3 - A1\tSheet3 - B1\n" + + "Sheet3 - A2\tSheet3 - B2\n", + ], + ], + decompose) end end end -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190226/9a5b4d90/attachment-0001.html>