Kouhei Sutou 2019-02-25 17:02:55 +0900 (Mon, 25 Feb 2019) Revision: 3ac5d1455ac8d14d3c153d6b6242e7703a48746c https://github.com/ranguba/chupa-text/commit/3ac5d1455ac8d14d3c153d6b6242e7703a48746c Message: Add support Excel Added files: lib/chupa-text/decomposers/office-open-xml-workbook.rb test/decomposers/test-office-open-xml-workbook.rb test/fixture/xlsx/attributes.xlsx test/fixture/xlsx/multi-sheets.xlsx test/fixture/xlsx/one-sheet.xlsx Modified files: lib/chupa-text/decomposers/office-open-xml-document.rb lib/chupa-text/decomposers/office-open-xml-presentation.rb lib/chupa-text/decomposers/office-open-xml.rb Modified: lib/chupa-text/decomposers/office-open-xml-document.rb (+8 -1) =================================================================== --- lib/chupa-text/decomposers/office-open-xml-document.rb 2019-02-25 16:29:17 +0900 (c0e3fef) +++ lib/chupa-text/decomposers/office-open-xml-document.rb 2019-02-25 17:02:55 +0900 (48118e5) @@ -35,10 +35,17 @@ module ChupaText "application/vnd.openxmlformats-officedocument.wordprocessingml.template", "application/vnd.ms-word.template.macroEnabled.12", ] - @path = "word/document.xml" @namespace_uri = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" end + + private + def process_entry(entry, context) + case entry.zip_path + when "word/document.xml" + extract_text(entry, context[:text]) + end + end end end end Modified: lib/chupa-text/decomposers/office-open-xml-presentation.rb (+11 -8) =================================================================== --- lib/chupa-text/decomposers/office-open-xml-presentation.rb 2019-02-25 16:29:17 +0900 (63c17fc) +++ lib/chupa-text/decomposers/office-open-xml-presentation.rb 2019-02-25 17:02:55 +0900 (1ce5759) @@ -43,21 +43,24 @@ module ChupaText "application/vnd.openxmlformats-officedocument.presentationml.slide", "application/vnd.ms-powerpoint.slide.macroEnabled.12", ] - @path = /\Appt\/slides\/slide\d+\.xml/ @namespace_uri = "http://schemas.openxmlformats.org/drawingml/2006/main" end private - def extract_text(entry, texts) - text = "" - super(entry, text) - nth_slide = Integer(entry.zip_path.scan(/(\d+)\.xml\z/)[0][0], 10) - texts << [nth_slide, text] + def process_entry(entry, context) + case entry.zip_path + when /\Appt\/slides\/slide(\d+)\.xml/ + nth_slide = Integer($1, 10) + slide_text = "" + extract_text(entry, slide_text) + context[:slides] ||= [] + context[:slides] << [nth_slide, slide_text] + end end - def accumulate_texts(texts) - texts.sort_by(&:first).collect(&:last).join("\n") + def accumulate_text(context) + context[:slides].sort_by(&:first).collect(&:last).join("\n") end end end Added: lib/chupa-text/decomposers/office-open-xml-workbook.rb (+114 -0) 100644 =================================================================== --- /dev/null +++ lib/chupa-text/decomposers/office-open-xml-workbook.rb 2019-02-25 17:02:55 +0900 (a75d351) @@ -0,0 +1,114 @@ +# Copyright (C) 2019 Kouhei Sutou <kou****@clear*****> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +require "chupa-text/decomposers/office-open-xml" + +module ChupaText + module Decomposers + class OfficeOpenXMLWorkbook < OfficeOpenXML + registry.register("office-open-xml-workbook", self) + + def initialize(options={}) + super + @extensions = [ + "xlsx", + "xlsm", + "xltx", + "xltm", + ] + @mime_types = [ + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "application/vnd.ms-excel.sheet.macroEnabled.12", + "application/vnd.openxmlformats-officedocument.spreadsheetml.template", + "application/vnd.ms-excel.template.macroEnabled.12", + ] + @namespace_uri = + "http://schemas.openxmlformats.org/spreadsheetml/2006/main" + end + + private + def process_entry(entry, context) + case entry.zip_path + when "xl/sharedStrings.xml" + context[:shared_strings] = [] + extract_text(entry, context[:shared_strings]) + when /\Axl\/worksheets\/sheet(\d+)\.xml\z/ + nth_sheet = Integer($1, 10) + sheet = [] + listener = SheetListener.new(sheet) + parse(entry.file_data, listener) + context[:sheets] ||= [] + context[:sheets] << [nth_sheet, sheet] + end + end + + def accumulate_text(context) + shared_strings = context[:shared_strings] + sheets = context[:sheets].sort_by(&:first).collect(&:last) + sheet_texts = sheets.collect do |sheet| + sheet_text = "" + sheet.each do |row| + row_texts = row.collect do |index| + shared_strings[index] + end + sheet_text << row_texts.join("\t") << "\n" + end + sheet_text + end + sheet_texts.join("\n") + end + + class SheetListener + include REXML::SAX2Listener + + URI = "http://schemas.openxmlformats.org/spreadsheetml/2006/main" + + def initialize(sheet) + @sheet = sheet + @in_v = false + end + + def start_element(uri, local_name, qname, attributes) + return unless uri == URI + case local_name + when "row" + @sheet << [] + when "v" + @in_v = true + end + end + + def end_element(uri, local_name, qname) + @in_v = false + end + + def characters(text) + add_column(text) + end + + def cdata(content) + add_column(content) + end + + private + def add_column(text) + return unless @in_v + @sheet.last << Integer(text, 10) + end + end + end + end +end Modified: lib/chupa-text/decomposers/office-open-xml.rb (+14 -12) =================================================================== --- lib/chupa-text/decomposers/office-open-xml.rb 2019-02-25 16:29:17 +0900 (4f111a1) +++ lib/chupa-text/decomposers/office-open-xml.rb 2019-02-25 17:02:55 +0900 (a891383) @@ -37,28 +37,30 @@ module ChupaText end def decompose(data) - texts = [] - attributes = {} + context = { + text: "", + attributes: {}, + } data.open do |input| Archive::Zip.open(input) do |zip| zip.each do |entry| next unless entry.file? case entry.zip_path - when @path - extract_text(entry, texts) when "docProps/app.xml" - listener = AttributesListener.new(attributes) + listener = AttributesListener.new(context[:attributes]) parse(entry.file_data, listener) when "docProps/core.xml" - listener = AttributesListener.new(attributes) + listener = AttributesListener.new(context[:attributes]) parse(entry.file_data, listener) + else + process_entry(entry, context) end end end end - text = accumulate_texts(texts) + text = accumulate_text(context) text_data = TextData.new(text, source_data: data) - attributes.each do |name, value| + context[:attributes].each do |name, value| text_data[name] = value end yield(text_data) @@ -77,8 +79,8 @@ module ChupaText parse(entry.file_data, listener) end - def accumulate_texts(texts) - texts.join("") + def accumulate_text(context) + context[:text] end class TextListener @@ -99,10 +101,10 @@ module ChupaText end def end_element(uri, local_name, qname) + @in_target = false + return unless uri == @target_uri case local_name - when "t" - @in_target = false when "p", "br" @output << "\n" end Added: test/decomposers/test-office-open-xml-workbook.rb (+138 -0) 100644 =================================================================== --- /dev/null +++ test/decomposers/test-office-open-xml-workbook.rb 2019-02-25 17:02:55 +0900 (66154a5) @@ -0,0 +1,138 @@ +# Copyright (C) 2019 Kouhei Sutou <kou****@clear*****> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +class TestDecomposersOfficeOpenXMLWorkbook < Test::Unit::TestCase + include Helper + + def setup + @decomposer = ChupaText::Decomposers::OfficeOpenXMLWorkbook.new({}) + end + + def decompose(path) + data = ChupaText::InputData.new(path) + decomposed = [] + @decomposer.decompose(data) do |decomposed_data| + decomposed << decomposed_data + end + decomposed + end + + sub_test_case("#target_score") do + def test_extension + data = ChupaText::Data.new + data.body = "" + data.uri = "workbook.xlsx" + assert_equal(-1, @decomposer.target_score(data)) + end + + def test_mime_type + data = ChupaText::Data.new + data.mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + assert_equal(-1, @decomposer.target_score(data)) + end + end + + sub_test_case("#decompose") do + sub_test_case("attributes") do + def decompose(attribute_name) + super(fixture_path("xlsx", "attributes.xlsx")).collect do |data| + data[attribute_name] + end + end + + def test_title + assert_equal(["Title"], decompose("title")) + end + + def test_author + assert_equal([nil], decompose("author")) + end + + def test_subject + assert_equal(["Subject"], decompose("subject")) + end + + def test_keywords + assert_equal(["Keyword1 Keyword2"], decompose("keywords")) + end + + def test_created_time + assert_equal([Time], + decompose("created_time").collect(&:class)) + end + + def test_modified_time + assert_equal([Time], + decompose("modified_time").collect(&:class)) + end + + def test_application + assert_equal(["LibreOffice"], + normalize_applications(decompose("application"))) + end + + def normalize_applications(applications) + applications.collect do |application| + normalize_application(application) + end + end + + def normalize_application(application) + if application.start_with?("LibreOffice") + "LibreOffice" + else + application + end + end + + def test_creation_date + assert_equal([nil], decompose("creation_date")) + end + end + + sub_test_case("one sheet") do + def decompose + super(fixture_path("xlsx", "one-sheet.xlsx")) + end + + def test_body + assert_equal([<<-BODY], decompose.collect(&:body)) +Sheet1 - A1\tSheet1 - B1 +Sheet1 - A2\tSheet1 - B2 + BODY + end + end + + sub_test_case("multi sheets") do + def decompose + super(fixture_path("xlsx", "multi-sheets.xlsx")) + end + + def test_body + assert_equal([<<-BODY], decompose.collect(&:body)) +Sheet1 - A1\tSheet1 - B1 +Sheet1 - A2\tSheet1 - B2 + +Sheet2 - A1\tSheet2 - B1 +Sheet2 - A2\tSheet2 - B2 + +Sheet3 - A1\tSheet3 - B1 +Sheet3 - A2\tSheet3 - B2 + BODY + end + end + end +end Added: test/fixture/xlsx/attributes.xlsx (+24 -0) 100644 =================================================================== --- /dev/null +++ test/fixture/xlsx/attributes.xlsx 2019-02-25 17:02:55 +0900 (067cecc) @@ -0,0 +1,24 @@ +MIME-Version: 1.0 +mime-type: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet +uri: file:/tmp/lz0OgQ_attributes.xlsx +path: /tmp/lz0OgQ_attributes.xlsx +size: 4610 +Content-Type: multipart/mixed; boundary=0648060d16debd465e4a4f8e0e1955f4681a9889 + +--0648060d16debd465e4a4f8e0e1955f4681a9889 +mime-type: text/plain +uri: file:/tmp/lz0OgQ_attributes.txt +path: /tmp/lz0OgQ_attributes.txt +size: 12 +source-mime-types: ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"] +name: Sheet1 +digest: b6906a8292c58517cf73036ad0ace729518f79b4 +size: 12 +first-row: 1 +last-row: 1 +first-column: A +last-column: A + +"Sheet1 A1" + +--0648060d16debd465e4a4f8e0e1955f4681a9889-- Added: test/fixture/xlsx/multi-sheets.xlsx (+59 -0) 100644 =================================================================== --- /dev/null +++ test/fixture/xlsx/multi-sheets.xlsx 2019-02-25 17:02:55 +0900 (d4161ba) @@ -0,0 +1,59 @@ +MIME-Version: 1.0 +mime-type: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet +uri: file:/tmp/20tCV7_multi-sheets.xlsx +path: /tmp/20tCV7_multi-sheets.xlsx +size: 6643 +Content-Type: multipart/mixed; boundary=14bf74dbb0768dd2878a3b236390cd2e928a7b39 + +--14bf74dbb0768dd2878a3b236390cd2e928a7b39 +mime-type: text/plain +uri: file:/tmp/20tCV7_multi-sheets.txt +path: /tmp/20tCV7_multi-sheets.txt +size: 56 +source-mime-types: ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"] +name: Sheet1 +digest: 9011cb9443faf652afa3746acfae5c623ec5c7c9 +size: 56 +first-row: 1 +last-row: 2 +first-column: A +last-column: B + +"Sheet1 - A1","Sheet1 - B1" +"Sheet1 - A2","Sheet1 - B2" + +--14bf74dbb0768dd2878a3b236390cd2e928a7b39 +mime-type: text/plain +uri: file:/tmp/20tCV7_multi-sheets.txt +path: /tmp/20tCV7_multi-sheets.txt +size: 56 +source-mime-types: ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"] +name: Sheet2 +digest: be99be632ea81d909f2d44b26e3dde28214cb126 +size: 56 +first-row: 1 +last-row: 2 +first-column: A +last-column: B + +"Sheet2 - A1","Sheet2 - B1" +"Sheet2 - A2","Sheet2 - B2" + +--14bf74dbb0768dd2878a3b236390cd2e928a7b39 +mime-type: text/plain +uri: file:/tmp/20tCV7_multi-sheets.txt +path: /tmp/20tCV7_multi-sheets.txt +size: 56 +source-mime-types: ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"] +name: Sheet3 +digest: c03c013aba6e69703ba402967fc2a85cebbe9a28 +size: 56 +first-row: 1 +last-row: 2 +first-column: A +last-column: B + +"Sheet3 - A1","Sheet3 - B1" +"Sheet3 - A2","Sheet3 - B2" + +--14bf74dbb0768dd2878a3b236390cd2e928a7b39-- Added: test/fixture/xlsx/one-sheet.xlsx (+25 -0) 100644 =================================================================== --- /dev/null +++ test/fixture/xlsx/one-sheet.xlsx 2019-02-25 17:02:55 +0900 (9a94623) @@ -0,0 +1,25 @@ +MIME-Version: 1.0 +mime-type: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet +uri: file:/tmp/6DPYAc_one-sheet.xlsx +path: /tmp/6DPYAc_one-sheet.xlsx +size: 4593 +Content-Type: multipart/mixed; boundary=17909a3c30dfcb7626b7854f1bdecf6bc1e288f4 + +--17909a3c30dfcb7626b7854f1bdecf6bc1e288f4 +mime-type: text/plain +uri: file:/tmp/6DPYAc_one-sheet.txt +path: /tmp/6DPYAc_one-sheet.txt +size: 56 +source-mime-types: ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"] +name: Sheet1 +digest: 9011cb9443faf652afa3746acfae5c623ec5c7c9 +size: 56 +first-row: 1 +last-row: 2 +first-column: A +last-column: B + +"Sheet1 - A1","Sheet1 - B1" +"Sheet1 - A2","Sheet1 - B2" + +--17909a3c30dfcb7626b7854f1bdecf6bc1e288f4-- -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190225/b1718798/attachment-0001.html>