Kouhei Sutou 2019-02-25 16:29:17 +0900 (Mon, 25 Feb 2019) Revision: a0d133abcc55a982eef82256c84b03ec4a2ba7bd https://github.com/ranguba/chupa-text/commit/a0d133abcc55a982eef82256c84b03ec4a2ba7bd Message: Add support PowerPoint Added files: lib/chupa-text/decomposers/office-open-xml-presentation.rb test/decomposers/test-office-open-xml-presentation.rb test/fixture/pptx/attributes.pptx test/fixture/pptx/multi-slides.pptx test/fixture/pptx/one-slide.pptx Modified files: lib/chupa-text/decomposers/office-open-xml-document.rb lib/chupa-text/decomposers/office-open-xml.rb Modified: lib/chupa-text/decomposers/office-open-xml-document.rb (+14 -3) =================================================================== --- lib/chupa-text/decomposers/office-open-xml-document.rb 2019-02-25 16:08:02 +0900 (31266ba) +++ lib/chupa-text/decomposers/office-open-xml-document.rb 2019-02-25 16:29:17 +0900 (c0e3fef) @@ -23,10 +23,21 @@ module ChupaText def initialize(options={}) super - @extension = "docx" - @mime_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + @extensions = [ + "docx", + "docm", + "dotx", + "dotm", + ] + @mime_types = [ + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/vnd.ms-word.document.macroEnabled.12", + "application/vnd.openxmlformats-officedocument.wordprocessingml.template", + "application/vnd.ms-word.template.macroEnabled.12", + ] @path = "word/document.xml" - @namespace_uri = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + @namespace_uri = + "http://schemas.openxmlformats.org/wordprocessingml/2006/main" end end end Added: lib/chupa-text/decomposers/office-open-xml-presentation.rb (+64 -0) 100644 =================================================================== --- /dev/null +++ lib/chupa-text/decomposers/office-open-xml-presentation.rb 2019-02-25 16:29:17 +0900 (63c17fc) @@ -0,0 +1,64 @@ +# Copyright (C) 2019 Kouhei Sutou <kou****@clear*****> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +require "chupa-text/decomposers/office-open-xml" + +module ChupaText + module Decomposers + class OfficeOpenXMLPresentation < OfficeOpenXML + registry.register("office-open-xml-presentation", self) + + def initialize(options={}) + super + @extensions = [ + "pptx", + "pptm", + "ppsx", + "ppsm", + "potx", + "potm", + "sldx", + "sldm", + ] + @mime_types = [ + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/vnd.ms-powerpoint.presentation.macroEnabled.12", + "application/vnd.openxmlformats-officedocument.presentationml.slideshow", + "application/vnd.ms-powerpoint.slideshow.macroEnabled.12", + "application/vnd.openxmlformats-officedocument.presentationml.template", + "application/vnd.ms-powerpoint.template.macroEnabled.12", + "application/vnd.openxmlformats-officedocument.presentationml.slide", + "application/vnd.ms-powerpoint.slide.macroEnabled.12", + ] + @path = /\Appt\/slides\/slide\d+\.xml/ + @namespace_uri = + "http://schemas.openxmlformats.org/drawingml/2006/main" + end + + private + def extract_text(entry, texts) + text = "" + super(entry, text) + nth_slide = Integer(entry.zip_path.scan(/(\d+)\.xml\z/)[0][0], 10) + texts << [nth_slide, text] + end + + def accumulate_texts(texts) + texts.sort_by(&:first).collect(&:last).join("\n") + end + end + end +end Modified: lib/chupa-text/decomposers/office-open-xml.rb (+14 -5) =================================================================== --- lib/chupa-text/decomposers/office-open-xml.rb 2019-02-25 16:08:02 +0900 (4438557) +++ lib/chupa-text/decomposers/office-open-xml.rb 2019-02-25 16:29:17 +0900 (4f111a1) @@ -24,7 +24,8 @@ module ChupaText module Decomposers class OfficeOpenXML < Decomposer def target?(data) - data.extension == @extension or data.mime_type == @mime_type + @extensions.include?(data.extension) or + @mime_types.include?(data.mime_type) end def target_score(data) @@ -36,7 +37,7 @@ module ChupaText end def decompose(data) - text = nil + texts = [] attributes = {} data.open do |input| Archive::Zip.open(input) do |zip| @@ -44,9 +45,7 @@ module ChupaText next unless entry.file? case entry.zip_path when @path - text = "" - listener = TextListener.new(text, @namespace_uri) - parse(entry.file_data, listener) + extract_text(entry, texts) when "docProps/app.xml" listener = AttributesListener.new(attributes) parse(entry.file_data, listener) @@ -57,6 +56,7 @@ module ChupaText end end end + text = accumulate_texts(texts) text_data = TextData.new(text, source_data: data) attributes.each do |name, value| text_data[name] = value @@ -72,6 +72,15 @@ module ChupaText parser.parse end + def extract_text(entry, texts) + listener = TextListener.new(texts, @namespace_uri) + parse(entry.file_data, listener) + end + + def accumulate_texts(texts) + texts.join("") + end + class TextListener include REXML::SAX2Listener Added: test/decomposers/test-office-open-xml-presentation.rb (+133 -0) 100644 =================================================================== --- /dev/null +++ test/decomposers/test-office-open-xml-presentation.rb 2019-02-25 16:29:17 +0900 (6cff112) @@ -0,0 +1,133 @@ +# Copyright (C) 2019 Kouhei Sutou <kou****@clear*****> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +class TestDecomposersOfficeOpenXMLPresentation < Test::Unit::TestCase + include Helper + + def setup + @decomposer = ChupaText::Decomposers::OfficeOpenXMLPresentation.new({}) + end + + def decompose(path) + data = ChupaText::InputData.new(path) + decomposed = [] + @decomposer.decompose(data) do |decomposed_data| + decomposed << decomposed_data + end + decomposed + end + + sub_test_case("#target_score") do + def test_extension + data = ChupaText::Data.new + data.body = "" + data.uri = "presentation.pptx" + assert_equal(-1, @decomposer.target_score(data)) + end + + def test_mime_type + data = ChupaText::Data.new + data.mime_type = "application/vnd.openxmlformats-officedocument.presentationml.presentation" + assert_equal(-1, @decomposer.target_score(data)) + end + end + + sub_test_case("#decompose") do + sub_test_case("attributes") do + def decompose(attribute_name) + super(fixture_path("pptx", "attributes.pptx")).collect do |data| + data[attribute_name] + end + end + + def test_title + assert_equal(["Title"], decompose("title")) + end + + def test_author + assert_equal([nil], decompose("author")) + end + + def test_subject + assert_equal(["Subject"], decompose("subject")) + end + + def test_keywords + assert_equal(["Keyword1 Keyword2"], decompose("keywords")) + end + + def test_modified_time + assert_equal([Time], + decompose("modified_time").collect(&:class)) + end + + def test_application + assert_equal(["LibreOffice"], + normalize_applications(decompose("application"))) + end + + def normalize_applications(applications) + applications.collect do |application| + normalize_application(application) + end + end + + def normalize_application(application) + if application.start_with?("LibreOffice") + "LibreOffice" + else + application + end + end + + def test_creation_date + assert_equal([nil], decompose("creation_date")) + end + end + + sub_test_case("one slide") do + def decompose + super(fixture_path("pptx", "one-slide.pptx")) + end + + def test_body + assert_equal([<<-BODY], decompose.collect(&:body)) +Slide1 title +Slide1 content + BODY + end + end + + sub_test_case("multi slides") do + def decompose + super(fixture_path("pptx", "multi-slides.pptx")) + end + + def test_body + assert_equal([<<-BODY], decompose.collect(&:body)) +Slide1 title +Slide1 content + +Slide2 title +Slide2 content + +Slide3 title +Slide3 content + BODY + end + end + end +end Added: test/fixture/pptx/attributes.pptx (+23 -0) 100644 =================================================================== --- /dev/null +++ test/fixture/pptx/attributes.pptx 2019-02-25 16:29:17 +0900 (db82ab6) @@ -0,0 +1,23 @@ +MIME-Version: 1.0 +mime-type: application/vnd.openxmlformats-officedocument.presentationml.presentation +uri: file:/tmp/mWit2k_attributes.pptx +path: /tmp/mWit2k_attributes.pptx +size: 20658 +Content-Type: multipart/mixed; boundary=086ab5ce9ffe83fe209b6535c62460ca64f2e0de + +--086ab5ce9ffe83fe209b6535c62460ca64f2e0de +mime-type: text/plain +uri: file:/tmp/mWit2k_attributes.txt +path: /tmp/mWit2k_attributes.txt +size: 6 +title: Title +created_time: 2019-02-25 07:29:39 UTC +source-mime-types: ["application/pdf", "application/vnd.openxmlformats-officedocument.presentationml.presentation"] +subject: Subject +keywords: Keyword1, Keyword2 +creator: Impress +producer: LibreOffice 5.2 + +Title + +--086ab5ce9ffe83fe209b6535c62460ca64f2e0de-- Added: test/fixture/pptx/multi-slides.pptx (+25 -0) 100644 =================================================================== --- /dev/null +++ test/fixture/pptx/multi-slides.pptx 2019-02-25 16:29:17 +0900 (ad7fc0b) @@ -0,0 +1,25 @@ +MIME-Version: 1.0 +mime-type: application/vnd.openxmlformats-officedocument.presentationml.presentation +uri: file:/tmp/x5sjhk_multi-slides.pptx +path: /tmp/x5sjhk_multi-slides.pptx +size: 64545 +Content-Type: multipart/mixed; boundary=e91334c544f9aa5cb9be01fa2efee72507e9c749 + +--e91334c544f9aa5cb9be01fa2efee72507e9c749 +mime-type: text/plain +uri: file:/tmp/x5sjhk_multi-slides.txt +path: /tmp/x5sjhk_multi-slides.txt +size: 84 +created_time: 2019-02-25 07:29:41 UTC +source-mime-types: ["application/pdf", "application/vnd.openxmlformats-officedocument.presentationml.presentation"] +creator: Impress +producer: LibreOffice 5.2 + +Slide1 title +Slide1 content +Slide2 title +Slide2 content +Slide3 title +Slide3 content + +--e91334c544f9aa5cb9be01fa2efee72507e9c749-- Added: test/fixture/pptx/one-slide.pptx (+21 -0) 100644 =================================================================== --- /dev/null +++ test/fixture/pptx/one-slide.pptx 2019-02-25 16:29:17 +0900 (8ad2c0a) @@ -0,0 +1,21 @@ +MIME-Version: 1.0 +mime-type: application/vnd.openxmlformats-officedocument.presentationml.presentation +uri: file:/tmp/BoZPzY_one-slide.pptx +path: /tmp/BoZPzY_one-slide.pptx +size: 32488 +Content-Type: multipart/mixed; boundary=cb1251febb85ffd23fef39b59757cac95cfa933f + +--cb1251febb85ffd23fef39b59757cac95cfa933f +mime-type: text/plain +uri: file:/tmp/BoZPzY_one-slide.txt +path: /tmp/BoZPzY_one-slide.txt +size: 28 +created_time: 2019-02-25 07:29:43 UTC +source-mime-types: ["application/pdf", "application/vnd.openxmlformats-officedocument.presentationml.presentation"] +creator: Impress +producer: LibreOffice 5.2 + +Slide1 title +Slide1 content + +--cb1251febb85ffd23fef39b59757cac95cfa933f-- -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190225/f7ab3d16/attachment-0001.html>