Kouhei Sutou 2019-02-26 06:36:00 +0900 (Tue, 26 Feb 2019) Revision: 9deefbe9df42b18d7406338168d1efb3add9b263 https://github.com/ranguba/chupa-text/commit/9deefbe9df42b18d7406338168d1efb3add9b263 Message: Use one data per page Added files: lib/chupa-text/decomposers/open-document-presentation.rb Modified files: lib/chupa-text/decomposers/open-document.rb test/decomposers/test-open-document-presentation.rb Added: lib/chupa-text/decomposers/open-document-presentation.rb (+135 -0) 100644 =================================================================== --- /dev/null +++ lib/chupa-text/decomposers/open-document-presentation.rb 2019-02-26 06:36:00 +0900 (0a7cf94) @@ -0,0 +1,135 @@ +# Copyright (C) 2019 Kouhei Sutou <kou****@clear*****> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +require "chupa-text/decomposers/open-document" + +module ChupaText + module Decomposers + class OpenDocumentPresentation < OpenDocument + registry.register("open-document-presentation", self) + + def initialize(options={}) + super + @extension = "odp" + @mime_type = "application/vnd.oasis.opendocument.presentation" + end + + def target?(data) + data.extension == @extension or + data.mime_type == @mime_type + end + + def target_score(data) + if target?(data) + -1 + else + nil + end + end + + def decompose(data) + slides = [] + data.open do |input| + Archive::Zip.open(input) do |zip| + zip.each do |entry| + next unless entry.file? + case entry.zip_path + when "content.xml" + listener = SlidesListener.new(slides) + parse(entry.file_data, listener) + when "meta.xml" + attributes = {} + listener = AttributesListener.new(attributes) + parse(entry.file_data, listener) + metadata = TextData.new("", source_data: data) + attributes.each do |name, value| + metadata[name] = value + end + yield(metadata) + end + end + end + end + slides.each_with_index do |slide, i| + text = slide[:text] + text_data = TextData.new(text, source_data: data) + text_data["nth_slide"] = i + yield(text_data) + end + end + + private + def parse(io, listener) + source = REXML::Source.new(io.read) + parser = REXML::Parsers::SAX2Parser.new(source) + parser.listen(listener) + parser.parse + end + + class SlidesListener + include REXML::SAX2Listener + + TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0" + DRAW_URI = "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0" + + def initialize(slides) + @slides = slides + @in_p = false + end + + def start_element(uri, local_name, qname, attributes) + case uri + when TEXT_URI + case local_name + when "p" + @in_p = true + end + when DRAW_URI + case local_name + when "page" + @slides << {text: ""} + end + end + end + + def end_element(uri, local_name, qname) + @in_p = false + case uri + when TEXT_URI + case local_name + when "p" + @slides.last[:text] << "\n" + end + end + end + + def characters(text) + add_text(text) + end + + def cdata(content) + add_text(content) + end + + private + def add_text(text) + return unless @in_p + @slides.last[:text] << CGI.unescapeHTML(text) + end + end + end + end +end Modified: lib/chupa-text/decomposers/open-document.rb (+2 -2) =================================================================== --- lib/chupa-text/decomposers/open-document.rb 2019-02-25 17:54:38 +0900 (400024c) +++ lib/chupa-text/decomposers/open-document.rb 2019-02-26 06:36:00 +0900 (294a2b8) @@ -28,12 +28,10 @@ module ChupaText EXTENSIONS = [ "odt", "ods", - "odp", ] MIME_TYPES = [ "application/vnd.oasis.opendocument.text", "application/vnd.oasis.opendocument.spreadsheet", - "application/vnd.oasis.opendocument.presentation", ] def target?(data) EXTENSIONS.include?(data.extension) or @@ -102,6 +100,8 @@ module ChupaText end def end_element(uri, local_name, qname) + @in_p = false + return unless uri == TEXT_URI case local_name when "p" Modified: test/decomposers/test-open-document-presentation.rb (+27 -41) =================================================================== --- test/decomposers/test-open-document-presentation.rb 2019-02-25 17:54:38 +0900 (3ae7819) +++ test/decomposers/test-open-document-presentation.rb 2019-02-26 06:36:00 +0900 (70e6b9e) @@ -18,7 +18,7 @@ class TestDecomposersOpenDocumentPresentation < Test::Unit::TestCase include Helper def setup - @decomposer = ChupaText::Decomposers::OpenDocument.new({}) + @decomposer = ChupaText::Decomposers::OpenDocumentPresentation.new({}) end def decompose(path) @@ -48,46 +48,34 @@ class TestDecomposersOpenDocumentPresentation < Test::Unit::TestCase sub_test_case("#decompose") do sub_test_case("attributes") do def decompose(attribute_name) - super(fixture_path("odp", "attributes.odp")).collect do |data| - data[attribute_name] - end + super(fixture_path("odp", "attributes.odp")).first[attribute_name] end def test_title - assert_equal(["Title"], decompose("title")) - end - - def test_author - assert_equal([nil], decompose("author")) + assert_equal("Title", decompose("title")) end def test_subject - assert_equal(["Subject"], decompose("subject")) + assert_equal("Subject", decompose("subject")) end def test_keywords - assert_equal([["Keyword1", "Keyword2"]], decompose("keywords")) + assert_equal(["Keyword1", "Keyword2"], decompose("keywords")) end def test_created_time - assert_equal([Time], - decompose("created_time").collect(&:class)) + assert_equal(Time, + decompose("created_time").class) end def test_modified_time - assert_equal([Time], - decompose("modified_time").collect(&:class)) + assert_equal(Time, + decompose("modified_time").class) end def test_generator - assert_equal(["LibreOffice"], - normalize_generators(decompose("generator"))) - end - - def normalize_generators(generators) - generators.collect do |generator| - normalize_generator(generator) - end + assert_equal("LibreOffice", + normalize_generator(decompose("generator"))) end def normalize_generator(generator) @@ -97,10 +85,6 @@ class TestDecomposersOpenDocumentPresentation < Test::Unit::TestCase generator end end - - def test_creation_date - assert_equal([nil], decompose("creation_date")) - end end sub_test_case("one slide") do @@ -109,10 +93,12 @@ class TestDecomposersOpenDocumentPresentation < Test::Unit::TestCase end def test_body - assert_equal([<<-BODY], decompose.collect(&:body)) -Slide1 title -Slide1 content - BODY + assert_equal([ + "", + "Slide1 title\n" + + "Slide1 content\n", + ], + decompose.collect(&:body)) end end @@ -122,16 +108,16 @@ Slide1 content end def test_body - assert_equal([<<-BODY], decompose.collect(&:body)) -Slide1 title -Slide1 content - -Slide2 title -Slide2 content - -Slide3 title -Slide3 content - BODY + assert_equal([ + "", + "Slide1 title\n" + + "Slide1 content\n", + "Slide2 title\n" + + "Slide2 content\n", + "Slide3 title\n" + + "Slide3 content\n", + ], + decompose.collect(&:body)) end end end -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190226/078e049e/attachment-0001.html>