Kouhei Sutou 2019-02-25 17:40:51 +0900 (Mon, 25 Feb 2019) Revision: 4d52d452a778c70d133b0037e0a10254c6d6ffb2 https://github.com/ranguba/chupa-text/commit/4d52d452a778c70d133b0037e0a10254c6d6ffb2 Message: Add support for OpenDocument Text Added files: lib/chupa-text/decomposers/open-document.rb test/decomposers/test-open-document-text.rb test/fixture/odt/attributes.odt test/fixture/odt/multi-pages.odt test/fixture/odt/one-page.odt test/fixture/odt/special-characters.odt Added: lib/chupa-text/decomposers/open-document.rb (+193 -0) 100644 =================================================================== --- /dev/null +++ lib/chupa-text/decomposers/open-document.rb 2019-02-25 17:40:51 +0900 (159f9ba) @@ -0,0 +1,193 @@ +# Copyright (C) 2019 Kouhei Sutou <kou****@clear*****> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +require "cgi/util" +require "rexml/parsers/sax2parser" +require "rexml/sax2listener" + +require "archive/zip" + +module ChupaText + module Decomposers + class OpenDocument < Decomposer + registry.register("open-document", self) + + EXTENSIONS = [ + "odt", + "ods", + "odp", + ] + MIME_TYPES = [ + "application/vnd.oasis.opendocument.text", + "application/vnd.oasis.opendocument.spreadsheet", + "application/vnd.oasis.opendocument.presentation ", + ] + def target?(data) + EXTENSIONS.include?(data.extension) or + MIME_TYPES.include?(data.mime_type) + end + + def target_score(data) + if target?(data) + -1 + else + nil + end + end + + def decompose(data) + context = { + text: "", + attributes: {}, + } + data.open do |input| + Archive::Zip.open(input) do |zip| + zip.each do |entry| + next unless entry.file? 
+ case entry.zip_path + when "content.xml" + listener = TextListener.new(context[:text]) + parse(entry.file_data, listener) + when "meta.xml" + listener = AttributesListener.new(context[:attributes]) + parse(entry.file_data, listener) + end + end + end + end + text = context[:text] + text_data = TextData.new(text, source_data: data) + context[:attributes].each do |name, value| + text_data[name] = value + end + yield(text_data) + end + + private + def parse(io, listener) + source = REXML::Source.new(io.read) + parser = REXML::Parsers::SAX2Parser.new(source) + parser.listen(listener) + parser.parse + end + + class TextListener + include REXML::SAX2Listener + + TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0" + def initialize(output) + @output = output + @in_p = false + end + + def start_element(uri, local_name, qname, attributes) + return unless uri == TEXT_URI + case local_name + when "p" + @in_p = true + end + end + + def end_element(uri, local_name, qname) + return unless uri == TEXT_URI + case local_name + when "p" + @output << "\n" + end + end + + def characters(text) + add_text(text) + end + + def cdata(content) + add_text(content) + end + + private + def add_text(text) + return unless @in_p + @output << CGI.unescapeHTML(text) + end + end + + class AttributesListener + include REXML::SAX2Listener + + META_URI = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0" + DUBLIN_CORE_URI = "http://purl.org/dc/elements/1.1/" + + def initialize(attributes) + @attributes = attributes + @name = nil + @type = nil + end + + def start_element(uri, local_name, qname, attributes) + case uri + when META_URI + case local_name + when "creation-date" + @name = "created_time" + @type = :w3cdtf + when "keyword" + @name = "keywords" + @type = :array + when "generator" + @name = local_name + end + when DUBLIN_CORE_URI + case local_name + when "date" + @name = "modified_time" + @type = :w3cdtf + when "description", "title", "subject" + @name = local_name + end + end + end + + 
def end_element(uri, local_name, qname) + @name = nil + @type = nil + end + + def characters(text) + set_attribute(text) + end + + def cdata(content) + set_attribute(content) + end + + def set_attribute(value) + return if @name.nil? + + value = CGI.unescapeHTML(value) + case @type + when :w3cdtf + value = Time.xmlschema(value) + when :array + values = @attributes[@name] || [] + values << value + value = values + end + @attributes[@name] = value + end + end + end + end +end Added: test/decomposers/test-open-document-text.rb (+144 -0) 100644 =================================================================== --- /dev/null +++ test/decomposers/test-open-document-text.rb 2019-02-25 17:40:51 +0900 (783d641) @@ -0,0 +1,144 @@ +# Copyright (C) 2019 Kouhei Sutou <kou****@clear*****> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +class TestDecomposersOpenDocumentText < Test::Unit::TestCase + include Helper + + def setup + @decomposer = ChupaText::Decomposers::OpenDocument.new({}) + end + + def decompose(path) + data = ChupaText::InputData.new(path) + decomposed = [] + @decomposer.decompose(data) do |decomposed_data| + decomposed << decomposed_data + end + decomposed + end + + sub_test_case("#target_score") do + def test_extension + data = ChupaText::Data.new + data.body = "" + data.uri = "document.odt" + assert_equal(-1, @decomposer.target_score(data)) + end + + def test_mime_type + data = ChupaText::Data.new + data.mime_type = "application/vnd.oasis.opendocument.text" + assert_equal(-1, @decomposer.target_score(data)) + end + end + + sub_test_case("#decompose") do + sub_test_case("attributes") do + def decompose(attribute_name) + super(fixture_path("odt", "attributes.odt")).collect do |data| + data[attribute_name] + end + end + + def test_title + assert_equal(["Title"], decompose("title")) + end + + def test_author + assert_equal([nil], decompose("author")) + end + + def test_subject + assert_equal(["Subject"], decompose("subject")) + end + + def test_keywords + assert_equal([["Keyword1", "Keyword2"]], decompose("keywords")) + end + + def test_created_time + assert_equal([Time], + decompose("created_time").collect(&:class)) + end + + def test_modified_time + assert_equal([Time], + decompose("modified_time").collect(&:class)) + end + + def test_generator + assert_equal(["LibreOffice"], + normalize_generators(decompose("generator"))) + end + + def normalize_generators(generators) + generators.collect do |generator| + normalize_generator(generator) + end + end + + def normalize_generator(generator) + if generator.start_with?("LibreOffice") + "LibreOffice" + else + 
generator + end + end + + def test_creation_date + assert_equal([nil], decompose("creation_date")) + end + end + + sub_test_case("one page") do + def decompose + super(fixture_path("odt", "one-page.odt")) + end + + def test_body + assert_equal(["Page1\n"], decompose.collect(&:body)) + end + end + + sub_test_case("multi pages") do + def decompose + super(fixture_path("odt", "multi-pages.odt")) + end + + def test_body + assert_equal([<<-BODY], decompose.collect(&:body)) +Page1 +Page2 + BODY + end + end + + sub_test_case("special characters") do + def decompose + super(fixture_path("odt", "special-characters.odt")) + end + + def test_body + assert_equal([<<-BODY], decompose.collect(&:body)) +Ampersand: & +Reference: &amp; +HTML: <a href=""> +Single quote: '' + BODY + end + end + end +end Added: test/fixture/odt/attributes.odt (+23 -0) 100644 =================================================================== --- /dev/null +++ test/fixture/odt/attributes.odt 2019-02-25 17:40:51 +0900 (e556230) @@ -0,0 +1,23 @@ +MIME-Version: 1.0 +mime-type: application/vnd.oasis.opendocument.text +uri: file:/tmp/0N7PzU_attributes.odt +path: /tmp/0N7PzU_attributes.odt +size: 7762 +Content-Type: multipart/mixed; boundary=384aaebcbc70489e8c7503dfd50e812836f1e698 + +--384aaebcbc70489e8c7503dfd50e812836f1e698 +mime-type: text/plain +uri: file:/tmp/0N7PzU_attributes.txt +path: /tmp/0N7PzU_attributes.txt +size: 6 +title: Title +created_time: 2019-02-25 08:41:15 UTC +source-mime-types: ["application/pdf", "application/vnd.oasis.opendocument.text"] +subject: Subject +keywords: Keyword1, Keyword2 +creator: Writer +producer: LibreOffice 5.2 + +Page1 + +--384aaebcbc70489e8c7503dfd50e812836f1e698-- Added: test/fixture/odt/multi-pages.odt (+21 -0) 100644 =================================================================== --- /dev/null +++ test/fixture/odt/multi-pages.odt 2019-02-25 17:40:51 +0900 (c21c3ca) @@ -0,0 +1,21 @@ +MIME-Version: 1.0 +mime-type: application/vnd.oasis.opendocument.text +uri: 
file:/tmp/eoQbPu_multi-pages.odt +path: /tmp/eoQbPu_multi-pages.odt +size: 7874 +Content-Type: multipart/mixed; boundary=07489dcf83e35fb759ebd449b94d977c2f3f1dbf + +--07489dcf83e35fb759ebd449b94d977c2f3f1dbf +mime-type: text/plain +uri: file:/tmp/eoQbPu_multi-pages.txt +path: /tmp/eoQbPu_multi-pages.txt +size: 12 +created_time: 2019-02-25 08:41:16 UTC +source-mime-types: ["application/pdf", "application/vnd.oasis.opendocument.text"] +creator: Writer +producer: LibreOffice 5.2 + +Page1 +Page2 + +--07489dcf83e35fb759ebd449b94d977c2f3f1dbf-- Added: test/fixture/odt/one-page.odt (+20 -0) 100644 =================================================================== --- /dev/null +++ test/fixture/odt/one-page.odt 2019-02-25 17:40:51 +0900 (a77f44e) @@ -0,0 +1,20 @@ +MIME-Version: 1.0 +mime-type: application/vnd.oasis.opendocument.text +uri: file:/tmp/jpjMq8_one-page.odt +path: /tmp/jpjMq8_one-page.odt +size: 7662 +Content-Type: multipart/mixed; boundary=4daaf4b0cb28b43604ee9fec3f2531052215d746 + +--4daaf4b0cb28b43604ee9fec3f2531052215d746 +mime-type: text/plain +uri: file:/tmp/jpjMq8_one-page.txt +path: /tmp/jpjMq8_one-page.txt +size: 6 +created_time: 2019-02-25 08:41:18 UTC +source-mime-types: ["application/pdf", "application/vnd.oasis.opendocument.text"] +creator: Writer +producer: LibreOffice 5.2 + +Page1 + +--4daaf4b0cb28b43604ee9fec3f2531052215d746-- Added: test/fixture/odt/special-characters.odt (+23 -0) 100644 =================================================================== --- /dev/null +++ test/fixture/odt/special-characters.odt 2019-02-25 17:40:51 +0900 (7b3340f) @@ -0,0 +1,23 @@ +MIME-Version: 1.0 +mime-type: application/vnd.oasis.opendocument.text +uri: file:/tmp/hJf9dE_special-characters.odt +path: /tmp/hJf9dE_special-characters.odt +size: 8466 +Content-Type: multipart/mixed; boundary=595251b39da4e9185b880ff96df7a62f51873ed1 + +--595251b39da4e9185b880ff96df7a62f51873ed1 +mime-type: text/plain +uri: file:/tmp/hJf9dE_special-characters.txt +path: 
/tmp/hJf9dE_special-characters.txt +size: 65 +created_time: 2019-02-25 08:41:19 UTC +source-mime-types: ["application/pdf", "application/vnd.oasis.opendocument.text"] +creator: Writer +producer: LibreOffice 5.2 + +Ampersand: & +Reference: &amp; +HTML: <a href=""> +Single quote: '' + +--595251b39da4e9185b880ff96df7a62f51873ed1-- -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190225/533b15cf/attachment-0001.html>