[Groonga-commit] ranguba/chupa-text at 4d52d45 [master] Add support for OpenDocument Text

Back to archive index
Kouhei Sutou null+****@clear*****
Mon Feb 25 17:40:51 JST 2019


Kouhei Sutou	2019-02-25 17:40:51 +0900 (Mon, 25 Feb 2019)

  Revision: 4d52d452a778c70d133b0037e0a10254c6d6ffb2
  https://github.com/ranguba/chupa-text/commit/4d52d452a778c70d133b0037e0a10254c6d6ffb2

  Message:
    Add support for OpenDocument Text

  Added files:
    lib/chupa-text/decomposers/open-document.rb
    test/decomposers/test-open-document-text.rb
    test/fixture/odt/attributes.odt
    test/fixture/odt/multi-pages.odt
    test/fixture/odt/one-page.odt
    test/fixture/odt/special-characters.odt

  Added: lib/chupa-text/decomposers/open-document.rb (+193 -0) 100644
===================================================================
--- /dev/null
+++ lib/chupa-text/decomposers/open-document.rb    2019-02-25 17:40:51 +0900 (159f9ba)
@@ -0,0 +1,193 @@
+# Copyright (C) 2019  Kouhei Sutou <kou****@clear*****>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+require "cgi/util"
+require "rexml/parsers/sax2parser"
+require "rexml/sax2listener"
+
+require "archive/zip"
+
+module ChupaText
+  module Decomposers
+    class OpenDocument < Decomposer
+      registry.register("open-document", self)
+
+      EXTENSIONS = [
+        "odt",
+        "ods",
+        "odp",
+      ]
+      MIME_TYPES = [
+        "application/vnd.oasis.opendocument.text",
+        "application/vnd.oasis.opendocument.spreadsheet",
+        "application/vnd.oasis.opendocument.presentation ",
+      ]
+      def target?(data)
+        EXTENSIONS.include?(data.extension) or
+          MIME_TYPES.include?(data.mime_type)
+      end
+
+      def target_score(data)
+        if target?(data)
+          -1
+        else
+          nil
+        end
+      end
+
+      def decompose(data)
+        context = {
+          text: "",
+          attributes: {},
+        }
+        data.open do |input|
+          Archive::Zip.open(input) do |zip|
+            zip.each do |entry|
+              next unless entry.file?
+              case entry.zip_path
+              when "content.xml"
+                listener = TextListener.new(context[:text])
+                parse(entry.file_data, listener)
+              when "meta.xml"
+                listener = AttributesListener.new(context[:attributes])
+                parse(entry.file_data, listener)
+              end
+            end
+          end
+        end
+        text = context[:text]
+        text_data = TextData.new(text, source_data: data)
+        context[:attributes].each do |name, value|
+          text_data[name] = value
+        end
+        yield(text_data)
+      end
+
+      private
+      def parse(io, listener)
+        source = REXML::Source.new(io.read)
+        parser = REXML::Parsers::SAX2Parser.new(source)
+        parser.listen(listener)
+        parser.parse
+      end
+
+      class TextListener
+        include REXML::SAX2Listener
+
+        TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
+        def initialize(output)
+          @output = output
+          @in_p = false
+        end
+
+        def start_element(uri, local_name, qname, attributes)
+          return unless uri == TEXT_URI
+          case local_name
+          when "p"
+            @in_p = true
+          end
+        end
+
+        def end_element(uri, local_name, qname)
+          return unless uri == TEXT_URI
+          case local_name
+          when "p"
+            @output << "\n"
+          end
+        end
+
+        def characters(text)
+          add_text(text)
+        end
+
+        def cdata(content)
+          add_text(content)
+        end
+
+        private
+        def add_text(text)
+          return unless @in_p
+          @output << CGI.unescapeHTML(text)
+        end
+      end
+
+      class AttributesListener
+        include REXML::SAX2Listener
+
+        META_URI = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
+        DUBLIN_CORE_URI = "http://purl.org/dc/elements/1.1/"
+
+        def initialize(attributes)
+          @attributes = attributes
+          @name = nil
+          @type = nil
+        end
+
+        def start_element(uri, local_name, qname, attributes)
+          case uri
+          when META_URI
+            case local_name
+            when "creation-date"
+              @name = "created_time"
+              @type = :w3cdtf
+            when "keyword"
+              @name = "keywords"
+              @type = :array
+            when "generator"
+              @name = local_name
+            end
+          when DUBLIN_CORE_URI
+            case local_name
+            when "date"
+              @name = "modified_time"
+              @type = :w3cdtf
+            when "description", "title", "subject"
+              @name = local_name
+            end
+          end
+        end
+
+        def end_element(uri, local_name, qname)
+          @name = nil
+          @type = nil
+        end
+
+        def characters(text)
+          set_attribute(text)
+        end
+
+        def cdata(content)
+          set_attribute(content)
+        end
+
+        def set_attribute(value)
+          return if @name.nil?
+
+          value = CGI.unescapeHTML(value)
+          case @type
+          when :w3cdtf
+            value = Time.xmlschema(value)
+          when :array
+            values = @attributes[@name] || []
+            values << value
+            value = values
+          end
+          @attributes[@name] = value
+        end
+      end
+    end
+  end
+end

  Added: test/decomposers/test-open-document-text.rb (+144 -0) 100644
===================================================================
--- /dev/null
+++ test/decomposers/test-open-document-text.rb    2019-02-25 17:40:51 +0900 (783d641)
@@ -0,0 +1,144 @@
+# Copyright (C) 2019  Kouhei Sutou <kou****@clear*****>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+class TestDecomposersOpenDocumentText < Test::Unit::TestCase
+  include Helper
+
+  def setup
+    @decomposer = ChupaText::Decomposers::OpenDocument.new({})
+  end
+
+  def decompose(path)
+    data = ChupaText::InputData.new(path)
+    decomposed = []
+    @decomposer.decompose(data) do |decomposed_data|
+      decomposed << decomposed_data
+    end
+    decomposed
+  end
+
+  sub_test_case("#target_score") do
+    def test_extension
+      data = ChupaText::Data.new
+      data.body = ""
+      data.uri = "document.odt"
+      assert_equal(-1, @decomposer.target_score(data))
+    end
+
+    def test_mime_type
+      data = ChupaText::Data.new
+      data.mime_type = "application/vnd.oasis.opendocument.text"
+      assert_equal(-1, @decomposer.target_score(data))
+    end
+  end
+
+  sub_test_case("#decompose") do
+    sub_test_case("attributes") do
+      def decompose(attribute_name)
+        super(fixture_path("odt", "attributes.odt")).collect do |data|
+          data[attribute_name]
+        end
+      end
+
+      def test_title
+        assert_equal(["Title"], decompose("title"))
+      end
+
+      def test_author
+        assert_equal([nil], decompose("author"))
+      end
+
+      def test_subject
+        assert_equal(["Subject"], decompose("subject"))
+      end
+
+      def test_keywords
+        assert_equal([["Keyword1", "Keyword2"]], decompose("keywords"))
+      end
+
+      def test_created_time
+        assert_equal([Time],
+                     decompose("created_time").collect(&:class))
+      end
+
+      def test_modified_time
+        assert_equal([Time],
+                     decompose("modified_time").collect(&:class))
+      end
+
+      def test_generator
+        assert_equal(["LibreOffice"],
+                     normalize_generators(decompose("generator")))
+      end
+
+      def normalize_generators(generators)
+        generators.collect do |generator|
+          normalize_generator(generator)
+        end
+      end
+
+      def normalize_generator(generator)
+        if generator.start_with?("LibreOffice")
+          "LibreOffice"
+        else
+          generator
+        end
+      end
+
+      def test_creation_date
+        assert_equal([nil], decompose("creation_date"))
+      end
+    end
+
+    sub_test_case("one page") do
+      def decompose
+        super(fixture_path("odt", "one-page.odt"))
+      end
+
+      def test_body
+        assert_equal(["Page1\n"], decompose.collect(&:body))
+      end
+    end
+
+    sub_test_case("multi pages") do
+      def decompose
+        super(fixture_path("odt", "multi-pages.odt"))
+      end
+
+      def test_body
+        assert_equal([<<-BODY], decompose.collect(&:body))
+Page1
+Page2
+        BODY
+      end
+    end
+
+    sub_test_case("special characters") do
+      def decompose
+        super(fixture_path("odt", "special-characters.odt"))
+      end
+
+      def test_body
+        assert_equal([<<-BODY], decompose.collect(&:body))
+Ampersand: &
+Reference: &
+HTML: <a href="">
+Single quote: ''
+        BODY
+      end
+    end
+  end
+end

  Added: test/fixture/odt/attributes.odt (+23 -0) 100644
===================================================================
--- /dev/null
+++ test/fixture/odt/attributes.odt    2019-02-25 17:40:51 +0900 (e556230)
@@ -0,0 +1,23 @@
+MIME-Version: 1.0
+mime-type: application/vnd.oasis.opendocument.text
+uri: file:/tmp/0N7PzU_attributes.odt
+path: /tmp/0N7PzU_attributes.odt
+size: 7762
+Content-Type: multipart/mixed; boundary=384aaebcbc70489e8c7503dfd50e812836f1e698
+
+--384aaebcbc70489e8c7503dfd50e812836f1e698
+mime-type: text/plain
+uri: file:/tmp/0N7PzU_attributes.txt
+path: /tmp/0N7PzU_attributes.txt
+size: 6
+title: Title
+created_time: 2019-02-25 08:41:15 UTC
+source-mime-types: ["application/pdf", "application/vnd.oasis.opendocument.text"]
+subject: Subject
+keywords: Keyword1, Keyword2
+creator: Writer
+producer: LibreOffice 5.2
+
+Page1
+
+--384aaebcbc70489e8c7503dfd50e812836f1e698--

  Added: test/fixture/odt/multi-pages.odt (+21 -0) 100644
===================================================================
--- /dev/null
+++ test/fixture/odt/multi-pages.odt    2019-02-25 17:40:51 +0900 (c21c3ca)
@@ -0,0 +1,21 @@
+MIME-Version: 1.0
+mime-type: application/vnd.oasis.opendocument.text
+uri: file:/tmp/eoQbPu_multi-pages.odt
+path: /tmp/eoQbPu_multi-pages.odt
+size: 7874
+Content-Type: multipart/mixed; boundary=07489dcf83e35fb759ebd449b94d977c2f3f1dbf
+
+--07489dcf83e35fb759ebd449b94d977c2f3f1dbf
+mime-type: text/plain
+uri: file:/tmp/eoQbPu_multi-pages.txt
+path: /tmp/eoQbPu_multi-pages.txt
+size: 12
+created_time: 2019-02-25 08:41:16 UTC
+source-mime-types: ["application/pdf", "application/vnd.oasis.opendocument.text"]
+creator: Writer
+producer: LibreOffice 5.2
+
+Page1
+Page2
+
+--07489dcf83e35fb759ebd449b94d977c2f3f1dbf--

  Added: test/fixture/odt/one-page.odt (+20 -0) 100644
===================================================================
--- /dev/null
+++ test/fixture/odt/one-page.odt    2019-02-25 17:40:51 +0900 (a77f44e)
@@ -0,0 +1,20 @@
+MIME-Version: 1.0
+mime-type: application/vnd.oasis.opendocument.text
+uri: file:/tmp/jpjMq8_one-page.odt
+path: /tmp/jpjMq8_one-page.odt
+size: 7662
+Content-Type: multipart/mixed; boundary=4daaf4b0cb28b43604ee9fec3f2531052215d746
+
+--4daaf4b0cb28b43604ee9fec3f2531052215d746
+mime-type: text/plain
+uri: file:/tmp/jpjMq8_one-page.txt
+path: /tmp/jpjMq8_one-page.txt
+size: 6
+created_time: 2019-02-25 08:41:18 UTC
+source-mime-types: ["application/pdf", "application/vnd.oasis.opendocument.text"]
+creator: Writer
+producer: LibreOffice 5.2
+
+Page1
+
+--4daaf4b0cb28b43604ee9fec3f2531052215d746--

  Added: test/fixture/odt/special-characters.odt (+23 -0) 100644
===================================================================
--- /dev/null
+++ test/fixture/odt/special-characters.odt    2019-02-25 17:40:51 +0900 (7b3340f)
@@ -0,0 +1,23 @@
+MIME-Version: 1.0
+mime-type: application/vnd.oasis.opendocument.text
+uri: file:/tmp/hJf9dE_special-characters.odt
+path: /tmp/hJf9dE_special-characters.odt
+size: 8466
+Content-Type: multipart/mixed; boundary=595251b39da4e9185b880ff96df7a62f51873ed1
+
+--595251b39da4e9185b880ff96df7a62f51873ed1
+mime-type: text/plain
+uri: file:/tmp/hJf9dE_special-characters.txt
+path: /tmp/hJf9dE_special-characters.txt
+size: 65
+created_time: 2019-02-25 08:41:19 UTC
+source-mime-types: ["application/pdf", "application/vnd.oasis.opendocument.text"]
+creator: Writer
+producer: LibreOffice 5.2
+
+Ampersand: &
+Reference: &
+HTML: <a href="">
+Single quote: ''
+
+--595251b39da4e9185b880ff96df7a62f51873ed1--
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190225/533b15cf/attachment-0001.html>


More information about the Groonga-commit mailing list
Back to archive index