[Groonga-commit] ranguba/chupa-text at 6fb7ae7 [master] Add support for Nokogiri as an alternative SAX parser backend

Back to archive index
Kouhei Sutou null+****@clear*****
Thu Feb 28 11:17:15 JST 2019


Kouhei Sutou	2019-02-28 11:17:15 +0900 (Thu, 28 Feb 2019)

  Revision: 6fb7ae78165a2c3ea7147c5f47287dd32c3f8a21
  https://github.com/ranguba/chupa-text/commit/6fb7ae78165a2c3ea7147c5f47287dd32c3f8a21

  Message:
    Add support for Nokogiri as an alternative SAX parser backend
    
    Because REXML is too slow... We need to improve REXML performance.

  Added files:
    lib/chupa-text/sax-parser.rb
  Modified files:
    .travis.yml
    chupa-text.gemspec
    lib/chupa-text/decomposers/office-open-xml-workbook.rb
    lib/chupa-text/decomposers/office-open-xml.rb
    lib/chupa-text/decomposers/opendocument-presentation.rb
    lib/chupa-text/decomposers/opendocument-spreadsheet.rb
    lib/chupa-text/decomposers/opendocument-text.rb
    lib/chupa-text/decomposers/opendocument.rb

  Modified: .travis.yml (+3 -0)
===================================================================
--- .travis.yml    2019-02-26 15:34:22 +0900 (4c5d25c)
+++ .travis.yml    2019-02-28 11:17:15 +0900 (2175350)
@@ -7,3 +7,6 @@ rvm:
   - 2.5
   - 2.6
   - ruby-head
+script:
+  - bundle exec rake
+  - CHUPA_TEXT_SAX_PARSER_BACKEND=rexml bundle exec rake

  Modified: chupa-text.gemspec (+3 -2)
===================================================================
--- chupa-text.gemspec    2019-02-26 15:34:22 +0900 (d3da753)
+++ chupa-text.gemspec    2019-02-28 11:17:15 +0900 (d5b5d9f)
@@ -53,8 +53,9 @@ Gem::Specification.new do |spec|
   spec.add_runtime_dependency("archive-zip")
 
   spec.add_development_dependency("bundler")
-  spec.add_development_dependency("rake")
-  spec.add_development_dependency("test-unit")
+  spec.add_development_dependency("nokogiri")
   spec.add_development_dependency("packnga")
+  spec.add_development_dependency("rake")
   spec.add_development_dependency("redcarpet")
+  spec.add_development_dependency("test-unit")
 end

  Modified: lib/chupa-text/decomposers/office-open-xml-workbook.rb (+1 -3)
===================================================================
--- lib/chupa-text/decomposers/office-open-xml-workbook.rb    2019-02-26 15:34:22 +0900 (a75d351)
+++ lib/chupa-text/decomposers/office-open-xml-workbook.rb    2019-02-28 11:17:15 +0900 (4c545e9)
@@ -71,9 +71,7 @@ module ChupaText
         sheet_texts.join("\n")
       end
 
-      class SheetListener
-        include REXML::SAX2Listener
-
+      class SheetListener < SAXListener
         URI = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
 
         def initialize(sheet)

  Modified: lib/chupa-text/decomposers/office-open-xml.rb (+7 -15)
===================================================================
--- lib/chupa-text/decomposers/office-open-xml.rb    2019-02-26 15:34:22 +0900 (a891383)
+++ lib/chupa-text/decomposers/office-open-xml.rb    2019-02-28 11:17:15 +0900 (f2e2483)
@@ -14,12 +14,10 @@
 # License along with this library; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 
-require "cgi/util"
-require "rexml/parsers/sax2parser"
-require "rexml/sax2listener"
-
 require "archive/zip"
 
+require "chupa-text/sax-parser"
+
 module ChupaText
   module Decomposers
     class OfficeOpenXML < Decomposer
@@ -67,10 +65,8 @@ module ChupaText
       end
 
       private
-      def parse(io, listener)
-        source = REXML::Source.new(io.read)
-        parser = REXML::Parsers::SAX2Parser.new(source)
-        parser.listen(listener)
+      def parse(input, listener)
+        parser = SAXParser.new(input, listener)
         parser.parse
       end
 
@@ -83,9 +79,7 @@ module ChupaText
         context[:text]
       end
 
-      class TextListener
-        include REXML::SAX2Listener
-
+      class TextListener < SAXListener
         def initialize(output, target_uri)
           @output = output
           @target_uri = target_uri
@@ -121,13 +115,11 @@ module ChupaText
         private
         def add_text(text)
           return unless @in_target
-          @output << CGI.unescapeHTML(text)
+          @output << text
         end
       end
 
-      class AttributesListener
-        include REXML::SAX2Listener
-
+      class AttributesListener < SAXListener
         CORE_PROPERTIES_URI =
           "http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
         EXTENDED_PROPERTIES_URI =

  Modified: lib/chupa-text/decomposers/opendocument-presentation.rb (+2 -4)
===================================================================
--- lib/chupa-text/decomposers/opendocument-presentation.rb    2019-02-26 15:34:22 +0900 (043f589)
+++ lib/chupa-text/decomposers/opendocument-presentation.rb    2019-02-28 11:17:15 +0900 (bd2e595)
@@ -49,9 +49,7 @@ module ChupaText
         end
       end
 
-      class SlidesListener
-        include REXML::SAX2Listener
-
+      class SlidesListener < SAXListener
         TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
         DRAW_URI = "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0"
 
@@ -97,7 +95,7 @@ module ChupaText
         private
         def add_text(text)
           return unless @in_p
-          @slides.last[:text] << CGI.unescapeHTML(text)
+          @slides.last[:text] << text
         end
       end
     end

  Modified: lib/chupa-text/decomposers/opendocument-spreadsheet.rb (+2 -4)
===================================================================
--- lib/chupa-text/decomposers/opendocument-spreadsheet.rb    2019-02-26 15:34:22 +0900 (39c9bf3)
+++ lib/chupa-text/decomposers/opendocument-spreadsheet.rb    2019-02-28 11:17:15 +0900 (c421ffb)
@@ -51,9 +51,7 @@ module ChupaText
         end
       end
 
-      class SheetsListener
-        include REXML::SAX2Listener
-
+      class SheetsListener < SAXListener
         TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
         TABLE_URI = "urn:oasis:names:tc:opendocument:xmlns:table:1.0"
 
@@ -126,7 +124,7 @@ module ChupaText
         private
         def add_text(text)
           return unless @in_p
-          @sheets.last[:rows].last.last[:text] << CGI.unescapeHTML(text)
+          @sheets.last[:rows].last.last[:text] << text
         end
       end
     end

  Modified: lib/chupa-text/decomposers/opendocument-text.rb (+2 -4)
===================================================================
--- lib/chupa-text/decomposers/opendocument-text.rb    2019-02-26 15:34:22 +0900 (2a862ee)
+++ lib/chupa-text/decomposers/opendocument-text.rb    2019-02-28 11:17:15 +0900 (0cc8aa8)
@@ -43,9 +43,7 @@ module ChupaText
         yield(text_data)
       end
 
-      class TextListener
-        include REXML::SAX2Listener
-
+      class TextListener < SAXListener
         TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
         def initialize(output)
           @output = output
@@ -81,7 +79,7 @@ module ChupaText
         private
         def add_text(text)
           return unless @in_p
-          @output << CGI.unescapeHTML(text)
+          @output << text
         end
       end
     end

  Modified: lib/chupa-text/decomposers/opendocument.rb (+5 -12)
===================================================================
--- lib/chupa-text/decomposers/opendocument.rb    2019-02-26 15:34:22 +0900 (8aa5ddb)
+++ lib/chupa-text/decomposers/opendocument.rb    2019-02-28 11:17:15 +0900 (9a2ef14)
@@ -14,12 +14,10 @@
 # License along with this library; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 
-require "cgi/util"
-require "rexml/parsers/sax2parser"
-require "rexml/sax2listener"
-
 require "archive/zip"
 
+require "chupa-text/sax-parser"
+
 module ChupaText
   module Decomposers
     class OpenDocument < Decomposer
@@ -58,10 +56,8 @@ module ChupaText
       end
 
       private
-      def parse(io, listener)
-        source = REXML::Source.new(io.read)
-        parser = REXML::Parsers::SAX2Parser.new(source)
-        parser.listen(listener)
+      def parse(input, listener)
+        parser = SAXParser.new(input, listener)
         parser.parse
       end
 
@@ -70,9 +66,7 @@ module ChupaText
         parse(entry.file_data, listener)
       end
 
-      class AttributesListener
-        include REXML::SAX2Listener
-
+      class AttributesListener < SAXListener
         META_URI = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
         DUBLIN_CORE_URI = "http://purl.org/dc/elements/1.1/"
 
@@ -122,7 +116,6 @@ module ChupaText
         def set_attribute(value)
           return if @name.nil?
 
-          value = CGI.unescapeHTML(value)
           case @type
           when :w3cdtf
             value = Time.xmlschema(value)

  Added: lib/chupa-text/sax-parser.rb (+151 -0) 100644
===================================================================
--- /dev/null
+++ lib/chupa-text/sax-parser.rb    2019-02-28 11:17:15 +0900 (d6b8a3b)
@@ -0,0 +1,151 @@
+# Copyright (C) 2019  Kouhei Sutou <kou****@clear*****>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+require "cgi/util"
+require "rexml/parsers/sax2parser"
+require "rexml/sax2listener"
+
+begin
+  require "nokogiri"
+rescue LoadError
+end
+
+module ChupaText
+  class SAXParser
+    class << self
+      def backend
+        case ENV["CHUPA_TEXT_SAX_PARSER_BACKEND"]
+        when "rexml"
+          :rexml
+        else
+          if Object.const_defined?(:Nokogiri)
+            :nokogiri
+          else
+            :rexml
+          end
+        end
+      end
+    end
+
+    def initialize(input, listener)
+      @input = input
+      @listener = listener
+    end
+
+    if backend == :nokogiri
+      def parse
+        document = Document.new(@listener)
+        parser = Nokogiri::XML::SAX::Parser.new(document)
+        parser.parse(@input)
+      end
+
+      class Document < Nokogiri::XML::SAX::Document
+        def initialize(listener)
+          @listener = listener
+          @namespaces_stack = []
+        end
+
+        def start_element_namespace(name,
+                                    attributes=[],
+                                    prefix=nil,
+                                    uri=nil,
+                                    namespaces=[])
+          namespaces.each do |namespace_prefix, namespace_uri|
+            @listener.start_prefix_mapping(namespace_prefix, namespace_uri)
+          end
+          attributes_hash = {}
+          attributes.each do |attribute|
+            attribute_qname = build_qname(attribute.prefix, attribute.localname)
+            attributes_hash[attribute_qname] = attribute.value
+          end
+          @namespaces_stack.push(namespaces)
+          @listener.start_element(uri,
+                                  name,
+                                  build_qname(prefix, name),
+                                  attributes_hash)
+        end
+
+        def end_element_namespace(name, prefix=nil, uri=nil)
+          @listener.end_element(uri, name, build_qname(prefix, name))
+          namespaces = @namespaces_stack.pop
+          namespaces.each do |namespace_prefix, _|
+            @listener.end_prefix_mapping(namespace_prefix)
+          end
+        end
+
+        def characters(text)
+          @listener.characters(text)
+        end
+
+        def cdata_block(content)
+          @listener.cdata(content)
+        end
+
+        private
+        def build_qname(prefix, local_name)
+          if prefix
+            "#{prefix}:#{local_name}"
+          else
+            local_name
+          end
+        end
+      end
+    else
+      def parse
+        source = REXML::Source.new(@input.read)
+        parser = REXML::Parsers::SAX2Parser.new(source)
+        parser.listen(Listener.new(@listener))
+        parser.parse
+      end
+
+      class Listener
+        include REXML::SAX2Listener
+
+        def initialize(listener)
+          @listener = listener
+        end
+
+        def start_prefix_mapping(*args)
+          @listener.start_prefix_mapping(*args)
+        end
+
+        def end_prefix_mapping(*args)
+          @listener.end_prefix_mapping(*args)
+        end
+
+        def start_element(*args)
+          @listener.start_element(*args)
+        end
+
+        def end_element(*args)
+          @listener.end_element(*args)
+        end
+
+        def characters(text)
+          @listener.characters(CGI.unescapeHTML(text))
+        end
+
+        def cdata(content)
+          @listener.cdata(CGI.unescapeHTML(content))
+        end
+      end
+    end
+  end
+
+  class SAXListener
+    include REXML::SAX2Listener
+  end
+end
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190228/3384cc86/attachment-0001.html>


More information about the Groonga-commit mailing list
Back to archive index