Kouhei Sutou 2019-02-28 11:17:15 +0900 (Thu, 28 Feb 2019) Revision: 6fb7ae78165a2c3ea7147c5f47287dd32c3f8a21 https://github.com/ranguba/chupa-text/commit/6fb7ae78165a2c3ea7147c5f47287dd32c3f8a21 Message: Add support for Nokogiri as an alternative SAX parser backend Because REXML is too slow... We need to improve REXML performance. Added files: lib/chupa-text/sax-parser.rb Modified files: .travis.yml chupa-text.gemspec lib/chupa-text/decomposers/office-open-xml-workbook.rb lib/chupa-text/decomposers/office-open-xml.rb lib/chupa-text/decomposers/opendocument-presentation.rb lib/chupa-text/decomposers/opendocument-spreadsheet.rb lib/chupa-text/decomposers/opendocument-text.rb lib/chupa-text/decomposers/opendocument.rb Modified: .travis.yml (+3 -0) =================================================================== --- .travis.yml 2019-02-26 15:34:22 +0900 (4c5d25c) +++ .travis.yml 2019-02-28 11:17:15 +0900 (2175350) @@ -7,3 +7,6 @@ rvm: - 2.5 - 2.6 - ruby-head +script: + - bundle exec rake + - CHUPA_TEXT_SAX_PARSER_BACKEND=rexml bundle exec rake Modified: chupa-text.gemspec (+3 -2) =================================================================== --- chupa-text.gemspec 2019-02-26 15:34:22 +0900 (d3da753) +++ chupa-text.gemspec 2019-02-28 11:17:15 +0900 (d5b5d9f) @@ -53,8 +53,9 @@ Gem::Specification.new do |spec| spec.add_runtime_dependency("archive-zip") spec.add_development_dependency("bundler") - spec.add_development_dependency("rake") - spec.add_development_dependency("test-unit") + spec.add_development_dependency("nokogiri") spec.add_development_dependency("packnga") + spec.add_development_dependency("rake") spec.add_development_dependency("redcarpet") + spec.add_development_dependency("test-unit") end Modified: lib/chupa-text/decomposers/office-open-xml-workbook.rb (+1 -3) =================================================================== --- lib/chupa-text/decomposers/office-open-xml-workbook.rb 2019-02-26 15:34:22 +0900 (a75d351) +++ 
lib/chupa-text/decomposers/office-open-xml-workbook.rb 2019-02-28 11:17:15 +0900 (4c545e9) @@ -71,9 +71,7 @@ module ChupaText sheet_texts.join("\n") end - class SheetListener - include REXML::SAX2Listener - + class SheetListener < SAXListener URI = "http://schemas.openxmlformats.org/spreadsheetml/2006/main" def initialize(sheet) Modified: lib/chupa-text/decomposers/office-open-xml.rb (+7 -15) =================================================================== --- lib/chupa-text/decomposers/office-open-xml.rb 2019-02-26 15:34:22 +0900 (a891383) +++ lib/chupa-text/decomposers/office-open-xml.rb 2019-02-28 11:17:15 +0900 (f2e2483) @@ -14,12 +14,10 @@ # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -require "cgi/util" -require "rexml/parsers/sax2parser" -require "rexml/sax2listener" - require "archive/zip" +require "chupa-text/sax-parser" + module ChupaText module Decomposers class OfficeOpenXML < Decomposer @@ -67,10 +65,8 @@ module ChupaText end private - def parse(io, listener) - source = REXML::Source.new(io.read) - parser = REXML::Parsers::SAX2Parser.new(source) - parser.listen(listener) + def parse(input, listener) + parser = SAXParser.new(input, listener) parser.parse end @@ -83,9 +79,7 @@ module ChupaText context[:text] end - class TextListener - include REXML::SAX2Listener - + class TextListener < SAXListener def initialize(output, target_uri) @output = output @target_uri = target_uri @@ -121,13 +115,11 @@ module ChupaText private def add_text(text) return unless @in_target - @output << CGI.unescapeHTML(text) + @output << text end end - class AttributesListener - include REXML::SAX2Listener - + class AttributesListener < SAXListener CORE_PROPERTIES_URI = "http://schemas.openxmlformats.org/package/2006/metadata/core-properties" EXTENDED_PROPERTIES_URI = Modified: lib/chupa-text/decomposers/opendocument-presentation.rb (+2 -4) 
=================================================================== --- lib/chupa-text/decomposers/opendocument-presentation.rb 2019-02-26 15:34:22 +0900 (043f589) +++ lib/chupa-text/decomposers/opendocument-presentation.rb 2019-02-28 11:17:15 +0900 (bd2e595) @@ -49,9 +49,7 @@ module ChupaText end end - class SlidesListener - include REXML::SAX2Listener - + class SlidesListener < SAXListener TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0" DRAW_URI = "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0" @@ -97,7 +95,7 @@ module ChupaText private def add_text(text) return unless @in_p - @slides.last[:text] << CGI.unescapeHTML(text) + @slides.last[:text] << text end end end Modified: lib/chupa-text/decomposers/opendocument-spreadsheet.rb (+2 -4) =================================================================== --- lib/chupa-text/decomposers/opendocument-spreadsheet.rb 2019-02-26 15:34:22 +0900 (39c9bf3) +++ lib/chupa-text/decomposers/opendocument-spreadsheet.rb 2019-02-28 11:17:15 +0900 (c421ffb) @@ -51,9 +51,7 @@ module ChupaText end end - class SheetsListener - include REXML::SAX2Listener - + class SheetsListener < SAXListener TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0" TABLE_URI = "urn:oasis:names:tc:opendocument:xmlns:table:1.0" @@ -126,7 +124,7 @@ module ChupaText private def add_text(text) return unless @in_p - @sheets.last[:rows].last.last[:text] << CGI.unescapeHTML(text) + @sheets.last[:rows].last.last[:text] << text end end end Modified: lib/chupa-text/decomposers/opendocument-text.rb (+2 -4) =================================================================== --- lib/chupa-text/decomposers/opendocument-text.rb 2019-02-26 15:34:22 +0900 (2a862ee) +++ lib/chupa-text/decomposers/opendocument-text.rb 2019-02-28 11:17:15 +0900 (0cc8aa8) @@ -43,9 +43,7 @@ module ChupaText yield(text_data) end - class TextListener - include REXML::SAX2Listener - + class TextListener < SAXListener TEXT_URI = "urn:oasis:names:tc:opendocument:xmlns:text:1.0" 
def initialize(output) @output = output @@ -81,7 +79,7 @@ module ChupaText private def add_text(text) return unless @in_p - @output << CGI.unescapeHTML(text) + @output << text end end end Modified: lib/chupa-text/decomposers/opendocument.rb (+5 -12) =================================================================== --- lib/chupa-text/decomposers/opendocument.rb 2019-02-26 15:34:22 +0900 (8aa5ddb) +++ lib/chupa-text/decomposers/opendocument.rb 2019-02-28 11:17:15 +0900 (9a2ef14) @@ -14,12 +14,10 @@ # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -require "cgi/util" -require "rexml/parsers/sax2parser" -require "rexml/sax2listener" - require "archive/zip" +require "chupa-text/sax-parser" + module ChupaText module Decomposers class OpenDocument < Decomposer @@ -58,10 +56,8 @@ module ChupaText end private - def parse(io, listener) - source = REXML::Source.new(io.read) - parser = REXML::Parsers::SAX2Parser.new(source) - parser.listen(listener) + def parse(input, listener) + parser = SAXParser.new(input, listener) parser.parse end @@ -70,9 +66,7 @@ module ChupaText parse(entry.file_data, listener) end - class AttributesListener - include REXML::SAX2Listener - + class AttributesListener < SAXListener META_URI = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0" DUBLIN_CORE_URI = "http://purl.org/dc/elements/1.1/" @@ -122,7 +116,6 @@ module ChupaText def set_attribute(value) return if @name.nil?
- value = CGI.unescapeHTML(value) case @type when :w3cdtf value = Time.xmlschema(value) Added: lib/chupa-text/sax-parser.rb (+151 -0) 100644 =================================================================== --- /dev/null +++ lib/chupa-text/sax-parser.rb 2019-02-28 11:17:15 +0900 (d6b8a3b) @@ -0,0 +1,151 @@ +# Copyright (C) 2019 Kouhei Sutou <kou****@clear*****> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +require "cgi/util" +require "rexml/parsers/sax2parser" +require "rexml/sax2listener" + +begin + require "nokogiri" +rescue LoadError +end + +module ChupaText + class SAXParser + class << self + def backend + case ENV["CHUPA_TEXT_SAX_PARSER_BACKEND"] + when "rexml" + :rexml + else + if Object.const_defined?(:Nokogiri) + :nokogiri + else + :rexml + end + end + end + end + + def initialize(input, listener) + @input = input + @listener = listener + end + + if backend == :nokogiri + def parse + document = Document.new(@listener) + parser = Nokogiri::XML::SAX::Parser.new(document) + parser.parse(@input) + end + + class Document < Nokogiri::XML::SAX::Document + def initialize(listener) + @listener = listener + @namespaces_stack = [] + end + + def start_element_namespace(name, + attributes=[], + prefix=nil, + uri=nil, + namespaces=[]) + namespaces.each do |namespace_prefix, 
namespace_uri| + @listener.start_prefix_mapping(namespace_prefix, namespace_uri) + end + attributes_hash = {} + attributes.each do |attribute| + attribute_qname = build_qname(attribute.prefix, attribute.localname) + attributes_hash[attribute_qname] = attribute.value + end + @namespaces_stack.push(namespaces) + @listener.start_element(uri, + name, + build_qname(prefix, name), + attributes_hash) + end + + def end_element_namespace(name, prefix=nil, uri=nil) + @listener.end_element(uri, name, build_qname(prefix, name)) + namespaces = @namespaces_stack.pop + namespaces.each do |namespace_prefix, _| + @listener.end_prefix_mapping(namespace_prefix) + end + end + + def characters(text) + @listener.characters(text) + end + + def cdata_block(content) + @listener.cdata(content) + end + + private + def build_qname(prefix, local_name) + if prefix + "#{prefix}:#{local_name}" + else + local_name + end + end + end + else + def parse + source = REXML::Source.new(@input.read) + parser = REXML::Parsers::SAX2Parser.new(source) + parser.listen(Listener.new(@listener)) + parser.parse + end + + class Listener + include REXML::SAX2Listener + + def initialize(listener) + @listener = listener + end + + def start_prefix_mapping(*args) + @listener.start_prefix_mapping(*args) + end + + def end_prefix_mapping(*args) + @listener.end_prefix_mapping(*args) + end + + def start_element(*args) + @listener.start_element(*args) + end + + def end_element(*args) + @listener.end_element(*args) + end + + def characters(text) + @listener.characters(CGI.unescapeHTML(text)) + end + + def cdata(content) + @listener.cdata(CGI.unescapeHTML(content)) + end + end + end + end + + class SAXListener + include REXML::SAX2Listener + end +end -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190228/3384cc86/attachment-0001.html>