[Groonga-commit] ranguba/chupa-text at 36cabe7 [master] xml: add support for Nokogiri

Back to archive index
Kouhei Sutou null+****@clear*****
Sat Mar 2 05:59:03 JST 2019


Kouhei Sutou	2019-03-02 05:59:03 +0900 (Sat, 02 Mar 2019)

  Revision: 36cabe712cf4a74b4863d6c6d687943f3dc5631c
  https://github.com/ranguba/chupa-text/commit/36cabe712cf4a74b4863d6c6d687943f3dc5631c

  Message:
    xml: add support for Nokogiri

  Modified files:
    lib/chupa-text/decomposers/xml.rb
    lib/chupa-text/sax-parser.rb
    test/decomposers/test-xml.rb

  Modified: lib/chupa-text/decomposers/xml.rb (+19 -8)
===================================================================
--- lib/chupa-text/decomposers/xml.rb    2019-03-02 05:32:43 +0900 (498f9d3)
+++ lib/chupa-text/decomposers/xml.rb    2019-03-02 05:59:03 +0900 (f5575ff)
@@ -14,8 +14,7 @@
 # License along with this library; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 
-require "rexml/document"
-require "rexml/streamlistener"
+require "chupa-text/sax-parser"
 
 module ChupaText
   module Decomposers
@@ -34,9 +33,9 @@ module ChupaText
         listener = Listener.new(text)
         data.open do |input|
           begin
-            parser = REXML::Parsers::StreamParser.new(input, listener)
+            parser = SAXParser.new(input, listener)
             parser.parse
-          rescue REXML::ParseException => xml_error
+          rescue SAXParser::ParseError => xml_error
             error do
               message = "#{log_tag} Failed to parse XML: "
               message << "#{xml_error.class}: #{xml_error.message}\n"
@@ -54,15 +53,27 @@ module ChupaText
       def log_tag
         "[decomposer][xml]"
       end
-      class Listener
-        include REXML::StreamListener
 
+      class Listener < SAXListener
         def initialize(output)
           @output = output
+          @level = 0
         end
 
-        def text(text)
-          @output << text
+        def start_element(*args)
+          @level += 1
+        end
+
+        def end_element(*args)
+          @level -= 1
+        end
+
+        def characters(text)
+          @output << text if @level > 0
+        end
+
+        def cdata(content)
+          @output << content if @level > 0
         end
       end
     end

  Modified: lib/chupa-text/sax-parser.rb (+17 -2)
===================================================================
--- lib/chupa-text/sax-parser.rb    2019-03-02 05:32:43 +0900 (d6b8a3b)
+++ lib/chupa-text/sax-parser.rb    2019-03-02 05:59:03 +0900 (8c1f1b2)
@@ -25,6 +25,9 @@ end
 
 module ChupaText
   class SAXParser
+    class ParseError < Error
+    end
+
     class << self
       def backend
         case ENV["CHUPA_TEXT_SAX_PARSER_BACKEND"]
@@ -94,6 +97,10 @@ module ChupaText
           @listener.cdata(content)
         end
 
+        def error(detail)
+          raise ParseError, detail
+        end
+
         private
         def build_qname(prefix, local_name)
           if prefix
@@ -105,10 +112,18 @@ module ChupaText
       end
     else
       def parse
-        source = REXML::Source.new(@input.read)
+        source = @input
+        if source.is_a?(Archive::Zip::Codec::Deflate::Decompress)
+          source = source.read
+        end
         parser = REXML::Parsers::SAX2Parser.new(source)
         parser.listen(Listener.new(@listener))
-        parser.parse
+        begin
+          parser.parse
+        rescue REXML::ParseException => error
+          message = "#{error.class}: #{error.message}"
+          raise ParseError, message
+        end
       end
 
       class Listener

  Modified: test/decomposers/test-xml.rb (+9 -4)
===================================================================
--- test/decomposers/test-xml.rb    2019-03-02 05:32:43 +0900 (05697ff)
+++ test/decomposers/test-xml.rb    2019-03-02 05:59:03 +0900 (ee846ea)
@@ -35,7 +35,6 @@ class TestDecomposersXML < Test::Unit::TestCase
   Hello
   &
   World
-
       TEXT
       assert_equal([text],
                    decompose(xml).collect(&:body))
@@ -45,15 +44,21 @@ class TestDecomposersXML < Test::Unit::TestCase
       messages = capture_log do
         assert_equal([], decompose("<root x=/>"))
       end
+      normalized_messages = messages.collect do |level, message|
+        [
+          level,
+          message.gsub(/(ChupaText::SAXParser::ParseError:) .*/,
+                       "\\1 ...")
+        ]
+      end
       assert_equal([
                      [
                        :error,
                        "[decomposer][xml] Failed to parse XML: " +
-                       "REXML::ParseException: " +
-                       "Missing attribute value start quote: <x>",
+                       "ChupaText::SAXParser::ParseError: ...",
                      ],
                    ],
-                   messages)
+                   normalized_messages)
     end
 
     private
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190302/40f83396/attachment-0001.html>


More information about the Groonga-commit mailing list
Back to archive index