[Groonga-commit] droonga/wikipedia-search at 2a9c677 [master] Add Wikipedia to Groonga converter

Back to archive index

Kouhei Sutou null+****@clear*****
Fri Apr 4 15:47:17 JST 2014


Kouhei Sutou	2014-04-04 15:47:17 +0900 (Fri, 04 Apr 2014)

  New Revision: 2a9c677fccdb94179008d088259d66bbae12595d
  https://github.com/droonga/wikipedia-search/commit/2a9c677fccdb94179008d088259d66bbae12595d

  Message:
    Add Wikipedia to Groonga converter
    
    GitHub: #1

  Added files:
    Gemfile
    bin/wikipedia-to-groonga.rb
    lib/wikipedia-search/groonga-converter.rb
    test/run-test.rb
    test/test-groonga-converter.rb
  Modified files:
    .gitignore

  Modified: .gitignore (+1 -0)
===================================================================
--- .gitignore    2014-04-04 11:42:20 +0900 (82f0c3a)
+++ .gitignore    2014-04-04 15:47:17 +0900 (cae0078)
@@ -1 +1,2 @@
+/Gemfile.lock
 /data/

  Added: Gemfile (+5 -0) 100644
===================================================================
--- /dev/null
+++ Gemfile    2014-04-04 15:47:17 +0900 (7459082)
@@ -0,0 +1,5 @@
+# -*- ruby -*-
+
+source "https://rubygems.org/"
+
+gem "test-unit", :require => false

  Added: bin/wikipedia-to-groonga.rb (+13 -0) 100755
===================================================================
--- /dev/null
+++ bin/wikipedia-to-groonga.rb    2014-04-04 15:47:17 +0900 (518160c)
@@ -0,0 +1,13 @@
+#!/usr/bin/env ruby
+
+require "pathname"
+
+base_dir_path = Pathname.new(__FILE__).dirname
+lib_dir_path = base_dir_path + "lib"
+
+$LOAD_PATH.unshift(lib_dir_path.to_s)
+
+require "wikipedia-search/groonga-converter"
+
+converter = WikipediaSearch::GroongaConverter.new(ARGF)
+converter.convert($stdout)

  Added: lib/wikipedia-search/groonga-converter.rb (+87 -0) 100644
===================================================================
--- /dev/null
+++ lib/wikipedia-search/groonga-converter.rb    2014-04-04 15:47:17 +0900 (c7a0057)
@@ -0,0 +1,87 @@
+require "json"
+require "rexml/streamlistener"
+require "rexml/parsers/baseparser"
+require "rexml/parsers/streamparser"
+
+module WikipediaSearch
+  class GroongaConverter
+    def initialize(input)
+      @input = input
+    end
+
+    def convert(output)
+      listener = Listener.new(output)
+      parser = REXML::Parsers::StreamParser.new(@input, listener)
+      parser.parse
+      listener.finish
+    end
+
+    class Listener
+      include REXML::StreamListener
+
+      def initialize(output)
+        @output = output
+        @text_stack = [""]
+        @first_page = true
+        @output.puts("load --table Pages")
+        @output.puts("[")
+      end
+
+      def finish
+        @output.puts unless @first_page
+        @output.puts("]")
+      end
+
+      def tag_start(name, attributes)
+        push_stacks
+        case name
+        when "page"
+          @title = nil
+          @id = nil
+          @text = nil
+        end
+      end
+
+      def tag_end(name)
+        case name
+        when "page"
+          if @first_page
+            @first_page = false
+          else
+            @output.puts(",")
+          end
+          page = {
+            "_key"  => @id,
+            "title" => @title,
+            "text"  => @text,
+          }
+          @output.print(JSON.generate(page))
+        when "title"
+          @title = @text_stack.last
+        when "id"
+          @id ||= Integer(@text_stack.last)
+        when "text"
+          @text = @text_stack.last
+        end
+        pop_stacks
+      end
+
+      def text(data)
+        @text_stack.last << data
+      end
+
+      def cdata(content)
+        @text_stack.last << content
+      end
+
+      private
+      def push_stacks
+        @text_stack << ""
+      end
+
+      def pop_stacks
+        @text_stack.pop
+      end
+    end
+  end
+end

  Added: test/run-test.rb (+14 -0) 100755
===================================================================
--- /dev/null
+++ test/run-test.rb    2014-04-04 15:47:17 +0900 (375eef7)
@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+
+require "pathname"
+
+require "bundler/setup"
+require "test-unit"
+
+base_dir_path = Pathname.new(__FILE__).dirname.parent
+lib_dir_path = base_dir_path + "lib"
+test_dir_path = base_dir_path + "test"
+
+$LOAD_PATH.unshift(lib_dir_path.to_s)
+
+exit(Test::Unit::AutoRunner.run(true, test_dir_path.to_s))

  Added: test/test-groonga-converter.rb (+45 -0) 100644
===================================================================
--- /dev/null
+++ test/test-groonga-converter.rb    2014-04-04 15:47:17 +0900 (4032f8a)
@@ -0,0 +1,45 @@
+require "stringio"
+require "wikipedia-search/groonga-converter"
+
+class TestGroongaConverter < Test::Unit::TestCase
+  def convert(xml)
+    input = StringIO.new(xml)
+    output = StringIO.new
+    converter = WikipediaSearch::GroongaConverter.new(input)
+    converter.convert(output)
+    output.string
+  end
+
+  def test_empty
+    xml = <<-XML
+<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/">
+</mediawiki>
+    XML
+    assert_equal(<<-GROONGA, convert(xml))
+load --table Pages
+[
+]
+    GROONGA
+  end
+
+  def test_one
+    xml = <<-XML
+<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/">
+  <page>
+    <title>Title</title>
+    <id>1</id>
+    <revision>
+      <id>1001</id>
+      <text>Text1 &amp; Text2</text>
+    </revision>
+  </page>
+</mediawiki>
+    XML
+    assert_equal(<<-GROONGA, convert(xml))
+load --table Pages
+[
+{"_key":1,"title":"Title","text":"Text1 & Text2"}
+]
+    GROONGA
+  end
+end
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Download 



More information about the Groonga-commit mailing list
Back to archive index