Kouhei Sutou
null+****@clear*****
Fri Apr 4 15:47:17 JST 2014
Kouhei Sutou 2014-04-04 15:47:17 +0900 (Fri, 04 Apr 2014) New Revision: 2a9c677fccdb94179008d088259d66bbae12595d https://github.com/droonga/wikipedia-search/commit/2a9c677fccdb94179008d088259d66bbae12595d Message: Add Wikipedia to Groonga converter GitHub: #1 Added files: Gemfile bin/wikipedia-to-groonga.rb lib/wikipedia-search/groonga-converter.rb test/run-test.rb test/test-groonga-converter.rb Modified files: .gitignore Modified: .gitignore (+1 -0) =================================================================== --- .gitignore 2014-04-04 11:42:20 +0900 (82f0c3a) +++ .gitignore 2014-04-04 15:47:17 +0900 (cae0078) @@ -1 +1,2 @@ +/Gemfile.lock /data/ Added: Gemfile (+5 -0) 100644 =================================================================== --- /dev/null +++ Gemfile 2014-04-04 15:47:17 +0900 (7459082) @@ -0,0 +1,5 @@ +# -*- ruby -*- + +source "https://rubygems.org/" + +gem "test-unit", :require => false Added: bin/wikipedia-to-groonga.rb (+13 -0) 100755 =================================================================== --- /dev/null +++ bin/wikipedia-to-groonga.rb 2014-04-04 15:47:17 +0900 (518160c) @@ -0,0 +1,13 @@ +#!/usr/bin/env ruby + +require "pathname" + +base_dir_path = Pathname.new(__FILE__).dirname +lib_dir_path = base_dir_path + "lib" + +$LOAD_PATH.unshift(lib_dir_path.to_s) + +require "wikipedia-search/groonga-converter" + +converter = WikipediaSearch::GroongaConverter.new(ARGF) +converter.convert($stdout) Added: lib/wikipedia-search/groonga-converter.rb (+87 -0) 100644 =================================================================== --- /dev/null +++ lib/wikipedia-search/groonga-converter.rb 2014-04-04 15:47:17 +0900 (c7a0057) @@ -0,0 +1,87 @@ +require "json" +require "rexml/streamlistener" +require "rexml/parsers/baseparser" +require "rexml/parsers/streamparser" + +module WikipediaSearch + class GroongaConverter + def initialize(input) + @input = input + end + + def convert(output) + listener = Listener.new(output) + parser = 
REXML::Parsers::StreamParser.new(@input, listener) + parser.parse + listener.finish + end + + class Listener + include REXML::StreamListener + + def initialize(output) + @output = output + @text_stack = [""] + @first_page = true + @output.puts("load --table Pages") + @output.puts("[") + end + + def finish + @output.puts unless @first_page + @output.puts("]") + end + + def tag_start(name, attributes) + push_stacks + case name + when "page" + @title = nil + @id = nil + @text = nil + end + end + + def tag_end(name) + case name + when "page" + if @first_page + @first_page = false + else + @output.puts(",") + end + page = { + "_key" => @id, + "title" => @title, + "text" => @text, + } + @output.print(JSON.generate(page)) + when "title" + @title = @text_stack.last + when "id" + @id ||= Integer(@text_stack.last) + when "text" + @text = @text_stack.last + end + pop_stacks + end + + def text(data) + @text_stack.last << data + end + + def cdata(contnet) + @text_stack.last << content + end + + private + def push_stacks + @text_stack << "" + end + + def pop_stacks + @text_stack.pop + end + end + end +end Added: test/run-test.rb (+14 -0) 100755 =================================================================== --- /dev/null +++ test/run-test.rb 2014-04-04 15:47:17 +0900 (375eef7) @@ -0,0 +1,14 @@ +#!/usr/bin/env ruby + +require "pathname" + +require "bundler/setup" +require "test-unit" + +base_dir_path = Pathname.new(__FILE__).dirname.parent +lib_dir_path = base_dir_path + "lib" +test_dir_path = base_dir_path + "test" + +$LOAD_PATH.unshift(lib_dir_path.to_s) + +exit(Test::Unit::AutoRunner.run(true, test_dir_path.to_s)) Added: test/test-groonga-converter.rb (+45 -0) 100644 =================================================================== --- /dev/null +++ test/test-groonga-converter.rb 2014-04-04 15:47:17 +0900 (4032f8a) @@ -0,0 +1,45 @@ +require "stringio" +require "wikipedia-search/groonga-converter" + +class TestGroongaConverter < Test::Unit::TestCase + def convert(xml) + 
input = StringIO.new(xml) + output = StringIO.new + converter = WikipediaSearch::GroongaConverter.new(input) + converter.convert(output) + output.string + end + + def test_empty + xml = <<-XML +<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/"> +</mediawiki> + XML + assert_equal(<<-GROONGA, convert(xml)) +load --table Pages +[ +] + GROONGA + end + + def test_one + xml = <<-XML +<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/"> + <page> + <title>Title</title> + <id>1</id> + <revision> + <id>1001</id> + <text>Text1 &amp; Text2</text> + </revision> + </page> +</mediawiki> + XML + assert_equal(<<-GROONGA, convert(xml)) +load --table Pages +[ +{"_key":1,"title":"Title","text":"Text1 & Text2"} +] + GROONGA + end +end -------------- next part -------------- HTMLの添付ファイルを保管しました...Download