Kouhei Sutou
null+****@clear*****
Fri Apr 4 16:24:03 JST 2014
Kouhei Sutou 2014-04-04 16:24:03 +0900 (Fri, 04 Apr 2014) New Revision: 7fd702d44b2aab43e0861b279ef5e7afc49e51e9 https://github.com/droonga/wikipedia-search/commit/7fd702d44b2aab43e0861b279ef5e7afc49e51e9 Message: Add --max-n-records option Modified files: bin/wikipedia-to-groonga.rb lib/wikipedia-search/groonga-converter.rb test/test-groonga-converter.rb Modified: bin/wikipedia-to-groonga.rb (+13 -1) =================================================================== --- bin/wikipedia-to-groonga.rb 2014-04-04 15:47:17 +0900 (518160c) +++ bin/wikipedia-to-groonga.rb 2014-04-04 16:24:03 +0900 (ae4d03c) @@ -1,6 +1,7 @@ #!/usr/bin/env ruby require "pathname" +require "optparse" base_dir_path = Pathname.new(__FILE__).dirname lib_dir_path = base_dir_path + "lib" @@ -9,5 +10,16 @@ $LOAD_PATH.unshift(lib_dir_path.to_s) require "wikipedia-search/groonga-converter" -converter = WikipediaSearch::GroongaConverter.new(ARGF) +options = { + :max_n_records => -1, +} +parser = OptionParser.new +parser.on("--max-n-records=N", Integer, + "The number of maximum records. 
-1 means unlimited.", + "(#{options[:max_n_records]})") do |n| + options[:max_n_records] = n +end +parser.parse!(ARGV) + +converter = WikipediaSearch::GroongaConverter.new(ARGF, options) converter.convert($stdout) Modified: lib/wikipedia-search/groonga-converter.rb (+20 -5) =================================================================== --- lib/wikipedia-search/groonga-converter.rb 2014-04-04 15:47:17 +0900 (c7a0057) +++ lib/wikipedia-search/groonga-converter.rb 2014-04-04 16:24:03 +0900 (f8f1f8e) @@ -5,24 +5,35 @@ require "rexml/parsers/streamparser" module WikipediaSearch class GroongaConverter - def initialize(input) + def initialize(input, options={}) @input = input + @options = options end def convert(output) - listener = Listener.new(output) - parser = REXML::Parsers::StreamParser.new(@input, listener) - parser.parse + listener = Listener.new(output, @options) + catch do |tag| + parser = REXML::Parsers::StreamParser.new(@input, listener) + listener.start(tag) + parser.parse + end listener.finish end class Listener include REXML::StreamListener - def initialize(output) + def initialize(output, options) @output = output + @options = options @text_stack = [""] @first_page = true + @n_records = 0 + @max_n_records = @options[:max_n_records] + end + + def start(abort_tag) + @abort_tag = abort_tag @output.puts("load --table Pages") @output.puts("[") end @@ -45,6 +56,9 @@ module WikipediaSearch def tag_end(name) case name when "page" + if @max_n_records and @n_records >= @max_n_records + throw(@abort_tag) + end if @first_page @first_page = false else @@ -56,6 +70,7 @@ module WikipediaSearch "text" => @text, } @output.print(JSON.generate(page)) + @n_records += 1 when "title" @title = @text_stack.last when "id" Modified: test/test-groonga-converter.rb (+31 -2) =================================================================== --- test/test-groonga-converter.rb 2014-04-04 15:47:17 +0900 (4032f8a) +++ test/test-groonga-converter.rb 2014-04-04 16:24:03 +0900 
(c4e1e14) @@ -2,10 +2,10 @@ require "stringio" require "wikipedia-search/groonga-converter" class TestGroongaConverter < Test::Unit::TestCase - def convert(xml) + def convert(xml, options={}) input = StringIO.new(xml) output = StringIO.new - converter = WikipediaSearch::GroongaConverter.new(input) + converter = WikipediaSearch::GroongaConverter.new(input, options) converter.convert(output) output.string end @@ -42,4 +42,33 @@ load --table Pages ] GROONGA end + + def test_max_n_records + xml = <<-XML +<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/"> + <page> + <title>Title1</title> + <id>1</id> + <revision> + <id>1001</id> + <text>Text1</text> + </revision> + </page> + <page> + <title>Title2</title> + <id>2</id> + <revision> + <id>1002</id> + <text>Text2</text> + </revision> + </page> +</mediawiki> + XML + assert_equal(<<-GROONGA, convert(xml, :max_n_records => 1)) +load --table Pages +[ +{"_key":1,"title":"Title1","text":"Text1"} +] + GROONGA + end end -------------- next part -------------- HTMLの添付ファイルを保管しました...Download