Kouhei Sutou
null+****@clear*****
Fri Apr 4 17:30:52 JST 2014
Kouhei Sutou 2014-04-04 17:30:52 +0900 (Fri, 04 Apr 2014) New Revision: ddd2f55d25755caaf2b861854aaeca8276bce3c6 https://github.com/droonga/wikipedia-search/commit/ddd2f55d25755caaf2b861854aaeca8276bce3c6 Message: Use the leading 1000 characters Modified files: bin/wikipedia-to-groonga.rb lib/wikipedia-search/groonga-converter.rb lib/wikipedia-search/task.rb test/test-groonga-converter.rb Modified: bin/wikipedia-to-groonga.rb (+6 -0) =================================================================== --- bin/wikipedia-to-groonga.rb 2014-04-04 17:22:59 +0900 (8dde77d) +++ bin/wikipedia-to-groonga.rb 2014-04-04 17:30:52 +0900 (bcae968) @@ -15,6 +15,7 @@ options = OpenStruct.new options.output = "-" converter_options = { :max_n_records => -1, + :max_n_characters => -1, } parser = OptionParser.new parser.on("--max-n-records=N", Integer, @@ -22,6 +23,11 @@ parser.on("--max-n-records=N", Integer, "(#{converter_options[:max_n_records]})") do |n| converter_options[:max_n_records] = n end +parser.on("--max-n-characters=N", Integer, + "The number of maximum characters in a record. -1 means unlimited.", + "(#{converter_options[:max_n_characters]})") do |n| + converter_options[:max_n_characters] = n +end parser.on("--output=PATH", "Output to PATH. 
'-' means the standard output.", "(#{options.output})") do |path| Modified: lib/wikipedia-search/groonga-converter.rb (+12 -2) =================================================================== --- lib/wikipedia-search/groonga-converter.rb 2014-04-04 17:22:59 +0900 (838e32a) +++ lib/wikipedia-search/groonga-converter.rb 2014-04-04 17:30:52 +0900 (0179a7b) @@ -29,8 +29,10 @@ module WikipediaSearch @text_stack = [""] @first_page = true @n_records = 0 - @max_n_records = @options[:max_n_records] + @max_n_records = @options[:max_n_records] || -1 @max_n_records = nil if @max_n_records < 0 + @max_n_characters = @options[:max_n_characters] || -1 + @max_n_characters = nil if @max_n_characters < 0 end def start(abort_tag) @@ -68,7 +70,7 @@ module WikipediaSearch page = { "_key" => @id, "title" => @title, - "text" => @text, + "text" => shorten_text(@text), } @output.print(JSON.generate(page)) @n_records += 1 @@ -98,6 +100,14 @@ module WikipediaSearch def pop_stacks @text_stack.pop end + + def shorten_text(text) + if @max_n_characters + text[0, @max_n_characters] + else + text + end + end end end end Modified: lib/wikipedia-search/task.rb (+2 -0) =================================================================== --- lib/wikipedia-search/task.rb 2014-04-04 17:22:59 +0900 (700d68a) +++ lib/wikipedia-search/task.rb 2014-04-04 17:30:52 +0900 (4f372b3) @@ -45,6 +45,8 @@ module WikipediaSearch command_line << "bin/wikipedia-to-groonga.rb" command_line << "--max-n-records" command_line << "5000" + command_line << "--max-n-characters" + command_line << "1000" command_line << "--output" command_line << ja_groonga_data_path.to_s sh(command_line.join(" ")) Modified: test/test-groonga-converter.rb (+21 -0) =================================================================== --- test/test-groonga-converter.rb 2014-04-04 17:22:59 +0900 (c4e1e14) +++ test/test-groonga-converter.rb 2014-04-04 17:30:52 +0900 (6d6a291) @@ -71,4 +71,25 @@ load --table Pages ] GROONGA end + + def 
test_max_n_characters + xml = <<-XML +<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/"> + <page> + <title>Title</title> + <id>1</id> + <revision> + <id>1001</id> + <text>Text</text> + </revision> + </page> +</mediawiki> + XML + assert_equal(<<-GROONGA, convert(xml, :max_n_characters => 2)) +load --table Pages +[ +{"_key":1,"title":"Title","text":"Te"} +] + GROONGA + end end -------------- next part -------------- HTMLの添付ファイルを保管しました... Download