[Groonga-commit] droonga/wikipedia-search at ddd2f55 [master] Use the leading 1000 characters

Back to archive index

Kouhei Sutou null+****@clear*****
Fri Apr 4 17:30:52 JST 2014


Kouhei Sutou	2014-04-04 17:30:52 +0900 (Fri, 04 Apr 2014)

  New Revision: ddd2f55d25755caaf2b861854aaeca8276bce3c6
  https://github.com/droonga/wikipedia-search/commit/ddd2f55d25755caaf2b861854aaeca8276bce3c6

  Message:
    Use the leading 1000 characters

  Modified files:
    bin/wikipedia-to-groonga.rb
    lib/wikipedia-search/groonga-converter.rb
    lib/wikipedia-search/task.rb
    test/test-groonga-converter.rb

  Modified: bin/wikipedia-to-groonga.rb (+6 -0)
===================================================================
--- bin/wikipedia-to-groonga.rb    2014-04-04 17:22:59 +0900 (8dde77d)
+++ bin/wikipedia-to-groonga.rb    2014-04-04 17:30:52 +0900 (bcae968)
@@ -15,6 +15,7 @@ options = OpenStruct.new
 options.output = "-"
 converter_options = {
   :max_n_records => -1,
+  :max_n_characters => -1,
 }
 parser = OptionParser.new
 parser.on("--max-n-records=N", Integer,
@@ -22,6 +23,11 @@ parser.on("--max-n-records=N", Integer,
           "(#{converter_options[:max_n_records]})") do |n|
   converter_options[:max_n_records] = n
 end
+parser.on("--max-n-characters=N", Integer,
+          "The number of maximum characters in a record. -1 means unlimited.",
+          "(#{converter_options[:max_n_characters]})") do |n|
+  converter_options[:max_n_characters] = n
+end
 parser.on("--output=PATH",
           "Output to PATH. '-' means the standard output.",
           "(#{options.output})") do |path|

  Modified: lib/wikipedia-search/groonga-converter.rb (+12 -2)
===================================================================
--- lib/wikipedia-search/groonga-converter.rb    2014-04-04 17:22:59 +0900 (838e32a)
+++ lib/wikipedia-search/groonga-converter.rb    2014-04-04 17:30:52 +0900 (0179a7b)
@@ -29,8 +29,10 @@ module WikipediaSearch
         @text_stack = [""]
         @first_page = true
         @n_records = 0
-        @max_n_records = @options[:max_n_records]
+        @max_n_records = @options[:max_n_records] || -1
         @max_n_records = nil if @max_n_records < 0
+        @max_n_characters = @options[:max_n_characters] || -1
+        @max_n_characters = nil if @max_n_characters < 0
       end
 
       def start(abort_tag)
@@ -68,7 +70,7 @@ module WikipediaSearch
           page = {
             "_key"  => @id,
             "title" => @title,
-            "text"  => @text,
+            "text"  => shorten_text(@text),
           }
           @output.print(JSON.generate(page))
           @n_records += 1
@@ -98,6 +100,14 @@ module WikipediaSearch
       def pop_stacks
         @text_stack.pop
       end
+
+      def shorten_text(text)
+        if @max_n_characters
+          text[0, @max_n_characters]
+        else
+          text
+        end
+      end
     end
   end
 end

  Modified: lib/wikipedia-search/task.rb (+2 -0)
===================================================================
--- lib/wikipedia-search/task.rb    2014-04-04 17:22:59 +0900 (700d68a)
+++ lib/wikipedia-search/task.rb    2014-04-04 17:30:52 +0900 (4f372b3)
@@ -45,6 +45,8 @@ module WikipediaSearch
             command_line << "bin/wikipedia-to-groonga.rb"
             command_line << "--max-n-records"
             command_line << "5000"
+            command_line << "--max-n-characters"
+            command_line << "1000"
             command_line << "--output"
             command_line << ja_groonga_data_path.to_s
             sh(command_line.join(" "))

  Modified: test/test-groonga-converter.rb (+21 -0)
===================================================================
--- test/test-groonga-converter.rb    2014-04-04 17:22:59 +0900 (c4e1e14)
+++ test/test-groonga-converter.rb    2014-04-04 17:30:52 +0900 (6d6a291)
@@ -71,4 +71,25 @@ load --table Pages
 ]
     GROONGA
   end
+
+  def test_max_n_characters
+    xml = <<-XML
+<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/">
+  <page>
+    <title>Title</title>
+    <id>1</id>
+    <revision>
+      <id>1001</id>
+      <text>Text</text>
+    </revision>
+  </page>
+</mediawiki>
+    XML
+    assert_equal(<<-GROONGA, convert(xml, :max_n_characters => 2))
+load --table Pages
+[
+{"_key":1,"title":"Title","text":"Te"}
+]
+    GROONGA
+  end
 end
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Download 



More information about the Groonga-commit mailing list
Back to archive index