[Groonga-commit] droonga/wikipedia-search at d43b46b [master] Extract categories for drilldown

Back to archive index

Kouhei Sutou null+****@clear*****
Fri Apr 4 18:09:57 JST 2014


Kouhei Sutou	2014-04-04 18:09:57 +0900 (Fri, 04 Apr 2014)

  New Revision: d43b46b7e8894eb4156f0af1c2f4f21c863086c9
  https://github.com/droonga/wikipedia-search/commit/d43b46b7e8894eb4156f0af1c2f4f21c863086c9

  Message:
    Extract categories for drilldown

  Modified files:
    lib/wikipedia-search/groonga-converter.rb
    test/test-groonga-converter.rb

  Modified: lib/wikipedia-search/groonga-converter.rb (+30 -9)
===================================================================
--- lib/wikipedia-search/groonga-converter.rb    2014-04-04 17:40:01 +0900 (0179a7b)
+++ lib/wikipedia-search/groonga-converter.rb    2014-04-04 18:09:57 +0900 (aaf10a0)
@@ -50,9 +50,7 @@ module WikipediaSearch
         push_stacks
         case name
         when "page"
-          @title = nil
-          @id = nil
-          @text = nil
+          @page = Page.new
         end
       end
 
@@ -68,18 +66,19 @@ module WikipediaSearch
             @output.puts(",")
           end
           page = {
-            "_key"  => @id,
-            "title" => @title,
-            "text"  => shorten_text(@text),
+            "_key"  => @page.id,
+            "title" => @page.title,
+            "text"  => shorten_text(@page.text),
+            "categories" => @page.extract_categories,
           }
           @output.print(JSON.generate(page))
           @n_records += 1
         when "title"
-          @title = @text_stack.last
+          @page.title = @text_stack.last
         when "id"
-          @id ||= Integer(@text_stack.last)
+          @page.id ||= Integer(@text_stack.last)
         when "text"
-          @text = @text_stack.last
+          @page.text = @text_stack.last
         end
         pop_stacks
       end
@@ -108,6 +107,28 @@ module WikipediaSearch
           text
         end
       end
+
+      class Page
+        attr_accessor :id, :title, :text
+        def initialize
+          @id = nil
+          @title = nil
+          @text = nil
+        end
+
+        def extract_categories
+          return [] if @text.nil?
+
+          categories = []
+          @text.scan(/\[\[(.+?)\]\]/) do |link,|
+            case link
+            when /\ACategory:(.+?)(?:\|.*)?\z/
+              categories << $1
+            end
+          end
+          categories
+        end
+      end
     end
   end
 end

  Modified: test/test-groonga-converter.rb (+24 -3)
===================================================================
--- test/test-groonga-converter.rb    2014-04-04 17:40:01 +0900 (6d6a291)
+++ test/test-groonga-converter.rb    2014-04-04 18:09:57 +0900 (2e5d102)
@@ -38,7 +38,7 @@ load --table Pages
     assert_equal(<<-GROONGA, convert(xml))
 load --table Pages
 [
-{"_key":1,"title":"Title","text":"Text1 & Text2"}
+{"_key":1,"title":"Title","text":"Text1 & Text2","categories":[]}
 ]
     GROONGA
   end
@@ -67,7 +67,7 @@ load --table Pages
     assert_equal(<<-GROONGA, convert(xml, :max_n_records => 1))
 load --table Pages
 [
-{"_key":1,"title":"Title1","text":"Text1"}
+{"_key":1,"title":"Title1","text":"Text1","categories":[]}
 ]
     GROONGA
   end
@@ -88,7 +88,28 @@ load --table Pages
     assert_equal(<<-GROONGA, convert(xml, :max_n_characters => 2))
 load --table Pages
 [
-{"_key":1,"title":"Title","text":"Te"}
+{"_key":1,"title":"Title","text":"Te","categories":[]}
+]
+    GROONGA
+  end
+
+  def test_categories
+    xml = <<-XML
+<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/">
+  <page>
+    <title>Title</title>
+    <id>1</id>
+    <revision>
+      <id>1001</id>
+      <text>[[Category:Groonga]]</text>
+    </revision>
+  </page>
+</mediawiki>
+    XML
+    assert_equal(<<-GROONGA, convert(xml))
+load --table Pages
+[
+{"_key":1,"title":"Title","text":"[[Category:Groonga]]","categories":["Groonga"]}
 ]
     GROONGA
   end
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Download 



More information about the Groonga-commit mailing list
Back to archive index