Kouhei Sutou
null+****@clear*****
Fri Apr 4 18:09:57 JST 2014
Kouhei Sutou 2014-04-04 18:09:57 +0900 (Fri, 04 Apr 2014) New Revision: d43b46b7e8894eb4156f0af1c2f4f21c863086c9 https://github.com/droonga/wikipedia-search/commit/d43b46b7e8894eb4156f0af1c2f4f21c863086c9 Message: Extract categories for drilldown Modified files: lib/wikipedia-search/groonga-converter.rb test/test-groonga-converter.rb Modified: lib/wikipedia-search/groonga-converter.rb (+30 -9) =================================================================== --- lib/wikipedia-search/groonga-converter.rb 2014-04-04 17:40:01 +0900 (0179a7b) +++ lib/wikipedia-search/groonga-converter.rb 2014-04-04 18:09:57 +0900 (aaf10a0) @@ -50,9 +50,7 @@ module WikipediaSearch push_stacks case name when "page" - @title = nil - @id = nil - @text = nil + @page = Page.new end end @@ -68,18 +66,19 @@ module WikipediaSearch @output.puts(",") end page = { - "_key" => @id, - "title" => @title, - "text" => shorten_text(@text), + "_key" => @page.id, + "title" => @page.title, + "text" => shorten_text(@page.text), + "categories" => @page.extract_categories, } @output.print(JSON.generate(page)) @n_records += 1 when "title" - @title = @text_stack.last + @page.title = @text_stack.last when "id" - @id ||= Integer(@text_stack.last) + @page.id ||= Integer(@text_stack.last) when "text" - @text = @text_stack.last + @page.text = @text_stack.last end pop_stacks end @@ -108,6 +107,28 @@ module WikipediaSearch text end end + + class Page + attr_accessor :id, :title, :text + def initialize + @id = nil + @title = nil + @text = nil + end + + def extract_categories + return [] if @text.nil? 
+ + categories = [] + @text.scan(/\[\[(.+?)\]\]/) do |link,| + case link + when /\ACategory:(.+?)(?:\|.*)?\z/ + categories << $1 + end + end + categories + end + end end end end Modified: test/test-groonga-converter.rb (+24 -3) =================================================================== --- test/test-groonga-converter.rb 2014-04-04 17:40:01 +0900 (6d6a291) +++ test/test-groonga-converter.rb 2014-04-04 18:09:57 +0900 (2e5d102) @@ -38,7 +38,7 @@ load --table Pages assert_equal(<<-GROONGA, convert(xml)) load --table Pages [ -{"_key":1,"title":"Title","text":"Text1 & Text2"} +{"_key":1,"title":"Title","text":"Text1 & Text2","categories":[]} ] GROONGA end @@ -67,7 +67,7 @@ load --table Pages assert_equal(<<-GROONGA, convert(xml, :max_n_records => 1)) load --table Pages [ -{"_key":1,"title":"Title1","text":"Text1"} +{"_key":1,"title":"Title1","text":"Text1","categories":[]} ] GROONGA end @@ -88,7 +88,28 @@ load --table Pages assert_equal(<<-GROONGA, convert(xml, :max_n_characters => 2)) load --table Pages [ -{"_key":1,"title":"Title","text":"Te"} +{"_key":1,"title":"Title","text":"Te","categories":[]} +] + GROONGA + end + + def test_categories + xml = <<-XML +<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/"> + <page> + <title>Title</title> + <id>1</id> + <revision> + <id>1001</id> + <text>[[Category:Groonga]]</text> + </revision> + </page> +</mediawiki> + XML + assert_equal(<<-GROONGA, convert(xml)) +load --table Pages +[ +{"_key":1,"title":"Title","text":"[[Category:Groonga]]","categories":["Groonga"]} ] GROONGA end -------------- next part -------------- HTML����������������������������...Download