[Groonga-commit] groonga/groonga-normalizer-mysql [master] Extract ctype-uca.c parser

Back to archive index

Kouhei Sutou null+****@clear*****
Sun Feb 10 19:04:55 JST 2013


Kouhei Sutou	2013-02-10 19:04:55 +0900 (Sun, 10 Feb 2013)

  New Revision: 1a417b4d18a2d406cb57b565fa9136adff9b6dda
  https://github.com/groonga/groonga-normalizer-mysql/commit/1a417b4d18a2d406cb57b565fa9136adff9b6dda

  Log:
    Extract ctype-uca.c parser

  Added files:
    tool/parser.rb
  Modified files:
    tool/dump-difference-uca.rb

  Modified: tool/dump-difference-uca.rb (+5 -68)
===================================================================
--- tool/dump-difference-uca.rb    2013-02-10 18:42:36 +0900 (9b201de)
+++ tool/dump-difference-uca.rb    2013-02-10 19:04:55 +0900 (5ea50b6)
@@ -14,82 +14,19 @@
 # License along with this library; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 
-require "English"
+$LOAD_PATH.unshift(File.dirname(__FILE__))
+require "parser"
 
 if ARGV.size != 1
   puts("Usage: #{$0} MYSQL_SOURCE/strings/ctype-uca.c")
   exit(false)
 end
 
-def code_point_to_utf8(code_point)
-  [code_point].pack("U")
-end
-
-def utf8_to_code_point(utf8)
-  utf8.unpack("U")
-end
-
-current_page = nil
-pages = {}
-in_length = false
-lengths = []
-ARGF.each_line do |line|
-  case line
-  when / page(\d{3})data\[\]=/
-    current_page = $1.to_i(16)
-    pages[current_page] = []
-  when /^\s*0x(?:[\da-z]+)(?:,\s*0x(?:[\da-z]+))*,?$/i
-    next if current_page.nil?
-    weights = line.chomp.split(/,\s*/).collect do |component|
-      Integer(component)
-    end
-    pages[current_page].concat(weights)
-  when / uca_length\[256\]=/
-    in_length = true
-  when /^\d+(?:,\d+)*,?$/
-    next unless in_length
-    _lengths = line.chomp.split(/,/).collect {|length| Integer(length)}
-    lengths.concat(_lengths)
-  when /^\};$/
-    current_page = nil
-    in_length = false
-  end
-end
-
-pages.each do |page, flatten_weights|
-  weights = flatten_weights.each_slice(lengths[page])
-  pages[page] = weights.with_index.collect do |weight, i|
-    if weight.all?(&:zero?)
-      weight = [0]
-    else
-      while weight.last.zero?
-        weight.pop
-      end
-    end
-    code_point = (page << 8) + i
-    {
-      :weight     => weight,
-      :code_point => code_point,
-      :utf8       => code_point_to_utf8(code_point),
-    }
-  end
-end
-
-sorted_pages = pages.sort_by do |page, characters|
-  page
-end
-
-weight_based_characters = {}
-sorted_pages.each do |page, characters|
-  characters.each do |character|
-    weight = character[:weight]
-    weight_based_characters[weight] ||= []
-    weight_based_characters[weight] << character
-  end
-end
+parser = CTypeUCAParser.new
+parser.parse(ARGF)
 
 n_idencials = 0
-weight_based_characters.each do |weight, characters|
+parser.weight_based_characters.each do |weight, characters|
   next if characters.size == 1
   n_idencials += 1
   formatted_weight = weight.collect {|component| '%#07x' % component}.join(', ')

  Added: tool/parser.rb (+111 -0) 100644
===================================================================
--- /dev/null
+++ tool/parser.rb    2013-02-10 19:04:55 +0900 (8846bfd)
@@ -0,0 +1,111 @@
+# Copyright (C) 2013  Kouhei Sutou <kou �� clear-code.com>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Library General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Library General Public License for more details.
+#
+# You should have received a copy of the GNU Library General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+require "English"
+
+module Unicode
+  module_function
+  def to_utf8(code_point)
+    [code_point].pack("U")
+  end
+
+  def from_utf8(utf8)
+    utf8.unpack("U")
+  end
+end
+
+class CTypeUCAParser
+  attr_reader :pages
+  def initialize
+    @pages = {}
+    @lengths = []
+  end
+
+  def parse(input)
+    parse_ctype_uca(input)
+    normalize_pages
+    sort_pages
+  end
+
+  def weight_based_characters
+    weight_based_characters = {}
+    @sorted_pages.each do |page, characters|
+      characters.each do |character|
+        weight = character[:weight]
+        weight_based_characters[weight] ||= []
+        weight_based_characters[weight] << character
+      end
+    end
+    weight_based_characters
+  end
+
+  private
+  def parse_ctype_uca(input)
+    current_page = nil
+    in_length = false
+    input.each_line do |line|
+      case line
+      when / page(\d{3})data\[\]=/
+        current_page = $1.to_i(16)
+        @pages[current_page] = []
+      when /^\s*0x(?:[\da-z]+)(?:,\s*0x(?:[\da-z]+))*,?$/i
+        next if current_page.nil?
+        weights = line.chomp.split(/,\s*/).collect do |component|
+          Integer(component)
+        end
+        @pages[current_page].concat(weights)
+      when / uca_length\[256\]=/
+        in_length = true
+      when /^\d+(?:,\d+)*,?$/
+        next unless in_length
+        current_lengths = line.chomp.split(/,/).collect do |length|
+          Integer(length)
+        end
+        @lengths.concat(current_lengths)
+      when /^\};$/
+        current_page = nil
+        in_length = false
+      end
+    end
+  end
+
+  def normalize_pages
+    @pages.each do |page, flatten_weights|
+      weights = flatten_weights.each_slice(@lengths[page])
+      @pages[page] = weights.with_index.collect do |weight, i|
+        if weight.all?(&:zero?)
+          weight = [0]
+        else
+          while weight.last.zero?
+            weight.pop
+          end
+        end
+        code_point = (page << 8) + i
+        {
+          :weight     => weight,
+          :code_point => code_point,
+          :utf8       => Unicode.to_utf8(code_point),
+        }
+      end
+    end
+  end
+
+  def sort_pages
+    @sorted_pages =****@pages*****_by do |page, characters|
+      page
+    end
+  end
+end
-------------- next part --------------
HTML����������������������������...
Download 



More information about the Groonga-commit mailing list
Back to archive index