[Groonga-commit] groonga/groonga-normalizer-mysql [master] Extract ctype-utf8.c parser

Back to archive index

Kouhei Sutou null+****@clear*****
Sun Feb 10 19:10:32 JST 2013


Kouhei Sutou	2013-02-10 19:10:32 +0900 (Sun, 10 Feb 2013)

  New Revision: bfe2b53e99be3ea6dc3fe292cfc6e3aa58d83dbf
  https://github.com/groonga/groonga-normalizer-mysql/commit/bfe2b53e99be3ea6dc3fe292cfc6e3aa58d83dbf

  Log:
    Extract ctype-utf8.c parser

  Modified files:
    tool/dump-difference-utf8.rb
    tool/parser.rb

  Modified: tool/dump-difference-utf8.rb (+6 -44)
===================================================================
--- tool/dump-difference-utf8.rb    2013-02-10 19:04:55 +0900 (2a1a2a0)
+++ tool/dump-difference-utf8.rb    2013-02-10 19:10:32 +0900 (5af6bb2)
@@ -14,58 +14,20 @@
 # License along with this library; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 
-require "English"
+$LOAD_PATH.unshift(File.dirname(__FILE__))
+require "parser"
 
 if ARGV.size != 1
   puts("Usage: #{$0} MYSQL_SOURCE/strings/ctype-utf8.c")
   exit(false)
 end
 
-def code_point_to_utf8(code_point)
-  [code_point].pack("U")
-end
-
-def utf8_to_code_point(utf8)
-  utf8.unpack("U")
-end
-
-current_plane = nil
-planes = {}
-ARGF.each_line do |line|
-  case line
-  when / plane(\d{2})\[\]=/
-    current_plane = $1.to_i(16)
-    planes[current_plane] = []
-  when /^\s*
-         \{0x([\da-z]+),0x([\da-z]+),0x([\da-z]+)\},
-         \s*
-         \{0x([\da-z]+),0x([\da-z]+),0x([\da-z]+)\},?$/ix
-    next if current_plane.nil?
-    parsed_characters = $LAST_MATCH_INFO.captures.collect do |value|
-      code_point_to_utf8(value.to_i(16))
-    end
-    upper1, lower1, sort1, upper2, lower2, sort2 = parsed_characters
-    characters = planes[current_plane]
-    characters << {:upper => upper1, :lower => lower1, :sort => sort1}
-    characters << {:upper => upper2, :lower => lower2, :sort => sort2}
-  when /^\};$/
-    current_plane = nil
-  end
-end
-
-planes.each do |plane, characters|
-  characters.each_with_index do |character, i|
-    character[:base] = code_point_to_utf8((plane << 8) + i)
-  end
-end
-
-sorted_planes = planes.sort_by do |plane, characters|
-  plane
-end
+parser = CTypeUTF8Parser.new
+parser.parse(ARGF)
 
 n_differences = 0
 n_expanded_sort_characters = 0
-sorted_planes.each do |plane, characters|
+parser.sorted_planes.each do |plane, characters|
   characters.each do |character|
     base = character[:base]
     upper = character[:upper]
@@ -75,7 +37,7 @@ sorted_planes.each do |plane, characters|
     n_differences += 1
     utf8s = [base, upper, lower, sort]
     formatted_code_points = utf8s.collect do |utf8|
-      "%#07x" % utf8_to_code_point(utf8)
+      "%#07x" % Unicode.from_utf8(utf8)
     end
     if sort.bytesize > base.bytesize
       n_expanded_sort_characters += 1

  Modified: tool/parser.rb (+51 -0)
===================================================================
--- tool/parser.rb    2013-02-10 19:04:55 +0900 (8846bfd)
+++ tool/parser.rb    2013-02-10 19:10:32 +0900 (400098d)
@@ -27,6 +27,57 @@ module Unicode
   end
 end
 
+class CTypeUTF8Parser
+  def initialize
+    @planes = {}
+  end
+
+  def parse(input)
+    parse_ctype_utf8(input)
+    normalize_planes
+  end
+
+  def sorted_planes
+    @planes.sort_by do |plane, characters|
+      plane
+    end
+  end
+
+  private
+  def parse_ctype_utf8(input)
+    current_plane = nil
+    input.each_line do |line|
+      case line
+      when / plane(\d{2})\[\]=/
+        current_plane = $1.to_i(16)
+        @planes[current_plane] = []
+      when /^\s*
+             \{0x([\da-z]+),0x([\da-z]+),0x([\da-z]+)\},
+             \s*
+             \{0x([\da-z]+),0x([\da-z]+),0x([\da-z]+)\},?$/ix
+        next if current_plane.nil?
+        parsed_characters = $LAST_MATCH_INFO.captures.collect do |value|
+          Unicode.to_utf8(value.to_i(16))
+        end
+        upper1, lower1, sort1, upper2, lower2, sort2 = parsed_characters
+        characters = @planes[current_plane]
+        characters << {:upper => upper1, :lower => lower1, :sort => sort1}
+        characters << {:upper => upper2, :lower => lower2, :sort => sort2}
+      when /^\};$/
+        current_plane = nil
+      end
+    end
+  end
+
+  def normalize_planes
+    @planes.each do |plane, characters|
+      characters.each_with_index do |character, i|
+        character[:base] = Unicode.to_utf8((plane << 8) + i)
+      end
+    end
+  end
+end
+
 class CTypeUCAParser
   attr_reader :pages
   def initialize
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Download 



More information about the Groonga-commit mailing list
Back to archive index