Kouhei Sutou
null+****@clear*****
Sun Feb 10 19:10:32 JST 2013
Kouhei Sutou 2013-02-10 19:10:32 +0900 (Sun, 10 Feb 2013) New Revision: bfe2b53e99be3ea6dc3fe292cfc6e3aa58d83dbf https://github.com/groonga/groonga-normalizer-mysql/commit/bfe2b53e99be3ea6dc3fe292cfc6e3aa58d83dbf Log: Extract ctype-utf8.c parser Modified files: tool/dump-difference-utf8.rb tool/parser.rb Modified: tool/dump-difference-utf8.rb (+6 -44) =================================================================== --- tool/dump-difference-utf8.rb 2013-02-10 19:04:55 +0900 (2a1a2a0) +++ tool/dump-difference-utf8.rb 2013-02-10 19:10:32 +0900 (5af6bb2) @@ -14,58 +14,20 @@ # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -require "English" +$LOAD_PATH.unshift(File.dirname(__FILE__)) +require "parser" if ARGV.size != 1 puts("Usage: #{$0} MYSQL_SOURCE/strings/ctype-utf8.c") exit(false) end -def code_point_to_utf8(code_point) - [code_point].pack("U") -end - -def utf8_to_code_point(utf8) - utf8.unpack("U") -end - -current_plane = nil -planes = {} -ARGF.each_line do |line| - case line - when / plane(\d{2})\[\]=/ - current_plane = $1.to_i(16) - planes[current_plane] = [] - when /^\s* - \{0x([\da-z]+),0x([\da-z]+),0x([\da-z]+)\}, - \s* - \{0x([\da-z]+),0x([\da-z]+),0x([\da-z]+)\},?$/ix - next if current_plane.nil? 
- parsed_characters = $LAST_MATCH_INFO.captures.collect do |value| - code_point_to_utf8(value.to_i(16)) - end - upper1, lower1, sort1, upper2, lower2, sort2 = parsed_characters - characters = planes[current_plane] - characters << {:upper => upper1, :lower => lower1, :sort => sort1} - characters << {:upper => upper2, :lower => lower2, :sort => sort2} - when /^\};$/ - current_plane = nil - end -end - -planes.each do |plane, characters| - characters.each_with_index do |character, i| - character[:base] = code_point_to_utf8((plane << 8) + i) - end -end - -sorted_planes = planes.sort_by do |plane, characters| - plane -end +parser = CTypeUTF8Parser.new +parser.parse(ARGF) n_differences = 0 n_expanded_sort_characters = 0 -sorted_planes.each do |plane, characters| +parser.sorted_planes.each do |plane, characters| characters.each do |character| base = character[:base] upper = character[:upper] @@ -75,7 +37,7 @@ sorted_planes.each do |plane, characters| n_differences += 1 utf8s = [base, upper, lower, sort] formatted_code_points = utf8s.collect do |utf8| - "%#07x" % utf8_to_code_point(utf8) + "%#07x" % Unicode.from_utf8(utf8) end if sort.bytesize > base.bytesize n_expanded_sort_characters += 1 Modified: tool/parser.rb (+51 -0) =================================================================== --- tool/parser.rb 2013-02-10 19:04:55 +0900 (8846bfd) +++ tool/parser.rb 2013-02-10 19:10:32 +0900 (400098d) @@ -27,6 +27,57 @@ module Unicode end end +class CTypeUTF8Parser + def initialize + @planes = {} + end + + def parse(input) + parse_ctype_utf8(input) + normalize_planes + end + + def sorted_planes + @planes.sort_by do |plane, characters| + plane + end + end + + private + def parse_ctype_utf8(input) + current_plane = nil + input.each_line do |line| + case line + when / plane(\d{2})\[\]=/ + current_plane = $1.to_i(16) + @planes[current_plane] = [] + when /^\s* + \{0x([\da-z]+),0x([\da-z]+),0x([\da-z]+)\}, + \s* + \{0x([\da-z]+),0x([\da-z]+),0x([\da-z]+)\},?$/ix + next if 
current_plane.nil? + parsed_characters = $LAST_MATCH_INFO.captures.collect do |value| + Unicode.to_utf8(value.to_i(16)) + end + upper1, lower1, sort1, upper2, lower2, sort2 = parsed_characters + characters = @planes[current_plane] + characters << {:upper => upper1, :lower => lower1, :sort => sort1} + characters << {:upper => upper2, :lower => lower2, :sort => sort2} + when /^\};$/ + current_plane = nil + end + end + end + + def normalize_planes + @planes.each do |plane, characters| + characters.each_with_index do |character, i| + character[:base] = Unicode.to_utf8((plane << 8) + i) + end + end + end +end + class CTypeUCAParser attr_reader :pages def initialize -------------- next part -------------- HTMLの添付ファイルを保管しました...Download