Kouhei Sutou
null+****@clear*****
Sun Feb 10 19:04:55 JST 2013
Kouhei Sutou 2013-02-10 19:04:55 +0900 (Sun, 10 Feb 2013) New Revision: 1a417b4d18a2d406cb57b565fa9136adff9b6dda https://github.com/groonga/groonga-normalizer-mysql/commit/1a417b4d18a2d406cb57b565fa9136adff9b6dda Log: Extract ctype-uca.c parser Added files: tool/parser.rb Modified files: tool/dump-difference-uca.rb Modified: tool/dump-difference-uca.rb (+5 -68) =================================================================== --- tool/dump-difference-uca.rb 2013-02-10 18:42:36 +0900 (9b201de) +++ tool/dump-difference-uca.rb 2013-02-10 19:04:55 +0900 (5ea50b6) @@ -14,82 +14,19 @@ # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -require "English" +$LOAD_PATH.unshift(File.dirname(__FILE__)) +require "parser" if ARGV.size != 1 puts("Usage: #{$0} MYSQL_SOURCE/strings/ctype-uca.c") exit(false) end -def code_point_to_utf8(code_point) - [code_point].pack("U") -end - -def utf8_to_code_point(utf8) - utf8.unpack("U") -end - -current_page = nil -pages = {} -in_length = false -lengths = [] -ARGF.each_line do |line| - case line - when / page(\d{3})data\[\]=/ - current_page = $1.to_i(16) - pages[current_page] = [] - when /^\s*0x(?:[\da-z]+)(?:,\s*0x(?:[\da-z]+))*,?$/i - next if current_page.nil? - weights = line.chomp.split(/,\s*/).collect do |component| - Integer(component) - end - pages[current_page].concat(weights) - when / uca_length\[256\]=/ - in_length = true - when /^\d+(?:,\d+)*,?$/ - next unless in_length - _lengths = line.chomp.split(/,/).collect {|length| Integer(length)} - lengths.concat(_lengths) - when /^\};$/ - current_page = nil - in_length = false - end -end - -pages.each do |page, flatten_weights| - weights = flatten_weights.each_slice(lengths[page]) - pages[page] = weights.with_index.collect do |weight, i| - if weight.all?(&:zero?) - weight = [0] - else - while weight.last.zero? 
- weight.pop - end - end - code_point = (page << 8) + i - { - :weight => weight, - :code_point => code_point, - :utf8 => code_point_to_utf8(code_point), - } - end -end - -sorted_pages = pages.sort_by do |page, characters| - page -end - -weight_based_characters = {} -sorted_pages.each do |page, characters| - characters.each do |character| - weight = character[:weight] - weight_based_characters[weight] ||= [] - weight_based_characters[weight] << character - end -end +parser = CTypeUCAParser.new +parser.parse(ARGF) n_idencials = 0 -weight_based_characters.each do |weight, characters| +parser.weight_based_characters.each do |weight, characters| next if characters.size == 1 n_idencials += 1 formatted_weight = weight.collect {|component| '%#07x' % component}.join(', ') Added: tool/parser.rb (+111 -0) 100644 =================================================================== --- /dev/null +++ tool/parser.rb 2013-02-10 19:04:55 +0900 (8846bfd) @@ -0,0 +1,111 @@ +# Copyright (C) 2013 Kouhei Sutou <kou �� clear-code.com> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Library General Public +# License as published by the Free Software Foundation; either +# version 2 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Library General Public License for more details. 
+# +# You should have received a copy of the GNU Library General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +require "English" + +module Unicode + module_function + def to_utf8(code_point) + [code_point].pack("U") + end + + def from_utf8(utf8) + utf8.unpack("U") + end +end + +class CTypeUCAParser + attr_reader :pages + def initialize + @pages = {} + @lengths = [] + end + + def parse(input) + parse_ctype_uca(input) + normalize_pages + sort_pages + end + + def weight_based_characters + weight_based_characters = {} + @sorted_pages.each do |page, characters| + characters.each do |character| + weight = character[:weight] + weight_based_characters[weight] ||= [] + weight_based_characters[weight] << character + end + end + weight_based_characters + end + + private + def parse_ctype_uca(input) + current_page = nil + in_length = false + input.each_line do |line| + case line + when / page(\d{3})data\[\]=/ + current_page = $1.to_i(16) + @pages[current_page] = [] + when /^\s*0x(?:[\da-z]+)(?:,\s*0x(?:[\da-z]+))*,?$/i + next if current_page.nil? + weights = line.chomp.split(/,\s*/).collect do |component| + Integer(component) + end + @pages[current_page].concat(weights) + when / uca_length\[256\]=/ + in_length = true + when /^\d+(?:,\d+)*,?$/ + next unless in_length + current_lengths = line.chomp.split(/,/).collect do |length| + Integer(length) + end + @lengths.concat(current_lengths) + when /^\};$/ + current_page = nil + in_length = false + end + end + end + + def normalize_pages + @pages.each do |page, flatten_weights| + weights = flatten_weights.each_slice(@lengths[page]) + @pages[page] = weights.with_index.collect do |weight, i| + if weight.all?(&:zero?) + weight = [0] + else + while weight.last.zero? 
+ weight.pop + end + end + code_point = (page << 8) + i + { + :weight => weight, + :code_point => code_point, + :utf8 => Unicode.to_utf8(code_point), + } + end + end + end + + def sort_pages + @sorted_pages = @pages.sort_by do |page, characters| + page + end + end +end -------------- next part -------------- HTML����������������������������...Download