Kouhei Sutou
null+****@clear*****
Wed Jul 18 15:08:13 JST 2018
Kouhei Sutou 2018-07-18 15:08:13 +0900 (Wed, 18 Jul 2018) New Revision: c056eefecd4d89f7108db543fa1425d5e027c6af https://github.com/groonga/groonga-normalizer-mysql/commit/c056eefecd4d89f7108db543fa1425d5e027c6af Message: Fix voiced sound mark case Modified files: normalizers/mysql_unicode_900_ja_as_cs_table.h test/suite/unicode_900/ja_as_cs/japanese_accent.expected tool/parser.rb Modified: normalizers/mysql_unicode_900_ja_as_cs_table.h (+9 -9) =================================================================== --- normalizers/mysql_unicode_900_ja_as_cs_table.h 2018-07-18 12:46:11 +0900 (729176c) +++ normalizers/mysql_unicode_900_ja_as_cs_table.h 2018-07-18 15:08:13 +0900 (88c13df) @@ -1227,17 +1227,17 @@ static uint32_t unicode_900_ja_as_cs_page_30[] = { 0x03090, 0x03091, 0x03092, 0x03093, 0x03094, 0x03095, 0x03096, 0x03097, 0x03098, 0x03099, 0x0309a, 0x0309b, 0x0309c, 0x0309d, 0x0309e, 0x0309f, 0x030a0, 0x03041, 0x03042, 0x03043, 0x03044, 0x03045, 0x03046, 0x03047, - 0x03048, 0x03049, 0x0304a, 0x0304b, 0x030ac, 0x0304d, 0x030ae, 0x0304f, - 0x030b0, 0x03051, 0x030b2, 0x03053, 0x030b4, 0x03055, 0x030b6, 0x03057, - 0x030b8, 0x03059, 0x030ba, 0x0305b, 0x030bc, 0x0305d, 0x030be, 0x0305f, - 0x030c0, 0x03061, 0x030c2, 0x03063, 0x03064, 0x030c5, 0x03066, 0x030c7, - 0x03068, 0x030c9, 0x0306a, 0x0306b, 0x0306c, 0x0306d, 0x0306e, 0x0306f, - 0x030d0, 0x030d1, 0x03072, 0x030d3, 0x030d4, 0x03075, 0x030d6, 0x030d7, - 0x03078, 0x030d9, 0x030da, 0x0307b, 0x030dc, 0x030dd, 0x0307e, 0x0307f, + 0x03048, 0x03049, 0x0304a, 0x0304b, 0x0304c, 0x0304d, 0x0304e, 0x0304f, + 0x03050, 0x03051, 0x03052, 0x03053, 0x03054, 0x03055, 0x03056, 0x03057, + 0x03058, 0x03059, 0x0305a, 0x0305b, 0x0305c, 0x0305d, 0x0305e, 0x0305f, + 0x03060, 0x03061, 0x03062, 0x03063, 0x03064, 0x03065, 0x03066, 0x03067, + 0x03068, 0x03069, 0x0306a, 0x0306b, 0x0306c, 0x0306d, 0x0306e, 0x0306f, + 0x03070, 0x03071, 0x03072, 0x03073, 0x03074, 0x03075, 0x03076, 0x03077, + 0x03078, 0x03079, 0x0307a, 0x0307b, 0x0307c, 0x0307d, 
0x0307e, 0x0307f, 0x03080, 0x03081, 0x03082, 0x03083, 0x03084, 0x03085, 0x03086, 0x03087, 0x03088, 0x03089, 0x0308a, 0x0308b, 0x0308c, 0x0308d, 0x0308e, 0x0308f, - 0x03090, 0x03091, 0x03092, 0x03093, 0x030f4, 0x03095, 0x03096, 0x030f7, - 0x030f8, 0x030f9, 0x030fa, 0x030fb, 0x030fc, 0x0309d, 0x030fe, 0x030ff + 0x03090, 0x03091, 0x03092, 0x03093, 0x03094, 0x03095, 0x03096, 0x030f7, + 0x030f8, 0x030f9, 0x030fa, 0x030fb, 0x030fc, 0x0309d, 0x0309e, 0x030ff }; static uint32_t unicode_900_ja_as_cs_page_31[] = { Modified: test/suite/unicode_900/ja_as_cs/japanese_accent.expected (+1 -1) =================================================================== --- test/suite/unicode_900/ja_as_cs/japanese_accent.expected 2018-07-18 12:46:11 +0900 (1268b2e) +++ test/suite/unicode_900/ja_as_cs/japanese_accent.expected 2018-07-18 15:08:13 +0900 (398123f) @@ -1,4 +1,4 @@ register normalizers/mysql [[0,0.0,0.0],true] normalize 'NormalizerMySQLUnicode900("locale", "ja", "weight_level", 3)' "ぱばパバ" -[[0,0.0,0.0],{"normalized":"ぱばパバ","types":[],"checks":[]}] +[[0,0.0,0.0],{"normalized":"ぱばぱば","types":[],"checks":[]}] Modified: tool/parser.rb (+62 -35) =================================================================== --- tool/parser.rb 2018-07-18 12:46:11 +0900 (8d083b8) +++ tool/parser.rb 2018-07-18 15:08:13 +0900 (c4940cb) @@ -200,6 +200,17 @@ class UCAParser end private + def remove_last_all_zero_weights(weights) + normalized_weights = [] + remove = true + weights.reverse_each do |weight| + next if remove and weight.all?(&:zero?) + remove = false + normalized_weights.unshift(weight) + end + normalized_weights + end + def weight_based_characters(level) sorted_pages =****@pages*****_by do |page, characters| page @@ -211,9 +222,7 @@ class UCAParser target_weights = weights.collect do |weight| weight[0, level] end - while target_weights.last and target_weights.last.all?(&:zero?) 
- target_weights.pop - end + target_weights = remove_last_all_zero_weights(target_weights) weight_based_characters[target_weights] ||= [] weight_based_characters[target_weights] << character end @@ -363,8 +372,6 @@ class ICUCollationCustomizationRuleParser nth_weight = @scanner[1].size elsif****@scann*****(/=/) type = :equal - elsif****@scann*****(/\//) - type = :expansion end break unless type @scanner.skip(/\s+/) @@ -373,6 +380,13 @@ class ICUCollationCustomizationRuleParser raise "Must be target string: #{@scanner.inspect}" end post_string = parse_prefix + if****@scann*****(/\//) + if post_string + post_string += parse_string + else + base_string += parse_string + end + end yield(Rule.new(type, base_string, target_string, @@ -431,27 +445,27 @@ end class UCA900Parser < UCAParser def initialize(options={}) super(options) - @tailoring = {} + @rules = {} end # Parse ICU Collation Customization syntax tailoring def parse_tailoring(input, locale) in_cldr_30 = false - rule_text = nil + tailoring = nil input.each_line do |line| case line when /#{Regexp.escape(locale)}_cldr_30\[\]/ in_cldr_30 = true - rule_text = "" + tailoring = "" when /"(.+)"(;)?/ raw_c_string = $1 semicolon = $2 next unless in_cldr_30 - rule_text << raw_c_string.gsub(/\\\\/, "\\") + tailoring << raw_c_string.gsub(/\\\\/, "\\") if semicolon == ";" - parse_icu_collation_cutomization_rule(rule_text) + parse_icu_collation_cutomization_ruleset(tailoring) break end end @@ -466,17 +480,17 @@ class UCA900Parser < UCAParser end private - def parse_icu_collation_cutomization_rule(rule_text) - parser = ICUCollationCustomizationRuleParser.new(rule_text) + def parse_icu_collation_cutomization_ruleset(tailoring) + parser = ICUCollationCustomizationRuleParser.new(tailoring) parser.parse do |rule| - next if rule.before_nth_weight next if rule.post_string + next if rule.before_nth_weight case rule.type when :greater, :equal - if****@tailo*****?(rule.base_string) + if****@rules*****?(rule.base_string) raise 
"Duplicated tailoring: #{rule.base_string}" end - @tailoring[rule.base_string] = { + @rules[rule.base_string] = { target: rule.target_string, nth_weight: rule.nth_weight, } @@ -536,38 +550,51 @@ class UCA900Parser < UCAParser def normalize_pages all_characters = {} + primary_weights = {} @pages.each do |page, weight_sets| @pages[page] = weight_sets.collect.with_index do |weights, i| + weights = remove_last_all_zero_weights(weights) code_point = (page << 8) + i character = Character.new(weights, code_point) all_characters[character.utf8] = character + primary_weights[weights[0]] ||= [] + primary_weights[weights[0]] << character character end end - all_characters.each do |utf8, character| - rule = @tailoring[utf8] - next if rule.nil? - target_character = all_characters[rule[:target]] + @rules.each do |utf8, rule| + next if utf8.size != 1 + base_character = all_characters[utf8] + if base_character.weights.size != 1 + raise "2 or more weights for base character isn't supported: <#{utf8}>" + end + target_base_character = all_characters[rule[:target]] + target_characters = primary_weights[target_base_character.weights[0]] if @options[:debug] - p [utf8, rule, character.weights, target_character.weights] + p [utf8, rule, base_character.weights, target_characters.collect(&:utf8)] end - nth_weight = rule[:nth_weight] - if nth_weight - character.weights.each_with_index do |weight, i| - weight.each_with_index do |w, j| - break if j >= nth_weight - target_character.weights[i][j] = w - end - if nth_weight > weight.size - weight << 0 - target_character.weights[i] << 1 + target_characters.each do |target_character| + if @options[:debug] + p [utf8, rule, base_character.weights, target_character.weights] + end + nth_weight = rule[:nth_weight] + if nth_weight + base_character.weights.each_with_index do |weight, i| + weight.each_with_index do |w, j| + break if j >= nth_weight + target_character.weights[i][j] = w + end + if nth_weight > weight.size + weight << 0 + 
target_character.weights[i] << 1 + end end + else + target_character.weights = base_character.weights + end + if @options[:debug] + p [utf8, rule, base_character.weights, target_character.weights] end - else - target_character.weights = character.weights - end - if @options[:debug] - p [utf8, rule, character.weights, target_character.weights] end end end -------------- next part -------------- HTMLの添付ファイルを保管しました... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180718/3157a78b/attachment-0001.htm