[Groonga-commit] groonga/groonga-normalizer-mysql at c056eef [master] Fix voiced sound mark case

Back to archive index

Kouhei Sutou null+****@clear*****
Wed Jul 18 15:08:13 JST 2018


Kouhei Sutou	2018-07-18 15:08:13 +0900 (Wed, 18 Jul 2018)

  New Revision: c056eefecd4d89f7108db543fa1425d5e027c6af
  https://github.com/groonga/groonga-normalizer-mysql/commit/c056eefecd4d89f7108db543fa1425d5e027c6af

  Message:
    Fix voiced sound mark case

  Modified files:
    normalizers/mysql_unicode_900_ja_as_cs_table.h
    test/suite/unicode_900/ja_as_cs/japanese_accent.expected
    tool/parser.rb

  Modified: normalizers/mysql_unicode_900_ja_as_cs_table.h (+9 -9)
===================================================================
--- normalizers/mysql_unicode_900_ja_as_cs_table.h    2018-07-18 12:46:11 +0900 (729176c)
+++ normalizers/mysql_unicode_900_ja_as_cs_table.h    2018-07-18 15:08:13 +0900 (88c13df)
@@ -1227,17 +1227,17 @@ static uint32_t unicode_900_ja_as_cs_page_30[] = {
   0x03090, 0x03091, 0x03092, 0x03093, 0x03094, 0x03095, 0x03096, 0x03097,
   0x03098, 0x03099, 0x0309a, 0x0309b, 0x0309c, 0x0309d, 0x0309e, 0x0309f,
   0x030a0, 0x03041, 0x03042, 0x03043, 0x03044, 0x03045, 0x03046, 0x03047,
-  0x03048, 0x03049, 0x0304a, 0x0304b, 0x030ac, 0x0304d, 0x030ae, 0x0304f,
-  0x030b0, 0x03051, 0x030b2, 0x03053, 0x030b4, 0x03055, 0x030b6, 0x03057,
-  0x030b8, 0x03059, 0x030ba, 0x0305b, 0x030bc, 0x0305d, 0x030be, 0x0305f,
-  0x030c0, 0x03061, 0x030c2, 0x03063, 0x03064, 0x030c5, 0x03066, 0x030c7,
-  0x03068, 0x030c9, 0x0306a, 0x0306b, 0x0306c, 0x0306d, 0x0306e, 0x0306f,
-  0x030d0, 0x030d1, 0x03072, 0x030d3, 0x030d4, 0x03075, 0x030d6, 0x030d7,
-  0x03078, 0x030d9, 0x030da, 0x0307b, 0x030dc, 0x030dd, 0x0307e, 0x0307f,
+  0x03048, 0x03049, 0x0304a, 0x0304b, 0x0304c, 0x0304d, 0x0304e, 0x0304f,
+  0x03050, 0x03051, 0x03052, 0x03053, 0x03054, 0x03055, 0x03056, 0x03057,
+  0x03058, 0x03059, 0x0305a, 0x0305b, 0x0305c, 0x0305d, 0x0305e, 0x0305f,
+  0x03060, 0x03061, 0x03062, 0x03063, 0x03064, 0x03065, 0x03066, 0x03067,
+  0x03068, 0x03069, 0x0306a, 0x0306b, 0x0306c, 0x0306d, 0x0306e, 0x0306f,
+  0x03070, 0x03071, 0x03072, 0x03073, 0x03074, 0x03075, 0x03076, 0x03077,
+  0x03078, 0x03079, 0x0307a, 0x0307b, 0x0307c, 0x0307d, 0x0307e, 0x0307f,
   0x03080, 0x03081, 0x03082, 0x03083, 0x03084, 0x03085, 0x03086, 0x03087,
   0x03088, 0x03089, 0x0308a, 0x0308b, 0x0308c, 0x0308d, 0x0308e, 0x0308f,
-  0x03090, 0x03091, 0x03092, 0x03093, 0x030f4, 0x03095, 0x03096, 0x030f7,
-  0x030f8, 0x030f9, 0x030fa, 0x030fb, 0x030fc, 0x0309d, 0x030fe, 0x030ff
+  0x03090, 0x03091, 0x03092, 0x03093, 0x03094, 0x03095, 0x03096, 0x030f7,
+  0x030f8, 0x030f9, 0x030fa, 0x030fb, 0x030fc, 0x0309d, 0x0309e, 0x030ff
 };
 
 static uint32_t unicode_900_ja_as_cs_page_31[] = {

  Modified: test/suite/unicode_900/ja_as_cs/japanese_accent.expected (+1 -1)
===================================================================
--- test/suite/unicode_900/ja_as_cs/japanese_accent.expected    2018-07-18 12:46:11 +0900 (1268b2e)
+++ test/suite/unicode_900/ja_as_cs/japanese_accent.expected    2018-07-18 15:08:13 +0900 (398123f)
@@ -1,4 +1,4 @@
 register normalizers/mysql
 [[0,0.0,0.0],true]
 normalize   'NormalizerMySQLUnicode900("locale", "ja",                              "weight_level", 3)'   "ぱばパバ"
-[[0,0.0,0.0],{"normalized":"ぱばパバ","types":[],"checks":[]}]
+[[0,0.0,0.0],{"normalized":"ぱばぱば","types":[],"checks":[]}]

  Modified: tool/parser.rb (+62 -35)
===================================================================
--- tool/parser.rb    2018-07-18 12:46:11 +0900 (8d083b8)
+++ tool/parser.rb    2018-07-18 15:08:13 +0900 (c4940cb)
@@ -200,6 +200,17 @@ class UCAParser
   end
 
   private
+  def remove_last_all_zero_weights(weights)
+    normalized_weights = []
+    remove = true
+    weights.reverse_each do |weight|
+      next if remove and weight.all?(&:zero?)
+      remove = false
+      normalized_weights.unshift(weight)
+    end
+    normalized_weights
+  end
+
   def weight_based_characters(level)
     sorted_pages = @pages.sort_by do |page, characters|
       page
@@ -211,9 +222,7 @@ class UCAParser
         target_weights = weights.collect do |weight|
           weight[0, level]
         end
-        while target_weights.last and target_weights.last.all?(&:zero?)
-          target_weights.pop
-        end
+        target_weights = remove_last_all_zero_weights(target_weights)
         weight_based_characters[target_weights] ||= []
         weight_based_characters[target_weights] << character
       end
@@ -363,8 +372,6 @@ class ICUCollationCustomizationRuleParser
           nth_weight = @scanner[1].size
         elsif @scanner.scan(/=/)
           type = :equal
-        elsif @scanner.scan(/\//)
-          type = :expansion
         end
         break unless type
         @scanner.skip(/\s+/)
@@ -373,6 +380,13 @@ class ICUCollationCustomizationRuleParser
           raise "Must be target string: #{@scanner.inspect}"
         end
         post_string = parse_prefix
+        if @scanner.scan(/\//)
+          if post_string
+            post_string += parse_string
+          else
+            base_string += parse_string
+          end
+        end
         yield(Rule.new(type,
                        base_string,
                        target_string,
@@ -431,27 +445,27 @@ end
 class UCA900Parser < UCAParser
   def initialize(options={})
     super(options)
-    @tailoring = {}
+    @rules = {}
   end
 
   # Parse ICU Collation Customization syntax tailoring
   def parse_tailoring(input, locale)
     in_cldr_30 = false
-    rule_text = nil
+    tailoring = nil
     input.each_line do |line|
       case line
       when /#{Regexp.escape(locale)}_cldr_30\[\]/
         in_cldr_30 = true
-        rule_text = ""
+        tailoring = ""
       when /"(.+)"(;)?/
         raw_c_string = $1
         semicolon = $2
         next unless in_cldr_30
 
-        rule_text << raw_c_string.gsub(/\\\\/, "\\")
+        tailoring << raw_c_string.gsub(/\\\\/, "\\")
 
         if semicolon == ";"
-          parse_icu_collation_cutomization_rule(rule_text)
+          parse_icu_collation_cutomization_ruleset(tailoring)
           break
         end
       end
@@ -466,17 +480,17 @@ class UCA900Parser < UCAParser
   end
 
   private
-  def parse_icu_collation_cutomization_rule(rule_text)
-    parser = ICUCollationCustomizationRuleParser.new(rule_text)
+  def parse_icu_collation_cutomization_ruleset(tailoring)
+    parser = ICUCollationCustomizationRuleParser.new(tailoring)
     parser.parse do |rule|
-      next if rule.before_nth_weight
       next if rule.post_string
+      next if rule.before_nth_weight
       case rule.type
       when :greater, :equal
-        if @tailoring.key?(rule.base_string)
+        if @rules.key?(rule.base_string)
           raise "Duplicated tailoring: #{rule.base_string}"
         end
-        @tailoring[rule.base_string] = {
+        @rules[rule.base_string] = {
           target: rule.target_string,
           nth_weight: rule.nth_weight,
         }
@@ -536,38 +550,51 @@ class UCA900Parser < UCAParser
 
   def normalize_pages
     all_characters = {}
+    primary_weights = {}
     @pages.each do |page, weight_sets|
       @pages[page] = weight_sets.collect.with_index do |weights, i|
+        weights = remove_last_all_zero_weights(weights)
         code_point = (page << 8) + i
         character = Character.new(weights, code_point)
         all_characters[character.utf8] = character
+        primary_weights[weights[0]] ||= []
+        primary_weights[weights[0]] << character
         character
       end
     end
-    all_characters.each do |utf8, character|
-      rule = @tailoring[utf8]
-      next if rule.nil?
-      target_character = all_characters[rule[:target]]
+    @rules.each do |utf8, rule|
+      next if utf8.size != 1
+      base_character = all_characters[utf8]
+      if base_character.weights.size != 1
+        raise "2 or more weights for base character isn't supported: <#{utf8}>"
+      end
+      target_base_character = all_characters[rule[:target]]
+      target_characters = primary_weights[target_base_character.weights[0]]
       if @options[:debug]
-        p [utf8, rule, character.weights, target_character.weights]
+        p [utf8, rule, base_character.weights, target_characters.collect(&:utf8)]
       end
-      nth_weight = rule[:nth_weight]
-      if nth_weight
-        character.weights.each_with_index do |weight, i|
-          weight.each_with_index do |w, j|
-            break if j >= nth_weight
-            target_character.weights[i][j] = w
-          end
-          if nth_weight > weight.size
-            weight << 0
-            target_character.weights[i] << 1
+      target_characters.each do |target_character|
+        if @options[:debug]
+          p [utf8, rule, base_character.weights, target_character.weights]
+        end
+        nth_weight = rule[:nth_weight]
+        if nth_weight
+          base_character.weights.each_with_index do |weight, i|
+            weight.each_with_index do |w, j|
+              break if j >= nth_weight
+              target_character.weights[i][j] = w
+            end
+            if nth_weight > weight.size
+              weight << 0
+              target_character.weights[i] << 1
+            end
           end
+        else
+          target_character.weights = base_character.weights
+        end
+        if @options[:debug]
+          p [utf8, rule, base_character.weights, target_character.weights]
         end
-      else
-        target_character.weights = character.weights
-      end
-      if @options[:debug]
-        p [utf8, rule, character.weights, target_character.weights]
       end
     end
   end
-------------- next part --------------
HTMLの添付ファイルを保管しました...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180718/3157a78b/attachment-0001.htm 



More information about the Groonga-commit mailing list
Back to archive index