[Groonga-commit] groonga/groonga-normalizer-mysql at ffc4672 [master] Extract common code

Back to archive index

Kouhei Sutou null+****@clear*****
Tue Jul 17 10:57:26 JST 2018


Kouhei Sutou	2018-07-17 10:57:26 +0900 (Tue, 17 Jul 2018)

  New Revision: ffc4672ec036d0c87b7fbde1d0bba09ef4ee9b31
  https://github.com/groonga/groonga-normalizer-mysql/commit/ffc4672ec036d0c87b7fbde1d0bba09ef4ee9b31

  Message:
    Extract common code

  Modified files:
    normalizers/mysql_unicode_ci_table.h
    tool/generate_uca_table.rb
    tool/parser.rb

  Modified: normalizers/mysql_unicode_ci_table.h (+1 -1)
===================================================================
--- normalizers/mysql_unicode_ci_table.h    2018-04-28 21:09:57 +0900 (e3a6b4b)
+++ normalizers/mysql_unicode_ci_table.h    2018-07-17 10:57:26 +0900 (be1384b)
@@ -1551,7 +1551,7 @@ static uint32_t unicode_ci_page_ff[] = {
   0x0fff8, 0x00000, 0x00000, 0x00000, 0x0fffc, 0x0fffd, 0x0fffe, 0x0ffff
 };
 
-static uint32_t *unicode_ci_table[256] = {
+static uint32_t *unicode_ci_table[] = {
    unicode_ci_page_00,  unicode_ci_page_01,
    unicode_ci_page_02,  unicode_ci_page_03,
    unicode_ci_page_04,  unicode_ci_page_05,

  Modified: tool/generate_uca_table.rb (+9 -35)
===================================================================
--- tool/generate_uca_table.rb    2018-04-28 21:09:57 +0900 (4dcb14c)
+++ tool/generate_uca_table.rb    2018-07-17 10:57:26 +0900 (82e4cba)
@@ -1,7 +1,6 @@
 #!/usr/bin/env ruby
-# -*- coding: utf-8 -*-
 #
-# Copyright (C) 2013-2015  Kouhei Sutou <kou at clear-code.com>
+# Copyright (C) 2013-2018  Kouhei Sutou <kou at clear-code.com>
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Library General Public
@@ -138,43 +137,18 @@ parser.weight_based_characters.each do |weight, characters|
   grouped_characters.concat(split_characters(characters))
 end
 
-GREEK_CAPITAL_UNICODE_RANGE = Unicode.from_utf8("Α")..Unicode.from_utf8("Ω")
-def find_greek_capital_character(characters)
-  characters.find do |character|
-    GREEK_CAPITAL_UNICODE_RANGE.cover?(character[:code_point])
-  end
-end
-
-def find_representative_character(characters)
-  representative_character = nil
-  case characters.first[:utf8]
-  when "⺄", "⺇", "⺈", "⺊", "⺌", "⺗"
-    representative_character = characters.last
-  when "⺜", "⺝", "⺧", "⺫", "⺬", "⺮", "⺶", "⺻", "⺼", "⺽"
-    representative_character = characters[1]
-  when "⻆", "⻊", "⻏", "⻑", "⻕", "⻗", "⻝", "⻡", "⻣", "⻤"
-    representative_character = characters.last
-  when "⻱", "⼀", "⼆", "⼈"
-    representative_character = characters[1]
-  when "ぁ", "ぃ", "ぅ", "ぇ", "ぉ", "っ", "ゃ", "ゅ", "ょ", "ゎ"
-    representative_character = characters[1] unless @split_small_kana_p
-  else
-    representative_character ||= find_greek_capital_character(characters)
-  end
-  representative_character ||= characters.first
-  representative_character
-end
-
 target_pages = {}
 grouped_characters.each do |characters|
+  characters.extend(CharacterArray)
   next if characters.size == 1
-  representative_character = find_representative_character(characters)
-  representative_code_point = representative_character[:code_point]
+  representative_character =
+    characters.find_representative_character(split_small_kana: @split_small_kana_p)
+  representative_code_point = representative_character.code_point
   rest_characters = characters.reject do |character|
     character == representative_character
   end
   rest_characters.each do |character|
-    code_point = character[:code_point]
+    code_point = character.code_point
     page = code_point >> 8
     low_code = code_point & 0xff
     target_pages[page] ||= [nil] * 256
@@ -279,16 +253,16 @@ end
 
 puts(<<-PAGES_HEADER)
 
-static uint32_t *#{variable_name_prefix}_table[#{parser.n_pages}] = {
+static uint32_t *#{variable_name_prefix}_table[] = {
 PAGES_HEADER
 
-pages = ["NULL"] * parser.n_pages
+pages = []
 sorted_target_pages.each do |page, characters|
   pages[page] = page_name(page)
 end
 lines = pages.each_slice(2).collect do |pages_group|
   formatted_pages = pages_group.collect do |page|
-    "%19s" % page
+    "%19s" % (page || "NULL")
   end
   "  " + formatted_pages.join(", ")
 end

  Modified: tool/parser.rb (+69 -30)
===================================================================
--- tool/parser.rb    2018-04-28 21:09:57 +0900 (cc0b2f0)
+++ tool/parser.rb    2018-07-17 10:57:26 +0900 (c7ff979)
@@ -1,4 +1,4 @@
-# Copyright (C) 2013  Kouhei Sutou <kou at clear-code.com>
+# Copyright (C) 2013-2018  Kouhei Sutou <kou at clear-code.com>
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Library General Public
@@ -27,6 +27,42 @@ module Unicode
   end
 end
 
+class Character < Struct.new(:weights,
+                             :code_point)
+  def utf8
+    Unicode.to_utf8(code_point)
+  end
+end
+
+module CharacterArray
+  def find_representative_character(options={})
+    representative_character = nil
+    case first.utf8
+    when "⺄", "⺇", "⺈", "⺊", "⺌", "⺗"
+      representative_character = last
+    when "⺜", "⺝", "⺧", "⺫", "⺬", "⺮", "⺶", "⺻", "⺼", "⺽"
+      representative_character = self[1]
+    when "⻆", "⻊", "⻏", "⻑", "⻕", "⻗", "⻝", "⻡", "⻣", "⻤"
+      representative_character = last
+    when "⻱", "⼀", "⼆", "⼈"
+      representative_character = self[1]
+    when "ぁ", "ぃ", "ぅ", "ぇ", "ぉ", "っ", "ゃ", "ゅ", "ょ", "ゎ"
+      representative_character = self[1] unless options[:split_small_kana]
+    else
+      representative_character ||= find_greek_capital_character
+    end
+    representative_character ||= first
+    representative_character
+  end
+
+  GREEK_CAPITAL_UNICODE_RANGE = Unicode.from_utf8("Α")..Unicode.from_utf8("Ω")
+  def find_greek_capital_character
+    find do |character|
+      GREEK_CAPITAL_UNICODE_RANGE.cover?(character.code_point)
+    end
+  end
+end
+
 class CTypeUTF8Parser
   def initialize
     @pages = {}
@@ -78,26 +114,21 @@ class CTypeUTF8Parser
   end
 end
 
-class CTypeUCAParser
-  attr_reader :pages
-  def initialize(version=nil)
-    @version = version
+class UCAParser
+  def initialize
     @pages = {}
-    @lengths = []
   end
 
-  def parse(input)
-    parse_ctype_uca(input)
-    normalize_pages
-  end
-
-  def weight_based_characters
+  def weight_based_characters(level)
     weight_based_characters = {}
     sorted_pages.each do |page, characters|
       characters.each do |character|
-        weight = character[:weight]
-        weight_based_characters[weight] ||= []
-        weight_based_characters[weight] << character
+        weights = character.weights
+        target_weights = weights.collect do |weight|
+          weight[0, level]
+        end
+        weight_based_characters[target_weights] ||= []
+        weight_based_characters[target_weights] << character
       end
     end
     weight_based_characters
@@ -108,9 +139,22 @@ class CTypeUCAParser
       page
     end
   end
+end
 
-  def n_pages
-    @lengths.size
+class CTypeUCAParser < UCAParser
+  def initialize(version=nil)
+    super()
+    @version = version
+    @lengths = []
+  end
+
+  def parse(input)
+    parse_ctype_uca(input)
+    normalize_pages
+  end
+
+  def weight_based_characters
+    super(1)
   end
 
   private
@@ -162,21 +206,16 @@ class CTypeUCAParser
 
   def normalize_pages
     @pages.each do |page, flatten_weights|
-      weights = flatten_weights.each_slice(@lengths[page])
-      @pages[page] = weights.with_index.collect do |weight, i|
-        if weight.all?(&:zero?)
-          weight = [0]
-        else
-          while weight.last.zero?
-            weight.pop
-          end
+      weights_set = flatten_weights.each_slice(@lengths[page])
+      @pages[page] = weights_set.with_index.collect do |weights, i|
+        weights = weights.collect do |level1_weight|
+          [level1_weight]
+        end
+        while weights.last == [0]
+          weights.pop
         end
         code_point = (page << 8) + i
-        {
-          :weight     => weight,
-          :code_point => code_point,
-          :utf8       => Unicode.to_utf8(code_point),
-        }
+        Character.new(weights, code_point)
       end
     end
   end
-------------- next part --------------
HTMLの添付ファイルを保管しました...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180717/9bdc9b6f/attachment-0001.htm 



More information about the Groonga-commit mailing list
Back to archive index