Kouhei Sutou
null+****@clear*****
Tue Jul 17 10:57:26 JST 2018
Kouhei Sutou 2018-07-17 10:57:26 +0900 (Tue, 17 Jul 2018) New Revision: ffc4672ec036d0c87b7fbde1d0bba09ef4ee9b31 https://github.com/groonga/groonga-normalizer-mysql/commit/ffc4672ec036d0c87b7fbde1d0bba09ef4ee9b31 Message: Extract common code Modified files: normalizers/mysql_unicode_ci_table.h tool/generate_uca_table.rb tool/parser.rb Modified: normalizers/mysql_unicode_ci_table.h (+1 -1) =================================================================== --- normalizers/mysql_unicode_ci_table.h 2018-04-28 21:09:57 +0900 (e3a6b4b) +++ normalizers/mysql_unicode_ci_table.h 2018-07-17 10:57:26 +0900 (be1384b) @@ -1551,7 +1551,7 @@ static uint32_t unicode_ci_page_ff[] = { 0x0fff8, 0x00000, 0x00000, 0x00000, 0x0fffc, 0x0fffd, 0x0fffe, 0x0ffff }; -static uint32_t *unicode_ci_table[256] = { +static uint32_t *unicode_ci_table[] = { unicode_ci_page_00, unicode_ci_page_01, unicode_ci_page_02, unicode_ci_page_03, unicode_ci_page_04, unicode_ci_page_05, Modified: tool/generate_uca_table.rb (+9 -35) =================================================================== --- tool/generate_uca_table.rb 2018-04-28 21:09:57 +0900 (4dcb14c) +++ tool/generate_uca_table.rb 2018-07-17 10:57:26 +0900 (82e4cba) @@ -1,7 +1,6 @@ #!/usr/bin/env ruby -# -*- coding: utf-8 -*- # -# Copyright (C) 2013-2015 Kouhei Sutou <kou ＠ clear-code.com> +# Copyright (C) 2013-2018 Kouhei Sutou <kou ＠ clear-code.com> # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Library General Public @@ -138,43 +137,18 @@ parser.weight_based_characters.each do |weight, characters| grouped_characters.concat(split_characters(characters)) end -GREEK_CAPITAL_UNICODE_RANGE = Unicode.from_utf8("Α")..Unicode.from_utf8("Ω") -def find_greek_capital_character(characters) - characters.find do |character| - GREEK_CAPITAL_UNICODE_RANGE.cover?(character[:code_point]) - end -end - -def find_representative_character(characters) - representative_character = nil - case 
characters.first[:utf8] - when "⺄", "⺇", "⺈", "⺊", "⺌", "⺗" - representative_character = characters.last - when "⺜", "⺝", "⺧", "⺫", "⺬", "⺮", "⺶", "⺻", "⺼", "⺽" - representative_character = characters[1] - when "⻆", "⻊", "⻏", "⻑", "⻕", "⻗", "⻝", "⻡", "⻣", "⻤" - representative_character = characters.last - when "⻱", "⼀", "⼆", "⼈" - representative_character = characters[1] - when "ぁ", "ぃ", "ぅ", "ぇ", "ぉ", "っ", "ゃ", "ゅ", "ょ", "ゎ" - representative_character = characters[1] unless @split_small_kana_p - else - representative_character ||= find_greek_capital_character(characters) - end - representative_character ||= characters.first - representative_character -end - target_pages = {} grouped_characters.each do |characters| + characters.extend(CharacterArray) next if characters.size == 1 - representative_character = find_representative_character(characters) - representative_code_point = representative_character[:code_point] + representative_character = + characters.find_representative_character(split_small_kana: @split_small_kana_p) + representative_code_point = representative_character.code_point rest_characters = characters.reject do |character| character == representative_character end rest_characters.each do |character| - code_point = character[:code_point] + code_point = character.code_point page = code_point >> 8 low_code = code_point & 0xff target_pages[page] ||= [nil] * 256 @@ -279,16 +253,16 @@ end puts(<<-PAGES_HEADER) -static uint32_t *#{variable_name_prefix}_table[#{parser.n_pages}] = { +static uint32_t *#{variable_name_prefix}_table[] = { PAGES_HEADER -pages = ["NULL"] * parser.n_pages +pages = [] sorted_target_pages.each do |page, characters| pages[page] = page_name(page) end lines = pages.each_slice(2).collect do |pages_group| formatted_pages = pages_group.collect do |page| - "%19s" % page + "%19s" % (page || "NULL") end " " + formatted_pages.join(", ") end Modified: tool/parser.rb (+69 -30) =================================================================== 
--- tool/parser.rb 2018-04-28 21:09:57 +0900 (cc0b2f0) +++ tool/parser.rb 2018-07-17 10:57:26 +0900 (c7ff979) @@ -1,4 +1,4 @@ -# Copyright (C) 2013 Kouhei Sutou <kou ＠ clear-code.com> +# Copyright (C) 2013-2018 Kouhei Sutou <kou ＠ clear-code.com> # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Library General Public @@ -27,6 +27,42 @@ module Unicode end end +class Character < Struct.new(:weights, + :code_point) + def utf8 + Unicode.to_utf8(code_point) + end +end + +module CharacterArray + def find_representative_character(options={}) + representative_character = nil + case first.utf8 + when "⺄", "⺇", "⺈", "⺊", "⺌", "⺗" + representative_character = last + when "⺜", "⺝", "⺧", "⺫", "⺬", "⺮", "⺶", "⺻", "⺼", "⺽" + representative_character = self[1] + when "⻆", "⻊", "⻏", "⻑", "⻕", "⻗", "⻝", "⻡", "⻣", "⻤" + representative_character = last + when "⻱", "⼀", "⼆", "⼈" + representative_character = self[1] + when "ぁ", "ぃ", "ぅ", "ぇ", "ぉ", "っ", "ゃ", "ゅ", "ょ", "ゎ" + representative_character = self[1] unless options[:split_small_kana] + else + representative_character ||= find_greek_capital_character + end + representative_character ||= first + representative_character + end + + GREEK_CAPITAL_UNICODE_RANGE = Unicode.from_utf8("Α")..Unicode.from_utf8("Ω") + def find_greek_capital_character + find do |character| + GREEK_CAPITAL_UNICODE_RANGE.cover?(character.code_point) + end + end +end + class CTypeUTF8Parser def initialize @pages = {} @@ -78,26 +114,21 @@ class CTypeUTF8Parser end end -class CTypeUCAParser - attr_reader :pages - def initialize(version=nil) - @version = version +class UCAParser + def initialize @pages = {} - @lengths = [] end - def parse(input) - parse_ctype_uca(input) - normalize_pages - end - - def weight_based_characters + def weight_based_characters(level) weight_based_characters = {} sorted_pages.each do |page, characters| characters.each do |character| - weight = character[:weight] - 
weight_based_characters[weight] ||= [] - weight_based_characters[weight] << character + weights = character.weights + target_weights = weights.collect do |weight| + weight[0, level] + end + weight_based_characters[target_weights] ||= [] + weight_based_characters[target_weights] << character end end weight_based_characters @@ -108,9 +139,22 @@ class CTypeUCAParser page end end +end - def n_pages - @lengths.size +class CTypeUCAParser < UCAParser + def initialize(version=nil) + super() + @version = version + @lengths = [] + end + + def parse(input) + parse_ctype_uca(input) + normalize_pages + end + + def weight_based_characters + super(1) end private @@ -162,21 +206,16 @@ class CTypeUCAParser def normalize_pages @pages.each do |page, flatten_weights| - weights = flatten_weights.each_slice(@lengths[page]) - @pages[page] = weights.with_index.collect do |weight, i| - if weight.all?(&:zero?) - weight = [0] - else - while weight.last.zero? - weight.pop - end + weights_set = flatten_weights.each_slice(@lengths[page]) + @pages[page] = weights_set.with_index.collect do |weights, i| + weights = weights.collect do |level1_weight| + [level1_weight] + end + while weights.last == [0] + weights.pop end code_point = (page << 8) + i - { - :weight => weight, - :code_point => code_point, - :utf8 => Unicode.to_utf8(code_point), - } + Character.new(weights, code_point) end end end -------------- next part -------------- HTMLの添付ファイルを保管しました... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180717/9bdc9b6f/attachment-0001.htm