Kouhei Sutou
null+****@clear*****
Tue Jul 17 16:54:11 JST 2018
Kouhei Sutou 2018-07-17 16:54:11 +0900 (Tue, 17 Jul 2018) New Revision: e80523e6afb30fed3e66c7fdf45b4682a38e69c9 https://github.com/groonga/groonga-normalizer-mysql/commit/e80523e6afb30fed3e66c7fdf45b4682a38e69c9 Message: Regenerate existing tables Modified files: normalizers/mysql_unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_table.h normalizers/mysql_unicode_520_ci_table.h normalizers/mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table.h tool/generate_uca_table.rb tool/parser.rb Modified: normalizers/mysql_unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_table.h (+3 -389) =================================================================== --- normalizers/mysql_unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_table.h 2018-07-17 14:24:14 +0900 (6832d3a) +++ normalizers/mysql_unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_table.h 2018-07-17 16:54:11 +0900 (8521216) @@ -41,8 +41,7 @@ Written by Alexander Barkov <bar �� mysql.com> */ -#ifndef MYSQL_UCA_520_EXCEPT_KANA_CI_KANA_WITH_VOICED_SOUND_MARK_H -#define MYSQL_UCA_520_EXCEPT_KANA_CI_KANA_WITH_VOICED_SOUND_MARK_H +#pragma once #include <stdint.h> @@ -2846,7 +2845,7 @@ static uint32_t unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_page_e 0xe01f8, 0xe01f9, 0xe01fa, 0xe01fb, 0xe01fc, 0xe01fd, 0xe01fe, 0xe01ff }; -static uint32_t *unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_table[4352] = { +static uint32_t *unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_table[] = { unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_page_00, unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_page_01, unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_page_02, unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_page_03, unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_page_04, unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_page_05, @@ -4639,390 +4638,5 @@ static uint32_t 
*unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_table NULL, NULL, NULL, NULL, NULL, NULL, - unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_page_e00, unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_page_eunicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_page_e00, unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_page_e01 }; - -#endif Modified: normalizers/mysql_unicode_520_ci_table.h (+3 -389) =================================================================== --- normalizers/mysql_unicode_520_ci_table.h 2018-07-17 14:24:14 +0900 (22a56d7) +++ normalizers/mysql_unicode_520_ci_table.h 2018-07-17 16:54:11 +0900 (ab2e44c) @@ -41,8 +41,7 @@ Written by Alexander Barkov <bar �� mysql.com> */ -#ifndef MYSQL_UCA_520_H -#define MYSQL_UCA_520_H +#pragma once #include <stdint.h> @@ -2846,7 +2845,7 @@ static uint32_t unicode_520_ci_page_e01[] = { 0xe01f8, 0xe01f9, 0xe01fa, 0xe01fb, 0xe01fc, 0xe01fd, 0xe01fe, 0xe01ff }; -static uint32_t *unicode_520_ci_table[4352] = { +static uint32_t *unicode_520_ci_table[] = { unicode_520_ci_page_00, unicode_520_ci_page_01, unicode_520_ci_page_02, unicode_520_ci_page_03, unicode_520_ci_page_04, unicode_520_ci_page_05, @@ -4639,390 +4638,5 @@ static uint32_t *unicode_520_ci_table[4352] = { NULL, NULL, NULL, NULL, NULL, NULL, - unicode_520_ci_page_e00, unicode_520_ci_page_e01, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - 
NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, 
- NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, 
NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL, - NULL, NULL + unicode_520_ci_page_e00, unicode_520_ci_page_e01 }; - -#endif Modified: normalizers/mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table.h (+2 -5) =================================================================== --- normalizers/mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table.h 2018-07-17 14:24:14 +0900 (7153cb1) +++ normalizers/mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table.h 2018-07-17 16:54:11 +0900 (b641d76) @@ -41,8 +41,7 @@ Written by Alexander Barkov <bar �� mysql.com> */ -#ifndef MYSQL_UCA_EXCEPT_KANA_CI_KANA_WITH_VOICED_SOUND_MARK_H -#define MYSQL_UCA_EXCEPT_KANA_CI_KANA_WITH_VOICED_SOUND_MARK_H +#pragma once #include <stdint.h> @@ -1551,7 +1550,7 @@ static uint32_t unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_page_ff[] 0x0fff8, 0x00000, 0x00000, 0x00000, 0x0fffc, 0x0fffd, 0x0fffe, 0x0ffff }; -static uint32_t *unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table[256] = { +static uint32_t *unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table[] = { unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_page_00, unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_page_01, unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_page_02, 
unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_page_03, unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_page_04, unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_page_05, @@ -1681,5 +1680,3 @@ static uint32_t *unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table[256 unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_page_fc, unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_page_fd, unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_page_fe, unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_page_ff }; - -#endif Modified: tool/generate_uca_table.rb (+14 -12) =================================================================== --- tool/generate_uca_table.rb 2018-07-17 14:24:14 +0900 (e88f3be) +++ tool/generate_uca_table.rb 2018-07-17 16:54:11 +0900 (c5d4221) @@ -23,9 +23,11 @@ require "parser" @version = nil @suffix = "" -@split_small_kana_p = false -@split_kana_with_voiced_sound_mark_p = false -@split_kana_with_semi_voiced_sound_mark_p = false +@options = { + split_small_kana: false, + split_kana_with_voiced_sound_mark: false, + split_kana_with_semi_voiced_sound_mark: false, +} option_parser = OptionParser.new option_parser.banner += " MYSQL_SOURCE/strings/ctype-uca.c" @@ -41,26 +43,26 @@ end option_parser.on("--[no-]split-small-kana", "Split small hiragana (katakana) and " + "large hiragana (katakana)", - "(#{@split_small_kana_p})") do |boolean| - @split_small_kana_p = boolean + "(#{@options[:split_small_kana]})") do |boolean| + @options[:split_small_kana] = boolean end option_parser.on("--[no-]split-kana-with-voiced-sound-mark", "Split hiragana (katakana) with voiced sound mark", - "(#{@split_kana_with_voiced_sound_mark})") do |boolean| - @split_kana_with_voiced_sound_mark_p = boolean + "(#{@options[:split_kana_with_voiced_sound_mark]})") do |boolean| + @options[:split_kana_with_voiced_sound_mark] = boolean end option_parser.on("--[no-]split-kana-with-semi-voiced-sound-mark", "Split hiragana (katakana) with
semi-voiced sound mark", - "(#{@split_kana_with_semi_voiced_sound_mark})") do |boolean| - @split_kana_with_semi_voiced_sound_mark_p = boolean + "(#{@options[:split_kana_with_semi_voiced_sound_mark]})") do |boolean| + @options[:split_kana_with_semi_voiced_sound_mark] = boolean end begin option_parser.parse!(ARGV) -rescue OptionParser::Error - puts($!) +rescue OptionParser::ParseError + $stderr.puts($!) exit(false) end @@ -76,7 +78,7 @@ File.open(ctype_uca_c_path) do |ctype_uca_c| parser.parse(ctype_uca_c) end -normalization_table = parser.normalization_table +normalization_table = parser.normalization_table(@options) normalized_ctype_uca_c_path = ctype_uca_c_path.sub(/\A.*\/([^\/]+\/strings\/ctype-uca\.c)\z/, "\\1") Modified: tool/parser.rb (+45 -43) =================================================================== --- tool/parser.rb 2018-07-17 14:24:14 +0900 (16a05f5) +++ tool/parser.rb 2018-07-17 16:54:11 +0900 (8d3eaf7) @@ -32,6 +32,45 @@ class Character < Struct.new(:weights, def utf8 Unicode.to_utf8(code_point) end + + SMALL_KANAS = [ + "ぁ", "ぃ", "ぅ", "ぇ", "ぉ", + "っ", + "ゃ", "ゅ", "ょ", + "ゎ", + "ァ", "ィ", "ゥ", "ェ", "ォ", + "ッ", + "ャ", "ュ", "ョ", + "ヮ", + "ァ", "ィ", "ゥ", "ェ", "ォ", + "ッ", + "ャ", "ュ", "ョ", + ] + def small_kana? + SMALL_KANAS.include?(utf8) + end + + KANA_WITH_VOICED_SOUND_MARKS = [ + "が", "ぎ", "ぐ", "げ", "ご", + "ざ", "じ", "ず", "ぜ", "ぞ", + "だ", "ぢ", "づ", "で", "ど", + "ば", "び", "ぶ", "べ", "ぼ", + "ガ", "ギ", "グ", "ゲ", "ゴ", + "ザ", "ジ", "ズ", "ゼ", "ゾ", + "ダ", "ヂ", "ヅ", "デ", "ド", + "バ", "ビ", "ブ", "ベ", "ボ", + ] + def kana_with_voiced_sound_mark? + KANA_WITH_VOICED_SOUND_MARKS.include?(utf8) + end + + KANA_WITH_SEMI_VOICED_SOUND_MARKS = [ + "ぱ", "ぴ", "ぷ", "ぺ", "ぽ", + "パ", "ピ", "プ", "ペ", "ポ", + ] + def kana_with_semi_voiced_sound_mark? 
+ KANA_WITH_SEMI_VOICED_SOUND_MARKS.include?(utf8) + end end module CharacterArray @@ -47,7 +86,9 @@ module CharacterArray when "⻱", "⼀", "⼆", "⼈" representative_character = self[1] when "ぁ", "ぃ", "ぅ", "ぇ", "ぉ", "っ", "ゃ", "ゅ", "ょ", "ゎ" - representative_character = self[1] unless options[:split_small_kana] + if options[:split_small_kana] == false + representative_character = self[1] + end else representative_character ||= find_greek_capital_character end @@ -174,54 +215,15 @@ class UCAParser grouped_characters end - SMALL_KANAS = [ - "ぁ", "ぃ", "ぅ", "ぇ", "ぉ", - "っ", - "ゃ", "ゅ", "ょ", - "ゎ", - "ァ", "ィ", "ゥ", "ェ", "ォ", - "ッ", - "ャ", "ュ", "ョ", - "ヮ", - "ァ", "ィ", "ゥ", "ェ", "ォ", - "ッ", - "ャ", "ュ", "ョ", - ] - def small_kana?(character) - SMALL_KANAS.include?(character[:utf8]) - end - - KANA_WITH_VOICED_SOUND_MARKS = [ - "が", "ぎ", "ぐ", "げ", "ご", - "ざ", "じ", "ず", "ぜ", "ぞ", - "だ", "ぢ", "づ", "で", "ど", - "ば", "び", "ぶ", "べ", "ぼ", - "ガ", "ギ", "グ", "ゲ", "ゴ", - "ザ", "ジ", "ズ", "ゼ", "ゾ", - "ダ", "ヂ", "ヅ", "デ", "ド", - "バ", "ビ", "ブ", "ベ", "ボ", - ] - def kana_with_voiced_sound_mark?(character) - KANA_WITH_VOICED_SOUND_MARKS.include?(character[:utf8]) - end - - KANA_WITH_SEMI_VOICED_SOUND_MARKS = [ - "ぱ", "ぴ", "ぷ", "ぺ", "ぽ", - "パ", "ピ", "プ", "ペ", "ポ", - ] - def kana_with_semi_voiced_sound_mark?(character) - KANA_WITH_SEMI_VOICED_SOUND_MARKS.include?(character[:utf8]) - end - def split_characters(characters, options) grouped_characters = characters.group_by do |character| - if options[:split_small_kana] and small_kana?(character) + if options[:split_small_kana] and character.small_kana? :small_kana elsif options[:split_kana_with_voiced_sound_mark] and - kana_with_voiced_sound_mark?(character) + character.kana_with_voiced_sound_mark? :kana_with_voiced_sound_mark elsif options[:split_kana_with_semi_voiced_sound_mark] and - kana_with_semi_voiced_sound_mark?(character) + character.kana_with_semi_voiced_sound_mark? 
:kana_with_semi_voiced_sound_mark else :other -------------- next part -------------- HTMLの添付ファイルを保管しました... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180717/13ce4bd4/attachment-0001.htm