Kouhei Sutou
null+****@clear*****
Mon Apr 22 17:06:32 JST 2013
Kouhei Sutou 2013-04-22 17:06:32 +0900 (Mon, 22 Apr 2013) New Revision: 1bdde8b40a40f514b9dde6ed50098badaaf11a68 https://github.com/groonga/groonga-normalizer-mysql/commit/1bdde8b40a40f514b9dde6ed50098badaaf11a68 Message: Support UCA table customization * Hiragana and Katakana case sensitive mode * Hiragana/Katakana with voiced sound mark sensitive mode * Hiragana/Katakana with semi-voiced sound mark sensitive mode [groonga-dev,01301] Suggested by Kouhei Tanabe. Thanks!!! Modified files: normalizers/Makefile.am tool/generate_uca_table.rb Modified: normalizers/Makefile.am (+16 -1) =================================================================== --- normalizers/Makefile.am 2013-04-21 10:05:19 +0900 (2fc9586) +++ normalizers/Makefile.am 2013-04-22 17:06:32 +0900 (c5c131c) @@ -39,7 +39,12 @@ ensure-mysql-source-dir: exit 1; \ fi -update-tables: update-general-ci-table update-unicode-ci-table +UPDATE_TABLES_TARGETS = \ + update-general-ci-table \ + update-unicode-ci-table \ + update-unicode-ci-except-kana-ci-kana-with-voiced-sound-mark-table + +update-tables: $(UPDATE_TABLES_TARGETS) update-general-ci-table: ensure-mysql-source-dir $(RUBY) \ @@ -52,3 +57,13 @@ update-unicode-ci-table: ensure-mysql-source-dir $(top_srcdir)/tool/generate_uca_table.rb \ $(MYSQL_SOURCE_DIR)/strings/ctype-uca.c > \ $(srcdir)/mysql_unicode_ci_table.h + +update-unicode-ci-except-kana-ci-kana-with-voiced-sound-mark-table: ensure-mysql-source-dir + $(RUBY) \ + $(top_srcdir)/tool/generate_uca_table.rb \ + --split-small-kana \ + --split-kana-with-voiced-sound-mark \ + --split-kana-with-semi-voiced-sound-mark \ + --suffix _except_kana_ci_kana_with_voiced_sound_mark \ + $(MYSQL_SOURCE_DIR)/strings/ctype-uca.c > \ + $(srcdir)/mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table.h Modified: tool/generate_uca_table.rb (+112 -7) =================================================================== --- tool/generate_uca_table.rb 2013-04-21 10:05:19 +0900 (ca7f5ff) +++ tool/generate_uca_table.rb 2013-04-22 17:06:32 +0900 (57dba4e) @@ -17,11 +17,51 @@ # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +require "optparse" + $LOAD_PATH.unshift(File.dirname(__FILE__)) require "parser" + �� suffix = "" + �� split_small_kana_p = false + �� split_kana_with_voiced_sound_mark_p = false + �� split_kana_with_semi_voiced_sound_mark_p = false + +option_parser = OptionParser.new +option_parser.banner += " MYSQL_SOURCE/strings/ctype-uca.c" + +option_parser.on("--suffix=SUFFIX", "Add SUFFIX to names") do |suffix| + @suffix = suffix +end + +option_parser.on("--[no-]split-small-kana", + "Split small hiragana (katakana) and " + + "large hiragana (katakana)", + "(#{@split_small_kana_p})") do |boolean| + @split_small_kana_p = boolean +end + +option_parser.on("--[no-]split-kana-with-voiced-sound-mark", + "Split hiragana (katakana) with voiced sound mark", + "(#{@split_kana_with_voiced_sound_mark})") do |boolean| + @split_kana_with_voiced_sound_mark_p = boolean +end + +option_parser.on("--[no-]split-kana-with-semi-voiced-sound-mark", + "Split hiragana (katakana) with semi-voiced sound mark", + "(#{@split_kana_with_semi_voiced_sound_mark})") do |boolean| + @split_kana_with_semi_voiced_sound_mark_p = boolean +end + +begin + option_parser.parse!(ARGV) +rescue OptionParser::Error + puts($!) + exit(false) +end + if ARGV.size != 1 - puts("Usage: #{$0} MYSQL_SOURCE/strings/ctype-uca.c") + puts(option_parser) exit(false) end @@ -32,6 +72,67 @@ File.open(ctype_uca_c_path) do |ctype_uca_c| parser.parse(ctype_uca_c) end +SMALL_KANAS = [ + "ぁ", "ぃ", "ぅ", "ぇ", "ぉ", + "っ", + "ゃ", "ゅ", "ょ", + "ゎ", + "ァ", "ィ", "ゥ", "ェ", "ォ", + "ッ", + "ャ", "ュ", "ョ", + "ヮ", + "ァ", "ィ", "ゥ", "ェ", "ォ", + "ッ", + "ャ", "ュ", "ョ", +] +def small_kana?(character) + SMALL_KANAS.include?(character[:utf8]) +end + +KANA_WITH_VOICED_SOUND_MARKS = [ + "が", "ぎ", "ぐ", "げ", "ご", + "ざ", "じ", "ず", "ぜ", "ぞ", + "だ", "ぢ", "づ", "で", "ど", + "ば", "び", "ぶ", "べ", "ぼ", + "ガ", "ギ", "グ", "ゲ", "ゴ", + "ザ", "ジ", "ズ", "ゼ", "ゾ", + "ダ", "ヂ", "ヅ", "デ", "ド", + "バ", "ビ", "ブ", "ベ", "ボ", +] +def kana_with_voiced_sound_mark?(character) + KANA_WITH_VOICED_SOUND_MARKS.include?(character[:utf8]) +end + +KANA_WITH_SEMI_VOICED_SOUND_MARKS = [ + "ぱ", "ぴ", "ぷ", "ぺ", "ぽ", + "パ", "ピ", "プ", "ペ", "ポ", +] +def kana_with_semi_voiced_sound_mark?(character) + KANA_WITH_SEMI_VOICED_SOUND_MARKS.include?(character[:utf8]) +end + +def split_characters(characters) + grouped_characters = characters.group_by do |character| + if @split_small_kana_p and small_kana?(character) + :small_kana + elsif @split_kana_with_voiced_sound_mark_p and + kana_with_voiced_sound_mark?(character) + :kana_with_voiced_sound_mark + elsif @split_kana_with_semi_voiced_sound_mark_p and + kana_with_semi_voiced_sound_mark?(character) + :kana_with_semi_voiced_sound_mark + else + :other + end + end + grouped_characters.values +end + +grouped_characters = [] +parser.weight_based_characters.each do |weight, characters| + grouped_characters.concat(split_characters(characters)) +end + GREEK_CAPITAL_UNICODE_RANGE = Unicode.from_utf8("Α")..Unicode.from_utf8("Ω") def find_greek_capital_character(characters) characters.find do |character| @@ -51,7 +152,7 @@ def find_representative_character(characters) when "⻱", "⼀", "⼆", "⼈" representative_character = characters[1] when "ぁ", "ぃ", "ぅ", "ぇ", "ぉ", "っ", "ゃ", "ゅ", "ょ", "ゎ" - representative_character = characters[1] + representative_character = characters[1] unless @split_small_kana_p else representative_character ||= find_greek_capital_character(characters) end @@ -60,7 +161,7 @@ def find_representative_character(characters) end target_pages = {} -parser.weight_based_characters.each do |weight, characters| +grouped_characters.each do |characters| next if characters.size == 1 representative_character = find_representative_character(characters) representative_code_point = representative_character[:code_point] @@ -80,8 +181,12 @@ sorted_target_pages = target_pages.sort_by do |page, code_points| page end + normalized_ctype_uca_c_path = ctype_uca_c_path.sub(/\A.*\/([^\/]+\/strings\/ctype-uca\.c)\z/, "\\1") + + �� suffix_upper_case =****@suffi***** + puts(<<-HEADER) /* Copyright(C) 2013 Kouhei Sutou <kou �� clear-code.com> @@ -126,14 +231,14 @@ puts(<<-HEADER) Written by Alexander Barkov <bar �� mysql.com> */ -#ifndef MYSQL_UCA_H -#define MYSQL_UCA_H +#ifndef MYSQL_UCA#{@suffix_upper_case}_H +#define MYSQL_UCA#{@suffix_upper_case}_H #include <stdint.h> HEADER def page_name(page) - "unicode_ci_page_%02x" % page + "unicode_ci#{@suffix}_page_%02x" % page end sorted_target_pages.each do |page, characters| @@ -156,7 +261,7 @@ end puts(<<-PAGES_HEADER) -static uint32_t *unicode_ci_table[256] = { +static uint32_t *unicode_ci#{@suffix}_table[256] = { PAGES_HEADER pages = ["NULL"] * 256 -------------- next part -------------- HTML����������������������������...Download