null+****@clear*****
null+****@clear*****
2010年 6月 10日 (木) 16:33:59 JST
Kouhei Sutou 2010-06-10 07:33:59 +0000 (Thu, 10 Jun 2010)
New Revision: 8873b3d556a921f507b213f928aba9307608c4d1
Log:
report encoding difference between groonga and MeCab. #85
Modified files:
modules/tokenizers/mecab.c
Modified: modules/tokenizers/mecab.c (+41 -0)
===================================================================
--- modules/tokenizers/mecab.c 2010-06-10 04:19:42 +0000 (1d97708)
+++ modules/tokenizers/mecab.c 2010-06-10 07:33:59 +0000 (acd5b65)
@@ -158,12 +158,53 @@ mecab_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
return NULL;
}
+/* Report an error on ctx when no MeCab dictionary uses ctx's encoding.
+ * MeCab reports a charset exactly as it was spelled when the dictionary
+ * was built, so the common spellings of each name are all accepted. */
+static void
+check_mecab_dictionary_encoding(grn_ctx *ctx)
+{
+  static const char *const euc_jp[] = {"EUC-JP", "euc-jp", NULL};
+  static const char *const utf8[] = {"UTF-8", "utf-8", "UTF8", "utf8", NULL};
+  const char *const *charsets = NULL;
+  const char *const *charset;
+  const mecab_dictionary_info_t *dictionary;
+  grn_encoding encoding = GRN_CTX_GET_ENCODING(ctx);
+  int found = 0;
+  mecab_t *mecab = mecab_new(0, NULL);
+
+  if (!mecab) {
+    return; /* as before: the check is skipped when mecab_new() fails */
+  }
+  if (encoding == GRN_ENC_EUC_JP) {
+    charsets = euc_jp;
+  } else if (encoding == GRN_ENC_UTF8) {
+    charsets = utf8;
+  } /* any other encoding has no charset list here and never matches */
+  dictionary = mecab_dictionary_info(mecab);
+  for (; charsets && dictionary && !found; dictionary = dictionary->next) {
+    for (charset = charsets; *charset; charset++) {
+      if (strcmp(dictionary->charset, *charset) == 0) {
+        found = 1;
+      }
+    }
+  }
+  mecab_destroy(mecab);
+  if (!found) {
+    ERR(GRN_TOKENIZER_ERROR,
+        "MeCab has no dictionary that uses the context encoding: <%s>",
+        grn_enctostr(encoding));
+  }
+}
+
grn_rc
grn_module_init_mecab(grn_ctx *ctx)
{
sole_mecab = NULL;
CRITICAL_SECTION_INIT(sole_mecab_lock);
+ check_mecab_dictionary_encoding(ctx); /* may ERR(GRN_TOKENIZER_ERROR) */
+ /* NOTE(review): GRN_SUCCESS is returned even then -- confirm intended. */
return GRN_SUCCESS;
}