null+****@clear*****
null+****@clear*****
2011年 12月 9日 (金) 10:44:29 JST
Susumu Yata 2011-12-09 01:44:29 +0000 (Fri, 09 Dec 2011)
New Revision: a39da9071ba08616d0b31a5c62f904f9c7c336a9
Log:
fixed some bugs and added comments.
Modified files:
plugins/tokenizers/mecab.c
Modified: plugins/tokenizers/mecab.c (+31 -10)
===================================================================
--- plugins/tokenizers/mecab.c 2011-12-08 02:11:40 +0000 (4d0e9a7)
+++ plugins/tokenizers/mecab.c 2011-12-09 01:44:29 +0000 (ddc8816)
@@ -1,5 +1,5 @@
/* -*- c-basic-offset: 2 -*- */
-/* Copyright(C) 2009-2010 Brazil
+/* Copyright(C) 2009-2011 Brazil
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -32,6 +32,10 @@
static mecab_t *sole_mecab;
static grn_critical_section sole_mecab_lock;
+/*
+ This macro is used only once.
+ Why not write its body directly at the call site?
+ */
#define SOLE_MECAB_CONFIRM do {\
if (!sole_mecab) {\
static char *argv[] = {"", "-Owakati"};\
@@ -49,6 +53,7 @@ static grn_critical_section sole_mecab_lock;
typedef struct {
grn_str *nstr;
mecab_t *mecab;
+ /* Why are these pointers unsigned? */
unsigned char *buf;
unsigned char *next;
unsigned char *end;
@@ -68,6 +73,12 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
grn_obj_flags table_flags;
grn_mecab_tokenizer *token;
unsigned int bufsize, maxtrial = 10, len;
+ /*
+ Should user_data->ptr be initialized with NULL?
+ How is an error detected? By checking user_data->ptr == NULL?
+ If mecab_next() and mecab_fin() are always called after mecab_init(),
+ a failure in mecab_init() may cause a critical error in them.
+ */
if (!(str = grn_ctx_pop(ctx))) {
ERR(GRN_INVALID_ARGUMENT, "missing argument");
return NULL;
@@ -80,9 +91,7 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
return NULL;
}
if (!(token = GRN_MALLOC(sizeof(grn_mecab_tokenizer)))) { return NULL; }
- user_data->ptr = token;
token->mecab = sole_mecab;
- // if (!(token->mecab = mecab_new3())) {
grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL);
nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
@@ -93,8 +102,9 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
}
len = token->nstr->norm_blen;
for (bufsize = len * 2 + 1; maxtrial; bufsize *= 2, maxtrial--) {
- if(!(buf = GRN_MALLOC(bufsize + 1))) {
+ if (!(buf = GRN_MALLOC(bufsize + 1))) {
GRN_LOG(ctx, GRN_LOG_ALERT, "buffer allocation on mecab_init failed !");
+ grn_str_close(ctx, token->nstr);
GRN_FREE(token);
return NULL;
}
@@ -110,15 +120,17 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
}
if (!s) {
ERR(GRN_TOKENIZER_ERROR, "mecab_sparse_tostr failed len=%d bufsize=%d err=%s",
- len, bufsize, mecab_err);
+ len, bufsize, mecab_err);
+ grn_str_close(ctx, token->nstr);
GRN_FREE(token);
return NULL;
}
- // certain version of mecab returns trailing lf or spaces.
+ /* A certain version of mecab returns trailing lf or spaces. */
for (p = buf + strlen(buf) - 1;
buf <= p && isspace(*(unsigned char *)p);
p--) { *p = '\0'; }
- //grn_log("sparsed='%s'", s);
+ /* grn_log("sparsed='%s'", s); */
+ user_data->ptr = token;
token->buf = (unsigned char *)buf;
token->next = (unsigned char *)buf;
token->end = (unsigned char *)buf + strlen(buf);
@@ -131,7 +143,7 @@ static grn_obj *
mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
size_t cl;
- // grn_obj *table = args[0];
+ /* grn_obj *table = args[0]; */
grn_mecab_tokenizer *token = user_data->ptr;
const unsigned char *p = token->next, *r;
const unsigned char *e = token->end;
@@ -157,9 +169,9 @@ mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
static grn_obj *
mecab_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
- // grn_obj *table = args[0];
+ /* grn_obj *table = args[0]; */
grn_mecab_tokenizer *token = user_data->ptr;
- // if (token->mecab) { mecab_destroy(token->mecab); }
+ /* if (token->mecab) { mecab_destroy(token->mecab); } */
grn_str_close(ctx, token->nstr);
GRN_FREE(token->buf);
GRN_FREE(token);
@@ -223,6 +235,10 @@ GRN_PLUGIN_INIT(grn_ctx *ctx)
check_mecab_dictionary_encoding(ctx);
+ /*
+ This function returns GRN_SUCCESS even if an encoding error is detected.
+ */
+
return GRN_SUCCESS;
}
@@ -243,6 +259,11 @@ GRN_PLUGIN_REGISTER(grn_ctx *ctx)
mecab_init, mecab_next, mecab_fin, 3, vars);
if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_MECAB) { return GRN_FILE_CORRUPT; }
+ /*
+ Is obj never used after this check?
+ grn_proc_create() is called here but grn_proc_destroy() does not appear.
+ */
+
return GRN_SUCCESS;
}