[Groonga-commit] groonga/groonga [master] [normalizer] changed to use grn_normalized_text instead of grn_str.

null+****@clear***** null+****@clear*****
Feb 10, 2012 (Fri) 19:16:20 JST


Kouhei Sutou	2012-02-10 19:16:20 +0900 (Fri, 10 Feb 2012)

  New Revision: 27aac3047ea6f54bd610e0917bbca0953a23a665

  Log:
    [normalizer] changed to use grn_normalized_text instead of grn_str.

  Modified files:
    lib/db.c
    lib/db.h
    lib/ii.c
    lib/token.c
    lib/tokenizer.c
    plugins/tokenizers/mecab.c

  Modified: lib/db.c (+9 -4)
===================================================================
--- lib/db.c    2012-02-10 19:13:32 +0900 (d9eb95c)
+++ lib/db.c    2012-02-10 19:16:20 +0900 (e15e2b8)
@@ -1805,7 +1805,8 @@ exit :
 
 grn_rc
 grn_table_get_info(grn_ctx *ctx, grn_obj *table, grn_obj_flags *flags,
-                   grn_encoding *encoding, grn_obj **tokenizer)
+                   grn_encoding *encoding, grn_obj **tokenizer,
+                   grn_obj **normalizer)
 {
   grn_rc rc = GRN_INVALID_ARGUMENT;
   GRN_API_ENTER;
@@ -1815,24 +1816,28 @@ grn_table_get_info(grn_ctx *ctx, grn_obj *table, grn_obj_flags *flags,
       if (flags) { *flags = ((grn_pat *)table)->obj.header.flags; }
       if (encoding) { *encoding = ((grn_pat *)table)->encoding; }
       if (tokenizer) { *tokenizer = ((grn_pat *)table)->tokenizer; }
+      if (normalizer) { *normalizer = ((grn_pat *)table)->normalizer; }
       rc = GRN_SUCCESS;
       break;
     case GRN_TABLE_DAT_KEY :
       if (flags) { *flags = ((grn_dat *)table)->obj.header.flags; }
       if (encoding) { *encoding = ((grn_dat *)table)->encoding; }
       if (tokenizer) { *tokenizer = ((grn_dat *)table)->tokenizer; }
+      if (normalizer) { *normalizer = ((grn_dat *)table)->normalizer; }
       rc = GRN_SUCCESS;
       break;
     case GRN_TABLE_HASH_KEY :
       if (flags) { *flags = ((grn_hash *)table)->obj.header.flags; }
       if (encoding) { *encoding = ((grn_hash *)table)->encoding; }
       if (tokenizer) { *tokenizer = ((grn_hash *)table)->tokenizer; }
+      if (normalizer) { *normalizer = ((grn_hash *)table)->normalizer; }
       rc = GRN_SUCCESS;
       break;
     case GRN_TABLE_NO_KEY :
       if (flags) { *flags = 0; }
       if (encoding) { *encoding = GRN_ENC_NONE; }
       if (tokenizer) { *tokenizer = grn_uvector_tokenizer; }
+      if (normalizer) { *normalizer = NULL; }
       rc = GRN_SUCCESS;
       break;
     }
@@ -8074,7 +8079,7 @@ grn_column_index(grn_ctx *ctx, grn_obj *obj, grn_operator op,
         if (obj->header.type != GRN_COLUMN_FIX_SIZE) {
           grn_obj *tokenizer, *lexicon = grn_ctx_at(ctx, target->header.domain);
           if (!lexicon) { continue; }
-          grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer);
+          grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer, NULL);
           if (tokenizer) { continue; }
         }
         if (n < buf_size) {
@@ -8112,7 +8117,7 @@ grn_column_index(grn_ctx *ctx, grn_obj *obj, grn_operator op,
           if (!lexicon) { continue; }
           if (lexicon->header.type != GRN_TABLE_PAT_KEY) { continue; }
           /* FIXME: GRN_TABLE_DAT_KEY should be supported */
-          grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer);
+          grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer, NULL);
           if (tokenizer) { continue; }
         }
         if (n < buf_size) {
@@ -8192,7 +8197,7 @@ grn_column_index(grn_ctx *ctx, grn_obj *obj, grn_operator op,
               if (!lexicon) { continue; }
               if (lexicon->header.type != GRN_TABLE_PAT_KEY) { continue; }
               /* FIXME: GRN_TABLE_DAT_KEY should be supported */
-              grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer);
+              grn_table_get_info(ctx, lexicon, NULL, NULL, &tokenizer, NULL);
               if (tokenizer) { continue; }
             }
             if (n < buf_size) {

  Modified: lib/db.h (+2 -1)
===================================================================
--- lib/db.h    2012-02-10 19:13:32 +0900 (4f76d43)
+++ lib/db.h    2012-02-10 19:16:20 +0900 (537b32f)
@@ -92,7 +92,8 @@ grn_id grn_table_get_v(grn_ctx *ctx, grn_obj *table, const void *key, int key_si
 grn_id grn_table_add_v(grn_ctx *ctx, grn_obj *table, const void *key, int key_size,
                        void **value, int *added);
 GRN_API grn_rc grn_table_get_info(grn_ctx *ctx, grn_obj *table, grn_obj_flags *flags,
-                                  grn_encoding *encoding, grn_obj **tokenizer);
+                                  grn_encoding *encoding, grn_obj **tokenizer,
+                                  grn_obj **normalizer);
 const char *_grn_table_key(grn_ctx *ctx, grn_obj *table, grn_id id, uint32_t *key_size);
 
 grn_rc grn_table_search(grn_ctx *ctx, grn_obj *table,
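
The declaration above gains a fifth out-parameter. Callers that do not care
about the normalizer simply pass NULL (as the lib/db.c and lib/ii.c hunks in
this commit do); callers that do can receive the table's normalizer object.
A minimal caller sketch (hedged: ctx and table are assumed to be a valid
grn_ctx and key table; the surrounding setup is illustrative and not part of
this commit):

    grn_obj_flags flags;
    grn_encoding encoding;
    grn_obj *tokenizer;
    grn_obj *normalizer;
    /* New signature: the normalizer is returned through the last argument. */
    grn_rc rc = grn_table_get_info(ctx, table, &flags, &encoding,
                                   &tokenizer, &normalizer);
    if (rc == GRN_SUCCESS && normalizer) {
      /* The table has a normalizer; normalize via grn_normalized_text. */
    }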

  Modified: lib/ii.c (+8 -4)
===================================================================
--- lib/ii.c    2012-02-10 19:13:32 +0900 (ac340da)
+++ lib/ii.c    2012-02-10 19:16:20 +0900 (09192e5)
@@ -3405,7 +3405,9 @@ _grn_ii_create(grn_ctx *ctx, grn_ii *ii, const char *path, grn_obj *lexicon, uin
     free_histogram[i] = 0;
   }
   */
-  if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer)) { return NULL; }
+  if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer, NULL)) {
+    return NULL;
+  }
   if (path && strlen(path) + 6 >= PATH_MAX) { return NULL; }
   seg = grn_io_create(ctx, path, sizeof(struct grn_ii_header),
                       S_SEGMENT, GRN_II_MAX_LSEG, grn_io_auto, GRN_IO_EXPIRE_SEGMENT);
@@ -3524,7 +3526,9 @@ grn_ii_open(grn_ctx *ctx, const char *path, grn_obj *lexicon)
   grn_obj_flags lflags;
   grn_encoding encoding;
   grn_obj *tokenizer;
-  if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer)) { return NULL; }
+  if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer, NULL)) {
+    return NULL;
+  }
   if (strlen(path) + 6 >= PATH_MAX) { return NULL; }
   strcpy(path2, path);
   strcat(path2, ".c");
@@ -6549,7 +6553,7 @@ grn_ii_builder_tokenize(grn_ctx *ctx, grn_ii_builder *builder, grn_id rid, grn_o
       grn_obj *range = grn_ctx_at(ctx, DB_OBJ(builder->lexicon)->range);
       grn_obj *tokenizer;
       grn_obj_flags flags;
-      grn_table_get_info(ctx, builder->lexicon, &flags, NULL, &tokenizer);
+      grn_table_get_info(ctx, builder->lexicon, &flags, NULL, &tokenizer, NULL);
       flags &= ~GRN_OBJ_PERSISTENT;
       builder->tmp_lexicon = grn_table_create(ctx, NULL, 0, NULL, flags, domain, range);
       grn_obj_set_info(ctx, builder->tmp_lexicon, GRN_INFO_DEFAULT_TOKENIZER, tokenizer);
@@ -6879,7 +6883,7 @@ grn_ii_build(grn_ctx *ctx, grn_ii *ii)
   builder.tmp_lexicon = NULL;
   {
     grn_obj_flags flags;
-    grn_table_get_info(ctx, builder.lexicon, &flags, NULL, NULL);
+    grn_table_get_info(ctx, builder.lexicon, &flags, NULL, NULL, NULL);
     if (flags & GRN_OBJ_TABLE_PAT_KEY) {
       grn_pat_cache_enable(ctx, (grn_pat *)builder.lexicon, PAT_CACHE_SIZE);
     }

  Modified: lib/token.c (+84 -28)
===================================================================
--- lib/token.c    2012-02-10 19:13:32 +0900 (3ef64d8)
+++ lib/token.c    2012-02-10 19:16:20 +0900 (887491d)
@@ -79,7 +79,8 @@ uvector_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 }
 
 typedef struct {
-  grn_str *nstr;
+  grn_obj *normalized_text;
+  char *target_text;
   const uint8_t *delimiter;
   uint32_t delimiter_len;
   int32_t pos;
@@ -97,29 +98,47 @@ delimited_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data,
                const uint8_t *delimiter, uint32_t delimiter_len)
 {
   grn_obj *str;
-  int nflags = 0;
   grn_delimited_tokenizer *token;
   grn_obj_flags table_flags;
+  grn_obj *normalizer;
   if (!(str = grn_ctx_pop(ctx))) {
     ERR(GRN_INVALID_ARGUMENT, "missing argument");
     return NULL;
   }
   if (!(token = GRN_MALLOC(sizeof(grn_delimited_tokenizer)))) { return NULL; }
   user_data->ptr = token;
+  token->normalized_text = NULL;
+  token->target_text = NULL;
   token->delimiter = delimiter;
   token->delimiter_len = delimiter_len;
   token->pos = 0;
-  grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL);
-  nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
-  if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
-                                    nflags, token->encoding))) {
-    GRN_FREE(token);
-    ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
-    return NULL;
+  grn_table_get_info(ctx, table, &table_flags,
+                     &token->encoding, NULL, &normalizer);
+  if (normalizer) {
+    unsigned int length_in_bytes;
+    if (!(token->normalized_text = grn_normalized_text_open(ctx,
+                                                            normalizer,
+                                                            GRN_TEXT_VALUE(str),
+                                                            GRN_TEXT_LEN(str),
+                                                            token->encoding,
+                                                            0))) {
+      GRN_FREE(token);
+      ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
+      return NULL;
+    }
+    grn_normalized_text_get_value(ctx, token->normalized_text,
+                                  (const char **)(&(token->next)),
+                                  &(token->len),
+                                  &length_in_bytes);
+    token->end = token->next + length_in_bytes;
+  } else {
+    token->len = GRN_TEXT_LEN(str);
+    token->target_text = GRN_MALLOC(token->len + 1);
+    memcpy(token->target_text, GRN_TEXT_VALUE(str), token->len);
+    token->target_text[token->len] = '\0';
+    token->next = (unsigned char *)token->target_text;
+    token->end = token->next + token->len;
   }
-  token->next = (unsigned char *)token->nstr->norm;
-  token->end = token->next + token->nstr->norm_blen;
-  token->len = token->nstr->length;
   GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
   GRN_UINT32_INIT(&token->stat_, 0);
   return NULL;
@@ -154,7 +173,12 @@ static grn_obj *
 delimited_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 {
   grn_delimited_tokenizer *token = user_data->ptr;
-  grn_str_close(ctx, token->nstr);
+  if (token->normalized_text) {
+    grn_obj_unlink(ctx, token->normalized_text);
+  }
+  if (token->target_text) {
+    GRN_FREE(token->target_text);
+  }
   GRN_FREE(token);
   return NULL;
 }
@@ -178,6 +202,7 @@ delimit_null_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_d
 /* ngram tokenizer */
 
 typedef struct {
+  grn_obj *normalized_text;
   grn_str *nstr;
   uint8_t uni_alpha;
   uint8_t uni_digit;
@@ -190,7 +215,7 @@ typedef struct {
   grn_encoding encoding;
   const unsigned char *next;
   const unsigned char *end;
-  uint_least8_t *ctypes;
+  const uint_least8_t *ctypes;
   int32_t len;
   uint32_t tail;
   grn_obj curr_;
@@ -202,6 +227,7 @@ ngram_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data, uint8_t ngram
            uint8_t uni_alpha, uint8_t uni_digit, uint8_t uni_symbol, uint8_t ignore_blank)
 {
   grn_obj *str;
+  grn_obj *normalizer;
   int nflags = GRN_NORMALIZE_REMOVE_BLANK|GRN_NORMALIZE_WITH_TYPES;
   grn_ngram_tokenizer *token;
   grn_obj_flags table_flags;
@@ -211,6 +237,8 @@ ngram_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data, uint8_t ngram
   }
   if (!(token = GRN_MALLOC(sizeof(grn_ngram_tokenizer)))) { return NULL; }
   user_data->ptr = token;
+  token->normalized_text = NULL;
+  token->nstr = NULL;
   token->uni_alpha = uni_alpha;
   token->uni_digit = uni_digit;
   token->uni_symbol = uni_symbol;
@@ -219,18 +247,39 @@ ngram_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data, uint8_t ngram
   token->overlap = 0;
   token->pos = 0;
   token->skip = 0;
-  grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL);
-  nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
-  if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
-                                    nflags, token->encoding))) {
-    GRN_FREE(token);
-    ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
-    return NULL;
+  grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL,
+                     &normalizer);
+  if (normalizer) {
+    unsigned int length_in_bytes;
+    if (!(token->normalized_text = grn_normalized_text_open(ctx,
+                                                            normalizer,
+                                                            GRN_TEXT_VALUE(str),
+                                                            GRN_TEXT_LEN(str),
+                                                            token->encoding,
+                                                            nflags))) {
+      GRN_FREE(token);
+      ERR(GRN_TOKENIZER_ERROR,
+          "[tokenizer][ngram][init] failed to open normalized text");
+      return NULL;
+    }
+    grn_normalized_text_get_value(ctx, token->normalized_text,
+                                  (const char **)(&(token->next)),
+                                  &(token->len),
+                                  &length_in_bytes);
+    token->end = token->next + length_in_bytes;
+    token->ctypes = grn_normalized_text_get_types(ctx, token->normalized_text);
+  } else {
+    if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
+                                      nflags, token->encoding))) {
+      GRN_FREE(token);
+      ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
+      return NULL;
+    }
+    token->next = (unsigned char *)token->nstr->norm;
+    token->end = token->next + token->nstr->norm_blen;
+    token->ctypes = token->nstr->ctypes;
+    token->len = token->nstr->length;
   }
-  token->next = (unsigned char *)token->nstr->norm;
-  token->end = token->next + token->nstr->norm_blen;
-  token->ctypes = token->nstr->ctypes;
-  token->len = token->nstr->length;
   GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
   GRN_UINT32_INIT(&token->stat_, 0);
   return NULL;
@@ -283,7 +332,7 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   grn_ngram_tokenizer *token = user_data->ptr;
   const unsigned char *p = token->next, *r = p, *e = token->end;
   int32_t len = 0, pos = token->pos + token->skip, status = 0;
-  uint_least8_t *cp = token->ctypes ? token->ctypes + pos : NULL;
+  const uint_least8_t *cp = token->ctypes ? token->ctypes + pos : NULL;
   if (cp && token->uni_alpha && GRN_CHAR_TYPE(*cp) == grn_char_alpha) {
     while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
       len++;
@@ -371,7 +420,12 @@ static grn_obj *
 ngram_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 {
   grn_ngram_tokenizer *token = user_data->ptr;
-  grn_str_close(ctx, token->nstr);
+  if (token->normalized_text) {
+    grn_obj_unlink(ctx, token->normalized_text);
+  }
+  if (token->nstr) {
+    grn_str_close(ctx, token->nstr);
+  }
   GRN_FREE(token);
   return NULL;
 }
@@ -406,7 +460,9 @@ grn_token_open(grn_ctx *ctx, grn_obj *table, const char *str, size_t str_len,
   grn_token *token;
   grn_encoding encoding;
   grn_obj *tokenizer;
-  if (grn_table_get_info(ctx, table, NULL, &encoding, &tokenizer)) { return NULL; }
+  if (grn_table_get_info(ctx, table, NULL, &encoding, &tokenizer, NULL)) {
+    return NULL;
+  }
   if (!(token = GRN_MALLOC(sizeof(grn_token)))) { return NULL; }
   token->table = table;
   token->mode = mode;
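
The delimited and ngram initializers above now share the same shape: fetch the
lexicon's normalizer via grn_table_get_info(), normalize the input through
grn_normalized_text when a normalizer is set, and fall back to the old
grn_str_open_() path when it is not. A condensed sketch of that pattern
(hedged: distilled from the hunks above for illustration; ctx, table and str
are assumed given, and error handling is abbreviated):

    grn_encoding encoding;
    grn_obj *normalizer;
    grn_table_get_info(ctx, table, NULL, &encoding, NULL, &normalizer);
    if (normalizer) {
      const char *normalized;
      int32_t n_characters;
      unsigned int length_in_bytes;
      grn_obj *normalized_text =
        grn_normalized_text_open(ctx, normalizer,
                                 GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
                                 encoding, 0);
      if (!normalized_text) {
        /* report GRN_TOKENIZER_ERROR and return */
      }
      grn_normalized_text_get_value(ctx, normalized_text, &normalized,
                                    &n_characters, &length_in_bytes);
      /* tokenize the bytes in [normalized, normalized + length_in_bytes) */
      grn_obj_unlink(ctx, normalized_text);
    } else {
      /* keep the pre-commit behaviour: grn_str_open_() / grn_str_close() */
    }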

  Modified: lib/tokenizer.c (+1 -1)
===================================================================
--- lib/tokenizer.c    2012-02-10 19:13:32 +0900 (ee87dbd)
+++ lib/tokenizer.c    2012-02-10 19:16:20 +0900 (7d4415f)
@@ -186,7 +186,7 @@ grn_tokenizer_query *grn_tokenizer_query_create(grn_ctx *ctx,
       grn_obj * const table = args[0];
       grn_encoding table_encoding;
       int flags = 0;
-      grn_table_get_info(ctx, table, NULL, &table_encoding, NULL);
+      grn_table_get_info(ctx, table, NULL, &table_encoding, NULL, NULL);
       {
         grn_str * const str = grn_str_open_(ctx, GRN_TEXT_VALUE(query_str),
                                             GRN_TEXT_LEN(query_str),

  Modified: plugins/tokenizers/mecab.c (+30 -13)
===================================================================
--- plugins/tokenizers/mecab.c    2012-02-10 19:13:32 +0900 (f944656)
+++ plugins/tokenizers/mecab.c    2012-02-10 19:16:20 +0900 (0c4b48c)
@@ -34,7 +34,6 @@ static grn_critical_section sole_mecab_lock;
 static grn_encoding sole_mecab_encoding = GRN_ENC_NONE;
 
 typedef struct {
-  grn_str *nstr;
   mecab_t *mecab;
   char *buf;
   char *next;
@@ -84,9 +83,12 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 {
   grn_obj *str;
   int nflags = 0;
+  const char *target_text;
   char *buf, *s, *p;
   char mecab_err[256];
   grn_obj *table = args[0];
+  grn_obj *normalizer;
+  grn_obj *normalized_text;
   grn_obj_flags table_flags;
   grn_encoding table_encoding;
   grn_mecab_tokenizer *token;
@@ -113,7 +115,8 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
         "mecab_new2 failed on grn_mecab_init: %s", mecab_err);
     return NULL;
   }
-  grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL);
+  grn_table_get_info(ctx, table, &table_flags, &table_encoding,
+                     NULL, &normalizer);
   if (table_encoding != sole_mecab_encoding) {
     ERR(GRN_TOKENIZER_ERROR,
         "MeCab dictionary charset (%s) does not match the context encoding: <%s>",
@@ -123,23 +126,36 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   if (!(token = GRN_MALLOC(sizeof(grn_mecab_tokenizer)))) { return NULL; }
   token->mecab = sole_mecab;
   token->encoding = table_encoding;
-  nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
-  if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
-                                    nflags, token->encoding))) {
-    GRN_FREE(token);
-    ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
-    return NULL;
+  normalized_text = NULL;
+  if (normalizer) {
+    if (!(normalized_text = grn_normalized_text_open(ctx,
+                                                     normalizer,
+                                                     GRN_TEXT_VALUE(str),
+                                                     GRN_TEXT_LEN(str),
+                                                     token->encoding,
+                                                     nflags))) {
+      GRN_FREE(token);
+      ERR(GRN_TOKENIZER_ERROR,
+          "[tokenizer][mecab] failed to open normalized text");
+      return NULL;
+    }
+    grn_normalized_text_get_value(ctx, normalized_text,
+                                  &target_text, NULL, &len);
+  } else {
+    target_text = GRN_TEXT_VALUE(str);
+    len = GRN_TEXT_LEN(str);
   }
-  len = token->nstr->norm_blen;
   for (bufsize = len * 2 + 1; maxtrial; bufsize *= 2, maxtrial--) {
     if (!(buf = GRN_MALLOC(bufsize + 1))) {
       GRN_LOG(ctx, GRN_LOG_ALERT, "buffer allocation on mecab_init failed !");
-      grn_str_close(ctx, token->nstr);
+      if (normalized_text) {
+        grn_obj_unlink(ctx, normalized_text);
+      }
       GRN_FREE(token);
       return NULL;
     }
     CRITICAL_SECTION_ENTER(sole_mecab_lock);
-    s = mecab_sparse_tostr3(token->mecab, token->nstr->norm, len, buf, bufsize);
+    s = mecab_sparse_tostr3(token->mecab, target_text, len, buf, bufsize);
     if (!s) {
       strncpy(mecab_err, mecab_strerror(token->mecab), sizeof(mecab_err) - 1);
     }
@@ -148,10 +164,12 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     GRN_FREE(buf);
     if (strstr(mecab_err, "output buffer overflow") == NULL) { break; }
   }
+  if (normalized_text) {
+    grn_obj_unlink(ctx, normalized_text);
+  }
   if (!s) {
     ERR(GRN_TOKENIZER_ERROR, "mecab_sparse_tostr failed len=%d bufsize=%d err=%s",
         len, bufsize, mecab_err);
-    grn_str_close(ctx, token->nstr);
     GRN_FREE(token);
     return NULL;
   }
@@ -206,7 +224,6 @@ static grn_obj *
 mecab_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 {
   grn_mecab_tokenizer *token = user_data->ptr;
-  grn_str_close(ctx, token->nstr);
   GRN_FREE(token->buf);
   GRN_FREE(token);
   return NULL;



