[Groonga-commit] groonga/groonga [master] TokenMecab: support tokenized delimiter

Back to archive index

Kouhei Sutou null+****@clear*****
Fri Nov 9 19:08:16 JST 2012


Kouhei Sutou	2012-11-09 19:08:16 +0900 (Fri, 09 Nov 2012)

  New Revision: 1233e5d2574c4f4b1609a09d46ef4600b3b7a810
  https://github.com/groonga/groonga/commit/1233e5d2574c4f4b1609a09d46ef4600b3b7a810

  Log:
    TokenMecab: support tokenized delimiter

  Modified files:
    plugins/tokenizers/mecab.c

  Modified: plugins/tokenizers/mecab.c (+36 -5)
===================================================================
--- plugins/tokenizers/mecab.c    2012-11-09 18:54:11 +0900 (35aa499)
+++ plugins/tokenizers/mecab.c    2012-11-09 19:08:16 +0900 (6e6870b)
@@ -41,6 +41,7 @@ typedef struct {
   char *end;
   grn_encoding encoding;
   grn_tokenizer_token token;
+  grn_bool have_tokenized_delimiter;
 } grn_mecab_tokenizer;
 
 static grn_encoding
@@ -128,7 +129,17 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
     return NULL;
   }
+
   len = tokenizer->nstr->norm_blen;
+  tokenizer->have_tokenized_delimiter =
+    grn_tokenizer_have_tokenized_delimiter(ctx,
+                                           tokenizer->nstr->norm, len,
+                                           tokenizer->encoding);
+  if (tokenizer->have_tokenized_delimiter) {
+    tokenizer->buf = NULL;
+    tokenizer->next = tokenizer->nstr->norm;
+    tokenizer->end = tokenizer->next + len;
+  } else {
   CRITICAL_SECTION_ENTER(sole_mecab_lock);
   s = mecab_sparse_tostr2(tokenizer->mecab, tokenizer->nstr->norm, len);
   if (!s) {
@@ -152,10 +163,11 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   for (p = buf + bufsize - 2;
        buf <= p && isspace(*(unsigned char *)p);
        p--) { *p = '\0'; }
-  user_data->ptr = tokenizer;
   tokenizer->buf = buf;
   tokenizer->next = buf;
   tokenizer->end = p + 1;
+  }
+  user_data->ptr = tokenizer;
 
   grn_tokenizer_token_init(ctx, &(tokenizer->token));
 
@@ -173,19 +185,36 @@ mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   grn_mecab_tokenizer *tokenizer = user_data->ptr;
   char *p = tokenizer->next, *r;
   char *e = tokenizer->end;
+  grn_encoding encoding = tokenizer->encoding;
   grn_tokenizer_status status;
+
+  if (tokenizer->have_tokenized_delimiter) {
+    for (r = p; r < e; r += cl) {
+      cl = grn_charlen_(ctx, r, e, encoding);
+      if (cl > 0) {
+        if (grn_tokenizer_is_tokenized_delimiter(ctx, r, cl, encoding)) {
+          tokenizer->next = r + cl;
+          break;
+        }
+      } else {
+        tokenizer->next = e;
+        break;
+      }
+    }
+  } else {
   for (r = p; r < e; r += cl) {
-    if (!(cl = grn_charlen_(ctx, r, e, tokenizer->encoding))) {
+    if (!(cl = grn_charlen_(ctx, r, e, encoding))) {
       tokenizer->next = e;
       break;
     }
-    if (grn_isspace(r, tokenizer->encoding)) {
+    if (grn_isspace(r, encoding)) {
       char *q = r;
-      while ((cl = grn_isspace(q, tokenizer->encoding))) { q += cl; }
+      while ((cl = grn_isspace(q, encoding))) { q += cl; }
       tokenizer->next = q;
       break;
     }
   }
+  }
 
   if (r == e) {
     status = GRN_TOKENIZER_LAST;
@@ -205,7 +234,9 @@ mecab_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   grn_mecab_tokenizer *tokenizer = user_data->ptr;
   grn_tokenizer_token_fin(ctx, &(tokenizer->token));
   grn_str_close(ctx, tokenizer->nstr);
-  GRN_FREE(tokenizer->buf);
+  if (tokenizer->buf) {
+    GRN_FREE(tokenizer->buf);
+  }
   GRN_FREE(tokenizer);
   return NULL;
 }
-------------- next part --------------
HTML����������������������������...
Download 



More information about the Groonga-commit mailing list
Back to archive index