[Groonga-commit] groonga/groonga at 7f35ebc [master] Extract built-in tokenizers to tokenizers.c


Kouhei Sutou null+****@clear*****
Sun Nov 9 22:21:14 JST 2014


Kouhei Sutou	2014-11-09 22:21:14 +0900 (Sun, 09 Nov 2014)

  New Revision: 7f35ebc1a9b0a5cc02b258fb438ed7b85b37633f
  https://github.com/groonga/groonga/commit/7f35ebc1a9b0a5cc02b258fb438ed7b85b37633f

  Message:
    Extract built-in tokenizers to tokenizers.c

  Added files:
    lib/grn_tokenizers.h
  Copied files:
    lib/tokenizers.c
      (from lib/token.c)
  Modified files:
    lib/ctx.c
    lib/db.c
    lib/grn_token.h
    lib/sources.am
    lib/token.c
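
  The extraction also renames the module's entry points: the new pair
  grn_tokenizers_init()/grn_tokenizers_fin(), declared in lib/grn_tokenizers.h,
  replaces grn_token_init()/grn_token_fin(). A minimal sketch of a caller
  after this commit, modeled on the lib/ctx.c hunk below (the wrapper
  function itself is hypothetical, for illustration only):

      #include "grn_tokenizers.h"

      static grn_rc
      sketch_tokenizers_lifecycle(grn_ctx *ctx)
      {
        grn_rc rc;
        /* register the built-in tokenizers (formerly grn_token_init()) */
        if ((rc = grn_tokenizers_init())) {
          GRN_LOG(ctx, GRN_LOG_ALERT, "grn_tokenizers_init failed (%d)", rc);
          return rc;
        }
        /* ... tokenization work ... */
        /* tear down (formerly grn_token_fin()) */
        return grn_tokenizers_fin();
      }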

  Modified: lib/ctx.c (+4 -4)
===================================================================
--- lib/ctx.c    2014-11-09 14:30:28 +0900 (dd360e2)
+++ lib/ctx.c    2014-11-09 22:21:14 +0900 (f57a0b2)
@@ -18,7 +18,7 @@
 
 #include "grn.h"
 #include <string.h>
-#include "grn_token.h"
+#include "grn_tokenizers.h"
 #include "grn_ctx_impl.h"
 #include "grn_pat.h"
 #include "grn_plugin.h"
@@ -1282,8 +1282,8 @@ grn_init(void)
     GRN_LOG(ctx, GRN_LOG_ALERT, "grn_normalizer_init failed (%d)", rc);
     return rc;
   }
-  if ((rc = grn_token_init())) {
-    GRN_LOG(ctx, GRN_LOG_ALERT, "grn_token_init failed (%d)", rc);
+  if ((rc = grn_tokenizers_init())) {
+    GRN_LOG(ctx, GRN_LOG_ALERT, "grn_tokenizers_init failed (%d)", rc);
     return rc;
   }
   /*
@@ -1380,7 +1380,7 @@ grn_fin(void)
   }
   query_logger_fin(ctx);
   grn_cache_fin();
-  grn_token_fin();
+  grn_tokenizers_fin();
   grn_normalizer_fin();
   grn_plugins_fin();
   grn_io_fin();

  Modified: lib/db.c (+1 -0)
===================================================================
--- lib/db.c    2014-11-09 14:30:28 +0900 (a902ffa)
+++ lib/db.c    2014-11-09 22:21:14 +0900 (6c35ebd)
@@ -22,6 +22,7 @@
 #include "grn_ii.h"
 #include "grn_ctx_impl.h"
 #include "grn_token.h"
+#include "grn_tokenizers.h"
 #include "grn_proc.h"
 #include "grn_plugin.h"
 #include "grn_geo.h"

  Modified: lib/grn_token.h (+0 -10)
===================================================================
--- lib/grn_token.h    2014-11-09 14:30:28 +0900 (bb6dc39)
+++ lib/grn_token.h    2014-11-09 22:21:14 +0900 (95dec9b)
@@ -17,10 +17,8 @@
 #ifndef GRN_TOKEN_H
 #define GRN_TOKEN_H
 
-#include "grn.h"
 #include "grn_ctx.h"
 #include "grn_db.h"
-#include "grn_str.h"
 
 #include <groonga/tokenizer.h>
 
@@ -59,11 +57,6 @@ typedef struct {
   grn_obj *nstr;
 } grn_token_cursor;
 
-extern grn_obj *grn_token_uvector;
-
-grn_rc grn_token_init(void);
-grn_rc grn_token_fin(void);
-
 #define GRN_TOKEN_ENABLE_TOKENIZED_DELIMITER (0x01L<<0)
 
 GRN_API grn_token_cursor *grn_token_cursor_open(grn_ctx *ctx, grn_obj *table,
@@ -74,9 +67,6 @@ GRN_API grn_token_cursor *grn_token_cursor_open(grn_ctx *ctx, grn_obj *table,
 GRN_API grn_id grn_token_cursor_next(grn_ctx *ctx, grn_token_cursor *token_cursor);
 GRN_API grn_rc grn_token_cursor_close(grn_ctx *ctx, grn_token_cursor *token_cursor);
 
-grn_rc grn_db_init_mecab_tokenizer(grn_ctx *ctx);
-grn_rc grn_db_init_builtin_tokenizers(grn_ctx *ctx);
-
 #ifdef __cplusplus
 }
 #endif

  Added: lib/grn_tokenizers.h (+38 -0) 100644
===================================================================
--- /dev/null
+++ lib/grn_tokenizers.h    2014-11-09 22:21:14 +0900 (bc6cd6a)
@@ -0,0 +1,38 @@
+/* -*- c-basic-offset: 2 -*- */
+/* Copyright(C) 2009-2014 Brazil
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License version 2.1 as published by the Free Software Foundation.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+#ifndef GRN_TOKENIZERS_H
+#define GRN_TOKENIZERS_H
+
+#include "grn_ctx.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern grn_obj *grn_token_uvector;
+
+grn_rc grn_tokenizers_init(void);
+grn_rc grn_tokenizers_fin(void);
+
+grn_rc grn_db_init_mecab_tokenizer(grn_ctx *ctx);
+grn_rc grn_db_init_builtin_tokenizers(grn_ctx *ctx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* GRN_TOKENIZERS_H */

  Modified: lib/sources.am (+2 -0)
===================================================================
--- lib/sources.am    2014-11-09 14:30:28 +0900 (c51b1e1)
+++ lib/sources.am    2014-11-09 22:21:14 +0900 (76c018c)
@@ -48,6 +48,8 @@ libgroonga_la_SOURCES =				\
 	token.c					\
 	grn_token.h				\
 	tokenizer.c				\
+	tokenizers.c				\
+	grn_tokenizers.h			\
 	token_filter.c				\
 	util.c					\
 	grn_util.h

  Modified: lib/token.c (+1 -549)
===================================================================
--- lib/token.c    2014-11-09 14:30:28 +0900 (f9bb289)
+++ lib/token.c    2014-11-09 22:21:14 +0900 (f3a46ba)
@@ -15,485 +15,10 @@
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */
-#include "grn.h"
-#include <string.h>
-#include <ctype.h>
-#include "grn_ctx_impl.h"
 #include "grn_token.h"
+#include "grn_string.h"
 #include "grn_pat.h"
 #include "grn_dat.h"
-#include "grn_hash.h"
-#include "grn_string.h"
-#include "grn_plugin.h"
-#include <groonga/tokenizer.h>
-
-grn_obj *grn_token_uvector = NULL;
-
-typedef struct {
-  grn_tokenizer_token token;
-  byte *curr;
-  byte *tail;
-  uint32_t unit;
-} grn_uvector_tokenizer;
-
-static grn_obj *
-uvector_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{
-  grn_obj *str, *flags, *mode;
-  grn_uvector_tokenizer *tokenizer;
-  if (!(flags = grn_ctx_pop(ctx))) {
-    ERR(GRN_INVALID_ARGUMENT, "[tokenizer][uvector] missing argument: flags");
-    return NULL;
-  }
-  if (!(str = grn_ctx_pop(ctx))) {
-    ERR(GRN_INVALID_ARGUMENT, "[tokenizer][uvector] missing argument: string");
-    return NULL;
-  }
-  if (!(mode = grn_ctx_pop(ctx))) {
-    ERR(GRN_INVALID_ARGUMENT, "[tokenizer][uvector] missing argument: mode");
-    return NULL;
-  }
-  if (!(tokenizer = GRN_MALLOC(sizeof(grn_uvector_tokenizer)))) {
-    ERR(GRN_NO_MEMORY_AVAILABLE,
-        "[tokenizer][uvector] "
-        "memory allocation to grn_uvector_tokenizer failed");
-    return NULL;
-  }
-  user_data->ptr = tokenizer;
-
-  grn_tokenizer_token_init(ctx, &(tokenizer->token));
-  tokenizer->curr = (byte *)GRN_TEXT_VALUE(str);
-  tokenizer->tail = tokenizer->curr + GRN_TEXT_LEN(str);
-  tokenizer->unit = sizeof(grn_id);
-  return NULL;
-}
-
-static grn_obj *
-uvector_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{
-  grn_uvector_tokenizer *tokenizer = user_data->ptr;
-  byte *p = tokenizer->curr + tokenizer->unit;
-  if (tokenizer->tail < p) {
-    grn_tokenizer_token_push(ctx, &(tokenizer->token),
-                             (const char *)tokenizer->curr, 0,
-                             GRN_TOKENIZER_TOKEN_LAST);
-  } else {
-    grn_tokenizer_status status;
-    if (tokenizer->tail == p) {
-      status = GRN_TOKENIZER_TOKEN_LAST;
-    } else {
-      status = GRN_TOKENIZER_TOKEN_CONTINUE;
-    }
-    grn_tokenizer_token_push(ctx, &(tokenizer->token),
-                             (const char *)tokenizer->curr, tokenizer->unit,
-                             status);
-    tokenizer->curr = p;
-  }
-  return NULL;
-}
-
-static grn_obj *
-uvector_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{
-  grn_uvector_tokenizer *tokenizer = user_data->ptr;
-  if (!tokenizer) {
-    return NULL;
-  }
-  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
-  GRN_FREE(tokenizer);
-  return NULL;
-}
-
-typedef struct {
-  const uint8_t *delimiter;
-  uint32_t delimiter_len;
-  const unsigned char *next;
-  const unsigned char *end;
-  grn_tokenizer_token token;
-  grn_tokenizer_query *query;
-  grn_bool have_tokenized_delimiter;
-} grn_delimited_tokenizer;
-
-static grn_obj *
-delimited_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data,
-               const uint8_t *delimiter, uint32_t delimiter_len)
-{
-  grn_tokenizer_query *query;
-  unsigned int normalize_flags = 0;
-  const char *normalized;
-  unsigned int normalized_length_in_bytes;
-  grn_delimited_tokenizer *tokenizer;
-
-  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
-  if (!query) {
-    return NULL;
-  }
-
-  if (!(tokenizer = GRN_MALLOC(sizeof(grn_delimited_tokenizer)))) {
-    ERR(GRN_NO_MEMORY_AVAILABLE,
-        "[tokenizer][delimit] "
-        "memory allocation to grn_delimited_tokenizer failed");
-    grn_tokenizer_query_close(ctx, query);
-    return NULL;
-  }
-  user_data->ptr = tokenizer;
-
-  tokenizer->query = query;
-
-  tokenizer->have_tokenized_delimiter =
-    grn_tokenizer_have_tokenized_delimiter(ctx,
-                                           tokenizer->query->ptr,
-                                           tokenizer->query->length,
-                                           tokenizer->query->encoding);
-  tokenizer->delimiter = delimiter;
-  tokenizer->delimiter_len = delimiter_len;
-  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
-                            &normalized, &normalized_length_in_bytes,
-                            NULL);
-  tokenizer->next = (const unsigned char *)normalized;
-  tokenizer->end = tokenizer->next + normalized_length_in_bytes;
-
-  grn_tokenizer_token_init(ctx, &(tokenizer->token));
-
-  return NULL;
-}
-
-static grn_obj *
-delimited_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{
-  grn_delimited_tokenizer *tokenizer = user_data->ptr;
-
-  if (tokenizer->have_tokenized_delimiter) {
-    unsigned int rest_length;
-    rest_length = tokenizer->end - tokenizer->next;
-    tokenizer->next =
-      (unsigned char *)grn_tokenizer_tokenized_delimiter_next(
-        ctx,
-        &(tokenizer->token),
-        (const char *)tokenizer->next,
-        rest_length,
-        tokenizer->query->encoding);
-  } else {
-    size_t cl;
-    const unsigned char *p = tokenizer->next, *r;
-    const unsigned char *e = tokenizer->end;
-    grn_tokenizer_status status;
-    for (r = p; r < e; r += cl) {
-      if (!(cl = grn_charlen_(ctx, (char *)r, (char *)e,
-                              tokenizer->query->encoding))) {
-        tokenizer->next = (unsigned char *)e;
-        break;
-      }
-      {
-        grn_bool found_delimiter = GRN_FALSE;
-        const unsigned char *current_end = r;
-        while (current_end + tokenizer->delimiter_len <= e &&
-               !memcmp(current_end,
-                       tokenizer->delimiter, tokenizer->delimiter_len)) {
-          current_end += tokenizer->delimiter_len;
-          tokenizer->next = current_end;
-          found_delimiter = GRN_TRUE;
-        }
-        if (found_delimiter) {
-          break;
-        }
-      }
-    }
-    if (r == e) {
-      status = GRN_TOKENIZER_LAST;
-    } else {
-      status = GRN_TOKENIZER_CONTINUE;
-    }
-    grn_tokenizer_token_push(ctx,
-                             &(tokenizer->token),
-                             (const char *)p,
-                             r - p,
-                             status);
-  }
-
-  return NULL;
-}
-
-static grn_obj *
-delimited_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{
-  grn_delimited_tokenizer *tokenizer = user_data->ptr;
-  if (!tokenizer) {
-    return NULL;
-  }
-  grn_tokenizer_query_close(ctx, tokenizer->query);
-  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
-  GRN_FREE(tokenizer);
-  return NULL;
-}
-
-static grn_obj *
-delimit_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{
-  static const uint8_t delimiter[1] = {' '};
-  return delimited_init(ctx, nargs, args, user_data, delimiter, 1);
-}
-
-static grn_obj *
-delimit_null_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{
-  static const uint8_t delimiter[1] = {'\0'};
-  return delimited_init(ctx, nargs, args, user_data, delimiter, 1);
-}
-
-/* ngram tokenizer */
-
-typedef struct {
-  grn_tokenizer_token token;
-  grn_tokenizer_query *query;
-  uint8_t uni_alpha;
-  uint8_t uni_digit;
-  uint8_t uni_symbol;
-  uint8_t ngram_unit;
-  uint8_t ignore_blank;
-  uint8_t overlap;
-  int32_t pos;
-  uint32_t skip;
-  const unsigned char *next;
-  const unsigned char *end;
-  const uint_least8_t *ctypes;
-  uint32_t len;
-  uint32_t tail;
-} grn_ngram_tokenizer;
-
-static grn_obj *
-ngram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data, uint8_t ngram_unit,
-           uint8_t uni_alpha, uint8_t uni_digit, uint8_t uni_symbol, uint8_t ignore_blank)
-{
-  unsigned int normalize_flags =
-    GRN_STRING_REMOVE_BLANK |
-    GRN_STRING_WITH_TYPES |
-    GRN_STRING_REMOVE_TOKENIZED_DELIMITER;
-  grn_tokenizer_query *query;
-  const char *normalized;
-  unsigned int normalized_length_in_bytes;
-  grn_ngram_tokenizer *tokenizer;
-
-  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
-  if (!query) {
-    return NULL;
-  }
-
-  if (!(tokenizer = GRN_MALLOC(sizeof(grn_ngram_tokenizer)))) {
-    grn_tokenizer_query_close(ctx, query);
-    ERR(GRN_NO_MEMORY_AVAILABLE,
-        "[tokenizer][ngram] "
-        "memory allocation to grn_ngram_tokenizer failed");
-    return NULL;
-  }
-  user_data->ptr = tokenizer;
-
-  grn_tokenizer_token_init(ctx, &(tokenizer->token));
-  tokenizer->query = query;
-
-  tokenizer->uni_alpha = uni_alpha;
-  tokenizer->uni_digit = uni_digit;
-  tokenizer->uni_symbol = uni_symbol;
-  tokenizer->ngram_unit = ngram_unit;
-  tokenizer->ignore_blank = ignore_blank;
-  tokenizer->overlap = 0;
-  tokenizer->pos = 0;
-  tokenizer->skip = 0;
-
-  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
-                            &normalized, &normalized_length_in_bytes,
-                            &(tokenizer->len));
-  tokenizer->next = (const unsigned char *)normalized;
-  tokenizer->end = tokenizer->next + normalized_length_in_bytes;
-  tokenizer->ctypes =
-    grn_string_get_types(ctx, tokenizer->query->normalized_query);
-  return NULL;
-}
-
-static grn_obj *
-unigram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{ return ngram_init(ctx, nargs, args, user_data, 1, 1, 1, 1, 0); }
-
-static grn_obj *
-bigram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{ return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 1, 0); }
-
-static grn_obj *
-trigram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{ return ngram_init(ctx, nargs, args, user_data, 3, 1, 1, 1, 0); }
-
-static grn_obj *
-bigrams_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{ return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 0, 0); }
-
-static grn_obj *
-bigramsa_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{ return ngram_init(ctx, nargs, args, user_data, 2, 0, 1, 0, 0); }
-
-static grn_obj *
-bigramsad_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{ return ngram_init(ctx, nargs, args, user_data, 2, 0, 0, 0, 0); }
-
-static grn_obj *
-bigrami_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{ return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 1, 1); }
-
-static grn_obj *
-bigramis_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{ return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 0, 1); }
-
-static grn_obj *
-bigramisa_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{ return ngram_init(ctx, nargs, args, user_data, 2, 0, 1, 0, 1); }
-
-static grn_obj *
-bigramisad_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{ return ngram_init(ctx, nargs, args, user_data, 2, 0, 0, 0, 1); }
-
-static grn_obj *
-ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{
-  size_t cl;
-  grn_ngram_tokenizer *tokenizer = user_data->ptr;
-  const unsigned char *p = tokenizer->next, *r = p, *e = tokenizer->end;
-  int32_t len = 0, pos = tokenizer->pos + tokenizer->skip, status = 0;
-  const uint_least8_t *cp = tokenizer->ctypes ? tokenizer->ctypes + pos : NULL;
-  if (cp && tokenizer->uni_alpha && GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) {
-    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
-                              tokenizer->query->encoding))) {
-      len++;
-      r += cl;
-      if (/* !tokenizer->ignore_blank && */ GRN_STR_ISBLANK(*cp)) { break; }
-      if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_ALPHA) { break; }
-    }
-    tokenizer->next = r;
-    tokenizer->overlap = 0;
-  } else if (cp &&
-             tokenizer->uni_digit &&
-             GRN_STR_CTYPE(*cp) == GRN_CHAR_DIGIT) {
-    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
-                              tokenizer->query->encoding))) {
-      len++;
-      r += cl;
-      if (/* !tokenizer->ignore_blank && */ GRN_STR_ISBLANK(*cp)) { break; }
-      if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_DIGIT) { break; }
-    }
-    tokenizer->next = r;
-    tokenizer->overlap = 0;
-  } else if (cp &&
-             tokenizer->uni_symbol &&
-             GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL) {
-    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
-                              tokenizer->query->encoding))) {
-      len++;
-      r += cl;
-      if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*cp)) { break; }
-      if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_SYMBOL) { break; }
-    }
-    tokenizer->next = r;
-    tokenizer->overlap = 0;
-  } else {
-#ifdef PRE_DEFINED_UNSPLIT_WORDS
-    const unsigned char *key = NULL;
-    // todo : grn_pat_lcp_search
-    if ((tid = grn_sym_common_prefix_search(sym, p))) {
-      if (!(key = _grn_sym_key(sym, tid))) {
-        tokenizer->status = GRN_TOKEN_NOT_FOUND;
-        return NULL;
-      }
-      len = grn_str_len(key, tokenizer->query->encoding, NULL);
-    }
-    r = p + grn_charlen_(ctx, p, e, tokenizer->query->encoding);
-    if (tid && (len > 1 || r == p)) {
-      if (r != p && pos + len - 1 <= tokenizer->tail) { continue; }
-      p += strlen(key);
-      if (!*p && tokenizer->mode == GRN_TOKEN_GET) {
-        tokenizer->status = GRN_TOKEN_DONE;
-      }
-    }
-#endif /* PRE_DEFINED_UNSPLIT_WORDS */
-    if ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
-                           tokenizer->query->encoding))) {
-      len++;
-      r += cl;
-      tokenizer->next = r;
-      while (len < tokenizer->ngram_unit &&
-             (cl = grn_charlen_(ctx, (char *)r, (char *)e,
-                                tokenizer->query->encoding))) {
-        if (cp) {
-          if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*cp)) { break; }
-          cp++;
-          if ((tokenizer->uni_alpha && GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) ||
-              (tokenizer->uni_digit && GRN_STR_CTYPE(*cp) == GRN_CHAR_DIGIT) ||
-              (tokenizer->uni_symbol && GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL)) {
-            break;
-          }
-        }
-        len++;
-        r += cl;
-      }
-      if (tokenizer->overlap) {
-        status |= GRN_TOKENIZER_TOKEN_OVERLAP;
-      }
-      if (len < tokenizer->ngram_unit) {
-        status |= GRN_TOKENIZER_TOKEN_UNMATURED;
-      }
-      tokenizer->overlap = (len > 1) ? 1 : 0;
-    }
-  }
-  tokenizer->pos = pos;
-  tokenizer->len = len;
-  tokenizer->tail = pos + len - 1;
-  if (p == r || tokenizer->next == e) {
-    tokenizer->skip = 0;
-    status |= GRN_TOKENIZER_TOKEN_LAST;
-  } else {
-    tokenizer->skip = tokenizer->overlap ? 1 : len;
-  }
-  if (r == e) { status |= GRN_TOKENIZER_TOKEN_REACH_END; }
-  grn_tokenizer_token_push(ctx,
-                           &(tokenizer->token),
-                           (const char *)p,
-                           r - p,
-                           status);
-  return NULL;
-}
-
-static grn_obj *
-ngram_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
-{
-  grn_ngram_tokenizer *tokenizer = user_data->ptr;
-  if (!tokenizer) {
-    return NULL;
-  }
-  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
-  grn_tokenizer_query_close(ctx, tokenizer->query);
-  GRN_FREE(tokenizer);
-  return NULL;
-}
-
-/* external */
-
-grn_rc
-grn_token_init(void)
-{
-  static grn_proc _grn_token_uvector;
-  _grn_token_uvector.obj.db = NULL;
-  _grn_token_uvector.obj.id = GRN_ID_NIL;
-  _grn_token_uvector.obj.header.domain = GRN_ID_NIL;
-  _grn_token_uvector.obj.range = GRN_ID_NIL;
-  _grn_token_uvector.funcs[PROC_INIT] = uvector_init;
-  _grn_token_uvector.funcs[PROC_NEXT] = uvector_next;
-  _grn_token_uvector.funcs[PROC_FIN] = uvector_fin;
-  grn_token_uvector = (grn_obj *)&_grn_token_uvector;
-  return GRN_SUCCESS;
-}
-
-grn_rc
-grn_token_fin(void)
-{
-  return GRN_SUCCESS;
-}
 
 static void
 grn_token_cursor_open_initialize_token_filters(grn_ctx *ctx,
@@ -822,76 +347,3 @@ grn_token_cursor_close(grn_ctx *ctx, grn_token_cursor *token_cursor)
     return GRN_INVALID_ARGUMENT;
   }
 }
-
-grn_rc
-grn_db_init_mecab_tokenizer(grn_ctx *ctx)
-{
-  switch (GRN_CTX_GET_ENCODING(ctx)) {
-  case GRN_ENC_EUC_JP :
-  case GRN_ENC_UTF8 :
-  case GRN_ENC_SJIS :
-    {
-      const char *mecab_plugin_name = "tokenizers/mecab";
-      char *path;
-      path = grn_plugin_find_path(ctx, mecab_plugin_name);
-      if (path) {
-        GRN_FREE(path);
-        return grn_plugin_register(ctx, mecab_plugin_name);
-      } else {
-        return GRN_NO_SUCH_FILE_OR_DIRECTORY;
-      }
-    }
-    break;
-  default :
-    return GRN_OPERATION_NOT_SUPPORTED;
-  }
-}
-
-#define DEF_TOKENIZER(name, init, next, fin, vars)\
-  (grn_proc_create(ctx, (name), (sizeof(name) - 1),\
-                   GRN_PROC_TOKENIZER, (init), (next), (fin), 3, (vars)))
-
-grn_rc
-grn_db_init_builtin_tokenizers(grn_ctx *ctx)
-{
-  grn_obj *obj;
-  grn_expr_var vars[] = {
-    {NULL, 0},
-    {NULL, 0},
-    {NULL, 0}
-  };
-  GRN_TEXT_INIT(&vars[0].value, 0);
-  GRN_TEXT_INIT(&vars[1].value, 0);
-  GRN_UINT32_INIT(&vars[2].value, 0);
-
-  obj = DEF_TOKENIZER("TokenDelimit",
-                      delimit_init, delimited_next, delimited_fin, vars);
-  if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_DELIMIT) { return GRN_FILE_CORRUPT; }
-  obj = DEF_TOKENIZER("TokenUnigram",
-                      unigram_init, ngram_next, ngram_fin, vars);
-  if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_UNIGRAM) { return GRN_FILE_CORRUPT; }
-  obj = DEF_TOKENIZER("TokenBigram",
-                      bigram_init, ngram_next, ngram_fin, vars);
-  if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_BIGRAM) { return GRN_FILE_CORRUPT; }
-  obj = DEF_TOKENIZER("TokenTrigram",
-                      trigram_init, ngram_next, ngram_fin, vars);
-  if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_TRIGRAM) { return GRN_FILE_CORRUPT; }
-
-  DEF_TOKENIZER("TokenBigramSplitSymbol",
-                bigrams_init, ngram_next, ngram_fin, vars);
-  DEF_TOKENIZER("TokenBigramSplitSymbolAlpha",
-                bigramsa_init, ngram_next, ngram_fin, vars);
-  DEF_TOKENIZER("TokenBigramSplitSymbolAlphaDigit",
-                bigramsad_init, ngram_next, ngram_fin, vars);
-  DEF_TOKENIZER("TokenBigramIgnoreBlank",
-                bigrami_init, ngram_next, ngram_fin, vars);
-  DEF_TOKENIZER("TokenBigramIgnoreBlankSplitSymbol",
-                bigramis_init, ngram_next, ngram_fin, vars);
-  DEF_TOKENIZER("TokenBigramIgnoreBlankSplitSymbolAlpha",
-                bigramisa_init, ngram_next, ngram_fin, vars);
-  DEF_TOKENIZER("TokenBigramIgnoreBlankSplitSymbolAlphaDigit",
-                bigramisad_init, ngram_next, ngram_fin, vars);
-  DEF_TOKENIZER("TokenDelimitNull",
-                delimit_null_init, delimited_next, delimited_fin, vars);
-  return GRN_SUCCESS;
-}

  Copied: lib/tokenizers.c (+2 -336) 61%
===================================================================
--- lib/token.c    2014-11-09 14:30:28 +0900 (f9bb289)
+++ lib/tokenizers.c    2014-11-09 22:21:14 +0900 (712f9a8)
@@ -15,14 +15,8 @@
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */
-#include "grn.h"
 #include <string.h>
-#include <ctype.h>
-#include "grn_ctx_impl.h"
 #include "grn_token.h"
-#include "grn_pat.h"
-#include "grn_dat.h"
-#include "grn_hash.h"
 #include "grn_string.h"
 #include "grn_plugin.h"
 #include <groonga/tokenizer.h>
@@ -475,7 +469,7 @@ ngram_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 /* external */
 
 grn_rc
-grn_token_init(void)
+grn_tokenizers_init(void)
 {
   static grn_proc _grn_token_uvector;
   _grn_token_uvector.obj.db = NULL;
@@ -490,339 +484,11 @@ grn_token_init(void)
 }
 
 grn_rc
-grn_token_fin(void)
+grn_tokenizers_fin(void)
 {
   return GRN_SUCCESS;
 }
 
-static void
-grn_token_cursor_open_initialize_token_filters(grn_ctx *ctx,
-                                               grn_token_cursor *token_cursor)
-{
-  grn_obj *token_filters = token_cursor->token_filters;
-  unsigned int i, n_token_filters;
-
-  if (token_filters) {
-    n_token_filters = GRN_BULK_VSIZE(token_filters) / sizeof(grn_obj *);
-  } else {
-    n_token_filters = 0;
-  }
-
-  for (i = 0; i < n_token_filters; i++) {
-    grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i);
-    grn_proc *token_filter = (grn_proc *)token_filter_object;
-
-    token_filter->user_data =
-      token_filter->callbacks.token_filter.init(ctx,
-                                                token_cursor->table,
-                                                token_cursor->mode);
-  }
-}
-
-grn_token_cursor *
-grn_token_cursor_open(grn_ctx *ctx, grn_obj *table,
-                      const char *str, size_t str_len,
-                      grn_token_mode mode, unsigned int flags)
-{
-  grn_token_cursor *token_cursor;
-  grn_encoding encoding;
-  grn_obj *tokenizer;
-  grn_obj *normalizer;
-  grn_obj *token_filters;
-  grn_obj_flags table_flags;
-  if (grn_table_get_info(ctx, table, &table_flags, &encoding, &tokenizer,
-                         &normalizer, &token_filters)) {
-    return NULL;
-  }
-  if (!(token_cursor = GRN_MALLOC(sizeof(grn_token_cursor)))) { return NULL; }
-  token_cursor->table = table;
-  token_cursor->mode = mode;
-  token_cursor->encoding = encoding;
-  token_cursor->tokenizer = tokenizer;
-  token_cursor->token_filters = token_filters;
-  token_cursor->orig = (const unsigned char *)str;
-  token_cursor->orig_blen = str_len;
-  token_cursor->curr = NULL;
-  token_cursor->nstr = NULL;
-  token_cursor->curr_size = 0;
-  token_cursor->pos = -1;
-  token_cursor->status = GRN_TOKEN_DOING;
-  token_cursor->force_prefix = 0;
-  if (tokenizer) {
-    grn_obj str_, flags_, mode_;
-    GRN_TEXT_INIT(&str_, GRN_OBJ_DO_SHALLOW_COPY);
-    GRN_TEXT_SET_REF(&str_, str, str_len);
-    GRN_UINT32_INIT(&flags_, 0);
-    GRN_UINT32_SET(ctx, &flags_, flags);
-    GRN_UINT32_INIT(&mode_, 0);
-    GRN_UINT32_SET(ctx, &mode_, mode);
-    token_cursor->pctx.caller = NULL;
-    token_cursor->pctx.user_data.ptr = NULL;
-    token_cursor->pctx.proc = (grn_proc *)tokenizer;
-    token_cursor->pctx.hooks = NULL;
-    token_cursor->pctx.currh = NULL;
-    token_cursor->pctx.phase = PROC_INIT;
-    grn_ctx_push(ctx, &mode_);
-    grn_ctx_push(ctx, &str_);
-    grn_ctx_push(ctx, &flags_);
-    ((grn_proc *)tokenizer)->funcs[PROC_INIT](ctx, 1, &table, &token_cursor->pctx.user_data);
-    grn_obj_close(ctx, &flags_);
-    grn_obj_close(ctx, &str_);
-    grn_obj_close(ctx, &mode_);
-  } else {
-    int nflags = 0;
-    token_cursor->nstr = grn_string_open_(ctx, str, str_len,
-                                          normalizer,
-                                          nflags,
-                                          token_cursor->encoding);
-    if (token_cursor->nstr) {
-      const char *normalized;
-      grn_string_get_normalized(ctx, token_cursor->nstr,
-                                &normalized, &(token_cursor->curr_size), NULL);
-      token_cursor->curr = (const unsigned char *)normalized;
-    } else {
-      ERR(GRN_TOKENIZER_ERROR,
-          "[token-cursor][open] failed to grn_string_open()");
-    }
-  }
-
-  grn_token_cursor_open_initialize_token_filters(ctx, token_cursor);
-
-  if (ctx->rc) {
-    grn_token_cursor_close(ctx, token_cursor);
-    token_cursor = NULL;
-  }
-  return token_cursor;
-}
-
-static int
-grn_token_cursor_next_apply_token_filters(grn_ctx *ctx,
-                                          grn_token_cursor *token_cursor,
-                                          grn_obj *current_token_data,
-                                          grn_obj *status)
-{
-  grn_obj *token_filters = token_cursor->token_filters;
-  unsigned int i, n_token_filters;
-  grn_token current_token;
-  grn_token next_token;
-
-  if (token_filters) {
-    n_token_filters = GRN_BULK_VSIZE(token_filters) / sizeof(grn_obj *);
-  } else {
-    n_token_filters = 0;
-  }
-
-  GRN_TEXT_INIT(&(current_token.data), GRN_OBJ_DO_SHALLOW_COPY);
-  GRN_TEXT_SET(ctx, &(current_token.data),
-               GRN_TEXT_VALUE(current_token_data),
-               GRN_TEXT_LEN(current_token_data));
-  current_token.status = GRN_INT32_VALUE(status);
-  GRN_TEXT_INIT(&(next_token.data), GRN_OBJ_DO_SHALLOW_COPY);
-  GRN_TEXT_SET(ctx, &(next_token.data),
-               GRN_TEXT_VALUE(&(current_token.data)),
-               GRN_TEXT_LEN(&(current_token.data)));
-  next_token.status = current_token.status;
-
-  for (i = 0; i < n_token_filters; i++) {
-    grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i);
-    grn_proc *token_filter = (grn_proc *)token_filter_object;
-
-#define SKIP_FLAGS\
-    (GRN_TOKENIZER_TOKEN_SKIP |\
-     GRN_TOKENIZER_TOKEN_SKIP_WITH_POSITION)
-    if (current_token.status & SKIP_FLAGS) {
-      break;
-    }
-#undef SKIP_FLAGS
-
-    token_filter->callbacks.token_filter.filter(ctx,
-                                                &current_token,
-                                                &next_token,
-                                                token_filter->user_data);
-    GRN_TEXT_SET(ctx, &(current_token.data),
-                 GRN_TEXT_VALUE(&(next_token.data)),
-                 GRN_TEXT_LEN(&(next_token.data)));
-    current_token.status = next_token.status;
-  }
-
-  token_cursor->curr =
-    (const unsigned char *)GRN_TEXT_VALUE(&(current_token.data));
-  token_cursor->curr_size = GRN_TEXT_LEN(&(current_token.data));
-
-  return current_token.status;
-}
-
-grn_id
-grn_token_cursor_next(grn_ctx *ctx, grn_token_cursor *token_cursor)
-{
-  int status;
-  grn_id tid = GRN_ID_NIL;
-  grn_obj *table = token_cursor->table;
-  grn_obj *tokenizer = token_cursor->tokenizer;
-  while (token_cursor->status != GRN_TOKEN_DONE) {
-    if (tokenizer) {
-      grn_obj *curr_, *stat_;
-      ((grn_proc *)tokenizer)->funcs[PROC_NEXT](ctx, 1, &table, &token_cursor->pctx.user_data);
-      stat_ = grn_ctx_pop(ctx);
-      curr_ = grn_ctx_pop(ctx);
-      status = grn_token_cursor_next_apply_token_filters(ctx, token_cursor,
-                                                         curr_, stat_);
-      token_cursor->status =
-        ((status & GRN_TOKENIZER_TOKEN_LAST) ||
-         (token_cursor->mode == GRN_TOKEN_GET &&
-          (status & GRN_TOKENIZER_TOKEN_REACH_END)))
-        ? GRN_TOKEN_DONE : GRN_TOKEN_DOING;
-      token_cursor->force_prefix = 0;
-#define SKIP_FLAGS \
-      (GRN_TOKENIZER_TOKEN_SKIP | GRN_TOKENIZER_TOKEN_SKIP_WITH_POSITION)
-      if (status & SKIP_FLAGS) {
-        if (status & GRN_TOKENIZER_TOKEN_SKIP) {
-          token_cursor->pos++;
-        }
-        if (token_cursor->status == GRN_TOKEN_DONE && tid == GRN_ID_NIL) {
-          token_cursor->status = GRN_TOKEN_DONE_SKIP;
-          break;
-        } else {
-          continue;
-        }
-      }
-#undef SKIP_FLAGS
-      if (token_cursor->curr_size == 0) {
-        char tokenizer_name[GRN_TABLE_MAX_KEY_SIZE];
-        int tokenizer_name_length;
-        tokenizer_name_length =
-          grn_obj_name(ctx, token_cursor->tokenizer,
-                       tokenizer_name, GRN_TABLE_MAX_KEY_SIZE);
-        GRN_LOG(ctx, GRN_WARN,
-                "[token_next] ignore an empty token: <%.*s>: <%.*s>",
-                tokenizer_name_length, tokenizer_name,
-                token_cursor->orig_blen, token_cursor->orig);
-        continue;
-      }
-      if (token_cursor->curr_size > GRN_TABLE_MAX_KEY_SIZE) {
-        GRN_LOG(ctx, GRN_WARN,
-                "[token_next] ignore too long token. "
-                "Token must be less than or equal to %d: <%d>(<%.*s>)",
-                GRN_TABLE_MAX_KEY_SIZE,
-                token_cursor->curr_size,
-                token_cursor->curr_size, token_cursor->curr);
-        continue;
-      }
-      if (status & GRN_TOKENIZER_TOKEN_UNMATURED) {
-        if (status & GRN_TOKENIZER_TOKEN_OVERLAP) {
-          if (token_cursor->mode == GRN_TOKEN_GET) { token_cursor->pos++; continue; }
-        } else {
-          if (status & GRN_TOKENIZER_TOKEN_LAST) { token_cursor->force_prefix = 1; }
-        }
-      }
-    } else {
-      token_cursor->status = GRN_TOKEN_DONE;
-    }
-    if (token_cursor->mode == GRN_TOKEN_ADD) {
-      switch (table->header.type) {
-      case GRN_TABLE_PAT_KEY :
-        if (grn_io_lock(ctx, ((grn_pat *)table)->io, grn_lock_timeout)) {
-          tid = GRN_ID_NIL;
-        } else {
-          tid = grn_pat_add(ctx, (grn_pat *)table, token_cursor->curr, token_cursor->curr_size,
-                            NULL, NULL);
-          grn_io_unlock(((grn_pat *)table)->io);
-        }
-        break;
-      case GRN_TABLE_DAT_KEY :
-        if (grn_io_lock(ctx, ((grn_dat *)table)->io, grn_lock_timeout)) {
-          tid = GRN_ID_NIL;
-        } else {
-          tid = grn_dat_add(ctx, (grn_dat *)table, token_cursor->curr, token_cursor->curr_size,
-                            NULL, NULL);
-          grn_io_unlock(((grn_dat *)table)->io);
-        }
-        break;
-      case GRN_TABLE_HASH_KEY :
-        if (grn_io_lock(ctx, ((grn_hash *)table)->io, grn_lock_timeout)) {
-          tid = GRN_ID_NIL;
-        } else {
-          tid = grn_hash_add(ctx, (grn_hash *)table, token_cursor->curr, token_cursor->curr_size,
-                             NULL, NULL);
-          grn_io_unlock(((grn_hash *)table)->io);
-        }
-        break;
-      case GRN_TABLE_NO_KEY :
-        if (token_cursor->curr_size == sizeof(grn_id)) {
-          tid = *((grn_id *)token_cursor->curr);
-        } else {
-          tid = GRN_ID_NIL;
-        }
-        break;
-      }
-    } else {
-      switch (table->header.type) {
-      case GRN_TABLE_PAT_KEY :
-        tid = grn_pat_get(ctx, (grn_pat *)table, token_cursor->curr, token_cursor->curr_size, NULL);
-        break;
-      case GRN_TABLE_DAT_KEY :
-        tid = grn_dat_get(ctx, (grn_dat *)table, token_cursor->curr, token_cursor->curr_size, NULL);
-        break;
-      case GRN_TABLE_HASH_KEY :
-        tid = grn_hash_get(ctx, (grn_hash *)table, token_cursor->curr, token_cursor->curr_size, NULL);
-        break;
-      case GRN_TABLE_NO_KEY :
-        if (token_cursor->curr_size == sizeof(grn_id)) {
-          tid = *((grn_id *)token_cursor->curr);
-        } else {
-          tid = GRN_ID_NIL;
-        }
-        break;
-      }
-    }
-    if (tid == GRN_ID_NIL && token_cursor->status != GRN_TOKEN_DONE) {
-      token_cursor->status = GRN_TOKEN_NOT_FOUND;
-    }
-    token_cursor->pos++;
-    break;
-  }
-  return tid;
-}
-
-static void
-grn_token_cursor_close_token_filters(grn_ctx *ctx,
-                                     grn_token_cursor *token_cursor)
-{
-  grn_obj *token_filters = token_cursor->token_filters;
-  unsigned int i, n_token_filters;
-
-  if (token_filters) {
-    n_token_filters = GRN_BULK_VSIZE(token_filters) / sizeof(grn_obj *);
-  } else {
-    n_token_filters = 0;
-  }
-  for (i = 0; i < n_token_filters; i++) {
-    grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i);
-    grn_proc *token_filter = (grn_proc *)token_filter_object;
-
-    token_filter->callbacks.token_filter.fin(ctx, token_filter->user_data);
-  }
-}
-
-grn_rc
-grn_token_cursor_close(grn_ctx *ctx, grn_token_cursor *token_cursor)
-{
-  if (token_cursor) {
-    if (token_cursor->tokenizer) {
-      ((grn_proc *)token_cursor->tokenizer)->funcs[PROC_FIN](ctx, 1, &token_cursor->table,
-                                                             &token_cursor->pctx.user_data);
-    }
-    grn_token_cursor_close_token_filters(ctx, token_cursor);
-    if (token_cursor->nstr) {
-      grn_obj_close(ctx, token_cursor->nstr);
-    }
-    GRN_FREE(token_cursor);
-    return GRN_SUCCESS;
-  } else {
-    return GRN_INVALID_ARGUMENT;
-  }
-}
-
 grn_rc
 grn_db_init_mecab_tokenizer(grn_ctx *ctx)
 {
-------------- next part --------------
An HTML attachment was scrubbed.