null+****@clear*****
null+****@clear*****
2012年 2月 9日 (木) 12:52:50 JST
Susumu Yata 2012-02-09 12:52:50 +0900 (Thu, 09 Feb 2012)
New Revision: 679948e7d5245df5f2bbc13aa36a70bf4f534862
Log:
added a new module to help tokenizer development.
Added files:
include/groonga/tokenizer.h
lib/tokenizer.c
Added: include/groonga/tokenizer.h (+248 -0) 100644
===================================================================
--- /dev/null
+++ include/groonga/tokenizer.h 2012-02-09 12:52:50 +0900 (d5970f4)
@@ -0,0 +1,248 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+ Copyright(C) 2012 Brazil
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License version 2.1 as published by the Free Software Foundation.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+#ifndef GRN_PLUGIN_TOKENIZER_H
+#define GRN_PLUGIN_TOKENIZER_H
+
+#include <stddef.h>
+
+#include <groonga/plugin.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/*
+ Don't call these functions directly. Use GRN_TOKENIZER_MALLOC() and
+ GRN_TOKENIZER_FREE() instead.
+ */
+void *grn_tokenizer_malloc(grn_ctx *ctx, size_t size, const char *file,
+ int line, const char *func);
+void grn_tokenizer_free(grn_ctx *ctx, void *ptr, const char *file,
+ int line, const char *func);
+
+/*
+ GRN_TOKENIZER_MALLOC() allocates `size' bytes and returns a pointer to the
+ allocated memory space. Note that the memory space is associated with `ctx'.
+ */
+#define GRN_TOKENIZER_MALLOC(ctx, size) \
+ grn_tokenizer_malloc((ctx), (size), __FILE__, __LINE__, __FUNCTION__)
+/*
+ GRN_TOKENIZER_FREE() frees a memory space allocated by
+ GRN_TOKENIZER_MALLOC(). This means that `ptr' must be a pointer returned by
+ GRN_TOKENIZER_MALLOC().
+ */
+#define GRN_TOKENIZER_FREE(ctx, ptr) \
+ grn_tokenizer_free((ctx), (ptr), __FILE__, __LINE__, __FUNCTION__)
+
+/*
+ GRN_TOKENIZER_LOG() reports a log of `level'. Its error message is generated
+ from `format' and the varying number of arguments. See grn_log_level in
+ "groonga.h" for more details of `level'.
+ */
+#define GRN_TOKENIZER_LOG(ctx, level, format, ...) \
+ GRN_LOG((ctx), (level), (format), ## __VA_ARGS__)
+
+/*
+ Don't call grn_tokenizer_set_error() directly. This function is used in
+ GRN_TOKENIZER_SET_ERROR().
+ */
+void grn_tokenizer_set_error(grn_ctx *ctx, grn_log_level level,
+ grn_rc error_code,
+ const char *file, int line, const char *func,
+ const char *format, ...);
+
+/*
+ Don't call these functions directly. grn_tokenizer_backtrace() and
+ grn_tokenizer_logtrace() are used in GRN_TOKENIZER_SET_ERROR().
+ */
+void grn_tokenizer_backtrace(grn_ctx *ctx);
+void grn_tokenizer_logtrace(grn_ctx *ctx, grn_log_level level);
+
+/*
+ Don't use GRN_TOKENIZER_SET_ERROR() directly; use GRN_TOKENIZER_ERROR().
+ `ctx', `level', `format' and the variadic arguments are evaluated repeatedly.
+ */
+#define GRN_TOKENIZER_SET_ERROR(ctx, level, error_code, format, ...) do { \
+ grn_tokenizer_set_error(ctx, level, error_code, \
+ __FILE__, __LINE__, __FUNCTION__, \
+ format, ## __VA_ARGS__); \
+ GRN_LOG(ctx, level, format, ## __VA_ARGS__); \
+ grn_tokenizer_backtrace(ctx); \
+ grn_tokenizer_logtrace(ctx, level); \
+} while (0)
+
+/*
+ GRN_TOKENIZER_ERROR() reports an error of `error_code'. Its error message is
+ generated from `format' and the varying number of arguments. See grn_rc in
+ "groonga.h" for more details of `error_code'.
+ */
+#define GRN_TOKENIZER_ERROR(ctx, error_code, format, ...) \
+ GRN_TOKENIZER_SET_ERROR(ctx, GRN_LOG_ERROR, error_code, \
+ format, ## __VA_ARGS__)
+
+/*
+ grn_tokenizer_mutex is available to make a critical section. See the
+ following functions.
+ */
+typedef struct _grn_tokenizer_mutex grn_tokenizer_mutex;
+
+/*
+ grn_tokenizer_mutex_create() returns a pointer to a new object of
+ grn_tokenizer_mutex. Memory for the new object is obtained with
+ GRN_TOKENIZER_MALLOC(). grn_tokenizer_mutex_create() returns NULL if
+ sufficient memory is not available.
+ */
+grn_tokenizer_mutex *grn_tokenizer_mutex_create(grn_ctx *ctx);
+
+/*
+ grn_tokenizer_mutex_destroy() finalizes an object of grn_tokenizer_mutex
+ and then frees memory allocated for that object.
+ */
+void grn_tokenizer_mutex_destroy(grn_ctx *ctx, grn_tokenizer_mutex *mutex);
+
+/*
+ grn_tokenizer_mutex_lock() locks a mutex object. If the object is already
+ locked, the calling thread waits until the object is unlocked.
+ */
+void grn_tokenizer_mutex_lock(grn_ctx *ctx, grn_tokenizer_mutex *mutex);
+
+/*
+ grn_tokenizer_mutex_unlock() unlocks a mutex object.
+ grn_tokenizer_mutex_unlock() should not be called for an unlocked object.
+ */
+void grn_tokenizer_mutex_unlock(grn_ctx *ctx, grn_tokenizer_mutex *mutex);
+
+/*
+ grn_tokenizer_charlen() returns the length (#bytes) of the first character
+ in the string specified by `str_ptr' and `str_length'. If the starting bytes
+ are invalid as a character, grn_tokenizer_charlen() returns 0. See
+ grn_encoding in "groonga.h" for more details of `encoding'.
+ */
+int grn_tokenizer_charlen(grn_ctx *ctx, const char *str_ptr,
+ unsigned int str_length, grn_encoding encoding);
+
+/*
+ grn_tokenizer_isspace() returns the length (#bytes) of the first character
+ in the string specified by `str_ptr' and `str_length' if it is a space
+ character. Otherwise, grn_tokenizer_isspace() returns 0.
+ */
+int grn_tokenizer_isspace(grn_ctx *ctx, const char *str_ptr,
+ unsigned int str_length, grn_encoding encoding);
+
+/*
+ grn_tokenizer_query is a structure for storing a query. See the following
+ functions.
+ */
+typedef struct _grn_tokenizer_query grn_tokenizer_query;
+
+struct _grn_tokenizer_query {
+ grn_str *str; /* opened in _query_create(), closed in _query_destroy() */
+ const char *ptr; /* set to str->norm */
+ unsigned int length; /* set to str->norm_blen */
+ grn_encoding encoding; /* encoding reported for the table given at creation */
+};
+
+/*
+ grn_tokenizer_query_create() parses `args' and returns a new object of
+ grn_tokenizer_query. The new object stores information of the query.
+ grn_tokenizer_query_create() normalizes the query if the target table
+ requires normalization. grn_tokenizer_query_create() returns NULL if
+ something goes wrong. Note that grn_tokenizer_query_create() must be called
+ just once in the function that initializes a tokenizer.
+ */
+grn_tokenizer_query *grn_tokenizer_query_create(grn_ctx *ctx,
+ int num_args, grn_obj **args);
+
+/*
+ grn_tokenizer_query_destroy() finalizes an object of grn_tokenizer_query
+ and then frees memory allocated for that object.
+ */
+void grn_tokenizer_query_destroy(grn_ctx *ctx, grn_tokenizer_query *query);
+
+/*
+ grn_tokenizer_token is needed to return tokens. A grn_tokenizer_token object
+ stores a token to be returned and it must be maintained until a request for
+ next token or finalization comes.
+ */
+typedef struct _grn_tokenizer_token grn_tokenizer_token;
+
+struct _grn_tokenizer_token {
+ grn_obj str; /* token text; set by reference in grn_tokenizer_token_push() */
+ grn_obj status; /* UINT32 value: 0 or GRN_TOKEN_LAST */
+};
+
+/*
+ grn_tokenizer_token_init() initializes `token'. Note that an initialized
+ object must be finalized by grn_tokenizer_token_fin().
+ */
+void grn_tokenizer_token_init(grn_ctx *ctx, grn_tokenizer_token *token);
+
+/*
+ grn_tokenizer_token_fin() finalizes `token' that has been initialized by
+ grn_tokenizer_token_init().
+ */
+void grn_tokenizer_token_fin(grn_ctx *ctx, grn_tokenizer_token *token);
+
+/*
+ grn_tokenizer_status provides a list of tokenizer status codes.
+ GRN_TOKENIZER_CONTINUE means that the next token is not the last one and
+ GRN_TOKENIZER_LAST means that the next token is the last one. If a document
+ or query contains no tokens, please push an empty string with
+ GRN_TOKENIZER_LAST as a token.
+ */
+typedef enum _grn_tokenizer_status grn_tokenizer_status;
+
+enum _grn_tokenizer_status {
+ GRN_TOKENIZER_CONTINUE = 0, /* the next token is not the last one */
+ GRN_TOKENIZER_LAST = 1 /* the next token is the last one */
+};
+
+/*
+ grn_tokenizer_token_push() pushes the next token in `*token'. Note that
+ grn_tokenizer_token_push() does not make a copy of the given string. This
+ means that you have to maintain a memory space allocated to the string.
+ Also note that the grn_tokenizer_token object must be maintained until the
+ request for the next token or finalization comes. See grn_tokenizer_status in
+ this header for more details of `status'.
+ */
+void grn_tokenizer_token_push(grn_ctx *ctx, grn_tokenizer_token *token,
+ const char *str_ptr, unsigned int str_length,
+ grn_tokenizer_status status);
+
+/*
+ grn_tokenizer_register() registers a plugin to the database which is
+ associated with `ctx'. `plugin_name_ptr' and `plugin_name_length' specify the
+ plugin name, which may consist of alphabetic letters ('A'-'Z' and 'a'-'z'),
+ digits ('0'-'9') and underscores ('_'). `init', `next' and `fin' specify
+ the plugin functions. `init' is called for initializing a tokenizer for a
+ document or query. `next' is called for extracting tokens one by one. `fin'
+ is called for finalizing a tokenizer. grn_tokenizer_register() returns
+ GRN_SUCCESS on success, or an error code on failure. See "groonga.h" for more
+ details of grn_proc_func and grn_user_data, which is used as an argument of
+ grn_proc_func.
+ */
+grn_rc grn_tokenizer_register(grn_ctx *ctx, const char *plugin_name_ptr,
+ unsigned int plugin_name_length,
+ grn_proc_func *init, grn_proc_func *next,
+ grn_proc_func *fin);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* GRN_PLUGIN_TOKENIZER_H */
Added: lib/tokenizer.c (+274 -0) 100644
===================================================================
--- /dev/null
+++ lib/tokenizer.c 2012-02-09 12:52:50 +0900 (9847086)
@@ -0,0 +1,274 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+ Copyright(C) 2012 Brazil
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License version 2.1 as published by the Free Software Foundation.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+#include "groonga/tokenizer.h"
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "ctx.h"
+#include "db.h"
+#include "str.h"
+#include "token.h"
+
+void *grn_tokenizer_malloc(grn_ctx *ctx, size_t size, const char *file,
+ int line, const char *func) {
+ return grn_malloc(ctx, size, file, line, func); /* forwards the call site */
+}
+
+void grn_tokenizer_free(grn_ctx *ctx, void *ptr, const char *file,
+                        int line, const char *func) {
+  grn_free(ctx, ptr, file, line, func);  /* void: nothing to return */
+}
+
+/*
+  grn_tokenizer_ctx_log() does the same job as grn_ctx_log() in ctx.c, but it
+  receives the arguments as a va_list instead of `...'.
+ */
+static void grn_tokenizer_ctx_log(grn_ctx *ctx, const char *format,
+                                  va_list ap) {
+  va_list args;
+  va_copy(args, ap);  /* format from a copy of the caller's `ap' */
+  vsnprintf(ctx->errbuf, GRN_CTX_MSGSIZE, format, args);
+  va_end(args);
+}
+
+void grn_tokenizer_set_error(grn_ctx *ctx, grn_log_level level,
+                             grn_rc error_code,
+                             const char *file, int line, const char *func,
+                             const char *format, ...) {
+  va_list args;
+
+  /* Record the level, the error code and the place of the error in `ctx'. */
+  ctx->errlvl = level;
+  ctx->rc = error_code;
+  ctx->errfile = file;
+  ctx->errline = line;
+  ctx->errfunc = func;
+  grn_ctx_impl_err(ctx);
+  /* Then generate the message from `format' and the variadic arguments. */
+  va_start(args, format);
+  grn_tokenizer_ctx_log(ctx, format, args);
+  va_end(args);
+}
+
+void grn_tokenizer_backtrace(grn_ctx *ctx) {
+ BACKTRACE(ctx); /* exposes BACKTRACE() (defined elsewhere) as a function */
+}
+
+void grn_tokenizer_logtrace(grn_ctx *ctx, grn_log_level level) {
+  /* Only levels that compare <= GRN_LOG_ERROR are traced. */
+  if (level > GRN_LOG_ERROR) { return; }
+  LOGTRACE(ctx, level);
+}
+
+struct _grn_tokenizer_mutex {
+ grn_critical_section critical_section; /* used via CRITICAL_SECTION_*() */
+};
+
+grn_tokenizer_mutex *grn_tokenizer_mutex_create(grn_ctx *ctx) {
+  grn_tokenizer_mutex *created = GRN_TOKENIZER_MALLOC(ctx, sizeof(*created));
+  if (created == NULL) {
+    return NULL;  /* no memory: there is nothing to initialize */
+  }
+  CRITICAL_SECTION_INIT(created->critical_section);
+  return created;
+}
+
+void grn_tokenizer_mutex_destroy(grn_ctx *ctx, grn_tokenizer_mutex *mutex) {
+  if (mutex == NULL) { return; }  /* nothing to destroy */
+  /* The critical section is finalized before its storage is freed. */
+  CRITICAL_SECTION_FIN(mutex->critical_section);
+  GRN_TOKENIZER_FREE(ctx, mutex);
+}
+
+void grn_tokenizer_mutex_lock(grn_ctx *ctx, grn_tokenizer_mutex *mutex) {
+  /* A NULL mutex is ignored. `ctx' is not used here. */
+  if (mutex == NULL) { return; }
+  CRITICAL_SECTION_ENTER(mutex->critical_section);
+}
+
+void grn_tokenizer_mutex_unlock(grn_ctx *ctx, grn_tokenizer_mutex *mutex) {
+  /* A NULL mutex is ignored. `ctx' is not used here. */
+  if (mutex == NULL) { return; }
+  CRITICAL_SECTION_LEAVE(mutex->critical_section);
+}
+
+/*
+ grn_tokenizer_charlen() takes a length; grn_charlen_() takes an end pointer.
+ */
+int grn_tokenizer_charlen(grn_ctx *ctx, const char *str_ptr,
+ unsigned int str_length, grn_encoding encoding) {
+ return grn_charlen_(ctx, str_ptr, str_ptr + str_length, encoding);
+}
+
+/*
+  grn_tokenizer_isspace() takes the length of a string, unlike grn_isspace().
+ */
+int grn_tokenizer_isspace(grn_ctx *ctx, const char *str_ptr,
+                          unsigned int str_length, grn_encoding encoding) {
+  const unsigned char * const bytes = (const unsigned char *)str_ptr;
+
+  /* A missing or empty string does not start with a space character. */
+  if ((bytes == NULL) || (str_length == 0)) {
+    return 0;
+  }
+
+  /* One-byte white space is accepted whatever `encoding' is. */
+  if ((bytes[0] == ' ') || (bytes[0] == '\f') || (bytes[0] == '\n') ||
+      (bytes[0] == '\r') || (bytes[0] == '\t') || (bytes[0] == '\v')) {
+    return 1;
+  }
+
+  /*
+    The remaining candidates are the multi-byte forms of U+3000 (IDEOGRAPHIC
+    SPACE), each of which is tied to one encoding. `str_length' is checked
+    before the trailing bytes are read so that the string is never overrun.
+   */
+
+  /* Shift_JIS: 0x81 0x40. */
+  if ((encoding == GRN_ENC_SJIS) && (str_length >= 2) &&
+      (bytes[0] == 0x81) && (bytes[1] == 0x40)) {
+    return 2;
+  }
+
+  /* EUC-JP: 0xA1 0xA1. */
+  if ((encoding == GRN_ENC_EUC_JP) && (str_length >= 2) &&
+      (bytes[0] == 0xA1) && (bytes[1] == 0xA1)) {
+    return 2;
+  }
+
+  /* UTF-8: 0xE3 0x80 0x80. */
+  if ((encoding == GRN_ENC_UTF8) && (str_length >= 3) &&
+      (bytes[0] == 0xE3) && (bytes[1] == 0x80) && (bytes[2] == 0x80)) {
+    return 3;
+  }
+
+  return 0;
+}
+
+grn_tokenizer_query *grn_tokenizer_query_create(grn_ctx *ctx,
+                                                int num_args, grn_obj **args) {
+  grn_obj_flags table_flags;
+  grn_encoding table_encoding;
+  grn_tokenizer_query *query;
+  /* The string to be tokenized is popped from `ctx'. */
+  grn_obj * const query_str = grn_ctx_pop(ctx);
+  if (query_str == NULL) {
+    GRN_TOKENIZER_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument");
+    return NULL;
+  }
+  /* `args[0]' is the table whose flags and encoding control normalization. */
+  if ((num_args < 1) || (args == NULL) || (args[0] == NULL)) {
+    GRN_TOKENIZER_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer");
+    return NULL;
+  }
+  /*
+    Give up if the table information is not available. Otherwise,
+    `table_flags' and `table_encoding' would be read uninitialized.
+   */
+  if (grn_table_get_info(ctx, args[0], &table_flags, &table_encoding,
+                         NULL) != GRN_SUCCESS) {
+    GRN_TOKENIZER_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                        "grn_table_get_info() failed");
+    return NULL;
+  }
+  query = GRN_TOKENIZER_MALLOC(ctx, sizeof(*query));
+  if (query == NULL) {
+    return NULL;
+  }
+  query->str = grn_str_open_(ctx, GRN_TEXT_VALUE(query_str),
+                             GRN_TEXT_LEN(query_str),
+                             table_flags & GRN_OBJ_KEY_NORMALIZE,
+                             table_encoding);
+  if (query->str == NULL) {
+    GRN_TOKENIZER_FREE(ctx, query);
+    return NULL;
+  }
+  query->ptr = query->str->norm;
+  query->length = query->str->norm_blen;
+  query->encoding = table_encoding;
+  return query;
+}
+
+void grn_tokenizer_query_destroy(grn_ctx *ctx, grn_tokenizer_query *query) {
+  if (query == NULL) {
+    return;  /* nothing to release */
+  }
+  /* The string is closed first and then the query itself is freed. */
+  if (query->str != NULL) { grn_str_close(ctx, query->str); }
+  GRN_TOKENIZER_FREE(ctx, query);
+}
+
+void grn_tokenizer_token_init(grn_ctx *ctx, grn_tokenizer_token *token) {
+ GRN_TEXT_INIT(&token->str, GRN_OBJ_DO_SHALLOW_COPY); /* text, shallow copy */
+ GRN_UINT32_INIT(&token->status, 0); /* status is kept as a UINT32 value */
+}
+
+void grn_tokenizer_token_fin(grn_ctx *ctx, grn_tokenizer_token *token) {
+} /* no-op for now; pairs with grn_tokenizer_token_init() */
+
+void grn_tokenizer_token_push(grn_ctx *ctx, grn_tokenizer_token *token,
+                              const char *str_ptr, unsigned int str_length,
+                              grn_tokenizer_status status) {
+  /* `token->str' only refers to the given string; no copy is made. */
+  GRN_TEXT_SET_REF(&token->str, str_ptr, str_length);
+
+  /* Only GRN_TOKENIZER_CONTINUE clears the status. Any other value is */
+  /* handled in the same way as GRN_TOKENIZER_LAST. */
+  if (status == GRN_TOKENIZER_CONTINUE) {
+    GRN_UINT32_SET(ctx, &token->status, 0);
+  } else {
+    GRN_UINT32_SET(ctx, &token->status, GRN_TOKEN_LAST);
+  }
+
+  /* Push the text first and then the status onto `ctx'. */
+  grn_ctx_push(ctx, &token->str);
+  grn_ctx_push(ctx, &token->status);
+}
+
+grn_rc grn_tokenizer_register(grn_ctx *ctx, const char *plugin_name_ptr,
+                              unsigned int plugin_name_length,
+                              grn_proc_func *init, grn_proc_func *next,
+                              grn_proc_func *fin) {
+  grn_obj *proc;
+  /* Three variables are declared for the proc: two texts and one UINT32. */
+  grn_expr_var vars[3] = { { NULL, 0 }, { NULL, 0 }, { NULL, 0 } };
+
+  GRN_TEXT_INIT(&vars[0].value, 0);
+  GRN_TEXT_INIT(&vars[1].value, 0);
+  GRN_UINT32_INIT(&vars[2].value, 0);
+
+  /*
+    grn_proc_create() registers a plugin to the database which is associated
+    with `ctx'. The returned object is only tested against NULL: it must not
+    be finalized here.
+   */
+  proc = grn_proc_create(ctx, plugin_name_ptr, plugin_name_length,
+                         GRN_PROC_TOKENIZER, init, next, fin, 3, vars);
+  if (proc != NULL) {
+    return GRN_SUCCESS;
+  }
+
+  /*
+    GRN_TOKENIZER_ERROR() stores the error code in `ctx', so that `ctx->rc'
+    is the value to be reported to the caller.
+   */
+  GRN_TOKENIZER_ERROR(ctx, GRN_TOKENIZER_ERROR, "grn_proc_create() failed");
+  return ctx->rc;
+}