[Groonga-commit] groonga/groonga at 2eaf537 [master] grn_tokenizer_query: support custom normalize flags


Kouhei Sutou null+****@clear*****
Tue May 8 16:07:38 JST 2018


Kouhei Sutou	2018-05-08 16:07:38 +0900 (Tue, 08 May 2018)

  New Revision: 2eaf53784cd5035a44090c505273bd8d211bc5d1
  https://github.com/groonga/groonga/commit/2eaf53784cd5035a44090c505273bd8d211bc5d1

  Message:
    grn_tokenizer_query: support custom normalize flags

  Modified files:
    include/groonga/tokenizer.h
    lib/grn_tokenizer.h
    lib/tokenizer.c
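
This change makes the normalize flags of a grn_tokenizer_query adjustable after the query has been opened. include/groonga/tokenizer.h gains three public functions: grn_tokenizer_query_set_normalize_flags(), grn_tokenizer_query_get_normalize_flags() and grn_tokenizer_query_get_lexicon(). Internally, the query now remembers its lexicon and normalize flags and defers normalization: setting different flags marks the cached normalized string stale, and the getters re-normalize on demand via the new grn_tokenizer_query_ensure_normalized().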

  Modified: include/groonga/tokenizer.h (+11 -0)
===================================================================
--- include/groonga/tokenizer.h    2018-05-08 16:06:21 +0900 (56d07803c)
+++ include/groonga/tokenizer.h    2018-05-08 16:07:38 +0900 (e62a94776)
@@ -103,6 +103,14 @@ GRN_PLUGIN_EXPORT void grn_tokenizer_query_close(grn_ctx *ctx, grn_tokenizer_que
  */
 void grn_tokenizer_query_destroy(grn_ctx *ctx, grn_tokenizer_query *query);
 
+GRN_PLUGIN_EXPORT grn_rc
+grn_tokenizer_query_set_normalize_flags(grn_ctx *ctx,
+                                        grn_tokenizer_query *query,
+                                        unsigned int flags);
+GRN_PLUGIN_EXPORT unsigned int
+grn_tokenizer_query_get_normalize_flags(grn_ctx *ctx,
+                                        grn_tokenizer_query *query);
+
 GRN_PLUGIN_EXPORT grn_obj *
 grn_tokenizer_query_get_normalized_string(grn_ctx *ctx,
                                           grn_tokenizer_query *query);
@@ -125,6 +133,9 @@ grn_tokenizer_query_have_tokenized_delimiter(grn_ctx *ctx,
 GRN_PLUGIN_EXPORT grn_tokenize_mode
 grn_tokenizer_query_get_mode(grn_ctx *ctx, grn_tokenizer_query *query);
 
+GRN_PLUGIN_EXPORT grn_obj *
+grn_tokenizer_query_get_lexicon(grn_ctx *ctx, grn_tokenizer_query *query);
+
 /*
   grn_tokenizer_token is needed to return tokens. A grn_tokenizer_token object
   stores a token to be returned and it must be maintained until a request for
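
For illustration, here is a minimal sketch (not part of the commit) of how a custom tokenizer's init callback might use the new accessors. The callback name my_tokenizer_init is hypothetical, GRN_STRING_REMOVE_BLANK is just one example of an existing normalizer flag, and the init signature assumes the grn_tokenizer_init_func style used by the setter functions later in this diff:

#include <groonga.h>
#include <groonga/tokenizer.h>

static void *
my_tokenizer_init(grn_ctx *ctx, grn_tokenizer_query *query)
{
  grn_obj *lexicon;
  grn_obj *normalized_query;

  /* Changing the flags marks the cached normalized string stale;
     re-normalization happens lazily on the next getter call. */
  grn_tokenizer_query_set_normalize_flags(ctx, query,
                                          GRN_STRING_REMOVE_BLANK);

  /* Both accessors are added by this commit. */
  lexicon = grn_tokenizer_query_get_lexicon(ctx, query);
  normalized_query = grn_tokenizer_query_get_normalized_string(ctx, query);

  (void)lexicon;           /* placeholders: a real tokenizer would */
  (void)normalized_query;  /* build its per-session state here     */
  return NULL;
}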

  Modified: lib/grn_tokenizer.h (+4 -0)
===================================================================
--- lib/grn_tokenizer.h    2018-05-08 16:06:21 +0900 (87c696758)
+++ lib/grn_tokenizer.h    2018-05-08 16:07:38 +0900 (a1ad61517)
@@ -41,6 +41,10 @@ typedef struct _grn_tokenizer_query {
   grn_token_mode token_mode;
   grn_tokenize_mode tokenize_mode;
   /* End _grn_tokenizer_query_deprecated compatible layout. */
+
+  grn_obj *lexicon;
+  unsigned int normalize_flags;
+  grn_bool need_normalize;
 } grn_tokenizer_query;
 
 #ifdef __cplusplus
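
The three new members sit after the layout kept compatible with _grn_tokenizer_query_deprecated: lexicon caches the table passed as args[0], normalize_flags holds the currently requested flags, and need_normalize marks the cached normalized_query stale so that grn_tokenizer_query_ensure_normalized() (added below in lib/tokenizer.c) can re-normalize lazily on the next getter call.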

  Modified: lib/tokenizer.c (+170 -91)
===================================================================
--- lib/tokenizer.c    2018-05-08 16:06:21 +0900 (c74614324)
+++ lib/tokenizer.c    2018-05-08 16:07:38 +0900 (bd7a4dff5)
@@ -95,30 +95,81 @@ grn_tokenizer_have_tokenized_delimiter(grn_ctx *ctx,
   return GRN_FALSE;
 }
 
+static void
+grn_tokenizer_query_ensure_normalized(grn_ctx *ctx, grn_tokenizer_query *query)
+{
+  if (!query->need_normalize) {
+    return;
+  }
+
+  query->need_normalize = GRN_FALSE;
+
+  if (query->normalized_query) {
+    grn_obj_close(ctx, query->normalized_query);
+  }
+  query->normalized_query = grn_string_open_(ctx,
+                                             query->ptr,
+                                             query->length,
+                                             query->lexicon,
+                                             query->normalize_flags,
+                                             query->encoding);
+  if (!query->normalized_query) {
+    query->have_tokenized_delimiter = GRN_FALSE;
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer][normalize] failed to open normalized string");
+    return;
+  }
+
+  if (query->flags & GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER) {
+    const char *normalized_string;
+    unsigned int normalized_string_length;
+
+    grn_string_get_normalized(ctx,
+                              query->normalized_query,
+                              &normalized_string,
+                              &normalized_string_length,
+                              NULL);
+    query->have_tokenized_delimiter =
+      grn_tokenizer_have_tokenized_delimiter(ctx,
+                                             normalized_string,
+                                             normalized_string_length,
+                                             query->encoding);
+  } else {
+    query->have_tokenized_delimiter = GRN_FALSE;
+  }
+}
+
 grn_tokenizer_query *
 grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
                          unsigned int normalize_flags)
 {
-  grn_obj *flags = grn_ctx_pop(ctx);
-  grn_obj *query_str = grn_ctx_pop(ctx);
-  grn_obj *tokenize_mode = grn_ctx_pop(ctx);
+  grn_obj *flags;
+  grn_obj *query_str;
+  grn_obj *tokenize_mode;
+
+  GRN_API_ENTER;
+
+  flags = grn_ctx_pop(ctx);
+  query_str = grn_ctx_pop(ctx);
+  tokenize_mode = grn_ctx_pop(ctx);
 
   if (query_str == NULL) {
     GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument");
-    return NULL;
+    GRN_API_RETURN(NULL);
   }
 
   if ((num_args < 1) || (args == NULL) || (args[0] == NULL)) {
     GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer");
-    return NULL;
+    GRN_API_RETURN(NULL);
   }
 
   {
     grn_tokenizer_query * const query =
         GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_query));
     if (query == NULL) {
-      return NULL;
+      GRN_API_RETURN(NULL);
     }
+    query->normalize_flags = normalize_flags;
     query->normalized_query = NULL;
     query->query_buf = NULL;
     if (flags) {
@@ -136,6 +187,12 @@ grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
     {
       grn_obj * const table = args[0];
       grn_encoding table_encoding;
+      query->lexicon = table;
+      grn_table_get_info(ctx, table, NULL, &table_encoding, NULL,
+                         NULL, NULL);
+      query->encoding = table_encoding;
+    }
+    {
       unsigned int query_length = GRN_TEXT_LEN(query_str);
       char *query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, query_length + 1);
 
@@ -143,53 +200,19 @@ grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
         GRN_PLUGIN_FREE(ctx, query);
         GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                          "[tokenizer] failed to duplicate query");
-        return NULL;
-      }
-      grn_table_get_info(ctx, table, NULL, &table_encoding, NULL,
-                         NULL, NULL);
-      {
-        grn_obj *normalized_query;
-        normalized_query = grn_string_open_(ctx,
-                                            GRN_TEXT_VALUE(query_str),
-                                            GRN_TEXT_LEN(query_str),
-                                            table,
-                                            normalize_flags,
-                                            table_encoding);
-        if (!normalized_query) {
-          GRN_PLUGIN_FREE(ctx, query_buf);
-          GRN_PLUGIN_FREE(ctx, query);
-          GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
-                           "[tokenizer] failed to open normalized string");
-          return NULL;
-        }
-        query->normalized_query = normalized_query;
-        grn_memcpy(query_buf, GRN_TEXT_VALUE(query_str), query_length);
-        query_buf[query_length] = '\0';
-        query->query_buf = query_buf;
-        query->ptr = query_buf;
-        query->length = query_length;
-      }
-      query->encoding = table_encoding;
-
-      if (query->flags & GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER) {
-        const char *normalized_string;
-        unsigned int normalized_string_length;
-
-        grn_string_get_normalized(ctx,
-                                  query->normalized_query,
-                                  &normalized_string,
-                                  &normalized_string_length,
-                                  NULL);
-        query->have_tokenized_delimiter =
-          grn_tokenizer_have_tokenized_delimiter(ctx,
-                                                 normalized_string,
-                                                 normalized_string_length,
-                                                 query->encoding);
-      } else {
-        query->have_tokenized_delimiter = GRN_FALSE;
+        GRN_API_RETURN(NULL);
       }
+      grn_memcpy(query_buf, GRN_TEXT_VALUE(query_str), query_length);
+      query_buf[query_length] = '\0';
+      query->query_buf = query_buf;
+      query->ptr = query_buf;
+      query->length = query_length;
     }
-    return query;
+
+    query->need_normalize = GRN_TRUE;
+    grn_tokenizer_query_ensure_normalized(ctx, query);
+
+    GRN_API_RETURN(query);
   }
 }
 
@@ -202,15 +225,17 @@ grn_tokenizer_query_create(grn_ctx *ctx, int num_args, grn_obj **args)
 void
 grn_tokenizer_query_close(grn_ctx *ctx, grn_tokenizer_query *query)
 {
-  if (query != NULL) {
-    if (query->normalized_query != NULL) {
+  GRN_API_ENTER;
+  if (query) {
+    if (query->normalized_query) {
       grn_obj_unlink(ctx, query->normalized_query);
     }
-    if (query->query_buf != NULL) {
+    if (query->query_buf) {
       GRN_PLUGIN_FREE(ctx, query->query_buf);
     }
     GRN_PLUGIN_FREE(ctx, query);
   }
+  GRN_API_RETURN();
 }
 
 void
@@ -219,11 +244,34 @@ grn_tokenizer_query_destroy(grn_ctx *ctx, grn_tokenizer_query *query)
   grn_tokenizer_query_close(ctx, query);
 }
 
+grn_rc
+grn_tokenizer_query_set_normalize_flags(grn_ctx *ctx,
+                                        grn_tokenizer_query *query,
+                                        unsigned int normalize_flags)
+{
+  GRN_API_ENTER;
+  if (query->normalize_flags != normalize_flags) {
+    query->normalize_flags = normalize_flags;
+    query->need_normalize = GRN_TRUE;
+  }
+  GRN_API_RETURN(ctx->rc);
+}
+
+unsigned int
+grn_tokenizer_query_get_normalize_flags(grn_ctx *ctx,
+                                        grn_tokenizer_query *query)
+{
+  GRN_API_ENTER;
+  GRN_API_RETURN(query->normalize_flags);
+}
+
 grn_obj *
 grn_tokenizer_query_get_normalized_string(grn_ctx *ctx,
                                           grn_tokenizer_query *query)
 {
-  return query->normalized_query;
+  GRN_API_ENTER;
+  grn_tokenizer_query_ensure_normalized(ctx, query);
+  GRN_API_RETURN(query->normalized_query);
 }
 
 const char *
@@ -231,35 +279,48 @@ grn_tokenizer_query_get_raw_string(grn_ctx *ctx,
                                    grn_tokenizer_query *query,
                                    size_t *length)
 {
+  GRN_API_ENTER;
   if (length) {
     *length = query->length;
   }
-  return query->ptr;
+  GRN_API_RETURN(query->ptr);
 }
 
 grn_encoding
 grn_tokenizer_query_get_encoding(grn_ctx *ctx, grn_tokenizer_query *query)
 {
-  return query->encoding;
+  GRN_API_ENTER;
+  GRN_API_RETURN(query->encoding);
 }
 
 unsigned int
 grn_tokenizer_query_get_flags(grn_ctx *ctx, grn_tokenizer_query *query)
 {
-  return query->flags;
+  GRN_API_ENTER;
+  GRN_API_RETURN(query->flags);
 }
 
 grn_bool
 grn_tokenizer_query_have_tokenized_delimiter(grn_ctx *ctx,
                                              grn_tokenizer_query *query)
 {
-  return query->have_tokenized_delimiter;
+  GRN_API_ENTER;
+  grn_tokenizer_query_ensure_normalized(ctx, query);
+  GRN_API_RETURN(query->have_tokenized_delimiter);
 }
 
 grn_tokenize_mode
 grn_tokenizer_query_get_mode(grn_ctx *ctx, grn_tokenizer_query *query)
 {
-  return query->tokenize_mode;
+  GRN_API_ENTER;
+  GRN_API_RETURN(query->tokenize_mode);
+}
+
+grn_obj *
+grn_tokenizer_query_get_lexicon(grn_ctx *ctx, grn_tokenizer_query *query)
+{
+  GRN_API_ENTER;
+  GRN_API_RETURN(query->lexicon);
 }
 
 void
@@ -358,57 +419,75 @@ grn_tokenizer_register(grn_ctx *ctx, const char *plugin_name_ptr,
 }
 
 grn_obj *
-grn_token_get_data(grn_ctx *ctx, grn_token *token)
+grn_tokenizer_create(grn_ctx *ctx,
+                     const char *name,
+                     int name_length)
 {
+  grn_obj *tokenizer;
+
   GRN_API_ENTER;
-  if (!token) {
-    ERR(GRN_INVALID_ARGUMENT, "token must not be NULL");
-    GRN_API_RETURN(NULL);
+  tokenizer = grn_proc_create(ctx,
+                              name,
+                              name_length,
+                              GRN_PROC_TOKENIZER,
+                              NULL,
+                              NULL,
+                              NULL,
+                              0,
+                              NULL);
+  if (!tokenizer) {
+    GRN_PLUGIN_ERROR(ctx,
+                     GRN_TOKENIZER_ERROR,
+                     "[tokenizer][create] failed to create");
   }
-  GRN_API_RETURN(&(token->data));
+
+  GRN_API_RETURN(tokenizer);
 }
 
 grn_rc
-grn_token_set_data(grn_ctx *ctx,
-                   grn_token *token,
-                   const char *str_ptr,
-                   int str_length)
+grn_tokenizer_set_init_func(grn_ctx *ctx,
+                            grn_obj *tokenizer,
+                            grn_tokenizer_init_func *init)
 {
   GRN_API_ENTER;
-  if (!token) {
-    ERR(GRN_INVALID_ARGUMENT, "token must not be NULL");
-    goto exit;
-  }
-  if (str_length == -1) {
-    str_length = strlen(str_ptr);
+  if (tokenizer) {
+    ((grn_proc *)tokenizer)->callbacks.tokenizer.init = init;
+  } else {
+    GRN_PLUGIN_ERROR(ctx,
+                     GRN_INVALID_ARGUMENT,
+                     "[tokenizer][init][set] tokenizer is NULL");
   }
-  GRN_TEXT_SET(ctx, &(token->data), str_ptr, str_length);
-exit:
   GRN_API_RETURN(ctx->rc);
 }
 
-grn_token_status
-grn_token_get_status(grn_ctx *ctx, grn_token *token)
+grn_rc
+grn_tokenizer_set_next_func(grn_ctx *ctx,
+                            grn_obj *tokenizer,
+                            grn_tokenizer_next_func *next)
 {
   GRN_API_ENTER;
-  if (!token) {
-    ERR(GRN_INVALID_ARGUMENT, "token must not be NULL");
-    GRN_API_RETURN(GRN_TOKEN_CONTINUE);
+  if (tokenizer) {
+    ((grn_proc *)tokenizer)->callbacks.tokenizer.next = next;
+  } else {
+    GRN_PLUGIN_ERROR(ctx,
+                     GRN_INVALID_ARGUMENT,
+                     "[tokenizer][next][set] tokenizer is NULL");
   }
-  GRN_API_RETURN(token->status);
+  GRN_API_RETURN(ctx->rc);
 }
 
 grn_rc
-grn_token_set_status(grn_ctx *ctx,
-                     grn_token *token,
-                     grn_token_status status)
+grn_tokenizer_set_fin_func(grn_ctx *ctx,
+                           grn_obj *tokenizer,
+                           grn_tokenizer_fin_func *fin)
 {
   GRN_API_ENTER;
-  if (!token) {
-    ERR(GRN_INVALID_ARGUMENT, "token must not be NULL");
-    goto exit;
+  if (tokenizer) {
+    ((grn_proc *)tokenizer)->callbacks.tokenizer.fin = fin;
+  } else {
+    GRN_PLUGIN_ERROR(ctx,
+                     GRN_INVALID_ARGUMENT,
+                     "[tokenizer][fin][set] tokenizer is NULL");
   }
-  token->status = status;
-exit:
   GRN_API_RETURN(ctx->rc);
 }
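
The tail of this diff also switches lib/tokenizer.c from the grn_token_* accessors to an object-style registration API: grn_tokenizer_create() plus per-callback setters. Below is a minimal sketch (not part of the commit) of how a plugin might register a tokenizer with it; my_init, my_next and my_fin are hypothetical callbacks whose signatures assume the grn_tokenizer_init_func, grn_tokenizer_next_func and grn_tokenizer_fin_func typedefs matched by the setter prototypes above:

#include <groonga.h>
#include <groonga/tokenizer.h>

/* Hypothetical callbacks; a real tokenizer would build per-session
   state in init, emit tokens in next and release the state in fin. */
static void *
my_init(grn_ctx *ctx, grn_tokenizer_query *query)
{
  return NULL;
}

static void
my_next(grn_ctx *ctx, grn_tokenizer_query *query,
        grn_token *token, void *user_data)
{
}

static void
my_fin(grn_ctx *ctx, void *user_data)
{
}

grn_rc
register_my_tokenizer(grn_ctx *ctx)
{
  /* -1 assumes a NUL-terminated name, as elsewhere in Groonga. */
  grn_obj *tokenizer = grn_tokenizer_create(ctx, "TokenMine", -1);
  if (!tokenizer) {
    return ctx->rc;
  }
  grn_tokenizer_set_init_func(ctx, tokenizer, my_init);
  grn_tokenizer_set_next_func(ctx, tokenizer, my_next);
  grn_tokenizer_set_fin_func(ctx, tokenizer, my_fin);
  return ctx->rc;
}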