[Groonga-commit] groonga/groonga at 3744cc1 [master] TokenFilterStem: add algorithm option

Back to archive index
Kouhei Sutou null+****@clear*****
Mon Oct 29 15:44:25 JST 2018


Kouhei Sutou	2018-10-29 15:44:25 +0900 (Mon, 29 Oct 2018)

  Revision: 3744cc101832044d4f5a9e2ccd9ff63fb3dcaf40
  https://github.com/groonga/groonga/commit/3744cc101832044d4f5a9e2ccd9ff63fb3dcaf40

  Message:
    TokenFilterStem: add algorithm option

  Added files:
    test/command/suite/token_filters/stem/french.expected
    test/command/suite/token_filters/stem/french.test
  Modified files:
    plugins/token_filters/stem.c

  Modified: plugins/token_filters/stem.c (+88 -10)
===================================================================
--- plugins/token_filters/stem.c    2018-10-29 15:44:07 +0900 (e918ed8a1)
+++ plugins/token_filters/stem.c    2018-10-29 15:44:25 +0900 (dd61dbe37)
@@ -1,6 +1,7 @@
 /* -*- c-basic-offset: 2 -*- */
 /*
   Copyright(C) 2014 Brazil
+  Copyright(C) 2018 Kouhei Sutou <kou****@clear*****>
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -31,16 +32,93 @@
 #include <libstemmer.h>
 
 typedef struct {
+  grn_obj algorithm;
+} grn_stem_token_filter_options;
+
+typedef struct {
+  grn_stem_token_filter_options *options;
   struct sb_stemmer *stemmer;
   grn_tokenizer_token token;
   grn_obj buffer;
 } grn_stem_token_filter;
 
+static void
+stem_options_init(grn_ctx *ctx, grn_stem_token_filter_options *options)
+{
+  GRN_TEXT_INIT(&(options->algorithm), 0);
+  GRN_TEXT_SETS(ctx, &(options->algorithm), "english");
+  GRN_TEXT_PUTC(ctx, &(options->algorithm), '\0');
+}
+
 static void *
-stem_init(grn_ctx *ctx, grn_obj *table, grn_token_mode mode)
+stem_open_options(grn_ctx *ctx,
+                  grn_obj *tokenizer,
+                  grn_obj *raw_options,
+                  void *user_data)
 {
+  grn_stem_token_filter_options *options;
+
+  options = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_stem_token_filter_options));
+  if (!options) {
+    GRN_PLUGIN_ERROR(ctx,
+                     GRN_NO_MEMORY_AVAILABLE,
+                     "[token-filter][stem] "
+                     "failed to allocate memory for options");
+    return NULL;
+  }
+
+  stem_options_init(ctx, options);
+
+  GRN_OPTION_VALUES_EACH_BEGIN(ctx, raw_options, i, name, name_length) {
+    grn_raw_string name_raw;
+    name_raw.value = name;
+    name_raw.length = name_length;
+
+    if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "algorithm")) {
+      const char *algorithm;
+      unsigned int length;
+      length = grn_vector_get_element(ctx,
+                                      raw_options,
+                                      i,
+                                      &algorithm,
+                                      NULL,
+                                      NULL);
+      GRN_TEXT_SET(ctx, &(options->algorithm), algorithm, length);
+      GRN_TEXT_PUTC(ctx, &(options->algorithm), '\0');
+    }
+  } GRN_OPTION_VALUES_EACH_END();
+
+  return options;
+}
+
+static void
+stem_close_options(grn_ctx *ctx, void *data)
+{
+  grn_stem_token_filter_options *options = data;
+  GRN_OBJ_FIN(ctx, &(options->algorithm));
+  GRN_PLUGIN_FREE(ctx, options);
+}
+
+static void *
+stem_init(grn_ctx *ctx, grn_tokenizer_query *query)
+{
+  grn_obj *lexicon;
+  unsigned int i;
+  grn_stem_token_filter_options *options;
   grn_stem_token_filter *token_filter;
 
+  lexicon = grn_tokenizer_query_get_lexicon(ctx, query);
+  i = grn_tokenizer_query_get_token_filter_index(ctx, query);
+  options = grn_table_cache_token_filter_options(ctx,
+                                                 lexicon,
+                                                 i,
+                                                 stem_open_options,
+                                                 stem_close_options,
+                                                 NULL);
+  if (ctx->rc != GRN_SUCCESS) {
+    return NULL;
+  }
+
   token_filter = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_stem_token_filter));
   if (!token_filter) {
     GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
@@ -48,10 +126,11 @@ stem_init(grn_ctx *ctx, grn_obj *table, grn_token_mode mode)
                      "failed to allocate grn_stem_token_filter");
     return NULL;
   }
+  token_filter->options = options;
 
   {
-    /* TODO: Support other languages. */
-    const char *algorithm = "english";
+    const char *algorithm = GRN_TEXT_VALUE(&(token_filter->options->algorithm));
+    /* TODO: Support other encodings. */
     const char *encoding = "UTF_8";
     token_filter->stemmer = sb_stemmer_new(algorithm, encoding);
     if (!token_filter->stemmer) {
@@ -261,15 +340,14 @@ GRN_PLUGIN_INIT(grn_ctx *ctx)
 grn_rc
 GRN_PLUGIN_REGISTER(grn_ctx *ctx)
 {
-  grn_rc rc;
+  grn_obj *token_filter;
 
-  rc = grn_token_filter_register(ctx,
-                                 "TokenFilterStem", -1,
-                                 stem_init,
-                                 stem_filter,
-                                 stem_fin);
+  token_filter = grn_token_filter_create(ctx, "TokenFilterStem", -1);
+  grn_token_filter_set_init_func(ctx, token_filter, stem_init);
+  grn_token_filter_set_filter_func(ctx, token_filter, stem_filter);
+  grn_token_filter_set_fin_func(ctx, token_filter, stem_fin);
 
-  return rc;
+  return ctx->rc;
 }
 
 grn_rc

  Added: test/command/suite/token_filters/stem/french.expected (+49 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/token_filters/stem/french.expected    2018-10-29 15:44:25 +0900 (bf2362501)
@@ -0,0 +1,49 @@
+plugin_register token_filters/stem
+[[0,0.0,0.0],true]
+table_create Memos TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Memos content COLUMN_SCALAR ShortText
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer TokenBigram   --normalizer NormalizerAuto   --token_filters 'TokenFilterStem("algorithm", "french")'
+[[0,0.0,0.0],true]
+column_create Terms memos_content COLUMN_INDEX|WITH_POSITION Memos content
+[[0,0.0,0.0],true]
+load --table Memos
+[
+{"content": "maintenait"},
+{"content": "maintenant"}
+]
+[[0,0.0,0.0],2]
+select Memos --match_columns content --query "maintenir"
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        2
+      ],
+      [
+        [
+          "_id",
+          "UInt32"
+        ],
+        [
+          "content",
+          "ShortText"
+        ]
+      ],
+      [
+        1,
+        "maintenait"
+      ],
+      [
+        2,
+        "maintenant"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/token_filters/stem/french.test (+20 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/token_filters/stem/french.test    2018-10-29 15:44:25 +0900 (ef48bb533)
@@ -0,0 +1,20 @@
+#@on-error omit
+plugin_register token_filters/stem
+#@on-error default
+
+table_create Memos TABLE_NO_KEY
+column_create Memos content COLUMN_SCALAR ShortText
+
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenBigram \
+  --normalizer NormalizerAuto \
+  --token_filters 'TokenFilterStem("algorithm", "french")'
+column_create Terms memos_content COLUMN_INDEX|WITH_POSITION Memos content
+
+load --table Memos
+[
+{"content": "maintenait"},
+{"content": "maintenant"}
+]
+
+select Memos --match_columns content --query "maintenir"
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20181029/41277ecb/attachment-0001.html>


More information about the Groonga-commit mailing list
Back to archive index