[Groonga-commit] groonga/groonga at 2c1ffb2 [master] token-filter stem: support all upper-case term

Back to archive index

Kouhei Sutou null+****@clear*****
Sun Dec 28 19:08:56 JST 2014


Kouhei Sutou	2014-12-28 19:08:56 +0900 (Sun, 28 Dec 2014)

  New Revision: 2c1ffb206e1f1eab886cd913ecbaa01db33fd150
  https://github.com/groonga/groonga/commit/2c1ffb206e1f1eab886cd913ecbaa01db33fd150

  Message:
    token-filter stem: support all upper-case term
    
    MySQL compatible normalizer normalizes term to all upper-case.

  Added files:
    test/command/suite/token_filters/stem/all_upper.expected
    test/command/suite/token_filters/stem/all_upper.test
  Modified files:
    plugins/token_filters/stem.c

  Modified: plugins/token_filters/stem.c (+161 -29)
===================================================================
--- plugins/token_filters/stem.c    2014-12-28 15:50:52 +0900 (c3bb46a)
+++ plugins/token_filters/stem.c    2014-12-28 19:08:56 +0900 (010b8c9)
@@ -21,6 +21,7 @@
 #include <groonga.h>
 #include <groonga/token_filter.h>
 
+#include <ctype.h>
 #include <string.h>
 
 #include <libstemmer.h>
@@ -28,6 +29,7 @@
 typedef struct {
   struct sb_stemmer *stemmer;
   grn_tokenizer_token token;
+  grn_obj buffer;
 } grn_stem_token_filter;
 
 static void *
@@ -43,12 +45,128 @@ stem_init(grn_ctx *ctx, grn_obj *table, grn_token_mode mode)
     return NULL;
   }
 
-  token_filter->stemmer = NULL;
+  {
+    /* TODO: Support other languages. */
+    const char *algorithm = "english";
+    const char *encoding = "UTF_8";
+    token_filter->stemmer = sb_stemmer_new(algorithm, encoding);
+    if (!token_filter->stemmer) {
+      GRN_PLUGIN_FREE(ctx, token_filter);
+      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                       "[token-filter][stem] "
+                       "failed to create stemmer: "
+                       "algorithm=<%s>, encoding=<%s>",
+                       algorithm, encoding);
+      return NULL;
+    }
+  }
   grn_tokenizer_token_init(ctx, &(token_filter->token));
+  GRN_TEXT_INIT(&(token_filter->buffer), 0);
 
   return token_filter;
 }
 
+static grn_bool
+is_stemmable(grn_obj *data, grn_bool *is_all_upper)
+{
+  const char *current, *end;
+  grn_bool have_lower = GRN_FALSE;
+  grn_bool have_upper = GRN_FALSE;
+
+  *is_all_upper = GRN_FALSE;
+
+  switch (data->header.domain) {
+  case GRN_DB_SHORT_TEXT :
+  case GRN_DB_TEXT :
+  case GRN_DB_LONG_TEXT :
+    break;
+  default :
+    return GRN_FALSE;
+  }
+
+  current = GRN_TEXT_VALUE(data);
+  end = current + GRN_TEXT_LEN(data);
+
+  for (; current < end; current++) {
+    if (islower(*current)) {
+      have_lower = GRN_TRUE;
+      continue;
+    }
+    if (isupper(*current)) {
+      have_upper = GRN_TRUE;
+      continue;
+    }
+    if (isdigit(*current)) {
+      continue;
+    }
+    switch (*current) {
+    case '-' :
+    case '\'' :
+      break;
+    default :
+      return GRN_FALSE;
+    }
+  }
+
+  if (!have_lower && have_upper) {
+    *is_all_upper = GRN_TRUE;
+  }
+
+  return GRN_TRUE;
+}
+
+static void
+normalize(grn_ctx *ctx,
+          const char *string, unsigned int length,
+          grn_obj *normalized)
+{
+  const char *current, *end;
+  const char *unwritten;
+
+  current = unwritten = string;
+  end = current + length;
+
+  for (; current < end; current++) {
+    if (isupper(*current)) {
+      if (current > unwritten) {
+        GRN_TEXT_PUT(ctx, normalized, unwritten, current - unwritten);
+      }
+      GRN_TEXT_PUTC(ctx, normalized, tolower(*current));
+      unwritten = current + 1;
+    }
+  }
+
+  if (current != unwritten) {
+    GRN_TEXT_PUT(ctx, normalized, unwritten, current - unwritten);
+  }
+}
+
+static void
+unnormalize(grn_ctx *ctx,
+            const char *string, unsigned int length,
+            grn_obj *normalized)
+{
+  const char *current, *end;
+  const char *unwritten;
+
+  current = unwritten = string;
+  end = current + length;
+
+  for (; current < end; current++) {
+    if (islower(*current)) {
+      if (current > unwritten) {
+        GRN_TEXT_PUT(ctx, normalized, unwritten, current - unwritten);
+      }
+      GRN_TEXT_PUTC(ctx, normalized, toupper(*current));
+      unwritten = current + 1;
+    }
+  }
+
+  if (current != unwritten) {
+    GRN_TEXT_PUT(ctx, normalized, unwritten, current - unwritten);
+  }
+}
+
 static void
 stem_filter(grn_ctx *ctx,
             grn_token *current_token,
@@ -57,46 +175,59 @@ stem_filter(grn_ctx *ctx,
 {
   grn_stem_token_filter *token_filter = user_data;
   grn_obj *data;
+  grn_bool is_all_upper = GRN_FALSE;
 
   if (GRN_CTX_GET_ENCODING(ctx) != GRN_ENC_UTF8) {
     return;
   }
 
   data = grn_token_get_data(ctx, current_token);
-
-  if (token_filter->stemmer) {
-    sb_stemmer_delete(token_filter->stemmer);
-  }
-  {
-    /* TODO: Detect algorithm from the current token. */
-    const char *algorithm = "english";
-    const char *encoding = "UTF_8";
-    token_filter->stemmer = sb_stemmer_new(algorithm, encoding);
-    if (!token_filter->stemmer) {
-      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
-                       "[token-filter][stem] "
-                       "failed to create stemmer: "
-                       "algorithm=<%s>, encoding=<%s>",
-                       algorithm, encoding);
-      return;
-    }
+  if (!is_stemmable(data, &is_all_upper)) {
+    return;
   }
 
   {
     const sb_symbol *stemmed;
 
-    stemmed = sb_stemmer_stem(token_filter->stemmer,
-                              GRN_TEXT_VALUE(data), GRN_TEXT_LEN(data));
-    if (stemmed) {
-      grn_token_set_data(ctx, next_token,
-                         stemmed,
-                         sb_stemmer_length(token_filter->stemmer));
+    if (is_all_upper) {
+      grn_obj *buffer;
+      buffer = &(token_filter->buffer);
+      GRN_BULK_REWIND(buffer);
+      normalize(ctx,
+                GRN_TEXT_VALUE(data),
+                GRN_TEXT_LEN(data),
+                buffer);
+      stemmed = sb_stemmer_stem(token_filter->stemmer,
+                                GRN_TEXT_VALUE(buffer), GRN_TEXT_LEN(buffer));
+      if (stemmed) {
+        GRN_BULK_REWIND(buffer);
+        unnormalize(ctx,
+                    stemmed,
+                    sb_stemmer_length(token_filter->stemmer),
+                    buffer);
+        grn_token_set_data(ctx, next_token,
+                           GRN_TEXT_VALUE(buffer), GRN_TEXT_LEN(buffer));
+      } else {
+        GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                         "[token-filter][stem] "
+                         "failed to allocate memory for stemmed word: <%.*s> "
+                         "(normalized: <%.*s>)",
+                         (int)GRN_TEXT_LEN(data), GRN_TEXT_VALUE(data),
+                         (int)GRN_TEXT_LEN(buffer), GRN_TEXT_VALUE(buffer));
+      }
     } else {
-      GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
-                       "[token-filter][stem] "
-                       "failed to allocate memory for stemmed word: <%.*s>",
-                       (int)GRN_TEXT_LEN(data), GRN_TEXT_VALUE(data));
-      return;
+      stemmed = sb_stemmer_stem(token_filter->stemmer,
+                                GRN_TEXT_VALUE(data), GRN_TEXT_LEN(data));
+      if (stemmed) {
+        grn_token_set_data(ctx, next_token,
+                           stemmed,
+                           sb_stemmer_length(token_filter->stemmer));
+      } else {
+        GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                         "[token-filter][stem] "
+                         "failed to allocate memory for stemmed word: <%.*s>",
+                         (int)GRN_TEXT_LEN(data), GRN_TEXT_VALUE(data));
+      }
     }
   }
 }
@@ -113,6 +244,7 @@ stem_fin(grn_ctx *ctx, void *user_data)
   if (token_filter->stemmer) {
     sb_stemmer_delete(token_filter->stemmer);
   }
+  GRN_OBJ_FIN(ctx, &(token_filter->buffer));
   GRN_PLUGIN_FREE(ctx, token_filter);
 }
 

  Added: test/command/suite/token_filters/stem/all_upper.expected (+113 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/token_filters/stem/all_upper.expected    2014-12-28 19:08:56 +0900 (0557388)
@@ -0,0 +1,113 @@
+register token_filters/stem
+[[0,0.0,0.0],true]
+table_create Memos TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Memos content COLUMN_SCALAR ShortText
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer TokenDelimit   --token_filters TokenFilterStem
+[[0,0.0,0.0],true]
+column_create Terms memos_content COLUMN_INDEX|WITH_POSITION Memos content
+[[0,0.0,0.0],true]
+load --table Memos
+[
+{"content": "I DEVELOP GROONGA"},
+{"content": "I'M DEVELOPING GROONGA"},
+{"content": "I DEVELOPED GROONGA"},
+{"content": "GROONGA DEVELOPERS' SITE"}
+]
+[[0,0.0,0.0],4]
+select Terms
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        5
+      ],
+      [
+        [
+          "_id",
+          "UInt32"
+        ],
+        [
+          "_key",
+          "ShortText"
+        ],
+        [
+          "memos_content",
+          "UInt32"
+        ]
+      ],
+      [
+        2,
+        "DEVELOP",
+        7
+      ],
+      [
+        3,
+        "GROONGA",
+        8
+      ],
+      [
+        1,
+        "I",
+        4
+      ],
+      [
+        4,
+        "I'M",
+        1
+      ],
+      [
+        5,
+        "SITE",
+        1
+      ]
+    ]
+  ]
+]
+select Memos --match_columns content --query "DEVELOPS"
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        4
+      ],
+      [
+        [
+          "_id",
+          "UInt32"
+        ],
+        [
+          "content",
+          "ShortText"
+        ]
+      ],
+      [
+        1,
+        "I DEVELOP GROONGA"
+      ],
+      [
+        2,
+        "I'M DEVELOPING GROONGA"
+      ],
+      [
+        3,
+        "I DEVELOPED GROONGA"
+      ],
+      [
+        4,
+        "GROONGA DEVELOPERS' SITE"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/token_filters/stem/all_upper.test (+22 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/token_filters/stem/all_upper.test    2014-12-28 19:08:56 +0900 (0f2ba1d)
@@ -0,0 +1,22 @@
+#@on-error omit
+register token_filters/stem
+#@on-error default
+
+table_create Memos TABLE_NO_KEY
+column_create Memos content COLUMN_SCALAR ShortText
+
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenDelimit \
+  --token_filters TokenFilterStem
+column_create Terms memos_content COLUMN_INDEX|WITH_POSITION Memos content
+
+load --table Memos
+[
+{"content": "I DEVELOP GROONGA"},
+{"content": "I'M DEVELOPING GROONGA"},
+{"content": "I DEVELOPED GROONGA"},
+{"content": "GROONGA DEVELOPERS' SITE"}
+]
+
+select Terms
+select Memos --match_columns content --query "DEVELOPS"
-------------- next part --------------
An HTML attachment was scrubbed and is available for download.
Download 



More information about the Groonga-commit mailing list
Back to archive index