[Groonga-commit] groonga/groonga at d5f8fd1 [master] Add TokenFilterStem that provides stemming feature

Back to archive index

Kouhei Sutou null+****@clear*****
Sat Oct 25 21:24:02 JST 2014


Kouhei Sutou	2014-10-25 21:24:02 +0900 (Sat, 25 Oct 2014)

  New Revision: d5f8fd146472cb01ffb993c60b80a12192c15cac
  https://github.com/groonga/groonga/commit/d5f8fd146472cb01ffb993c60b80a12192c15cac

  Message:
    Add TokenFilterStem that provides stemming feature
    
    It is based on libstemmer provided by Snowball.
    
    TODO:
    
      * Support CMake

  Added files:
    plugins/token_filters/stem.c
    plugins/token_filters/stem_sources.am
    test/command/suite/token_filters/stem/english.expected
    test/command/suite/token_filters/stem/english.test
  Modified files:
    configure.ac
    plugins/token_filters/Makefile.am

  Modified: configure.ac (+65 -0)
===================================================================
--- configure.ac    2014-10-25 19:53:15 +0900 (4abd18e)
+++ configure.ac    2014-10-25 21:24:02 +0900 (865654d)
@@ -973,6 +973,63 @@ if test "x$with_kytea" = "xyes"; then
 fi
 AM_CONDITIONAL(WITH_KYTEA, test "x$with_kytea" = "xyes")
 
+# libstemmer
+AC_ARG_WITH(libstemmer,
+  [AS_HELP_STRING([--with-libstemmer],
+    [use libstemmer for stemming. [default=auto]])],
+  [with_libstemmer="$withval"],
+  [with_libstemmer="auto"])
+AC_ARG_WITH(libstemmer-include,
+  [AS_HELP_STRING([--with-libstemmer-include],
+    [path to libstemmer.h. [default=auto]])])
+AC_ARG_WITH(libstemmer-lib,
+  [AS_HELP_STRING([--with-libstemmer-lib],
+    [path to libstemmer.so. [default=auto]])])
+AC_MSG_CHECKING([whether enable libstemmer])
+AC_MSG_RESULT($with_libstemmer)
+if test "x$with_libstemmer" != "xno"; then
+  LIBSTEMMER_CFLAGS=""
+  LIBSTEMMER_LDFLAGS=""
+  LIBSTEMMER_LIBS=""
+
+  CFLAGS_save="${CFLAGS}"
+  LDFLAGS_save="${LDFLAGS}"
+  if test "x$with_libstemmer" != "xauto"; then # explicit prefix: derive -I/-L
+    if test -z "${with_libstemmer_include}"; then
+      with_libstemmer_include="${with_libstemmer}/include"
+    fi
+    LIBSTEMMER_CFLAGS="-I${with_libstemmer_include}"
+    if test -z "${with_libstemmer_lib}"; then
+      with_libstemmer_lib="${with_libstemmer}/lib"
+    fi
+    LIBSTEMMER_LDFLAGS="-L${with_libstemmer_lib}"
+    CFLAGS="${CFLAGS} ${LIBSTEMMER_CFLAGS}"
+    LDFLAGS="${LDFLAGS} ${LIBSTEMMER_LDFLAGS}"
+  fi
+  AC_CHECK_HEADERS(libstemmer.h,
+                   [libstemmer_exists=yes],
+                   [libstemmer_exists=no])
+  if test "$libstemmer_exists" = "yes"; then
+    AC_CHECK_LIB(stemmer, sb_stemmer_list,
+                 [LIBSTEMMER_LIBS="-lstemmer"],
+                 [libstemmer_exists=no])
+  fi
+  CFLAGS="${CFLAGS_save}"
+  LDFLAGS="${LDFLAGS_save}"
+
+  if test "$libstemmer_exists" = "no" -a "x$with_libstemmer" != "xauto"; then
+    AC_MSG_ERROR("No libstemmer found at ${with_libstemmer_include} and ${with_libstemmer_lib}.")
+  fi
+  with_libstemmer="$libstemmer_exists"
+fi
+if test "x$with_libstemmer" = "xyes"; then
+  AC_SUBST(LIBSTEMMER_CFLAGS)
+  AC_SUBST(LIBSTEMMER_LDFLAGS)
+  AC_SUBST(LIBSTEMMER_LIBS)
+  AC_DEFINE(GRN_WITH_LIBSTEMMER, [1], [use libstemmer])
+fi
+AM_CONDITIONAL(WITH_LIBSTEMMER, test "x$with_libstemmer" = "xyes")
+
 # futex check
 AC_ARG_ENABLE(futex,
   [AS_HELP_STRING([--enable-futex],
@@ -1489,6 +1546,14 @@ if test "x$with_kytea" = "xyes"; then
 fi
 echo
 
+echo "Token filters:"
+echo "  libstemmer:            $with_libstemmer"
+if test "x$with_libstemmer" = "xyes"; then
+  echo "    CFLAGS:              $LIBSTEMMER_CFLAGS"
+  echo "    LIBS:                $LIBSTEMMER_LIBS"
+fi
+echo
+
 echo "Libraries:"
 echo "  ZeroMQ:                $zeromq_available"
 if test "x$zeromq_available" = "xyes"; then

  Modified: plugins/token_filters/Makefile.am (+8 -0)
===================================================================
--- plugins/token_filters/Makefile.am    2014-10-25 19:53:15 +0900 (8d77466)
+++ plugins/token_filters/Makefile.am    2014-10-25 21:24:02 +0900 (c63bef7)
@@ -16,5 +16,13 @@ LIBS =						\
 
 token_filter_plugins_LTLIBRARIES =
 token_filter_plugins_LTLIBRARIES += stop_word.la
+if WITH_LIBSTEMMER
+token_filter_plugins_LTLIBRARIES += stem.la
+endif
 
 include stop_word_sources.am
+
+include stem_sources.am
+stem_la_CPPFLAGS = $(AM_CPPFLAGS) $(LIBSTEMMER_CFLAGS)
+stem_la_LIBADD = $(LIBS) $(LIBSTEMMER_LIBS)
+stem_la_LDFLAGS = $(AM_LDFLAGS) $(LIBSTEMMER_LDFLAGS)

  Added: plugins/token_filters/stem.c (+143 -0) 100644
===================================================================
--- /dev/null
+++ plugins/token_filters/stem.c    2014-10-25 21:24:02 +0900 (11465f1)
@@ -0,0 +1,143 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+  Copyright(C) 2014 Brazil
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License version 2.1 as published by the Free Software Foundation.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include <str.h>
+
+#include <groonga.h>
+#include <groonga/token_filter.h>
+
+#include <string.h>
+
+#include <libstemmer.h>
+
+typedef struct {
+  struct sb_stemmer *stemmer;
+  grn_tokenizer_token token;
+} grn_stem_token_filter;
+
+static void *
+stem_init(grn_ctx *ctx, grn_obj *table, grn_token_mode mode)
+{
+  grn_stem_token_filter *token_filter;
+
+  token_filter = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_stem_token_filter));
+  if (!token_filter) {
+    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                     "[token-filter][stem] "
+                     "failed to allocate grn_stem_token_filter");
+    return NULL;
+  }
+
+  token_filter->stemmer = NULL;
+  grn_tokenizer_token_init(ctx, &(token_filter->token));
+
+  return token_filter;
+}
+
+/* Stem current_token with libstemmer and store the result in next_token. */
+static void
+stem_filter(grn_ctx *ctx,
+            grn_token *current_token,
+            grn_token *next_token,
+            void *user_data)
+{
+  grn_stem_token_filter *token_filter = user_data;
+  grn_obj *data;
+
+  /* The stemmer is created for UTF_8 only; skip other encodings. */
+  if (GRN_CTX_GET_ENCODING(ctx) != GRN_ENC_UTF8) {
+    return;
+  }
+
+  data = grn_token_get_data(ctx, current_token);
+
+  /* Create the stemmer once and reuse it for every following token. */
+  if (!token_filter->stemmer) {
+    /* TODO: Detect algorithm from the current token. */
+    const char *algorithm = "english";
+    const char *encoding = "UTF_8";
+    token_filter->stemmer = sb_stemmer_new(algorithm, encoding);
+    if (!token_filter->stemmer) {
+      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                       "[token-filter][stem] "
+                       "failed to create stemmer: "
+                       "algorithm=<%s>, encoding=<%s>",
+                       algorithm, encoding);
+      return;
+    }
+  }
+
+  {
+    const sb_symbol *stemmed;
+
+    stemmed = sb_stemmer_stem(token_filter->stemmer,
+                              (const sb_symbol *)GRN_TEXT_VALUE(data),
+                              GRN_TEXT_LEN(data));
+    if (stemmed) {
+      grn_token_set_data(ctx, next_token,
+                         (const char *)stemmed,
+                         sb_stemmer_length(token_filter->stemmer));
+    } else {
+      GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                       "[token-filter][stem] "
+                       "failed to allocate memory for stemmed word: <%.*s>",
+                       (int)GRN_TEXT_LEN(data), GRN_TEXT_VALUE(data));
+    }
+  }
+}
+
+static void
+stem_fin(grn_ctx *ctx, void *user_data)
+{
+  grn_stem_token_filter *token_filter = user_data;
+  if (!token_filter) {
+    return;
+  }
+
+  grn_tokenizer_token_fin(ctx, &(token_filter->token));
+  if (token_filter->stemmer) {
+    sb_stemmer_delete(token_filter->stemmer);
+  }
+  GRN_PLUGIN_FREE(ctx, token_filter);
+}
+
+grn_rc
+GRN_PLUGIN_INIT(grn_ctx *ctx)
+{
+  return ctx->rc;
+}
+
+grn_rc
+GRN_PLUGIN_REGISTER(grn_ctx *ctx)
+{
+  grn_rc rc;
+
+  rc = grn_token_filter_register(ctx,
+                                 "TokenFilterStem", -1,
+                                 stem_init,
+                                 stem_filter,
+                                 stem_fin);
+
+  return rc;
+}
+
+grn_rc
+GRN_PLUGIN_FIN(grn_ctx *ctx)
+{
+  return GRN_SUCCESS;
+}

  Added: plugins/token_filters/stem_sources.am (+2 -0) 100644
===================================================================
--- /dev/null
+++ plugins/token_filters/stem_sources.am    2014-10-25 21:24:02 +0900 (d02a395)
@@ -0,0 +1,2 @@
+stem_la_SOURCES =				\
+	stem.c

  Added: test/command/suite/token_filters/stem/english.expected (+54 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/token_filters/stem/english.expected    2014-10-25 21:24:02 +0900 (a1db6fe)
@@ -0,0 +1,54 @@
+register token_filters/stem
+[[0,0.0,0.0],true]
+table_create Memos TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Memos content COLUMN_SCALAR ShortText
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer TokenBigram   --normalizer NormalizerAuto   --token_filters TokenFilterStem
+[[0,0.0,0.0],true]
+column_create Terms memos_content COLUMN_INDEX|WITH_POSITION Memos content
+[[0,0.0,0.0],true]
+load --table Memos
+[
+{"content": "I develop Groonga"},
+{"content": "I'm developing Groonga"},
+{"content": "I developed Groonga"}
+]
+[[0,0.0,0.0],3]
+select Memos --match_columns content --query "develops"
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        3
+      ],
+      [
+        [
+          "_id",
+          "UInt32"
+        ],
+        [
+          "content",
+          "ShortText"
+        ]
+      ],
+      [
+        1,
+        "I develop Groonga"
+      ],
+      [
+        2,
+        "I'm developing Groonga"
+      ],
+      [
+        3,
+        "I developed Groonga"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/token_filters/stem/english.test (+21 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/token_filters/stem/english.test    2014-10-25 21:24:02 +0900 (27220cf)
@@ -0,0 +1,21 @@
+#@on-error omit
+register token_filters/stem
+#@on-error default
+
+table_create Memos TABLE_NO_KEY
+column_create Memos content COLUMN_SCALAR ShortText
+
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenBigram \
+  --normalizer NormalizerAuto \
+  --token_filters TokenFilterStem
+column_create Terms memos_content COLUMN_INDEX|WITH_POSITION Memos content
+
+load --table Memos
+[
+{"content": "I develop Groonga"},
+{"content": "I'm developing Groonga"},
+{"content": "I developed Groonga"}
+]
+
+select Memos --match_columns content --query "develops"
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Download 



More information about the Groonga-commit mailing list
Back to archive index