Kouhei Sutou
null+****@clear*****
Sat Oct 25 21:24:02 JST 2014
Kouhei Sutou 2014-10-25 21:24:02 +0900 (Sat, 25 Oct 2014) New Revision: d5f8fd146472cb01ffb993c60b80a12192c15cac https://github.com/groonga/groonga/commit/d5f8fd146472cb01ffb993c60b80a12192c15cac Message: Add TokenFilterStem that provides stemming feature It is based on libstemmer provided by Snowball. TODO: * Support CMake Added files: plugins/token_filters/stem.c plugins/token_filters/stem_sources.am test/command/suite/token_filters/stem/english.expected test/command/suite/token_filters/stem/english.test Modified files: configure.ac plugins/token_filters/Makefile.am Modified: configure.ac (+65 -0) =================================================================== --- configure.ac 2014-10-25 19:53:15 +0900 (4abd18e) +++ configure.ac 2014-10-25 21:24:02 +0900 (865654d) @@ -973,6 +973,63 @@ if test "x$with_kytea" = "xyes"; then fi AM_CONDITIONAL(WITH_KYTEA, test "x$with_kytea" = "xyes") +# libstemmer +AC_ARG_WITH(libstemmer, + [AS_HELP_STRING([--with-libstemmer], + [use libstemmer for stemming. [default=auto]])], + [with_libstemmer="$withval"], + [with_libstemmer="auto"]) +AC_ARG_WITH(libstemmer-include, + [AS_HELP_STRING([--with-libstemmer-include], + [path to libstemmer.h. [default=auto]])]) +AC_ARG_WITH(libstemmer-lib, + [AS_HELP_STRING([--with-libstemmer-lib], + [path to libstemmer.so. 
[default=auto]])]) +AC_MSG_CHECKING([whether enable libstemmer]) +AC_MSG_RESULT($with_libstemmer) +if test "x$with_libstemmer" != "xno"; then + LIBSTEMMER_CFLAGS="" + LIBSTEMMER_LDFLAGS="" + LIBSTEMMER_LIBS="" + + CFLAGS_save="${CFLAGS}" + LDFLAGS_save="${LDFLAGS}" + if test "x$with_libstemmer" != "xauto"; then + if test -z "${with_libstemmer_include}"; then + with_libstemmer_include="${with_libstemmer}/include" + fi + LIBSTEMMER_CFLAGS="-I${with_libstemmer_include:}" + if test -z "${with_libstemmer_lib}"; then + with_libstemmer_lib="${with_libstemmer}/lib" + fi + LIBSTEMMER_LDFLAGS="-L${with_libstemmer_lib}" + CFLAGS="${CFLAGS} ${LIBSTEMMER_CFLAGS}" + LDFLAGS="${LDFLAGS} ${LIBSTEMMER_LDFLAGS}" + fi + AC_CHECK_HEADERS(libstemmer.h, + [libstemmer_exists=yes], + [libstemmer_exists=no]) + if test "$libstemmer_exists" = "yes"; then + AC_CHECK_LIB(stemmer, sb_stemmer_list, + [LIBSTEMMER_LIBS="-lstemmer"], + [libstemmer_exists=no]) + fi + CFLAGS="${CFLAGS_save}" + LDFLAGS="${LDFLAGS_save}" + + if test "$libstemmer_exists" = "no" -a "x$with_libstemmer" != "xauto"; then + AC_MSG_ERROR("No libstemmer found at ${with_libstemmer_include} and ${with_libstemmer_lib}.") + fi + with_libstemmer="$libstemmer_exists" +fi +if test "x$with_libstemmer" = "xyes"; then + AC_SUBST(LIBSTEMMER_CFLAGS) + AC_SUBST(LIBSTEMMER_LDFLAGS) + AC_SUBST(LIBSTEMMER_LIBS) + AC_DEFINE(GRN_WITH_LIBSTEMMER, [1], [use libstemmer]) +fi +AM_CONDITIONAL(WITH_LIBSTEMMER, test "x$with_libstemmer" = "xyes") + # futex check AC_ARG_ENABLE(futex, [AS_HELP_STRING([--enable-futex], @@ -1489,6 +1546,14 @@ if test "x$with_kytea" = "xyes"; then fi echo +echo "Token filters:" +echo " libstemmer: $with_libstemmer" +if test "x$with_libstemmer" = "xyes"; then + echo " CFLAGS: $LIBSTEMMER_CFLAGS" + echo " LIBS: $LIBSTEMMER_LIBS" +fi +echo + echo "Libraries:" echo " ZeroMQ: $zeromq_available" if test "x$zeromq_available" = "xyes"; then Modified: plugins/token_filters/Makefile.am (+8 -0) 
=================================================================== --- plugins/token_filters/Makefile.am 2014-10-25 19:53:15 +0900 (8d77466) +++ plugins/token_filters/Makefile.am 2014-10-25 21:24:02 +0900 (c63bef7) @@ -16,5 +16,13 @@ LIBS = \ token_filter_plugins_LTLIBRARIES = token_filter_plugins_LTLIBRARIES += stop_word.la +if WITH_LIBSTEMMER +token_filter_plugins_LTLIBRARIES += stem.la +endif include stop_word_sources.am + +include stem_sources.am +stem_la_CPPFLAGS = $(AM_CPPFLAGS) $(LIBSTEMMER_CFLAGS) +stem_la_LIBADD = $(LIBS) $(LIBSTEMMER_LIBS) +stem_la_LDFLAGS = $(AM_LDFLAGS) $(LIBSTEMMER_LDFLAGS) Added: plugins/token_filters/stem.c (+143 -0) 100644 =================================================================== --- /dev/null +++ plugins/token_filters/stem.c 2014-10-25 21:24:02 +0900 (11465f1) @@ -0,0 +1,143 @@ +/* -*- c-basic-offset: 2 -*- */ +/* + Copyright(C) 2014 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include <str.h> + +#include <groonga.h> +#include <groonga/token_filter.h> + +#include <string.h> + +#include <libstemmer.h> + +typedef struct { + struct sb_stemmer *stemmer; + grn_tokenizer_token token; +} grn_stem_token_filter; + +static void * +stem_init(grn_ctx *ctx, grn_obj *table, grn_token_mode mode) +{ + grn_stem_token_filter *token_filter; + + token_filter = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_stem_token_filter)); + if (!token_filter) { + GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, + "[token-filter][stem] " + "failed to allocate grn_stem_token_filter"); + return NULL; + } + + token_filter->stemmer = NULL; + grn_tokenizer_token_init(ctx, &(token_filter->token)); + + return token_filter; +} + +static void +stem_filter(grn_ctx *ctx, + grn_token *current_token, + grn_token *next_token, + void *user_data) +{ + grn_stem_token_filter *token_filter = user_data; + grn_obj *data; + + if (GRN_CTX_GET_ENCODING(ctx) != GRN_ENC_UTF8) { + return; + } + + data = grn_token_get_data(ctx, current_token); + + if (token_filter->stemmer) { + sb_stemmer_delete(token_filter->stemmer); + } + { + /* TODO: Detect algorithm from the current token. 
*/ + const char *algorithm = "english"; + const char *encoding = "UTF_8"; + token_filter->stemmer = sb_stemmer_new(algorithm, encoding); + if (!token_filter->stemmer) { + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "[token-filter][stem] " + "failed to create stemmer: " + "algorithm=<%s>, encoding=<%s>", + algorithm, encoding); + return; + } + } + + { + const sb_symbol *stemmed; + + stemmed = sb_stemmer_stem(token_filter->stemmer, + GRN_TEXT_VALUE(data), GRN_TEXT_LEN(data)); + if (stemmed) { + grn_token_set_data(ctx, next_token, + stemmed, + sb_stemmer_length(token_filter->stemmer)); + } else { + GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, + "[token-filter][stem] " + "failed to allocate memory for stemmed word: <%.*s>", + (int)GRN_TEXT_LEN(data), GRN_TEXT_VALUE(data)); + return; + } + } +} + +static void +stem_fin(grn_ctx *ctx, void *user_data) +{ + grn_stem_token_filter *token_filter = user_data; + if (!token_filter) { + return; + } + + grn_tokenizer_token_fin(ctx, &(token_filter->token)); + if (token_filter->stemmer) { + sb_stemmer_delete(token_filter->stemmer); + } + GRN_PLUGIN_FREE(ctx, token_filter); +} + +grn_rc +GRN_PLUGIN_INIT(grn_ctx *ctx) +{ + return ctx->rc; +} + +grn_rc +GRN_PLUGIN_REGISTER(grn_ctx *ctx) +{ + grn_rc rc; + + rc = grn_token_filter_register(ctx, + "TokenFilterStem", -1, + stem_init, + stem_filter, + stem_fin); + + return rc; +} + +grn_rc +GRN_PLUGIN_FIN(grn_ctx *ctx) +{ + return GRN_SUCCESS; +} Added: plugins/token_filters/stem_sources.am (+2 -0) 100644 =================================================================== --- /dev/null +++ plugins/token_filters/stem_sources.am 2014-10-25 21:24:02 +0900 (d02a395) @@ -0,0 +1,2 @@ +stem_la_SOURCES = \ + stem.c Added: test/command/suite/token_filters/stem/english.expected (+54 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/token_filters/stem/english.expected 2014-10-25 21:24:02 +0900 (a1db6fe) @@ -0,0 +1,54 @@ +register 
token_filters/stem +[[0,0.0,0.0],true] +table_create Memos TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Memos content COLUMN_SCALAR ShortText +[[0,0.0,0.0],true] +table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer TokenBigram --normalizer NormalizerAuto --token_filters TokenFilterStem +[[0,0.0,0.0],true] +column_create Terms memos_content COLUMN_INDEX|WITH_POSITION Memos content +[[0,0.0,0.0],true] +load --table Memos +[ +{"content": "I develop Groonga"}, +{"content": "I'm developing Groonga"}, +{"content": "I developed Groonga"} +] +[[0,0.0,0.0],3] +select Memos --match_columns content --query "develops" +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 3 + ], + [ + [ + "_id", + "UInt32" + ], + [ + "content", + "ShortText" + ] + ], + [ + 1, + "I develop Groonga" + ], + [ + 2, + "I'm developing Groonga" + ], + [ + 3, + "I developed Groonga" + ] + ] + ] +] Added: test/command/suite/token_filters/stem/english.test (+21 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/token_filters/stem/english.test 2014-10-25 21:24:02 +0900 (27220cf) @@ -0,0 +1,21 @@ +#@on-error omit +register token_filters/stem +#@on-error default + +table_create Memos TABLE_NO_KEY +column_create Memos content COLUMN_SCALAR ShortText + +table_create Terms TABLE_PAT_KEY ShortText \ + --default_tokenizer TokenBigram \ + --normalizer NormalizerAuto \ + --token_filters TokenFilterStem +column_create Terms memos_content COLUMN_INDEX|WITH_POSITION Memos content + +load --table Memos +[ +{"content": "I develop Groonga"}, +{"content": "I'm developing Groonga"}, +{"content": "I developed Groonga"} +] + +select Memos --match_columns content --query "develops" -------------- next part -------------- HTML����������������������������...Download