null+****@clear*****
null+****@clear*****
2012年 2月 17日 (金) 17:05:22 JST
Susumu Yata 2012-02-17 17:05:22 +0900 (Fri, 17 Feb 2012)

  New Revision: 38799bac4d5a55c0ce820a3daaef89b03299edb3

  Log:
    add TokenKytea.

  Added files:
    plugins/tokenizers/kytea.cpp
  Modified files:
    plugins/tokenizers/Makefile.am

  Modified: plugins/tokenizers/Makefile.am (+8 -0)
===================================================================
--- plugins/tokenizers/Makefile.am 2012-02-16 16:40:42 +0900 (898db4b)
+++ plugins/tokenizers/Makefile.am 2012-02-17 17:05:22 +0900 (b94c035)
@@ -15,8 +15,16 @@ tokenizer_plugins_LTLIBRARIES =
 if WITH_MECAB
 tokenizer_plugins_LTLIBRARIES += mecab.la
 endif
+if WITH_KYTEA
+tokenizer_plugins_LTLIBRARIES += kytea.la
+endif
 
 mecab_la_CPPFLAGS = $(MECAB_CPPFLAGS)
 mecab_la_SOURCES = mecab.c
 mecab_la_LIBADD = $(LIBS) $(MECAB_LIBS)
 mecab_la_LDFLAGS = $(AM_LDFLAGS) $(MECAB_LDFLAGS)
+
+kytea_la_CPPFLAGS = $(KYTEA_CPPFLAGS)
+kytea_la_SOURCES = kytea.cpp
+kytea_la_LIBADD = $(LIBS) $(KYTEA_LIBS)
+kytea_la_LDFLAGS = $(AM_LDFLAGS) $(KYTEA_LDFLAGS)

  Added: plugins/tokenizers/kytea.cpp (+290 -0) 100644
===================================================================
--- /dev/null
+++ plugins/tokenizers/kytea.cpp 2012-02-17 17:05:22 +0900 (efa2f7c)
@@ -0,0 +1,290 @@
+/* -*- c-basic-offset: 2 -*- */
+/* Copyright(C) 2012 Brazil
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License version 2.1 as published by the Free Software Foundation.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <groonga/tokenizer.h>
+
+#include <kytea/kytea.h>
+
+#include <string.h>
+
+#include <new>  // placement new is used for every object built in this file
+#include <string>
+#include <vector>
+
+namespace {
+
+// Plugin-wide state shared by every TokenKytea tokenizer. Each pointer is
+// NULL until kytea_init() sets it, and kytea_fin() resets all of them to NULL.
+grn_plugin_mutex *kytea_mutex = NULL;     // held around the shared-tagger calls
+                                          // made in grn_kytea_init()
+kytea::KyteaConfig *kytea_config = NULL;  // GRN_PLUGIN_MALLOC + placement new
+kytea::Kytea *kytea_tagger = NULL;        // GRN_PLUGIN_MALLOC + placement new
+kytea::StringUtil *kytea_util = NULL;     // returned by the tagger; only the
+                                          // pointer is cleared in kytea_fin()
+
+void kytea_init(grn_ctx *ctx);
+void kytea_fin(grn_ctx *ctx);
+
+// Creates the plugin mutex, the KyTea configuration and the tagger, loads the
+// model named by the configuration and caches the tagger's StringUtil.
+//
+// On any failure everything created so far is released through kytea_fin()
+// and the failure is reported with GRN_PLUGIN_ERROR(); the caller,
+// GRN_PLUGIN_INIT(), returns ctx->rc afterwards.
+void kytea_init(grn_ctx *ctx) {
+  if ((kytea_mutex != NULL) || (kytea_config != NULL) ||
+      (kytea_tagger != NULL) || (kytea_util != NULL)) {
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer] TokenKytea is already initialized");
+    return;
+  }
+
+  kytea_mutex = grn_plugin_mutex_create(ctx);
+  if (kytea_mutex == NULL) {
+    kytea_fin(ctx);
+    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                     "[tokenizer] grn_plugin_mutex_create() failed");
+    return;
+  }
+
+  // --- configuration -------------------------------------------------------
+  kytea::KyteaConfig * const config = static_cast<kytea::KyteaConfig *>(
+      GRN_PLUGIN_MALLOC(ctx, sizeof(kytea::KyteaConfig)));
+  if (config == NULL) {
+    kytea_fin(ctx);
+    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                     "[tokenizer] memory allocation to kytea::KyteaConfig failed");
+    return;
+  }
+  try {
+    new (config) kytea::KyteaConfig;
+  } catch (...) {
+    // The constructor did not complete, so only the raw storage is released;
+    // kytea_config is still NULL and kytea_fin() will not touch `config'.
+    GRN_PLUGIN_FREE(ctx, config);
+    kytea_fin(ctx);
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer] kytea::KyteaConfig initialization failed");
+    return;
+  }
+  // From here on kytea_fin() owns the destruction of the configuration.
+  kytea_config = config;
+  try {
+    kytea_config->setDebug(0);
+    kytea_config->setOnTraining(false);
+    // An empty command line is parsed (argc == 0, argv == NULL).
+    // NOTE(review): presumably this leaves KyTea's defaults, including the
+    // model path read below, in place -- confirm against KyTea.
+    kytea_config->parseRunCommandLine(0, NULL);
+  } catch (...) {
+    kytea_fin(ctx);
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer] kytea::KyteaConfig settings failed");
+    return;
+  }
+
+  // --- tagger --------------------------------------------------------------
+  kytea::Kytea * const tagger = static_cast<kytea::Kytea *>(
+      GRN_PLUGIN_MALLOC(ctx, sizeof(kytea::Kytea)));
+  if (tagger == NULL) {
+    kytea_fin(ctx);
+    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                     "[tokenizer] memory allocation to kytea::Kytea failed");
+    return;
+  }
+  try {
+    new (tagger) kytea::Kytea;
+  } catch (...) {
+    // Same reasoning as for the configuration: free the raw storage only.
+    GRN_PLUGIN_FREE(ctx, tagger);
+    kytea_fin(ctx);
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer] kytea::Kytea initialization failed");
+    return;
+  }
+  kytea_tagger = tagger;
+  try {
+    kytea_tagger->readModel(kytea_config->getModelFile().c_str());
+  } catch (...) {
+    kytea_fin(ctx);
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer] kytea::Kytea::readModel() failed");
+    return;
+  }
+
+  // --- string utility ------------------------------------------------------
+  try {
+    kytea_util = kytea_tagger->getStringUtil();
+  } catch (...) {
+    kytea_fin(ctx);
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer] kytea::Kytea::getStringUtil() failed");
+    return;
+  }
+}
+
+// Releases whatever kytea_init() managed to create and resets the globals to
+// NULL. Every step is guarded, so this may be called on partially initialized
+// state. The tagger is destroyed before the configuration and the mutex goes
+// last.
+void kytea_fin(grn_ctx *ctx) {
+  kytea_util = NULL;
+
+  if (kytea_tagger != NULL) {
+    // Built with placement new, hence explicit destructor + GRN_PLUGIN_FREE.
+    kytea_tagger->~Kytea();
+    GRN_PLUGIN_FREE(ctx, kytea_tagger);
+    kytea_tagger = NULL;
+  }
+
+  if (kytea_config != NULL) {
+    kytea_config->~KyteaConfig();
+    GRN_PLUGIN_FREE(ctx, kytea_config);
+    kytea_config = NULL;
+  }
+
+  if (kytea_mutex != NULL) {
+    grn_plugin_mutex_destroy(ctx, kytea_mutex);
+    kytea_mutex = NULL;
+  }
+}
+
+// State of one tokenization run: created by grn_kytea_init(), consumed by
+// grn_kytea_next() and released by grn_kytea_fin().
+struct grn_tokenizer_kytea {
+  grn_tokenizer_query *query;       // destroyed in grn_tokenizer_kytea_fin()
+  kytea::KyteaSentence sentence;    // filled by calculateWS() in grn_kytea_init()
+  std::vector<std::string> tokens;  // word surfaces kept by grn_kytea_init()
+  std::size_t id;                   // index of the next entry of `tokens'
+  grn_tokenizer_token token;        // handed to grn_tokenizer_token_push()
+
+  grn_tokenizer_kytea() : query(NULL), sentence(), tokens(), id(0), token() {}
+  ~grn_tokenizer_kytea() {}
+};
+
+// Constructs a grn_tokenizer_kytea in the raw storage `tokenizer' points to
+// and initializes its token member. May throw if the constructor throws.
+void grn_tokenizer_kytea_init(grn_ctx *ctx, grn_tokenizer_kytea *tokenizer) {
+  new (tokenizer) grn_tokenizer_kytea;
+  grn_tokenizer_token_init(ctx, &tokenizer->token);
+}
+
+// Reverses grn_tokenizer_kytea_init(): finalizes the token member, destroys
+// the query if one is attached and runs the destructor. The storage itself is
+// not freed here.
+void grn_tokenizer_kytea_fin(grn_ctx *ctx, grn_tokenizer_kytea *tokenizer) {
+  grn_tokenizer_token_fin(ctx, &tokenizer->token);
+  if (tokenizer->query != NULL) {
+    grn_tokenizer_query_destroy(ctx, tokenizer->query);
+  }
+  tokenizer->~grn_tokenizer_kytea();
+}
+
+// Finalizes a fully constructed tokenizer (including its query) and releases
+// its storage with GRN_PLUGIN_FREE().
+void grn_tokenizer_kytea_close(grn_ctx *ctx, grn_tokenizer_kytea *tokenizer) {
+  grn_tokenizer_kytea_fin(ctx, tokenizer);
+  GRN_PLUGIN_FREE(ctx, tokenizer);
+}
+
+// Tokenizer "init" callback registered in GRN_PLUGIN_REGISTER().
+//
+// Builds the query from `args', segments the query text with the shared KyTea
+// tagger and stores the resulting tokenizer in user_data->ptr. On failure an
+// error is reported with GRN_PLUGIN_ERROR(), everything allocated here is
+// released and user_data->ptr is left untouched. Always returns NULL.
+grn_obj *grn_kytea_init(grn_ctx *ctx, int num_args, grn_obj **args,
+                        grn_user_data *user_data) {
+  grn_tokenizer_query * const query =
+      grn_tokenizer_query_create(ctx, num_args, args);
+  if (query == NULL) {
+    return NULL;
+  }
+
+  grn_tokenizer_kytea * const tokenizer = static_cast<grn_tokenizer_kytea *>(
+      GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_kytea)));
+  if (tokenizer == NULL) {
+    grn_tokenizer_query_destroy(ctx, query);
+    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                     "[tokenizer] memory allocation to grn_tokenizer_kytea failed");
+    return NULL;
+  }
+  try {
+    grn_tokenizer_kytea_init(ctx, tokenizer);
+  } catch (...) {
+    // The object was never constructed, so only the raw storage and the
+    // not-yet-attached query have to be released.
+    GRN_PLUGIN_FREE(ctx, tokenizer);
+    grn_tokenizer_query_destroy(ctx, query);
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer] tokenizer initialization failed");
+    return NULL;
+  }
+  // From here on the tokenizer owns the query; grn_tokenizer_kytea_close()
+  // releases both.
+  tokenizer->query = query;
+
+  // The tagger and the string utility are shared by all tokenizers, so the
+  // calls on them are made while holding the plugin mutex.
+  grn_plugin_mutex_lock(ctx, kytea_mutex);
+  try {
+    const std::string str(query->ptr, query->length);
+    tokenizer->sentence = kytea::KyteaSentence(kytea_util->mapString(str));
+    kytea_tagger->calculateWS(tokenizer->sentence);
+  } catch (...) {
+    grn_plugin_mutex_unlock(ctx, kytea_mutex);
+    // user_data->ptr has not been set, so grn_kytea_fin() cannot reach this
+    // object; release it here.
+    grn_tokenizer_kytea_close(ctx, tokenizer);
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer] tokenization failed");
+    return NULL;
+  }
+  grn_plugin_mutex_unlock(ctx, kytea_mutex);
+
+  try {
+    for (std::size_t i = 0; i < tokenizer->sentence.words.size(); ++i) {
+      const std::string &token =
+          kytea_util->showString(tokenizer->sentence.words[i].surf);
+      // Walk the surface character by character. The walk stops early when
+      // grn_tokenizer_charlen() returns 0 or grn_tokenizer_isspace() reports
+      // a space at the current position; only surfaces that are consumed
+      // completely are kept.
+      const char *ptr = token.c_str();
+      unsigned int left = static_cast<unsigned int>(token.length());
+      while (left > 0) {
+        const int char_length =
+            grn_tokenizer_charlen(ctx, ptr, left, query->encoding);
+        if ((char_length == 0) ||
+            (grn_tokenizer_isspace(ctx, ptr, left, query->encoding) != 0)) {
+          break;
+        }
+        ptr += char_length;
+        left -= char_length;
+      }
+      if (left == 0) {
+        tokenizer->tokens.push_back(token);
+      }
+    }
+  } catch (...) {
+    // Same ownership situation as above: nobody else can free the tokenizer.
+    grn_tokenizer_kytea_close(ctx, tokenizer);
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer] adjustment failed");
+    return NULL;
+  }
+
+  user_data->ptr = tokenizer;
+  return NULL;
+}
+
+// Tokenizer "next" callback: passes the next kept token to
+// grn_tokenizer_token_push(). The status is GRN_TOKENIZER_CONTINUE while at
+// least one more token follows and GRN_TOKENIZER_LAST otherwise; when no
+// token is left an empty token is pushed. Always returns NULL.
+grn_obj *grn_kytea_next(grn_ctx *ctx, int num_args, grn_obj **args,
+                        grn_user_data *user_data) {
+  grn_tokenizer_kytea * const tokenizer =
+      static_cast<grn_tokenizer_kytea *>(user_data->ptr);
+  const grn_tokenizer_status status =
+      ((tokenizer->id + 1) < tokenizer->tokens.size()) ?
+          GRN_TOKENIZER_CONTINUE : GRN_TOKENIZER_LAST;
+  if (tokenizer->id < tokenizer->tokens.size()) {
+    const std::string &token = tokenizer->tokens[tokenizer->id++];
+    grn_tokenizer_token_push(ctx, &tokenizer->token,
+                             token.c_str(), token.length(), status);
+  } else {
+    grn_tokenizer_token_push(ctx, &tokenizer->token, "", 0, status);
+  }
+  return NULL;
+}
+
+// Tokenizer "fin" callback: releases the tokenizer stored in user_data->ptr,
+// if any. Always returns NULL.
+grn_obj *grn_kytea_fin(grn_ctx *ctx, int num_args, grn_obj **args,
+                       grn_user_data *user_data) {
+  grn_tokenizer_kytea * const tokenizer =
+      static_cast<grn_tokenizer_kytea *>(user_data->ptr);
+  if (tokenizer != NULL) {
+    grn_tokenizer_kytea_close(ctx, tokenizer);
+  }
+  return NULL;
+}
+
+}  // namespace
+
+extern "C" {
+
+/*
+  GRN_PLUGIN_INIT() is called to initialize this plugin. Note that an error
+  code must be set in `ctx->rc' on failure.
+ */
+grn_rc GRN_PLUGIN_INIT(grn_ctx *ctx) {
+  kytea_init(ctx);
+  return ctx->rc;
+}
+
+/*
+  GRN_PLUGIN_REGISTER() registers this plugin to the database associated with
+  `ctx'. The registration requires the plugin name and the functions to be
+  called for tokenization.
+ */
+grn_rc GRN_PLUGIN_REGISTER(grn_ctx *ctx) {
+  return grn_tokenizer_register(ctx, "TokenKytea", 10, grn_kytea_init,
+                                grn_kytea_next, grn_kytea_fin);
+}
+
+/*
+  GRN_PLUGIN_FIN() is called to finalize the plugin that was initialized by
+  GRN_PLUGIN_INIT().
+ */
+grn_rc GRN_PLUGIN_FIN(grn_ctx *ctx) {
+  kytea_fin(ctx);
+  return GRN_SUCCESS;
+}
+
+}  // extern "C"