Kouhei Sutou
null+****@clear*****
Thu Oct 11 17:54:27 JST 2012
Kouhei Sutou 2012-10-11 17:54:27 +0900 (Thu, 11 Oct 2012) New Revision: d1b00b8d3d8d726fef0da0b02893e8542492a041 https://github.com/groonga/groonga/commit/d1b00b8d3d8d726fef0da0b02893e8542492a041 Log: Add a sample query expander QueryExpanderTSV It reads synonyms from TSV format file. Synonyms file should have the following contents: key[TAB]synonym1[TAB]synonym2[TAB]... For example: rroonga[TAB]rroonga[TAB]Ruby groonga With the above synonyms, --query rroonga is expanded to --query "((rroonga) OR (Ruby groonga))". TODO: * Support magic comment at the head. * Install the default synonyms file as /etc/groonga/synonyms.tsv. Added files: plugins/query_expanders/Makefile.am plugins/query_expanders/tsv.c plugins/query_expanders/tsv_sources.am test/command/fixture/query_expander/tsv/synonyms.tsv test/command/suite/select/query_expansion/query_expander/tsv/expand.expected test/command/suite/select/query_expansion/query_expander/tsv/expand.test Copied files: plugins/query_expanders/CMakeLists.txt (from plugins/CMakeLists.txt) Modified files: configure.ac plugins/CMakeLists.txt plugins/Makefile.am test/command/run-test.sh Modified: configure.ac (+14 -0) =================================================================== --- configure.ac 2012-10-11 17:40:27 +0900 (ff298f7) +++ configure.ac 2012-10-11 17:54:27 +0900 (848c6b1) @@ -222,6 +222,7 @@ AC_CONFIG_FILES([ plugins/tokenizers/Makefile plugins/suggest/Makefile plugins/table/Makefile + plugins/query_expanders/Makefile examples/Makefile examples/dictionary/Makefile examples/dictionary/edict/Makefile @@ -1159,6 +1160,9 @@ AC_SUBST(expanded_pluginsdir) tokenizers_pluginsdir="\${pluginsdir}/tokenizers" AC_SUBST(tokenizers_pluginsdir) +query_expanders_pluginsdir="\${pluginsdir}/query_expanders" +AC_SUBST(query_expanders_pluginsdir) + suggest_pluginsdir="\${pluginsdir}/suggest" AC_SUBST(suggest_pluginsdir) @@ -1175,6 +1179,16 @@ if test -z "$suffix"; then fi AC_DEFINE_UNQUOTED(GRN_PLUGIN_SUFFIX, ["$suffix"], "plugin suffix") +# 
for query expanders +GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE="synonyms.tsv" +AC_DEFINE_UNQUOTED(GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE, + ["$GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE"], + "The relative synonyms file for TSV query expander") +GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE="${pkgdatadir}/${GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE}" +AC_DEFINE_UNQUOTED(GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE, + ["$GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE"], + "The default synonyms file for TSV query expander") + # for examples examplesdir="\$(pkgdatadir)/examples" AC_SUBST(examplesdir) Modified: plugins/CMakeLists.txt (+1 -0) =================================================================== --- plugins/CMakeLists.txt 2012-10-11 17:40:27 +0900 (0cf9205) +++ plugins/CMakeLists.txt 2012-10-11 17:54:27 +0900 (9d94824) @@ -16,3 +16,4 @@ add_subdirectory(suggest) add_subdirectory(tokenizers) add_subdirectory(table) +add_subdirectory(query_expanders) Modified: plugins/Makefile.am (+5 -4) =================================================================== --- plugins/Makefile.am 2012-10-11 17:40:27 +0900 (8d6b1ef) +++ plugins/Makefile.am 2012-10-11 17:54:27 +0900 (fbe6f15) @@ -1,7 +1,8 @@ -SUBDIRS = \ - tokenizers \ - suggest \ - table +SUBDIRS = \ + tokenizers \ + suggest \ + table \ + query_expanders EXTRA_DIST = \ CMakeLists.txt Copied: plugins/query_expanders/CMakeLists.txt (+12 -3) 60% =================================================================== --- plugins/CMakeLists.txt 2012-10-11 17:40:27 +0900 (0cf9205) +++ plugins/query_expanders/CMakeLists.txt 2012-10-11 17:54:27 +0900 (a2f1a48) @@ -13,6 +13,15 @@ # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -add_subdirectory(suggest) -add_subdirectory(tokenizers) -add_subdirectory(table) +include_directories( + ${CMAKE_SOURCE_DIR}/lib + ) + +set(QUERY_EXPANDERS_DIR "${GRN_PLUGINS_DIR}/query_expanders") 
+read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/tsv_sources.am TSV_SOURCES) +add_library(tsv_query_expander MODULE ${TSV_SOURCES}) +set_target_properties(tsv_query_expander PROPERTIES + PREFIX "" + OUTPUT_NAME "tsv") +target_link_libraries(tsv_query_expander libgroonga) +install(TARGETS tsv_query_expander DESTINATION "${QUERY_EXPANDERS_DIR}") Added: plugins/query_expanders/Makefile.am (+20 -0) 100644 =================================================================== --- /dev/null +++ plugins/query_expanders/Makefile.am 2012-10-11 17:54:27 +0900 (cb0abc9) @@ -0,0 +1,20 @@ +EXTRA_DIST = \ + CMakeLists.txt + +INCLUDES = \ + -I$(top_builddir) \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/lib + +AM_LDFLAGS = \ + -avoid-version \ + -module \ + -no-undefined + +LIBS = \ + $(top_builddir)/lib/libgroonga.la + +query_expanders_plugins_LTLIBRARIES = +query_expanders_plugins_LTLIBRARIES += tsv.la + +include tsv_sources.am Added: plugins/query_expanders/tsv.c (+230 -0) 100644 =================================================================== --- /dev/null +++ plugins/query_expanders/tsv.c 2012-10-11 17:54:27 +0900 (ab75c5c) @@ -0,0 +1,230 @@ +/* -*- c-basic-offset: 2 -*- */ +/* Copyright(C) 2012 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include <groonga/plugin.h> + +/* groonga's internal headers: They should be removed. 
*/ +/* for grn_text_fgets() */ +#include <str.h> +/* for GRN_PROC_ALLOC() */ +#include <db.h> +/* for grn_win32_base_dir() */ +#include <util.h> + +#include <stdio.h> +#include <string.h> + +#define MAX_SYNONYM_BYTES 4096 + +static grn_hash *synonyms = NULL; + +#ifdef WIN32 +static char *win32_synonyms_file = NULL; +const char * +get_system_synonyms_file(void) +{ + if (!win32_synonyms_file) { + const char *base_dir; + const char *relative_path = GRN_RELATIVE_QUERY_EXPANDER_TSV_SYNONYMS_FILE; + char *synonyms_file; + char *path; + size_t base_dir_length; + + base_dir = grn_win32_base_dir(); + base_dir_length = strlen(base_dir); + synonyms_file = + malloc(base_dir_length + strlen("/") + strlen(relative_path) + 1); + strcpy(synonyms_file, base_dir); + strcat(synonyms_file, "/"); + strcat(synonyms_file, relative_path); + win32_synonyms_file = synonyms_file; + } + return win32_synonyms_file; +} + +#else /* WIN32 */ +const char * +get_system_synonyms_file(void) +{ + return GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE; +} +#endif /* WIN32 */ + +static inline grn_bool +is_comment_mark(char character) +{ + return character == '#'; +} + +static void +parse_synonyms_file_line(grn_ctx *ctx, const char *line, int line_length, + grn_obj *key, grn_obj *value) +{ + size_t i = 0; + + if (is_comment_mark(line[i])) { + return; + } + + while (i < line_length) { + char character = line[i]; + i++; + if (character == '\t') { + break; + } + GRN_TEXT_PUTC(ctx, key, character); + } + + if (i == line_length) { + return; + } + + GRN_TEXT_PUTS(ctx, value, "(("); + while (i < line_length) { + char character = line[i]; + i++; + if (character == '\t') { + GRN_TEXT_PUTS(ctx, value, ") OR ("); + } else { + GRN_TEXT_PUTC(ctx, value, character); + } + } + GRN_TEXT_PUTS(ctx, value, "))"); + + { + grn_id id; + void *value_location = NULL; + + id = grn_hash_add(ctx, synonyms, GRN_TEXT_VALUE(key), GRN_TEXT_LEN(key), + &value_location, NULL); + if (id == GRN_ID_NIL) { + GRN_PLUGIN_LOG(ctx, GRN_LOG_WARNING, + 
"[plugin][query-expander][tsv] " + "failed to register key: <%.*s>", + (int)GRN_TEXT_LEN(key), GRN_TEXT_VALUE(key)); + return; + } + + grn_bulk_truncate(ctx, value, MAX_SYNONYM_BYTES - 1); + GRN_TEXT_PUTC(ctx, value, '\0'); + memcpy(value_location, GRN_TEXT_VALUE(value), MAX_SYNONYM_BYTES); + } +} + +static void +load_synonyms(grn_ctx *ctx) +{ + const char *path; + FILE *file; + grn_obj line, key, value; + + path = getenv("GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE"); + if (!path) { + path = get_system_synonyms_file(); + } + file = fopen(path, "r"); + if (!file) { + GRN_LOG(ctx, GRN_LOG_WARNING, + "[plugin][query-expander][tsv] " + "synonyms file doesn't exist: <%s>", + path); + return; + } + + GRN_TEXT_INIT(&line, 0); + GRN_TEXT_INIT(&key, 0); + GRN_TEXT_INIT(&value, 0); + grn_bulk_reserve(ctx, &value, MAX_SYNONYM_BYTES); + while (grn_text_fgets(ctx, &line, file) == GRN_SUCCESS) { + GRN_BULK_REWIND(&key); + GRN_BULK_REWIND(&value); + parse_synonyms_file_line(ctx, + GRN_TEXT_VALUE(&line), GRN_TEXT_LEN(&line), + &key, &value); + GRN_BULK_REWIND(&line); + } + GRN_OBJ_FIN(ctx, &line); + GRN_OBJ_FIN(ctx, &key); + GRN_OBJ_FIN(ctx, &value); + + fclose(file); +} + +static grn_obj * +func_query_expander_tsv(grn_ctx *ctx, int nargs, grn_obj **args, + grn_user_data *user_data) +{ + grn_rc rc = GRN_END_OF_DATA; + grn_id id; + grn_obj *term, *expanded_term; + void *value; + grn_obj *rc_object; + + term = args[0]; + expanded_term = args[1]; + id = grn_hash_get(ctx, synonyms, + GRN_TEXT_VALUE(term), GRN_TEXT_LEN(term), + &value); + if (id != GRN_ID_NIL) { + const char *query = value; + GRN_TEXT_PUTS(ctx, expanded_term, query); + rc = GRN_SUCCESS; + } + + rc_object = GRN_PROC_ALLOC(GRN_DB_INT32, 0); + if (rc_object) { + GRN_INT32_SET(ctx, rc_object, rc); + } + + return rc_object; +} + +grn_rc +GRN_PLUGIN_INIT(grn_ctx *ctx) +{ + if (!synonyms) { + synonyms = grn_hash_create(ctx, NULL, + GRN_TABLE_MAX_KEY_SIZE, + MAX_SYNONYM_BYTES, + GRN_OBJ_TABLE_HASH_KEY | GRN_OBJ_KEY_VAR_SIZE); + if 
(!synonyms) { + return ctx->rc; + } + load_synonyms(ctx); + } + return ctx->rc; +} + +grn_rc +GRN_PLUGIN_REGISTER(grn_ctx *ctx) +{ + grn_proc_create(ctx, "QueryExpanderTSV", strlen("QueryExpanderTSV"), + GRN_PROC_FUNCTION, + func_query_expander_tsv, NULL, NULL, + 0, NULL); + return GRN_SUCCESS; +} + +grn_rc +GRN_PLUGIN_FIN(grn_ctx *ctx) +{ + if (synonyms) { + grn_hash_close(ctx, synonyms); + synonyms = NULL; + } + return GRN_SUCCESS; +} Added: plugins/query_expanders/tsv_sources.am (+2 -0) 100644 =================================================================== --- /dev/null +++ plugins/query_expanders/tsv_sources.am 2012-10-11 17:54:27 +0900 (f1bdabe) @@ -0,0 +1,2 @@ +tsv_la_SOURCES = \ + tsv.c Added: test/command/fixture/query_expander/tsv/synonyms.tsv (+2 -0) 100644 =================================================================== --- /dev/null +++ test/command/fixture/query_expander/tsv/synonyms.tsv 2012-10-11 17:54:27 +0900 (6c0ca20) @@ -0,0 +1,2 @@ +# -*- coding: utf-8 -*- +rroonga rroonga Ruby groonga Modified: test/command/run-test.sh (+3 -0) =================================================================== --- test/command/run-test.sh 2012-10-11 17:40:27 +0900 (350acc8) +++ test/command/run-test.sh 2012-10-11 17:54:27 +0900 (7871e43) @@ -53,6 +53,9 @@ export GROONGA_SUGGEST_CREATE_DATASET GRN_PLUGINS_DIR="$top_dir/plugins" export GRN_PLUGINS_DIR +GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE="$top_dir/test/command/fixture/query_expander/tsv/synonyms.tsv" +export GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE + case `uname` in Darwin) DYLD_LIBRARY_PATH="$top_dir/lib/.libs:$DYLD_LIBRARY_PATH" Added: test/command/suite/select/query_expansion/query_expander/tsv/expand.expected (+53 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/query_expansion/query_expander/tsv/expand.expected 2012-10-11 17:54:27 +0900 (91bc399) @@ -0,0 +1,53 @@ +register "query_expanders/tsv" +[[0,0.0,0.0],true] +table_create 
Memos TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Memos content COLUMN_SCALAR ShortText +[[0,0.0,0.0],true] +table_create Lexicon TABLE_PAT_KEY ShortText --default_tokenizer TokenBigram +[[0,0.0,0.0],true] +column_create Lexicon diary_content COLUMN_INDEX|WITH_POSITION Memos content +[[0,0.0,0.0],true] +load --table Memos +[ +["content"], +["Start groonga!"], +["Start mroonga!"], +["Start rroonga!"], +["Start Ruby!"], +["Learning Ruby and groonga..."] +] +[[0,0.0,0.0],5] +select --table Memos --query_expansion QueryExpanderTSV --match_columns content --query rroonga +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 2 + ], + [ + [ + "_id", + "UInt32" + ], + [ + "content", + "ShortText" + ] + ], + [ + 3, + "Start rroonga!" + ], + [ + 5, + "Learning Ruby and groonga..." + ] + ] + ] +] Added: test/command/suite/select/query_expansion/query_expander/tsv/expand.test (+21 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/query_expansion/query_expander/tsv/expand.test 2012-10-11 17:54:27 +0900 (eb07efe) @@ -0,0 +1,21 @@ +register "query_expanders/tsv" + +table_create Memos TABLE_NO_KEY +column_create Memos content COLUMN_SCALAR ShortText + +table_create Lexicon TABLE_PAT_KEY ShortText --default_tokenizer TokenBigram +column_create Lexicon diary_content COLUMN_INDEX|WITH_POSITION Memos content + +load --table Memos +[ +["content"], +["Start groonga!"], +["Start mroonga!"], +["Start rroonga!"], +["Start Ruby!"], +["Learning Ruby and groonga..."] +] + +select --table Memos --query_expansion QueryExpanderTSV \ + --match_columns content --query rroonga + -------------- next part -------------- HTMLの添付ファイルを保管しました...Download