[Groonga-commit] groonga/groonga [master] Add a sample query expander QueryExpanderTSV

Back to archive index

Kouhei Sutou null+****@clear*****
Thu Oct 11 17:54:27 JST 2012


Kouhei Sutou	2012-10-11 17:54:27 +0900 (Thu, 11 Oct 2012)

  New Revision: d1b00b8d3d8d726fef0da0b02893e8542492a041
  https://github.com/groonga/groonga/commit/d1b00b8d3d8d726fef0da0b02893e8542492a041

  Log:
    Add a sample query expander QueryExpanderTSV
    
    It reads synonyms from a TSV format file. The synonyms file should have the
    following contents:
    
      key[TAB]synonym1[TAB]synonym2[TAB]...
    
    For example:
    
      rroonga[TAB]rroonga[TAB]Ruby groonga
    
    With the above synonyms, --query rroonga is expanded to
    --query "((rroonga) OR (Ruby groonga))".
    
    TODO:
      * Support magic comment at the head.
      * Install the default synonyms file as /etc/groonga/synonyms.tsv.

  Added files:
    plugins/query_expanders/Makefile.am
    plugins/query_expanders/tsv.c
    plugins/query_expanders/tsv_sources.am
    test/command/fixture/query_expander/tsv/synonyms.tsv
    test/command/suite/select/query_expansion/query_expander/tsv/expand.expected
    test/command/suite/select/query_expansion/query_expander/tsv/expand.test
  Copied files:
    plugins/query_expanders/CMakeLists.txt
      (from plugins/CMakeLists.txt)
  Modified files:
    configure.ac
    plugins/CMakeLists.txt
    plugins/Makefile.am
    test/command/run-test.sh

  Modified: configure.ac (+14 -0)
===================================================================
--- configure.ac    2012-10-11 17:40:27 +0900 (ff298f7)
+++ configure.ac    2012-10-11 17:54:27 +0900 (848c6b1)
@@ -222,6 +222,7 @@ AC_CONFIG_FILES([
   plugins/tokenizers/Makefile
   plugins/suggest/Makefile
   plugins/table/Makefile
+  plugins/query_expanders/Makefile
   examples/Makefile
   examples/dictionary/Makefile
   examples/dictionary/edict/Makefile
@@ -1159,6 +1160,9 @@ AC_SUBST(expanded_pluginsdir)
 tokenizers_pluginsdir="\${pluginsdir}/tokenizers"
 AC_SUBST(tokenizers_pluginsdir)
 
+query_expanders_pluginsdir="\${pluginsdir}/query_expanders"
+AC_SUBST(query_expanders_pluginsdir)
+
 suggest_pluginsdir="\${pluginsdir}/suggest"
 AC_SUBST(suggest_pluginsdir)
 
@@ -1175,6 +1179,16 @@ if test -z "$suffix"; then
 fi
 AC_DEFINE_UNQUOTED(GRN_PLUGIN_SUFFIX, ["$suffix"], "plugin suffix")
 
+# for query expanders
+GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE="synonyms.tsv"
+AC_DEFINE_UNQUOTED(GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE,
+                   ["$GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE"],
+                   "The relative synonyms file for TSV query expander")
+GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE="${pkgdatadir}/${GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE}"
+AC_DEFINE_UNQUOTED(GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE,
+                   ["$GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE"],
+                   "The default synonyms file for TSV query expander")
+
 # for examples
 examplesdir="\$(pkgdatadir)/examples"
 AC_SUBST(examplesdir)

  Modified: plugins/CMakeLists.txt (+1 -0)
===================================================================
--- plugins/CMakeLists.txt    2012-10-11 17:40:27 +0900 (0cf9205)
+++ plugins/CMakeLists.txt    2012-10-11 17:54:27 +0900 (9d94824)
@@ -16,3 +16,4 @@
 add_subdirectory(suggest)
 add_subdirectory(tokenizers)
 add_subdirectory(table)
+add_subdirectory(query_expanders)

  Modified: plugins/Makefile.am (+5 -4)
===================================================================
--- plugins/Makefile.am    2012-10-11 17:40:27 +0900 (8d6b1ef)
+++ plugins/Makefile.am    2012-10-11 17:54:27 +0900 (fbe6f15)
@@ -1,7 +1,8 @@
-SUBDIRS =		\
-	tokenizers	\
-	suggest \
-	table
+SUBDIRS =					\
+	tokenizers				\
+	suggest					\
+	table					\
+	query_expanders
 
 EXTRA_DIST =					\
 	CMakeLists.txt

  Copied: plugins/query_expanders/CMakeLists.txt (+12 -3) 60%
===================================================================
--- plugins/CMakeLists.txt    2012-10-11 17:40:27 +0900 (0cf9205)
+++ plugins/query_expanders/CMakeLists.txt    2012-10-11 17:54:27 +0900 (a2f1a48)
@@ -13,6 +13,15 @@
 # License along with this library; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 
-add_subdirectory(suggest)
-add_subdirectory(tokenizers)
-add_subdirectory(table)
+include_directories(
+  ${CMAKE_SOURCE_DIR}/lib
+  )
+
+set(QUERY_EXPANDERS_DIR "${GRN_PLUGINS_DIR}/query_expanders")
+read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/tsv_sources.am TSV_SOURCES)
+add_library(tsv_query_expander MODULE ${TSV_SOURCES})
+set_target_properties(tsv_query_expander PROPERTIES
+  PREFIX ""
+  OUTPUT_NAME "tsv")
+target_link_libraries(tsv_query_expander libgroonga)
+install(TARGETS tsv_query_expander DESTINATION "${QUERY_EXPANDERS_DIR}")

  Added: plugins/query_expanders/Makefile.am (+20 -0) 100644
===================================================================
--- /dev/null
+++ plugins/query_expanders/Makefile.am    2012-10-11 17:54:27 +0900 (cb0abc9)
@@ -0,0 +1,20 @@
+EXTRA_DIST =					\
+	CMakeLists.txt
+
+INCLUDES =			\
+	-I$(top_builddir)	\
+	-I$(top_srcdir)/include	\
+	-I$(top_srcdir)/lib
+
+AM_LDFLAGS =					\
+	-avoid-version				\
+	-module					\
+	-no-undefined
+
+LIBS =						\
+	$(top_builddir)/lib/libgroonga.la
+
+query_expanders_plugins_LTLIBRARIES =
+query_expanders_plugins_LTLIBRARIES += tsv.la
+
+include tsv_sources.am

  Added: plugins/query_expanders/tsv.c (+230 -0) 100644
===================================================================
--- /dev/null
+++ plugins/query_expanders/tsv.c    2012-10-11 17:54:27 +0900 (ab75c5c)
@@ -0,0 +1,230 @@
+/* -*- c-basic-offset: 2 -*- */
+/* Copyright(C) 2012 Brazil
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License version 2.1 as published by the Free Software Foundation.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include <groonga/plugin.h>
+
+/* groonga's internal headers: They should be removed. */
+/* for grn_text_fgets() */
+#include <str.h>
+/* for GRN_PROC_ALLOC() */
+#include <db.h>
+/* for grn_win32_base_dir() */
+#include <util.h>
+
+#include <stdio.h>
+#include <string.h>
+
+#define MAX_SYNONYM_BYTES 4096
+
+static grn_hash *synonyms = NULL;
+
+#ifdef WIN32
+static char *win32_synonyms_file = NULL;
+const char *
+get_system_synonyms_file(void)
+{
+  if (!win32_synonyms_file) {
+    const char *base_dir;
+    const char *relative_path = GRN_RELATIVE_QUERY_EXPANDER_TSV_SYNONYMS_FILE;
+    char *synonyms_file;
+    char *path;
+    size_t base_dir_length;
+
+    base_dir = grn_win32_base_dir();
+    base_dir_length = strlen(base_dir);
+    synonyms_file =
+      malloc(base_dir_length + strlen("/") + strlen(relative_path) + 1);
+    strcpy(synonyms_file, base_dir);
+    strcat(synonyms_file, "/");
+    strcat(synonyms_file, relative_path);
+    win32_synonyms_file = synonyms_file;
+  }
+  return win32_synonyms_file;
+}
+
+#else /* WIN32 */
+const char *
+get_system_synonyms_file(void)
+{
+  return GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE;
+}
+#endif /* WIN32 */
+
+static inline grn_bool
+is_comment_mark(char character)
+{
+  return character == '#';
+}
+
+static void
+parse_synonyms_file_line(grn_ctx *ctx, const char *line, int line_length,
+                         grn_obj *key, grn_obj *value)
+{
+  size_t i = 0;
+
+  if (is_comment_mark(line[i])) {
+    return;
+  }
+
+  while (i < line_length) {
+    char character = line[i];
+    i++;
+    if (character == '\t') {
+      break;
+    }
+    GRN_TEXT_PUTC(ctx, key, character);
+  }
+
+  if (i == line_length) {
+    return;
+  }
+
+  GRN_TEXT_PUTS(ctx, value, "((");
+  while (i < line_length) {
+    char character = line[i];
+    i++;
+    if (character == '\t') {
+      GRN_TEXT_PUTS(ctx, value, ") OR (");
+    } else {
+      GRN_TEXT_PUTC(ctx, value, character);
+    }
+  }
+  GRN_TEXT_PUTS(ctx, value, "))");
+
+  {
+    grn_id id;
+    void *value_location = NULL;
+
+    id = grn_hash_add(ctx, synonyms, GRN_TEXT_VALUE(key), GRN_TEXT_LEN(key),
+                      &value_location, NULL);
+    if (id == GRN_ID_NIL) {
+      GRN_PLUGIN_LOG(ctx, GRN_LOG_WARNING,
+                     "[plugin][query-expander][tsv] "
+                     "failed to register key: <%.*s>",
+                     (int)GRN_TEXT_LEN(key), GRN_TEXT_VALUE(key));
+      return;
+    }
+
+    grn_bulk_truncate(ctx, value, MAX_SYNONYM_BYTES - 1);
+    GRN_TEXT_PUTC(ctx, value, '\0');
+    memcpy(value_location, GRN_TEXT_VALUE(value), MAX_SYNONYM_BYTES);
+  }
+}
+
+static void
+load_synonyms(grn_ctx *ctx)
+{
+  const char *path;
+  FILE *file;
+  grn_obj line, key, value;
+
+  path = getenv("GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE");
+  if (!path) {
+    path = get_system_synonyms_file();
+  }
+  file = fopen(path, "r");
+  if (!file) {
+    GRN_LOG(ctx, GRN_LOG_WARNING,
+            "[plugin][query-expander][tsv] "
+            "synonyms file doesn't exist: <%s>",
+            path);
+    return;
+  }
+
+  GRN_TEXT_INIT(&line, 0);
+  GRN_TEXT_INIT(&key, 0);
+  GRN_TEXT_INIT(&value, 0);
+  grn_bulk_reserve(ctx, &value, MAX_SYNONYM_BYTES);
+  while (grn_text_fgets(ctx, &line, file) == GRN_SUCCESS) {
+    GRN_BULK_REWIND(&key);
+    GRN_BULK_REWIND(&value);
+    parse_synonyms_file_line(ctx,
+                             GRN_TEXT_VALUE(&line), GRN_TEXT_LEN(&line),
+                             &key, &value);
+    GRN_BULK_REWIND(&line);
+  }
+  GRN_OBJ_FIN(ctx, &line);
+  GRN_OBJ_FIN(ctx, &key);
+  GRN_OBJ_FIN(ctx, &value);
+
+  fclose(file);
+}
+
+static grn_obj *
+func_query_expander_tsv(grn_ctx *ctx, int nargs, grn_obj **args,
+                        grn_user_data *user_data)
+{
+  grn_rc rc = GRN_END_OF_DATA;
+  grn_id id;
+  grn_obj *term, *expanded_term;
+  void *value;
+  grn_obj *rc_object;
+
+  term = args[0];
+  expanded_term = args[1];
+  id = grn_hash_get(ctx, synonyms,
+                    GRN_TEXT_VALUE(term), GRN_TEXT_LEN(term),
+                    &value);
+  if (id != GRN_ID_NIL) {
+    const char *query = value;
+    GRN_TEXT_PUTS(ctx, expanded_term, query);
+    rc = GRN_SUCCESS;
+  }
+
+  rc_object = GRN_PROC_ALLOC(GRN_DB_INT32, 0);
+  if (rc_object) {
+    GRN_INT32_SET(ctx, rc_object, rc);
+  }
+
+  return rc_object;
+}
+
+grn_rc
+GRN_PLUGIN_INIT(grn_ctx *ctx)
+{
+  if (!synonyms) {
+    synonyms = grn_hash_create(ctx, NULL,
+                               GRN_TABLE_MAX_KEY_SIZE,
+                               MAX_SYNONYM_BYTES,
+                               GRN_OBJ_TABLE_HASH_KEY | GRN_OBJ_KEY_VAR_SIZE);
+    if (!synonyms) {
+      return ctx->rc;
+    }
+    load_synonyms(ctx);
+  }
+  return ctx->rc;
+}
+
+grn_rc
+GRN_PLUGIN_REGISTER(grn_ctx *ctx)
+{
+  grn_proc_create(ctx, "QueryExpanderTSV", strlen("QueryExpanderTSV"),
+                  GRN_PROC_FUNCTION,
+                  func_query_expander_tsv, NULL, NULL,
+                  0, NULL);
+  return GRN_SUCCESS;
+}
+
+grn_rc
+GRN_PLUGIN_FIN(grn_ctx *ctx)
+{
+  if (synonyms) {
+    grn_hash_close(ctx, synonyms);
+    synonyms = NULL;
+  }
+  return GRN_SUCCESS;
+}

  Added: plugins/query_expanders/tsv_sources.am (+2 -0) 100644
===================================================================
--- /dev/null
+++ plugins/query_expanders/tsv_sources.am    2012-10-11 17:54:27 +0900 (f1bdabe)
@@ -0,0 +1,2 @@
+tsv_la_SOURCES =				\
+	tsv.c

  Added: test/command/fixture/query_expander/tsv/synonyms.tsv (+2 -0) 100644
===================================================================
--- /dev/null
+++ test/command/fixture/query_expander/tsv/synonyms.tsv    2012-10-11 17:54:27 +0900 (6c0ca20)
@@ -0,0 +1,2 @@
+# -*- coding: utf-8 -*-
+rroonga	rroonga	Ruby groonga

  Modified: test/command/run-test.sh (+3 -0)
===================================================================
--- test/command/run-test.sh    2012-10-11 17:40:27 +0900 (350acc8)
+++ test/command/run-test.sh    2012-10-11 17:54:27 +0900 (7871e43)
@@ -53,6 +53,9 @@ export GROONGA_SUGGEST_CREATE_DATASET
 GRN_PLUGINS_DIR="$top_dir/plugins"
 export GRN_PLUGINS_DIR
 
+GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE="$top_dir/test/command/fixture/query_expander/tsv/synonyms.tsv"
+export GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE
+
 case `uname` in
     Darwin)
 	DYLD_LIBRARY_PATH="$top_dir/lib/.libs:$DYLD_LIBRARY_PATH"

  Added: test/command/suite/select/query_expansion/query_expander/tsv/expand.expected (+53 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/query_expansion/query_expander/tsv/expand.expected    2012-10-11 17:54:27 +0900 (91bc399)
@@ -0,0 +1,53 @@
+register "query_expanders/tsv"
+[[0,0.0,0.0],true]
+table_create Memos TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Memos content COLUMN_SCALAR ShortText
+[[0,0.0,0.0],true]
+table_create Lexicon TABLE_PAT_KEY ShortText --default_tokenizer TokenBigram
+[[0,0.0,0.0],true]
+column_create Lexicon diary_content COLUMN_INDEX|WITH_POSITION Memos content
+[[0,0.0,0.0],true]
+load --table Memos
+[
+["content"],
+["Start groonga!"],
+["Start mroonga!"],
+["Start rroonga!"],
+["Start Ruby!"],
+["Learning Ruby and groonga..."]
+]
+[[0,0.0,0.0],5]
+select --table Memos --query_expansion QueryExpanderTSV   --match_columns content --query rroonga
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        2
+      ],
+      [
+        [
+          "_id",
+          "UInt32"
+        ],
+        [
+          "content",
+          "ShortText"
+        ]
+      ],
+      [
+        3,
+        "Start rroonga!"
+      ],
+      [
+        5,
+        "Learning Ruby and groonga..."
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/select/query_expansion/query_expander/tsv/expand.test (+21 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/query_expansion/query_expander/tsv/expand.test    2012-10-11 17:54:27 +0900 (eb07efe)
@@ -0,0 +1,21 @@
+register "query_expanders/tsv"
+
+table_create Memos TABLE_NO_KEY
+column_create Memos content COLUMN_SCALAR ShortText
+
+table_create Lexicon TABLE_PAT_KEY ShortText --default_tokenizer TokenBigram
+column_create Lexicon diary_content COLUMN_INDEX|WITH_POSITION Memos content
+
+load --table Memos
+[
+["content"],
+["Start groonga!"],
+["Start mroonga!"],
+["Start rroonga!"],
+["Start Ruby!"],
+["Learning Ruby and groonga..."]
+]
+
+select --table Memos --query_expansion QueryExpanderTSV \
+  --match_columns content --query rroonga
+
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Download 



More information about the Groonga-commit mailing list
Back to archive index