[Groonga-commit] pgroonga/pgroonga at 0595620 [master] Support custom tokenizer and normalizer by WITH

Back to archive index

Kouhei Sutou null+****@clear*****
Sun Feb 1 13:04:23 JST 2015


Kouhei Sutou	2015-02-01 13:04:23 +0900 (Sun, 01 Feb 2015)

  New Revision: 0595620761649ad6350caa72939eac11ef7b46ef
  https://github.com/pgroonga/pgroonga/commit/0595620761649ad6350caa72939eac11ef7b46ef

  Message:
    Support custom tokenizer and normalizer by WITH

  Added files:
    expected/text/options/normalizer/none.out
    expected/text/options/tokenizer/custom.out
    expected/text/options/tokenizer/none.out
    sql/text/options/normalizer/none.sql
    sql/text/options/tokenizer/custom.sql
    sql/text/options/tokenizer/none.sql
  Modified files:
    Makefile
    pgroonga.c
    pgroonga.h

  Modified: Makefile (+6 -0)
===================================================================
--- Makefile    2015-02-01 12:45:35 +0900 (88ead00)
+++ Makefile    2015-02-01 13:04:23 +0900 (26bcfaa)
@@ -29,6 +29,8 @@ installcheck: results/text/single/contain
 installcheck: results/text/single/match
 installcheck: results/text/single/and
 installcheck: results/text/multiple/contain
+installcheck: results/text/options/tokenizer
+installcheck: results/text/options/normalizer
 
 results/text/single/contain:
 	@mkdir -p results/text/single/contain
@@ -38,3 +40,7 @@ results/text/single/and:
 	@mkdir -p results/text/single/and
 results/text/multiple/contain:
 	@mkdir -p results/text/multiple/contain
+results/text/options/tokenizer:
+	@mkdir -p results/text/options/tokenizer
+results/text/options/normalizer:
+	@mkdir -p results/text/options/normalizer

  Added: expected/text/options/normalizer/none.out (+30 -0) 100644
===================================================================
--- /dev/null
+++ expected/text/options/normalizer/none.out    2015-02-01 13:04:23 +0900 (26b4ca6)
@@ -0,0 +1,30 @@
+CREATE TABLE memos (
+  id integer,
+  content text
+);
+INSERT INTO memos VALUES (1, 'PostgreSQL is a RDBMS.');
+INSERT INTO memos VALUES (2, 'Groonga is fast full text search engine.');
+INSERT INTO memos VALUES (3, 'PGroonga is a PostgreSQL extension that uses Groonga.');
+CREATE INDEX grnindex ON memos
+ USING pgroonga (content)
+  WITH (normalizer = '');
+SET enable_seqscan = off;
+SET enable_indexscan = on;
+SET enable_bitmapscan = off;
+SELECT id, content
+  FROM memos
+ WHERE content %% 'postgresql';
+ id | content 
+----+---------
+(0 rows)
+
+SELECT id, content
+  FROM memos
+ WHERE content %% 'PostgreSQL';
+ id |                        content                        
+----+-------------------------------------------------------
+  1 | PostgreSQL is a RDBMS.
+  3 | PGroonga is a PostgreSQL extension that uses Groonga.
+(2 rows)
+
+DROP TABLE memos;

  Added: expected/text/options/tokenizer/custom.out (+30 -0) 100644
===================================================================
--- /dev/null
+++ expected/text/options/tokenizer/custom.out    2015-02-01 13:04:23 +0900 (9be277d)
@@ -0,0 +1,30 @@
+CREATE TABLE memos (
+  id integer,
+  tags text
+);
+INSERT INTO memos VALUES (1, 'PostgreSQL94 RDBMS');
+INSERT INTO memos VALUES (2, 'PostgreSQL Groonga');
+INSERT INTO memos VALUES (3, 'Groonga PGroonga Mroonga');
+CREATE INDEX grnindex ON memos
+ USING pgroonga (tags)
+  WITH (tokenizer = 'TokenDelimit');
+SET enable_seqscan = off;
+SET enable_indexscan = on;
+SET enable_bitmapscan = off;
+SELECT id, tags
+  FROM memos
+ WHERE tags %% 'PostgreSQL';
+ id |        tags        
+----+--------------------
+  2 | PostgreSQL Groonga
+(1 row)
+
+SELECT id, tags
+  FROM memos
+ WHERE tags %% 'PostgreSQL94';
+ id |        tags        
+----+--------------------
+  1 | PostgreSQL94 RDBMS
+(1 row)
+
+DROP TABLE memos;

  Added: expected/text/options/tokenizer/none.out (+30 -0) 100644
===================================================================
--- /dev/null
+++ expected/text/options/tokenizer/none.out    2015-02-01 13:04:23 +0900 (9888bc5)
@@ -0,0 +1,30 @@
+CREATE TABLE memos (
+  id integer,
+  tag text
+);
+INSERT INTO memos VALUES (1, 'PostgreSQL');
+INSERT INTO memos VALUES (2, 'PostgreSQL Groonga');
+INSERT INTO memos VALUES (3, 'Groonga');
+CREATE INDEX grnindex ON memos
+ USING pgroonga (tag)
+  WITH (tokenizer = '');
+SET enable_seqscan = off;
+SET enable_indexscan = on;
+SET enable_bitmapscan = off;
+SELECT id, tag
+  FROM memos
+ WHERE tag %% 'PostgreSQL';
+ id |    tag     
+----+------------
+  1 | PostgreSQL
+(1 row)
+
+SELECT id, tag
+  FROM memos
+ WHERE tag %% 'PostgreSQL Groonga';
+ id |        tag         
+----+--------------------
+  2 | PostgreSQL Groonga
+(1 row)
+
+DROP TABLE memos;

  Modified: pgroonga.c (+147 -7)
===================================================================
--- pgroonga.c    2015-02-01 12:45:35 +0900 (30783ad)
+++ pgroonga.c    2015-02-01 13:04:23 +0900 (11be5d6)
@@ -5,6 +5,7 @@
 
 #include "pgroonga.h"
 
+#include <access/reloptions.h>
 #include <access/relscan.h>
 #include <catalog/catalog.h>
 #include <catalog/index.h>
@@ -21,6 +22,15 @@
 
 PG_MODULE_MAGIC;
 
+static relopt_kind GrnReloptionKind;
+
+typedef struct GrnOptions
+{
+	int32 vl_len_;
+	int tokenizerOffset;
+	int normalizerOffset;
+} GrnOptions;
+
 typedef struct GrnBuildStateData
 {
 	grn_obj	*idsTable;
@@ -74,7 +84,6 @@ static grn_ctx *ctx = &grnContext;
 static grn_obj buffer;
 static grn_obj inspectBuffer;
 
-#ifdef PGROONGA_DEBUG
 static const char *
 GrnInspect(grn_obj *object)
 {
@@ -83,7 +92,6 @@ GrnInspect(grn_obj *object)
 	GRN_TEXT_PUTC(ctx, &inspectBuffer, '\0');
 	return GRN_TEXT_VALUE(&inspectBuffer);
 }
-#endif
 
 static grn_encoding
 GrnGetEncoding(void)
@@ -152,6 +160,103 @@ GrnOnProcExit(int code, Datum arg)
 	grn_fin();
 }
 
+static bool
+GrnIsTokenizer(grn_obj *object)
+{
+	if (object->header.type != GRN_PROC)
+		return false;
+
+  if (grn_proc_get_type(ctx, object) != GRN_PROC_TOKENIZER)
+	  return false;
+
+  return true;
+}
+
+static void
+GrnOptionValidateTokenizer(char *name)
+{
+	grn_obj *tokenizer;
+	size_t name_length;
+
+	name_length = strlen(name);
+	if (name_length == 0)
+		return;
+
+	tokenizer = grn_ctx_get(ctx, name, name_length);
+	if (!tokenizer)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("pgroonga: nonexistent tokenizer: <%s>",
+						name)));
+	}
+
+	if (!GrnIsTokenizer(tokenizer))
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("pgroonga: not tokenizer: <%s>: %s",
+						name, GrnInspect(tokenizer))));
+	}
+}
+
+static bool
+GrnIsNormalizer(grn_obj *object)
+{
+	if (object->header.type != GRN_PROC)
+		return false;
+
+  if (grn_proc_get_type(ctx, object) != GRN_PROC_NORMALIZER)
+	  return false;
+
+  return true;
+}
+
+static void
+GrnOptionValidateNormalizer(char *name)
+{
+	grn_obj *normalizer;
+	size_t name_length;
+
+	name_length = strlen(name);
+	if (name_length == 0)
+		return;
+
+	normalizer = grn_ctx_get(ctx, name, name_length);
+	if (!normalizer)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("pgroonga: nonexistent normalizer: <%s>",
+						name)));
+	}
+
+	if (!GrnIsNormalizer(normalizer))
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("pgroonga: not normalizer: <%s>: %s",
+						name, GrnInspect(normalizer))));
+	}
+}
+
+static void
+GrnInitializeOptions(void)
+{
+	GrnReloptionKind = add_reloption_kind();
+
+	add_string_reloption(GrnReloptionKind,
+						 "tokenizer",
+						 "Tokenizer name to be used for full-text search",
+						 PGRN_DEFAULT_TOKENIZER,
+						 GrnOptionValidateTokenizer);
+	add_string_reloption(GrnReloptionKind,
+						 "normalizer",
+						 "Normalizer name to be used for full-text search",
+						 PGRN_DEFAULT_NORMALIZER,
+						 GrnOptionValidateNormalizer);
+}
+
 void
 _PG_init(void)
 {
@@ -170,6 +275,8 @@ _PG_init(void)
 	GRN_TEXT_INIT(&inspectBuffer, 0);
 
 	GrnEnsureDatabase();
+
+	GrnInitializeOptions();
 }
 
 static int
@@ -364,10 +471,26 @@ GrnCreate(Relation index, grn_obj **idsTable,
 							  grn_ctx_at(ctx, typeID));
 	if (typeID == GRN_DB_SHORT_TEXT)
 	{
-		grn_obj_set_info(ctx, *lexicon, GRN_INFO_NORMALIZER,
-						 GrnLookup("NormalizerAuto", WARNING));
-		grn_obj_set_info(ctx, *lexicon, GRN_INFO_DEFAULT_TOKENIZER,
-						 grn_ctx_at(ctx, GRN_DB_BIGRAM));
+		GrnOptions *options;
+		const char *tokenizerName = PGRN_DEFAULT_TOKENIZER;
+		const char *normalizerName = PGRN_DEFAULT_NORMALIZER;
+
+		options = (GrnOptions *)(index->rd_options);
+		if (options)
+		{
+			tokenizerName = (const char *)(options) + options->tokenizerOffset;
+			normalizerName = (const char *)(options) + options->normalizerOffset;
+		}
+		if (tokenizerName && tokenizerName[0])
+		{
+			grn_obj_set_info(ctx, *lexicon, GRN_INFO_DEFAULT_TOKENIZER,
+							 GrnLookup(tokenizerName, ERROR));
+		}
+		if (normalizerName && normalizerName[0])
+		{
+			grn_obj_set_info(ctx, *lexicon, GRN_INFO_NORMALIZER,
+							 GrnLookup(normalizerName, ERROR));
+		}
 	}
 
 	{
@@ -1238,5 +1361,22 @@ pgroonga_costestimate(PG_FUNCTION_ARGS)
 Datum
 pgroonga_options(PG_FUNCTION_ARGS)
 {
-	return (Datum) 0;
+	Datum reloptions = PG_GETARG_DATUM(0);
+	bool validate = PG_GETARG_BOOL(1);
+	relopt_value *options;
+	GrnOptions *grnOptions;
+	int nOptions;
+	const relopt_parse_elt optionsMap[] = {
+		{"tokenizer", RELOPT_TYPE_STRING, offsetof(GrnOptions, tokenizerOffset)},
+		{"normalizer", RELOPT_TYPE_STRING, offsetof(GrnOptions, normalizerOffset)}
+	};
+
+	options = parseRelOptions(reloptions, validate, GrnReloptionKind,
+							  &nOptions);
+	grnOptions = allocateReloptStruct(sizeof(GrnOptions), options, nOptions);
+	fillRelOptions(grnOptions, sizeof(GrnOptions), options, nOptions,
+				   validate, optionsMap, lengthof(optionsMap));
+	pfree(options);
+
+	PG_RETURN_BYTEA_P(grnOptions);
 }

  Modified: pgroonga.h (+8 -0)
===================================================================
--- pgroonga.h    2015-02-01 12:45:35 +0900 (1a25f37)
+++ pgroonga.h    2015-02-01 13:04:23 +0900 (b7cc05e)
@@ -13,6 +13,14 @@
 #  define PGDLLEXPORT
 #endif
 
+/* Default values */
+#ifndef PGRN_DEFAULT_TOKENIZER
+#  define PGRN_DEFAULT_TOKENIZER "TokenBigram"
+#endif
+#ifndef PGRN_DEFAULT_NORMALIZER
+#  define PGRN_DEFAULT_NORMALIZER "NormalizerAuto"
+#endif
+
 /* Groonga strategy types */
 #define GrnLessStrategyNumber			1	/* operator < */
 #define GrnLessEqualStrategyNumber		2	/* operator <= */

  Added: sql/text/options/normalizer/none.sql (+26 -0) 100644
===================================================================
--- /dev/null
+++ sql/text/options/normalizer/none.sql    2015-02-01 13:04:23 +0900 (70faf85)
@@ -0,0 +1,26 @@
+CREATE TABLE memos (
+  id integer,
+  content text
+);
+
+INSERT INTO memos VALUES (1, 'PostgreSQL is a RDBMS.');
+INSERT INTO memos VALUES (2, 'Groonga is fast full text search engine.');
+INSERT INTO memos VALUES (3, 'PGroonga is a PostgreSQL extension that uses Groonga.');
+
+CREATE INDEX grnindex ON memos
+ USING pgroonga (content)
+  WITH (normalizer = '');
+
+SET enable_seqscan = off;
+SET enable_indexscan = on;
+SET enable_bitmapscan = off;
+
+SELECT id, content
+  FROM memos
+ WHERE content %% 'postgresql';
+
+SELECT id, content
+  FROM memos
+ WHERE content %% 'PostgreSQL';
+
+DROP TABLE memos;

  Added: sql/text/options/tokenizer/custom.sql (+26 -0) 100644
===================================================================
--- /dev/null
+++ sql/text/options/tokenizer/custom.sql    2015-02-01 13:04:23 +0900 (0c95e87)
@@ -0,0 +1,26 @@
+CREATE TABLE memos (
+  id integer,
+  tags text
+);
+
+INSERT INTO memos VALUES (1, 'PostgreSQL94 RDBMS');
+INSERT INTO memos VALUES (2, 'PostgreSQL Groonga');
+INSERT INTO memos VALUES (3, 'Groonga PGroonga Mroonga');
+
+CREATE INDEX grnindex ON memos
+ USING pgroonga (tags)
+  WITH (tokenizer = 'TokenDelimit');
+
+SET enable_seqscan = off;
+SET enable_indexscan = on;
+SET enable_bitmapscan = off;
+
+SELECT id, tags
+  FROM memos
+ WHERE tags %% 'PostgreSQL';
+
+SELECT id, tags
+  FROM memos
+ WHERE tags %% 'PostgreSQL94';
+
+DROP TABLE memos;

  Added: sql/text/options/tokenizer/none.sql (+26 -0) 100644
===================================================================
--- /dev/null
+++ sql/text/options/tokenizer/none.sql    2015-02-01 13:04:23 +0900 (5984d8f)
@@ -0,0 +1,26 @@
+CREATE TABLE memos (
+  id integer,
+  tag text
+);
+
+INSERT INTO memos VALUES (1, 'PostgreSQL');
+INSERT INTO memos VALUES (2, 'PostgreSQL Groonga');
+INSERT INTO memos VALUES (3, 'Groonga');
+
+CREATE INDEX grnindex ON memos
+ USING pgroonga (tag)
+  WITH (tokenizer = '');
+
+SET enable_seqscan = off;
+SET enable_indexscan = on;
+SET enable_bitmapscan = off;
+
+SELECT id, tag
+  FROM memos
+ WHERE tag %% 'PostgreSQL';
+
+SELECT id, tag
+  FROM memos
+ WHERE tag %% 'PostgreSQL Groonga';
+
+DROP TABLE memos;
-------------- next part --------------
An HTML attachment was scrubbed...
Download 



More information about the Groonga-commit mailing list
Back to archive index