Kouhei Sutou
null+****@clear*****
Sun Feb 1 13:04:23 JST 2015
Kouhei Sutou 2015-02-01 13:04:23 +0900 (Sun, 01 Feb 2015) New Revision: 0595620761649ad6350caa72939eac11ef7b46ef https://github.com/pgroonga/pgroonga/commit/0595620761649ad6350caa72939eac11ef7b46ef Message: Support custom tokenizer and normalizer by WITH Added files: expected/text/options/normalizer/none.out expected/text/options/tokenizer/custom.out expected/text/options/tokenizer/none.out sql/text/options/normalizer/none.sql sql/text/options/tokenizer/custom.sql sql/text/options/tokenizer/none.sql Modified files: Makefile pgroonga.c pgroonga.h Modified: Makefile (+6 -0) =================================================================== --- Makefile 2015-02-01 12:45:35 +0900 (88ead00) +++ Makefile 2015-02-01 13:04:23 +0900 (26bcfaa) @@ -29,6 +29,8 @@ installcheck: results/text/single/contain installcheck: results/text/single/match installcheck: results/text/single/and installcheck: results/text/multiple/contain +installcheck: results/text/options/tokenizer +installcheck: results/text/options/normalizer results/text/single/contain: @mkdir -p results/text/single/contain @@ -38,3 +40,7 @@ results/text/single/and: @mkdir -p results/text/single/and results/text/multiple/contain: @mkdir -p results/text/multiple/contain +results/text/options/tokenizer: + @mkdir -p results/text/options/tokenizer +results/text/options/normalizer: + @mkdir -p results/text/options/normalizer Added: expected/text/options/normalizer/none.out (+30 -0) 100644 =================================================================== --- /dev/null +++ expected/text/options/normalizer/none.out 2015-02-01 13:04:23 +0900 (26b4ca6) @@ -0,0 +1,30 @@ +CREATE TABLE memos ( + id integer, + content text +); +INSERT INTO memos VALUES (1, 'PostgreSQL is a RDBMS.'); +INSERT INTO memos VALUES (2, 'Groonga is fast full text search engine.'); +INSERT INTO memos VALUES (3, 'PGroonga is a PostgreSQL extension that uses Groonga.'); +CREATE INDEX grnindex ON memos + USING pgroonga (content) + WITH (normalizer = ''); +SET 
enable_seqscan = off; +SET enable_indexscan = on; +SET enable_bitmapscan = off; +SELECT id, content + FROM memos + WHERE content %% 'postgresql'; + id | content +----+--------- +(0 rows) + +SELECT id, content + FROM memos + WHERE content %% 'PostgreSQL'; + id | content +----+------------------------------------------------------- + 1 | PostgreSQL is a RDBMS. + 3 | PGroonga is a PostgreSQL extension that uses Groonga. +(2 rows) + +DROP TABLE memos; Added: expected/text/options/tokenizer/custom.out (+30 -0) 100644 =================================================================== --- /dev/null +++ expected/text/options/tokenizer/custom.out 2015-02-01 13:04:23 +0900 (9be277d) @@ -0,0 +1,30 @@ +CREATE TABLE memos ( + id integer, + tags text +); +INSERT INTO memos VALUES (1, 'PostgreSQL94 RDBMS'); +INSERT INTO memos VALUES (2, 'PostgreSQL Groonga'); +INSERT INTO memos VALUES (3, 'Groonga PGroonga Mroonga'); +CREATE INDEX grnindex ON memos + USING pgroonga (tags) + WITH (tokenizer = 'TokenDelimit'); +SET enable_seqscan = off; +SET enable_indexscan = on; +SET enable_bitmapscan = off; +SELECT id, tags + FROM memos + WHERE tags %% 'PostgreSQL'; + id | tags +----+-------------------- + 2 | PostgreSQL Groonga +(1 row) + +SELECT id, tags + FROM memos + WHERE tags %% 'PostgreSQL94'; + id | tags +----+-------------------- + 1 | PostgreSQL94 RDBMS +(1 row) + +DROP TABLE memos; Added: expected/text/options/tokenizer/none.out (+30 -0) 100644 =================================================================== --- /dev/null +++ expected/text/options/tokenizer/none.out 2015-02-01 13:04:23 +0900 (9888bc5) @@ -0,0 +1,30 @@ +CREATE TABLE memos ( + id integer, + tag text +); +INSERT INTO memos VALUES (1, 'PostgreSQL'); +INSERT INTO memos VALUES (2, 'PostgreSQL Groonga'); +INSERT INTO memos VALUES (3, 'Groonga'); +CREATE INDEX grnindex ON memos + USING pgroonga (tag) + WITH (tokenizer = ''); +SET enable_seqscan = off; +SET enable_indexscan = on; +SET enable_bitmapscan = off; +SELECT id, 
tag + FROM memos + WHERE tag %% 'PostgreSQL'; + id | tag +----+------------ + 1 | PostgreSQL +(1 row) + +SELECT id, tag + FROM memos + WHERE tag %% 'PostgreSQL Groonga'; + id | tag +----+-------------------- + 2 | PostgreSQL Groonga +(1 row) + +DROP TABLE memos; Modified: pgroonga.c (+147 -7) =================================================================== --- pgroonga.c 2015-02-01 12:45:35 +0900 (30783ad) +++ pgroonga.c 2015-02-01 13:04:23 +0900 (11be5d6) @@ -5,6 +5,7 @@ #include "pgroonga.h" +#include <access/reloptions.h> #include <access/relscan.h> #include <catalog/catalog.h> #include <catalog/index.h> @@ -21,6 +22,15 @@ PG_MODULE_MAGIC; +static relopt_kind GrnReloptionKind; + +typedef struct GrnOptions +{ + int32 vl_len_; + int tokenizerOffset; + int normalizerOffset; +} GrnOptions; + typedef struct GrnBuildStateData { grn_obj *idsTable; @@ -74,7 +84,6 @@ static grn_ctx *ctx = &grnContext; static grn_obj buffer; static grn_obj inspectBuffer; -#ifdef PGROONGA_DEBUG static const char * GrnInspect(grn_obj *object) { @@ -83,7 +92,6 @@ GrnInspect(grn_obj *object) GRN_TEXT_PUTC(ctx, &inspectBuffer, '\0'); return GRN_TEXT_VALUE(&inspectBuffer); } -#endif static grn_encoding GrnGetEncoding(void) @@ -152,6 +160,103 @@ GrnOnProcExit(int code, Datum arg) grn_fin(); } +static bool +GrnIsTokenizer(grn_obj *object) +{ + if (object->header.type != GRN_PROC) + return false; + + if (grn_proc_get_type(ctx, object) != GRN_PROC_TOKENIZER) + return false; + + return true; +} + +static void +GrnOptionValidateTokenizer(char *name) +{ + grn_obj *tokenizer; + size_t name_length; + + name_length = strlen(name); + if (name_length == 0) + return; + + tokenizer = grn_ctx_get(ctx, name, name_length); + if (!tokenizer) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("pgroonga: nonexistent tokenizer: <%s>", + name))); + } + + if (!GrnIsTokenizer(tokenizer)) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("pgroonga: not tokenizer: 
<%s>: %s", + name, GrnInspect(tokenizer)))); + } +} + +static bool +GrnIsNormalizer(grn_obj *object) +{ + if (object->header.type != GRN_PROC) + return false; + + if (grn_proc_get_type(ctx, object) != GRN_PROC_NORMALIZER) + return false; + + return true; +} + +static void +GrnOptionValidateNormalizer(char *name) +{ + grn_obj *normalizer; + size_t name_length; + + name_length = strlen(name); + if (name_length == 0) + return; + + normalizer = grn_ctx_get(ctx, name, name_length); + if (!normalizer) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("pgroonga: nonexistent normalizer: <%s>", + name))); + } + + if (!GrnIsNormalizer(normalizer)) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("pgroonga: not normalizer: <%s>: %s", + name, GrnInspect(normalizer)))); + } +} + +static void +GrnInitializeOptions(void) +{ + GrnReloptionKind = add_reloption_kind(); + + add_string_reloption(GrnReloptionKind, + "tokenizer", + "Tokenizer name to be used for full-text search", + PGRN_DEFAULT_TOKENIZER, + GrnOptionValidateTokenizer); + add_string_reloption(GrnReloptionKind, + "normalizer", + "Normalizer name to be used for full-text search", + PGRN_DEFAULT_NORMALIZER, + GrnOptionValidateNormalizer); +} + void _PG_init(void) { @@ -170,6 +275,8 @@ _PG_init(void) GRN_TEXT_INIT(&inspectBuffer, 0); GrnEnsureDatabase(); + + GrnInitializeOptions(); } static int @@ -364,10 +471,26 @@ GrnCreate(Relation index, grn_obj **idsTable, grn_ctx_at(ctx, typeID)); if (typeID == GRN_DB_SHORT_TEXT) { - grn_obj_set_info(ctx, *lexicon, GRN_INFO_NORMALIZER, - GrnLookup("NormalizerAuto", WARNING)); - grn_obj_set_info(ctx, *lexicon, GRN_INFO_DEFAULT_TOKENIZER, - grn_ctx_at(ctx, GRN_DB_BIGRAM)); + GrnOptions *options; + const char *tokenizerName = PGRN_DEFAULT_TOKENIZER; + const char *normalizerName = PGRN_DEFAULT_NORMALIZER; + + options = (GrnOptions *)(index->rd_options); + if (options) + { + tokenizerName = (const char *)(options) + 
options->tokenizerOffset; + normalizerName = (const char *)(options) + options->normalizerOffset; + } + if (tokenizerName && tokenizerName[0]) + { + grn_obj_set_info(ctx, *lexicon, GRN_INFO_DEFAULT_TOKENIZER, + GrnLookup(tokenizerName, ERROR)); + } + if (normalizerName && normalizerName[0]) + { + grn_obj_set_info(ctx, *lexicon, GRN_INFO_NORMALIZER, + GrnLookup(normalizerName, ERROR)); + } } { @@ -1238,5 +1361,22 @@ pgroonga_costestimate(PG_FUNCTION_ARGS) Datum pgroonga_options(PG_FUNCTION_ARGS) { - return (Datum) 0; + Datum reloptions = PG_GETARG_DATUM(0); + bool validate = PG_GETARG_BOOL(1); + relopt_value *options; + GrnOptions *grnOptions; + int nOptions; + const relopt_parse_elt optionsMap[] = { + {"tokenizer", RELOPT_TYPE_STRING, offsetof(GrnOptions, tokenizerOffset)}, + {"normalizer", RELOPT_TYPE_STRING, offsetof(GrnOptions, normalizerOffset)} + }; + + options = parseRelOptions(reloptions, validate, GrnReloptionKind, + &nOptions); + grnOptions = allocateReloptStruct(sizeof(GrnOptions), options, nOptions); + fillRelOptions(grnOptions, sizeof(GrnOptions), options, nOptions, + validate, optionsMap, lengthof(optionsMap)); + pfree(options); + + PG_RETURN_BYTEA_P(grnOptions); } Modified: pgroonga.h (+8 -0) =================================================================== --- pgroonga.h 2015-02-01 12:45:35 +0900 (1a25f37) +++ pgroonga.h 2015-02-01 13:04:23 +0900 (b7cc05e) @@ -13,6 +13,14 @@ # define PGDLLEXPORT #endif +/* Default values */ +#ifndef PGRN_DEFAULT_TOKENIZER +# define PGRN_DEFAULT_TOKENIZER "TokenBigram" +#endif +#ifndef PGRN_DEFAULT_NORMALIZER +# define PGRN_DEFAULT_NORMALIZER "NormalizerAuto" +#endif + /* Groonga strategy types */ #define GrnLessStrategyNumber 1 /* operator < */ #define GrnLessEqualStrategyNumber 2 /* operator <= */ Added: sql/text/options/normalizer/none.sql (+26 -0) 100644 =================================================================== --- /dev/null +++ sql/text/options/normalizer/none.sql 2015-02-01 13:04:23 +0900 (70faf85) 
@@ -0,0 +1,26 @@ +CREATE TABLE memos ( + id integer, + content text +); + +INSERT INTO memos VALUES (1, 'PostgreSQL is a RDBMS.'); +INSERT INTO memos VALUES (2, 'Groonga is fast full text search engine.'); +INSERT INTO memos VALUES (3, 'PGroonga is a PostgreSQL extension that uses Groonga.'); + +CREATE INDEX grnindex ON memos + USING pgroonga (content) + WITH (normalizer = ''); + +SET enable_seqscan = off; +SET enable_indexscan = on; +SET enable_bitmapscan = off; + +SELECT id, content + FROM memos + WHERE content %% 'postgresql'; + +SELECT id, content + FROM memos + WHERE content %% 'PostgreSQL'; + +DROP TABLE memos; Added: sql/text/options/tokenizer/custom.sql (+26 -0) 100644 =================================================================== --- /dev/null +++ sql/text/options/tokenizer/custom.sql 2015-02-01 13:04:23 +0900 (0c95e87) @@ -0,0 +1,26 @@ +CREATE TABLE memos ( + id integer, + tags text +); + +INSERT INTO memos VALUES (1, 'PostgreSQL94 RDBMS'); +INSERT INTO memos VALUES (2, 'PostgreSQL Groonga'); +INSERT INTO memos VALUES (3, 'Groonga PGroonga Mroonga'); + +CREATE INDEX grnindex ON memos + USING pgroonga (tags) + WITH (tokenizer = 'TokenDelimit'); + +SET enable_seqscan = off; +SET enable_indexscan = on; +SET enable_bitmapscan = off; + +SELECT id, tags + FROM memos + WHERE tags %% 'PostgreSQL'; + +SELECT id, tags + FROM memos + WHERE tags %% 'PostgreSQL94'; + +DROP TABLE memos; Added: sql/text/options/tokenizer/none.sql (+26 -0) 100644 =================================================================== --- /dev/null +++ sql/text/options/tokenizer/none.sql 2015-02-01 13:04:23 +0900 (5984d8f) @@ -0,0 +1,26 @@ +CREATE TABLE memos ( + id integer, + tag text +); + +INSERT INTO memos VALUES (1, 'PostgreSQL'); +INSERT INTO memos VALUES (2, 'PostgreSQL Groonga'); +INSERT INTO memos VALUES (3, 'Groonga'); + +CREATE INDEX grnindex ON memos + USING pgroonga (tag) + WITH (tokenizer = ''); + +SET enable_seqscan = off; +SET enable_indexscan = on; +SET enable_bitmapscan 
= off; + +SELECT id, tag + FROM memos + WHERE tag %% 'PostgreSQL'; + +SELECT id, tag + FROM memos + WHERE tag %% 'PostgreSQL Groonga'; + +DROP TABLE memos; -------------- next part -------------- HTMLの添付ファイルを保管しました...Download