null+****@clear*****
null+****@clear*****
2012年 6月 8日 (金) 16:39:06 JST
Kouhei Sutou 2012-06-08 16:39:06 +0900 (Fri, 08 Jun 2012)
New Revision: db4408268daa4e055d095d799662b33da1b58adf
Log:
Create grn_string that supports custom normalization
grn_str is deprecated. Use grn_string instead.
refs #1164
Added files:
lib/string.c
lib/string_in.h
Modified files:
include/groonga.h
lib/dat.cpp
lib/db.c
lib/expr.c
lib/ii.c
lib/pat.c
lib/snip.c
lib/snip.h
lib/sources.am
lib/token.c
lib/token.h
lib/tokenizer.c
Modified: include/groonga.h (+52 -1)
===================================================================
--- include/groonga.h 2012-06-08 17:38:04 +0900 (76ca58e)
+++ include/groonga.h 2012-06-08 16:39:06 +0900 (5c0d626)
@@ -414,6 +414,7 @@ typedef unsigned short int grn_obj_flags;
#define GRN_ACCESSOR_VIEW (0x0a)
#define GRN_SNIP (0x0b)
#define GRN_PATSNIP (0x0c)
+#define GRN_STRING (0x0d)
#define GRN_CURSOR_TABLE_HASH_KEY (0x10)
#define GRN_CURSOR_TABLE_PAT_KEY (0x11)
#define GRN_CURSOR_TABLE_DAT_KEY (0x12)
@@ -2435,7 +2436,7 @@ GRN_API void grn_time_now(grn_ctx *ctx, grn_obj *obj);
grn_bulk_write((ctx), (obj), (char *)&_val, sizeof(grn_obj *));\
} while (0)
-/* grn_str */
+/* grn_str: deprecated. use grn_string instead. */
typedef struct {
const char *orig;
@@ -2458,6 +2459,56 @@ GRN_API grn_str *grn_str_open(grn_ctx *ctx, const char *str, unsigned int str_le
int flags);
GRN_API grn_rc grn_str_close(grn_ctx *ctx, grn_str *nstr);
+/* grn_string: string object with optional normalization; successor of the deprecated grn_str. Release with grn_obj_close(). */
+
+#define GRN_STRING_REMOVE_BLANK (0x01<<0) /* ask the normalizer to drop blank characters */
+#define GRN_STRING_WITH_TYPES (0x01<<1) /* ask the normalizer to record one type byte per character */
+#define GRN_STRING_WITH_CHECKS (0x01<<2) /* ask the normalizer to record the checks array */
+
+#define GRN_NORMALIZER_AUTO ((grn_obj *)1) /* sentinel value, not a real object; presumably selects a built-in normalizer -- TODO confirm */
+
+#define GRN_CHAR_BLANK 0x80 /* flag bit OR'ed into a character type byte */
+#define GRN_CHAR_IS_BLANK(c) ((c) & (GRN_CHAR_BLANK)) /* non-zero when the blank flag is set in c */
+#define GRN_CHAR_TYPE(c) ((c) & 0x7f) /* c with the blank flag masked off */
+
+typedef enum {
+ grn_char_null = 0, /* also used as the terminator of a types array */
+ grn_char_alpha,
+ grn_char_digit,
+ grn_char_symbol,
+ grn_char_hiragana,
+ grn_char_katakana,
+ grn_char_kanji,
+ grn_char_others /* anything not classified as one of the above */
+} grn_char_type;
+
+GRN_API grn_obj *grn_string_open(grn_ctx *ctx, /* callers treat a NULL result as failure */
+ const char *string,
+ unsigned int length_in_bytes,
+ grn_obj *normalizer, int flags); /* normalizer: NULL or GRN_NORMALIZER_AUTO in current callers; flags: GRN_STRING_* bits */
+GRN_API grn_rc grn_string_get_original(grn_ctx *ctx, grn_obj *string, /* reports the text passed to grn_string_open() */
+ const char **original,
+ unsigned int *length_in_bytes); /* callers pass NULL when the length is not needed */
+GRN_API int grn_string_get_flags(grn_ctx *ctx, grn_obj *string); /* returns GRN_STRING_* bits */
+GRN_API grn_rc grn_string_get_normalized(grn_ctx *ctx, grn_obj *string, /* reports the normalized text */
+ const char **normalized,
+ unsigned int *length_in_bytes,
+ unsigned int *n_characters); /* callers pass NULL for outputs they do not need */
+GRN_API grn_rc grn_string_set_normalized(grn_ctx *ctx, grn_obj *string, /* for normalizer implementations; buffer ownership is not visible here -- TODO confirm */
+ char *normalized,
+ unsigned int length_in_bytes,
+ unsigned int n_characters);
+GRN_API const short *grn_string_get_checks(grn_ctx *ctx, grn_obj *string); /* presumably NULL unless GRN_STRING_WITH_CHECKS was given -- TODO confirm */
+GRN_API grn_rc grn_string_set_checks(grn_ctx *ctx,
+ grn_obj *string,
+ short *checks);
+GRN_API const unsigned char *grn_string_get_types(grn_ctx *ctx, grn_obj *string); /* each byte: grn_char_type, optionally OR'ed with GRN_CHAR_BLANK */
+GRN_API grn_rc grn_string_set_types(grn_ctx *ctx,
+ grn_obj *string,
+ unsigned char *types);
+GRN_API grn_encoding grn_string_get_encoding(grn_ctx *ctx, grn_obj *string);
+
+
GRN_API int grn_charlen(grn_ctx *ctx, const char *str, const char *end);
/* expr */
Modified: lib/dat.cpp (+12 -9)
===================================================================
--- lib/dat.cpp 2012-06-08 17:38:04 +0900 (a20f398)
+++ lib/dat.cpp 2012-06-08 16:39:06 +0900 (552da69)
@@ -672,15 +672,17 @@ grn_dat_scan(grn_ctx *ctx, grn_dat *dat, const char *str,
int num_scan_hits = 0;
try {
if (dat->obj.header.flags & GRN_OBJ_KEY_NORMALIZE) {
- grn_str * const normalized_str = grn_str_open(
- ctx, str, str_size, GRN_STR_NORMALIZE | GRN_STR_WITH_CHECKS);
- if (!normalized_str) {
- fprintf(stderr, "error: grn_str_open() failed!\n");
+ grn_obj *normalizer = GRN_NORMALIZER_AUTO;
+ int flags = GRN_STRING_WITH_CHECKS;
+ grn_obj * const normalized_string = grn_string_open(ctx, str, str_size,
+ normalizer,
+ flags);
+ if (!normalized_string) {
+ fprintf(stderr, "error: grn_string_open() failed!\n");
return -1;
}
- str = normalized_str->norm;
- str_size = normalized_str->norm_blen;
- const short *checks = normalized_str->checks;
+ grn_string_get_normalized(ctx, normalized_string, &str, &str_size, NULL);
+ const short *checks = grn_string_get_checks(ctx, normalized_string);
unsigned int offset = 0;
while (str_size) {
if (*checks) {
@@ -717,9 +719,10 @@ grn_dat_scan(grn_ctx *ctx, grn_dat *dat, const char *str,
++checks;
}
if (str_rest) {
- *str_rest = normalized_str->orig + offset;
+ grn_string_get_original(ctx, normalized_string, str_rest, NULL);
+ *str_rest += offset;
}
- grn_str_close(ctx, normalized_str);
+ grn_obj_close(ctx, normalized_string);
} else {
const char * const begin = str;
while (str_size) {
Modified: lib/db.c (+11 -5)
===================================================================
--- lib/db.c 2012-06-08 17:38:04 +0900 (91d4ca4)
+++ lib/db.c 2012-06-08 16:39:06 +0900 (dcbde8e)
@@ -26,6 +26,7 @@
#include "plugin_in.h"
#include "geo.h"
#include "snip.h"
+#include "string_in.h"
#include "util.h"
#include <string.h>
#include <float.h>
@@ -34,12 +35,14 @@
#define WITH_NORMALIZE(table,key,key_size,block) do {\
if ((table)->obj.header.flags & GRN_OBJ_KEY_NORMALIZE) {\
- grn_str *nstr;\
- if ((nstr = grn_str_open(ctx, key, key_size, GRN_STR_NORMALIZE))) { \
- char *key = nstr->norm;\
- unsigned int key_size = nstr->norm_blen;\
+ grn_obj *nstr;\
+ if ((nstr = grn_string_open(ctx, key, key_size,\
+ GRN_NORMALIZER_AUTO, 0))) {\
+ const char *key;\
+ unsigned int key_size;\
+ grn_string_get_normalized(ctx, nstr, &key, &key_size, NULL);\
block\
- grn_str_close(ctx, nstr);\
+ grn_obj_close(ctx, nstr);\
}\
} else {\
block\
@@ -6926,6 +6929,9 @@ grn_obj_close(grn_ctx *ctx, grn_obj *obj)
case GRN_SNIP :
rc = grn_snip_close_real(ctx, (grn_snip *)obj);
break;
+ case GRN_STRING :
+ rc = grn_string_close(ctx, obj);
+ break;
case GRN_CURSOR_TABLE_PAT_KEY :
grn_pat_cursor_close(ctx, (grn_pat_cursor *)obj);
break;
Modified: lib/expr.c (+12 -7)
===================================================================
--- lib/expr.c 2012-06-08 17:38:04 +0900 (ee97a01)
+++ lib/expr.c 2012-06-08 16:39:06 +0900 (cd77050)
@@ -2240,13 +2240,14 @@ grn_proc_call(grn_ctx *ctx, grn_obj *proc, int nargs, grn_obj *caller)
void
pseudo_query_scan(grn_ctx *ctx, grn_obj *x, grn_obj *y, grn_obj *res)
{
- grn_str *a = NULL, *b = NULL;
+ grn_obj *a = NULL, *b = NULL;
switch (x->header.domain) {
case GRN_DB_SHORT_TEXT:
case GRN_DB_TEXT:
case GRN_DB_LONG_TEXT:
- a = grn_str_open(ctx, GRN_TEXT_VALUE(x), GRN_TEXT_LEN(x), GRN_STR_NORMALIZE);
+ a = grn_string_open(ctx, GRN_TEXT_VALUE(x), GRN_TEXT_LEN(x),
+ GRN_NORMALIZER_AUTO, 0);
break;
default:
break;
@@ -2256,23 +2257,27 @@ pseudo_query_scan(grn_ctx *ctx, grn_obj *x, grn_obj *y, grn_obj *res)
case GRN_DB_SHORT_TEXT:
case GRN_DB_TEXT:
case GRN_DB_LONG_TEXT:
- b = grn_str_open(ctx, GRN_TEXT_VALUE(y), GRN_TEXT_LEN(y), GRN_STR_NORMALIZE);
+ b = grn_string_open(ctx, GRN_TEXT_VALUE(y), GRN_TEXT_LEN(y),
+ GRN_NORMALIZER_AUTO, 0);
break;
default:
break;
}
/* normalized str doesn't contain '\0'. */
- if (a && b && strstr(a->norm, b->norm)) {
- GRN_INT32_SET(ctx, res, 1);
+ if (a && b) {
+ const char *a_norm, *b_norm;
+ grn_string_get_normalized(ctx, a, &a_norm, NULL, NULL);
+ grn_string_get_normalized(ctx, b, &b_norm, NULL, NULL);
+ GRN_INT32_SET(ctx, res, strstr(a_norm, b_norm) != NULL);
} else {
GRN_INT32_SET(ctx, res, 0);
}
res->header.type = GRN_BULK;
res->header.domain = GRN_DB_INT32;
- if (a) { grn_str_close(ctx, a); }
- if (b) { grn_str_close(ctx, b); }
+ if (a) { grn_obj_close(ctx, a); }
+ if (b) { grn_obj_close(ctx, b); }
}
grn_obj *
Modified: lib/ii.c (+9 -5)
===================================================================
--- lib/ii.c 2012-06-08 17:38:04 +0900 (c55b1ef)
+++ lib/ii.c 2012-06-08 16:39:06 +0900 (3e27e97)
@@ -5776,7 +5776,9 @@ grn_ii_term_extract(grn_ctx *ctx, grn_ii *ii, const char *string,
grn_rset_posinfo pi;
grn_id tid;
const char *p, *pe;
- grn_str *nstr;
+ grn_obj *nstr;
+ const char *normalized;
+ unsigned int normalized_length_in_bytes;
grn_ii_cursor *c;
grn_ii_posting *pos;
int skip, rep, policy;
@@ -5785,7 +5787,7 @@ grn_ii_term_extract(grn_ctx *ctx, grn_ii *ii, const char *string,
if (!ii || !string || !string_len || !s || !optarg) {
return GRN_INVALID_ARGUMENT;
}
- if (!(nstr = grn_str_open(ctx, string, string_len, 0))) {
+ if (!(nstr = grn_string_open(ctx, string, string_len, NULL, 0))) {
return GRN_INVALID_ARGUMENT;
}
policy = optarg->max_interval;
@@ -5801,7 +5803,9 @@ grn_ii_term_extract(grn_ctx *ctx, grn_ii *ii, const char *string,
rep = (s->record_unit == grn_rec_position || s->subrec_unit == grn_rec_position);
*/
rep = 0;
- for (p = nstr->norm, pe = p + nstr->norm_blen; p < pe; p += skip) {
+ grn_string_get_normalized(ctx, nstr, &normalized, &normalized_length_in_bytes,
+ NULL);
+ for (p = normalized, pe = p + normalized_length_in_bytes; p < pe; p += skip) {
if ((tid = grn_table_lcp_search(ctx, ii->lexicon, p, pe - p))) {
if (policy == TERM_EXTRACT_EACH_POST) {
if (!(skip = grn_table_get_key(ctx, ii->lexicon, tid, NULL, 0))) { break; }
@@ -5827,7 +5831,7 @@ grn_ii_term_extract(grn_ctx *ctx, grn_ii *ii, const char *string,
while (grn_ii_cursor_next(ctx, c)) {
if (policy == TERM_EXTRACT_EACH_POST) {
pi.rid = c->post->rid;
- pi.sid = p - nstr->norm;
+ pi.sid = p - normalized;
res_add(ctx, s, &pi, pi.sid + 1, op);
} else {
pos = c->post;
@@ -5843,7 +5847,7 @@ grn_ii_term_extract(grn_ctx *ctx, grn_ii *ii, const char *string,
}
}
}
- grn_str_close(ctx, nstr);
+ grn_obj_close(ctx, nstr);
return rc;
}
Modified: lib/pat.c (+13 -5)
===================================================================
--- lib/pat.c 2012-06-08 17:38:04 +0900 (6e0c377)
+++ lib/pat.c 2012-06-08 16:39:06 +0900 (6722ab9)
@@ -1527,11 +1527,16 @@ grn_pat_scan(grn_ctx *ctx, grn_pat *pat, const char *str, unsigned int str_len,
int n = 0;
grn_id tid;
if (pat->obj.header.flags & GRN_OBJ_KEY_NORMALIZE) {
- grn_str *nstr = grn_str_open(ctx, str, str_len, GRN_STR_NORMALIZE|GRN_STR_WITH_CHECKS);
+ grn_obj *nstr = grn_string_open(ctx, str, str_len,
+ GRN_NORMALIZER_AUTO, GRN_STRING_WITH_CHECKS);
if (nstr) {
- int16_t *cp = nstr->checks;
+ const short *cp = grn_string_get_checks(ctx, nstr);
unsigned int offset = 0, offset0 = 0;
- const char *sp = nstr->norm, *se = nstr->norm + nstr->norm_blen;
+ unsigned int normalized_length_in_bytes;
+ const char *sp, *se;
+ grn_string_get_normalized(ctx, nstr, &sp, &normalized_length_in_bytes,
+ NULL);
+ se = sp + normalized_length_in_bytes;
while (n < sh_size) {
if ((tid = grn_pat_lcp_search(ctx, pat, sp, se - sp))) {
uint32_t len;
@@ -1552,8 +1557,11 @@ grn_pat_scan(grn_ctx *ctx, grn_pat *pat, const char *str, unsigned int str_len,
}
if (se <= sp) { offset = str_len; break; }
}
- if (rest) { *rest = nstr->orig + offset; }
- grn_str_close(ctx, nstr);
+ if (rest) {
+ grn_string_get_original(ctx, nstr, rest, NULL);
+ *rest += offset;
+ }
+ grn_obj_close(ctx, nstr);
} else {
n = -1;
if (rest) { *rest = str; }
Modified: lib/snip.c (+55 -36)
===================================================================
--- lib/snip.c 2012-06-08 17:38:04 +0900 (02a5c10)
+++ lib/snip.c 2012-06-08 16:39:06 +0900 (ffadc90)
@@ -83,31 +83,31 @@ grn_bm_preBmBc(const unsigned char *x, size_t m, size_t *bmBc)
}
#define GRN_BM_COMPARE do { \
- if (object->checks[found]) { \
+ if (string_checks[found]) { \
size_t offset = cond->last_offset, found_alpha_head = cond->found_alpha_head; \
/* calc real offset */\
for (i = cond->last_found; i < found; i++) { \
- if (object->checks[i] > 0) { \
+ if (string_checks[i] > 0) { \
found_alpha_head = i; \
- offset += object->checks[i]; \
+ offset += string_checks[i]; \
} \
} \
/* if real offset is in a character, move it the head of the character */ \
- if (object->checks[found] < 0) { \
- offset -= object->checks[found_alpha_head]; \
+ if (string_checks[found] < 0) { \
+ offset -= string_checks[found_alpha_head]; \
cond->last_found = found_alpha_head; \
} else { \
cond->last_found = found; \
} \
cond->start_offset = cond->last_offset = offset; \
if (flags & GRN_SNIP_SKIP_LEADING_SPACES) { \
- while (cond->start_offset < object->orig_blen && \
- (i = grn_isspace(object->orig + cond->start_offset, \
- object->encoding))) { cond->start_offset += i; } \
+ while (cond->start_offset < string_original_length_in_bytes && \
+ (i = grn_isspace(string_original + cond->start_offset, \
+ string_encoding))) { cond->start_offset += i; } \
} \
for (i = cond->last_found; i < found + m; i++) { \
- if (object->checks[i] > 0) { \
- offset += object->checks[i]; \
+ if (string_checks[i] > 0) { \
+ offset += string_checks[i]; \
} \
} \
cond->end_offset = offset; \
@@ -130,7 +130,7 @@ grn_bm_preBmBc(const unsigned char *x, size_t m, size_t *bmBc)
} while (0)
void
-grn_bm_tunedbm(snip_cond *cond, grn_str *object, int flags)
+grn_bm_tunedbm(grn_ctx *ctx, snip_cond *cond, grn_obj *string, int flags)
{
register unsigned char *limit, ck;
register const unsigned char *p, *cp;
@@ -140,13 +140,25 @@ grn_bm_tunedbm(snip_cond *cond, grn_str *object, int flags)
unsigned char *y;
size_t shift, found;
- const size_t n = object->norm_blen, m = cond->keyword->norm_blen;
-
- y = (unsigned char *) object->norm;
+ const char *string_original;
+ unsigned int string_original_length_in_bytes;
+ const short *string_checks;
+ grn_encoding string_encoding;
+ const char *string_norm, *keyword_norm;
+ unsigned int n, m;
+
+ grn_string_get_original(ctx, string,
+ &string_original, &string_original_length_in_bytes);
+ string_checks = grn_string_get_checks(ctx, string);
+ string_encoding = grn_string_get_encoding(ctx, string);
+ grn_string_get_normalized(ctx, string, &string_norm, &n, NULL);
+ grn_string_get_normalized(ctx, cond->keyword, &keyword_norm, &m, NULL);
+
+ y = (unsigned char *)string_norm;
if (m == 1) {
if (n > cond->found) {
shift = 1;
- p = memchr(y + cond->found, cond->keyword->norm[0], n - cond->found);
+ p = memchr(y + cond->found, keyword_norm[0], n - cond->found);
if (p != NULL) {
found = p - y;
GRN_BM_COMPARE;
@@ -156,7 +168,7 @@ grn_bm_tunedbm(snip_cond *cond, grn_str *object, int flags)
return;
}
- x = (unsigned char *) cond->keyword->norm;
+ x = (unsigned char *)keyword_norm;
bmBc = cond->bmBc;
shift = cond->shift;
@@ -240,7 +252,7 @@ grn_snip_cond_close(grn_ctx *ctx, snip_cond *cond)
return GRN_INVALID_ARGUMENT;
}
if (cond->keyword) {
- grn_str_close(ctx, cond->keyword);
+ grn_obj_close(ctx, cond->keyword);
}
return GRN_SUCCESS;
}
@@ -249,23 +261,27 @@ grn_rc
grn_snip_cond_init(grn_ctx *ctx, snip_cond *sc, const char *keyword, unsigned int keyword_len,
grn_encoding enc, int flags)
{
- size_t norm_blen;
+ const char *norm;
+ unsigned int norm_blen;
+ grn_obj *normalizer = NULL;
int f = GRN_STR_REMOVEBLANK;
memset(sc, 0, sizeof(snip_cond));
- if (flags & GRN_SNIP_NORMALIZE) { f |= GRN_STR_NORMALIZE; }
- if (!(sc->keyword = grn_str_open(ctx, keyword, keyword_len, f))) {
- GRN_LOG(ctx, GRN_LOG_ALERT, "grn_str_open on snip_cond_init failed !");
+ if (flags & GRN_SNIP_NORMALIZE) { normalizer = GRN_NORMALIZER_AUTO; }
+ if (!(sc->keyword = grn_string_open(ctx, keyword, keyword_len,
+ normalizer, f))) {
+ GRN_LOG(ctx, GRN_LOG_ALERT,
+ "grn_string_open on snip_cond_init failed!");
return GRN_NO_MEMORY_AVAILABLE;
}
- norm_blen = sc->keyword->norm_blen; /* byte length, not cond->keyword->length */
+ grn_string_get_normalized(ctx, sc->keyword, &norm, &norm_blen, NULL);
if (!norm_blen) {
grn_snip_cond_close(ctx, sc);
return GRN_INVALID_ARGUMENT;
}
if (norm_blen != 1) {
- grn_bm_preBmBc((unsigned char *)sc->keyword->norm, norm_blen, sc->bmBc);
- sc->shift = sc->bmBc[(unsigned char)sc->keyword->norm[norm_blen - 1]];
- sc->bmBc[(unsigned char)sc->keyword->norm[norm_blen - 1]] = 0;
+ grn_bm_preBmBc((unsigned char *)norm, norm_blen, sc->bmBc);
+ sc->shift = sc->bmBc[(unsigned char)norm[norm_blen - 1]];
+ sc->bmBc[(unsigned char)norm[norm_blen - 1]] = 0;
}
return GRN_SUCCESS;
}
@@ -332,6 +348,7 @@ grn_snip_add_cond(grn_ctx *ctx, grn_snip *snip,
grn_rc rc;
int copy_tag;
snip_cond *cond;
+ unsigned int norm_blen;
if (!snip || !keyword || !keyword_len || snip->cond_len >= MAX_SNIP_COND_COUNT) {
return GRN_INVALID_ARGUMENT;
@@ -341,7 +358,8 @@ grn_snip_add_cond(grn_ctx *ctx, grn_snip *snip,
snip->encoding, snip->flags))) {
return rc;
}
- if (cond->keyword->norm_blen > snip->width) {
+ grn_string_get_normalized(ctx, cond->keyword, NULL, &norm_blen, NULL);
+ if (norm_blen > snip->width) {
grn_snip_cond_close(ctx, cond);
return GRN_INVALID_ARGUMENT;
}
@@ -490,7 +508,7 @@ exec_clean(grn_ctx *ctx, grn_snip *snip)
{
snip_cond *cond, *cond_end;
if (snip->nstr) {
- grn_str_close(ctx, snip->nstr);
+ grn_obj_close(ctx, snip->nstr);
snip->nstr = NULL;
}
snip->tag_count = 0;
@@ -522,7 +540,7 @@ grn_snip_close_real(grn_ctx *ctx, grn_snip *snip)
if (dct) { GRN_FREE((void *)dct); }
}
if (snip->nstr) {
- grn_str_close(ctx, snip->nstr);
+ grn_obj_close(ctx, snip->nstr);
}
for (cond = snip->cond, cond_end = cond + snip->cond_len;
cond < cond_end; cond++) {
@@ -547,6 +565,7 @@ grn_snip_exec(grn_ctx *ctx, grn_snip *snip, const char *string, unsigned int str
unsigned int *nresults, unsigned int *max_tagged_len)
{
size_t i;
+ grn_obj *normalizer = NULL;
int f = GRN_STR_WITH_CHECKS|GRN_STR_REMOVEBLANK;
if (!snip || !string || !nresults || !max_tagged_len) {
return GRN_INVALID_ARGUMENT;
@@ -554,15 +573,15 @@ grn_snip_exec(grn_ctx *ctx, grn_snip *snip, const char *string, unsigned int str
GRN_API_ENTER;
exec_clean(ctx, snip);
*nresults = 0;
- if (snip->flags & GRN_SNIP_NORMALIZE) { f |= GRN_STR_NORMALIZE; }
- snip->nstr = grn_str_open(ctx, string, string_len, f);
+ if (snip->flags & GRN_SNIP_NORMALIZE) { normalizer = GRN_NORMALIZER_AUTO; }
+ snip->nstr = grn_string_open(ctx, string, string_len, normalizer, f);
if (!snip->nstr) {
exec_clean(ctx, snip);
- GRN_LOG(ctx, GRN_LOG_ALERT, "grn_str_open on grn_snip_exec failed !");
+ GRN_LOG(ctx, GRN_LOG_ALERT, "grn_string_open on grn_snip_exec failed !");
GRN_API_RETURN(ctx->rc);
}
for (i = 0; i < snip->cond_len; i++) {
- grn_bm_tunedbm(snip->cond + i, snip->nstr, snip->flags);
+ grn_bm_tunedbm(ctx, snip->cond + i, snip->nstr, snip->flags);
}
{
@@ -610,7 +629,7 @@ grn_snip_exec(grn_ctx *ctx, grn_snip *snip, const char *string, unsigned int str
}
}
if (exclude_other_cond) {
- grn_bm_tunedbm(cond, snip->nstr, snip->flags);
+ grn_bm_tunedbm(ctx, cond, snip->nstr, snip->flags);
continue;
}
}
@@ -623,7 +642,7 @@ grn_snip_exec(grn_ctx *ctx, grn_snip *snip, const char *string, unsigned int str
/* check nesting to make valid HTML */
/* ToDo: allow <test><te>te</te><st>st</st></test> */
if (cond->start_offset < last_tag_end) {
- grn_bm_tunedbm(cond, snip->nstr, snip->flags);
+ grn_bm_tunedbm(ctx, cond, snip->nstr, snip->flags);
continue;
}
}
@@ -631,7 +650,7 @@ grn_snip_exec(grn_ctx *ctx, grn_snip *snip, const char *string, unsigned int str
/* If a keyword gets across a snippet, */
/* it was skipped and never to be tagged. */
cond->stopflag = SNIPCOND_ACROSS;
- grn_bm_tunedbm(cond, snip->nstr, snip->flags);
+ grn_bm_tunedbm(ctx, cond, snip->nstr, snip->flags);
} else {
found_cond = 1;
if (cond->count == 0) {
@@ -650,7 +669,7 @@ grn_snip_exec(grn_ctx *ctx, grn_snip *snip, const char *string, unsigned int str
if (++snip->tag_count >= MAX_SNIP_TAG_COUNT) {
break;
}
- grn_bm_tunedbm(cond, snip->nstr, snip->flags);
+ grn_bm_tunedbm(ctx, cond, snip->nstr, snip->flags);
}
}
if (!found_cond) {
Modified: lib/snip.h (+4 -4)
===================================================================
--- lib/snip.h 2012-06-08 17:38:04 +0900 (f328af4)
+++ lib/snip.h 2012-06-08 16:39:06 +0900 (12d87c3)
@@ -1,5 +1,5 @@
/* -*- c-basic-offset: 2 -*- */
-/* Copyright(C) 2009 Brazil
+/* Copyright(C) 2009-2012 Brazil
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -50,7 +50,7 @@ typedef struct _snip_cond
const char *closetag;
size_t opentag_len;
size_t closetag_len;
- grn_str *keyword;
+ grn_obj *keyword;
/* Tuned BM pre */
size_t bmBc[ASIZE];
@@ -108,7 +108,7 @@ struct _grn_snip
unsigned int snip_count;
const char *string;
- grn_str *nstr;
+ grn_obj *nstr;
_snip_result snip_result[MAX_SNIP_RESULT_COUNT];
_snip_tag_result tag_result[MAX_SNIP_TAG_COUNT];
@@ -121,7 +121,7 @@ grn_rc grn_snip_cond_init(grn_ctx *ctx, snip_cond *sc, const char *keyword, unsi
grn_encoding enc, int flags);
void grn_snip_cond_reinit(snip_cond *cond);
grn_rc grn_snip_cond_close(grn_ctx *ctx, snip_cond *cond);
-void grn_bm_tunedbm(snip_cond *cond, grn_str *object, int flags);
+void grn_bm_tunedbm(grn_ctx *ctx, snip_cond *cond, grn_obj *string, int flags);
#ifdef __cplusplus
}
Modified: lib/sources.am (+2 -0)
===================================================================
--- lib/sources.am 2012-06-08 17:38:04 +0900 (b0966a3)
+++ lib/sources.am 2012-06-08 16:39:06 +0900 (61e888f)
@@ -34,6 +34,8 @@ libgroonga_la_SOURCES = \
store.h \
str.c \
str.h \
+ string.c \
+ string_in.h \
token.c \
token.h \
tokenizer.c \
Added: lib/string.c (+1369 -0) 100644
===================================================================
--- /dev/null
+++ lib/string.c 2012-06-08 16:39:06 +0900 (6b0cfec)
@@ -0,0 +1,1369 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+ Copyright(C) 2009-2012 Brazil
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License version 2.1 as published by the Free Software Foundation.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include "groonga_in.h"
+#include <string.h>
+#include "string_in.h"
+#include "str.h"
+
+static unsigned char symbol[] = { /* ASCII replacement for a 2-byte symbol, indexed by (second byte - base); base is 0xa4 for EUC-JP and 0x43/0x44 for Shift_JIS; 0 means no replacement */
+ ',', '.', 0, ':', ';', '?', '!', 0, 0, 0, '`', 0, '^', '~', '_', 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, '-', '-', '/', '\\', 0, 0, '|', 0, 0, 0, '\'', 0,
+ '"', '(', ')', 0, 0, '[', ']', '{', '}', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ '+', '-', 0, 0, 0, '=', 0, '<', '>', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ '$', 0, 0, '%', '#', '&', '*', '@', 0, 0, 0, 0, 0, 0, 0, 0
+}; /* 92 entries */
+
+inline static grn_obj * /* EUC-JP normalizer: fills the normalized text (and the optional checks/types arrays) of the grn_string in args[0]; always returns NULL, allocation failures are reported through ERR() */
+eucjp_normalize(grn_ctx *ctx, int nargs, grn_obj **args, /* nargs is not read; only args[0] is used */
+ grn_user_data *user_data) /* user_data is not read */
+{
+ static uint16_t hankana[] = { /* full-width code for a half-width kana byte, indexed by (byte - 0xa0) */
+ 0xa1a1, 0xa1a3, 0xa1d6, 0xa1d7, 0xa1a2, 0xa1a6, 0xa5f2, 0xa5a1, 0xa5a3,
+ 0xa5a5, 0xa5a7, 0xa5a9, 0xa5e3, 0xa5e5, 0xa5e7, 0xa5c3, 0xa1bc, 0xa5a2,
+ 0xa5a4, 0xa5a6, 0xa5a8, 0xa5aa, 0xa5ab, 0xa5ad, 0xa5af, 0xa5b1, 0xa5b3,
+ 0xa5b5, 0xa5b7, 0xa5b9, 0xa5bb, 0xa5bd, 0xa5bf, 0xa5c1, 0xa5c4, 0xa5c6,
+ 0xa5c8, 0xa5ca, 0xa5cb, 0xa5cc, 0xa5cd, 0xa5ce, 0xa5cf, 0xa5d2, 0xa5d5,
+ 0xa5d8, 0xa5db, 0xa5de, 0xa5df, 0xa5e0, 0xa5e1, 0xa5e2, 0xa5e4, 0xa5e6,
+ 0xa5e8, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5ef, 0xa5f3, 0xa1ab,
+ 0xa1eb
+ }; /* NOTE(review): the Shift_JIS table maps the last entry to 0x814b, whose EUC-JP counterpart is 0xa1ac, not 0xa1eb -- confirm 0xa1eb is intended */
+ static unsigned char dakuten[] = { /* second byte of the voiced form, indexed by (second byte - 0xa6); 0 = no voiced form */
+ 0xf4, 0, 0, 0, 0, 0xac, 0, 0xae, 0, 0xb0, 0, 0xb2, 0, 0xb4, 0, 0xb6, 0,
+ 0xb8, 0, 0xba, 0, 0xbc, 0, 0xbe, 0, 0xc0, 0, 0xc2, 0, 0, 0xc5, 0, 0xc7,
+ 0, 0xc9, 0, 0, 0, 0, 0, 0, 0xd0, 0, 0, 0xd3, 0, 0, 0xd6, 0, 0, 0xd9, 0,
+ 0, 0xdc
+ };
+ static unsigned char handaku[] = { /* second byte of the semi-voiced form, indexed by (second byte - 0xcf); 0 = none */
+ 0xd1, 0, 0, 0xd4, 0, 0, 0xd7, 0, 0, 0xda, 0, 0, 0xdd
+ };
+ grn_string *nstr = (grn_string *)args[0];
+ int16_t *ch; /* write cursor into nstr->checks; the code below treats NULL as "no checks" */
+ const unsigned char *s, *s_, *e; /* s: read cursor, s_: first original byte not yet counted in a check, e: end of original */
+ unsigned char *d, *d0, *d_, b; /* d: write cursor, d0: start of output, d_: output position up to which checks are filled */
+ uint_least8_t *cp, *ctypes, ctype; /* cp: write cursor into nstr->ctypes; NULL is treated as "no types" */
+ size_t size = nstr->original_length_in_bytes, length = 0; /* length counts emitted characters */
+ int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
+ if (!(nstr->normalized = GRN_MALLOC(size * 2 + 1))) { /* room for 2 * size bytes plus the terminating NUL */
+ ERR(GRN_NO_MEMORY_AVAILABLE,
+ "[string][eucjp] failed to allocate normalized text space");
+ return NULL;
+ }
+ d0 = (unsigned char *) nstr->normalized;
+ if (nstr->flags & GRN_STRING_WITH_CHECKS) {
+ if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) {
+ GRN_FREE(nstr->normalized);
+ nstr->normalized = NULL;
+ ERR(GRN_NO_MEMORY_AVAILABLE,
+ "[string][eucjp] failed to allocate checks space");
+ return NULL;
+ }
+ }
+ ch = nstr->checks;
+ if (nstr->flags & GRN_STRING_WITH_TYPES) {
+ if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { /* one type byte per character plus the terminator */
+ GRN_FREE(nstr->checks); /* NOTE(review): checks may be NULL here when GRN_STRING_WITH_CHECKS is not set -- confirm GRN_FREE accepts NULL */
+ GRN_FREE(nstr->normalized);
+ nstr->checks = NULL;
+ nstr->normalized = NULL;
+ ERR(GRN_NO_MEMORY_AVAILABLE,
+ "[string][eucjp] failed to allocate character types space");
+ return NULL;
+ }
+ }
+ cp = ctypes = nstr->ctypes;
+ e = (unsigned char *)nstr->original + size;
+ for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) { /* one iteration per input character; "continue" means nothing was emitted */
+ if ((*s & 0x80)) {
+ if (((s + 1) < e) && (*(s + 1) & 0x80)) { /* two high-bit bytes: handled as one 2-byte character */
+ unsigned char c1 = *s++, c2 = *s, c3 = 0;
+ switch (c1 >> 4) {
+ case 0x08 :
+ if (c1 == 0x8e && 0xa0 <= c2 && c2 <= 0xdf) { /* 0x8e prefix: half-width kana, converted through hankana[] */
+ uint16_t c = hankana[c2 - 0xa0];
+ switch (c) {
+ case 0xa1ab :
+ if (d > d0 + 1 && d[-2] == 0xa5
+ && 0xa6 <= d[-1] && d[-1] <= 0xdb && (b = dakuten[d[-1] - 0xa6])) { /* previous output is a 0xa5-row character that has a voiced form */
+ *(d - 1) = b; /* rewrite the previous character in place instead of emitting the mark */
+ if (ch) { ch[-1] += 2; s_ += 2; } /* account the mark's 2 source bytes to the entry just before the checks cursor */
+ continue;
+ } else {
+ *d++ = c >> 8; *d = c & 0xff;
+ }
+ break;
+ case 0xa1eb :
+ if (d > d0 + 1 && d[-2] == 0xa5
+ && 0xcf <= d[-1] && d[-1] <= 0xdb && (b = handaku[d[-1] - 0xcf])) { /* same as above, for the semi-voiced form */
+ *(d - 1) = b;
+ if (ch) { ch[-1] += 2; s_ += 2; }
+ continue;
+ } else {
+ *d++ = c >> 8; *d = c & 0xff;
+ }
+ break;
+ default :
+ *d++ = c >> 8; *d = c & 0xff;
+ break;
+ }
+ ctype = grn_char_katakana;
+ } else {
+ *d++ = c1; *d = c2;
+ ctype = grn_char_others;
+ }
+ break;
+ case 0x09 :
+ *d++ = c1; *d = c2;
+ ctype = grn_char_others;
+ break;
+ case 0x0a :
+ switch (c1 & 0x0f) { /* first byte 0xa1..0xaf: dispatch on its low nibble */
+ case 1 :
+ switch (c2) {
+ case 0xbc :
+ *d++ = c1; *d = c2;
+ ctype = grn_char_katakana;
+ break;
+ case 0xb9 :
+ *d++ = c1; *d = c2;
+ ctype = grn_char_kanji;
+ break;
+ case 0xa1 : /* 0xa1a1 is handled like an ASCII space */
+ if (removeblankp) {
+ if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } /* mark the previous character as followed by a blank */
+ continue;
+ } else {
+ *d = ' ';
+ ctype = GRN_CHAR_BLANK|grn_char_symbol;
+ }
+ break;
+ default :
+ if (c2 >= 0xa4 && (c3 = symbol[c2 - 0xa4])) { /* symbol with an ASCII replacement */
+ *d = c3;
+ ctype = grn_char_symbol;
+ } else {
+ *d++ = c1; *d = c2;
+ ctype = grn_char_others;
+ }
+ break;
+ }
+ break;
+ case 2 :
+ *d++ = c1; *d = c2;
+ ctype = grn_char_symbol;
+ break;
+ case 3 :
+ c3 = c2 - 0x80; /* candidate ASCII equivalent of the 2-byte alphanumeric */
+ if ('a' <= c3 && c3 <= 'z') {
+ ctype = grn_char_alpha;
+ *d = c3;
+ } else if ('A' <= c3 && c3 <= 'Z') {
+ ctype = grn_char_alpha;
+ *d = c3 + 0x20; /* lower-case */
+ } else if ('0' <= c3 && c3 <= '9') {
+ ctype = grn_char_digit;
+ *d = c3;
+ } else {
+ ctype = grn_char_others;
+ *d++ = c1; *d = c2;
+ }
+ break;
+ case 4 :
+ *d++ = c1; *d = c2;
+ ctype = grn_char_hiragana;
+ break;
+ case 5 :
+ *d++ = c1; *d = c2;
+ ctype = grn_char_katakana;
+ break;
+ case 6 :
+ case 7 :
+ case 8 :
+ *d++ = c1; *d = c2;
+ ctype = grn_char_symbol;
+ break;
+ default :
+ *d++ = c1; *d = c2;
+ ctype = grn_char_others;
+ break;
+ }
+ break;
+ default : /* first byte 0xb0 and above: copied unchanged and typed as kanji */
+ *d++ = c1; *d = c2;
+ ctype = grn_char_kanji;
+ break;
+ }
+ } else {
+ /* skip invalid character */
+ continue;
+ }
+ } else {
+ unsigned char c = *s; /* single byte below 0x80 */
+ switch (c >> 4) {
+ case 0 :
+ case 1 :
+ /* skip unprintable ascii */
+ if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
+ continue;
+ case 2 :
+ if (c == 0x20) {
+ if (removeblankp) {
+ if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
+ continue;
+ } else {
+ *d = ' ';
+ ctype = GRN_CHAR_BLANK|grn_char_symbol;
+ }
+ } else {
+ *d = c;
+ ctype = grn_char_symbol;
+ }
+ break;
+ case 3 :
+ *d = c;
+ ctype = (c <= 0x39) ? grn_char_digit : grn_char_symbol;
+ break;
+ case 4 :
+ *d = ('A' <= c) ? c + 0x20 : c; /* 'A'..'O' are lower-cased, '@' is kept */
+ ctype = (c == 0x40) ? grn_char_symbol : grn_char_alpha;
+ break;
+ case 5 :
+ *d = (c <= 'Z') ? c + 0x20 : c; /* 'P'..'Z' are lower-cased */
+ ctype = (c <= 0x5a) ? grn_char_alpha : grn_char_symbol;
+ break;
+ case 6 :
+ *d = c;
+ ctype = (c == 0x60) ? grn_char_symbol : grn_char_alpha;
+ break;
+ case 7 :
+ *d = c;
+ ctype = (c <= 0x7a) ? grn_char_alpha : (c == 0x7f ? grn_char_others : grn_char_symbol);
+ break;
+ default : /* not reached: c is below 0x80 in this branch */
+ *d = c;
+ ctype = grn_char_others;
+ break;
+ }
+ }
+ d++; /* every emitting path leaves d on the last byte it wrote */
+ length++;
+ if (cp) { *cp++ = ctype; }
+ if (ch) {
+ *ch++ = (int16_t)(s + 1 - s_); /* original bytes consumed since the previous emitted character, skipped bytes included */
+ s_ = s + 1;
+ while (++d_ < d) { *ch++ = 0; } /* remaining bytes of a multi-byte output character get 0 */
+ }
+ }
+ if (cp) { *cp = grn_char_null; } /* terminate the types array */
+ *d = '\0';
+ nstr->n_characters = length;
+ nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
+ return NULL;
+}
+
+inline static grn_obj *
+sjis_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
+ grn_user_data *user_data)
+{
+ static uint16_t hankana[] = {
+ 0x8140, 0x8142, 0x8175, 0x8176, 0x8141, 0x8145, 0x8392, 0x8340, 0x8342,
+ 0x8344, 0x8346, 0x8348, 0x8383, 0x8385, 0x8387, 0x8362, 0x815b, 0x8341,
+ 0x8343, 0x8345, 0x8347, 0x8349, 0x834a, 0x834c, 0x834e, 0x8350, 0x8352,
+ 0x8354, 0x8356, 0x8358, 0x835a, 0x835c, 0x835e, 0x8360, 0x8363, 0x8365,
+ 0x8367, 0x8369, 0x836a, 0x836b, 0x836c, 0x836d, 0x836e, 0x8371, 0x8374,
+ 0x8377, 0x837a, 0x837d, 0x837e, 0x8380, 0x8381, 0x8382, 0x8384, 0x8386,
+ 0x8388, 0x8389, 0x838a, 0x838b, 0x838c, 0x838d, 0x838f, 0x8393, 0x814a,
+ 0x814b
+ };
+ static unsigned char dakuten[] = {
+ 0x94, 0, 0, 0, 0, 0x4b, 0, 0x4d, 0, 0x4f, 0, 0x51, 0, 0x53, 0, 0x55, 0,
+ 0x57, 0, 0x59, 0, 0x5b, 0, 0x5d, 0, 0x5f, 0, 0x61, 0, 0, 0x64, 0, 0x66,
+ 0, 0x68, 0, 0, 0, 0, 0, 0, 0x6f, 0, 0, 0x72, 0, 0, 0x75, 0, 0, 0x78, 0,
+ 0, 0x7b
+ };
+ static unsigned char handaku[] = {
+ 0x70, 0, 0, 0x73, 0, 0, 0x76, 0, 0, 0x79, 0, 0, 0x7c
+ };
+ grn_string *nstr = (grn_string *)args[0];
+ int16_t *ch;
+ const unsigned char *s, *s_;
+ unsigned char *d, *d0, *d_, b, *e;
+ uint_least8_t *cp, *ctypes, ctype;
+ size_t size = nstr->original_length_in_bytes, length = 0;
+ int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
+ if (!(nstr->normalized = GRN_MALLOC(size * 2 + 1))) {
+ ERR(GRN_NO_MEMORY_AVAILABLE,
+ "[strinig][sjis] failed to allocate normalized text space");
+ return NULL;
+ }
+ d0 = (unsigned char *) nstr->normalized;
+ if (nstr->flags & GRN_STRING_WITH_CHECKS) {
+ if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) {
+ GRN_FREE(nstr->normalized);
+ nstr->normalized = NULL;
+ ERR(GRN_NO_MEMORY_AVAILABLE,
+ "[strinig][sjis] failed to allocate checks space");
+ return NULL;
+ }
+ }
+ ch = nstr->checks;
+ if (nstr->flags & GRN_STRING_WITH_TYPES) {
+ if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
+ GRN_FREE(nstr->checks);
+ GRN_FREE(nstr->normalized);
+ nstr->checks = NULL;
+ nstr->normalized = NULL;
+ ERR(GRN_NO_MEMORY_AVAILABLE,
+ "[strinig][sjis] failed to allocate character types space");
+ return NULL;
+ }
+ }
+ cp = ctypes = nstr->ctypes;
+ e = (unsigned char *)nstr->original + size;
+ for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
+ if ((*s & 0x80)) {
+ if (0xa0 <= *s && *s <= 0xdf) {
+ uint16_t c = hankana[*s - 0xa0];
+ switch (c) {
+ case 0x814a :
+ if (d > d0 + 1 && d[-2] == 0x83
+ && 0x45 <= d[-1] && d[-1] <= 0x7a && (b = dakuten[d[-1] - 0x45])) {
+ *(d - 1) = b;
+ if (ch) { ch[-1]++; s_++; }
+ continue;
+ } else {
+ *d++ = c >> 8; *d = c & 0xff;
+ }
+ break;
+ case 0x814b :
+ if (d > d0 + 1 && d[-2] == 0x83
+ && 0x6e <= d[-1] && d[-1] <= 0x7a && (b = handaku[d[-1] - 0x6e])) {
+ *(d - 1) = b;
+ if (ch) { ch[-1]++; s_++; }
+ continue;
+ } else {
+ *d++ = c >> 8; *d = c & 0xff;
+ }
+ break;
+ default :
+ *d++ = c >> 8; *d = c & 0xff;
+ break;
+ }
+ ctype = grn_char_katakana;
+ } else {
+ if ((s + 1) < e && 0x40 <= *(s + 1) && *(s + 1) <= 0xfc) {
+ unsigned char c1 = *s++, c2 = *s, c3 = 0;
+ if (0x81 <= c1 && c1 <= 0x87) {
+ switch (c1 & 0x0f) {
+ case 1 :
+ switch (c2) {
+ case 0x5b :
+ *d++ = c1; *d = c2;
+ ctype = grn_char_katakana;
+ break;
+ case 0x58 :
+ *d++ = c1; *d = c2;
+ ctype = grn_char_kanji;
+ break;
+ case 0x40 :
+ if (removeblankp) {
+ if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
+ continue;
+ } else {
+ *d = ' ';
+ ctype = GRN_CHAR_BLANK|grn_char_symbol;
+ }
+ break;
+ default :
+ if (0x43 <= c2 && c2 <= 0x7e && (c3 = symbol[c2 - 0x43])) {
+ *d = c3;
+ ctype = grn_char_symbol;
+ } else if (0x7f <= c2 && c2 <= 0x97 && (c3 = symbol[c2 - 0x44])) {
+ *d = c3;
+ ctype = grn_char_symbol;
+ } else {
+ *d++ = c1; *d = c2;
+ ctype = grn_char_others;
+ }
+ break;
+ }
+ break;
+ case 2 :
+ c3 = c2 - 0x1f;
+ if (0x4f <= c2 && c2 <= 0x58) {
+ ctype = grn_char_digit;
+ *d = c2 - 0x1f;
+ } else if (0x60 <= c2 && c2 <= 0x79) {
+ ctype = grn_char_alpha;
+ *d = c2 + 0x01;
+ } else if (0x81 <= c2 && c2 <= 0x9a) {
+ ctype = grn_char_alpha;
+ *d = c2 - 0x20;
+ } else if (0x9f <= c2 && c2 <= 0xf1) {
+ *d++ = c1; *d = c2;
+ ctype = grn_char_hiragana;
+ } else {
+ *d++ = c1; *d = c2;
+ ctype = grn_char_others;
+ }
+ break;
+ case 3 :
+ if (0x40 <= c2 && c2 <= 0x96) {
+ *d++ = c1; *d = c2;
+ ctype = grn_char_katakana;
+ } else {
+ *d++ = c1; *d = c2;
+ ctype = grn_char_symbol;
+ }
+ break;
+ case 4 :
+ case 7 :
+ *d++ = c1; *d = c2;
+ ctype = grn_char_symbol;
+ break;
+ default :
+ *d++ = c1; *d = c2;
+ ctype = grn_char_others;
+ break;
+ }
+ } else {
+ *d++ = c1; *d = c2;
+ ctype = grn_char_kanji;
+ }
+ } else {
+ /* skip invalid character */
+ continue;
+ }
+ }
+ } else {
+ unsigned char c = *s;
+ switch (c >> 4) {
+ case 0 :
+ case 1 :
+ /* skip unprintable ascii */
+ if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
+ continue;
+ case 2 :
+ if (c == 0x20) {
+ if (removeblankp) {
+ if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
+ continue;
+ } else {
+ *d = ' ';
+ ctype = GRN_CHAR_BLANK|grn_char_symbol;
+ }
+ } else {
+ *d = c;
+ ctype = grn_char_symbol;
+ }
+ break;
+ case 3 :
+ *d = c;
+ ctype = (c <= 0x39) ? grn_char_digit : grn_char_symbol;
+ break;
+ case 4 :
+ *d = ('A' <= c) ? c + 0x20 : c;
+ ctype = (c == 0x40) ? grn_char_symbol : grn_char_alpha;
+ break;
+ case 5 :
+ *d = (c <= 'Z') ? c + 0x20 : c;
+ ctype = (c <= 0x5a) ? grn_char_alpha : grn_char_symbol;
+ break;
+ case 6 :
+ *d = c;
+ ctype = (c == 0x60) ? grn_char_symbol : grn_char_alpha;
+ break;
+ case 7 :
+ *d = c;
+ ctype = (c <= 0x7a) ? grn_char_alpha : (c == 0x7f ? grn_char_others : grn_char_symbol);
+ break;
+ default :
+ *d = c;
+ ctype = grn_char_others;
+ break;
+ }
+ }
+ d++;
+ length++;
+ if (cp) { *cp++ = ctype; }
+ if (ch) {
+ *ch++ = (int16_t)(s + 1 - s_);
+ s_ = s + 1;
+ while (++d_ < d) { *ch++ = 0; }
+ }
+ }
+ if (cp) { *cp = grn_char_null; }
+ *d = '\0';
+ nstr->n_characters = length;
+ nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
+ return NULL;
+}
+
+#ifdef WITH_NFKC
+/* NFKC lookup functions implemented outside this file.  The notes below
+   describe only how utf8_normalize() uses their results. */
+/* Character type of the UTF-8 character at str; stored as-is into ctypes. */
+uint_least8_t grn_nfkc_ctype(const unsigned char *str);
+/* Replacement for the character at str as a NUL-terminated string, or NULL
+   when the character is copied unchanged. */
+const char *grn_nfkc_map1(const unsigned char *str);
+/* Replacement for the pair (prefix, suffix) as a NUL-terminated string, or
+   NULL when the two characters are not merged. */
+const char *grn_nfkc_map2(const unsigned char *prefix, const unsigned char *suffix);
+
+/*
+ * Returns the byte length of the UTF-8 character starting at str, or 0 when
+ * str is at/after end, points at a NUL byte, or starts an invalid sequence
+ * (a warning is logged for invalid sequences).
+ *
+ * str does not need to be NUL-terminated, but end must mark the end of the
+ * readable range.
+ */
+static inline int
+grn_str_charlen_utf8(grn_ctx *ctx, const unsigned char *str, const unsigned char *end)
+{
+  const unsigned char *cursor = str;
+  int mask;
+  int n_trailing = 0;
+  int n_bytes = 1;
+
+  if (cursor >= end || *cursor == '\0') { return 0; }
+  /* plain 7-bit character */
+  if (!(*cursor & 0x80)) { return 1; }
+
+  /* the run of 1 bits below the top bit tells how many bytes follow */
+  for (mask = 0x40; mask && (*cursor & mask); mask >>= 1) {
+    n_trailing++;
+  }
+  if (n_trailing == 0) {
+    GRN_LOG(ctx, GRN_LOG_WARNING, "invalid utf8 string(1) on grn_str_charlen_utf8");
+    return 0;
+  }
+  while (n_trailing-- > 0) {
+    cursor++;
+    /* every following byte must exist and look like 10xxxxxx */
+    if (cursor >= end || *cursor == '\0' || (*cursor & 0xc0) != 0x80) {
+      GRN_LOG(ctx, GRN_LOG_WARNING, "invalid utf8 string(2) on grn_str_charlen_utf8");
+      return 0;
+    }
+    n_bytes++;
+  }
+  return n_bytes;
+}
+
+/*
+ * NFKC-table based normalizer for UTF-8 text.
+ *
+ * args[0] is the grn_string to fill in; nargs and user_data are not used.
+ * On success nstr->normalized (NUL-terminated), normalized_length_in_bytes
+ * and n_characters are set, together with nstr->checks / nstr->ctypes when
+ * GRN_STRING_WITH_CHECKS / GRN_STRING_WITH_TYPES are set in nstr->flags.
+ * On allocation failure every buffer owned by nstr is freed and reset to
+ * NULL and the error is reported through ERR().
+ *
+ * checks layout: the first byte of each normalized character holds the
+ * number of source bytes consumed since the previously recorded character,
+ * or -1 when the source character was already recorded (one source
+ * character expanded to several); the remaining bytes hold 0.
+ *
+ * Always returns NULL.
+ */
+inline static grn_obj *
+utf8_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+{
+  int16_t *ch;
+  const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e;
+  unsigned char *d, *d_, *de;
+  uint_least8_t *cp;
+  grn_string *nstr = (grn_string *)args[0];
+  /* start with three output bytes per input byte; buffers grow on demand */
+  size_t length = 0, ls, lp, size = nstr->original_length_in_bytes, ds = size * 3;
+  int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
+  if (!(nstr->normalized = GRN_MALLOC(ds + 1))) {
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[string][utf8] failed to allocate normalized text space");
+    return NULL;
+  }
+  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
+    if (!(nstr->checks = GRN_MALLOC(ds * sizeof(int16_t) + 1))) {
+      GRN_FREE(nstr->normalized);
+      nstr->normalized = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[string][utf8] failed to allocate checks space");
+      return NULL;
+    }
+  }
+  ch = nstr->checks;
+  if (nstr->flags & GRN_STRING_WITH_TYPES) {
+    if (!(nstr->ctypes = GRN_MALLOC(ds + 1))) {
+      if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
+      GRN_FREE(nstr->normalized); nstr->normalized = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[string][utf8] failed to allocate character types space");
+      return NULL;
+    }
+  }
+  cp = nstr->ctypes;
+  d = (unsigned char *)nstr->normalized;
+  de = d + ds;
+  d_ = NULL;
+  e = (unsigned char *)nstr->original + size;
+  /* s walks the source one character at a time; s_ is the end of the last
+     source character recorded in checks and s__ the one before it */
+  for (s = s_ = (unsigned char *)nstr->original; ; s += ls) {
+    if (!(ls = grn_str_charlen_utf8(ctx, s, e))) {
+      break;
+    }
+    /* [p, pe) is the text to emit for this source character */
+    if ((p = (unsigned char *)grn_nfkc_map1(s))) {
+      pe = p + strlen((char *)p);
+    } else {
+      p = s;
+      pe = p + ls;
+    }
+    /* if the previously emitted character merges with this one, rewind the
+       output cursors to that character and emit the merged text instead */
+    if (d_ && (p2 = (unsigned char *)grn_nfkc_map2(d_, p))) {
+      p = p2;
+      pe = p + strlen((char *)p);
+      if (cp) { cp--; }
+      if (ch) {
+        ch -= (d - d_);
+        s_ = s__;
+      }
+      d = d_;
+      length--;
+    }
+    for (; ; p += lp) {
+      if (!(lp = grn_str_charlen_utf8(ctx, p, pe))) {
+        break;
+      }
+      if ((*p == ' ' && removeblankp) || *p < 0x20 /* skip unprintable ascii */ ) {
+        /* the dropped character is remembered as a blank mark on the
+           previously emitted character */
+        if (cp > nstr->ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
+      } else {
+        if (de <= d + lp) {
+          /* out of room: grow all buffers by half plus this character and
+             rebase the write cursors onto the new blocks */
+          unsigned char *normalized;
+          ds += (ds >> 1) + lp;
+          if (!(normalized = GRN_REALLOC(nstr->normalized, ds + 1))) {
+            if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
+            if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
+            GRN_FREE(nstr->normalized); nstr->normalized = NULL;
+            ERR(GRN_NO_MEMORY_AVAILABLE,
+                "[string][utf8] failed to expand normalized text space");
+            return NULL;
+          }
+          de = normalized + ds;
+          d = normalized + (d - (unsigned char *)nstr->normalized);
+          nstr->normalized = normalized;
+          if (ch) {
+            int16_t *checks;
+            if (!(checks = GRN_REALLOC(nstr->checks, ds * sizeof(int16_t)+ 1))) {
+              if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
+              GRN_FREE(nstr->checks); nstr->checks = NULL;
+              GRN_FREE(nstr->normalized); nstr->normalized = NULL;
+              ERR(GRN_NO_MEMORY_AVAILABLE,
+                  "[string][utf8] failed to expand checks space");
+              return NULL;
+            }
+            ch = checks + (ch - nstr->checks);
+            nstr->checks = checks;
+          }
+          if (cp) {
+            uint_least8_t *ctypes;
+            if (!(ctypes = GRN_REALLOC(nstr->ctypes, ds + 1))) {
+              GRN_FREE(nstr->ctypes); nstr->ctypes = NULL;
+              if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
+              GRN_FREE(nstr->normalized); nstr->normalized = NULL;
+              ERR(GRN_NO_MEMORY_AVAILABLE,
+                  "[string][utf8] failed to expand character types space");
+              return NULL;
+            }
+            cp = ctypes + (cp - nstr->ctypes);
+            nstr->ctypes = ctypes;
+          }
+        }
+        memcpy(d, p, lp);
+        d_ = d;
+        d += lp;
+        length++;
+        if (cp) { *cp++ = grn_nfkc_ctype(p); }
+        if (ch) {
+          size_t i;
+          if (s_ == s + ls) {
+            /* this source character was already recorded */
+            *ch++ = -1;
+          } else {
+            *ch++ = (int16_t)(s + ls - s_);
+            s__ = s_;
+            s_ = s + ls;
+          }
+          for (i = lp; i > 1; i--) { *ch++ = 0; }
+        }
+      }
+    }
+  }
+  if (cp) { *cp = grn_char_null; }
+  *d = '\0';
+  nstr->n_characters = length;
+  nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
+  return NULL;
+}
+#endif /* WITH_NFKC */
+
+/*
+ * Byte-wise normalizer for ASCII (also the fallback for other encodings).
+ *
+ * args[0] is the grn_string to fill in; nargs and user_data are not used.
+ * Bytes below 0x20 are dropped, spaces are dropped when
+ * GRN_STRING_REMOVE_BLANK is set, 'A'-'Z' are lower-cased and every other
+ * byte is copied unchanged.  A dropped byte marks the previously emitted
+ * character's type with GRN_CHAR_BLANK.
+ *
+ * On success nstr->normalized (NUL-terminated), normalized_length_in_bytes
+ * and n_characters are set, together with nstr->checks / nstr->ctypes when
+ * requested by nstr->flags.  On allocation failure the buffers allocated
+ * here are freed and reset to NULL and the error is reported through ERR().
+ *
+ * Always returns NULL.
+ */
+inline static grn_obj *
+ascii_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+{
+  grn_string *nstr = (grn_string *)args[0];
+  int16_t *ch;
+  const unsigned char *s, *s_, *e;
+  unsigned char *d, *d0, *d_;
+  uint_least8_t *cp, *ctypes, ctype;
+  size_t size = nstr->original_length_in_bytes, length = 0;
+  int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
+  if (!(nstr->normalized = GRN_MALLOC(size + 1))) {
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[string][ascii] failed to allocate normalized text space");
+    return NULL;
+  }
+  d0 = (unsigned char *) nstr->normalized;
+  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
+    if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) {
+      GRN_FREE(nstr->normalized);
+      nstr->normalized = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[string][ascii] failed to allocate checks space");
+      return NULL;
+    }
+  }
+  ch = nstr->checks;
+  if (nstr->flags & GRN_STRING_WITH_TYPES) {
+    if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
+      /* checks exists only when GRN_STRING_WITH_CHECKS was requested */
+      if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
+      GRN_FREE(nstr->normalized); nstr->normalized = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[string][ascii] failed to allocate character types space");
+      return NULL;
+    }
+  }
+  cp = ctypes = nstr->ctypes;
+  e = (unsigned char *)nstr->original + size;
+  /* s_ is the end of the last source byte recorded in checks, d_ the end of
+     the last output byte recorded in checks */
+  for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
+    unsigned char c = *s;
+    switch (c >> 4) {
+    case 0 :
+    case 1 :
+      /* skip unprintable ascii */
+      if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
+      continue;
+    case 2 :
+      if (c == 0x20) {
+        if (removeblankp) {
+          if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
+          continue;
+        } else {
+          *d = ' ';
+          ctype = GRN_CHAR_BLANK|grn_char_symbol;
+        }
+      } else {
+        *d = c;
+        ctype = grn_char_symbol;
+      }
+      break;
+    case 3 :
+      *d = c;
+      ctype = (c <= 0x39) ? grn_char_digit : grn_char_symbol;
+      break;
+    case 4 :
+      *d = ('A' <= c) ? c + 0x20 : c;
+      ctype = (c == 0x40) ? grn_char_symbol : grn_char_alpha;
+      break;
+    case 5 :
+      *d = (c <= 'Z') ? c + 0x20 : c;
+      ctype = (c <= 0x5a) ? grn_char_alpha : grn_char_symbol;
+      break;
+    case 6 :
+      *d = c;
+      ctype = (c == 0x60) ? grn_char_symbol : grn_char_alpha;
+      break;
+    case 7 :
+      *d = c;
+      ctype = (c <= 0x7a) ? grn_char_alpha : (c == 0x7f ? grn_char_others : grn_char_symbol);
+      break;
+    default :
+      *d = c;
+      ctype = grn_char_others;
+      break;
+    }
+    d++;
+    length++;
+    if (cp) { *cp++ = ctype; }
+    if (ch) {
+      /* source bytes consumed since the previous emitted character */
+      *ch++ = (int16_t)(s + 1 - s_);
+      s_ = s + 1;
+      while (++d_ < d) { *ch++ = 0; }
+    }
+  }
+  if (cp) { *cp = grn_char_null; }
+  *d = '\0';
+  nstr->n_characters = length;
+  nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
+  return NULL;
+}
+
+/*
+ * Byte-wise normalizer for Latin-1 text (use cp1252 as latin1).
+ *
+ * args[0] is the grn_string to fill in; nargs and user_data are not used.
+ * Bytes below 0x20 are dropped, spaces are dropped when
+ * GRN_STRING_REMOVE_BLANK is set, and upper-case letters (ASCII and the
+ * 0x8a/0x8c/0x8e/0x9f/0xc0-0xde ranges handled below) are mapped to their
+ * lower-case byte.  A dropped byte marks the previously emitted character's
+ * type with GRN_CHAR_BLANK.
+ *
+ * The input length is taken from nstr->original_length_in_bytes: the
+ * original text is a (pointer, length) pair and is not required to be
+ * NUL-terminated.
+ *
+ * On allocation failure the buffers allocated here are freed and reset to
+ * NULL and the error is reported through ERR().
+ *
+ * Always returns NULL.
+ */
+inline static grn_obj *
+latin1_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
+                 grn_user_data *user_data)
+{
+  grn_string *nstr = (grn_string *)args[0];
+  int16_t *ch;
+  const unsigned char *s, *s_, *e;
+  unsigned char *d, *d0, *d_;
+  uint_least8_t *cp, *ctypes, ctype;
+  size_t size = nstr->original_length_in_bytes, length = 0;
+  int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
+  if (!(nstr->normalized = GRN_MALLOC(size + 1))) {
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[string][latin1] failed to allocate normalized text space");
+    return NULL;
+  }
+  d0 = (unsigned char *) nstr->normalized;
+  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
+    if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) {
+      GRN_FREE(nstr->normalized);
+      nstr->normalized = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[string][latin1] failed to allocate checks space");
+      return NULL;
+    }
+  }
+  ch = nstr->checks;
+  if (nstr->flags & GRN_STRING_WITH_TYPES) {
+    if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
+      /* checks exists only when GRN_STRING_WITH_CHECKS was requested */
+      if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
+      GRN_FREE(nstr->normalized); nstr->normalized = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[string][latin1] failed to allocate character types space");
+      return NULL;
+    }
+  }
+  cp = ctypes = nstr->ctypes;
+  e = (unsigned char *)nstr->original + size;
+  for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
+    unsigned char c = *s;
+    switch (c >> 4) {
+    case 0 :
+    case 1 :
+      /* skip unprintable ascii */
+      if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
+      continue;
+    case 2 :
+      if (c == 0x20) {
+        if (removeblankp) {
+          if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
+          continue;
+        } else {
+          *d = ' ';
+          ctype = GRN_CHAR_BLANK|grn_char_symbol;
+        }
+      } else {
+        *d = c;
+        ctype = grn_char_symbol;
+      }
+      break;
+    case 3 :
+      *d = c;
+      ctype = (c <= 0x39) ? grn_char_digit : grn_char_symbol;
+      break;
+    case 4 :
+      *d = ('A' <= c) ? c + 0x20 : c;
+      ctype = (c == 0x40) ? grn_char_symbol : grn_char_alpha;
+      break;
+    case 5 :
+      *d = (c <= 'Z') ? c + 0x20 : c;
+      ctype = (c <= 0x5a) ? grn_char_alpha : grn_char_symbol;
+      break;
+    case 6 :
+      *d = c;
+      ctype = (c == 0x60) ? grn_char_symbol : grn_char_alpha;
+      break;
+    case 7 :
+      *d = c;
+      ctype = (c <= 0x7a) ? grn_char_alpha : (c == 0x7f ? grn_char_others : grn_char_symbol);
+      break;
+    case 8 :
+      /* 0x8a, 0x8c, 0x8e are letters whose lower-case form is 0x10 above */
+      if (c == 0x8a || c == 0x8c || c == 0x8e) {
+        *d = c + 0x10;
+        ctype = grn_char_alpha;
+      } else {
+        *d = c;
+        ctype = grn_char_symbol;
+      }
+      break;
+    case 9 :
+      /* 0x9f is mapped to 0xff; 0x9a, 0x9c, 0x9e are kept as letters */
+      if (c == 0x9a || c == 0x9c || c == 0x9e || c == 0x9f) {
+        *d = (c == 0x9f) ? c + 0x60 : c;
+        ctype = grn_char_alpha;
+      } else {
+        *d = c;
+        ctype = grn_char_symbol;
+      }
+      break;
+    case 0x0c :
+      *d = c + 0x20;
+      ctype = grn_char_alpha;
+      break;
+    case 0x0d :
+      /* 0xd7 is a symbol and 0xdf has no upper/lower pair here */
+      *d = (c == 0xd7 || c == 0xdf) ? c : c + 0x20;
+      ctype = (c == 0xd7) ? grn_char_symbol : grn_char_alpha;
+      break;
+    case 0x0e :
+      *d = c;
+      ctype = grn_char_alpha;
+      break;
+    case 0x0f :
+      *d = c;
+      ctype = (c == 0xf7) ? grn_char_symbol : grn_char_alpha;
+      break;
+    default :
+      *d = c;
+      ctype = grn_char_others;
+      break;
+    }
+    d++;
+    length++;
+    if (cp) { *cp++ = ctype; }
+    if (ch) {
+      /* source bytes consumed since the previous emitted character */
+      *ch++ = (int16_t)(s + 1 - s_);
+      s_ = s + 1;
+      while (++d_ < d) { *ch++ = 0; }
+    }
+  }
+  if (cp) { *cp = grn_char_null; }
+  *d = '\0';
+  nstr->n_characters = length;
+  nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
+  return NULL;
+}
+
+/*
+ * Byte-wise normalizer for KOI8-R text.
+ *
+ * args[0] is the grn_string to fill in; nargs and user_data are not used.
+ * Bytes below 0x20 are dropped, spaces are dropped when
+ * GRN_STRING_REMOVE_BLANK is set, ASCII 'A'-'Z' are lower-cased, 0xb3 is
+ * mapped to 0xa3 and 0xe0-0xff are mapped down by 0x20.  A dropped byte
+ * marks the previously emitted character's type with GRN_CHAR_BLANK.
+ *
+ * The input length is taken from nstr->original_length_in_bytes: the
+ * original text is a (pointer, length) pair and is not required to be
+ * NUL-terminated.
+ *
+ * On allocation failure the buffers allocated here are freed and reset to
+ * NULL and the error is reported through ERR().
+ *
+ * Always returns NULL.
+ */
+inline static grn_obj *
+koi8r_normalize(grn_ctx *ctx, int nargs, grn_obj **args,
+                grn_user_data *user_data)
+{
+  grn_string *nstr = (grn_string *)args[0];
+  int16_t *ch;
+  const unsigned char *s, *s_, *e;
+  unsigned char *d, *d0, *d_;
+  uint_least8_t *cp, *ctypes, ctype;
+  size_t size = nstr->original_length_in_bytes, length = 0;
+  int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
+  if (!(nstr->normalized = GRN_MALLOC(size + 1))) {
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[string][koi8r] failed to allocate normalized text space");
+    return NULL;
+  }
+  d0 = (unsigned char *) nstr->normalized;
+  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
+    if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) {
+      GRN_FREE(nstr->normalized);
+      nstr->normalized = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[string][koi8r] failed to allocate checks space");
+      return NULL;
+    }
+  }
+  ch = nstr->checks;
+  if (nstr->flags & GRN_STRING_WITH_TYPES) {
+    if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
+      /* checks exists only when GRN_STRING_WITH_CHECKS was requested */
+      if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
+      GRN_FREE(nstr->normalized); nstr->normalized = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[string][koi8r] failed to allocate character types space");
+      return NULL;
+    }
+  }
+  cp = ctypes = nstr->ctypes;
+  e = (unsigned char *)nstr->original + size;
+  for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
+    unsigned char c = *s;
+    switch (c >> 4) {
+    case 0 :
+    case 1 :
+      /* skip unprintable ascii */
+      if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
+      continue;
+    case 2 :
+      if (c == 0x20) {
+        if (removeblankp) {
+          if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
+          continue;
+        } else {
+          *d = ' ';
+          ctype = GRN_CHAR_BLANK|grn_char_symbol;
+        }
+      } else {
+        *d = c;
+        ctype = grn_char_symbol;
+      }
+      break;
+    case 3 :
+      *d = c;
+      ctype = (c <= 0x39) ? grn_char_digit : grn_char_symbol;
+      break;
+    case 4 :
+      *d = ('A' <= c) ? c + 0x20 : c;
+      ctype = (c == 0x40) ? grn_char_symbol : grn_char_alpha;
+      break;
+    case 5 :
+      *d = (c <= 'Z') ? c + 0x20 : c;
+      ctype = (c <= 0x5a) ? grn_char_alpha : grn_char_symbol;
+      break;
+    case 6 :
+      *d = c;
+      ctype = (c == 0x60) ? grn_char_symbol : grn_char_alpha;
+      break;
+    case 7 :
+      *d = c;
+      ctype = (c <= 0x7a) ? grn_char_alpha : (c == 0x7f ? grn_char_others : grn_char_symbol);
+      break;
+    case 0x0a :
+      *d = c;
+      ctype = (c == 0xa3) ? grn_char_alpha : grn_char_others;
+      break;
+    case 0x0b :
+      /* 0xb3 is the only letter in this row; it maps to 0xa3 */
+      if (c == 0xb3) {
+        *d = c - 0x10;
+        ctype = grn_char_alpha;
+      } else {
+        *d = c;
+        ctype = grn_char_others;
+      }
+      break;
+    case 0x0c :
+    case 0x0d :
+      *d = c;
+      ctype = grn_char_alpha;
+      break;
+    case 0x0e :
+    case 0x0f :
+      /* fold 0xe0-0xff onto 0xc0-0xdf */
+      *d = c - 0x20;
+      ctype = grn_char_alpha;
+      break;
+    default :
+      *d = c;
+      ctype = grn_char_others;
+      break;
+    }
+    d++;
+    length++;
+    if (cp) { *cp++ = ctype; }
+    if (ch) {
+      /* source bytes consumed since the previous emitted character */
+      *ch++ = (int16_t)(s + 1 - s_);
+      s_ = s + 1;
+      while (++d_ < d) { *ch++ = 0; }
+    }
+  }
+  if (cp) { *cp = grn_char_null; }
+  *d = '\0';
+  nstr->n_characters = length;
+  nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
+  return NULL;
+}
+
+/*
+ * Returns the number of bytes of the character introduced by lead byte c
+ * in the given encoding; 1 for encodings handled as single-byte.
+ */
+static int16_t
+grn_fake_string_char_size(grn_encoding encoding, unsigned char c)
+{
+  switch (encoding) {
+  case GRN_ENC_EUC_JP :
+    if ((c >= 0xa1U && c <= 0xfeU) || c == 0x8eU) { return 2; }
+    return (c == 0x8fU) ? 3 : 1;
+  case GRN_ENC_SJIS :
+    return (c >= 0x81U && ((c <= 0x9fU) || (c >= 0xe0U && c <= 0xfcU))) ? 2 : 1;
+  case GRN_ENC_UTF8 :
+    if (!(c & 0x80U)) { return 1; }
+    if (!(c & 0x20U)) { return 2; }
+    return (c & 0x10U) ? 4 : 3;
+  default :
+    return 1;
+  }
+}
+
+/*
+ * Fills in string without normalizing: the normalized text is a
+ * NUL-terminated copy of the original bytes.
+ *
+ * When GRN_STRING_WITH_CHECKS is set, checks[i] holds the byte length of
+ * the character starting at byte i (derived from its lead byte only) and 0
+ * for the remaining bytes of that character.
+ *
+ * Returns string on success.  On allocation failure the error is reported
+ * through ERR(), string is closed (freed) and NULL is returned.
+ */
+static grn_string *
+grn_fake_string_open(grn_ctx *ctx, grn_string *string)
+{
+  /* TODO: support GRN_STRING_REMOVE_BLANK flag and ctypes */
+  grn_string *nstr = string;
+  const char *str;
+  unsigned int str_len;
+
+  str = nstr->original;
+  str_len = nstr->original_length_in_bytes;
+
+  if (!(nstr->normalized = GRN_MALLOC(str_len + 1))) {
+    ERR(GRN_NO_MEMORY_AVAILABLE,
+        "[string][fake] failed to allocate normalized text space");
+    grn_string_close(ctx, (grn_obj *)nstr);
+    return NULL;
+  }
+
+  memcpy(nstr->normalized, str, str_len);
+  nstr->normalized[str_len] = '\0';
+  nstr->normalized_length_in_bytes = str_len;
+
+  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
+    /* bytes still belonging to the character currently being walked */
+    int16_t rest = 0;
+    size_t i;
+    if (!(nstr->checks = (int16_t *) GRN_MALLOC(sizeof(int16_t) * str_len))) {
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[string][fake] failed to allocate checks space");
+      grn_string_close(ctx, (grn_obj *)nstr);
+      return NULL;
+    }
+    for (i = 0; i < str_len; i++) {
+      if (!rest) {
+        rest = grn_fake_string_char_size(nstr->encoding, (unsigned char) str[i]);
+        nstr->checks[i] = rest;
+      } else {
+        nstr->checks[i] = 0;
+      }
+      rest--;
+    }
+  }
+  return nstr;
+}
+
+/*
+ * Creates a grn_string for (str, str_len) in the given encoding.
+ *
+ * str is stored by pointer, not copied.  With a NULL normalizer the text is
+ * copied without normalization; otherwise the built-in normalizer for the
+ * encoding is run.  Returns NULL when str is NULL, str_len is 0, allocation
+ * fails, or ctx->rc is set after normalization.
+ */
+grn_obj *
+grn_string_open_(grn_ctx *ctx, const char *str, unsigned int str_len,
+                 grn_obj *normalizer, int flags, grn_encoding encoding)
+{
+  grn_string *nstr;
+  grn_obj *result;
+  grn_obj *normalizer_args[1];
+
+  if (str == NULL || str_len == 0) {
+    return NULL;
+  }
+
+  nstr = GRN_MALLOCN(grn_string, 1);
+  if (nstr == NULL) {
+    GRN_LOG(ctx, GRN_LOG_ALERT,
+            "[string][open] failed to allocate memory");
+    return NULL;
+  }
+
+  result = (grn_obj *)nstr;
+  GRN_OBJ_INIT(result, GRN_STRING, GRN_OBJ_ALLOCATED, GRN_ID_NIL);
+
+  /* request parameters */
+  nstr->encoding = encoding;
+  nstr->flags = flags;
+  nstr->original = str;
+  nstr->original_length_in_bytes = str_len;
+
+  /* outputs, filled in by the normalizer */
+  nstr->normalized = NULL;
+  nstr->normalized_length_in_bytes = 0;
+  nstr->n_characters = 0;
+  nstr->checks = NULL;
+  nstr->ctypes = NULL;
+
+  if (normalizer == NULL) {
+    return (grn_obj *)grn_fake_string_open(ctx, nstr);
+  }
+
+  normalizer_args[0] = result;
+  switch (encoding) {
+  case GRN_ENC_EUC_JP :
+    eucjp_normalize(ctx, 1, normalizer_args, NULL);
+    break;
+  case GRN_ENC_UTF8 :
+#ifdef WITH_NFKC
+    utf8_normalize(ctx, 1, normalizer_args, NULL);
+#else /* WITH_NFKC */
+    ascii_normalize(ctx, 1, normalizer_args, NULL);
+#endif /* WITH_NFKC */
+    break;
+  case GRN_ENC_SJIS :
+    sjis_normalize(ctx, 1, normalizer_args, NULL);
+    break;
+  case GRN_ENC_LATIN1 :
+    latin1_normalize(ctx, 1, normalizer_args, NULL);
+    break;
+  case GRN_ENC_KOI8R :
+    koi8r_normalize(ctx, 1, normalizer_args, NULL);
+    break;
+  default :
+    ascii_normalize(ctx, 1, normalizer_args, NULL);
+    break;
+  }
+
+  if (!ctx->rc) {
+    return result;
+  }
+  /* an error is recorded on ctx: discard the half-built string */
+  grn_obj_close(ctx, result);
+  return NULL;
+}
+
+/* Same as grn_string_open_() with the encoding taken from ctx. */
+grn_obj *
+grn_string_open(grn_ctx *ctx, const char *str, unsigned int str_len,
+                grn_obj *normalizer, int flags)
+{
+  grn_encoding encoding = ctx->encoding;
+
+  return grn_string_open_(ctx, str, str_len, normalizer, flags, encoding);
+}
+
+/*
+ * Stores the original text pointer and its byte length into the non-NULL
+ * output arguments.  Returns GRN_INVALID_ARGUMENT when string is NULL.
+ */
+grn_rc
+grn_string_get_original(grn_ctx *ctx, grn_obj *string,
+                        const char **original,
+                        unsigned int *length_in_bytes)
+{
+  grn_rc rc = GRN_INVALID_ARGUMENT;
+  grn_string *nstr = (grn_string *)string;
+  GRN_API_ENTER;
+  if (nstr) {
+    if (original) {
+      *original = nstr->original;
+    }
+    if (length_in_bytes) {
+      *length_in_bytes = nstr->original_length_in_bytes;
+    }
+    rc = GRN_SUCCESS;
+  }
+  GRN_API_RETURN(rc);
+}
+
+/* Returns the flags the string was opened with, or 0 when string is NULL. */
+int
+grn_string_get_flags(grn_ctx *ctx, grn_obj *string)
+{
+  int flags;
+  grn_string *nstr = (grn_string *)string;
+  GRN_API_ENTER;
+  flags = nstr ? nstr->flags : 0;
+  GRN_API_RETURN(flags);
+}
+
+/*
+ * Stores the normalized text pointer, its byte length and its character
+ * count into the non-NULL output arguments.  Returns GRN_INVALID_ARGUMENT
+ * when string is NULL.
+ */
+grn_rc
+grn_string_get_normalized(grn_ctx *ctx, grn_obj *string,
+                          const char **normalized,
+                          unsigned int *length_in_bytes,
+                          unsigned int *n_characters)
+{
+  grn_rc rc = GRN_INVALID_ARGUMENT;
+  grn_string *nstr = (grn_string *)string;
+  GRN_API_ENTER;
+  if (nstr) {
+    if (normalized) {
+      *normalized = nstr->normalized;
+    }
+    if (length_in_bytes) {
+      *length_in_bytes = nstr->normalized_length_in_bytes;
+    }
+    if (n_characters) {
+      *n_characters = nstr->n_characters;
+    }
+    rc = GRN_SUCCESS;
+  }
+  GRN_API_RETURN(rc);
+}
+
+/*
+ * Replaces the normalized text of string.  The previous buffer, if any, is
+ * freed with GRN_FREE and the given pointer is stored as-is (not copied).
+ * Returns GRN_INVALID_ARGUMENT when string is NULL.
+ */
+grn_rc
+grn_string_set_normalized(grn_ctx *ctx, grn_obj *string,
+                          char *normalized, unsigned int length_in_bytes,
+                          unsigned int n_characters)
+{
+  grn_rc rc = GRN_INVALID_ARGUMENT;
+  grn_string *nstr = (grn_string *)string;
+  GRN_API_ENTER;
+  if (nstr) {
+    if (nstr->normalized) {
+      GRN_FREE(nstr->normalized);
+    }
+    nstr->normalized = normalized;
+    nstr->normalized_length_in_bytes = length_in_bytes;
+    nstr->n_characters = n_characters;
+    rc = GRN_SUCCESS;
+  }
+  GRN_API_RETURN(rc);
+}
+
+/* Returns the checks array of string, or NULL when string is NULL. */
+const short *
+grn_string_get_checks(grn_ctx *ctx, grn_obj *string)
+{
+  int16_t *checks;
+  grn_string *nstr = (grn_string *)string;
+  GRN_API_ENTER;
+  checks = nstr ? nstr->checks : NULL;
+  GRN_API_RETURN(checks);
+}
+
+/*
+ * Replaces the checks array of string.  The previous array, if any, is
+ * freed with GRN_FREE and the given pointer is stored as-is (not copied).
+ * Returns GRN_INVALID_ARGUMENT when string is NULL.
+ */
+grn_rc
+grn_string_set_checks(grn_ctx *ctx, grn_obj *string, short *checks)
+{
+  grn_rc rc = GRN_INVALID_ARGUMENT;
+  grn_string *nstr = (grn_string *)string;
+  GRN_API_ENTER;
+  if (nstr) {
+    if (nstr->checks) {
+      GRN_FREE(nstr->checks);
+    }
+    nstr->checks = checks;
+    rc = GRN_SUCCESS;
+  }
+  GRN_API_RETURN(rc);
+}
+
+/* Returns the character types array of string, or NULL when string is NULL. */
+const unsigned char *
+grn_string_get_types(grn_ctx *ctx, grn_obj *string)
+{
+  unsigned char *types;
+  grn_string *nstr = (grn_string *)string;
+  GRN_API_ENTER;
+  types = nstr ? nstr->ctypes : NULL;
+  GRN_API_RETURN(types);
+}
+
+/*
+ * Replaces the character types array of string.  The previous array, if
+ * any, is freed with GRN_FREE and the given pointer is stored as-is (not
+ * copied).  Returns GRN_INVALID_ARGUMENT when string is NULL.
+ */
+grn_rc
+grn_string_set_types(grn_ctx *ctx, grn_obj *string, unsigned char *types)
+{
+  grn_rc rc = GRN_INVALID_ARGUMENT;
+  grn_string *nstr = (grn_string *)string;
+  GRN_API_ENTER;
+  if (nstr) {
+    if (nstr->ctypes) {
+      GRN_FREE(nstr->ctypes);
+    }
+    nstr->ctypes = types;
+    rc = GRN_SUCCESS;
+  }
+  GRN_API_RETURN(rc);
+}
+
+/* Returns the encoding of string, or GRN_ENC_NONE when string is NULL. */
+grn_encoding
+grn_string_get_encoding(grn_ctx *ctx, grn_obj *string)
+{
+  grn_encoding encoding;
+  grn_string *nstr = (grn_string *)string;
+  GRN_API_ENTER;
+  encoding = nstr ? nstr->encoding : GRN_ENC_NONE;
+  GRN_API_RETURN(encoding);
+}
+
+/*
+ * Frees the normalized text, ctypes and checks buffers (when present) and
+ * then the string object itself.  The original text is not freed.
+ * Returns GRN_INVALID_ARGUMENT when string is NULL.
+ */
+grn_rc
+grn_string_close(grn_ctx *ctx, grn_obj *string)
+{
+  grn_string *nstr = (grn_string *)string;
+
+  if (!nstr) {
+    return GRN_INVALID_ARGUMENT;
+  }
+  if (nstr->normalized) {
+    GRN_FREE(nstr->normalized);
+  }
+  if (nstr->ctypes) {
+    GRN_FREE(nstr->ctypes);
+  }
+  if (nstr->checks) {
+    GRN_FREE(nstr->checks);
+  }
+  GRN_FREE(string);
+  return GRN_SUCCESS;
+}
Added: lib/string_in.h (+64 -0) 100644
===================================================================
--- /dev/null
+++ lib/string_in.h 2012-06-08 16:39:06 +0900 (a6cc1c9)
@@ -0,0 +1,64 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+ Copyright(C) 2012 Brazil
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#ifndef GRN_STRING_H
+#define GRN_STRING_H
+
+#ifndef GROONGA_IN_H
+# include "groonga_in.h"
+#endif /* GROONGA_IN_H */
+
+#ifndef GRN_CTX_H
+# include "ctx.h"
+#endif /* GRN_CTX_H */
+
+#ifndef GRN_DB_H
+# include "db.h"
+#endif /* GRN_DB_H */
+
+#ifndef GRN_STR_H
+# include "str.h"
+#endif /* GRN_STR_H */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct { /* internal representation behind a GRN_STRING grn_obj */
+ grn_obj_header header;
+ const char *original; /* caller's text; stored by pointer, not freed on close */
+ unsigned int original_length_in_bytes;
+ char *normalized; /* NUL-terminated; freed on close */
+ unsigned int normalized_length_in_bytes;
+ unsigned int n_characters; /* characters in normalized */
+ short *checks; /* NULL unless GRN_STRING_WITH_CHECKS; freed on close */
+ unsigned char *ctypes; /* NULL unless GRN_STRING_WITH_TYPES; freed on close */
+ grn_encoding encoding;
+ int flags; /* GRN_STRING_* flags passed at open */
+} grn_string;
+
+grn_obj *grn_string_open_(grn_ctx *ctx, const char *str, unsigned int str_len,
+ grn_obj *normalizer, int flags, grn_encoding encoding);
+grn_rc grn_string_close(grn_ctx *ctx, grn_obj *string);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* GRN_STRING_H */
Modified: lib/token.c (+52 -28)
===================================================================
--- lib/token.c 2012-06-08 17:38:04 +0900 (deb8bc2)
+++ lib/token.c 2012-06-08 16:39:06 +0900 (3cb8279)
@@ -23,6 +23,7 @@
#include "pat.h"
#include "dat.h"
#include "hash.h"
+#include "string_in.h"
grn_obj *grn_uvector_tokenizer = NULL;
@@ -79,7 +80,7 @@ uvector_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
}
typedef struct {
- grn_str *nstr;
+ grn_obj *nstr;
const uint8_t *delimiter;
uint32_t delimiter_len;
int32_t pos;
@@ -97,7 +98,10 @@ delimited_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data,
const uint8_t *delimiter, uint32_t delimiter_len)
{
grn_obj *str;
+ grn_obj *normalizer = NULL;
int nflags = 0;
+ const char *normalized;
+ unsigned int normalized_length_in_bytes;
grn_delimited_tokenizer *token;
grn_obj_flags table_flags;
if (!(str = grn_ctx_pop(ctx))) {
@@ -110,16 +114,21 @@ delimited_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data,
token->delimiter_len = delimiter_len;
token->pos = 0;
grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL);
- nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
- if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
- nflags, token->encoding))) {
+ if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
+ normalizer = GRN_NORMALIZER_AUTO;
+ }
+ if (!(token->nstr = grn_string_open_(ctx,
+ GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
+ normalizer, nflags, token->encoding))) {
GRN_FREE(token);
- ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
+ ERR(GRN_TOKENIZER_ERROR, "grn_string_open failed at grn_token_open");
return NULL;
}
- token->next = (unsigned char *)token->nstr->norm;
- token->end = token->next + token->nstr->norm_blen;
- token->len = token->nstr->length;
+ grn_string_get_normalized(ctx, token->nstr,
+ &normalized, &normalized_length_in_bytes,
+ &(token->len));
+ token->next = (const unsigned char *)normalized;
+ token->end = token->next + normalized_length_in_bytes;
GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
GRN_UINT32_INIT(&token->stat_, 0);
return NULL;
@@ -154,7 +163,7 @@ static grn_obj *
delimited_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
grn_delimited_tokenizer *token = user_data->ptr;
- grn_str_close(ctx, token->nstr);
+ grn_obj_close(ctx, token->nstr);
GRN_FREE(token);
return NULL;
}
@@ -178,7 +187,7 @@ delimit_null_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_d
/* ngram tokenizer */
typedef struct {
- grn_str *nstr;
+ grn_obj *nstr;
uint8_t uni_alpha;
uint8_t uni_digit;
uint8_t uni_symbol;
@@ -190,7 +199,7 @@ typedef struct {
grn_encoding encoding;
const unsigned char *next;
const unsigned char *end;
- uint_least8_t *ctypes;
+ const uint_least8_t *ctypes;
int32_t len;
uint32_t tail;
grn_obj curr_;
@@ -202,7 +211,10 @@ ngram_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data, uint8_t ngram
uint8_t uni_alpha, uint8_t uni_digit, uint8_t uni_symbol, uint8_t ignore_blank)
{
grn_obj *str;
- int nflags = GRN_STR_REMOVEBLANK|GRN_STR_WITH_CTYPES;
+ grn_obj *normalizer = NULL;
+ int nflags = GRN_STRING_REMOVE_BLANK|GRN_STRING_WITH_TYPES;
+ const char *normalized;
+ unsigned int normalized_length_in_bytes;
grn_ngram_tokenizer *token;
grn_obj_flags table_flags;
if (!(str = grn_ctx_pop(ctx))) {
@@ -220,17 +232,22 @@ ngram_init(grn_ctx *ctx, grn_obj *table, grn_user_data *user_data, uint8_t ngram
token->pos = 0;
token->skip = 0;
grn_table_get_info(ctx, table, &table_flags, &token->encoding, NULL);
- nflags |= (table_flags & GRN_OBJ_KEY_NORMALIZE);
- if (!(token->nstr = grn_str_open_(ctx, GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
- nflags, token->encoding))) {
+ if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
+ normalizer = GRN_NORMALIZER_AUTO;
+ }
+ if (!(token->nstr = grn_string_open_(ctx,
+ GRN_TEXT_VALUE(str), GRN_TEXT_LEN(str),
+ normalizer, nflags, token->encoding))) {
GRN_FREE(token);
- ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
+ ERR(GRN_TOKENIZER_ERROR, "grn_string_open failed at grn_token_open");
return NULL;
}
- token->next = (unsigned char *)token->nstr->norm;
- token->end = token->next + token->nstr->norm_blen;
- token->ctypes = token->nstr->ctypes;
- token->len = token->nstr->length;
+ grn_string_get_normalized(ctx, token->nstr,
+ &normalized, &normalized_length_in_bytes,
+ &(token->len));
+ token->next = (const unsigned char *)normalized;
+ token->end = token->next + normalized_length_in_bytes;
+ token->ctypes = grn_string_get_types(ctx, token->nstr);
GRN_TEXT_INIT(&token->curr_, GRN_OBJ_DO_SHALLOW_COPY);
GRN_UINT32_INIT(&token->stat_, 0);
return NULL;
@@ -283,7 +300,7 @@ ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
grn_ngram_tokenizer *token = user_data->ptr;
const unsigned char *p = token->next, *r = p, *e = token->end;
int32_t len = 0, pos = token->pos + token->skip, status = 0;
- uint_least8_t *cp = token->ctypes ? token->ctypes + pos : NULL;
+ const uint_least8_t *cp = token->ctypes ? token->ctypes + pos : NULL;
if (cp && token->uni_alpha && GRN_STR_CTYPE(*cp) == grn_str_alpha) {
while ((cl = grn_charlen_(ctx, (char *)r, (char *)e, token->encoding))) {
len++;
@@ -371,7 +388,7 @@ static grn_obj *
ngram_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
grn_ngram_tokenizer *token = user_data->ptr;
- grn_str_close(ctx, token->nstr);
+ grn_obj_close(ctx, token->nstr);
GRN_FREE(token);
return NULL;
}
@@ -437,13 +454,20 @@ grn_token_open(grn_ctx *ctx, grn_obj *table, const char *str, size_t str_len,
((grn_proc *)tokenizer)->funcs[PROC_INIT](ctx, 1, &table, &token->pctx.user_data);
grn_obj_close(ctx, &str_);
} else {
- int nflags = table_flags & GRN_OBJ_KEY_NORMALIZE;
- token->nstr = grn_str_open_(ctx, str, str_len, nflags, token->encoding);
+ grn_obj *normalizer = NULL;
+ int nflags = 0;
+ if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
+ normalizer = GRN_NORMALIZER_AUTO;
+ }
+ token->nstr = grn_string_open_(ctx, str, str_len,
+ normalizer, nflags, token->encoding);
if (token->nstr) {
- token->curr = (unsigned char *)token->nstr->norm;
- token->curr_size = token->nstr->norm_blen;
+ const char *normalized;
+ grn_string_get_normalized(ctx, token->nstr,
+ &normalized, &(token->curr_size), NULL);
+ token->curr = (const unsigned char *)normalized;
} else {
- ERR(GRN_TOKENIZER_ERROR, "grn_str_open failed at grn_token_open");
+ ERR(GRN_TOKENIZER_ERROR, "grn_string_open failed at grn_token_open");
}
}
if (ctx->rc) {
@@ -561,7 +585,7 @@ grn_token_close(grn_ctx *ctx, grn_token *token)
return GRN_SUCCESS;
} else {
if (token->nstr) {
- grn_str_close(ctx, token->nstr);
+ grn_obj_close(ctx, token->nstr);
}
return GRN_INVALID_ARGUMENT;
}
Modified: lib/token.h (+1 -1)
===================================================================
--- lib/token.h 2012-06-08 17:38:04 +0900 (fb9a5b6)
+++ lib/token.h 2012-06-08 16:39:06 +0900 (785c52c)
@@ -58,7 +58,7 @@ typedef struct {
grn_obj *tokenizer;
grn_proc_ctx pctx;
uint32_t variant;
- grn_str *nstr;
+ grn_obj *nstr;
} grn_token;
enum {
Modified: lib/tokenizer.c (+15 -5)
===================================================================
--- lib/tokenizer.c 2012-06-08 17:38:04 +0900 (0d767bf)
+++ lib/tokenizer.c 2012-06-08 16:39:06 +0900 (addded9)
@@ -25,6 +25,7 @@
#include "ctx.h"
#include "db.h"
#include "str.h"
+#include "string_in.h"
#include "token.h"
/*
@@ -116,14 +117,23 @@ grn_tokenizer_query *grn_tokenizer_query_create(grn_ctx *ctx,
}
grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL);
{
- grn_str * const str = grn_str_open_(ctx, GRN_TEXT_VALUE(query_str),
- GRN_TEXT_LEN(query_str),
- table_flags & GRN_OBJ_KEY_NORMALIZE,
- table_encoding);
- if (str == NULL) {
+ grn_obj *normalizer = NULL;
+ int flags = 0;
+ grn_obj *normalized_string;
+ if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
+ normalizer = GRN_NORMALIZER_AUTO;
+ }
+ normalized_string = grn_string_open_(ctx,
+ GRN_TEXT_VALUE(query_str),
+ GRN_TEXT_LEN(query_str),
+ normalizer,
+ flags,
+ table_encoding);
+ if (!normalized_string) {
GRN_PLUGIN_FREE(ctx, query);
return NULL;
}
+ query->normalized_query = normalized_string;
memcpy(query_buf, GRN_TEXT_VALUE(query_str), query_length);
query_buf[query_length] = '\0';
query->query_buf = query_buf;