Kouhei Sutou
null+****@clear*****
Wed May 9 18:13:15 JST 2018
Kouhei Sutou 2018-05-09 18:13:15 +0900 (Wed, 09 May 2018) New Revision: 5d2675f7be5e6bb8e7954079283eb5d52bf67264 https://github.com/groonga/groonga/commit/5d2675f7be5e6bb8e7954079283eb5d52bf67264 Message: highligher: implement lexicon mode TODO: * Performance tuning Modified files: lib/highlighter.c Modified: lib/highlighter.c (+262 -27) =================================================================== --- lib/highlighter.c 2018-05-09 17:53:04 +0900 (f87d4e7e5) +++ lib/highlighter.c 2018-05-09 18:13:15 +0900 (ef8e2d6b6) @@ -17,7 +17,10 @@ */ #include "grn.h" +#include "grn_token_cursor.h" + #include "grn_pat.h" +#include "grn_str.h" /* * TODO: @@ -39,11 +42,20 @@ struct _grn_highlighter { } tag; /* For lexicon mode */ - grn_obj *lexicon; - grn_obj keyword_token_ids; + struct { + grn_obj *object; + grn_encoding encoding; + grn_obj *token_id_chunks; + grn_obj token_id_chunk; + grn_obj token_ids; + grn_obj offsets; + grn_obj lengths; + } lexicon; /* For patricia trie mode */ - grn_obj *keywords; + struct { + grn_obj *keywords; + } pat; }; grn_highlighter * @@ -75,10 +87,15 @@ grn_highlighter_open(grn_ctx *ctx) highlighter->tag.close = "</span>"; highlighter->tag.close_length = strlen(highlighter->tag.close); - highlighter->lexicon = NULL; - GRN_RECORD_INIT(&(highlighter->keyword_token_ids), GRN_OBJ_VECTOR, GRN_ID_NIL); + highlighter->lexicon.object = NULL; + highlighter->lexicon.encoding = GRN_ENC_NONE; + highlighter->lexicon.token_id_chunks = NULL; + GRN_TEXT_INIT(&(highlighter->lexicon.token_id_chunk), 0); + GRN_RECORD_INIT(&(highlighter->lexicon.token_ids), GRN_OBJ_VECTOR, GRN_ID_NIL); + GRN_UINT64_INIT(&(highlighter->lexicon.offsets), GRN_OBJ_VECTOR); + GRN_UINT32_INIT(&(highlighter->lexicon.lengths), GRN_OBJ_VECTOR); - highlighter->keywords = NULL; + highlighter->pat.keywords = NULL; GRN_API_RETURN(highlighter); } @@ -93,11 +110,16 @@ grn_highlighter_close(grn_ctx *ctx, GRN_API_RETURN(ctx->rc); } - if (highlighter->keywords) { - grn_obj_close(ctx, highlighter->keywords); + if (highlighter->pat.keywords) { + grn_obj_close(ctx, highlighter->pat.keywords); } - GRN_OBJ_FIN(ctx, &(highlighter->keyword_token_ids)); + if (highlighter->lexicon.token_id_chunks) { + grn_obj_close(ctx, highlighter->lexicon.token_id_chunks); + } + GRN_OBJ_FIN(ctx, &(highlighter->lexicon.token_ids)); + GRN_OBJ_FIN(ctx, &(highlighter->lexicon.offsets)); + GRN_OBJ_FIN(ctx, &(highlighter->lexicon.lengths)); GRN_OBJ_FIN(ctx, &(highlighter->raw_keywords)); GRN_FREE(highlighter); @@ -109,24 +131,106 @@ static void grn_highlighter_prepare_lexicon(grn_ctx *ctx, grn_highlighter *highlighter) { - /* TODO */ + size_t i, n_keywords; + grn_obj *token_id_chunk = &(highlighter->lexicon.token_id_chunk); + + highlighter->lexicon.token_ids.header.domain = + grn_obj_id(ctx, highlighter->lexicon.object); + + grn_table_get_info(ctx, + highlighter->lexicon.object, + NULL, + &(highlighter->lexicon.encoding), + NULL, + NULL, + NULL); + + if (highlighter->lexicon.token_id_chunks) { + grn_obj_close(ctx, highlighter->lexicon.token_id_chunks); + } + highlighter->lexicon.token_id_chunks = + grn_table_create(ctx, + NULL, 0, + NULL, + GRN_OBJ_TABLE_PAT_KEY, + grn_ctx_at(ctx, GRN_DB_SHORT_TEXT), + NULL); + if (!highlighter->lexicon.token_id_chunks) { + grn_rc rc = ctx->rc; + if (rc == GRN_SUCCESS) { + rc = GRN_UNKNOWN_ERROR; + } + ERR(rc, + "[highlighter][prepare][lexicon] " + "failed to create an internal patricia trie: %s", + ctx->errbuf); + return; + } + + n_keywords = grn_vector_size(ctx, &(highlighter->raw_keywords)); + for (i = 0; i < n_keywords; i++) { + const char *keyword; + unsigned int keyword_length; + grn_token_cursor *cursor; + grn_id token_id; + + keyword_length = grn_vector_get_element(ctx, + &(highlighter->raw_keywords), + i, + &keyword, + NULL, + NULL); + cursor = grn_token_cursor_open(ctx, + highlighter->lexicon.object, + keyword, + keyword_length, + GRN_TOKENIZE_ADD, + 0); + if (!cursor) { + continue; + } + while (grn_token_cursor_next(ctx, cursor) != GRN_ID_NIL) { + } + grn_token_cursor_close(ctx, cursor); + + cursor = grn_token_cursor_open(ctx, + highlighter->lexicon.object, + keyword, + keyword_length, + GRN_TOKENIZE_GET, + 0); + if (!cursor) { + continue; + } + GRN_BULK_REWIND(token_id_chunk); + while ((token_id = grn_token_cursor_next(ctx, cursor)) != GRN_ID_NIL) { + GRN_TEXT_PUT(ctx, token_id_chunk, &token_id, sizeof(grn_id)); + } + grn_token_cursor_close(ctx, cursor); + grn_table_add(ctx, + highlighter->lexicon.token_id_chunks, + GRN_TEXT_VALUE(token_id_chunk), + GRN_TEXT_LEN(token_id_chunk), + NULL); + } } static void grn_highlighter_prepare_patricia_trie(grn_ctx *ctx, grn_highlighter *highlighter) { - if (highlighter->keywords) { - grn_obj_close(ctx, highlighter->keywords); + if (highlighter->pat.keywords) { + grn_obj_close(ctx, highlighter->pat.keywords); } - highlighter->keywords = grn_table_create(ctx, - NULL, 0, - NULL, - GRN_OBJ_TABLE_PAT_KEY, - grn_ctx_at(ctx, GRN_DB_SHORT_TEXT), - NULL); - if (!highlighter->keywords) { + highlighter->pat.keywords = + grn_table_create(ctx, + NULL, 0, + NULL, + GRN_OBJ_TABLE_PAT_KEY, + grn_ctx_at(ctx, GRN_DB_SHORT_TEXT), + NULL); + if (!highlighter->pat.keywords) { grn_rc rc = ctx->rc; if (rc == GRN_SUCCESS) { rc = GRN_UNKNOWN_ERROR; @@ -139,7 +243,7 @@ grn_highlighter_prepare_patricia_trie(grn_ctx *ctx, } grn_obj_set_info(ctx, - highlighter->keywords, + highlighter->pat.keywords, GRN_INFO_NORMALIZER, grn_ctx_get(ctx, "NormalizerAuto", -1)); @@ -158,7 +262,7 @@ grn_highlighter_prepare_patricia_trie(grn_ctx *ctx, NULL, NULL); grn_table_add(ctx, - highlighter->keywords, + highlighter->pat.keywords, keyword, keyword_size, NULL); @@ -170,11 +274,12 @@ static void grn_highlighter_prepare(grn_ctx *ctx, grn_highlighter *highlighter) { - if (highlighter->lexicon) { + if (highlighter->lexicon.object) { grn_highlighter_prepare_lexicon(ctx, highlighter); } else { grn_highlighter_prepare_patricia_trie(ctx, highlighter); } + highlighter->need_prepared = GRN_FALSE; } static void @@ -184,7 +289,134 @@ grn_highlighter_highlight_lexicon(grn_ctx *ctx, size_t text_length, grn_obj *output) { - /* TODO */ + grn_token_cursor *cursor; + grn_encoding encoding = highlighter->lexicon.encoding; + grn_obj *token_ids = &(highlighter->lexicon.token_ids); + grn_obj *offsets = &(highlighter->lexicon.offsets); + grn_obj *lengths = &(highlighter->lexicon.lengths); + + GRN_BULK_REWIND(token_ids); + GRN_BULK_REWIND(offsets); + GRN_BULK_REWIND(lengths); + cursor = grn_token_cursor_open(ctx, + highlighter->lexicon.object, + text, + text_length, + GRN_TOKENIZE_ADD, + 0); + if (!cursor) { + ERR(ctx->rc, + "[highlighter][highlight][lexicon] failed to start tokenizing: %s", + ctx->errbuf); + return; + } + + while (cursor->status == GRN_TOKEN_CURSOR_DOING) { + grn_id token_id = grn_token_cursor_next(ctx, cursor); + grn_token *token; + uint32_t length; + + if (token_id == GRN_ID_NIL) { + continue; + } + token = grn_token_cursor_get_token(ctx, cursor); + GRN_RECORD_PUT(ctx, token_ids, token_id); + GRN_UINT64_PUT(ctx, offsets, grn_token_get_source_offset(ctx, token)); + length = grn_token_get_source_length(ctx, token); + if (grn_token_is_overlap(ctx, token)) { + const char *data; + size_t data_length; + int first_character_length; + + data = grn_token_get_data_raw(ctx, token, &data_length); + first_character_length = + grn_charlen_(ctx, data, data + data_length, encoding); + GRN_UINT32_PUT(ctx, lengths, first_character_length); + } else { + GRN_UINT32_PUT(ctx, lengths, length); + } + } + grn_token_cursor_close(ctx, cursor); + + { + grn_pat *chunks = (grn_pat *)(highlighter->lexicon.token_id_chunks); + size_t i, n_token_ids = GRN_BULK_VSIZE(token_ids) / sizeof(grn_id); + grn_id *raw_token_ids = (grn_id *)GRN_BULK_HEAD(token_ids); + uint64_t max_offset = 0; + uint32_t last_length = 0; + grn_bool have_highlight = GRN_FALSE; + + for (i = 0; i < n_token_ids; i++) { + grn_id chunk_id; + uint64_t offset; + uint32_t length; + + chunk_id = grn_pat_lcp_search(ctx, + chunks, + raw_token_ids + i, + (n_token_ids - i) * sizeof(grn_id)); + offset = GRN_UINT64_VALUE_AT(offsets, i); + if (offset == 0 && GRN_TEXT_LEN(output) > 0) { + if (have_highlight) { + break; + } + max_offset = 0; + last_length = 0; + GRN_BULK_REWIND(output); + } + if (chunk_id == GRN_ID_NIL) { + if (offset < max_offset) { + continue; + } + length = GRN_UINT32_VALUE_AT(lengths, i); + grn_text_escape_xml(ctx, + output, + text + offset, + length); + max_offset = offset + 1; + last_length = length; + } else { + uint32_t key_size; + size_t j; + size_t n_ids; + + have_highlight = GRN_TRUE; + + if (last_length > 0 && offset + 1 == max_offset) { + grn_bulk_truncate(ctx, output, GRN_TEXT_LEN(output) - last_length); + max_offset -= last_length; + last_length = 0; + } else if (offset < max_offset) { + continue; + } + + _grn_pat_key(ctx, chunks, chunk_id, &key_size); + n_ids = key_size / sizeof(grn_id); + GRN_TEXT_PUT(ctx, + output, + highlighter->tag.open, + highlighter->tag.open_length); + for (j = 0; j < n_ids; j++) { + offset = GRN_UINT64_VALUE_AT(offsets, i + j); + if (offset < max_offset) { + continue; + } + length = GRN_UINT32_VALUE_AT(lengths, i + j); + grn_text_escape_xml(ctx, + output, + text + offset, + length); + max_offset = offset + 1; + last_length = length; + } + GRN_TEXT_PUT(ctx, + output, + highlighter->tag.close, + highlighter->tag.close_length); + i += n_ids - 1; + } + } + } } static void @@ -206,7 +438,7 @@ grn_highlighter_highlight_patricia_trie(grn_ctx *ctx, size_t chunk_length; n_hits = grn_pat_scan(ctx, - (grn_pat *)(highlighter->keywords), + (grn_pat *)(highlighter->pat.keywords), current, current_length, hits, MAX_N_HITS, &rest); @@ -277,7 +509,7 @@ grn_highlighter_highlight(grn_ctx *ctx, } } - if (highlighter->lexicon) { + if (highlighter->lexicon.object) { grn_highlighter_highlight_lexicon(ctx, highlighter, text, @@ -302,7 +534,10 @@ grn_highlighter_set_lexicon(grn_ctx *ctx, { GRN_API_ENTER; - highlighter->lexicon = lexicon; + if (highlighter->lexicon.object != lexicon) { + highlighter->need_prepared = GRN_TRUE; + highlighter->lexicon.object = lexicon; + } GRN_API_RETURN(ctx->rc); } @@ -313,7 +548,7 @@ grn_highlighter_get_lexicon(grn_ctx *ctx, { GRN_API_ENTER; - GRN_API_RETURN(highlighter->lexicon); + GRN_API_RETURN(highlighter->lexicon.object); } grn_rc -------------- next part -------------- HTML����������������������������... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180509/c8075fe9/attachment-0001.htm