Kouhei Sutou
null+****@clear*****
Fri May 11 15:10:16 JST 2018
Kouhei Sutou  2018-05-11 15:10:16 +0900 (Fri, 11 May 2018)

  New Revision: a2047308a1658603b8da53b763005fd01e6fa6ce
  https://github.com/groonga/groonga/commit/a2047308a1658603b8da53b763005fd01e6fa6ce

  Message:
    highlighter: improve lexicon mode precision

  Added files:
    test/command/suite/select/function/highlight_html/lexicon/include.expected
    test/command/suite/select/function/highlight_html/lexicon/include.test
    test/command/suite/select/function/highlight_html/lexicon/overlap.expected
    test/command/suite/select/function/highlight_html/lexicon/overlap.test
  Modified files:
    lib/highlighter.c

  Modified: lib/highlighter.c (+155 -72)
===================================================================
--- lib/highlighter.c    2018-05-11 14:40:37 +0900 (ef8e2d6b6)
+++ lib/highlighter.c    2018-05-11 15:10:16 +0900 (650f87043)
@@ -22,6 +22,26 @@
 #include "grn_pat.h"
 #include "grn_str.h"
 
+typedef struct {
+  uint64_t offset;
+  uint32_t length;
+  grn_bool is_overlapping;
+  uint8_t first_character_length;
+} grn_highlighter_location;
+
+static int
+grn_highlighter_location_compare(const void *data1, const void *data2)
+{
+  const grn_highlighter_location *location1 = data1;
+  const grn_highlighter_location *location2 = data2;
+
+  if (location1->offset == location2->offset) {
+    return location1->length - location2->length;
+  } else {
+    return location1->offset - location2->offset;
+  }
+}
+
 /*
  * TODO:
  * * Support non HTML mode.
@@ -48,8 +68,8 @@ struct _grn_highlighter {
     grn_obj *token_id_chunks;
     grn_obj token_id_chunk;
     grn_obj token_ids;
-    grn_obj offsets;
-    grn_obj lengths;
+    grn_obj token_locations;
+    grn_obj candidates;
   } lexicon;
 
   /* For patricia trie mode */
@@ -92,8 +112,8 @@ grn_highlighter_open(grn_ctx *ctx)
   highlighter->lexicon.token_id_chunks = NULL;
   GRN_TEXT_INIT(&(highlighter->lexicon.token_id_chunk), 0);
   GRN_RECORD_INIT(&(highlighter->lexicon.token_ids), GRN_OBJ_VECTOR, GRN_ID_NIL);
-  GRN_UINT64_INIT(&(highlighter->lexicon.offsets), GRN_OBJ_VECTOR);
-  GRN_UINT32_INIT(&(highlighter->lexicon.lengths), GRN_OBJ_VECTOR);
+  GRN_TEXT_INIT(&(highlighter->lexicon.token_locations), 0);
+  GRN_TEXT_INIT(&(highlighter->lexicon.candidates), 0);
 
   highlighter->pat.keywords = NULL;
 
@@ -117,9 +137,9 @@ grn_highlighter_close(grn_ctx *ctx,
   if (highlighter->lexicon.token_id_chunks) {
     grn_obj_close(ctx, highlighter->lexicon.token_id_chunks);
   }
+  GRN_OBJ_FIN(ctx, &(highlighter->lexicon.candidates));
+  GRN_OBJ_FIN(ctx, &(highlighter->lexicon.token_locations));
   GRN_OBJ_FIN(ctx, &(highlighter->lexicon.token_ids));
-  GRN_OBJ_FIN(ctx, &(highlighter->lexicon.offsets));
-  GRN_OBJ_FIN(ctx, &(highlighter->lexicon.lengths));
   GRN_OBJ_FIN(ctx, &(highlighter->raw_keywords));
 
   GRN_FREE(highlighter);
@@ -292,12 +312,11 @@ grn_highlighter_highlight_lexicon(grn_ctx *ctx,
   grn_token_cursor *cursor;
   grn_encoding encoding = highlighter->lexicon.encoding;
   grn_obj *token_ids = &(highlighter->lexicon.token_ids);
-  grn_obj *offsets = &(highlighter->lexicon.offsets);
-  grn_obj *lengths = &(highlighter->lexicon.lengths);
+  grn_obj *token_locations = &(highlighter->lexicon.token_locations);
+  grn_obj *candidates = &(highlighter->lexicon.candidates);
 
   GRN_BULK_REWIND(token_ids);
-  GRN_BULK_REWIND(offsets);
-  GRN_BULK_REWIND(lengths);
+  GRN_BULK_REWIND(token_locations);
   cursor = grn_token_cursor_open(ctx,
                                  highlighter->lexicon.object,
                                  text,
@@ -313,107 +332,171 @@ grn_highlighter_highlight_lexicon(grn_ctx *ctx,
 
   while (cursor->status == GRN_TOKEN_CURSOR_DOING) {
     grn_id token_id = grn_token_cursor_next(ctx, cursor);
+    grn_highlighter_location location;
     grn_token *token;
-    uint32_t length;
 
     if (token_id == GRN_ID_NIL) {
       continue;
     }
     token = grn_token_cursor_get_token(ctx, cursor);
     GRN_RECORD_PUT(ctx, token_ids, token_id);
-    GRN_UINT64_PUT(ctx, offsets, grn_token_get_source_offset(ctx, token));
-    length = grn_token_get_source_length(ctx, token);
-    if (grn_token_is_overlap(ctx, token)) {
+    location.offset = grn_token_get_source_offset(ctx, token);
+    location.length = grn_token_get_source_length(ctx, token);
+    location.is_overlapping = grn_token_is_overlap(ctx, token);
+    {
       const char *data;
       size_t data_length;
-      int first_character_length;
 
       data = grn_token_get_data_raw(ctx, token, &data_length);
-      first_character_length =
+      location.first_character_length =
         grn_charlen_(ctx, data, data + data_length, encoding);
-      GRN_UINT32_PUT(ctx, lengths, first_character_length);
-    } else {
-      GRN_UINT32_PUT(ctx, lengths, length);
     }
+    GRN_TEXT_PUT(ctx, token_locations, &location, sizeof(location));
   }
   grn_token_cursor_close(ctx, cursor);
 
+  GRN_BULK_REWIND(candidates);
   {
     grn_pat *chunks = (grn_pat *)(highlighter->lexicon.token_id_chunks);
-    size_t i, n_token_ids = GRN_BULK_VSIZE(token_ids) / sizeof(grn_id);
+    size_t i;
+    size_t n_token_ids = GRN_BULK_VSIZE(token_ids) / sizeof(grn_id);
     grn_id *raw_token_ids = (grn_id *)GRN_BULK_HEAD(token_ids);
-    uint64_t max_offset = 0;
-    uint32_t last_length = 0;
-    grn_bool have_highlight = GRN_FALSE;
+    grn_highlighter_location *raw_token_locations =
+      (grn_highlighter_location *)GRN_BULK_HEAD(token_locations);
 
     for (i = 0; i < n_token_ids; i++) {
       grn_id chunk_id;
-      uint64_t offset;
-      uint32_t length;
 
       chunk_id = grn_pat_lcp_search(ctx,
                                     chunks,
                                     raw_token_ids + i,
                                     (n_token_ids - i) * sizeof(grn_id));
-      offset = GRN_UINT64_VALUE_AT(offsets, i);
-      if (offset == 0 && GRN_TEXT_LEN(output) > 0) {
-        if (have_highlight) {
-          break;
-        }
-        max_offset = 0;
-        last_length = 0;
-        GRN_BULK_REWIND(output);
-      }
       if (chunk_id == GRN_ID_NIL) {
-        if (offset < max_offset) {
-          continue;
-        }
-        length = GRN_UINT32_VALUE_AT(lengths, i);
-        grn_text_escape_xml(ctx,
-                            output,
-                            text + offset,
-                            length);
-        max_offset = offset + 1;
-        last_length = length;
-      } else {
+        continue;
+      }
+
+      {
         uint32_t key_size;
         size_t j;
         size_t n_ids;
+        grn_highlighter_location candidate;
+        grn_highlighter_location *first = raw_token_locations + i;
 
-        have_highlight = GRN_TRUE;
+        _grn_pat_key(ctx, chunks, chunk_id, &key_size);
+        n_ids = key_size / sizeof(grn_id);
+        candidate.offset = first->offset;
+        if (first->is_overlapping && n_ids > 1) {
+          candidate.length = first->first_character_length;
+        } else {
+          candidate.length = first->length;
+        }
+        for (j = 1; j < n_ids; j++) {
+          grn_highlighter_location *current = raw_token_locations + i + j;
+          grn_highlighter_location *previous = current - 1;
+          uint32_t current_length;
+          uint32_t previous_length;
+          if (current->is_overlapping && j + 1 < n_ids) {
+            current_length = current->first_character_length;
+          } else {
+            current_length = current->length;
+          }
+          if (previous->is_overlapping && j + 1 < n_ids) {
+            previous_length = previous->first_character_length;
+          } else {
+            previous_length = previous->length;
+          }
+          if (current->offset == previous->offset) {
+            if (current_length > previous_length) {
+              candidate.length += current_length - previous_length;
+            }
+          } else {
+            candidate.length += current_length;
+          }
+        }
+        GRN_TEXT_PUT(ctx, candidates, &candidate, sizeof(candidate));
+        i += n_ids - 1;
+      }
+    }
+  }
 
-        if (last_length > 0 && offset + 1 == max_offset) {
-          grn_bulk_truncate(ctx, output, GRN_TEXT_LEN(output) - last_length);
-          max_offset -= last_length;
-          last_length = 0;
-        } else if (offset < max_offset) {
+  {
+    size_t i;
+    size_t n_candidates =
+      GRN_BULK_VSIZE(candidates) / sizeof(grn_highlighter_location);
+    grn_highlighter_location *raw_candidates =
+      (grn_highlighter_location *)GRN_BULK_HEAD(candidates);
+
+    if (n_candidates == 0) {
+      grn_text_escape_xml(ctx, output, text, text_length);
+    } else {
+      grn_highlighter_location *previous = NULL;
+      uint64_t offset = 0;
+
+      qsort(raw_candidates,
+            n_candidates,
+            sizeof(grn_highlighter_location),
+            grn_highlighter_location_compare);
+      previous = raw_candidates;
+      for (i = 1; i < n_candidates; i++) {
+        grn_highlighter_location *current = raw_candidates + i;
+        if (previous->offset + previous->length >= current->offset) {
+          if (previous->offset + previous->length >
+              current->offset + current->length) {
+            current->length = previous->length;
+          } else {
+            current->length += current->offset - previous->offset;
+          }
+          current->offset = previous->offset;
+          previous = current;
           continue;
         }
-
-        _grn_pat_key(ctx, chunks, chunk_id, &key_size);
-        n_ids = key_size / sizeof(grn_id);
-        GRN_TEXT_PUT(ctx,
-                     output,
-                     highlighter->tag.open,
-                     highlighter->tag.open_length);
-        for (j = 0; j < n_ids; j++) {
-          offset = GRN_UINT64_VALUE_AT(offsets, i + j);
-          if (offset < max_offset) {
-            continue;
+        if (current->offset > previous->offset) {
+          if (previous->offset > offset) {
+            grn_text_escape_xml(ctx,
+                                output,
+                                text + offset,
+                                previous->offset - offset);
          }
-          length = GRN_UINT32_VALUE_AT(lengths, i + j);
+          GRN_TEXT_PUT(ctx,
+                       output,
+                       highlighter->tag.open,
+                       highlighter->tag.open_length);
           grn_text_escape_xml(ctx,
                               output,
-                              text + offset,
-                              length);
-          max_offset = offset + 1;
-          last_length = length;
+                              text + previous->offset,
+                              previous->length);
+          offset = previous->offset + previous->length;
+          GRN_TEXT_PUT(ctx,
+                       output,
+                       highlighter->tag.close,
+                       highlighter->tag.close_length);
        }
-        GRN_TEXT_PUT(ctx,
-                     output,
-                     highlighter->tag.close,
-                     highlighter->tag.close_length);
-        i += n_ids - 1;
+        previous = current;
+      }
+      if (previous->offset > offset) {
+        grn_text_escape_xml(ctx,
+                            output,
+                            text + offset,
+                            previous->offset - offset);
+      }
+      GRN_TEXT_PUT(ctx,
+                   output,
+                   highlighter->tag.open,
+                   highlighter->tag.open_length);
+      grn_text_escape_xml(ctx,
+                          output,
+                          text + previous->offset,
+                          previous->length);
+      offset = previous->offset + previous->length;
+      GRN_TEXT_PUT(ctx,
+                   output,
+                   highlighter->tag.close,
+                   highlighter->tag.close_length);
+      if (offset < text_length) {
+        grn_text_escape_xml(ctx,
+                            output,
+                            text + offset,
+                            text_length - offset);
       }
     }
   }
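The lib/highlighter.c change above replaces the old single-pass emission (the max_offset/last_length bookkeeping that truncated the output buffer in place) with two passes: collect every matched chunk as a candidate location, qsort the candidates with grn_highlighter_location_compare, merge candidates that touch or overlap, and only then emit the text with open/close tags around each merged range. A minimal standalone sketch of that sort, merge, and emit pass, assuming UTF-8 input; the names location, emit, and highlight are hypothetical, this is plain C rather than the groonga API, and it skips the XML escaping the real code does with grn_text_escape_xml:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct {
  uint64_t offset; /* byte offset of the match in the source text */
  uint32_t length; /* byte length of the match */
} location;

/* Order by offset, then by length, like grn_highlighter_location_compare. */
static int
location_compare(const void *data1, const void *data2)
{
  const location *location1 = data1;
  const location *location2 = data2;

  if (location1->offset != location2->offset) {
    return (location1->offset > location2->offset) ? 1 : -1;
  }
  if (location1->length != location2->length) {
    return (location1->length > location2->length) ? 1 : -1;
  }
  return 0;
}

/* Emit the untagged text before the range, then the tagged range itself. */
static void
emit(const char *text, uint64_t *done, const location *range)
{
  fwrite(text + *done, 1, range->offset - *done, stdout);
  fputs("<span class=\"keyword\">", stdout);
  fwrite(text + range->offset, 1, range->length, stdout);
  fputs("</span>", stdout);
  *done = range->offset + range->length;
}

static void
highlight(const char *text, location *candidates, size_t n_candidates)
{
  uint64_t done = 0;
  location *previous;
  size_t i;

  if (n_candidates == 0) {
    fputs(text, stdout); /* the real code escapes the whole text here */
    return;
  }
  qsort(candidates, n_candidates, sizeof(location), location_compare);
  previous = candidates;
  for (i = 1; i < n_candidates; i++) {
    location *current = candidates + i;
    if (previous->offset + previous->length >= current->offset) {
      /* Touching or overlapping: widen current to cover both ranges. */
      uint64_t previous_end = previous->offset + previous->length;
      uint64_t current_end = current->offset + current->length;
      uint64_t end = (previous_end > current_end) ? previous_end : current_end;
      current->offset = previous->offset;
      current->length = (uint32_t)(end - current->offset);
    } else {
      /* Disjoint: the previous merged range is final. */
      emit(text, &done, previous);
    }
    previous = current;
  }
  emit(text, &done, previous);
  fputs(text + done, stdout); /* trailing text after the last range */
}

int
main(void)
{
  /* The overlap.test scenario below: 'いう' and 'うえお' overlap on 'う'
     in "あいうえおか" (UTF-8, 3 bytes per character). */
  location candidates[] = {{3, 6}, {6, 9}};

  highlight("あいうえおか", candidates, 2);
  putchar('\n'); /* prints: あ<span class="keyword">いうえお</span>か */
  return 0;
}

Merging sorted ranges up front is what lets the new tests below produce a single <span> even when keywords share or contain characters; the old code had to guess by comparing each offset against the last emitted position.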
  Added: test/command/suite/select/function/highlight_html/lexicon/include.expected (+37 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/function/highlight_html/lexicon/include.expected    2018-05-11 15:10:16 +0900 (6e6a42c6a)
@@ -0,0 +1,37 @@
+table_create Entries TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Entries body COLUMN_SCALAR ShortText
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer 'TokenNgram("report_source_location", true)' --normalizer 'NormalizerNFKC100'
+[[0,0.0,0.0],true]
+column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body
+[[0,0.0,0.0],true]
+load --table Entries
+[
+{"body": "あいうえおかきくけこ"}
+]
+[[0,0.0,0.0],1]
+select Entries --match_columns body --query 'うえ いうえお' --output_columns 'highlight_html(body, Terms)'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        1
+      ],
+      [
+        [
+          "highlight_html",
+          null
+        ]
+      ],
+      [
+        "あ<span class=\"keyword\">いうえお</span>かきくけこ"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/select/function/highlight_html/lexicon/include.test (+17 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/function/highlight_html/lexicon/include.test    2018-05-11 15:10:16 +0900 (c5ead274e)
@@ -0,0 +1,17 @@
+table_create Entries TABLE_NO_KEY
+column_create Entries body COLUMN_SCALAR ShortText
+
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer 'TokenNgram("report_source_location", true)' \
+  --normalizer 'NormalizerNFKC100'
+column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body
+
+load --table Entries
+[
+{"body": "あいうえおかきくけこ"}
+]
+
+select Entries \
+  --match_columns body \
+  --query 'うえ いうえお' \
+  --output_columns 'highlight_html(body, Terms)'
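include.test covers containment: the query 'うえ いうえお' matches both the short keyword うえ (bytes 6..12 of the UTF-8 body) and the longer いうえお (bytes 3..15), and the expected output highlights いうえお exactly once. A hypothetical standalone check of the containment branch of the merge step, plain C with assumed byte offsets rather than the groonga API:

#include <assert.h>
#include <stdint.h>

typedef struct {
  uint64_t offset;
  uint32_t length;
} location;

int
main(void)
{
  /* Sorted by (offset, length): いうえお comes first (smaller offset). */
  location previous = {3, 12}; /* いうえお: bytes 3..15 */
  location current  = {6, 6};  /* うえ: bytes 6..12, inside the range above */

  if (previous.offset + previous.length >= current.offset) {
    if (previous.offset + previous.length > current.offset + current.length) {
      /* Contained: the merged range keeps the longer length. */
      current.length = previous.length;
    } else {
      current.length += (uint32_t)(current.offset - previous.offset);
    }
    current.offset = previous.offset;
  }
  /* One merged range covering exactly いうえお, highlighted once. */
  assert(current.offset == 3);
  assert(current.length == 12);
  return 0;
}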
  Added: test/command/suite/select/function/highlight_html/lexicon/overlap.expected (+37 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/function/highlight_html/lexicon/overlap.expected    2018-05-11 15:10:16 +0900 (7bde0fd8a)
@@ -0,0 +1,37 @@
+table_create Entries TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Entries body COLUMN_SCALAR ShortText
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer 'TokenNgram("report_source_location", true)' --normalizer 'NormalizerNFKC100'
+[[0,0.0,0.0],true]
+column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body
+[[0,0.0,0.0],true]
+load --table Entries
+[
+{"body": "あいうえおか"}
+]
+[[0,0.0,0.0],1]
+select Entries --match_columns body --query 'いう うえお' --output_columns 'highlight_html(body, Terms)'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        1
+      ],
+      [
+        [
+          "highlight_html",
+          null
+        ]
+      ],
+      [
+        "あ<span class=\"keyword\">いうえお</span>か"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/select/function/highlight_html/lexicon/overlap.test (+17 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/function/highlight_html/lexicon/overlap.test    2018-05-11 15:10:16 +0900 (a7357b5d0)
@@ -0,0 +1,17 @@
+table_create Entries TABLE_NO_KEY
+column_create Entries body COLUMN_SCALAR ShortText
+
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer 'TokenNgram("report_source_location", true)' \
+  --normalizer 'NormalizerNFKC100'
+column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body
+
+load --table Entries
+[
+{"body": "あいうえおか"}
+]
+
+select Entries \
+  --match_columns body \
+  --query 'いう うえお' \
+  --output_columns 'highlight_html(body, Terms)'
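overlap.test covers partially overlapping keywords (いう and うえお share う). Before any merging happens, each matched chunk's byte length is accumulated by the new candidate-building loop: with TokenNgram, a chunk such as いうえお arrives as overlapping bigrams いう/うえ/えお, and every overlapping token except the last contributes only its first character's byte length. A hypothetical standalone check of that accumulation; the offsets and overlap flags are assumed values, not the groonga API:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

typedef struct {
  uint64_t offset;
  uint32_t length;
  int is_overlapping;
  uint8_t first_character_length;
} token_location;

int
main(void)
{
  /* いうえお in "あいうえおかきくけこ" as overlapping bigrams;
     every character is 3 bytes in UTF-8. */
  token_location tokens[] = {
    {3, 6, 1, 3}, /* いう */
    {6, 6, 1, 3}, /* うえ */
    {9, 6, 1, 3}, /* えお */
  };
  size_t n_ids = sizeof(tokens) / sizeof(tokens[0]);
  uint64_t offset = tokens[0].offset;
  uint32_t length;
  size_t j;

  /* The first token also contributes only its first character when it
     overlaps and more tokens follow. */
  length = (tokens[0].is_overlapping && n_ids > 1)
             ? tokens[0].first_character_length
             : tokens[0].length;
  for (j = 1; j < n_ids; j++) {
    token_location *current = tokens + j;
    token_location *previous = current - 1;
    uint32_t current_length =
      (current->is_overlapping && j + 1 < n_ids)
        ? current->first_character_length /* shared prefix: count 1 char */
        : current->length;                /* last token: count everything */
    uint32_t previous_length =
      (previous->is_overlapping && j + 1 < n_ids)
        ? previous->first_character_length
        : previous->length;
    if (current->offset == previous->offset) {
      /* Same position: count only the extra bytes, if any. */
      if (current_length > previous_length) {
        length += current_length - previous_length;
      }
    } else {
      length += current_length;
    }
  }
  /* Bytes 3..15 are exactly いうえお: one precise candidate range. */
  assert(offset == 3);
  assert(length == 12);
  return 0;
}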
"あ<span class=\"keyword\">いうえお</span>かきくけこ" + ] + ] + ] +] Added: test/command/suite/select/function/highlight_html/lexicon/include.test (+17 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/function/highlight_html/lexicon/include.test 2018-05-11 15:10:16 +0900 (c5ead274e) @@ -0,0 +1,17 @@ +table_create Entries TABLE_NO_KEY +column_create Entries body COLUMN_SCALAR ShortText + +table_create Terms TABLE_PAT_KEY ShortText \ + --default_tokenizer 'TokenNgram("report_source_location", true)' \ + --normalizer 'NormalizerNFKC100' +column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body + +load --table Entries +[ +{"body": "あいうえおかきくけこ"} +] + +select Entries \ + --match_columns body \ + --query 'うえ いうえお' \ + --output_columns 'highlight_html(body, Terms)' Added: test/command/suite/select/function/highlight_html/lexicon/overlap.expected (+37 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/function/highlight_html/lexicon/overlap.expected 2018-05-11 15:10:16 +0900 (7bde0fd8a) @@ -0,0 +1,37 @@ +table_create Entries TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Entries body COLUMN_SCALAR ShortText +[[0,0.0,0.0],true] +table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer 'TokenNgram("report_source_location", true)' --normalizer 'NormalizerNFKC100' +[[0,0.0,0.0],true] +column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body +[[0,0.0,0.0],true] +load --table Entries +[ +{"body": "あいうえおか"} +] +[[0,0.0,0.0],1] +select Entries --match_columns body --query 'いう うえお' --output_columns 'highlight_html(body, Terms)' +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 1 + ], + [ + [ + "highlight_html", + null + ] + ], + [ + "あ<span class=\"keyword\">いうえお</span>か" + ] + ] + ] +] Added: test/command/suite/select/function/highlight_html/lexicon/overlap.test (+17 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/function/highlight_html/lexicon/overlap.test 2018-05-11 15:10:16 +0900 (a7357b5d0) @@ -0,0 +1,17 @@ +table_create Entries TABLE_NO_KEY +column_create Entries body COLUMN_SCALAR ShortText + +table_create Terms TABLE_PAT_KEY ShortText \ + --default_tokenizer 'TokenNgram("report_source_location", true)' \ + --normalizer 'NormalizerNFKC100' +column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body + +load --table Entries +[ +{"body": "あいうえおか"} +] + +select Entries \ + --match_columns body \ + --query 'いう うえお' \ + --output_columns 'highlight_html(body, Terms)' -------------- next part -------------- HTML����������������������������... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180511/48f09772/attachment-0001.htm