[Groonga-commit] groonga/groonga at 49cae7d [master] TokenNgram: fix wrong first character length

Back to archive index

Kouhei Sutou null+****@clear*****
Tue Jun 26 15:09:40 JST 2018


Kouhei Sutou	2018-06-26 15:09:40 +0900 (Tue, 26 Jun 2018)

  New Revision: 49cae7d24ca79512505abe0fc1a9781ae64b1bf1
  https://github.com/groonga/groonga/commit/49cae7d24ca79512505abe0fc1a9781ae64b1bf1

  Message:
    TokenNgram: fix wrong first character length
    
    The wrong length was reported for parenthesized ideograph characters
    such as U+3231 PARENTHESIZED IDEOGRAPH STOCK.

  Added files:
    test/command/suite/select/function/highlight_html/lexicon/loose_symbol_kabu.expected
    test/command/suite/select/function/highlight_html/lexicon/loose_symbol_kabu.test
    test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_kabu.expected
    test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_kabu.test
  Modified files:
    lib/tokenizers.c

  Modified: lib/tokenizers.c (+3 -1)
===================================================================
--- lib/tokenizers.c    2018-06-26 12:59:24 +0900 (854d5133d)
+++ lib/tokenizers.c    2018-06-26 15:09:40 +0900 (bfd88d7b8)
@@ -416,7 +416,9 @@ ngram_switch_to_loose_mode(grn_ctx *ctx,
             removed_checks = NULL;
           }
           for (i = 0; i < length; i++) {
-            loose_checks[i] += checks[i];
+            if (checks[i] != -1) {
+              loose_checks[i] += checks[i];
+            }
           }
           loose_checks += length;
         }

  Added: test/command/suite/select/function/highlight_html/lexicon/loose_symbol_kabu.expected (+37 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/function/highlight_html/lexicon/loose_symbol_kabu.expected    2018-06-26 15:09:40 +0900 (715103f97)
@@ -0,0 +1,37 @@
+table_create Entries TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Entries body COLUMN_SCALAR ShortText
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer 'TokenNgram("loose_symbol", true,                                   "report_source_location", true)'   --normalizer 'NormalizerNFKC100'
+[[0,0.0,0.0],true]
+column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body
+[[0,0.0,0.0],true]
+load --table Entries
+[
+{"body": "ここは㈱グルンガ"}
+]
+[[0,0.0,0.0],1]
+select Entries   --match_columns body   --query '株グル'   --output_columns 'highlight_html(body, Terms)'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        1
+      ],
+      [
+        [
+          "highlight_html",
+          null
+        ]
+      ],
+      [
+        "ここは<span class=\"keyword\">㈱グル</span>ンガ"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/select/function/highlight_html/lexicon/loose_symbol_kabu.test (+19 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/function/highlight_html/lexicon/loose_symbol_kabu.test    2018-06-26 15:09:40 +0900 (d8e77567a)
@@ -0,0 +1,19 @@
+table_create Entries TABLE_NO_KEY
+column_create Entries body COLUMN_SCALAR ShortText
+
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer 'TokenNgram("loose_symbol", true, \
+                                  "report_source_location", true)' \
+  --normalizer 'NormalizerNFKC100'
+column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body
+
+load --table Entries
+[
+{"body": "ここは㈱グルンガ"}
+]
+
+select Entries \
+  --match_columns body \
+  --query '株グル' \
+  --output_columns 'highlight_html(body, Terms)'
+

  Added: test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_kabu.expected (+154 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_kabu.expected    2018-06-26 15:09:40 +0900 (d8add60e0)
@@ -0,0 +1,154 @@
+tokenize   'TokenNgram("loose_symbol", true,               "report_source_location", true)'   "ここは㈱グルンガ"   'NormalizerNFKC100("report_source_offset", true)'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "ここ",
+      "position": 0,
+      "force_prefix": false,
+      "source_offset": 0,
+      "source_length": 6,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "こは",
+      "position": 1,
+      "force_prefix": false,
+      "source_offset": 3,
+      "source_length": 6,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "は",
+      "position": 2,
+      "force_prefix": false,
+      "source_offset": 6,
+      "source_length": 3,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "(",
+      "position": 3,
+      "force_prefix": false,
+      "source_offset": 9,
+      "source_length": 0,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "株",
+      "position": 4,
+      "force_prefix": false,
+      "source_offset": 9,
+      "source_length": 0,
+      "source_first_character_length": 3
+    },
+    {
+      "value": ")",
+      "position": 5,
+      "force_prefix": false,
+      "source_offset": 9,
+      "source_length": 3,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "グル",
+      "position": 6,
+      "force_prefix": false,
+      "source_offset": 12,
+      "source_length": 6,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "ルン",
+      "position": 7,
+      "force_prefix": false,
+      "source_offset": 15,
+      "source_length": 6,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "ンガ",
+      "position": 8,
+      "force_prefix": false,
+      "source_offset": 18,
+      "source_length": 6,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "￰",
+      "position": 9,
+      "force_prefix": false,
+      "source_offset": 24,
+      "source_length": 0,
+      "source_first_character_length": 0
+    },
+    {
+      "value": "ここ",
+      "position": 10,
+      "force_prefix": false,
+      "source_offset": 0,
+      "source_length": 6,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "こは",
+      "position": 11,
+      "force_prefix": false,
+      "source_offset": 3,
+      "source_length": 6,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "は株",
+      "position": 12,
+      "force_prefix": false,
+      "source_offset": 6,
+      "source_length": 6,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "株グ",
+      "position": 13,
+      "force_prefix": false,
+      "source_offset": 9,
+      "source_length": 6,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "グル",
+      "position": 14,
+      "force_prefix": false,
+      "source_offset": 12,
+      "source_length": 6,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "ルン",
+      "position": 15,
+      "force_prefix": false,
+      "source_offset": 15,
+      "source_length": 6,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "ンガ",
+      "position": 16,
+      "force_prefix": false,
+      "source_offset": 18,
+      "source_length": 6,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "ガ",
+      "position": 17,
+      "force_prefix": false,
+      "source_offset": 21,
+      "source_length": 3,
+      "source_first_character_length": 3
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_kabu.test (+5 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_kabu.test    2018-06-26 15:09:40 +0900 (ac0999366)
@@ -0,0 +1,5 @@
+tokenize \
+  'TokenNgram("loose_symbol", true, \
+              "report_source_location", true)' \
+  "ここは㈱グルンガ" \
+  'NormalizerNFKC100("report_source_offset", true)'
-------------- next part --------------
HTMLの添付ファイルを保管しました...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180626/4df8ecf1/attachment-0001.htm 



More information about the Groonga-commit mailing list
Back to archive index