[Groonga-commit] groonga/groonga at ba79b93 [master] TokenNgram: add a missing NULL check

Back to archive index

Kouhei Sutou null+****@clear*****
Mon May 28 14:05:24 JST 2018


Kouhei Sutou	2018-05-28 14:05:24 +0900 (Mon, 28 May 2018)

  New Revision: ba79b939a7861e265539f6aa5e333ee03ad902aa
  https://github.com/groonga/groonga/commit/ba79b939a7861e265539f6aa5e333ee03ad902aa

  Message:
    TokenNgram: add a missing NULL check

  Added files:
    test/command/suite/tokenizers/ngram/report_source_location/include_removed_source_location.expected
    test/command/suite/tokenizers/ngram/report_source_location/include_removed_source_location.test
  Modified files:
    lib/tokenizers.c
    test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.expected
    test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.test

  Modified: lib/tokenizers.c (+22 -7)
===================================================================
--- lib/tokenizers.c    2018-05-28 14:01:54 +0900 (8fd624ce0)
+++ lib/tokenizers.c    2018-05-28 14:05:24 +0900 (6b3667d21)
@@ -268,6 +268,7 @@ typedef struct {
   grn_bool loose_symbol;
   grn_bool loose_blank;
   grn_bool report_source_location;
+  grn_bool include_removed_source_location;
 } grn_ngram_options;
 
 typedef struct {
@@ -309,6 +310,7 @@ ngram_options_init(grn_ngram_options *options, uint8_t unit)
   options->loose_symbol = GRN_FALSE;
   options->loose_blank = GRN_FALSE;
   options->report_source_location = GRN_FALSE;
+  options->include_removed_source_location = GRN_TRUE;
 }
 
 static void
@@ -388,8 +390,10 @@ ngram_switch_to_loose_mode(grn_ctx *ctx,
           (!tokenizer->options.remove_blank &&
            tokenizer->options.loose_blank &&
            GRN_STR_ISBLANK(*types))) {
-        if (!removed_checks) {
-          removed_checks = checks;
+        if (tokenizer->options.include_removed_source_location) {
+          if (!removed_checks) {
+            removed_checks = checks;
+          }
         }
         if (offsets && last_offset == 0) {
           last_offset = *offsets;
@@ -403,12 +407,14 @@ ngram_switch_to_loose_mode(grn_ctx *ctx,
         loose_types++;
         if (loose_checks) {
           size_t i;
-          for (; removed_checks && removed_checks < checks; removed_checks++) {
-            if (*removed_checks > 0) {
-              *loose_checks += *removed_checks;
+          if (tokenizer->options.include_removed_source_location) {
+            for (; removed_checks && removed_checks < checks; removed_checks++) {
+              if (*removed_checks > 0) {
+                *loose_checks += *removed_checks;
+              }
             }
+            removed_checks = NULL;
           }
-          removed_checks = NULL;
           for (i = 0; i < length; i++) {
             loose_checks[i] += checks[i];
           }
@@ -430,7 +436,9 @@ ngram_switch_to_loose_mode(grn_ctx *ctx,
         offsets++;
       }
     }
-    *loose_checks = *checks;
+    if (checks) {
+      *loose_checks = *checks;
+    }
     if (offsets) {
       if (last_offset) {
         *loose_offsets = last_offset;
@@ -700,6 +708,13 @@ ngram_open_options(grn_ctx *ctx,
                                     raw_options,
                                     i,
                                     options->report_source_location);
+    } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw,
+                                            "include_removed_source_location")) {
+      options->include_removed_source_location =
+        grn_vector_get_element_bool(ctx,
+                                    raw_options,
+                                    i,
+                                    options->include_removed_source_location);
     }
   } GRN_OPTION_VALUES_EACH_END();
 

  Added: test/command/suite/tokenizers/ngram/report_source_location/include_removed_source_location.expected (+82 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/ngram/report_source_location/include_removed_source_location.expected    2018-05-28 14:05:24 +0900 (ad2d85f6d)
@@ -0,0 +1,82 @@
+tokenize   'TokenNgram("report_source_location", true,               "include_removed_source_location", false,               "loose_symbol", true)'   "090(1234)56−78"   'NormalizerNFKC100("include_removed_source_location", false,                      "report_source_offset", true)'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    {
+      "value": "090",
+      "position": 0,
+      "force_prefix": false,
+      "source_offset": 0,
+      "source_length": 9,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "(",
+      "position": 1,
+      "force_prefix": false,
+      "source_offset": 9,
+      "source_length": 3,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "1234",
+      "position": 2,
+      "force_prefix": false,
+      "source_offset": 12,
+      "source_length": 8,
+      "source_first_character_length": 3
+    },
+    {
+      "value": ")",
+      "position": 3,
+      "force_prefix": false,
+      "source_offset": 20,
+      "source_length": 3,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "56",
+      "position": 4,
+      "force_prefix": false,
+      "source_offset": 23,
+      "source_length": 4,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "−",
+      "position": 5,
+      "force_prefix": false,
+      "source_offset": 27,
+      "source_length": 3,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "78",
+      "position": 6,
+      "force_prefix": false,
+      "source_offset": 30,
+      "source_length": 6,
+      "source_first_character_length": 3
+    },
+    {
+      "value": "￰",
+      "position": 7,
+      "force_prefix": false,
+      "source_offset": 36,
+      "source_length": 0,
+      "source_first_character_length": 0
+    },
+    {
+      "value": "09012345678",
+      "position": 8,
+      "force_prefix": false,
+      "source_offset": 0,
+      "source_length": 36,
+      "source_first_character_length": 3
+    }
+  ]
+]

  Added: test/command/suite/tokenizers/ngram/report_source_location/include_removed_source_location.test (+7 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/tokenizers/ngram/report_source_location/include_removed_source_location.test    2018-05-28 14:05:24 +0900 (aafbe5592)
@@ -0,0 +1,7 @@
+tokenize \
+  'TokenNgram("report_source_location", true, \
+              "include_removed_source_location", false, \
+              "loose_symbol", true)' \
+  "090(1234)56−78" \
+  'NormalizerNFKC100("include_removed_source_location", false, \
+                     "report_source_offset", true)'

  Modified: test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.expected (+3 -3)
===================================================================
--- test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.expected    2018-05-28 14:01:54 +0900 (63a079929)
+++ test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.expected    2018-05-28 14:05:24 +0900 (496eefd29)
@@ -1,4 +1,4 @@
-tokenize   'TokenNgram("loose_symbol", true,               "report_source_location", true)'   "[クリアコード]"   'NormalizerNFKC100("unify_hyphen_and_prolonged_sound_mark", true,                      "report_source_offset", true)'
+tokenize   'TokenNgram("loose_symbol", true,               "include_removed_source_location", false,               "report_source_location", true)'   "[クリアコード]"   'NormalizerNFKC100("unify_hyphen_and_prolonged_sound_mark", true,                      "include_removed_source_location", false,                      "report_source_offset", true)'
 [
   [
     0,
@@ -84,7 +84,7 @@ tokenize   'TokenNgram("loose_symbol", true,               "report_source_locati
       "force_prefix": false,
       "source_offset": 1,
       "source_length": 6,
-      "source_first_character_length": 4
+      "source_first_character_length": 3
     },
     {
       "value": "リア",
@@ -116,7 +116,7 @@ tokenize   'TokenNgram("loose_symbol", true,               "report_source_locati
       "force_prefix": false,
       "source_offset": 16,
       "source_length": 3,
-      "source_first_character_length": 6
+      "source_first_character_length": 3
     }
   ]
 ]

  Modified: test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.test (+2 -0)
===================================================================
--- test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.test    2018-05-28 14:01:54 +0900 (f1e51e3b9)
+++ test/command/suite/tokenizers/ngram/report_source_location/loose_and_unify.test    2018-05-28 14:05:24 +0900 (8338e1ecb)
@@ -1,6 +1,8 @@
 tokenize \
   'TokenNgram("loose_symbol", true, \
+              "include_removed_source_location", false, \
               "report_source_location", true)' \
   "[クリアコード]" \
   'NormalizerNFKC100("unify_hyphen_and_prolonged_sound_mark", true, \
+                     "include_removed_source_location", false, \
                      "report_source_offset", true)'
-------------- next part --------------
HTMLの添付ファイルを保管しました...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180528/98c10bb4/attachment-0001.htm 



More information about the Groonga-commit mailing list
Back to archive index