[Groonga-commit] groonga/groonga at 24bffe1 [pat-scan-ignore-blank] grn_pat_scan: ignore blanks in target string.

Back to archive index

Kouhei Sutou null+****@clear*****
Fri Mar 10 15:00:36 JST 2017


Kouhei Sutou	2017-03-10 15:00:36 +0900 (Fri, 10 Mar 2017)

  New Revision: 24bffe1a74b6efefaf895c40de94fb5fb809cbc0
  https://github.com/groonga/groonga/commit/24bffe1a74b6efefaf895c40de94fb5fb809cbc0

  Message:
    grn_pat_scan: ignore blanks in target string.
    
    With this change, the highlight family of functions can highlight a
    keyword even when the matching span of the target text contains blanks.
    See the test added in this change for an example.

  Added files:
    test/command/suite/select/function/highlight_html/space_in_target.expected
    test/command/suite/select/function/highlight_html/space_in_target.test
  Modified files:
    lib/pat.c

  Modified: lib/pat.c (+47 -8)
===================================================================
--- lib/pat.c    2017-03-10 14:57:01 +0900 (0fa4da9)
+++ lib/pat.c    2017-03-10 15:00:36 +0900 (5b168ff)
@@ -2161,10 +2161,15 @@ grn_pat_scan(grn_ctx *ctx, grn_pat *pat, const char *str, unsigned int str_len,
     return 0;
   }
   if (pat->normalizer) {
+    int flags =
+      GRN_STRING_REMOVE_BLANK |
+      GRN_STRING_WITH_TYPES |
+      GRN_STRING_WITH_CHECKS;
     grn_obj *nstr = grn_string_open(ctx, str, str_len,
-                                    pat->normalizer, GRN_STRING_WITH_CHECKS);
+                                    pat->normalizer, flags);
     if (nstr) {
       const short *cp = grn_string_get_checks(ctx, nstr);
+      const unsigned char *tp = grn_string_get_types(ctx, nstr);
       unsigned int offset = 0, offset0 = 0;
       unsigned int normalized_length_in_bytes;
       const char *sp, *se;
@@ -2173,18 +2178,52 @@ grn_pat_scan(grn_ctx *ctx, grn_pat *pat, const char *str, unsigned int str_len,
       se = sp + normalized_length_in_bytes;
       while (n < sh_size) {
         if ((tid = grn_pat_lcp_search(ctx, pat, sp, se - sp))) {
+          const char *key;
           uint32_t len;
-          _grn_pat_key(ctx, pat, tid, &len);
+          key = _grn_pat_key(ctx, pat, tid, &len);
           sh[n].id = tid;
           sh[n].offset = (*cp > 0) ? offset : offset0;
-          while (len--) {
-            if (*cp > 0) { offset0 = offset; offset += *cp; }
-            sp++; cp++;
+          if (sh[n].offset > 0 &&
+              GRN_CHAR_IS_BLANK(tp[-1]) &&
+              grn_charlen(ctx, key, key + len) == 1 &&
+              key[0] != ' ') {
+            /* Remove leading spaces. */
+            const char *original_str = str + sh[n].offset;
+            while (grn_charlen(ctx, original_str, str + str_len) == 1 &&
+                   original_str[0] == ' ') {
+              original_str++;
+              sh[n].offset++;
+            }
+          }
+          {
+            grn_bool blank_in_alnum = GRN_FALSE;
+            const unsigned char *start_tp = tp;
+            const unsigned char *blank_in_alnum_check_tp;
+            while (len--) {
+              if (*cp > 0) { offset0 = offset; offset += *cp; tp++; }
+              sp++; cp++;
+            }
+            sh[n].length = offset - sh[n].offset;
+            for (blank_in_alnum_check_tp = start_tp + 1;
+                 blank_in_alnum_check_tp < tp;
+                 blank_in_alnum_check_tp++) {
+#define GRN_CHAR_IS_ALNUM(char_type)                         \
+              (GRN_CHAR_TYPE(char_type) == GRN_CHAR_ALPHA || \
+               GRN_CHAR_TYPE(char_type) == GRN_CHAR_DIGIT)
+              if (GRN_CHAR_IS_BLANK(blank_in_alnum_check_tp[0]) &&
+                  GRN_CHAR_IS_ALNUM(blank_in_alnum_check_tp[-1]) &&
+                  (blank_in_alnum_check_tp + 1) < tp &&
+                  GRN_CHAR_IS_ALNUM(blank_in_alnum_check_tp[1])) {
+                blank_in_alnum = GRN_TRUE;
+              }
+#undef GRN_CHAR_IS_ALNUM
+            }
+            if (!blank_in_alnum) {
+              n++;
+            }
           }
-          sh[n].length = offset - sh[n].offset;
-          n++;
         } else {
-          if (*cp > 0) { offset0 = offset; offset += *cp; }
+          if (*cp > 0) { offset0 = offset; offset += *cp; tp++; }
           do {
             sp++; cp++;
           } while (sp < se && !*cp);

  Added: test/command/suite/select/function/highlight_html/space_in_target.expected (+37 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/function/highlight_html/space_in_target.expected    2017-03-10 15:00:36 +0900 (30e3e0d)
@@ -0,0 +1,37 @@
+table_create Entries TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Entries body COLUMN_SCALAR ShortText
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer TokenBigram --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body
+[[0,0.0,0.0],true]
+load --table Entries
+[
+{"body": "高速な Mroonga ストレージエンジン。 Mr oongaストレージ"}
+]
+[[0,0.0,0.0],1]
+select Entries --output_columns   --match_columns body --query 'Mroongaストレージ'   --output_columns 'highlight_html(body)'
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        1
+      ],
+      [
+        [
+          "highlight_html",
+          null
+        ]
+      ],
+      [
+        "高速な <span class=\"keyword\">Mroonga ストレージ</span>エンジン。 Mr oongaストレージ"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/select/function/highlight_html/space_in_target.test (+14 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/function/highlight_html/space_in_target.test    2017-03-10 15:00:36 +0900 (4a8cd04)
@@ -0,0 +1,14 @@
+table_create Entries TABLE_NO_KEY
+column_create Entries body COLUMN_SCALAR ShortText
+
+table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer TokenBigram --normalizer NormalizerAuto
+column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body
+
+load --table Entries
+[
+{"body": "高速な Mroonga ストレージエンジン。 Mr oongaストレージ"}
+]
+
+select Entries --output_columns \
+  --match_columns body --query 'Mroongaストレージ' \
+  --output_columns 'highlight_html(body)'
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Download 



More information about the Groonga-commit mailing list
Back to archive index