[Groonga-commit] groonga/groonga at 1974161 [master] NormalizerNFKC100: Add unify_middle_dot option

Back to archive index

Kouhei Sutou null+****@clear*****
Mon May 14 14:12:22 JST 2018


Kouhei Sutou	2018-05-14 14:12:22 +0900 (Mon, 14 May 2018)

  New Revision: 19741615296c8e3aa48bb087168b1b47492f83ab
  https://github.com/groonga/groonga/commit/19741615296c8e3aa48bb087168b1b47492f83ab

  Message:
    NormalizerNFKC100: Add unify_middle_dot option

  Added files:
    test/command/suite/normalizers/nfkc100/unify_middle_dot.expected
    test/command/suite/normalizers/nfkc100/unify_middle_dot.test
  Modified files:
    lib/normalizer.c

  Modified: lib/normalizer.c (+58 -0)
===================================================================
--- lib/normalizer.c    2018-05-14 14:12:09 +0900 (ea218b0ac)
+++ lib/normalizer.c    2018-05-14 14:12:22 +0900 (0e131300e)
@@ -626,6 +626,7 @@ typedef struct {
   grn_bool unify_hyphen;
   grn_bool unify_prolonged_sound_mark;
   grn_bool unify_hyphen_and_prolonged_sound_mark;
+  grn_bool unify_middle_dot;
 } grn_utf8_normalize_options;
 
 static void
@@ -643,6 +644,7 @@ utf8_normalize_options_init(grn_utf8_normalize_options *options,
   options->unify_hyphen = GRN_FALSE;
   options->unify_prolonged_sound_mark = GRN_FALSE;
   options->unify_hyphen_and_prolonged_sound_mark = GRN_FALSE;
+  options->unify_middle_dot = GRN_FALSE;
 }
 
 grn_inline static const unsigned char *
@@ -941,6 +943,46 @@ utf8_normalize_is_prolonged_sound_mark_famity(const unsigned char *utf8_char,
   return GRN_FALSE;
 }
 
+grn_inline static grn_bool
+utf8_normalize_is_middle_dot_family(const unsigned char *utf8_char,
+                                    size_t length)
+{
+  if (length == 3) {
+    if (utf8_char[0] == 0xe1) {
+      if (utf8_char[1] == 0x90 && utf8_char[2] == 0xa7) {
+        /* U+1427 CANADIAN SYLLABICS FINAL MIDDLE DOT */
+        return GRN_TRUE;
+      }
+    } else if (utf8_char[0] == 0xe2) {
+      if (utf8_char[1] == 0x80 && utf8_char[2] == 0xa2) {
+        /* U+2022 BULLET */
+        return GRN_TRUE;
+      } else if (utf8_char[1] == 0x88 && utf8_char[2] == 0x99) {
+        /* U+2219 BULLET OPERATOR */
+        return GRN_TRUE;
+      } else if (utf8_char[1] == 0x8b && utf8_char[2] == 0x85) {
+        /* U+22C5 DOT OPERATOR */
+        return GRN_TRUE;
+      } else if (utf8_char[1] == 0xb8 && utf8_char[2] == 0xb1) {
+        /* U+2E31 WORD SEPARATOR MIDDLE DOT */
+        return GRN_TRUE;
+      }
+    } else if (utf8_char[0] == 0xe3) {
+      if (utf8_char[1] == 0x83 && utf8_char[2] == 0xbb) {
+        /* U+30FB KATAKANA MIDDLE DOT */
+        return GRN_TRUE;
+      }
+    } else if (utf8_char[0] == 0xef) {
+      if (utf8_char[1] == 0xbd && utf8_char[2] == 0xa5) {
+        /* U+FF65 HALFWIDTH KATAKANA MIDDLE DOT */
+        return GRN_TRUE;
+      }
+    }
+  }
+
+  return GRN_FALSE;
+}
+
 grn_inline static grn_obj *
 utf8_normalize(grn_ctx *ctx,
                grn_string *nstr,
@@ -1073,6 +1115,8 @@ utf8_normalize(grn_ctx *ctx,
           /* U+30FC KATAKANA-HIRAGANA PROLONGED SOUND MARK */
           const unsigned char unified_prolonged_sound_mark[] =
             {0xe3, 0x83, 0xbc};
+          /* U+00B7 MIDDLE DOT */
+          const unsigned char unified_middle_dot[] = {0xc2, 0xb7};
 
           if (options->unify_kana &&
               char_type == GRN_CHAR_KATAKANA &&
@@ -1144,6 +1188,14 @@ utf8_normalize(grn_ctx *ctx,
             }
           }
 
+          if (options->unify_middle_dot) {
+            if (utf8_normalize_is_middle_dot_family(p, lp)) {
+              p = unified_middle_dot;
+              lp = sizeof(unified_middle_dot);
+              char_type = GRN_CHAR_SYMBOL;
+            }
+          }
+
           grn_memcpy(d, p, lp);
           p = p_original;
         }
@@ -1651,6 +1703,12 @@ nfkc100_open_options(grn_ctx *ctx,
                                     raw_options,
                                     i,
                                     options->unify_hyphen_and_prolonged_sound_mark);
+    } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "unify_middle_dot")) {
+      options->unify_middle_dot =
+        grn_vector_get_element_bool(ctx,
+                                    raw_options,
+                                    i,
+                                    options->unify_middle_dot);
     }
   } GRN_OPTION_VALUES_EACH_END();
 

  Added: test/command/suite/normalizers/nfkc100/unify_middle_dot.expected (+24 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/normalizers/nfkc100/unify_middle_dot.expected    2018-05-14 14:12:22 +0900 (2b3413af3)
@@ -0,0 +1,24 @@
+normalize   'NormalizerNFKC100("unify_middle_dot", true)'   "·ᐧ•∙⋅⸱・･"   WITH_TYPES
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  {
+    "normalized": "········",
+    "types": [
+      "symbol",
+      "symbol",
+      "symbol",
+      "symbol",
+      "symbol",
+      "symbol",
+      "symbol",
+      "symbol"
+    ],
+    "checks": [
+
+    ]
+  }
+]

  Added: test/command/suite/normalizers/nfkc100/unify_middle_dot.test (+4 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/normalizers/nfkc100/unify_middle_dot.test    2018-05-14 14:12:22 +0900 (4336e92e1)
@@ -0,0 +1,4 @@
+normalize \
+  'NormalizerNFKC100("unify_middle_dot", true)' \
+  "·ᐧ•∙⋅⸱・･" \
+  WITH_TYPES
-------------- next part --------------
An HTML attachment was scrubbed...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180514/a5444000/attachment-0001.htm 



More information about the Groonga-commit mailing list
Back to archive index