[Groonga-commit] groonga/groonga at 0d17b2e [master] NormalizerNFKC100: add unify_hyphen_and_prolonged_sound_mark option

Back to archive index

Kouhei Sutou null+****@clear*****
Wed Apr 18 10:52:30 JST 2018


Kouhei Sutou	2018-04-18 10:52:30 +0900 (Wed, 18 Apr 2018)

  New Revision: 0d17b2e296e764bb43661a6fdf6a446667ebbe7a
  https://github.com/groonga/groonga/commit/0d17b2e296e764bb43661a6fdf6a446667ebbe7a

  Message:
    NormalizerNFKC100: add unify_hyphen_and_prolonged_sound_mark option

  Added files:
    test/command/suite/normalizers/nfkc100/unify_hyphen_and_prolonged_sound_mark.expected
    test/command/suite/normalizers/nfkc100/unify_hyphen_and_prolonged_sound_mark.test
  Modified files:
    lib/normalizer.c

  Modified: lib/normalizer.c (+29 -1)
===================================================================
--- lib/normalizer.c    2018-04-18 10:39:39 +0900 (fd5e3842a)
+++ lib/normalizer.c    2018-04-18 10:52:30 +0900 (6ea189872)
@@ -625,6 +625,7 @@ typedef struct {
   grn_bool unify_kana_voiced_sound_mark;
   grn_bool unify_hyphen;
   grn_bool unify_prolonged_sound_mark;
+  grn_bool unify_hyphen_and_prolonged_sound_mark;
 } grn_utf8_normalize_options;
 
 static void
@@ -641,6 +642,7 @@ utf8_normalize_options_init(grn_utf8_normalize_options *options,
   options->unify_kana_voiced_sound_mark = GRN_FALSE;
   options->unify_hyphen = GRN_FALSE;
   options->unify_prolonged_sound_mark = GRN_FALSE;
+  options->unify_hyphen_and_prolonged_sound_mark = GRN_FALSE;
 }
 
 grn_inline static const unsigned char *
@@ -854,7 +856,12 @@ grn_inline static const grn_bool
 utf8_normalize_is_hyphen_famity(const unsigned char *utf8_char,
                                 size_t length)
 {
-  if (length == 2) {
+  if (length == 1) {
+    if (utf8_char[0] == '-') {
+      /* U+002D HYPHEN-MINUS */
+      return GRN_TRUE;
+    }
+  } else if (length == 2) {
     switch (utf8_char[0]) {
     case 0xcb :
       if (utf8_char[1] == 0x97) {
@@ -914,6 +921,11 @@ utf8_normalize_is_prolonged_sound_mark_famity(const unsigned char *utf8_char,
          * U+2501 BOX DRAWINGS HEAVY HORIZONTAL */
         return GRN_TRUE;
       }
+    } else if (utf8_char[0] == 0xe3) {
+      if (utf8_char[1] == 0x83 && utf8_char[2] == 0xbc) {
+        /* U+30FC KATAKANA-HIRAGANA PROLONGED SOUND MARK */
+        return GRN_TRUE;
+      }
     } else if (utf8_char[0] == 0xef) {
       if (utf8_char[1] == 0xbd && utf8_char[2] == 0xb0) {
         /* U+FF70 HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
@@ -1119,6 +1131,15 @@ utf8_normalize(grn_ctx *ctx,
             }
           }
 
+          if (options->unify_hyphen_and_prolonged_sound_mark) {
+            if (utf8_normalize_is_hyphen_famity(p, lp) ||
+                utf8_normalize_is_prolonged_sound_mark_famity(p, lp)) {
+              p = unified_hyphen;
+              lp = sizeof(unified_hyphen);
+              char_type = GRN_CHAR_SYMBOL;
+            }
+          }
+
           grn_memcpy(d, p, lp);
           p = p_original;
         }
@@ -1619,6 +1640,13 @@ nfkc100_open_options(grn_ctx *ctx,
                                     raw_options,
                                     i,
                                     options->unify_prolonged_sound_mark);
+    } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw,
+                                            "unify_hyphen_and_prolonged_sound_mark")) {
+      options->unify_hyphen_and_prolonged_sound_mark =
+        grn_vector_get_element_bool(ctx,
+                                    raw_options,
+                                    i,
+                                    options->unify_hyphen_and_prolonged_sound_mark);
     }
   } GRN_OPTION_VALUES_EACH_END();
 

  Added: test/command/suite/normalizers/nfkc100/unify_hyphen_and_prolonged_sound_mark.expected (+37 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/normalizers/nfkc100/unify_hyphen_and_prolonged_sound_mark.expected    2018-04-18 10:52:30 +0900 (1f8c926e2)
@@ -0,0 +1,37 @@
+normalize   'NormalizerNFKC100("unify_hyphen_and_prolonged_sound_mark", true)'   "-˗֊‐‑‒–⁃⁻₋− ﹣- ー—―─━ー"   WITH_TYPES
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  {
+    "normalized": "----------- -- ------",
+    "types": [
+      "symbol",
+      "symbol",
+      "symbol",
+      "symbol",
+      "symbol",
+      "symbol",
+      "symbol",
+      "symbol",
+      "symbol",
+      "symbol",
+      "symbol",
+      "others",
+      "symbol",
+      "symbol",
+      "others",
+      "symbol",
+      "symbol",
+      "symbol",
+      "symbol",
+      "symbol",
+      "symbol"
+    ],
+    "checks": [
+
+    ]
+  }
+]

  Added: test/command/suite/normalizers/nfkc100/unify_hyphen_and_prolonged_sound_mark.test (+4 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/normalizers/nfkc100/unify_hyphen_and_prolonged_sound_mark.test    2018-04-18 10:52:30 +0900 (fc9898b11)
@@ -0,0 +1,4 @@
+normalize \
+  'NormalizerNFKC100("unify_hyphen_and_prolonged_sound_mark", true)' \
+  "-˗֊‐‑‒–⁃⁻₋− ﹣- ー—―─━ー" \
+  WITH_TYPES
-------------- next part --------------
HTMLの添付ファイルを保管しました...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180418/0b315666/attachment-0001.htm 



More information about the Groonga-commit mailing list
Back to archive index