[Groonga-commit] groonga/groonga at fa48d25 [master] NormalizerNFKC100: add unify_prolonged_sound_mark option

Back to archive index

Kouhei Sutou null+****@clear*****
Wed Apr 18 10:39:39 JST 2018


Kouhei Sutou	2018-04-18 10:39:39 +0900 (Wed, 18 Apr 2018)

  New Revision: fa48d25ff64e72303f263c75e317391a1a3e9e18
  https://github.com/groonga/groonga/commit/fa48d25ff64e72303f263c75e317391a1a3e9e18

  Message:
    NormalizerNFKC100: add unify_prolonged_sound_mark option
    
    NEologd unifies "U+FE63 SMALL HYPHEN-MINUS" and "U+FF0D FULLWIDTH
    HYPHEN-MINUS" to "U+30FC KATAKANA-HIRAGANA PROLONGED SOUND MARK" but
    we unify them to "U+002D HYPHEN-MINUS".

  Added files:
    test/command/suite/normalizers/nfkc100/unify_prolonged_sound_mark.expected
    test/command/suite/normalizers/nfkc100/unify_prolonged_sound_mark.test
  Modified files:
    lib/normalizer.c

  Modified: lib/normalizer.c (+49 -1)
===================================================================
--- lib/normalizer.c    2018-04-18 10:10:47 +0900 (af75019a2)
+++ lib/normalizer.c    2018-04-18 10:39:39 +0900 (fd5e3842a)
@@ -624,6 +624,7 @@ typedef struct {
   grn_bool unify_kana_case;
   grn_bool unify_kana_voiced_sound_mark;
   grn_bool unify_hyphen;
+  grn_bool unify_prolonged_sound_mark;
 } grn_utf8_normalize_options;
 
 static void
@@ -639,6 +640,7 @@ utf8_normalize_options_init(grn_utf8_normalize_options *options,
   options->unify_kana_case = GRN_FALSE;
   options->unify_kana_voiced_sound_mark = GRN_FALSE;
   options->unify_hyphen = GRN_FALSE;
+  options->unify_prolonged_sound_mark = GRN_FALSE;
 }
 
 grn_inline static const unsigned char *
@@ -895,6 +897,34 @@ utf8_normalize_is_hyphen_famity(const unsigned char *utf8_char,
   return GRN_FALSE;
 }
 
+grn_inline static const grn_bool
+utf8_normalize_is_prolonged_sound_mark_famity(const unsigned char *utf8_char,
+                                              size_t length)
+{
+  if (length == 3) {
+    if (utf8_char[0] == 0xe2) {
+      if (utf8_char[1] == 0x80 &&
+          (0x94 <= utf8_char[2] && utf8_char[2] <= 0x95)) {
+        /* U+2014 EM DASH ..
+         * U+2015 HORIZONTAL BAR */
+        return GRN_TRUE;
+      } else if (utf8_char[1] == 0x94 &&
+          (0x80 <= utf8_char[2] && utf8_char[2] <= 0x81)) {
+        /* U+2500 BOX DRAWINGS LIGHT HORIZONTAL ..
+         * U+2501 BOX DRAWINGS HEAVY HORIZONTAL */
+        return GRN_TRUE;
+      }
+    } else if (utf8_char[0] == 0xef) {
+      if (utf8_char[1] == 0xbd && utf8_char[2] == 0xb0) {
+        /* U+FF70 HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
+        return GRN_TRUE;
+      }
+    }
+  }
+
+  return GRN_FALSE;
+}
+
 grn_inline static grn_obj *
 utf8_normalize(grn_ctx *ctx,
                grn_string *nstr,
@@ -1024,6 +1054,9 @@ utf8_normalize(grn_ctx *ctx,
           unsigned char unified_kana_case[3];
           unsigned char unified_kana_voiced_sound_mark[3];
           const unsigned char unified_hyphen[] = {'-'};
+          /* U+30FC KATAKANA-HIRAGANA PROLONGED SOUND MARK */
+          const unsigned char unified_prolonged_sound_mark[] =
+            {0xe3, 0x83, 0xbc};
 
           if (options->unify_kana &&
               char_type == GRN_CHAR_KATAKANA &&
@@ -1073,11 +1106,19 @@ utf8_normalize(grn_ctx *ctx,
           if (options->unify_hyphen) {
             if (utf8_normalize_is_hyphen_famity(p, lp)) {
               p = unified_hyphen;
-              lp = 1;
+              lp = sizeof(unified_hyphen);
               char_type = GRN_CHAR_SYMBOL;
             }
           }
 
+          if (options->unify_prolonged_sound_mark) {
+            if (utf8_normalize_is_prolonged_sound_mark_famity(p, lp)) {
+              p = unified_prolonged_sound_mark;
+              lp = sizeof(unified_prolonged_sound_mark);
+              char_type = GRN_CHAR_KATAKANA;
+            }
+          }
+
           grn_memcpy(d, p, lp);
           p = p_original;
         }
@@ -1571,6 +1612,13 @@ nfkc100_open_options(grn_ctx *ctx,
                                                           raw_options,
                                                           i,
                                                           options->unify_hyphen);
+    } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw,
+                                            "unify_prolonged_sound_mark")) {
+      options->unify_prolonged_sound_mark =
+        grn_vector_get_element_bool(ctx,
+                                    raw_options,
+                                    i,
+                                    options->unify_prolonged_sound_mark);
     }
   } GRN_OPTION_VALUES_EACH_END();
 

  Added: test/command/suite/normalizers/nfkc100/unify_prolonged_sound_mark.expected (+22 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/normalizers/nfkc100/unify_prolonged_sound_mark.expected    2018-04-18 10:39:39 +0900 (f00f2de1b)
@@ -0,0 +1,22 @@
+normalize   'NormalizerNFKC100("unify_prolonged_sound_mark", true)'   "ー—―─━ー"   WITH_TYPES
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  {
+    "normalized": "ーーーーーー",
+    "types": [
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana",
+      "katakana"
+    ],
+    "checks": [
+
+    ]
+  }
+]

  Added: test/command/suite/normalizers/nfkc100/unify_prolonged_sound_mark.test (+4 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/normalizers/nfkc100/unify_prolonged_sound_mark.test    2018-04-18 10:39:39 +0900 (ae8270f44)
@@ -0,0 +1,4 @@
+normalize \
+  'NormalizerNFKC100("unify_prolonged_sound_mark", true)' \
+  "ー—―─━ー" \
+  WITH_TYPES
-------------- next part --------------
HTMLの添付ファイルを保管しました...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180418/0c41099d/attachment-0001.htm 



More information about the Groonga-commit mailing list
Back to archive index