[Groonga-commit] groonga/groonga-normalizer-mysql at acc4bdf [master] Support checks

Back to archive index

Kouhei Sutou null+****@clear*****
Wed Jan 29 00:11:08 JST 2014


Kouhei Sutou	2014-01-29 00:11:08 +0900 (Wed, 29 Jan 2014)

  New Revision: acc4bdfd606756a97bde78b324f94565a4dece19
  https://github.com/groonga/groonga-normalizer-mysql/commit/acc4bdfd606756a97bde78b324f94565a4dece19

  Message:
    Support checks

  Added files:
    test/suite/general_ci/with_checks.expected
    test/suite/general_ci/with_checks.test
    test/suite/unicode_ci/with_checks.expected
    test/suite/unicode_ci/with_checks.test
    test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/remobe_blank.expected
    test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/remobe_blank.test
    test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/with_checks.expected
    test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/with_checks.test
  Modified files:
    normalizers/mysql.c
    test/suite/general_ci/remove_blank.expected
    test/suite/general_ci/remove_blank.test
    test/suite/unicode_ci/remove_blank.expected
    test/suite/unicode_ci/remove_blank.test

  Modified: normalizers/mysql.c (+33 -1)
===================================================================
--- normalizers/mysql.c    2014-01-28 23:03:11 +0900 (1cd32c8)
+++ normalizers/mysql.c    2014-01-29 00:11:08 +0900 (1f0bb4d)
@@ -1,6 +1,6 @@
 /* -*- c-basic-offset: 2 -*- */
 /*
-  Copyright(C) 2013  Kouhei Sutou <kou ＠ clear-code.com>
+  Copyright(C) 2013-2014  Kouhei Sutou <kou ＠ clear-code.com>
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public
@@ -47,6 +47,7 @@ typedef grn_bool (*normalizer_func)(grn_ctx *ctx,
                                     int rest_length,
                                     uint32_t **normalize_table,
                                     char *normalized,
+                                    unsigned int *normalized_characer_length,
                                     unsigned int *normalized_length_in_bytes,
                                     unsigned int *normalized_n_characters);
 
@@ -179,6 +180,7 @@ static inline void
 normalize_character(const char *utf8, int character_length,
                     uint32_t **normalize_table,
                     char *normalized,
+                    unsigned int *normalized_character_length,
                     unsigned int *normalized_length_in_bytes,
                     unsigned int *normalized_n_characters)
 {
@@ -192,6 +194,7 @@ normalize_character(const char *utf8, int character_length,
     if (normalized_code != 0) {
       n_bytes = unichar_to_utf8(normalized_code,
                                 normalized + *normalized_length_in_bytes);
+      *normalized_character_length = n_bytes;
       *normalized_length_in_bytes += n_bytes;
     }
   } else {
@@ -199,6 +202,7 @@ normalize_character(const char *utf8, int character_length,
     for (i = 0; i < character_length; i++) {
       normalized[*normalized_length_in_bytes + i] = utf8[i];
     }
+    *normalized_character_length = character_length;
     *normalized_length_in_bytes += character_length;
   }
   (*normalized_n_characters)++;
@@ -298,6 +302,8 @@ normalize(grn_ctx *ctx, grn_obj *string,
   unsigned int normalized_n_characters = 0;
   unsigned char *types = NULL;
   unsigned char *current_type = NULL;
+  short *checks = NULL;
+  short *current_check = NULL;
   grn_encoding encoding;
   int flags;
   grn_bool remove_blank_p;
@@ -315,6 +321,12 @@ normalize(grn_ctx *ctx, grn_obj *string,
     types = GRN_PLUGIN_MALLOC(ctx, max_normalized_n_characters);
     current_type = types;
   }
+  if (flags & GRN_STRING_WITH_CHECKS) {
+    unsigned int max_normalized_length_in_bytes = original_length_in_bytes + 1;
+    checks = GRN_PLUGIN_MALLOC(ctx, max_normalized_length_in_bytes);
+    current_check = checks;
+    current_check[0] = 0;
+  }
   rest = original;
   rest_length = original_length_in_bytes;
   while (rest_length > 0) {
@@ -329,8 +341,12 @@ normalize(grn_ctx *ctx, grn_obj *string,
       if (current_type > types) {
         current_type[-1] |= GRN_CHAR_BLANK;
       }
+      if (current_check) {
+        current_check[0]++;
+      }
     } else {
       grn_bool custom_normalized = GRN_FALSE;
+      unsigned int normalized_character_length;
       if (custom_normalizer) {
         custom_normalized = custom_normalizer(ctx,
                                               rest,
@@ -338,12 +354,14 @@ normalize(grn_ctx *ctx, grn_obj *string,
                                               rest_length - character_length,
                                               normalize_table,
                                               normalized,
+                                              &normalized_character_length,
                                               &normalized_length_in_bytes,
                                               &normalized_n_characters);
       }
       if (!custom_normalized) {
         normalize_character(rest, character_length, normalize_table,
                             normalized,
+                            &normalized_character_length,
                             &normalized_length_in_bytes,
                             &normalized_n_characters);
       }
@@ -355,6 +373,16 @@ normalize(grn_ctx *ctx, grn_obj *string,
           grn_nfkc_char_type((unsigned char *)current_normalized);
         current_type++;
       }
+      if (current_check) {
+        unsigned int i;
+        current_check[0] += character_length;
+        current_check++;
+        for (i = 1; i < normalized_character_length; i++) {
+          current_check[0] = 0;
+          current_check++;
+        }
+        current_check[0] = 0;
+      }
     }
 
     rest += character_length;
@@ -382,6 +410,7 @@ normalize(grn_ctx *ctx, grn_obj *string,
                             normalized_length_in_bytes,
                             normalized_n_characters);
   grn_string_set_types(ctx, string, types);
+  grn_string_set_checks(ctx, string, checks);
 }
 
 static grn_obj *
@@ -476,6 +505,7 @@ normalize_halfwidth_katakana_with_voiced_sound_mark(
   int rest_length,
   GNUC_UNUSED uint32_t **normalize_table,
   char *normalized,
+  unsigned int *normalized_character_length,
   unsigned int *normalized_length_in_bytes,
   unsigned int *normalized_n_characters)
 {
@@ -543,6 +573,7 @@ normalize_halfwidth_katakana_with_voiced_sound_mark(
                                     normalized + *normalized_length_in_bytes);
         }
         *character_length += next_character_length;
+        *normalized_character_length = n_bytes;
         *normalized_length_in_bytes += n_bytes;
         (*normalized_n_characters)++;
         custom_normalized = GRN_TRUE;
@@ -556,6 +587,7 @@ normalize_halfwidth_katakana_with_voiced_sound_mark(
                                    HIRAGANA_HA_LINE_GAP),
                                   normalized + *normalized_length_in_bytes);
         *character_length += next_character_length;
+        *normalized_character_length = n_bytes;
         *normalized_length_in_bytes += n_bytes;
         (*normalized_n_characters)++;
         custom_normalized = GRN_TRUE;

  Modified: test/suite/general_ci/remove_blank.expected (+2 -2)
===================================================================
--- test/suite/general_ci/remove_blank.expected    2014-01-28 23:03:11 +0900 (8c9e949)
+++ test/suite/general_ci/remove_blank.expected    2014-01-29 00:11:08 +0900 (e22989e)
@@ -1,4 +1,4 @@
 register normalizers/mysql
 [[0,0.0,0.0],true]
-normalize NormalizerMySQLGeneralCI "a b c" REMOVE_BLANK
-[[0,0.0,0.0],{"normalized":"ABC","types":[],"checks":[]}]
+normalize NormalizerMySQLGeneralCI " a  b   c " REMOVE_BLANK|WITH_CHECKS
+[[0,0.0,0.0],{"normalized":"ABC","types":[],"checks":[2,3,4]}]

  Modified: test/suite/general_ci/remove_blank.test (+1 -1)
===================================================================
--- test/suite/general_ci/remove_blank.test    2014-01-28 23:03:11 +0900 (a070785)
+++ test/suite/general_ci/remove_blank.test    2014-01-29 00:11:08 +0900 (464759c)
@@ -1,3 +1,3 @@
 register normalizers/mysql
 
-normalize NormalizerMySQLGeneralCI "a b c" REMOVE_BLANK
+normalize NormalizerMySQLGeneralCI " a  b   c " REMOVE_BLANK|WITH_CHECKS

  Added: test/suite/general_ci/with_checks.expected (+28 -0) 100644
===================================================================
--- /dev/null
+++ test/suite/general_ci/with_checks.expected    2014-01-29 00:11:08 +0900 (b751588)
@@ -0,0 +1,28 @@
+register normalizers/mysql
+[[0,0.0,0.0],true]
+normalize NormalizerMySQLGeneralCI "ア㌕AZ" WITH_CHECKS
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  {
+    "normalized": "ア㌕AZ",
+    "types": [
+
+    ],
+    "checks": [
+      3,
+      0,
+      0,
+      3,
+      0,
+      0,
+      3,
+      0,
+      0,
+      1
+    ]
+  }
+]

  Added: test/suite/general_ci/with_checks.test (+3 -0) 100644
===================================================================
--- /dev/null
+++ test/suite/general_ci/with_checks.test    2014-01-29 00:11:08 +0900 (a61151a)
@@ -0,0 +1,3 @@
+register normalizers/mysql
+
+normalize NormalizerMySQLGeneralCI "ア㌕AZ" WITH_CHECKS

  Modified: test/suite/unicode_ci/remove_blank.expected (+2 -2)
===================================================================
--- test/suite/unicode_ci/remove_blank.expected    2014-01-28 23:03:11 +0900 (2b22a4b)
+++ test/suite/unicode_ci/remove_blank.expected    2014-01-29 00:11:08 +0900 (dc1df15)
@@ -1,4 +1,4 @@
 register normalizers/mysql
 [[0,0.0,0.0],true]
-normalize NormalizerMySQLUnicodeCI "a b c" REMOVE_BLANK
-[[0,0.0,0.0],{"normalized":"ABC","types":[],"checks":[]}]
+normalize NormalizerMySQLUnicodeCI " a  b   c" REMOVE_BLANK|WITH_CHECKS
+[[0,0.0,0.0],{"normalized":"ABC","types":[],"checks":[2,3,4]}]

  Modified: test/suite/unicode_ci/remove_blank.test (+1 -1)
===================================================================
--- test/suite/unicode_ci/remove_blank.test    2014-01-28 23:03:11 +0900 (ba58c02)
+++ test/suite/unicode_ci/remove_blank.test    2014-01-29 00:11:08 +0900 (0e02b32)
@@ -1,3 +1,3 @@
 register normalizers/mysql
 
-normalize NormalizerMySQLUnicodeCI "a b c" REMOVE_BLANK
+normalize NormalizerMySQLUnicodeCI " a  b   c" REMOVE_BLANK|WITH_CHECKS

  Added: test/suite/unicode_ci/with_checks.expected (+4 -0) 100644
===================================================================
--- /dev/null
+++ test/suite/unicode_ci/with_checks.expected    2014-01-29 00:11:08 +0900 (9ab7691)
@@ -0,0 +1,4 @@
+register normalizers/mysql
+[[0,0.0,0.0],true]
+normalize NormalizerMySQLUnicodeCI "ア㌕AZ" WITH_CHECKS
+[[0,0.0,0.0],{"normalized":"あ㌕AZ","types":[],"checks":[3,0,0,3,0,0,3,1]}]

  Added: test/suite/unicode_ci/with_checks.test (+3 -0) 100644
===================================================================
--- /dev/null
+++ test/suite/unicode_ci/with_checks.test    2014-01-29 00:11:08 +0900 (f581eec)
@@ -0,0 +1,3 @@
+register normalizers/mysql
+
+normalize NormalizerMySQLUnicodeCI "ア㌕AZ" WITH_CHECKS

  Added: test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/remobe_blank.expected (+4 -0) 100644
===================================================================
--- /dev/null
+++ test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/remobe_blank.expected    2014-01-29 00:11:08 +0900 (2decaf7)
@@ -0,0 +1,4 @@
+register normalizers/mysql
+[[0,0.0,0.0],true]
+normalize NormalizerMySQLUnicodeCIExceptKanaCIKanaWithVoicedSoundMark   " a  b   c" REMOVE_BLANK|WITH_CHECKS
+[[0,0.0,0.0],{"normalized":"ABC","types":[],"checks":[2,3,4]}]

  Added: test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/remobe_blank.test (+4 -0) 100644
===================================================================
--- /dev/null
+++ test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/remobe_blank.test    2014-01-29 00:11:08 +0900 (195326c)
@@ -0,0 +1,4 @@
+register normalizers/mysql
+
+normalize NormalizerMySQLUnicodeCIExceptKanaCIKanaWithVoicedSoundMark \
+  " a  b   c" REMOVE_BLANK|WITH_CHECKS

  Added: test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/with_checks.expected (+4 -0) 100644
===================================================================
--- /dev/null
+++ test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/with_checks.expected    2014-01-29 00:11:08 +0900 (9931731)
@@ -0,0 +1,4 @@
+register normalizers/mysql
+[[0,0.0,0.0],true]
+normalize NormalizerMySQLUnicodeCIExceptKanaCIKanaWithVoicedSoundMark   "ア㌕AZ" WITH_CHECKS
+[[0,0.0,0.0],{"normalized":"あ㌕AZ","types":[],"checks":[3,0,0,3,0,0,3,1]}]

  Added: test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/with_checks.test (+4 -0) 100644
===================================================================
--- /dev/null
+++ test/suite/unicode_ci_except_kana_ci_kana_with_voiced_sound_mark/with_checks.test    2014-01-29 00:11:08 +0900 (833a5c9)
@@ -0,0 +1,4 @@
+register normalizers/mysql
+
+normalize NormalizerMySQLUnicodeCIExceptKanaCIKanaWithVoicedSoundMark \
+  "ア㌕AZ" WITH_CHECKS
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Download 



More information about the Groonga-commit mailing list
Back to archive index