[Groonga-commit] groonga/groonga-normalizer-mysql at c2bc7b3 [master] Extract character normalization function

Back to archive index

Kouhei Sutou null+****@clear*****
Mon May 20 21:44:46 JST 2013


Kouhei Sutou	2013-05-20 21:44:46 +0900 (Mon, 20 May 2013)

  New Revision: c2bc7b3fdba91088199e010cc0a901376f98c389
  https://github.com/groonga/groonga-normalizer-mysql/commit/c2bc7b3fdba91088199e010cc0a901376f98c389

  Message:
    Extract character normalization function

  Modified files:
    normalizers/mysql.c

  Modified: normalizers/mysql.c (+32 -20)
===================================================================
--- normalizers/mysql.c    2013-05-20 17:35:24 +0900 (1702d47)
+++ normalizers/mysql.c    2013-05-20 21:44:46 +0900 (60e65b7)
@@ -109,6 +109,34 @@ decompose_character(const char *rest, int character_length,
   }
 }
 
+static inline void
+normalize_character(const char *utf8, int character_length,
+                    uint32_t **normalize_table,
+                    char *normalized,
+                    unsigned int *normalized_length_in_bytes,
+                    unsigned int *normalized_n_characters)
+{
+  int page;
+  uint32_t low_code;
+  decompose_character(utf8, character_length, &page, &low_code);
+  if ((0x00 <= page && page <= 0xff) && normalize_table[page]) {
+    uint32_t normalized_code;
+    unsigned int n_bytes;
+    normalized_code = normalize_table[page][low_code];
+    if (normalized_code != 0) {
+      n_bytes = unichar_to_utf8(normalized_code,
+                                normalized + *normalized_length_in_bytes);
+      *normalized_length_in_bytes += n_bytes;
+    }
+  } else {
+    int i;
+    for (i = 0; i < character_length; i++) {
+      normalized[*normalized_length_in_bytes + i] = utf8[i];
+    }
+    *normalized_length_in_bytes += character_length;
+  }
+  (*normalized_n_characters)++;
+}
 
 static void
 normalize(grn_ctx *ctx, grn_obj *string, uint32_t **normalize_table)
@@ -152,25 +180,10 @@ normalize(grn_ctx *ctx, grn_obj *string, uint32_t **normalize_table)
         current_type[-1] |= GRN_CHAR_BLANK;
       }
     } else {
-      int page;
-      uint32_t low_code;
-      decompose_character(rest, character_length, &page, &low_code);
-      if ((0x00 <= page && page <= 0xff) && normalize_table[page]) {
-        uint32_t normalized_code;
-        unsigned int n_bytes;
-        normalized_code = normalize_table[page][low_code];
-        if (normalized_code != 0) {
-          n_bytes = unichar_to_utf8(normalized_code,
-                                    normalized + normalized_length_in_bytes);
-          normalized_length_in_bytes += n_bytes;
-        }
-      } else {
-        int i;
-        for (i = 0; i < character_length; i++) {
-          normalized[normalized_length_in_bytes + i] = rest[i];
-        }
-        normalized_length_in_bytes += character_length;
-      }
+      normalize_character(rest, character_length, normalize_table,
+                          normalized,
+                          &normalized_length_in_bytes,
+                          &normalized_n_characters);
       if (current_type) {
         char *current_normalized;
         current_normalized =
@@ -179,7 +192,6 @@ normalize(grn_ctx *ctx, grn_obj *string, uint32_t **normalize_table)
           grn_nfkc_char_type((unsigned char *)current_normalized);
         current_type++;
       }
-      normalized_n_characters++;
     }
 
     rest += character_length;
-------------- next part --------------
HTML����������������������������...
Download 



More information about the Groonga-commit mailing list
Back to archive index