[Groonga-commit] groonga/groonga [master] Add GRN_STRING_REMOVE_TOKENIZER_DELIMITER flag

Back to archive index

Kouhei Sutou null+****@clear*****
Fri Nov 9 13:38:10 JST 2012


Kouhei Sutou	2012-11-09 13:38:10 +0900 (Fri, 09 Nov 2012)

  New Revision: d217a84b1540ab826c232bd81aa99e5b0b32d4ce
  https://github.com/groonga/groonga/commit/d217a84b1540ab826c232bd81aa99e5b0b32d4ce

  Log:
    Add GRN_STRING_REMOVE_TOKENIZER_DELIMITER flag
    
    It is a flag for removing the tokenizer delimiter (U+FFFE) from a string
    during normalization.

  Modified files:
    include/groonga.h
    lib/string.c
    test/unit/util/test-string.c

  Modified: include/groonga.h (+4 -3)
===================================================================
--- include/groonga.h    2012-11-09 13:07:31 +0900 (d83e1db)
+++ include/groonga.h    2012-11-09 13:38:10 +0900 (74704b8)
@@ -2547,9 +2547,10 @@ GRN_API grn_rc grn_str_close(grn_ctx *ctx, grn_str *nstr);
 
 /* grn_string */
 
-#define GRN_STRING_REMOVE_BLANK (0x01<<0)
-#define GRN_STRING_WITH_TYPES   (0x01<<1)
-#define GRN_STRING_WITH_CHECKS  (0x01<<2)
+#define GRN_STRING_REMOVE_BLANK               (0x01<<0)
+#define GRN_STRING_WITH_TYPES                 (0x01<<1)
+#define GRN_STRING_WITH_CHECKS                (0x01<<2)
+#define GRN_STRING_REMOVE_TOKENIZER_DELIMITER (0x01<<3)
 
 #define GRN_NORMALIZER_AUTO ((grn_obj *)1)
 

  Modified: lib/string.c (+32 -3)
===================================================================
--- lib/string.c    2012-11-09 13:07:31 +0900 (0f203bb)
+++ lib/string.c    2012-11-09 13:38:10 +0900 (0f90636)
@@ -21,6 +21,8 @@
 #include "string_in.h"
 #include "str.h"
 
+#include <groonga/tokenizer.h>
+
 static unsigned char symbol[] = {
   ',', '.', 0, ':', ';', '?', '!', 0, 0, 0, '`', 0, '^', '~', '_', 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, '-', '-', '/', '\\', 0, 0, '|', 0, 0, 0, '\'', 0,
@@ -557,6 +559,8 @@ utf8_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data
   grn_string *nstr = (grn_string *)args[0];
   size_t length = 0, ls, lp, size = nstr->original_length_in_bytes, ds = size * 3;
   int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
+  grn_bool remove_tokenizer_delimiter_p =
+    nstr->flags & GRN_STRING_REMOVE_TOKENIZER_DELIMITER;
   if (!(nstr->normalized = GRN_MALLOC(ds + 1))) {
     ERR(GRN_NO_MEMORY_AVAILABLE,
         "[strinig][utf8] failed to allocate normalized text space");
@@ -590,6 +594,10 @@ utf8_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data
     if (!(ls = grn_str_charlen_utf8(ctx, s, e))) {
       break;
     }
+    if (remove_tokenizer_delimiter_p &&
+        grn_tokenizer_is_delimiter(ctx, s, ls, GRN_ENC_UTF8)) {
+      continue;
+    }
     if ((p = (unsigned char *)grn_nfkc_map1(s))) {
       pe = p + strlen((char *)p);
     } else {
@@ -1068,9 +1076,30 @@ grn_fake_string_open(grn_ctx *ctx, grn_string *string)
     return NULL;
   }
 
-  memcpy(nstr->normalized, str, str_len);
-  nstr->normalized[str_len] = '\0';
-  nstr->normalized_length_in_bytes = str_len;
+  if (nstr->flags & GRN_STRING_REMOVE_TOKENIZER_DELIMITER &&
+      ctx->encoding == GRN_ENC_UTF8) {
+    int char_length;
+    const char *source_current = str;
+    const char *source_end = str + str_len;
+    char *destination = nstr->normalized;
+    unsigned int destination_length = 0;
+    while ((char_length = grn_charlen(ctx, source_current, source_end)) > 0) {
+      if (!grn_tokenizer_is_delimiter(ctx,
+                                      source_current, char_length,
+                                      ctx->encoding)) {
+        memcpy(destination, source_current, char_length);
+        destination += char_length;
+        destination_length += char_length;
+      }
+      source_current += char_length;
+    }
+    destination[destination_length] = '\0';
+    nstr->normalized_length_in_bytes = destination_length;
+  } else {
+    memcpy(nstr->normalized, str, str_len);
+    nstr->normalized[str_len] = '\0';
+    nstr->normalized_length_in_bytes = str_len;
+  }
 
   if (nstr->flags & GRN_STRING_WITH_CHECKS) {
     int16_t f = 0;

  Modified: test/unit/util/test-string.c (+65 -0)
===================================================================
--- test/unit/util/test-string.c    2012-11-09 13:07:31 +0900 (2c87da4)
+++ test/unit/util/test-string.c    2012-11-09 13:38:10 +0900 (81d0a27)
@@ -31,6 +31,8 @@ void data_normalize(void);
 void test_normalize(gconstpointer data);
 void data_normalize_broken(void);
 void test_normalize_broken(gconstpointer data);
+void data_remove_tokenizer_delimiter(void);
+void test_remove_tokenizer_delimiter(gconstpointer data);
 void data_charlen_broken(void);
 void test_charlen_broken(gconstpointer data);
 void data_urlenc(void);
@@ -261,6 +263,69 @@ test_normalize_broken(gconstpointer data)
 }
 
 void
+data_remove_tokenizer_delimiter(void)
+{
+#define ADD_DATUM(label, expected, input, flags)                        \
+  gcut_add_datum(label,                                                 \
+                 "expected", G_TYPE_STRING, expected,                   \
+                 "input", G_TYPE_STRING, input,                         \
+                 "flags", G_TYPE_INT, flags,                            \
+                 NULL)
+
+#define UFFFE_IN_UTF8 "\xef\xbf\xbe"
+
+  ADD_DATUM("normalize",
+            "abあい",
+            UFFFE_IN_UTF8 "A"
+            UFFFE_IN_UTF8 "B"
+            UFFFE_IN_UTF8 "あ"
+            UFFFE_IN_UTF8 "い"
+            UFFFE_IN_UTF8,
+            GRN_OBJ_KEY_NORMALIZE);
+  ADD_DATUM("not normalize",
+            "ABあい",
+            UFFFE_IN_UTF8 "A"
+            UFFFE_IN_UTF8 "B"
+            UFFFE_IN_UTF8 "あ"
+            UFFFE_IN_UTF8 "い"
+            UFFFE_IN_UTF8,
+            0);
+
+#undef UFFFE_IN_UTF8
+
+#undef ADD_DATUM
+}
+
+void
+test_remove_tokenizer_delimiter(gconstpointer data)
+{
+  grn_obj *string;
+  grn_obj *normalizer = NULL;
+  const gchar *expected;
+  const gchar *input;
+  const gchar *normalized;
+  unsigned int length_in_bytes;
+  int flags = GRN_STRING_REMOVE_TOKENIZER_DELIMITER;
+
+  GRN_CTX_SET_ENCODING(&context, GRN_ENC_UTF8);
+
+  input = gcut_data_get_string(data, "input");
+  flags |= gcut_data_get_int(data, "flags");
+  if (flags & GRN_OBJ_KEY_NORMALIZE) {
+    normalizer = GRN_NORMALIZER_AUTO;
+  }
+
+  string = grn_string_open(&context, input, strlen(input), normalizer, flags);
+  grn_string_get_normalized(&context, string,
+                            &normalized, &length_in_bytes, NULL);
+  normalized = cut_take_strndup(normalized, length_in_bytes);
+  grn_obj_unlink(&context, string);
+
+  expected = gcut_data_get_string(data, "expected");
+  cut_assert_equal_string(expected, normalized);
+}
+
+void
 data_charlen_broken(void)
 {
 #define ADD_DATUM_WITH_ENCODING(label, input, input_length, encoding)   \
-------------- next part --------------
An HTML attachment was scrubbed...
Download 



More information about the Groonga-commit mailing list
Back to archive index