[Groonga-commit] groonga/groonga at e325b75 [master] NormalizerNFKC100: add report_source_offset option

Back to archive index

Kouhei Sutou null+****@clear*****
Mon May 28 11:41:05 JST 2018


Kouhei Sutou	2018-05-28 11:41:05 +0900 (Mon, 28 May 2018)

  New Revision: e325b7571faaec9d5608774e8d9957a79dd9b706
  https://github.com/groonga/groonga/commit/e325b7571faaec9d5608774e8d9957a79dd9b706

  Message:
    NormalizerNFKC100: add report_source_offset option
    
    normalize command also supports "offsets" output.

  Added files:
    test/command/suite/normalizers/nfkc100/report_source_offset.expected
    test/command/suite/normalizers/nfkc100/report_source_offset.test
  Modified files:
    lib/normalizer.c
    lib/proc/proc_normalize.c

  Modified: lib/normalizer.c (+44 -0)
===================================================================
--- lib/normalizer.c    2018-05-28 11:40:43 +0900 (0be649903)
+++ lib/normalizer.c    2018-05-28 11:41:05 +0900 (fffbc90ad)
@@ -621,6 +621,7 @@ typedef struct {
   grn_nfkc_decompose_func decompose_func;
   grn_nfkc_compose_func compose_func;
   grn_bool include_removed_source_location;
+  grn_bool report_source_offset;
   grn_bool unify_kana;
   grn_bool unify_kana_case;
   grn_bool unify_kana_voiced_sound_mark;
@@ -642,6 +643,7 @@ utf8_normalize_options_init(grn_utf8_normalize_options *options,
   options->decompose_func = decompose_func;
   options->compose_func = compose_func;
   options->include_removed_source_location = GRN_TRUE;
+  options->report_source_offset = GRN_FALSE;
   options->unify_kana = GRN_FALSE;
   options->unify_kana_case = GRN_FALSE;
   options->unify_kana_voiced_sound_mark = GRN_FALSE;
@@ -1088,6 +1090,7 @@ utf8_normalize(grn_ctx *ctx,
   const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e;
   unsigned char *d, *d_, *de;
   uint_least8_t *cp;
+  uint64_t *offsets;
   size_t length = 0, ls, lp, size = nstr->original_length_in_bytes, ds = size * 3;
   int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
   grn_bool remove_tokenized_delimiter_p =
@@ -1117,6 +1120,17 @@ utf8_normalize(grn_ctx *ctx,
     }
   }
   cp = nstr->ctypes;
+  if (options->report_source_offset) {
+    if (!(nstr->offsets = GRN_MALLOC(ds + 1))) {
+      if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
+      if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
+      GRN_FREE(nstr->normalized); nstr->normalized = NULL;
+      ERR(GRN_NO_MEMORY_AVAILABLE,
+          "[string][utf8] failed to allocate offsets space");
+      return NULL;
+    }
+  }
+  offsets = nstr->offsets;
   d = (unsigned char *)nstr->normalized;
   de = d + ds;
   d_ = NULL;
@@ -1146,6 +1160,9 @@ utf8_normalize(grn_ctx *ctx,
           s_ = s__;
         }
       }
+      if (offsets) {
+        offsets--;
+      }
       d = d_;
       length--;
     }
@@ -1169,6 +1186,7 @@ utf8_normalize(grn_ctx *ctx,
           if (!(normalized = GRN_REALLOC(nstr->normalized, ds + 1))) {
             if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
             if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
+            if (nstr->offsets) { GRN_FREE(nstr->offsets); nstr->offsets = NULL; }
             GRN_FREE(nstr->normalized); nstr->normalized = NULL;
             ERR(GRN_NO_MEMORY_AVAILABLE,
                 "[string][utf8] failed to expand normalized text space");
@@ -1181,6 +1199,7 @@ utf8_normalize(grn_ctx *ctx,
             int16_t *checks;
             if (!(checks = GRN_REALLOC(nstr->checks, ds * sizeof(int16_t) + 1))) {
               if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
+              if (nstr->offsets) { GRN_FREE(nstr->offsets); nstr->offsets = NULL; }
               GRN_FREE(nstr->checks); nstr->checks = NULL;
               GRN_FREE(nstr->normalized); nstr->normalized = NULL;
               ERR(GRN_NO_MEMORY_AVAILABLE,
@@ -1195,6 +1214,7 @@ utf8_normalize(grn_ctx *ctx,
             if (!(ctypes = GRN_REALLOC(nstr->ctypes, ds + 1))) {
               GRN_FREE(nstr->ctypes); nstr->ctypes = NULL;
               if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
+              if (nstr->offsets) { GRN_FREE(nstr->offsets); nstr->offsets = NULL; }
               GRN_FREE(nstr->normalized); nstr->normalized = NULL;
               ERR(GRN_NO_MEMORY_AVAILABLE,
                   "[string][utf8] failed to expand character types space");
@@ -1203,6 +1223,20 @@ utf8_normalize(grn_ctx *ctx,
             cp = ctypes + (cp - nstr->ctypes);
             nstr->ctypes = ctypes;
           }
+          if (offsets) {
+            uint64_t *new_offsets;
+            if (!(new_offsets = GRN_REALLOC(nstr->offsets, ds + 1))) {
+              GRN_FREE(nstr->offsets); nstr->offsets = NULL;
+              if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
+              if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
+              GRN_FREE(nstr->normalized); nstr->normalized = NULL;
+              ERR(GRN_NO_MEMORY_AVAILABLE,
+                  "[string][utf8] failed to expand offsets space");
+              return NULL;
+            }
+            offsets = new_offsets + (offsets - nstr->offsets);
+            nstr->offsets = new_offsets;
+          }
         }
 
         {
@@ -1326,12 +1360,16 @@ utf8_normalize(grn_ctx *ctx,
             }
             for (i = lp; i > 1; i--) { *ch++ = 0; }
           }
+          if (offsets) {
+            *offsets++ = (uint64_t)(s - (const unsigned char *)nstr->original);
+          }
         }
         lp = lp_original;
       }
     }
   }
   if (cp) { *cp = GRN_CHAR_NULL; }
+  if (offsets) { *offsets = nstr->original_length_in_bytes; }
   if (options->unify_katakana_v_sounds) {
     utf8_normalize_unify_katakana_v_sounds(NULL, 0, d_, d);
   }
@@ -1792,6 +1830,12 @@ nfkc100_open_options(grn_ctx *ctx,
                                     raw_options,
                                     i,
                                     options->include_removed_source_location);
+    } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "report_source_offset")) {
+      options->report_source_offset =
+        grn_vector_get_element_bool(ctx,
+                                    raw_options,
+                                    i,
+                                    options->report_source_offset);
     } else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "unify_kana")) {
       options->unify_kana = grn_vector_get_element_bool(ctx,
                                                         raw_options,

  Modified: lib/proc/proc_normalize.c (+19 -1)
===================================================================
--- lib/proc/proc_normalize.c    2018-05-28 11:40:43 +0900 (63f061e49)
+++ lib/proc/proc_normalize.c    2018-05-28 11:41:05 +0900 (9f764ba88)
@@ -93,10 +93,12 @@ command_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_d
 
   {
     int flags;
+    int n_elements = 3;
     grn_obj *lexicon;
     grn_obj *grn_string;
     unsigned int normalized_length_in_bytes;
     unsigned int normalized_n_characters;
+    const uint64_t *offsets;
 
     flags = parse_normalize_flags(ctx, &flags_raw);
     if (ctx->rc != GRN_SUCCESS) {
@@ -118,7 +120,12 @@ command_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_d
                                  lexicon,
                                  flags);
 
-    grn_ctx_output_map_open(ctx, "RESULT", 3);
+    offsets = grn_string_get_offsets(ctx, grn_string);
+    if (offsets) {
+      n_elements++;
+    }
+
+    grn_ctx_output_map_open(ctx, "RESULT", n_elements);
     {
       const char *normalized;
 
@@ -164,6 +171,17 @@ command_normalize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_d
         grn_ctx_output_array_close(ctx);
       }
     }
+    if (offsets) {
+      unsigned int i;
+
+      grn_ctx_output_cstr(ctx, "offsets");
+      grn_ctx_output_array_open(ctx, "offsets", normalized_n_characters);
+      for (i = 0; i < normalized_n_characters; i++) {
+        grn_ctx_output_uint64(ctx, offsets[i]);
+      }
+      grn_ctx_output_array_close(ctx);
+    }
+
     grn_ctx_output_map_close(ctx);
 
     grn_obj_unlink(ctx, grn_string);

  Added: test/command/suite/normalizers/nfkc100/report_source_offset.expected (+26 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/normalizers/nfkc100/report_source_offset.expected    2018-05-28 11:41:05 +0900 (0ec059aab)
@@ -0,0 +1,26 @@
+normalize   'NormalizerNFKC100("report_source_offset", true)'   "( あ  いうえお )"   REMOVE_BLANK
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  {
+    "normalized": "(あいうえお)",
+    "types": [
+
+    ],
+    "checks": [
+
+    ],
+    "offsets": [
+      0,
+      2,
+      7,
+      10,
+      13,
+      16,
+      20
+    ]
+  }
+]

  Added: test/command/suite/normalizers/nfkc100/report_source_offset.test (+4 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/normalizers/nfkc100/report_source_offset.test    2018-05-28 11:41:05 +0900 (d4938b98e)
@@ -0,0 +1,4 @@
+normalize \
+  'NormalizerNFKC100("report_source_offset", true)' \
+  "( あ  いうえお )" \
+  REMOVE_BLANK
-------------- next part --------------
HTMLの添付ファイルを保管しました...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180528/b1d764b4/attachment-0001.htm 



More information about the Groonga-commit mailing list
Back to archive index