[Groonga-commit] groonga/groonga at 764b66e [master] load: support surrogate pairs

Back to archive index

Susumu Yata null+****@clear*****
Thu Jan 11 10:02:43 JST 2018


Susumu Yata	2018-01-11 10:02:43 +0900 (Thu, 11 Jan 2018)

  New Revision: 764b66ecb36ebbca54ac2ef61d3c8b1f63d67dd9
  https://github.com/groonga/groonga/commit/764b66ecb36ebbca54ac2ef61d3c8b1f63d67dd9

  Merged 1decf0a: Merge pull request #808 from groonga/support-surrogate-pairs

  Message:
    load: support surrogate pairs

  Added files:
    test/command/suite/load/surrogate_pair/emoji.expected
    test/command/suite/load/surrogate_pair/emoji.test
    test/command/suite/load/surrogate_pair/normalize.expected
    test/command/suite/load/surrogate_pair/normalize.test
    test/command/suite/load/surrogate_pair/raw.expected
    test/command/suite/load/surrogate_pair/raw.test
  Modified files:
    lib/grn_ctx_impl.h
    lib/load.c

  Modified: lib/grn_ctx_impl.h (+1 -0)
===================================================================
--- lib/grn_ctx_impl.h    2018-01-11 09:34:34 +0900 (ebc691e7b)
+++ lib/grn_ctx_impl.h    2018-01-11 10:02:43 +0900 (ff2e79d92)
@@ -85,6 +85,7 @@ typedef struct {
   grn_obj *ifexists;
   grn_obj *each;
   uint32_t unichar;
+  uint32_t unichar_hi;
   uint32_t values_size;
   uint32_t nrecords;
   uint32_t n_record_errors;

  Modified: lib/load.c (+16 -2)
===================================================================
--- lib/load.c    2018-01-11 09:34:34 +0900 (4f90dc9c3)
+++ lib/load.c    2018-01-11 10:02:43 +0900 (69a85eaf5)
@@ -1003,13 +1003,27 @@ json_read(grn_ctx *ctx, grn_loader *loader, const char *str, unsigned int str_le
       }
       {
         uint32_t u = loader->unichar;
+        if (u >= 0xd800 && u <= 0xdbff) { /* High-surrogate code points */
+          loader->unichar_hi = u;
+          loader->stat = GRN_LOADER_STRING;
+          str++;
+          break;
+        }
+        if (u >= 0xdc00 && u <= 0xdfff) { /* Low-surrogate code points */
+          u = 0x10000 + (loader->unichar_hi - 0xd800) * 0x400 + u - 0xdc00;
+        }
         if (u < 0x80) {
           GRN_TEXT_PUTC(ctx, loader->last, u);
         } else {
           if (u < 0x800) {
-            GRN_TEXT_PUTC(ctx, loader->last, ((u >> 6) & 0x1f) | 0xc0);
+            GRN_TEXT_PUTC(ctx, loader->last, (u >> 6) | 0xc0);
           } else {
-            GRN_TEXT_PUTC(ctx, loader->last, (u >> 12) | 0xe0);
+            if (u < 0x10000) {
+              GRN_TEXT_PUTC(ctx, loader->last, (u >> 12) | 0xe0);
+            } else {
+              GRN_TEXT_PUTC(ctx, loader->last, (u >> 18) | 0xf0);
+              GRN_TEXT_PUTC(ctx, loader->last, ((u >> 12) & 0x3f) | 0x80);
+            }
             GRN_TEXT_PUTC(ctx, loader->last, ((u >> 6) & 0x3f) | 0x80);
           }
           GRN_TEXT_PUTC(ctx, loader->last, (u & 0x3f) | 0x80);

  Added: test/command/suite/load/surrogate_pair/emoji.expected (+38 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/load/surrogate_pair/emoji.expected    2018-01-11 10:02:43 +0900 (88b759e7c)
@@ -0,0 +1,38 @@
+table_create Characters TABLE_HASH_KEY ShortText
+[[0,0.0,0.0],true]
+column_create Characters unicode COLUMN_SCALAR ShortText
+[[0,0.0,0.0],true]
+load --table Characters
+[
+{"_key": "\uD83C\uDF7A", "unicode": "U+1F37A BEER MUG"}
+]
+[[0,0.0,0.0],1]
+select Characters --output_columns _key,unicode
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        1
+      ],
+      [
+        [
+          "_key",
+          "ShortText"
+        ],
+        [
+          "unicode",
+          "ShortText"
+        ]
+      ],
+      [
+        "🍺",
+        "U+1F37A BEER MUG"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/load/surrogate_pair/emoji.test (+9 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/load/surrogate_pair/emoji.test    2018-01-11 10:02:43 +0900 (9389618b3)
@@ -0,0 +1,9 @@
+table_create Characters TABLE_HASH_KEY ShortText
+column_create Characters unicode COLUMN_SCALAR ShortText
+
+load --table Characters
+[
+{"_key": "\uD83C\uDF7A", "unicode": "U+1F37A BEER MUG"}
+]
+
+select Characters --output_columns _key,unicode

  Added: test/command/suite/load/surrogate_pair/normalize.expected (+38 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/load/surrogate_pair/normalize.expected    2018-01-11 10:02:43 +0900 (6c5601d43)
@@ -0,0 +1,38 @@
+table_create Characters TABLE_HASH_KEY|KEY_NORMALIZE ShortText
+[[0,0.0,0.0],true]
+column_create Characters unicode COLUMN_SCALAR ShortText
+[[0,0.0,0.0],true]
+load --table Characters
+[
+{"_key": "\uD835\uDC00", "unicode": "U+1D400 MATHEMATICAL BOLD CAPITAL A"}
+]
+[[0,0.0,0.0],1]
+select Characters --filter '_key == "A"' --output_columns _key,unicode
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        1
+      ],
+      [
+        [
+          "_key",
+          "ShortText"
+        ],
+        [
+          "unicode",
+          "ShortText"
+        ]
+      ],
+      [
+        "a",
+        "U+1D400 MATHEMATICAL BOLD CAPITAL A"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/load/surrogate_pair/normalize.test (+9 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/load/surrogate_pair/normalize.test    2018-01-11 10:02:43 +0900 (83291cb47)
@@ -0,0 +1,9 @@
+table_create Characters TABLE_HASH_KEY|KEY_NORMALIZE ShortText
+column_create Characters unicode COLUMN_SCALAR ShortText
+
+load --table Characters
+[
+{"_key": "\uD835\uDC00", "unicode": "U+1D400 MATHEMATICAL BOLD CAPITAL A"}
+]
+
+select Characters --filter '_key == "A"' --output_columns _key,unicode

  Added: test/command/suite/load/surrogate_pair/raw.expected (+38 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/load/surrogate_pair/raw.expected    2018-01-11 10:02:43 +0900 (c107e2423)
@@ -0,0 +1,38 @@
+table_create Characters TABLE_HASH_KEY ShortText
+[[0,0.0,0.0],true]
+column_create Characters unicode COLUMN_SCALAR ShortText
+[[0,0.0,0.0],true]
+load --table Characters
+[
+{"_key": "\uD835\uDC00", "unicode": "U+1D400 MATHEMATICAL BOLD CAPITAL A"}
+]
+[[0,0.0,0.0],1]
+select Characters --output_columns _key,unicode
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        1
+      ],
+      [
+        [
+          "_key",
+          "ShortText"
+        ],
+        [
+          "unicode",
+          "ShortText"
+        ]
+      ],
+      [
+        "𝐀",
+        "U+1D400 MATHEMATICAL BOLD CAPITAL A"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/load/surrogate_pair/raw.test (+9 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/load/surrogate_pair/raw.test    2018-01-11 10:02:43 +0900 (06b754055)
@@ -0,0 +1,9 @@
+table_create Characters TABLE_HASH_KEY ShortText
+column_create Characters unicode COLUMN_SCALAR ShortText
+
+load --table Characters
+[
+{"_key": "\uD835\uDC00", "unicode": "U+1D400 MATHEMATICAL BOLD CAPITAL A"}
+]
+
+select Characters --output_columns _key,unicode
-------------- next part --------------
HTMLの添付ファイルを保管しました...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180111/0dc6c4d7/attachment-0003.htm 



More information about the Groonga-commit mailing list
Back to archive index