[Groonga-commit] groonga/grnxx [master] Add grnxx::Charset::get_char_size().

Back to archive index

susumu.yata null+****@clear*****
Mon Mar 25 10:02:16 JST 2013


susumu.yata	2013-03-25 10:02:16 +0900 (Mon, 25 Mar 2013)

  New Revision: c845415333a8fba0ba28c558e3de94153a62442c
  https://github.com/groonga/grnxx/commit/c845415333a8fba0ba28c558e3de94153a62442c

  Message:
    Add grnxx::Charset::get_char_size().

  Modified files:
    lib/grnxx/charset.hpp
    lib/grnxx/charset/euc-jp.cpp
    lib/grnxx/charset/euc-jp.hpp
    lib/grnxx/charset/shift_jis.cpp
    lib/grnxx/charset/shift_jis.hpp
    lib/grnxx/charset/utf-8.cpp
    lib/grnxx/charset/utf-8.hpp

  Modified: lib/grnxx/charset.hpp (+5 -2)
===================================================================
--- lib/grnxx/charset.hpp    2013-03-25 09:34:51 +0900 (cde85d6)
+++ lib/grnxx/charset.hpp    2013-03-25 10:02:16 +0900 (0016814)
@@ -46,9 +46,12 @@ class Charset {
   // Return the charset code.
   virtual CharsetCode code() const = 0;
 
-  // Return the first character of the string "slice". This function may return
-  // an empty slice if "slice" is empty or an invalid sequence.
+  // Return the first character of "slice". This function may return an empty
+  // slice if "slice" is empty or an invalid sequence.
   virtual Slice get_char(const Slice &slice) const = 0;
+  // Return the size of the first character of "slice". This function may
+  // return 0 if "slice" is empty or an invalid sequence.
+  virtual size_t get_char_size(const Slice &slice) const = 0;
 };
 
 }  // namespace grnxx

  Modified: lib/grnxx/charset/euc-jp.cpp (+22 -17)
===================================================================
--- lib/grnxx/charset/euc-jp.cpp    2013-03-25 09:34:51 +0900 (214c056)
+++ lib/grnxx/charset/euc-jp.cpp    2013-03-25 10:02:16 +0900 (93c7d1a)
@@ -30,45 +30,50 @@ CharsetCode EUC_JP::code() const {
 }
 
 Slice EUC_JP::get_char(const Slice &slice) const {
+  return slice.prefix(get_char_size(slice));
+}
+
+size_t EUC_JP::get_char_size(const Slice &slice) const {
   if (!slice) {
-    return slice;
+    return 0;
   }
   // Reference: http://ja.wikipedia.org/wiki/EUC-JP
   if (slice[0] & 0x80) {
-    // 3-byte characters start with 0x8F.
+    // A 3-byte character starts with 0x8F.
     if (slice[0] == 0x8F) {
-      // Return an empty slice if the character is incomplete.
+      // Return 0 if the character is incomplete.
       if (slice.size() < 3) {
-        return slice.prefix(0);
+        return 0;
       }
-      // Return an empty slice if the 2nd byte is invalid.
+      // Return 0 if the 2nd byte is invalid.
       // In fact, only bytes in [A1, A8], [B0, ED], and [F3, FE] are valid.
       if (static_cast<unsigned>(slice[1] - 0xA1) > (0xFE - 0xA1)) {
-        return slice.prefix(0);
+        return 0;
       }
-      // Return an empty slice if the 3rd byte is invalid.
+      // Return 0 if the 3rd byte is invalid.
       if (static_cast<unsigned>(slice[2] - 0xA1) > (0xFE - 0xA1)) {
-        return slice.prefix(0);
+        return 0;
       }
-      return slice.prefix(3);
+      return 3;
     } else {
-      // Return an empty slice if the 1st byte is invalid.
+      // Return 0 if the 1st byte is invalid.
       // In fact, only bytes in [A1, A8], [AD, AD], and [B0, FE] are valid.
       if (static_cast<unsigned>(slice[0] - 0xA1) > (0xFE - 0xA1)) {
-        return slice.prefix(0);
+        return 0;
       }
-      // Return an empty slice if the character is incomplete.
+      // Return 0 if the character is incomplete.
       if (slice.size() < 2) {
-        return slice.prefix(0);
+        return 0;
       }
-      // Return an empty slice if the 2nd byte is invalid.
+      // Return 0 if the 2nd byte is invalid.
       if (static_cast<unsigned>(slice[1] - 0xA1) > (0xFE - 0xA1)) {
-        return slice.prefix(0);
+        return 0;
       }
-      return slice.prefix(2);
+      return 2;
     }
   }
-  return slice.prefix(1);
+  // Return 1 for an ASCII character.
+  return 1;
 }
 
 }  // namespace charset

  Modified: lib/grnxx/charset/euc-jp.hpp (+1 -0)
===================================================================
--- lib/grnxx/charset/euc-jp.hpp    2013-03-25 09:34:51 +0900 (0aea6ad)
+++ lib/grnxx/charset/euc-jp.hpp    2013-03-25 10:02:16 +0900 (3b0dc53)
@@ -31,6 +31,7 @@ class EUC_JP : public Charset {
   CharsetCode code() const;
 
   Slice get_char(const Slice &slice) const;
+  size_t get_char_size(const Slice &slice) const;
 };
 
 }  // namespace charset

  Modified: lib/grnxx/charset/shift_jis.cpp (+12 -7)
===================================================================
--- lib/grnxx/charset/shift_jis.cpp    2013-03-25 09:34:51 +0900 (25b8d14)
+++ lib/grnxx/charset/shift_jis.cpp    2013-03-25 10:02:16 +0900 (5501434)
@@ -30,23 +30,28 @@ CharsetCode Shift_JIS::code() const {
 }
 
 Slice Shift_JIS::get_char(const Slice &slice) const {
+  return slice.prefix(get_char_size(slice));
+}
+
+size_t Shift_JIS::get_char_size(const Slice &slice) const {
   if (!slice) {
-    return slice;
+    return 0;
   }
   // The 1st byte of a multibyte character is in [81, 9F] or [E0, FC].
   // Reference: http://www.st.rim.or.jp/~phinloda/cqa/cqa15.html#Q4
   if (static_cast<unsigned>((slice[0] ^ 0x20) - 0xA1) < 0x3C) {
-    // Return an empty slice if the character is incomplete.
+    // Return 0 if the character is incomplete.
     if (slice.size() < 2) {
-      return slice.prefix(0);
+      return 0;
     }
-    // Return an empty slice if the 2nd byte is invalid.
+    // Return 0 if the 2nd byte is invalid.
     if (static_cast<unsigned>(slice[1] - 0x40) > (0xFC - 0x40)) {
-      return slice.prefix(0);
+      return 0;
     }
-    return slice.prefix(2);
+    return 2;
   }
-  return slice.prefix(1);
+  // Return 1 for an ASCII character.
+  return 1;
 }
 
 }  // namespace charset

  Modified: lib/grnxx/charset/shift_jis.hpp (+1 -0)
===================================================================
--- lib/grnxx/charset/shift_jis.hpp    2013-03-25 09:34:51 +0900 (5b4ecd4)
+++ lib/grnxx/charset/shift_jis.hpp    2013-03-25 10:02:16 +0900 (b7e18f3)
@@ -31,6 +31,7 @@ class Shift_JIS : public Charset {
   CharsetCode code() const;
 
   Slice get_char(const Slice &slice) const;
+  size_t get_char_size(const Slice &slice) const;
 };
 
 }  // namespace charset

  Modified: lib/grnxx/charset/utf-8.cpp (+18 -12)
===================================================================
--- lib/grnxx/charset/utf-8.cpp    2013-03-25 09:34:51 +0900 (dde7a1c)
+++ lib/grnxx/charset/utf-8.cpp    2013-03-25 10:02:16 +0900 (5ef4a83)
@@ -32,8 +32,12 @@ CharsetCode UTF_8::code() const {
 }
 
 Slice UTF_8::get_char(const Slice &slice) const {
+  return slice.prefix(get_char_size(slice));
+}
+
+size_t UTF_8::get_char_size(const Slice &slice) const {
   if (!slice) {
-    return slice;
+    return 0;
   }
   if (slice[0] & 0x80) {
     // A multibyte character can be 2, 3, or 4 bytes long. Also, the 2nd,
@@ -41,36 +45,38 @@ Slice UTF_8::get_char(const Slice &slice) const {
     // be 10.
     const size_t char_size =
         31 - bit_scan_reverse(~(static_cast<uint32_t>(slice[0]) << 24));
-    // Return an empty slice if the character is incomplete.
+    // Return 0 if the character is incomplete.
     if (char_size > slice.size()) {
-      return slice.prefix(0);
+      return 0;
     }
     switch (char_size) {
       case 4: {
-        // Return an empty slice if the 4th byte is invalid.
+        // Return 0 if the 4th byte is invalid.
         if ((slice[3] & 0xC0) != 0x80) {
-          return slice.prefix(0);
+          return 0;
         }
       }
       case 3: {
-        // Return an empty slice if the 3rd byte is invalid.
+        // Return 0 if the 3rd byte is invalid.
         if ((slice[2] & 0xC0) != 0x80) {
-          return slice.prefix(0);
+          return 0;
         }
       }
       case 2: {
-        // Return an empty slice if the 2nd byte is invalid.
+        // Return 0 if the 2nd byte is invalid.
         if ((slice[1] & 0xC0) != 0x80) {
-          return slice.prefix(0);
+          return 0;
         }
-        return slice.prefix(char_size);
+        return char_size;
       }
       default: {
-        return slice.prefix(0);
+        // Return 0 if the character size is invalid.
+        return 0;
       }
     }
   }
-  return slice.prefix(1);
+  // Return 1 for an ASCII character.
+  return 1;
 }
 
 }  // namespace charset

  Modified: lib/grnxx/charset/utf-8.hpp (+1 -0)
===================================================================
--- lib/grnxx/charset/utf-8.hpp    2013-03-25 09:34:51 +0900 (cb6829f)
+++ lib/grnxx/charset/utf-8.hpp    2013-03-25 10:02:16 +0900 (dced404)
@@ -31,6 +31,7 @@ class UTF_8 : public Charset {
   CharsetCode code() const;
 
   Slice get_char(const Slice &slice) const;
+  size_t get_char_size(const Slice &slice) const;
 };
 
 }  // namespace charset
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Download 



More information about the Groonga-commit mailing list
Back to archive index