susumu.yata
null+****@clear*****
Mon Mar 25 10:02:16 JST 2013
susumu.yata 2013-03-25 10:02:16 +0900 (Mon, 25 Mar 2013) New Revision: c845415333a8fba0ba28c558e3de94153a62442c https://github.com/groonga/grnxx/commit/c845415333a8fba0ba28c558e3de94153a62442c Message: Add grnxx::Charset::get_char_size(). Modified files: lib/grnxx/charset.hpp lib/grnxx/charset/euc-jp.cpp lib/grnxx/charset/euc-jp.hpp lib/grnxx/charset/shift_jis.cpp lib/grnxx/charset/shift_jis.hpp lib/grnxx/charset/utf-8.cpp lib/grnxx/charset/utf-8.hpp Modified: lib/grnxx/charset.hpp (+5 -2) =================================================================== --- lib/grnxx/charset.hpp 2013-03-25 09:34:51 +0900 (cde85d6) +++ lib/grnxx/charset.hpp 2013-03-25 10:02:16 +0900 (0016814) @@ -46,9 +46,12 @@ class Charset { // Return the charset code. virtual CharsetCode code() const = 0; - // Return the first character of the string "slice". This function may return - // an empty slice if "slice" is empty or an invalid sequence. + // Return the first character of "slice". This function may return an empty + // slice if "slice" is empty or an invalid sequence. virtual Slice get_char(const Slice &slice) const = 0; + // Return the size of the first character of "slice". This function may + // return 0 if "slice" is empty or an invalid sequence. + virtual size_t get_char_size(const Slice &slice) const = 0; }; } // namespace grnxx Modified: lib/grnxx/charset/euc-jp.cpp (+22 -17) =================================================================== --- lib/grnxx/charset/euc-jp.cpp 2013-03-25 09:34:51 +0900 (214c056) +++ lib/grnxx/charset/euc-jp.cpp 2013-03-25 10:02:16 +0900 (93c7d1a) @@ -30,45 +30,50 @@ CharsetCode EUC_JP::code() const { } Slice EUC_JP::get_char(const Slice &slice) const { + return slice.prefix(get_char_size(slice)); +} + +size_t EUC_JP::get_char_size(const Slice &slice) const { if (!slice) { - return slice; + return 0; } // Reference: http://ja.wikipedia.org/wiki/EUC-JP if (slice[0] & 0x80) { - // 3-byte characters start with 0x8F. 
+ // A 3-byte character starts with 0x8F. if (slice[0] == 0x8F) { - // Return an empty slice if the character is incomplete. + // Return 0 if the character is incomplete. if (slice.size() < 3) { - return slice.prefix(0); + return 0; } - // Return an empty slice if the 2nd byte is invalid. + // Return 0 if the 2nd byte is invalid. // In fact, only bytes in [A1, A8], [B0, ED], and [F3, FE] are valid. if (static_cast<unsigned>(slice[1] - 0xA1) > (0xFE - 0xA1)) { - return slice.prefix(0); + return 0; } - // Return an empty slice if the 3rd byte is invalid. + // Return 0 if the 3rd byte is invalid. if (static_cast<unsigned>(slice[2] - 0xA1) > (0xFE - 0xA1)) { - return slice.prefix(0); + return 0; } - return slice.prefix(3); + return 3; } else { - // Return an empty slice if the 1st byte is invalid. + // Return 0 if the 1st byte is invalid. // In fact, only bytes in [A1, A8], [AD, AD], and [B0, FE] are valid. if (static_cast<unsigned>(slice[0] - 0xA1) > (0xFE - 0xA1)) { - return slice.prefix(0); + return 0; } - // Return an empty slice if the character is incomplete. + // Return 0 if the character is incomplete. if (slice.size() < 2) { - return slice.prefix(0); + return 0; } - // Return an empty slice if the 2nd byte is invalid. + // Return 0 if the 2nd byte is invalid. if (static_cast<unsigned>(slice[1] - 0xA1) > (0xFE - 0xA1)) { - return slice.prefix(0); + return 0; } - return slice.prefix(2); + return 2; } } - return slice.prefix(1); + // Return 1 for an ASCII character. 
+ return 1; } } // namespace charset Modified: lib/grnxx/charset/euc-jp.hpp (+1 -0) =================================================================== --- lib/grnxx/charset/euc-jp.hpp 2013-03-25 09:34:51 +0900 (0aea6ad) +++ lib/grnxx/charset/euc-jp.hpp 2013-03-25 10:02:16 +0900 (3b0dc53) @@ -31,6 +31,7 @@ class EUC_JP : public Charset { CharsetCode code() const; Slice get_char(const Slice &slice) const; + size_t get_char_size(const Slice &slice) const; }; } // namespace charset Modified: lib/grnxx/charset/shift_jis.cpp (+12 -7) =================================================================== --- lib/grnxx/charset/shift_jis.cpp 2013-03-25 09:34:51 +0900 (25b8d14) +++ lib/grnxx/charset/shift_jis.cpp 2013-03-25 10:02:16 +0900 (5501434) @@ -30,23 +30,28 @@ CharsetCode Shift_JIS::code() const { } Slice Shift_JIS::get_char(const Slice &slice) const { + return slice.prefix(get_char_size(slice)); +} + +size_t Shift_JIS::get_char_size(const Slice &slice) const { if (!slice) { - return slice; + return 0; } // The 1st byte of a multibyte character is in [81, 9F] or [E0, FC]. // Reference: http://www.st.rim.or.jp/~phinloda/cqa/cqa15.html#Q4 if (static_cast<unsigned>((slice[0] ^ 0x20) - 0xA1) < 0x3C) { - // Return an empty slice if the character is incomplete. + // Return 0 if the character is incomplete. if (slice.size() < 2) { - return slice.prefix(0); + return 0; } - // Return an empty slice if the 2nd byte is invalid. + // Return 0 if the 2nd byte is invalid. if (static_cast<unsigned>(slice[1] - 0x40) > (0xFC - 0x40)) { - return slice.prefix(0); + return 0; } - return slice.prefix(2); + return 2; } - return slice.prefix(1); + // Return 1 for an ASCII character. 
+ return 1; } } // namespace charset Modified: lib/grnxx/charset/shift_jis.hpp (+1 -0) =================================================================== --- lib/grnxx/charset/shift_jis.hpp 2013-03-25 09:34:51 +0900 (5b4ecd4) +++ lib/grnxx/charset/shift_jis.hpp 2013-03-25 10:02:16 +0900 (b7e18f3) @@ -31,6 +31,7 @@ class Shift_JIS : public Charset { CharsetCode code() const; Slice get_char(const Slice &slice) const; + size_t get_char_size(const Slice &slice) const; }; } // namespace charset Modified: lib/grnxx/charset/utf-8.cpp (+18 -12) =================================================================== --- lib/grnxx/charset/utf-8.cpp 2013-03-25 09:34:51 +0900 (dde7a1c) +++ lib/grnxx/charset/utf-8.cpp 2013-03-25 10:02:16 +0900 (5ef4a83) @@ -32,8 +32,12 @@ CharsetCode UTF_8::code() const { } Slice UTF_8::get_char(const Slice &slice) const { + return slice.prefix(get_char_size(slice)); +} + +size_t UTF_8::get_char_size(const Slice &slice) const { if (!slice) { - return slice; + return 0; } if (slice[0] & 0x80) { // A multibyte character can be 2, 3, or 4 bytes long. Also, the 2nd, @@ -41,36 +45,38 @@ Slice UTF_8::get_char(const Slice &slice) const { // be 10. const size_t char_size = 31 - bit_scan_reverse(~(static_cast<uint32_t>(slice[0]) << 24)); - // Return an empty slice if the character is incomplete. + // Return 0 if the character is incomplete. if (char_size > slice.size()) { - return slice.prefix(0); + return 0; } switch (char_size) { case 4: { - // Return an empty slice if the 4th byte is invalid. + // Return 0 if the 4th byte is invalid. if ((slice[3] & 0xC0) != 0x80) { - return slice.prefix(0); + return 0; } } case 3: { - // Return an empty slice if the 3rd byte is invalid. + // Return 0 if the 3rd byte is invalid. if ((slice[2] & 0xC0) != 0x80) { - return slice.prefix(0); + return 0; } } case 2: { - // Return an empty slice if the 2nd byte is invalid. + // Return 0 if the 2nd byte is invalid. 
if ((slice[1] & 0xC0) != 0x80) { - return slice.prefix(0); + return 0; } - return slice.prefix(char_size); + return char_size; } default: { - return slice.prefix(0); + // Return 0 if the character size is invalid. + return 0; } } } - return slice.prefix(1); + // Return 1 for an ASCII character. + return 1; } } // namespace charset Modified: lib/grnxx/charset/utf-8.hpp (+1 -0) =================================================================== --- lib/grnxx/charset/utf-8.hpp 2013-03-25 09:34:51 +0900 (cb6829f) +++ lib/grnxx/charset/utf-8.hpp 2013-03-25 10:02:16 +0900 (dced404) @@ -31,6 +31,7 @@ class UTF_8 : public Charset { CharsetCode code() const; Slice get_char(const Slice &slice) const; + size_t get_char_size(const Slice &slice) const; }; } // namespace charset -------------- next part -------------- HTMLの添付ファイルを保管しました...Download