Revision: 10766 https://osdn.net/projects/ttssh2/scm/svn/commits/10766 Author: zmatsuo Date: 2023-06-16 22:44:22 +0900 (Fri, 16 Jun 2023) Log Message: ----------- UTF-8 のデコードを厳密にした - サロゲートペアなども考慮 - Unicode仕様書を参考にした - https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf - Table 3-7. Well-Formed UTF-8 Byte Sequences - 実装を見直した ticket #48226 Ticket Links: ------------ https://osdn.net/projects/ttssh2/tracker/detail/48226 Modified Paths: -------------- trunk/teraterm/teraterm/charset.c -------------- next part -------------- Modified: trunk/teraterm/teraterm/charset.c =================================================================== --- trunk/teraterm/teraterm/charset.c 2023-06-15 13:01:12 UTC (rev 10765) +++ trunk/teraterm/teraterm/charset.c 2023-06-16 13:44:22 UTC (rev 10766) @@ -568,10 +568,8 @@ VttermKanjiWork *w = &KanjiWork; static BYTE buf[4]; static int count = 0; + char32_t code; - unsigned int code; - int i; - if (Fallbacked) { BOOL r = ParseFirstJP(b); Fallbacked = FALSE; @@ -578,92 +576,76 @@ return r; } - if (b < 0x20) { - PutReplacementChr(w, buf, count, ts.FallbackToCP932); - count = 0; - ParseASCII(b); - return TRUE; - } - // UTF-8エンコード - // Unicode 1byte, 2byte, 3byte, 4byte - // U+0000 ... U+007f 0x00 .. 0x7f - // U+0080 ... U+07ff 0xc2 .. 0xdf, 0x80 .. 0xbf - // U+0800 ... U+ffff 0xe0 .. 0xef, 0x80 .. 0xbf, 0x80 .. 0xbf - // U+10000 ... U+10ffff 0xf0 .. 0xf4, 0x80 .. 0xbf, 0x80 .. 0xbf, 0x80 .. 0xbf + // The Unicode Standard Chapter 3 + // Table 3-7. 
Well-Formed UTF-8 Byte Sequences + // | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte | + // | U+0000..U+007F | 00..7F | | | | + // | U+0080..U+07FF | C2..DF | 80..BF | | | + // | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | + // | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | + // | U+D000..U+D7FF | ED | 80..9F | 80..BF | | + // | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | + // | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | + // | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | + // | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | // UTF-8でデコードできない場合 // - 1byte目 - // - C1(0x80 - 0x9f) - // - 0xa0 - 0xc1 - // - 0xf5 - 0xff + // - 0x00 - 0x7f ok + // - 0x80 - 0xc1 ng + // - 0xc2 - 0xf4 ok + // - 0xf5 - 0xff ng // - 2byte目以降 - // - 0x00 - 0x7f - // - 0xc0 - 0xff + // - 0x00 - 0x7f ng + // - 0x80 - 0xbf ok + // - 0xc0 - 0xff ng + // - 2byte目例外 + // - 1byte == 0xe0 のとき 0xa0 - 0xbfのみok + // - 1byte == 0xed のとき 0x80 - 0x9fのみok + // - 1byte == 0xf0 のとき 0x90 - 0xbfのみok + // - 1byte == 0xf4 のとき 0x80 - 0x8fのみok recheck: // 1byte(7bit) if (count == 0) { - if ((b & 0x80) == 0x00) { + if (b <= 0x7f) { // 1byte(7bit) // 0x7f以下, のとき、そのまま出力 ParseASCII(b); return TRUE; } - if ((b & 0x40) == 0x00 || b >= 0xf6) { - // UTF-8で1byteに出現しないコードのとき - // 0x40 = 0b1011_1111, 0b10xx_xxxxというbitパターンにはならない - // 0xf6 以上のとき U+10FFFFより大きくなる - if (ts.FallbackToCP932) { - // fallbackする場合 - if ((ts.Language == IdJapanese) && ismbbleadSJIS(b)) { - // 日本語の場合 && Shift_JIS 1byte目 - // Shift_JIS に fallback - Fallbacked = TRUE; - ConvJIS = FALSE; - Kanji = b << 8; - KanjiIn = TRUE; - return TRUE; - } - // fallback ISO8859-1 - 
PutU32(b); - return TRUE; - } - else { - // fallbackしない, 不正な文字入力 - buf[0] = b; - PutReplacementChr(w, buf, 1, FALSE); - } + if (0xc2 <= b && b <= 0xf4) { + // 1byte目保存 + buf[count++] = b; return TRUE; } - // 1byte目保存 - buf[count++] = b; - return TRUE; - } - // 2byte(11bit) - if ((buf[0] & 0xe0) == 0xc0) { - code = 0; - if((b & 0xc0) == 0x80) { - // 5bit + 6bit - code = ((buf[0] & 0x1f) << 6) | (b & 0x3f); - if (code < 0x80) { - // 11bit使って7bit以下の時、UTF-8の冗長な表現 - code = 0; + // UTF-8で1byteに出現しないコードのとき + if (ts.FallbackToCP932) { + // fallbackする場合 + if ((ts.Language == IdJapanese) && ismbbleadSJIS(b)) { + // 日本語の場合 && Shift_JIS 1byte目 + // Shift_JIS に fallback + Fallbacked = TRUE; + ConvJIS = FALSE; + Kanji = b << 8; + KanjiIn = TRUE; + return TRUE; } + // fallback ISO8859-1 + PutU32(b); + return TRUE; } - if (code == 0){ - PutReplacementChr(w, buf, 1, ts.FallbackToCP932); - count = 0; - goto recheck; - } else { - PutU32(code); - count = 0; - return TRUE; + // fallbackしない, 不正な文字入力 + buf[0] = b; + PutReplacementChr(w, buf, 1, FALSE); } + return TRUE; } // 2byte以降正常? - if ((b & 0xc0) != 0x80) { // 上位2bitが 10 か? 
+ if((b & 0xc0) != 0x80) { // == (b <= 0x7f || 0xc0 <= b) + // 不正な文字, (上位2bitが 0b10xx_xxxx ではない) PutReplacementChr(w, buf, count, ts.FallbackToCP932); count = 0; goto recheck; @@ -672,67 +654,55 @@ // 2byte目以降保存 buf[count++] = b; - // 3byte(16bit) - if ((buf[0] & 0xf0) == 0xe0) { - if(count < 3) { + // 2byte(11bit) + if (count == 2) { + if ((buf[0] & 0xe0) == 0xc0) { // == (0xc2 <= buf[0] && buf[0] <= 0xdf) + // 5bit + 6bit + code = ((buf[0] & 0x1f) << 6) | (b & 0x3f); + PutU32(code); + count = 0; return TRUE; } - code = 0; - if ((buf[1] & 0xc0) == 0x80 && (buf[2] & 0xc0) == 0x80) { + return TRUE; + } + + // 3byte(16bit) + if (count == 3) { + if ((buf[0] & 0xf0) == 0xe0) { + if ((buf[0] == 0xe0 && (buf[1] < 0xa0 || 0xbf < buf[1])) || + (buf[0] == 0xed && ( 0x9f < buf[1]))) { + // 不正な UTF-8 + PutReplacementChr(w, buf, 2, ts.FallbackToCP932); + count = 0; + goto recheck; + } // 4bit + 6bit + 6bit code = ((buf[0] & 0xf) << 12); code |= ((buf[1] & 0x3f) << 6); code |= ((buf[2] & 0x3f)); - if (code < 0x800) { - // 16bit使って11bit以下のとき、UTF-8の冗長な表現 - code = 0; - } - } - if (code == 0) { - PutReplacementChr(w, buf, count - 1, ts.FallbackToCP932); - count = 0; - goto recheck; - } else { PutU32(code); count = 0; return TRUE; } + return TRUE; } // 4byte(21bit) - if ((buf[0] & 0xf8) == 0xf0) { - if(count < 4) { - return TRUE; - } - code = 0; - if ((buf[1] & 0xc0) == 0x80 && (buf[2] & 0xc0) == 0x80 && (buf[3] & 0xc0) == 0x80) { - // 3bit + 6bit + 6bit + 6bit - code = ((buf[0] & 0x07) << 18); - code |= ((buf[1] & 0x3f) << 12); - code |= ((buf[2] & 0x3f) << 6); - code |= (buf[3] & 0x3f); - if (code < 0x10000) { - // 21bit使って16bit以下のとき、UTF-8の冗長な表現 - code = 0; - } - } - if (code == 0) { - PutReplacementChr(w, buf, count - 1, ts.FallbackToCP932); - count = 0; - goto recheck; - } else { - PutU32(code); - 
count = 0; - return TRUE; - } + assert(count == 4); + assert((buf[0] & 0xf8) == 0xf0); + if ((buf[0] == 0xf0 && (buf[1] < 0x90 || 0x9f < buf[1])) || + (buf[0] == 0xf4 && (buf[1] < 0x80 || 0x8f < buf[1]))) { + // 不正な UTF-8 + PutReplacementChr(w, buf, 3, ts.FallbackToCP932); + count = 0; + goto recheck; } - - // ここには来ない - assert(FALSE); - - for (i = 0; i < count; i++) { - ParseASCII(buf[i]); - } + // 3bit + 6bit + 6bit + 6bit + code = ((buf[0] & 0x07) << 18); + code |= ((buf[1] & 0x3f) << 12); + code |= ((buf[2] & 0x3f) << 6); + code |= (buf[3] & 0x3f); + PutU32(code); count = 0; return TRUE; }