Develop and Download Open Source Software

Browse Subversion Repository

Annotation of /trunk/teraterm/teraterm/charset.cpp

Parent Directory | Revision Log


Revision 10800 - (hide annotations) (download) (as text)
Fri Jul 7 16:09:54 2023 UTC (9 months ago) by zmatsuo
File MIME type: text/x-c++src
File size: 23843 byte(s)
不正なUTF-8を受信したとき出力する文字コードを U+FFFD に変更した

- U+FFFD REPLACEMENT CHARACTER
  - 変更前は "?" だった

ticket #48226
1 zmatsuo 10755 /*
2     * (C) 2023- TeraTerm Project
3     * All rights reserved.
4     *
5     * Redistribution and use in source and binary forms, with or without
6     * modification, are permitted provided that the following conditions
7     * are met:
8     *
9     * 1. Redistributions of source code must retain the above copyright
10     * notice, this list of conditions and the following disclaimer.
11     * 2. Redistributions in binary form must reproduce the above copyright
12     * notice, this list of conditions and the following disclaimer in the
13     * documentation and/or other materials provided with the distribution.
14     * 3. The name of the author may not be used to endorse or promote products
15     * derived from this software without specific prior written permission.
16     *
17     * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
18     * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19     * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20     * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
21     * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22     * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23     * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24     * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25     * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26     * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27     */
28    
29     #include "teraterm.h"
30     #include "tttypes.h"
31     #include <stdio.h>
32     #include <string.h>
33     #if !defined(_CRTDBG_MAP_ALLOC)
34     #define _CRTDBG_MAP_ALLOC
35     #endif
36     #include <stdlib.h>
37     #include <crtdbg.h>
38     #include <assert.h>
39    
40     #include "ttwinman.h"
41     #include "codeconv.h"
42     #include "unicode.h"
43     #include "language.h" // for JIS2SJIS()
44 zmatsuo 10763 #include "ttcstd.h"
45 zmatsuo 10773 #include "vtterm.h"
46 zmatsuo 10755
47     #include "charset.h"
48    
49 zmatsuo 10763 // Character displayed when the received UTF-8 data is invalid
50 zmatsuo 10800 #define REPLACEMENT_CHARACTER 0xfffd // REPLACEMENT CHARACTER
51     //#define REPLACEMENT_CHARACTER 0x2e2e // Reversed Question Mark (VT382)
52 zmatsuo 10763
53 zmatsuo 10782 typedef struct CharSetDataTag {
54 zmatsuo 10755 /* GL, GR code group */
55     int Glr[2];
56     /* G0, G1, G2, G3 code group */
57     int Gn[4];
58 zmatsuo 10782 /* GL for single shift 2/3 */
59     int GLtmp;
60     /* single shift 2/3 flag */
61     BOOL SSflag;
62 zmatsuo 10763 //
63     char32_t replacement_char;
64 zmatsuo 10767 // UTF-8 work
65     BYTE buf[4];
66     int count;
67 zmatsuo 10782 BOOL Fallbacked;
68 zmatsuo 10755
69 zmatsuo 10782 // MBCS
70     BOOL KanjiIn; // TRUE = MBCS��1byte�������M��������
71     WORD Kanji;
72 zmatsuo 10755
73 zmatsuo 10782 // EUC
74     BOOL EUCkanaIn;
75     BOOL EUCsupIn;
76     int EUCcount;
77    
78     /* JIS -> SJIS conversion flag */
79     BOOL ConvJIS;
80     BYTE DebugFlag;
81    
82     // Operations
83     CharSetOp Op;
84     void *ClientData;
85     } CharSetData;
86    
87 zmatsuo 10770 static BOOL IsC0(char32_t b)
88     {
89     return (b <= US);
90     }
91    
92     static BOOL IsC1(char32_t b)
93     {
94     return ((b>=0x80) && (b<=0x9F));
95     }
96    
97     /**
98 zmatsuo 10755 * ISO2022�p���[�N������������
99     */
100 zmatsuo 10782 static void CharSetInit2(CharSetData *w)
101 zmatsuo 10755 {
102     if (ts.Language==IdJapanese) {
103     w->Gn[0] = IdASCII;
104     w->Gn[1] = IdKatakana;
105     w->Gn[2] = IdKatakana;
106     w->Gn[3] = IdKanji;
107     w->Glr[0] = 0;
108     if ((ts.KanjiCode==IdJIS) && (ts.JIS7Katakana==0))
109     w->Glr[1] = 2; // 8-bit katakana
110     else
111     w->Glr[1] = 3;
112     }
113     else {
114     w->Gn[0] = IdASCII;
115     w->Gn[1] = IdSpecial;
116     w->Gn[2] = IdASCII;
117     w->Gn[3] = IdASCII;
118     w->Glr[0] = 0;
119     w->Glr[1] = 0;
120     }
121     }
122    
123     /**
124     * �������A���[�N������������
125     */
126 zmatsuo 10782 CharSetData *CharSetInit(const CharSetOp *op, void *client_data)
127 zmatsuo 10755 {
128 zmatsuo 10782 CharSetData *w = (CharSetData *)calloc(sizeof(*w), 1);
129     if (w == NULL) {
130     return NULL;
131     }
132 zmatsuo 10763
133 zmatsuo 10782 w->Op = *op;
134     w->ClientData = client_data;
135    
136 zmatsuo 10763 CharSetInit2(w);
137 zmatsuo 10782 w->GLtmp = 0;
138     w->SSflag = FALSE;
139 zmatsuo 10763
140 zmatsuo 10782 w->DebugFlag = DEBUG_FLAG_NONE;
141    
142 zmatsuo 10763 w->replacement_char = REPLACEMENT_CHARACTER;
143 zmatsuo 10782 w->SSflag = FALSE;
144 zmatsuo 10755
145 zmatsuo 10782 w->KanjiIn = FALSE;
146     w->EUCkanaIn = FALSE;
147     w->EUCsupIn = FALSE;
148     w->ConvJIS = FALSE;
149     w->Fallbacked = FALSE;
150    
151     return w;
152 zmatsuo 10755 }
153    
154 zmatsuo 10782 void CharSetFinish(CharSetData *w)
155     {
156     assert(w != NULL);
157     free(w);
158     }
159    
160 zmatsuo 10755 /**
161     * 1byte���`�F�b�N
162     */
163     static BOOL CheckFirstByte(BYTE b, int lang, int kanji_code)
164     {
165     switch (lang) {
166     case IdKorean:
167 zmatsuo 10779 return __ismbblead(b, 949);
168 zmatsuo 10755 case IdChinese:
169     if (kanji_code == IdCnGB2312) {
170     return __ismbblead(b, 936);
171     }
172     else if (ts.KanjiCode == IdCnBig5) {
173     return __ismbblead(b, 950);
174     }
175     break;
176     default:
177     assert(FALSE);
178     break;
179     }
180     assert(FALSE);
181     return FALSE;
182     }
183 zmatsuo 10763
184 zmatsuo 10755 /**
185 zmatsuo 10763 * Double-byte Character Sets
186     * SJIS��1byte��?
187     *
188     * ��1�o�C�g0x81...0x9F or 0xE0...0xEF
189     * ��1�o�C�g0x81...0x9F or 0xE0...0xFC
190     */
191     static BOOL ismbbleadSJIS(BYTE b)
192     {
193     if (((0x80<b) && (b<0xa0)) || ((0xdf<b) && (b<0xfd))) {
194     return TRUE;
195     }
196     return FALSE;
197     }
198    
199     /**
200 zmatsuo 10755 * ts.Language == IdJapanese ��
201     * 1byte���`�F�b�N
202     */
203 zmatsuo 10782 static BOOL CheckKanji(CharSetData *w, BYTE b)
204 zmatsuo 10755 {
205     BOOL Check;
206    
207     if (ts.Language!=IdJapanese)
208     return FALSE;
209    
210 zmatsuo 10782 w->ConvJIS = FALSE;
211 zmatsuo 10755
212     if (ts.KanjiCode==IdSJIS ||
213 zmatsuo 10794 (ts.UTF8Fallback && ts.KanjiCode==IdUTF8)) {
214 zmatsuo 10759 if (((0x80<b) && (b<0xa0)) || ((0xdf<b) && (b<0xfd))) {
215 zmatsuo 10782 w->Fallbacked = TRUE;
216 zmatsuo 10755 return TRUE; // SJIS kanji
217     }
218     if ((0xa1<=b) && (b<=0xdf)) {
219     return FALSE; // SJIS katakana
220     }
221     }
222    
223     if ((b>=0x21) && (b<=0x7e)) {
224     Check = (w->Gn[w->Glr[0]] == IdKanji);
225 zmatsuo 10782 w->ConvJIS = Check;
226 zmatsuo 10755 }
227     else if ((b>=0xA1) && (b<=0xFE)) {
228     Check = (w->Gn[w->Glr[1]] == IdKanji);
229     if (ts.KanjiCode==IdEUC) {
230     Check = TRUE;
231     }
232     else if (ts.KanjiCode==IdJIS && ((ts.TermFlag & TF_FIXEDJIS)!=0) && (ts.JIS7Katakana==0)) {
233     Check = FALSE; // 8-bit katakana
234     }
235 zmatsuo 10782 w->ConvJIS = Check;
236 zmatsuo 10755 }
237     else {
238     Check = FALSE;
239     }
240    
241     return Check;
242     }
243    
244 zmatsuo 10782 static BOOL ParseFirstJP(CharSetData *w, BYTE b)
245 zmatsuo 10755 // returns TRUE if b is processed
246     // (actually allways returns TRUE)
247     {
248 zmatsuo 10782 if (w->KanjiIn) {
249     if (((! w->ConvJIS) && (0x3F<b) && (b<0xFD)) ||
250     (w->ConvJIS && ( ((0x20<b) && (b<0x7f)) ||
251 zmatsuo 10759 ((0xa0<b) && (b<0xff)) )) )
252 zmatsuo 10755 {
253 zmatsuo 10758 unsigned long u32;
254 zmatsuo 10782 w->Kanji = w->Kanji + b;
255     if (w->ConvJIS) {
256 zmatsuo 10758 // JIS -> Shift_JIS(CP932)
257 zmatsuo 10782 w->Kanji = JIS2SJIS((WORD)(w->Kanji & 0x7f7f));
258 zmatsuo 10758 }
259 zmatsuo 10782 u32 = CP932ToUTF32(w->Kanji);
260     w->Op.PutU32(u32, w->ClientData);
261     w->KanjiIn = FALSE;
262 zmatsuo 10755 return TRUE;
263     }
264     else if ((ts.TermFlag & TF_CTRLINKANJI)==0) {
265 zmatsuo 10782 w->KanjiIn = FALSE;
266 zmatsuo 10755 }
267     }
268    
269 zmatsuo 10782 if (w->SSflag) {
270     if (w->Gn[w->GLtmp] == IdKanji) {
271     w->Kanji = b << 8;
272     w->KanjiIn = TRUE;
273     w->SSflag = FALSE;
274 zmatsuo 10755 return TRUE;
275     }
276 zmatsuo 10782 else if (w->Gn[w->GLtmp] == IdKatakana) {
277 zmatsuo 10755 b = b | 0x80;
278     }
279    
280 zmatsuo 10782 w->Op.PutU32(b, w->ClientData);
281     w->SSflag = FALSE;
282 zmatsuo 10755 return TRUE;
283     }
284    
285 zmatsuo 10782 if ((!w->EUCsupIn) && (!w->EUCkanaIn) && (!w->KanjiIn) && CheckKanji(w, b)) {
286     w->Kanji = b << 8;
287     w->KanjiIn = TRUE;
288 zmatsuo 10755 return TRUE;
289     }
290    
291     if (b<=US) {
292 zmatsuo 10782 w->Op.ParseControl(b, w->ClientData);
293 zmatsuo 10755 }
294     else if (b==0x20) {
295 zmatsuo 10782 w->Op.PutU32(b, w->ClientData);
296 zmatsuo 10755 }
297     else if ((b>=0x21) && (b<=0x7E)) {
298 zmatsuo 10782 if (w->EUCsupIn) {
299     w->EUCcount--;
300     w->EUCsupIn = (w->EUCcount==0);
301 zmatsuo 10755 return TRUE;
302     }
303    
304 zmatsuo 10782 if ((w->Gn[w->Glr[0]] == IdKatakana) || w->EUCkanaIn) {
305 zmatsuo 10755 b = b | 0x80;
306 zmatsuo 10782 w->EUCkanaIn = FALSE;
307 zmatsuo 10755 {
308     // b��sjis�����p�J�^�J�i
309     unsigned long u32 = CP932ToUTF32(b);
310 zmatsuo 10782 w->Op.PutU32(u32, w->ClientData);
311 zmatsuo 10755 }
312     return TRUE;
313     }
314 zmatsuo 10782 w->Op.PutU32(b, w->ClientData);
315 zmatsuo 10755 }
316     else if (b==0x7f) {
317     return TRUE;
318     }
319     else if ((b>=0x80) && (b<=0x8D)) {
320 zmatsuo 10782 w->Op.ParseControl(b, w->ClientData);
321 zmatsuo 10755 }
322     else if (b==0x8E) { // SS2
323     switch (ts.KanjiCode) {
324     case IdEUC:
325     if (ts.ISO2022Flag & ISO2022_SS2) {
326 zmatsuo 10782 w->EUCkanaIn = TRUE;
327 zmatsuo 10755 }
328     break;
329     case IdUTF8:
330 zmatsuo 10800 w->Op.PutU32(w->replacement_char, w->ClientData);
331 zmatsuo 10755 break;
332     default:
333 zmatsuo 10782 w->Op.ParseControl(b, w->ClientData);
334 zmatsuo 10755 }
335     }
336     else if (b==0x8F) { // SS3
337     switch (ts.KanjiCode) {
338     case IdEUC:
339     if (ts.ISO2022Flag & ISO2022_SS3) {
340 zmatsuo 10782 w->EUCcount = 2;
341     w->EUCsupIn = TRUE;
342 zmatsuo 10755 }
343     break;
344     case IdUTF8:
345 zmatsuo 10800 w->Op.PutU32(w->replacement_char, w->ClientData);
346 zmatsuo 10755 break;
347     default:
348 zmatsuo 10782 w->Op.ParseControl(b, w->ClientData);
349 zmatsuo 10755 }
350     }
351     else if ((b>=0x90) && (b<=0x9F)) {
352 zmatsuo 10782 w->Op.ParseControl(b, w->ClientData);
353 zmatsuo 10755 }
354     else if (b==0xA0) {
355 zmatsuo 10782 w->Op.PutU32(0x20, w->ClientData);
356 zmatsuo 10755 }
357     else if ((b>=0xA1) && (b<=0xFE)) {
358 zmatsuo 10782 if (w->EUCsupIn) {
359     w->EUCcount--;
360     w->EUCsupIn = (w->EUCcount==0);
361 zmatsuo 10755 return TRUE;
362     }
363    
364     if ((w->Gn[w->Glr[1]] != IdASCII) ||
365 zmatsuo 10782 ((ts.KanjiCode==IdEUC) && w->EUCkanaIn) ||
366 zmatsuo 10755 (ts.KanjiCode==IdSJIS) ||
367 zmatsuo 10759 ((ts.KanjiCode==IdJIS) &&
368     (ts.JIS7Katakana==0) &&
369     ((ts.TermFlag & TF_FIXEDJIS)!=0))) {
370 zmatsuo 10755 // b��sjis�����p�J�^�J�i
371     unsigned long u32 = CP932ToUTF32(b);
372 zmatsuo 10782 w->Op.PutU32(u32, w->ClientData);
373 zmatsuo 10755 } else {
374     if (w->Gn[w->Glr[1]] == IdASCII) {
375     b = b & 0x7f;
376     }
377 zmatsuo 10782 w->Op.PutU32(b, w->ClientData);
378 zmatsuo 10755 }
379 zmatsuo 10782 w->EUCkanaIn = FALSE;
380 zmatsuo 10755 }
381     else {
382 zmatsuo 10782 w->Op.PutU32(b, w->ClientData);
383 zmatsuo 10755 }
384    
385     return TRUE;
386     }
387    
388 zmatsuo 10782 static BOOL ParseFirstKR(CharSetData *w, BYTE b)
389 zmatsuo 10755 // returns TRUE if b is processed
390     // (actually allways returns TRUE)
391     {
392 zmatsuo 10782 if (w->KanjiIn) {
393 zmatsuo 10759 if (((0x41<=b) && (b<=0x5A)) ||
394     ((0x61<=b) && (b<=0x7A)) ||
395     ((0x81<=b) && (b<=0xFE)))
396 zmatsuo 10755 {
397 zmatsuo 10758 unsigned long u32 = 0;
398 zmatsuo 10768 if (ts.KanjiCode == IdKoreanCP949) {
399 zmatsuo 10779 // CP949
400 zmatsuo 10782 w->Kanji = w->Kanji + b;
401     u32 = MBCP_UTF32(w->Kanji, 949);
402 zmatsuo 10758 }
403     else {
404     assert(FALSE);
405     }
406 zmatsuo 10782 w->Op.PutU32(u32, w->ClientData);
407     w->KanjiIn = FALSE;
408 zmatsuo 10755 return TRUE;
409     }
410     else if ((ts.TermFlag & TF_CTRLINKANJI)==0) {
411 zmatsuo 10782 w->KanjiIn = FALSE;
412 zmatsuo 10755 }
413     }
414    
415 zmatsuo 10782 if ((!w->KanjiIn) && CheckFirstByte(b, ts.Language, ts.KanjiCode)) {
416     w->Kanji = b << 8;
417     w->KanjiIn = TRUE;
418 zmatsuo 10755 return TRUE;
419     }
420    
421     if (b<=US) {
422 zmatsuo 10782 w->Op.ParseControl(b, w->ClientData);
423 zmatsuo 10755 }
424     else if (b==0x20) {
425 zmatsuo 10782 w->Op.PutU32(b, w->ClientData);
426 zmatsuo 10755 }
427     else if ((b>=0x21) && (b<=0x7E)) {
428     // if (Gn[Glr[0]] == IdKatakana) {
429     // b = b | 0x80;
430     // }
431 zmatsuo 10782 w->Op.PutU32(b, w->ClientData);
432 zmatsuo 10755 }
433     else if (b==0x7f) {
434     return TRUE;
435     }
436     else if ((0x80<=b) && (b<=0x9F)) {
437 zmatsuo 10782 w->Op.ParseControl(b, w->ClientData);
438 zmatsuo 10755 }
439     else if (b==0xA0) {
440 zmatsuo 10782 w->Op.PutU32(0x20, w->ClientData);
441 zmatsuo 10755 }
442     else if ((b>=0xA1) && (b<=0xFE)) {
443     if (w->Gn[w->Glr[1]] == IdASCII) {
444     b = b & 0x7f;
445     }
446 zmatsuo 10782 w->Op.PutU32(b, w->ClientData);
447 zmatsuo 10755 }
448     else {
449 zmatsuo 10782 w->Op.PutU32(b, w->ClientData);
450 zmatsuo 10755 }
451    
452     return TRUE;
453     }
454    
455 zmatsuo 10782 static BOOL ParseFirstCn(CharSetData *w, BYTE b)
456 zmatsuo 10755 // returns TRUE if b is processed
457     // (actually allways returns TRUE)
458     {
459 zmatsuo 10782 if (w->KanjiIn) {
460 zmatsuo 10755 // TODO
461 zmatsuo 10759 if (((0x40<=b) && (b<=0x7e)) ||
462     ((0xa1<=b) && (b<=0xFE)))
463 zmatsuo 10755 {
464 zmatsuo 10758 unsigned long u32 = 0;
465 zmatsuo 10782 w->Kanji = w->Kanji + b;
466 zmatsuo 10758 if (ts.KanjiCode == IdCnGB2312) {
467     // CP936 GB2312
468 zmatsuo 10782 u32 = MBCP_UTF32(w->Kanji, 936);
469 zmatsuo 10758 }
470     else if (ts.KanjiCode == IdCnBig5) {
471     // CP950 Big5
472 zmatsuo 10782 u32 = MBCP_UTF32(w->Kanji, 950);
473 zmatsuo 10758 }
474     else {
475     assert(FALSE);
476     }
477 zmatsuo 10782 w->Op.PutU32(u32, w->ClientData);
478     w->KanjiIn = FALSE;
479 zmatsuo 10755 return TRUE;
480     }
481     else if ((ts.TermFlag & TF_CTRLINKANJI)==0) {
482 zmatsuo 10782 w->KanjiIn = FALSE;
483 zmatsuo 10755 }
484     }
485    
486 zmatsuo 10782 if ((!w->KanjiIn) && CheckFirstByte(b, ts.Language, ts.KanjiCode)) {
487     w->Kanji = b << 8;
488     w->KanjiIn = TRUE;
489 zmatsuo 10755 return TRUE;
490     }
491    
492     if (b<=US) {
493 zmatsuo 10782 w->Op.ParseControl(b, w->ClientData);
494 zmatsuo 10755 }
495     else if (b==0x20) {
496 zmatsuo 10782 w->Op.PutU32(b, w->ClientData);
497 zmatsuo 10755 }
498     else if ((b>=0x21) && (b<=0x7E)) {
499     // if (Gn[Glr[0]] == IdKatakana) {
500     // b = b | 0x80;
501     // }
502 zmatsuo 10782 w->Op.PutU32(b, w->ClientData);
503 zmatsuo 10755 }
504     else if (b==0x7f) {
505     return TRUE;
506     }
507     else if ((0x80<=b) && (b<=0x9F)) {
508 zmatsuo 10782 w->Op.ParseControl(b, w->ClientData);
509 zmatsuo 10755 }
510     else if (b==0xA0) {
511 zmatsuo 10782 w->Op.PutU32(0x20, w->ClientData);
512 zmatsuo 10755 }
513     else if ((b>=0xA1) && (b<=0xFE)) {
514     if (w->Gn[w->Glr[1]] == IdASCII) {
515     b = b & 0x7f;
516     }
517 zmatsuo 10782 w->Op.PutU32(b, w->ClientData);
518 zmatsuo 10755 }
519     else {
520 zmatsuo 10782 w->Op.PutU32(b, w->ClientData);
521 zmatsuo 10755 }
522    
523     return TRUE;
524     }
525    
526 zmatsuo 10782 static void ParseASCII(CharSetData *w, BYTE b)
527 zmatsuo 10755 {
528 zmatsuo 10782 if (w->SSflag) {
529     w->Op.PutU32(b, w->ClientData);
530     w->SSflag = FALSE;
531 zmatsuo 10755 return;
532     }
533    
534     if (b<=US) {
535 zmatsuo 10782 w->Op.ParseControl(b, w->ClientData);
536 zmatsuo 10755 } else if ((b>=0x20) && (b<=0x7E)) {
537 zmatsuo 10782 w->Op.PutU32(b, w->ClientData);
538 zmatsuo 10755 } else if ((b==0x8E) || (b==0x8F)) {
539 zmatsuo 10800 w->Op.PutU32(w->replacement_char, w->ClientData);
540 zmatsuo 10755 } else if ((b>=0x80) && (b<=0x9F)) {
541 zmatsuo 10782 w->Op.ParseControl(b, w->ClientData);
542 zmatsuo 10755 } else if (b>=0xA0) {
543 zmatsuo 10782 w->Op.PutU32(b, w->ClientData);
544 zmatsuo 10755 }
545     }
546    
547 zmatsuo 10770 /**
548     * REPLACEMENT_CHARACTER ���\��
549     * UTF-8 �f�R�[�h�����g�p
550     */
551 zmatsuo 10782 static void PutReplacementChr(CharSetData *w, const BYTE *ptr, size_t len, BOOL fallback)
552 zmatsuo 10763 {
553     const char32_t replacement_char = w->replacement_char;
554     int i;
555     for (i = 0; i < len; i++) {
556     BYTE c = *ptr++;
557 zmatsuo 10789 assert(!IsC0(c));
558 zmatsuo 10764 if (fallback) {
559     // fallback ISO8859-1
560 zmatsuo 10782 w->Op.PutU32(c, w->ClientData);
561 zmatsuo 10763 }
562     else {
563 zmatsuo 10764 // fallback������
564     if (c < 0x80) {
565     // �s����UTF-8��������������0x80�������������A
566     // 1������UTF-8�������������������\������
567 zmatsuo 10782 w->Op.PutU32(c, w->ClientData);
568 zmatsuo 10764 }
569     else {
570 zmatsuo 10782 w->Op.PutU32(replacement_char, w->ClientData);
571 zmatsuo 10764 }
572 zmatsuo 10763 }
573     }
574     }
575    
576 zmatsuo 10770 /**
577     * UTF-8�����M�f�[�^����������
578     *
579     * returns TRUE if b is processed
580     */
581 zmatsuo 10782 static BOOL ParseFirstUTF8(CharSetData *w, BYTE b)
582 zmatsuo 10755 {
583 zmatsuo 10766 char32_t code;
584 zmatsuo 10755
585 zmatsuo 10782 if (w->Fallbacked) {
586     BOOL r = ParseFirstJP(w, b);
587     w->Fallbacked = FALSE;
588 zmatsuo 10763 return r;
589 zmatsuo 10755 }
590    
591     // UTF-8�G���R�[�h
592 zmatsuo 10766 // The Unicode Standard Chapter 3
593     // Table 3-7. Well-Formed UTF-8 Byte Sequences
594 zmatsuo 10777 // | Code Points | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
595     // | U+0000..U+007F | 00..7F | | | |
596     // | U+0080..U+07FF | C2..DF | 80..BF | | |
597     // | U+0800..U+0FFF | E0 | A0..BF | 80..BF | |
598     // | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | |
599     // | U+D000..U+D7FF | ED | 80..9F | 80..BF | |
600     // | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | |
601     // | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF |
602     // | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF |
603     // | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF |
604 zmatsuo 10755 // - 1byte��
605 zmatsuo 10766 // - 0x00 - 0x7f ok
606     // - 0x80 - 0xc1 ng
607     // - 0xc2 - 0xf4 ok
608     // - 0xf5 - 0xff ng
609 zmatsuo 10755 // - 2byte�����~
610 zmatsuo 10766 // - 0x00 - 0x7f ng
611     // - 0x80 - 0xbf ok
612     // - 0xc0 - 0xff ng
613     // - 2byte�����O
614     // - 1byte == 0xe0 ������ 0xa0 - 0xbf����ok
615     // - 1byte == 0xed ������ 0x80 - 0x9f����ok
616     // - 1byte == 0xf0 ������ 0x90 - 0xbf����ok
617     // - 1byte == 0xf4 ������ 0x90 - 0x8f����ok
618 zmatsuo 10763 recheck:
619 zmatsuo 10755 // 1byte(7bit)
620 zmatsuo 10767 if (w->count == 0) {
621 zmatsuo 10770 if (IsC0(b)) {
622     // U+0000 .. U+001f
623     // C0��������, C0 Coontrols
624 zmatsuo 10782 w->Op.ParseControl(b, w->ClientData);
625 zmatsuo 10755 return TRUE;
626     }
627 zmatsuo 10770 else if (b <= 0x7f) {
628     // 0x7f����, �������A���������o��
629 zmatsuo 10782 w->Op.PutU32(b, w->ClientData);
630 zmatsuo 10770 return TRUE;
631     }
632     else if (0xc2 <= b && b <= 0xf4) {
633 zmatsuo 10766 // 1byte������
634 zmatsuo 10767 w->buf[w->count++] = b;
635 zmatsuo 10755 return TRUE;
636     }
637    
638 zmatsuo 10770 // 0x80 - 0xc1, 0xf5 - 0xff
639 zmatsuo 10766 // UTF-8��1byte���o���������R�[�h������
640 zmatsuo 10794 if (ts.UTF8Fallback) {
641 zmatsuo 10766 // fallback��������
642     if ((ts.Language == IdJapanese) && ismbbleadSJIS(b)) {
643     // ���{�������� && Shift_JIS 1byte��
644     // Shift_JIS �� fallback
645 zmatsuo 10782 w->Fallbacked = TRUE;
646     w->ConvJIS = FALSE;
647     w->Kanji = b << 8;
648     w->KanjiIn = TRUE;
649 zmatsuo 10766 return TRUE;
650 zmatsuo 10755 }
651 zmatsuo 10766 // fallback ISO8859-1
652 zmatsuo 10782 w->Op.PutU32(b, w->ClientData);
653 zmatsuo 10766 return TRUE;
654 zmatsuo 10755 }
655     else {
656 zmatsuo 10766 // fallback������, �s������������
657 zmatsuo 10767 w->buf[0] = b;
658     PutReplacementChr(w, w->buf, 1, FALSE);
659 zmatsuo 10755 }
660 zmatsuo 10766 return TRUE;
661 zmatsuo 10755 }
662    
663 zmatsuo 10764 // 2byte���~����?
664 zmatsuo 10766 if((b & 0xc0) != 0x80) { // == (b <= 0x7f || 0xc0 <= b)
665     // �s��������, (����2bit�� 0b10xx_xxxx ��������)
666 zmatsuo 10794 PutReplacementChr(w, w->buf, w->count, ts.UTF8Fallback);
667 zmatsuo 10767 w->count = 0;
668 zmatsuo 10764 goto recheck;
669     }
670    
671 zmatsuo 10755 // 2byte�����~����
672 zmatsuo 10767 w->buf[w->count++] = b;
673 zmatsuo 10755
674 zmatsuo 10766 // 2byte(11bit)
675 zmatsuo 10767 if (w->count == 2) {
676     if ((w->buf[0] & 0xe0) == 0xc0) { // == (0xc2 <= w->buf[0] && w->buf[0] <= 0xdf)
677 zmatsuo 10766 // 5bit + 6bit
678 zmatsuo 10767 code = ((w->buf[0] & 0x1f) << 6) | (b & 0x3f);
679 zmatsuo 10770 if (IsC1(code)) {
680     // U+0080 .. u+009f
681     // C1��������, C1 Controls
682 zmatsuo 10782 w->Op.ParseControl((BYTE)code, w->ClientData);
683 zmatsuo 10770 }
684     else {
685 zmatsuo 10782 w->Op.PutU32(code, w->ClientData);
686 zmatsuo 10770 }
687 zmatsuo 10767 w->count = 0;
688 zmatsuo 10755 return TRUE;
689     }
690 zmatsuo 10766 return TRUE;
691     }
692    
693     // 3byte(16bit)
694 zmatsuo 10767 if (w->count == 3) {
695     if ((w->buf[0] & 0xf0) == 0xe0) {
696     if ((w->buf[0] == 0xe0 && (w->buf[1] < 0xa0 || 0xbf < w->buf[1])) ||
697     (w->buf[0] == 0xed && ( 0x9f < w->buf[1]))) {
698 zmatsuo 10766 // �s���� UTF-8
699 zmatsuo 10794 PutReplacementChr(w, w->buf, 2, ts.UTF8Fallback);
700 zmatsuo 10767 w->count = 0;
701 zmatsuo 10766 goto recheck;
702     }
703 zmatsuo 10755 // 4bit + 6bit + 6bit
704 zmatsuo 10767 code = ((w->buf[0] & 0xf) << 12);
705     code |= ((w->buf[1] & 0x3f) << 6);
706     code |= ((w->buf[2] & 0x3f));
707 zmatsuo 10782 w->Op.PutU32(code, w->ClientData);
708 zmatsuo 10767 w->count = 0;
709 zmatsuo 10755 return TRUE;
710     }
711 zmatsuo 10766 return TRUE;
712 zmatsuo 10755 }
713    
714     // 4byte(21bit)
715 zmatsuo 10767 assert(w->count == 4);
716     assert((w->buf[0] & 0xf8) == 0xf0);
717     if ((w->buf[0] == 0xf0 && (w->buf[1] < 0x90 || 0x9f < w->buf[1])) ||
718     (w->buf[0] == 0xf4 && (w->buf[1] < 0x80 || 0x8f < w->buf[1]))) {
719 zmatsuo 10766 // �s���� UTF-8
720 zmatsuo 10794 PutReplacementChr(w, w->buf, 3, ts.UTF8Fallback);
721 zmatsuo 10767 w->count = 0;
722 zmatsuo 10766 goto recheck;
723 zmatsuo 10755 }
724 zmatsuo 10766 // 3bit + 6bit + 6bit + 6bit
725 zmatsuo 10767 code = ((w->buf[0] & 0x07) << 18);
726     code |= ((w->buf[1] & 0x3f) << 12);
727     code |= ((w->buf[2] & 0x3f) << 6);
728     code |= (w->buf[3] & 0x3f);
729 zmatsuo 10782 w->Op.PutU32(code, w->ClientData);
730 zmatsuo 10767 w->count = 0;
731 zmatsuo 10755 return TRUE;
732     }
733    
734 zmatsuo 10782 static BOOL ParseFirstRus(CharSetData *w, BYTE b)
735 zmatsuo 10755 // returns if b is processed
736     {
737 zmatsuo 10770 if (IsC0(b)) {
738 zmatsuo 10782 w->Op.ParseControl(b, w->ClientData);
739 zmatsuo 10770 return TRUE;
740     }
741 zmatsuo 10756 // CP1251������
742     BYTE c = RussConv(ts.KanjiCode, IdWindows, b);
743     // CP1251->Unicode
744     unsigned long u32 = MBCP_UTF32(c, 1251);
745 zmatsuo 10782 w->Op.PutU32(u32, w->ClientData);
746 zmatsuo 10756 return TRUE;
747 zmatsuo 10755 }
748    
749 zmatsuo 10782 static BOOL ParseEnglish(CharSetData *w, BYTE b)
750 zmatsuo 10755 {
751     unsigned short u16 = 0;
752     int part = KanjiCodeToISO8859Part(ts.KanjiCode);
753     int r = UnicodeFromISO8859(part, b, &u16);
754     if (r == 0) {
755     return FALSE;
756     }
757     if (u16 < 0x100) {
758 zmatsuo 10782 ParseASCII(w, (BYTE)u16);
759 zmatsuo 10755 }
760     else {
761 zmatsuo 10782 w->Op.PutU32(u16, w->ClientData);
762 zmatsuo 10755 }
763     return TRUE;
764     }
765    
766 zmatsuo 10782 static void PutDebugChar(CharSetData *w, BYTE b)
767 zmatsuo 10771 {
768     int i;
769     BOOL svInsertMode, svAutoWrapMode;
770     TCharAttr svCharAttr;
771     TCharAttr char_attr;
772    
773     svInsertMode = TermGetInsertMode();
774     TermSetInsertMode(FALSE);
775     svAutoWrapMode = TermGetAutoWrapMode();
776     TermSetAutoWrapMode(TRUE);
777    
778     TermGetAttr(&svCharAttr);
779     char_attr = svCharAttr;
780     char_attr.Attr = AttrDefault;
781     TermSetAttr(&char_attr);
782    
783 zmatsuo 10782 if (w->DebugFlag==DEBUG_FLAG_HEXD) {
784 zmatsuo 10771 char buff[3];
785     _snprintf(buff, 3, "%02X", (unsigned int) b);
786    
787     for (i=0; i<2; i++)
788 zmatsuo 10782 w->Op.PutU32(buff[i], w->ClientData);
789     w->Op.PutU32(' ', w->ClientData);
790 zmatsuo 10771 }
791 zmatsuo 10782 else if (w->DebugFlag==DEBUG_FLAG_NORM) {
792 zmatsuo 10771
793     if ((b & 0x80) == 0x80) {
794     //UpdateStr();
795     char_attr.Attr = AttrReverse;
796     TermSetAttr(&char_attr);
797     b = b & 0x7f;
798     }
799    
800     if (b<=US) {
801 zmatsuo 10782 w->Op.PutU32('^', w->ClientData);
802     w->Op.PutU32((char)(b + 0x40), w->ClientData);
803 zmatsuo 10771 }
804     else if (b==DEL) {
805 zmatsuo 10782 w->Op.PutU32('<', w->ClientData);
806     w->Op.PutU32('D', w->ClientData);
807     w->Op.PutU32('E', w->ClientData);
808     w->Op.PutU32('L', w->ClientData);
809     w->Op.PutU32('>', w->ClientData);
810 zmatsuo 10771 }
811     else
812 zmatsuo 10782 w->Op.PutU32(b, w->ClientData);
813 zmatsuo 10771 }
814    
815     TermSetAttr(&char_attr);
816     TermSetInsertMode(svInsertMode);
817     TermSetAutoWrapMode(svAutoWrapMode);
818     }
819    
820 zmatsuo 10782 void ParseFirst(CharSetData *w, BYTE b)
821 zmatsuo 10771 {
822     WORD language = ts.Language;
823 zmatsuo 10782 if (w->DebugFlag != DEBUG_FLAG_NONE) {
824 zmatsuo 10771 language = IdDebug;
825     }
826    
827     switch (language) {
828 zmatsuo 10782 default:
829     assert(FALSE);
830     language = IdUtf8;
831     // FALLTHROUGH
832 zmatsuo 10771 case IdUtf8:
833 zmatsuo 10782 ParseFirstUTF8(w, b);
834 zmatsuo 10755 return;
835    
836 zmatsuo 10771 case IdJapanese:
837 zmatsuo 10755 switch (ts.KanjiCode) {
838 zmatsuo 10771 case IdUTF8:
839 zmatsuo 10782 if (ParseFirstUTF8(w, b)) {
840 zmatsuo 10755 return;
841     }
842     break;
843 zmatsuo 10771 default:
844 zmatsuo 10782 if (ParseFirstJP(w, b)) {
845 zmatsuo 10755 return;
846     }
847     }
848     break;
849    
850 zmatsuo 10771 case IdKorean:
851 zmatsuo 10755 switch (ts.KanjiCode) {
852 zmatsuo 10771 case IdUTF8:
853 zmatsuo 10782 if (ParseFirstUTF8(w, b)) {
854 zmatsuo 10755 return;
855     }
856     break;
857 zmatsuo 10771 default:
858 zmatsuo 10782 if (ParseFirstKR(w, b)) {
859 zmatsuo 10755 return;
860     }
861     }
862     break;
863    
864 zmatsuo 10771 case IdRussian:
865 zmatsuo 10782 if (ParseFirstRus(w, b)) {
866 zmatsuo 10755 return;
867     }
868     break;
869    
870     case IdChinese:
871     switch (ts.KanjiCode) {
872     case IdUTF8:
873 zmatsuo 10782 if (ParseFirstUTF8(w, b)) {
874 zmatsuo 10755 return;
875     }
876     break;
877     default:
878 zmatsuo 10782 if (ParseFirstCn(w, b)) {
879 zmatsuo 10755 return;
880     }
881     }
882     break;
883     case IdEnglish: {
884 zmatsuo 10782 if (ParseEnglish(w, b)) {
885 zmatsuo 10755 return;
886     }
887     break;
888     }
889 zmatsuo 10771 case IdDebug: {
890 zmatsuo 10782 PutDebugChar(w, b);
891 zmatsuo 10771 return;
892 zmatsuo 10755 }
893 zmatsuo 10771 }
894 zmatsuo 10755
895 zmatsuo 10782 if (w->SSflag) {
896     w->Op.PutU32(b, w->ClientData);
897     w->SSflag = FALSE;
898 zmatsuo 10755 return;
899     }
900    
901     if (b<=US)
902 zmatsuo 10782 w->Op.ParseControl(b, w->ClientData);
903 zmatsuo 10755 else if ((b>=0x20) && (b<=0x7E))
904 zmatsuo 10782 w->Op.PutU32(b, w->ClientData);
905 zmatsuo 10755 else if ((b>=0x80) && (b<=0x9F))
906 zmatsuo 10782 w->Op.ParseControl(b, w->ClientData);
907 zmatsuo 10755 else if (b>=0xA0)
908 zmatsuo 10782 w->Op.PutU32(b, w->ClientData);
909 zmatsuo 10755 }
910    
911     /**
912     * �w��(Designate)
913     *
914     * @param Gn 0/1/2/3 = G0/G1/G2/G3
915     * @param codeset IdASCII 0
916     * IdKatakana 1
917     * IdKanji 2
918     * IdSpecial 3
919     */
920 zmatsuo 10782 void CharSet2022Designate(CharSetData *w, int gn, int cs)
921 zmatsuo 10755 {
922     w->Gn[gn] = cs;
923     }
924    
925     /**
926     * �����o��(Invoke)
927 zmatsuo 10776 * @param shift
928 zmatsuo 10755 */
929 zmatsuo 10782 void CharSet2022Invoke(CharSetData *w, CharSet2022Shift shift)
930 zmatsuo 10755 {
931 zmatsuo 10776 switch (shift) {
932     case CHARSET_LS0:
933     // Locking Shift 0 (G0->GL)
934     w->Glr[0] = 0;
935     break;
936     case CHARSET_LS1:
937     // Locking Shift 1 (G1->GL)
938     w->Glr[0] = 1;
939     break;
940     case CHARSET_LS2:
941     // Locking Shift 2 (G2->GL)
942     w->Glr[0] = 2;
943     break;
944     case CHARSET_LS3:
945     // Locking Shift 3 (G3->GL)
946     w->Glr[0] = 3;
947     break;
948     case CHARSET_LS1R:
949     // Locking Shift 1 (G1->GR)
950     w->Glr[1] = 1;
951     break;
952     case CHARSET_LS2R:
953     // Locking Shift 2 (G2->GR)
954     w->Glr[1] = 2;
955     break;
956     case CHARSET_LS3R:
957     // Locking Shift 3 (G3->GR)
958     w->Glr[1] = 3;
959     break;
960     case CHARSET_SS2:
961     // Single Shift 2
962 zmatsuo 10782 w->GLtmp = 2;
963     w->SSflag = TRUE;
964 zmatsuo 10776 break;
965     case CHARSET_SS3:
966     // Single Shift 3
967 zmatsuo 10782 w->GLtmp = 3;
968     w->SSflag = TRUE;
969 zmatsuo 10776 break;
970     default:
971     assert(FALSE);
972     break;
973 zmatsuo 10755 }
974     }
975    
976     /**
977     * DEC�����t�H���g(Tera Special font)
978     * 0140(0x60) ... 0176(0x7f) ���r�����A�T�C������������
979 zmatsuo 10760 * (0xe0) ... (0xff) ��?
980 zmatsuo 10755 * <ESC>(0 �������������G�X�P�[�v�V�[�P���X�����`
981     * about/emulations.html
982     *
983     * @param b �R�[�h
984 zmatsuo 10760 * @retval TRUE IdSpecial
985     * @retval FALSE IdSpecial��������
986 zmatsuo 10755 */
987 zmatsuo 10782 BOOL CharSetIsSpecial(CharSetData *w, BYTE b)
988 zmatsuo 10755 {
989     BOOL SpecialNew = FALSE;
990    
991     if ((b>0x5F) && (b<0x80)) {
992 zmatsuo 10782 if (w->SSflag)
993     SpecialNew = (w->Gn[w->GLtmp]==IdSpecial);
994 zmatsuo 10755 else
995     SpecialNew = (w->Gn[w->Glr[0]]==IdSpecial);
996     }
997     else if (b>0xDF) {
998 zmatsuo 10782 if (w->SSflag)
999     SpecialNew = (w->Gn[w->GLtmp]==IdSpecial);
1000 zmatsuo 10755 else
1001     SpecialNew = (w->Gn[w->Glr[1]]==IdSpecial);
1002     }
1003    
1004     return SpecialNew;
1005     }
1006    
1007 zmatsuo 10782 static void CharSetSaveStateLow(CharSetState *state, const CharSetData *w)
1008 zmatsuo 10755 {
1009     int i;
1010     state->infos[0] = w->Glr[0];
1011     state->infos[1] = w->Glr[1];
1012     for (i=0 ; i<=3; i++) {
1013     state->infos[2 + i] = w->Gn[i];
1014     }
1015     }
1016    
1017     /**
1018     * ��������������
1019     */
1020 zmatsuo 10782 void CharSetSaveState(CharSetData *w, CharSetState *state)
1021 zmatsuo 10755 {
1022     CharSetSaveStateLow(state, w);
1023     }
1024    
1025     /**
1026     * ���������A����
1027     */
1028 zmatsuo 10782 void CharSetLoadState(CharSetData *w, const CharSetState *state)
1029 zmatsuo 10755 {
1030     int i;
1031     w->Glr[0] = state->infos[0];
1032     w->Glr[1] = state->infos[1];
1033     for (i=0 ; i<=3; i++) {
1034     w->Gn[i] = state->infos[2 + i];
1035     }
1036     }
1037 zmatsuo 10763
1038     /**
1039     * �t�H�[���o�b�N���I��
1040     * ���M�f�[�^UTF-8�����AShift_JIS�o����(fallback����)�����f����
1041     *
1042     */
1043 zmatsuo 10782 void CharSetFallbackFinish(CharSetData *w)
1044 zmatsuo 10763 {
1045 zmatsuo 10782 w->Fallbacked = FALSE;
1046 zmatsuo 10763 }
1047 zmatsuo 10773
1048     /**
1049     * �f�o�O�o�����������[�h�����X����
1050     */
1051 zmatsuo 10782 void CharSetSetNextDebugMode(CharSetData *w)
1052 zmatsuo 10773 {
1053     // ts.DebugModes ���� tttypes.h �� DBGF_* �� OR ����������
1054     do {
1055 zmatsuo 10782 w->DebugFlag = (w->DebugFlag + 1) % DEBUG_FLAG_MAXD;
1056     } while (w->DebugFlag != DEBUG_FLAG_NONE && !((ts.DebugModes >> (w->DebugFlag - 1)) & 1));
1057 zmatsuo 10773 }
1058    
1059 zmatsuo 10782 BYTE CharSetGetDebugMode(CharSetData *w)
1060 zmatsuo 10773 {
1061 zmatsuo 10782 return w->DebugFlag;
1062 zmatsuo 10773 }
1063    
1064 zmatsuo 10782 void CharSetSetDebugMode(CharSetData *w, BYTE mode)
1065 zmatsuo 10773 {
1066 zmatsuo 10782 w->DebugFlag = mode % DEBUG_FLAG_MAXD;
1067 zmatsuo 10773 }

Back to OSDN
ViewVC Help
Powered by ViewVC 1.1.26