| 1 |
/* |
| 2 |
* Copyright (C) 2019- TeraTerm Project |
| 3 |
* All rights reserved. |
| 4 |
* |
| 5 |
* Redistribution and use in source and binary forms, with or without |
| 6 |
* modification, are permitted provided that the following conditions |
| 7 |
* are met: |
| 8 |
* |
| 9 |
* 1. Redistributions of source code must retain the above copyright |
| 10 |
* notice, this list of conditions and the following disclaimer. |
| 11 |
* 2. Redistributions in binary form must reproduce the above copyright |
| 12 |
* notice, this list of conditions and the following disclaimer in the |
| 13 |
* documentation and/or other materials provided with the distribution. |
| 14 |
* 3. The name of the author may not be used to endorse or promote products |
| 15 |
* derived from this software without specific prior written permission. |
| 16 |
* |
| 17 |
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR |
| 18 |
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
| 19 |
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
| 20 |
* IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
| 21 |
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
| 22 |
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 23 |
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 24 |
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 25 |
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
| 26 |
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 27 |
*/ |
| 28 |
|
| 29 |
#include <stdlib.h> |
| 30 |
#include <stdio.h> |
| 31 |
#include <assert.h> |
| 32 |
|
| 33 |
#include "unicode.h" |
| 34 |
|
| 35 |
/**
 *	Look up the East_Asian_Width property of a code point.
 *
 *	The ranges and their property letters come from
 *	"unicode_asian_width.tbl". The lookup is a binary search, so the table
 *	is assumed to be sorted in ascending order with non-overlapping ranges.
 *
 *	@param	u32		code point to classify
 *	@retval	'F'		Fullwidth
 *	@retval	'H'		Halfwidth; also returned when u32 is not covered
 *					by any table entry
 *	@retval	'W'		Wide
 *	@retval	'n'		Na, Narrow
 *	@retval	'A'		Ambiguous
 *					The width depends on context: such characters occur
 *					in both East Asian and other typography and may be
 *					treated as full width by legacy East Asian
 *					encodings (e.g. Greek and Cyrillic letters).
 *	@retval	'N'		Neutral
 *					Does not normally occur in East Asian typography;
 *					neither full width nor half width (e.g. Arabic).
 */
char UnicodeGetWidthProperty(unsigned long u32)
{
	typedef struct {
		unsigned long code_from;
		unsigned long code_to;
		char property;
	} east_asian_width_map_t;
	// Code points that are not in the table are reported as 'H'.
	static const east_asian_width_map_t east_asian_width_map[] = {
#include "unicode_asian_width.tbl"
	};
	const east_asian_width_map_t *table = east_asian_width_map;
	const size_t table_size = _countof(east_asian_width_map);

	// Reject values outside the span covered by the whole table.
	if (u32 < table[0].code_from || table[table_size - 1].code_to < u32) {
		return 'H';
	}

	// Binary search over the half-open index range [low, high).
	// Starting with high == table_size makes every entry, including the
	// last one, a candidate probe; the previous inclusive-bound loop
	// stopped at low == high without ever testing the final entry.
	size_t low = 0;
	size_t high = table_size;
	while (low < high) {
		const size_t mid = low + (high - low) / 2;
		if (table[mid].code_to < u32) {
			low = mid + 1;
		} else if (u32 < table[mid].code_from) {
			high = mid;
		} else {
			return table[mid].property;
		}
	}

	// u32 falls into a gap between two table entries.
	return 'H';
}
| 91 |
|
| 92 |
/* Table entry describing one inclusive code point range. */
typedef struct {
	unsigned long code_from;	// first code point of the range
	unsigned long code_to;		// last code point of the range (inclusive)
} UnicodeTable_t;
| 96 |
|
| 97 |
/* Table entry: an inclusive code point range plus a category value. */
typedef struct {
	unsigned long code_from;	// first code point of the range
	unsigned long code_to;		// last code point of the range (inclusive)
	unsigned char category;		// value handed back by UnicodeIsCombiningCharacter()
} UnicodeTableCombine_t;
| 102 |
|
| 103 |
/* Table entry: an inclusive code point range plus the name of its block. */
typedef struct {
	unsigned long code_from;	// first code point of the block
	unsigned long code_to;		// last code point of the block (inclusive)
	const char *block_name;		// name returned by UnicodeBlockName()
} UnicodeTableBlock_t;
| 108 |
|
| 109 |
/*
 * Block table; its entries are supplied by "unicode_block.tbl".
 * It is searched by UnicodeBlockIndex() and indexed by UnicodeBlockName().
 */
static const UnicodeTableBlock_t UnicodeBlockList[] = {
#include "unicode_block.tbl"
};
| 112 |
|
| 113 |
/** |
| 114 |
* u32���e�[�u�����f�[�^���������������������� |
| 115 |
* |
| 116 |
* @retval �e�[�u����index |
| 117 |
* @retval -1 �e�[�u�������������� |
| 118 |
*/ |
| 119 |
static int SearchTableSimple( |
| 120 |
const UnicodeTable_t *table, size_t table_size, |
| 121 |
unsigned long u32) |
| 122 |
{ |
| 123 |
if (u32 < table[0].code_from) { |
| 124 |
return -1; |
| 125 |
} |
| 126 |
if (u32 > table[table_size-1].code_to) { |
| 127 |
return -1; |
| 128 |
} |
| 129 |
size_t low = 0; |
| 130 |
size_t high = table_size - 1; |
| 131 |
while (low <= high) { |
| 132 |
size_t mid = (low + high) / 2; |
| 133 |
if (table[mid].code_from <= u32 && u32 <= table[mid].code_to) { |
| 134 |
return (int)mid; |
| 135 |
} else if (table[mid].code_to < u32) { |
| 136 |
low = mid + 1; |
| 137 |
} else { |
| 138 |
high = mid - 1; |
| 139 |
} |
| 140 |
} |
| 141 |
// �e�[�u���������O |
| 142 |
return -1; |
| 143 |
} |
| 144 |
|
| 145 |
/** |
| 146 |
* SearchTableSimple() ������ |
| 147 |
* �e�[�u�����^�������� |
| 148 |
* |
| 149 |
* @retval �e�[�u����index |
| 150 |
* @retval -1 �e�[�u�������������� |
| 151 |
*/ |
| 152 |
static int SearchTableCombine( |
| 153 |
const UnicodeTableCombine_t *table, size_t table_size, |
| 154 |
unsigned long u32) |
| 155 |
{ |
| 156 |
if (u32 < table[0].code_from) { |
| 157 |
return -1; |
| 158 |
} |
| 159 |
if (u32 > table[table_size-1].code_to) { |
| 160 |
return -1; |
| 161 |
} |
| 162 |
size_t low = 0; |
| 163 |
size_t high = table_size - 1; |
| 164 |
while (low <= high) { |
| 165 |
size_t mid = (low + high) / 2; |
| 166 |
if (table[mid].code_from <= u32 && u32 <= table[mid].code_to) { |
| 167 |
return (int)mid; |
| 168 |
} else if (table[mid].code_to < u32) { |
| 169 |
low = mid + 1; |
| 170 |
} else { |
| 171 |
high = mid - 1; |
| 172 |
} |
| 173 |
} |
| 174 |
// �e�[�u���������O |
| 175 |
return -1; |
| 176 |
} |
| 177 |
|
| 178 |
/** |
| 179 |
* SearchTableSimple() ������ |
| 180 |
* �e�[�u�����^�������� |
| 181 |
* |
| 182 |
* @retval �e�[�u����index |
| 183 |
* @retval -1 �e�[�u�������������� |
| 184 |
*/ |
| 185 |
static int SearchTableBlock( |
| 186 |
const UnicodeTableBlock_t *table, size_t table_size, |
| 187 |
unsigned long u32) |
| 188 |
{ |
| 189 |
if (u32 < table[0].code_from) { |
| 190 |
return -1; |
| 191 |
} |
| 192 |
if (u32 > table[table_size-1].code_to) { |
| 193 |
return -1; |
| 194 |
} |
| 195 |
size_t low = 0; |
| 196 |
size_t high = table_size - 1; |
| 197 |
while (low <= high) { |
| 198 |
size_t mid = (low + high) / 2; |
| 199 |
if (table[mid].code_from <= u32 && u32 <= table[mid].code_to) { |
| 200 |
return (int)mid; |
| 201 |
} else if (table[mid].code_to < u32) { |
| 202 |
low = mid + 1; |
| 203 |
} else { |
| 204 |
high = mid - 1; |
| 205 |
} |
| 206 |
} |
| 207 |
// �e�[�u���������O |
| 208 |
return -1; |
| 209 |
} |
| 210 |
|
| 211 |
/* |
| 212 |
* ������������������ |
| 213 |
* ���������������������������� |
| 214 |
* EMOJI MODIFIER |
| 215 |
* = Nonspacing Mark |
| 216 |
* VARIATION SELECTOR (�������Z���N�^) |
| 217 |
* = Nonspacing Mark |
| 218 |
* |
| 219 |
* @retval 0 ���������������� |
| 220 |
* @retval 1 ��������,Nonspacing Mark, �J�[�\�������������� |
| 221 |
* @retval 2 ��������,Spacing Mark, �J�[�\���� +1 �������� |
| 222 |
*/ |
| 223 |
int UnicodeIsCombiningCharacter(unsigned long u32) |
| 224 |
{ |
| 225 |
#define Mn 1 // Nonspacing_Mark a nonspacing combining mark (zero advance width) |
| 226 |
#define Mc 2 // Spacing_Mark a spacing combining mark (positive advance width) |
| 227 |
#define Me 1 // Enclosing_Mark an enclosing combining mark |
| 228 |
#define Sk 1 // Modifier_Symbol a non-letterlike modifier symbol |
| 229 |
const static UnicodeTableCombine_t CombiningCharacterList[] = { |
| 230 |
#include "unicode_combine.tbl" |
| 231 |
}; |
| 232 |
const int index = SearchTableCombine(CombiningCharacterList, _countof(CombiningCharacterList), u32); |
| 233 |
if (index == -1) { |
| 234 |
return 0; |
| 235 |
} |
| 236 |
return (int)CombiningCharacterList[index].category; |
| 237 |
} |
| 238 |
|
| 239 |
/** |
| 240 |
* �G����? |
| 241 |
* |
| 242 |
* @retval 0 �G������������ |
| 243 |
* @retval 1 �G���������� |
| 244 |
*/ |
| 245 |
int UnicodeIsEmoji(unsigned long u32) |
| 246 |
{ |
| 247 |
const static UnicodeTable_t EmojiList[] = { |
| 248 |
#include "unicode_emoji.tbl" |
| 249 |
}; |
| 250 |
const int index = SearchTableSimple(EmojiList, _countof(EmojiList), u32); |
| 251 |
return index != -1 ? 1 : 0; |
| 252 |
} |
| 253 |
|
| 254 |
/**
 *	Check whether a code point is a variation selector.
 *
 *	No longer used (compiled out): UnicodeIsCombiningCharacter() performs
 *	this check at the same time.
 *
 *	@retval	0	not a variation selector
 *	@retval	1	variation selector
 */
#if 0
int UnicodeIsVariationSelector(unsigned long u32)
{
	// FVS (Mongolian Free Variation Selector)
	const int is_fvs = (0x00180b <= u32 && u32 <= 0x00180d);
	// SVS VS1..VS16
	const int is_svs = (0x00fe00 <= u32 && u32 <= 0x00fe0f);
	// IVS VS17..VS256
	const int is_ivs = (0x0e0100 <= u32 && u32 <= 0x0e01ef);

	return (is_fvs || is_svs || is_ivs) ? 1 : 0;
}
#endif
| 274 |
|
| 275 |
/** |
| 276 |
* ���B���[�}? |
| 277 |
* |
| 278 |
* @retval 0 ���B���[�}�������� |
| 279 |
* @retval 1 ���B���[�}������ |
| 280 |
*/ |
| 281 |
int UnicodeIsVirama(unsigned long u32) |
| 282 |
{ |
| 283 |
const static UnicodeTable_t ViramaList[] = { |
| 284 |
#include "unicode_virama.tbl" |
| 285 |
}; |
| 286 |
const int index = SearchTableSimple(ViramaList, _countof(ViramaList), u32); |
| 287 |
return index != -1 ? 1 : 0; |
| 288 |
} |
| 289 |
|
| 290 |
/** |
| 291 |
* Unicode block �� index ������ |
| 292 |
* |
| 293 |
* @retval -1 block �������������� |
| 294 |
* @retval block �� index |
| 295 |
*/ |
| 296 |
int UnicodeBlockIndex(unsigned long u32) |
| 297 |
{ |
| 298 |
return SearchTableBlock(UnicodeBlockList, _countof(UnicodeBlockList), u32); |
| 299 |
} |
| 300 |
|
| 301 |
const char *UnicodeBlockName(int index) |
| 302 |
{ |
| 303 |
if (index == -1) { |
| 304 |
return ""; |
| 305 |
} |
| 306 |
return UnicodeBlockList[index].block_name; |
| 307 |
} |
| 308 |
|
| 309 |
#if 0
/* Disabled manual check: prints the width property of a few code points. */
int main(int, char *[])
{
	static const unsigned long codes[] = {
#if 0
		0, 1, 0x7f,
		0x80,
		0x0e00ff,
		0x0e0100,
		0x10fffd,
#endif
		0x10fffe,
	};
	const size_t code_count = _countof(codes);
	size_t pos = 0;

	while (pos < code_count) {
		const unsigned long code = codes[pos];
		printf("U+%06lx %c\n", code, UnicodeGetWidthProperty(code));
		pos++;
	}
	return 0;
}
#endif
| 330 |
|
| 331 |
// |
| 332 |
// Unicode Combining Character Support |
| 333 |
// |
| 334 |
#include "uni_combining.map" |
| 335 |
|
| 336 |
static unsigned short UnicodeGetPrecomposedChar(int start_index, unsigned short first_code, unsigned short code) |
| 337 |
{ |
| 338 |
const combining_map_t *table = mapCombiningToPrecomposed; |
| 339 |
int tmax = _countof(mapCombiningToPrecomposed); |
| 340 |
unsigned short result = 0; |
| 341 |
int i; |
| 342 |
|
| 343 |
for (i = start_index ; i < tmax ; i++) { |
| 344 |
if (table[i].first_code != first_code) { // 1�������������������A���~���������������������B |
| 345 |
break; |
| 346 |
} |
| 347 |
|
| 348 |
if (table[i].second_code == code) { |
| 349 |
result = table[i].precomposed; |
| 350 |
break; |
| 351 |
} |
| 352 |
} |
| 353 |
|
| 354 |
return (result); |
| 355 |
} |
| 356 |
|
| 357 |
static int UnicodeGetIndexOfCombiningFirstCode(unsigned short code) |
| 358 |
{ |
| 359 |
const combining_map_t *table = mapCombiningToPrecomposed; |
| 360 |
int tmax = _countof(mapCombiningToPrecomposed); |
| 361 |
int low, mid, high; |
| 362 |
int index = -1; |
| 363 |
|
| 364 |
low = 0; |
| 365 |
high = tmax - 1; |
| 366 |
|
| 367 |
// binary search |
| 368 |
while (low < high) { |
| 369 |
mid = (low + high) / 2; |
| 370 |
if (table[mid].first_code < code) { |
| 371 |
low = mid + 1; |
| 372 |
} else { |
| 373 |
high = mid; |
| 374 |
} |
| 375 |
} |
| 376 |
|
| 377 |
if (table[low].first_code == code) { |
| 378 |
while (low >= 0 && table[low].first_code == code) { |
| 379 |
index = low; |
| 380 |
low--; |
| 381 |
} |
| 382 |
} |
| 383 |
|
| 384 |
return (index); |
| 385 |
} |
| 386 |
|
| 387 |
/**
 *	Combine a base character and a following character into a single
 *	precomposed character.
 *
 *	@param[in]	first_code	first (base) character
 *	@param[in]	code		character that follows it
 *	@retval		0			the pair cannot be combined
 *	@retval		non-zero	the combined Unicode character
 *
 *	Example
 *		first_code
 *			U+307B
 *		code
 *			U+309A
 *		retval
 *			U+307D
 */
unsigned short UnicodeCombining(unsigned short first_code, unsigned short code)
{
	const int run_start = UnicodeGetIndexOfCombiningFirstCode(first_code);

	// No table entry starts with first_code.
	return (run_start == -1)
		? 0
		: UnicodeGetPrecomposedChar(run_start, first_code, code);
}
| 411 |
|
| 412 |
/* One mapping between an ISO8859 byte and a 16-bit Unicode value. */
typedef struct {
	unsigned char code;			// byte value in the ISO8859 part
	unsigned short unicode;		// corresponding Unicode value
} ISO8859Table_t;
| 416 |
|
| 417 |
/** |
| 418 |
* ISO8859�e�[�u�� |
| 419 |
*/ |
| 420 |
const ISO8859Table_t *GetISO8859Table(int iso8859_part) |
| 421 |
{ |
| 422 |
static const ISO8859Table_t iso8859_2[] = { |
| 423 |
#include "iso8859-2.tbl" |
| 424 |
}; |
| 425 |
static const ISO8859Table_t iso8859_3[] = { |
| 426 |
#include "iso8859-3.tbl" |
| 427 |
}; |
| 428 |
static const ISO8859Table_t iso8859_4[] = { |
| 429 |
#include "iso8859-4.tbl" |
| 430 |
}; |
| 431 |
static const ISO8859Table_t iso8859_5[] = { |
| 432 |
#include "iso8859-5.tbl" |
| 433 |
}; |
| 434 |
static const ISO8859Table_t iso8859_6[] = { |
| 435 |
#include "iso8859-6.tbl" |
| 436 |
}; |
| 437 |
static const ISO8859Table_t iso8859_7[] = { |
| 438 |
#include "iso8859-7.tbl" |
| 439 |
}; |
| 440 |
static const ISO8859Table_t iso8859_8[] = { |
| 441 |
#include "iso8859-8.tbl" |
| 442 |
}; |
| 443 |
static const ISO8859Table_t iso8859_9[] = { |
| 444 |
#include "iso8859-9.tbl" |
| 445 |
}; |
| 446 |
static const ISO8859Table_t iso8859_10[] = { |
| 447 |
#include "iso8859-10.tbl" |
| 448 |
}; |
| 449 |
static const ISO8859Table_t iso8859_11[] = { |
| 450 |
#include "iso8859-11.tbl" |
| 451 |
}; |
| 452 |
static const ISO8859Table_t iso8859_13[] = { |
| 453 |
#include "iso8859-13.tbl" |
| 454 |
}; |
| 455 |
static const ISO8859Table_t iso8859_14[] = { |
| 456 |
#include "iso8859-14.tbl" |
| 457 |
}; |
| 458 |
static const ISO8859Table_t iso8859_15[] = { |
| 459 |
#include "iso8859-15.tbl" |
| 460 |
}; |
| 461 |
static const ISO8859Table_t iso8859_16[] = { |
| 462 |
#include "iso8859-16.tbl" |
| 463 |
}; |
| 464 |
|
| 465 |
static const ISO8859Table_t *tables[] = { |
| 466 |
NULL, // 0 |
| 467 |
NULL, // ISO8859-1 |
| 468 |
iso8859_2, |
| 469 |
iso8859_3, |
| 470 |
iso8859_4, |
| 471 |
iso8859_5, |
| 472 |
iso8859_6, |
| 473 |
iso8859_7, |
| 474 |
iso8859_8, |
| 475 |
iso8859_9, |
| 476 |
iso8859_10, |
| 477 |
iso8859_11, |
| 478 |
NULL, |
| 479 |
iso8859_13, |
| 480 |
iso8859_14, |
| 481 |
iso8859_15, |
| 482 |
iso8859_16, |
| 483 |
}; |
| 484 |
if (iso8859_part >= _countof(tables)) { |
| 485 |
assert(0); |
| 486 |
return NULL; |
| 487 |
} |
| 488 |
assert(tables[iso8859_part] != NULL); |
| 489 |
return tables[iso8859_part]; |
| 490 |
} |
| 491 |
|
| 492 |
/** |
| 493 |
* ISO8859����Unicode������ |
| 494 |
*/ |
| 495 |
int UnicodeFromISO8859(int part, unsigned char b, unsigned short *u16) |
| 496 |
{ |
| 497 |
if (part == 1) { |
| 498 |
// ISO8859-1 �� unicode ������ |
| 499 |
*u16 = b; |
| 500 |
return 1; |
| 501 |
} |
| 502 |
const ISO8859Table_t *table = GetISO8859Table(part); |
| 503 |
if (table == NULL) { |
| 504 |
// ���������������� |
| 505 |
*u16 = 0; |
| 506 |
return 0; |
| 507 |
} |
| 508 |
for (int i = 0; i < 0xff; i++ ){ |
| 509 |
if (table[i].code == b) { |
| 510 |
*u16 = table[i].unicode; |
| 511 |
return 1; |
| 512 |
} |
| 513 |
} |
| 514 |
*u16 = 0; |
| 515 |
return 0; |
| 516 |
} |
| 517 |
|
| 518 |
/** |
| 519 |
* Unicode����ISO8859������ |
| 520 |
* |
| 521 |
* @param[in] part IS8859���� 1...11,13...16 |
| 522 |
* @param[in] u32 Unicode |
| 523 |
* @param[out] *b ISO8859 char |
| 524 |
* @retval 0 ������������ |
| 525 |
* @retval 1 ���������� |
| 526 |
*/ |
| 527 |
int UnicodeToISO8859(int part, unsigned long u32, unsigned char *b) |
| 528 |
{ |
| 529 |
if (part == 1) { |
| 530 |
// ISO8859-1 �� unicode ������ |
| 531 |
*b = (unsigned char)u32; |
| 532 |
return 1; |
| 533 |
} |
| 534 |
if (u32 >= 0x10000) { |
| 535 |
// �������������������R�[�h |
| 536 |
*b = 0; |
| 537 |
return 0; |
| 538 |
} |
| 539 |
const unsigned short u16 = (unsigned short)u32; |
| 540 |
const ISO8859Table_t *table = GetISO8859Table(part); |
| 541 |
if (table == NULL) { |
| 542 |
// ���������������� |
| 543 |
*b = 0; |
| 544 |
return 0; |
| 545 |
} |
| 546 |
for (int i = 0; i < 0xff; i++ ){ |
| 547 |
if (table[i].unicode == u16) { |
| 548 |
*b = table[i].code; |
| 549 |
return 1; |
| 550 |
} |
| 551 |
} |
| 552 |
*b = 0; |
| 553 |
return 0; |
| 554 |
} |