| 1 |
/*********************************************************************** |
| 2 |
* |
| 3 |
* file: wdict.c |
| 4 |
* |
| 5 |
* 辞書登録・判定など |
| 6 |
* |
| 7 |
* $Id$ |
| 8 |
* |
| 9 |
*/ |
| 10 |
|
| 11 |
#include <assert.h> |
| 12 |
#include <stdio.h> |
| 13 |
#include <stdlib.h> |
| 14 |
#include <string.h> |
| 15 |
#include <sys/types.h> |
| 16 |
|
| 17 |
#include "debug.h" |
| 18 |
#include "synth.h" |
| 19 |
#include "util.h" |
| 20 |
#include "wdict.h" |
| 21 |
|
| 22 |
#define HASH_NCHARS 4 |
| 23 |
#define HASH_SEED 16381 |
| 24 |
#define HASH_LEAP 37 |
| 25 |
#define BMHASH_SIZ ((HASH_SEED + ALU_BITS - 1) / ALU_BITS) |
| 26 |
|
| 27 |
/*************************************************************** |
| 28 |
* |
| 29 |
* 辞書構造 |
| 30 |
* |
| 31 |
*/ |
| 32 |
|
| 33 |
struct WDK |
| 34 |
{ |
| 35 |
uint32_t k; /* キー */ |
| 36 |
uint32_t i; /* 範囲終点 */ |
| 37 |
}; |
| 38 |
|
| 39 |
struct WDW |
| 40 |
{ |
| 41 |
uint64_t w; /* 語 */ |
| 42 |
uint16_t xp; /* 区別ナシ */ |
| 43 |
}; |
| 44 |
|
| 45 |
struct WDICT |
| 46 |
{ |
| 47 |
/* ビットマップハッシュ(切り捨て用) |
| 48 |
経験的に、ケツで判定するのがよろし */ |
| 49 |
ALU_T bmhash[BMHASH_SIZ]; |
| 50 |
|
| 51 |
char pos; |
| 52 |
char len; |
| 53 |
|
| 54 |
/* 一次探索キー */ |
| 55 |
int nwdk; |
| 56 |
struct WDK *wdk; |
| 57 |
|
| 58 |
/* 単語 */ |
| 59 |
struct WDW *words; |
| 60 |
|
| 61 |
/* 以下のフィールドは、探索時に用いないハズ */ |
| 62 |
|
| 63 |
/* 対応ノードクロスリファレンス */ |
| 64 |
struct ITREE *node; |
| 65 |
|
| 66 |
/* 総単語数 */ |
| 67 |
int nwords; |
| 68 |
|
| 69 |
/* 被覆記録 */ |
| 70 |
uint64_t *cov; |
| 71 |
}; |
| 72 |
|
| 73 |
#define N_CLS 3 |
| 74 |
|
| 75 |
static |
| 76 |
struct WDICT *g_dict[N_CLS][11][11]; |
| 77 |
|
| 78 |
/* tn の分だけ並んでいる */ |
| 79 |
static int min_dictpool, n_dictpool; |
| 80 |
static |
| 81 |
struct WDICT g_dictpool[0x1000]; |
| 82 |
|
| 83 |
/*************************************************************** |
| 84 |
* |
| 85 |
* xpize() ... ビットマップ xp に基づいて正規化 |
| 86 |
* normalize() ... すべて正規化 |
| 87 |
* hash() ... 微妙なハッシュ関数(要正規化) |
| 88 |
* |
| 89 |
*/ |
| 90 |
|
| 91 |
static |
| 92 |
uint64_t |
| 93 |
xpize(uint64_t m, unsigned xp, unsigned len) |
| 94 |
{ |
| 95 |
int i; |
| 96 |
|
| 97 |
for (i = 0; i < len; i++) |
| 98 |
if (xp & (1 << i)) |
| 99 |
{ |
| 100 |
unsigned c = (m >> 6 * i) & 077; |
| 101 |
unsigned s; |
| 102 |
if (c >= 046) |
| 103 |
s = 26; |
| 104 |
else if (c == 001) |
| 105 |
s = 1; |
| 106 |
else |
| 107 |
continue; |
| 108 |
m -= (uint64_t)s << 6 * i; |
| 109 |
} |
| 110 |
|
| 111 |
return m; |
| 112 |
} |
| 113 |
|
| 114 |
static |
| 115 |
uint64_t |
| 116 |
normalize(uint64_t m) |
| 117 |
{ |
| 118 |
return xpize(m, -1, 10); |
| 119 |
} |
| 120 |
|
| 121 |
static |
| 122 |
unsigned |
| 123 |
hash(unsigned m) |
| 124 |
{ |
| 125 |
int i; |
| 126 |
unsigned a = 0; |
| 127 |
|
| 128 |
for (i = 0; i < HASH_NCHARS; i++, m >>= 6) |
| 129 |
{ |
| 130 |
unsigned c = m & 077; |
| 131 |
#if DEBUG>=1 |
| 132 |
assert(c != 001); |
| 133 |
assert(c < 046); |
| 134 |
#endif |
| 135 |
if (c >= 002) |
| 136 |
c--; |
| 137 |
a = 37 * a + c; |
| 138 |
} |
| 139 |
|
| 140 |
return a % HASH_SEED; |
| 141 |
} |
| 142 |
|
| 143 |
/*************************************************************** |
| 144 |
* |
| 145 |
* 被覆をつくりだす |
| 146 |
* |
| 147 |
*/ |
| 148 |
|
| 149 |
static |
| 150 |
void |
| 151 |
wdict_mkcov(uint64_t *cov, int len, uint64_t w, unsigned xp) |
| 152 |
{ |
| 153 |
int i; |
| 154 |
for (i = 0; i < len; i++, w >>= 6, xp >>= 1) |
| 155 |
cov[i] |= (((xp & 1) /* 区別ナシ */ |
| 156 |
? ((w & 077) < 2 |
| 157 |
? 3ULL /* [./] */ |
| 158 |
: (1ULL | (1ULL << 26))) |
| 159 |
: 1ULL) |
| 160 |
<< (w & 077)); |
| 161 |
} |
| 162 |
|
| 163 |
/*************************************************************** |
| 164 |
* |
| 165 |
* 辞書に単語を突っ込む |
| 166 |
* |
| 167 |
*/ |
| 168 |
|
| 169 |
struct WDICT * |
| 170 |
wdict_add(int tn, |
| 171 |
int pos, |
| 172 |
struct DICT_CHAR const *word) |
| 173 |
{ |
| 174 |
int i; |
| 175 |
struct WDICT *pd; |
| 176 |
struct WDICT *rpd = NULL; |
| 177 |
int len = word->len; |
| 178 |
unsigned attr; |
| 179 |
unsigned xp = word->xp; |
| 180 |
uint64_t text = word->bmp; |
| 181 |
|
| 182 |
/* クラスを決める */ |
| 183 |
switch (word->attr) |
| 184 |
{ |
| 185 |
case CA_U: attr = 1; break; |
| 186 |
case CA_U | CA_C: |
| 187 |
case CA_C: attr = 2; break; |
| 188 |
default: attr = 0; break; |
| 189 |
} |
| 190 |
|
| 191 |
assert(attr < N_CLS); |
| 192 |
pd = g_dict[attr][pos][len]; |
| 193 |
if (pd == NULL) |
| 194 |
{ |
| 195 |
assert(tn >= 0); |
| 196 |
assert(tn < sizeof(g_dictpool) / sizeof(*g_dictpool)); |
| 197 |
g_dict[attr][pos][len] = pd = &g_dictpool[tn]; |
| 198 |
if (n_dictpool == 0) |
| 199 |
min_dictpool = tn; |
| 200 |
else if (min_dictpool >= tn) |
| 201 |
min_dictpool = tn; |
| 202 |
if (n_dictpool <= tn) |
| 203 |
n_dictpool = tn + 1; |
| 204 |
pd->pos = pos; |
| 205 |
pd->len = len; |
| 206 |
pd->words = malloc(1 * sizeof(*pd->words)); |
| 207 |
assert(pd->words != NULL); |
| 208 |
pd->cov = calloc(10, sizeof(*pd->cov)); |
| 209 |
rpd = pd; |
| 210 |
} |
| 211 |
|
| 212 |
/* 辞書バッファを 2**n の単位で拡張 */ |
| 213 |
if (pd->nwords >= 1 |
| 214 |
&& IS_POWER2(pd->nwords)) |
| 215 |
{ |
| 216 |
assert(pd->words != NULL); |
| 217 |
pd->words = realloc(pd->words, |
| 218 |
2 * pd->nwords * sizeof(*pd->words)); |
| 219 |
assert(pd->words != NULL); |
| 220 |
} |
| 221 |
|
| 222 |
/* 辞書に単語を登録 */ |
| 223 |
assert(text == xpize(text, xp, len)); |
| 224 |
pd->words[pd->nwords].xp = xp; |
| 225 |
pd->words[pd->nwords++].w = text; |
| 226 |
|
| 227 |
/* 被覆を記録 */ |
| 228 |
wdict_mkcov(pd->cov, len, text, xp); |
| 229 |
|
| 230 |
return rpd; /* 初回登録のみレコードを返す */ |
| 231 |
} |
| 232 |
|
| 233 |
/*************************************************************** |
| 234 |
* |
| 235 |
* 辞書ノードに木を関連づける |
| 236 |
* |
| 237 |
*/ |
| 238 |
|
| 239 |
void |
| 240 |
wdict_setnode(struct WDICT *pd, |
| 241 |
struct ITREE *node) |
| 242 |
{ |
| 243 |
assert(pd != NULL); |
| 244 |
assert(node != NULL); |
| 245 |
pd->node = node; |
| 246 |
} |
| 247 |
|
| 248 |
/*************************************************************** |
| 249 |
* |
| 250 |
* 辞書の被覆を固定する |
| 251 |
* ついでにソートもしておく(まだてぬき) |
| 252 |
* |
| 253 |
*/ |
| 254 |
|
| 255 |
static |
| 256 |
int |
| 257 |
cmp_dict(void const *pa, void const *pb) |
| 258 |
{ |
| 259 |
struct WDW const *a = pa; |
| 260 |
struct WDW const *b = pb; |
| 261 |
uint64_t i = normalize(a->w); |
| 262 |
uint64_t j = normalize(b->w); |
| 263 |
if (i < j) return -1; |
| 264 |
if (i > j) return 1; |
| 265 |
if (a->w < b->w) return -1; |
| 266 |
if (a->w > b->w) return 1; |
| 267 |
return (int)(a->xp - (int)b->xp); |
| 268 |
} |
| 269 |
|
| 270 |
static |
| 271 |
void |
| 272 |
wdict_setup_words(struct WDICT *pd) |
| 273 |
{ |
| 274 |
int i, j, k; |
| 275 |
|
| 276 |
/* まずはソート */ |
| 277 |
qsort(pd->words, |
| 278 |
pd->nwords, sizeof(*pd->words), |
| 279 |
cmp_dict); |
| 280 |
|
| 281 |
/* 一次検索を組み立てる */ |
| 282 |
k = 0; |
| 283 |
for (i = j = 0; i < pd->nwords; i++) |
| 284 |
{ |
| 285 |
uint32_t w = normalize(pd->words[i].w) >> 6 * (pd->len - HASH_NCHARS); |
| 286 |
if (i == 0) |
| 287 |
{ |
| 288 |
/* 1番目のエントリ */ |
| 289 |
pd->wdk = malloc(sizeof(*pd->wdk)); |
| 290 |
assert(pd->wdk != NULL); |
| 291 |
pd->wdk[k].k = w; |
| 292 |
} |
| 293 |
else if (pd->wdk[k].k != w) |
| 294 |
{ |
| 295 |
pd->wdk[k++].i = i; |
| 296 |
if (IS_POWER2(k)) |
| 297 |
pd->wdk = realloc(pd->wdk, |
| 298 |
2 * k * sizeof(*pd->wdk)); |
| 299 |
assert(pd->wdk != NULL); |
| 300 |
pd->wdk[k].k = w; |
| 301 |
} |
| 302 |
|
| 303 |
/* 零次ハッシュを記録 */ |
| 304 |
w = hash(w); |
| 305 |
pd->bmhash[w / ALU_BITS] |= (ALU_T)1 << (w & (ALU_BITS - 1)); |
| 306 |
} |
| 307 |
pd->wdk[k].i = i; |
| 308 |
pd->nwdk = k + 1; |
| 309 |
} |
| 310 |
|
| 311 |
void |
| 312 |
wdict_setup_lr(void) |
| 313 |
{ |
| 314 |
int i; |
| 315 |
int cls, pos, len; |
| 316 |
|
| 317 |
for (cls = 0; cls < N_CLS; cls++) |
| 318 |
for (pos = 1; pos < 11; pos++) |
| 319 |
for (len = 1; len <= 10; len++) |
| 320 |
{ |
| 321 |
struct WDICT *pd = g_dict[cls][pos][len]; |
| 322 |
|
| 323 |
if (pd == NULL) |
| 324 |
continue; |
| 325 |
|
| 326 |
assert(pd->nwords > 0); |
| 327 |
assert(pd->node != NULL); |
| 328 |
assert(pd->node->dict == pd); |
| 329 |
assert(pd->node->ref > 0); |
| 330 |
assert(pd->pos == pos); |
| 331 |
assert(pd->len == len); |
| 332 |
|
| 333 |
for (i = 0; i < len; i++) |
| 334 |
synth_add(pd->node, |
| 335 |
synth_make_lr(pd->cov[i], pos + i), |
| 336 |
NULL); |
| 337 |
wdict_setup_words(pd); |
| 338 |
|
| 339 |
#if DEBUG>=1 |
| 340 |
fprintf(stderr, |
| 341 |
"dic(%d/%2d/%2d):%08X%08X/%08X%08X(%8d)\n", |
| 342 |
cls, pos, len, |
| 343 |
(unsigned)(pd->words[0].w >> 32), (unsigned)pd->words[0].w, |
| 344 |
(unsigned)(pd->words[pd->nwords - 1].w >> 32), (unsigned)pd->words[pd->nwords - 1].w, |
| 345 |
pd->nwords); |
| 346 |
#endif |
| 347 |
} |
| 348 |
#if DEBUG>=1 |
| 349 |
fprintf(stderr, |
| 350 |
"wdict=%d(%d..%d)\n", |
| 351 |
n_dictpool - min_dictpool, min_dictpool, n_dictpool - 1); |
| 352 |
#endif |
| 353 |
} |
| 354 |
|
| 355 |
/*************************************************************** |
| 356 |
* |
| 357 |
* 辞書のヒットを調べる |
| 358 |
* てぬきバイナリサーチでどうにかなってしまうあたりがすごい。 |
| 359 |
* |
| 360 |
*/ |
| 361 |
|
| 362 |
int |
| 363 |
wdict_ishit(SLICE const *hit, /* 生成比較器がセットしたヒット配列[tn] */ |
| 364 |
int an, int ab, /* SLICE::a[an].bit(ab) */ |
| 365 |
uint64_t bmp) /* デコード済み文字列 */ |
| 366 |
{ |
| 367 |
int i; |
| 368 |
uint64_t bi; |
| 369 |
|
| 370 |
bmp &= (1ULL << 6 * 10) - 1; |
| 371 |
bi = normalize(bmp); |
| 372 |
|
| 373 |
for (i = min_dictpool; i < n_dictpool; i++) |
| 374 |
{ |
| 375 |
struct WDICT const *pd = &g_dictpool[i]; |
| 376 |
int len = pd->len; |
| 377 |
if (len > 0 |
| 378 |
&& (hit[i].a[an] & ((ALU_T)1 << ab))) |
| 379 |
{ |
| 380 |
uint64_t m; |
| 381 |
unsigned k = ((bi >> 6 * ((pd->pos - 1) + len - HASH_NCHARS)) |
| 382 |
& ((1 << 6 * HASH_NCHARS) - 1)); |
| 383 |
unsigned ia, ib; |
| 384 |
unsigned i, j; |
| 385 |
|
| 386 |
/* 零次ヒット判定 */ |
| 387 |
i = hash(k); |
| 388 |
if (!(pd->bmhash[i / ALU_BITS] & ((ALU_T)1 << (i & (ALU_BITS - 1))))) |
| 389 |
continue; |
| 390 |
|
| 391 |
/* 一次キーを引く */ |
| 392 |
for (ia = 0, ib = pd->nwdk; |
| 393 |
ib > 0; |
| 394 |
ib >>= 1) |
| 395 |
{ |
| 396 |
uint32_t x, y; |
| 397 |
i = ia + (ib >> 1); |
| 398 |
x = k; |
| 399 |
y = pd->wdk[i].k; |
| 400 |
#if DEBUG>=2 |
| 401 |
printf("%08o/%08o %5d(%5d)%5d %5d\n", |
| 402 |
x, y, ia, i, ib, pd->wdk[i].i); |
| 403 |
#endif |
| 404 |
if (x == y) |
| 405 |
goto found_1st; |
| 406 |
else if (x > y) |
| 407 |
{ |
| 408 |
ia = i + 1; |
| 409 |
ib--; |
| 410 |
} |
| 411 |
} |
| 412 |
continue; /* not found */ |
| 413 |
|
| 414 |
/* 一次にヒットしたのでのんびりと二次範囲を決める */ |
| 415 |
found_1st: |
| 416 |
m = (bi >> 6 * (pd->pos - 1)) & ((1ULL << 6 * pd->len) - 1); |
| 417 |
#if DEBUG>=2 |
| 418 |
printf("hit:k=%08o i=%d\n", k, i); |
| 419 |
printf("%010o%010o (%d,%d)/%d\n", |
| 420 |
(unsigned)(m >> 30), |
| 421 |
(unsigned)(m & 07777777777), |
| 422 |
(i == 0 ? 0 : pd->wdk[i - 1].i), |
| 423 |
pd->wdk[i].i, |
| 424 |
pd->nwords); |
| 425 |
#endif |
| 426 |
for (ia = (i == 0 ? 0 : pd->wdk[i - 1].i), |
| 427 |
ib = pd->wdk[i].i - ia; |
| 428 |
ib > 0; |
| 429 |
ib >>= 1) |
| 430 |
{ |
| 431 |
uint64_t x, y; |
| 432 |
i = ia + (ib >> 1); |
| 433 |
x = m; |
| 434 |
y = normalize(pd->words[i].w); |
| 435 |
#if DEBUG>=2 |
| 436 |
printf("%010o%010o %5d(%5d)%5d %5d\n", |
| 437 |
(unsigned)(y >> 30), |
| 438 |
(unsigned)(y & 07777777777), |
| 439 |
ia, i, ib, pd->nwords); |
| 440 |
#endif |
| 441 |
if (x == y) |
| 442 |
{ |
| 443 |
/* 近辺をリニアサーチ(乱暴だね!) */ |
| 444 |
x = ((bmp >> 6 * (pd->pos - 1)) & ((1ULL << 6 * pd->len) - 1)); |
| 445 |
#if DEBUG>=2 |
| 446 |
printf("%010o%010o:X\n", |
| 447 |
(unsigned)(x >> 30), |
| 448 |
(unsigned)(x & 07777777777)); |
| 449 |
#endif |
| 450 |
for (j = i; |
| 451 |
j < pd->nwords && normalize(pd->words[j].w) == m; |
| 452 |
j++) |
| 453 |
{ |
| 454 |
uint64_t xi = xpize(x, pd->words[j].xp, len); |
| 455 |
#if DEBUG>=2 |
| 456 |
printf("%010o%010o:%03X\n%010o%010o:%d\n", |
| 457 |
(unsigned)(xi >> 30), |
| 458 |
(unsigned)(xi & 07777777777), |
| 459 |
pd->words[j].xp, |
| 460 |
(unsigned)(y >> 30), |
| 461 |
(unsigned)(y & 07777777777), |
| 462 |
j); |
| 463 |
#endif |
| 464 |
if (pd->words[j].w == xi) |
| 465 |
return 1; |
| 466 |
} |
| 467 |
|
| 468 |
for (j = i - 1; |
| 469 |
0 <= j && j < pd->nwords |
| 470 |
&& normalize(pd->words[j].w) == m; |
| 471 |
j--) |
| 472 |
{ |
| 473 |
uint64_t xi = xpize(x, pd->words[j].xp, len); |
| 474 |
#if DEBUG>=2 |
| 475 |
printf("%010o%010o:%03X\n%010o%010o:%d\n", |
| 476 |
(unsigned)(xi >> 30), |
| 477 |
(unsigned)(xi & 07777777777), |
| 478 |
pd->words[j].xp, |
| 479 |
(unsigned)(y >> 30), |
| 480 |
(unsigned)(y & 07777777777), |
| 481 |
j); |
| 482 |
#endif |
| 483 |
if (pd->words[j].w == xi) |
| 484 |
return 1; |
| 485 |
} |
| 486 |
|
| 487 |
/* 次の DICT の判定へ。 */ |
| 488 |
break; |
| 489 |
} |
| 490 |
|
| 491 |
if (x > y) |
| 492 |
{ |
| 493 |
ia = i + 1; |
| 494 |
ib--; |
| 495 |
} |
| 496 |
} |
| 497 |
} |
| 498 |
} |
| 499 |
|
| 500 |
return 0; |
| 501 |
} |
| 502 |
|
| 503 |
/*************************************************************** |
| 504 |
* |
| 505 |
* 辞書内容をダンプ; for debug |
| 506 |
* |
| 507 |
*/ |
| 508 |
|
| 509 |
void |
| 510 |
wdict_dump(int lv, struct WDICT const *dict) |
| 511 |
{ |
| 512 |
int i; |
| 513 |
int cls, pos, len; |
| 514 |
char const *t = &"\t\t\t\t\t\t\t\t\t\t"[10 - lv]; |
| 515 |
|
| 516 |
assert(dict != NULL); |
| 517 |
|
| 518 |
for (cls = 0; cls < N_CLS; cls++) |
| 519 |
for (pos = 0; pos <= 10; pos++) |
| 520 |
for (len = 1; len <= 10; len++) |
| 521 |
if (dict == g_dict[cls][pos][len]) |
| 522 |
printf("%sdic=%d/%d/%d words=%d\n", |
| 523 |
t, |
| 524 |
cls, pos, len, |
| 525 |
dict->nwords); |
| 526 |
#if DEBUG>=2 |
| 527 |
for (i = 0; i < dict->nwords; i++) |
| 528 |
printf("%s\t%010o%010o:%4d xp=%03X\n", |
| 529 |
t, |
| 530 |
(unsigned)(dict->words[i].w >> 30), |
| 531 |
(unsigned)(dict->words[i].w & 07777777777), |
| 532 |
i, |
| 533 |
dict->words[i].xp); |
| 534 |
#endif |
| 535 |
} |
| 536 |
|
| 537 |
/* |
| 538 |
* Local Variables: |
| 539 |
* tab-width: 4 |
| 540 |
* End: |
| 541 |
* |
| 542 |
* EOF */ |