Develop and Download Open Source Software

Browse Subversion Repository

Contents of /branches/mty-makai/wdict.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 252 - (show annotations) (download) (as text)
Mon Dec 27 03:23:48 2010 UTC (13 years, 3 months ago) by notanpe
File MIME type: text/x-csrc
File size: 10577 byte(s)
本家 R245 をベースに全てやりなおし。
1 /***********************************************************************
2 *
3 * file: wdict.c
4 *
5 * 辞書登録・判定など
6 *
7 * $Id$
8 *
9 */
10
11 #include <assert.h>
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <sys/types.h>
16
17 #include "debug.h"
18 #include "synth.h"
19 #include "util.h"
20 #include "wdict.h"
21
22 #define HASH_NCHARS 4
23 #define HASH_SEED 16381
24 #define HASH_LEAP 37
25 #define BMHASH_SIZ ((HASH_SEED + ALU_BITS - 1) / ALU_BITS)
26
27 /***************************************************************
28 *
29 * 辞書構造
30 *
31 */
32
33 struct WDK
34 {
35 uint32_t k; /* キー */
36 uint32_t i; /* 範囲終点 */
37 };
38
39 struct WDW
40 {
41 uint64_t w; /* 語 */
42 uint16_t xp; /* 区別ナシ */
43 };
44
45 struct WDICT
46 {
47 /* ビットマップハッシュ(切り捨て用)
48 経験的に、ケツで判定するのがよろし */
49 ALU_T bmhash[BMHASH_SIZ];
50
51 char pos;
52 char len;
53
54 /* 一次探索キー */
55 int nwdk;
56 struct WDK *wdk;
57
58 /* 単語 */
59 struct WDW *words;
60
61 /* 以下のフィールドは、探索時に用いないハズ */
62
63 /* 対応ノードクロスリファレンス */
64 struct ITREE *node;
65
66 /* 総単語数 */
67 int nwords;
68
69 /* 被覆記録 */
70 uint64_t *cov;
71 };
72
73 #define N_CLS 3
74
75 static
76 struct WDICT *g_dict[N_CLS][11][11];
77
78 /* tn の分だけ並んでいる */
79 static int min_dictpool, n_dictpool;
80 static
81 struct WDICT g_dictpool[0x1000];
82
83 /***************************************************************
84 *
85 * xpize() ... ビットマップ xp に基づいて正規化
86 * normalize() ... すべて正規化
87 * hash() ... 微妙なハッシュ関数(要正規化)
88 *
89 */
90
91 static
92 uint64_t
93 xpize(uint64_t m, unsigned xp, unsigned len)
94 {
95 int i;
96
97 for (i = 0; i < len; i++)
98 if (xp & (1 << i))
99 {
100 unsigned c = (m >> 6 * i) & 077;
101 unsigned s;
102 if (c >= 046)
103 s = 26;
104 else if (c == 001)
105 s = 1;
106 else
107 continue;
108 m -= (uint64_t)s << 6 * i;
109 }
110
111 return m;
112 }
113
114 static
115 uint64_t
116 normalize(uint64_t m)
117 {
118 return xpize(m, -1, 10);
119 }
120
121 static
122 unsigned
123 hash(unsigned m)
124 {
125 int i;
126 unsigned a = 0;
127
128 for (i = 0; i < HASH_NCHARS; i++, m >>= 6)
129 {
130 unsigned c = m & 077;
131 #if DEBUG>=1
132 assert(c != 001);
133 assert(c < 046);
134 #endif
135 if (c >= 002)
136 c--;
137 a = 37 * a + c;
138 }
139
140 return a % HASH_SEED;
141 }
142
143 /***************************************************************
144 *
145 * 被覆をつくりだす
146 *
147 */
148
149 static
150 void
151 wdict_mkcov(uint64_t *cov, int len, uint64_t w, unsigned xp)
152 {
153 int i;
154 for (i = 0; i < len; i++, w >>= 6, xp >>= 1)
155 cov[i] |= (((xp & 1) /* 区別ナシ */
156 ? ((w & 077) < 2
157 ? 3ULL /* [./] */
158 : (1ULL | (1ULL << 26)))
159 : 1ULL)
160 << (w & 077));
161 }
162
163 /***************************************************************
164 *
165 * 辞書に単語を突っ込む
166 *
167 */
168
169 struct WDICT *
170 wdict_add(int tn,
171 int pos,
172 struct DICT_CHAR const *word)
173 {
174 int i;
175 struct WDICT *pd;
176 struct WDICT *rpd = NULL;
177 int len = word->len;
178 unsigned attr;
179 unsigned xp = word->xp;
180 uint64_t text = word->bmp;
181
182 /* クラスを決める */
183 switch (word->attr)
184 {
185 case CA_U: attr = 1; break;
186 case CA_U | CA_C:
187 case CA_C: attr = 2; break;
188 default: attr = 0; break;
189 }
190
191 assert(attr < N_CLS);
192 pd = g_dict[attr][pos][len];
193 if (pd == NULL)
194 {
195 assert(tn >= 0);
196 assert(tn < sizeof(g_dictpool) / sizeof(*g_dictpool));
197 g_dict[attr][pos][len] = pd = &g_dictpool[tn];
198 if (n_dictpool == 0)
199 min_dictpool = tn;
200 else if (min_dictpool >= tn)
201 min_dictpool = tn;
202 if (n_dictpool <= tn)
203 n_dictpool = tn + 1;
204 pd->pos = pos;
205 pd->len = len;
206 pd->words = malloc(1 * sizeof(*pd->words));
207 assert(pd->words != NULL);
208 pd->cov = calloc(10, sizeof(*pd->cov));
209 rpd = pd;
210 }
211
212 /* 辞書バッファを 2**n の単位で拡張 */
213 if (pd->nwords >= 1
214 && IS_POWER2(pd->nwords))
215 {
216 assert(pd->words != NULL);
217 pd->words = realloc(pd->words,
218 2 * pd->nwords * sizeof(*pd->words));
219 assert(pd->words != NULL);
220 }
221
222 /* 辞書に単語を登録 */
223 assert(text == xpize(text, xp, len));
224 pd->words[pd->nwords].xp = xp;
225 pd->words[pd->nwords++].w = text;
226
227 /* 被覆を記録 */
228 wdict_mkcov(pd->cov, len, text, xp);
229
230 return rpd; /* 初回登録のみレコードを返す */
231 }
232
233 /***************************************************************
234 *
235 * 辞書ノードに木を関連づける
236 *
237 */
238
239 void
240 wdict_setnode(struct WDICT *pd,
241 struct ITREE *node)
242 {
243 assert(pd != NULL);
244 assert(node != NULL);
245 pd->node = node;
246 }
247
248 /***************************************************************
249 *
250 * 辞書の被覆を固定する
251 * ついでにソートもしておく(まだてぬき)
252 *
253 */
254
255 static
256 int
257 cmp_dict(void const *pa, void const *pb)
258 {
259 struct WDW const *a = pa;
260 struct WDW const *b = pb;
261 uint64_t i = normalize(a->w);
262 uint64_t j = normalize(b->w);
263 if (i < j) return -1;
264 if (i > j) return 1;
265 if (a->w < b->w) return -1;
266 if (a->w > b->w) return 1;
267 return (int)(a->xp - (int)b->xp);
268 }
269
270 static
271 void
272 wdict_setup_words(struct WDICT *pd)
273 {
274 int i, j, k;
275
276 /* まずはソート */
277 qsort(pd->words,
278 pd->nwords, sizeof(*pd->words),
279 cmp_dict);
280
281 /* 一次検索を組み立てる */
282 k = 0;
283 for (i = j = 0; i < pd->nwords; i++)
284 {
285 uint32_t w = normalize(pd->words[i].w) >> 6 * (pd->len - HASH_NCHARS);
286 if (i == 0)
287 {
288 /* 1番目のエントリ */
289 pd->wdk = malloc(sizeof(*pd->wdk));
290 assert(pd->wdk != NULL);
291 pd->wdk[k].k = w;
292 }
293 else if (pd->wdk[k].k != w)
294 {
295 pd->wdk[k++].i = i;
296 if (IS_POWER2(k))
297 pd->wdk = realloc(pd->wdk,
298 2 * k * sizeof(*pd->wdk));
299 assert(pd->wdk != NULL);
300 pd->wdk[k].k = w;
301 }
302
303 /* 零次ハッシュを記録 */
304 w = hash(w);
305 pd->bmhash[w / ALU_BITS] |= (ALU_T)1 << (w & (ALU_BITS - 1));
306 }
307 pd->wdk[k].i = i;
308 pd->nwdk = k + 1;
309 }
310
311 void
312 wdict_setup_lr(void)
313 {
314 int i;
315 int cls, pos, len;
316
317 for (cls = 0; cls < N_CLS; cls++)
318 for (pos = 1; pos < 11; pos++)
319 for (len = 1; len <= 10; len++)
320 {
321 struct WDICT *pd = g_dict[cls][pos][len];
322
323 if (pd == NULL)
324 continue;
325
326 assert(pd->nwords > 0);
327 assert(pd->node != NULL);
328 assert(pd->node->dict == pd);
329 assert(pd->node->ref > 0);
330 assert(pd->pos == pos);
331 assert(pd->len == len);
332
333 for (i = 0; i < len; i++)
334 synth_add(pd->node,
335 synth_make_lr(pd->cov[i], pos + i),
336 NULL);
337 wdict_setup_words(pd);
338
339 #if DEBUG>=1
340 fprintf(stderr,
341 "dic(%d/%2d/%2d):%08X%08X/%08X%08X(%8d)\n",
342 cls, pos, len,
343 (unsigned)(pd->words[0].w >> 32), (unsigned)pd->words[0].w,
344 (unsigned)(pd->words[pd->nwords - 1].w >> 32), (unsigned)pd->words[pd->nwords - 1].w,
345 pd->nwords);
346 #endif
347 }
348 #if DEBUG>=1
349 fprintf(stderr,
350 "wdict=%d(%d..%d)\n",
351 n_dictpool - min_dictpool, min_dictpool, n_dictpool - 1);
352 #endif
353 }
354
355 /***************************************************************
356 *
357 * 辞書のヒットを調べる
358 * てぬきバイナリサーチでどうにかなってしまうあたりがすごい。
359 *
360 */
361
362 int
363 wdict_ishit(SLICE const *hit, /* 生成比較器がセットしたヒット配列[tn] */
364 int an, int ab, /* SLICE::a[an].bit(ab) */
365 uint64_t bmp) /* デコード済み文字列 */
366 {
367 int i;
368 uint64_t bi;
369
370 bmp &= (1ULL << 6 * 10) - 1;
371 bi = normalize(bmp);
372
373 for (i = min_dictpool; i < n_dictpool; i++)
374 {
375 struct WDICT const *pd = &g_dictpool[i];
376 int len = pd->len;
377 if (len > 0
378 && (hit[i].a[an] & ((ALU_T)1 << ab)))
379 {
380 uint64_t m;
381 unsigned k = ((bi >> 6 * ((pd->pos - 1) + len - HASH_NCHARS))
382 & ((1 << 6 * HASH_NCHARS) - 1));
383 unsigned ia, ib;
384 unsigned i, j;
385
386 /* 零次ヒット判定 */
387 i = hash(k);
388 if (!(pd->bmhash[i / ALU_BITS] & ((ALU_T)1 << (i & (ALU_BITS - 1)))))
389 continue;
390
391 /* 一次キーを引く */
392 for (ia = 0, ib = pd->nwdk;
393 ib > 0;
394 ib >>= 1)
395 {
396 uint32_t x, y;
397 i = ia + (ib >> 1);
398 x = k;
399 y = pd->wdk[i].k;
400 #if DEBUG>=2
401 printf("%08o/%08o %5d(%5d)%5d %5d\n",
402 x, y, ia, i, ib, pd->wdk[i].i);
403 #endif
404 if (x == y)
405 goto found_1st;
406 else if (x > y)
407 {
408 ia = i + 1;
409 ib--;
410 }
411 }
412 continue; /* not found */
413
414 /* 一次にヒットしたのでのんびりと二次範囲を決める */
415 found_1st:
416 m = (bi >> 6 * (pd->pos - 1)) & ((1ULL << 6 * pd->len) - 1);
417 #if DEBUG>=2
418 printf("hit:k=%08o i=%d\n", k, i);
419 printf("%010o%010o (%d,%d)/%d\n",
420 (unsigned)(m >> 30),
421 (unsigned)(m & 07777777777),
422 (i == 0 ? 0 : pd->wdk[i - 1].i),
423 pd->wdk[i].i,
424 pd->nwords);
425 #endif
426 for (ia = (i == 0 ? 0 : pd->wdk[i - 1].i),
427 ib = pd->wdk[i].i - ia;
428 ib > 0;
429 ib >>= 1)
430 {
431 uint64_t x, y;
432 i = ia + (ib >> 1);
433 x = m;
434 y = normalize(pd->words[i].w);
435 #if DEBUG>=2
436 printf("%010o%010o %5d(%5d)%5d %5d\n",
437 (unsigned)(y >> 30),
438 (unsigned)(y & 07777777777),
439 ia, i, ib, pd->nwords);
440 #endif
441 if (x == y)
442 {
443 /* 近辺をリニアサーチ(乱暴だね!) */
444 x = ((bmp >> 6 * (pd->pos - 1)) & ((1ULL << 6 * pd->len) - 1));
445 #if DEBUG>=2
446 printf("%010o%010o:X\n",
447 (unsigned)(x >> 30),
448 (unsigned)(x & 07777777777));
449 #endif
450 for (j = i;
451 j < pd->nwords && normalize(pd->words[j].w) == m;
452 j++)
453 {
454 uint64_t xi = xpize(x, pd->words[j].xp, len);
455 #if DEBUG>=2
456 printf("%010o%010o:%03X\n%010o%010o:%d\n",
457 (unsigned)(xi >> 30),
458 (unsigned)(xi & 07777777777),
459 pd->words[j].xp,
460 (unsigned)(y >> 30),
461 (unsigned)(y & 07777777777),
462 j);
463 #endif
464 if (pd->words[j].w == xi)
465 return 1;
466 }
467
468 for (j = i - 1;
469 0 <= j && j < pd->nwords
470 && normalize(pd->words[j].w) == m;
471 j--)
472 {
473 uint64_t xi = xpize(x, pd->words[j].xp, len);
474 #if DEBUG>=2
475 printf("%010o%010o:%03X\n%010o%010o:%d\n",
476 (unsigned)(xi >> 30),
477 (unsigned)(xi & 07777777777),
478 pd->words[j].xp,
479 (unsigned)(y >> 30),
480 (unsigned)(y & 07777777777),
481 j);
482 #endif
483 if (pd->words[j].w == xi)
484 return 1;
485 }
486
487 /* 次の DICT の判定へ。 */
488 break;
489 }
490
491 if (x > y)
492 {
493 ia = i + 1;
494 ib--;
495 }
496 }
497 }
498 }
499
500 return 0;
501 }
502
503 /***************************************************************
504 *
505 * 辞書内容をダンプ; for debug
506 *
507 */
508
509 void
510 wdict_dump(int lv, struct WDICT const *dict)
511 {
512 int i;
513 int cls, pos, len;
514 char const *t = &"\t\t\t\t\t\t\t\t\t\t"[10 - lv];
515
516 assert(dict != NULL);
517
518 for (cls = 0; cls < N_CLS; cls++)
519 for (pos = 0; pos <= 10; pos++)
520 for (len = 1; len <= 10; len++)
521 if (dict == g_dict[cls][pos][len])
522 printf("%sdic=%d/%d/%d words=%d\n",
523 t,
524 cls, pos, len,
525 dict->nwords);
526 #if DEBUG>=2
527 for (i = 0; i < dict->nwords; i++)
528 printf("%s\t%010o%010o:%4d xp=%03X\n",
529 t,
530 (unsigned)(dict->words[i].w >> 30),
531 (unsigned)(dict->words[i].w & 07777777777),
532 i,
533 dict->words[i].xp);
534 #endif
535 }
536
537 /*
538 * Local Variables:
539 * tab-width: 4
540 * End:
541 *
542 * EOF */

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Rev URL

Back to OSDN">Back to OSDN
ViewVC Help
Powered by ViewVC 1.1.26