ADD: Proxy SKKServ Converter
MOD: SkkJisyoConverter now uses temporary files while building the DBM
MOD: SkkJisyoConverter server completion search now uses binary search
@@ -142,7 +142,7 @@ | ||
142 | 142 | ; 注釈 |
143 | 143 | は、Social IMEで存在しないため、(concat "8進数値")の形で返す |
144 | 144 | また |
145 | - 「/」「 」 は候補の区切り文字なので同様にconcatで返す | |
145 | + 「/」「 」 はSKKプロトコルの候補の区切り文字なので同様にconcatで返す | |
146 | 146 | ''' |
147 | 147 | |
148 | 148 | _SERVER = u'http://www.social-ime.com:80/' |
@@ -158,7 +158,8 @@ | ||
158 | 158 | try: |
159 | 159 | f = urllib.urlopen(req) |
160 | 160 | uresult = f.read().strip().decode(self.SERVER_CHARSET) |
161 | - l = uresult.split(u'\t') | |
161 | + # 何故か複数の文節として扱われることがある | |
162 | + l = uresult.split(u'\n')[0].split(u'\t') | |
162 | 163 | array = [] |
163 | 164 | # Social IMEは変換結果に注釈を含まない |
164 | 165 | for x in l: |
@@ -168,10 +169,12 @@ | ||
168 | 169 | else: |
169 | 170 | r = x |
170 | 171 | pass |
171 | - array.append(r) | |
172 | + if r: | |
173 | + array.append(r) | |
174 | + pass | |
172 | 175 | pass |
173 | 176 | logging.debug(uresult) |
174 | - logging.debug(array) | |
177 | + logging.debug("[%s]" % u'/'.join(array)) | |
175 | 178 | if array: |
176 | 179 | status = self.SERVER_FOUND |
177 | 180 | pass |
@@ -252,7 +255,7 @@ | ||
252 | 255 | if okuri: |
253 | 256 | # 送り仮名に一致する候補のみ抽出したリストへ |
254 | 257 | logging.debug(u'okuris:%s -> %s' % (okuri, self.okurimap[okuri])) |
255 | - l = [r[:-1] for r in result if r[-1] in self.okurimap[okuri]] | |
258 | + l = [r[:-1] for r in result if r[-1] in self.okurimap[okuri] and len(r[:-1])] | |
256 | 259 | logging.debug(l) |
257 | 260 | # 順番を保ったまま重複を無くす |
258 | 261 | newresult = sorted(set(l), key = l.index) |
@@ -315,57 +318,130 @@ | ||
315 | 318 | _JISYO_CHARSET = u'EUC_JIS_2004' |
316 | 319 | _entries = {} |
317 | 320 | _db = None |
321 | + _dbnasi = None | |
318 | 322 | _dbmidashi = None |
319 | 323 | _DBM_CHARSET = u'UTF-8' |
324 | + _useDBMidashi = False | |
320 | 325 | |
326 | + def __loadDBM(self, name): | |
327 | + import anydbm | |
328 | + # 変換候補辞書の読み込み | |
329 | + self._db = anydbm.open(u'%s.db' % name, u'r') | |
330 | + # 補完用見出し辞書の読み込み | |
331 | + self._dbnasi = anydbm.open(u'%s.okuri-nasi.db' % name, u'r') | |
332 | + if self._useDBMidashi: | |
333 | + logging.debug(_('use DBM midashi')) | |
334 | + self._dbmidashi = anydbm.open(u'%s.midashi.db' % name, u'r') | |
335 | + pass | |
336 | + else: | |
337 | + # 多少メモリは食うが速いぞ | |
338 | + logging.debug(_('use on memory midashi')) | |
339 | + entries = [k.decode(self._DBM_CHARSET) for k in self._dbnasi.keys()] | |
340 | + entries.sort() | |
341 | + self._entries = tuple(entries) | |
342 | + pass | |
343 | + self._dbnasi.close() | |
344 | + del self._dbnasi | |
345 | + # おしまい | |
346 | + return | |
347 | + | |
348 | + | |
349 | + def _createDBM(self, howmany): | |
350 | + import tempfile, dbhash | |
351 | + adb = [] | |
352 | + for x in range(howmany): | |
353 | + t = tempfile.mkstemp() | |
354 | + db = dbhash.open(t[1], u'n') | |
355 | + adb.append((t, db)) | |
356 | + pass | |
357 | + return adb | |
358 | + | |
359 | + def _addEntryForCompletion(self, db, midasi): | |
360 | + # 非常に無駄の多い処理方法だが、初めての辞書DBM作成時のみなので | |
361 | + # とりあえず... | |
362 | + for i in xrange(1, len(midasi)): | |
363 | + v = db.get(midashi[:i].encode(self._DBM_CHARSET), u'').decode(self._DBM_CHARSET) | |
364 | + if v: | |
365 | + s = u'\t'.join((v, midasi)) | |
366 | + else: | |
367 | + s = midashi | |
368 | + pass | |
369 | + db[midasi[:i].encode(self._DBM_CHARSET)] = s.encode(self._DBM_CHARSET) | |
370 | + pass | |
371 | + return | |
372 | + | |
321 | 373 | def _readJisyoDBM(self, name, fobj): |
322 | - import anydbm, whichdb | |
374 | + ''' | |
375 | + SKK辞書を読み取り、DBM化する | |
376 | + DBMに保存される見出しと候補はUTF-8で保存される | |
377 | + ${SKK-JISYO}.db は見出しと候補の両方を保持 | |
378 | + ${SKK-JISYO}.okuri-nasi.db は送り無し見出しのみ保持 | |
379 | + ${SKK-JISYO}.midashi.db 送り無し見出しを補完に適した形で保持(あまりメリットがなさそうなのでやめようと思う) | |
380 | + ''' | |
323 | 381 | logging.debug(u'jisyo read to DBM started') |
324 | - db = anydbm.open(u'%s.db' % name, u'c') | |
325 | - db2 = anydbm.open(u'%s.midashi.db' % name, u'c') | |
326 | - line = fobj.readline() | |
382 | + tdb = self._createDBM(2) | |
327 | 383 | okuri = False |
328 | - while line: | |
384 | + for count, line in enumerate(fobj): | |
385 | + # 内部エンコードに変換 | |
329 | 386 | try: |
330 | 387 | uline = line.decode(self._JISYO_CHARSET) |
331 | 388 | except UnicodeDecodeError: |
332 | 389 | logging.debug(line) |
390 | + # 失敗したら無視して次行へ | |
333 | 391 | continue |
392 | + # 特別なコメントによる「送り有り」「送り無し」のチェック | |
334 | 393 | if uline.startswith(u';; okuri-ari entries.'): |
335 | 394 | okuri = True |
395 | + pass | |
336 | 396 | if uline.startswith(u';; okuri-nasi entries.'): |
337 | 397 | okuri = False |
338 | - if not uline.startswith(u';;'): | |
398 | + pass | |
399 | + # コメント行はスキップ | |
400 | + if uline.startswith(u';;'): | |
401 | + continue | |
402 | + # 見出し /候補1/候補2/.../候補n/\nの行の処理 | |
403 | + try: | |
404 | + # なんと空白を含むエントリがあった… | |
405 | + # (midashi, entry) = uline.strip().split(u' /') | |
406 | + # 正規表現で処理したらスマートかな? | |
339 | 407 | uline = uline.strip() |
340 | 408 | idx = uline.find(u' /') |
341 | - (midashi, entry) = (uline[:idx], uline[idx+2:-1]) | |
342 | - # add to main dict | |
343 | - ent = u'\t'.join(entry.split(u'/')) | |
344 | - db[midashi.encode(self._DBM_CHARSET)] = ent.encode(self._DBM_CHARSET) | |
345 | - # add to midashi dict | |
409 | + (midashi, entry) = uline[:idx], u'\t'.join(uline[idx+2:].strip(u'/').split(u'/')).encode(self._DBM_CHARSET) | |
410 | + # add both entry and candidates to main dict | |
411 | + tdb[0][1][midashi.encode(self._DBM_CHARSET)] = entry | |
412 | + # add okuri-nasi entry to midashi dict | |
346 | 413 | if not okuri: |
347 | - # 非常に無駄の多い処理方法だが、初めての辞書DBM作成時のみなので | |
348 | - # とりあえず... | |
349 | - for i in xrange(1, len(midashi)): | |
350 | - v = db2.get(midashi[:i].encode(self._DBM_CHARSET), u'').decode(self._DBM_CHARSET) | |
351 | - if v: | |
352 | - s = u'\t'.join((v, midashi)) | |
353 | - else: | |
354 | - s = midashi | |
355 | - db2[midashi[:i].encode(self._DBM_CHARSET)] = s.encode(self._DBM_CHARSET) | |
356 | - #logging.debug('%s:%s' % (midashi[:i], s)) | |
414 | + if self._useDBMidashi: | |
415 | + self._addEntryForCompletion(tdb[1][1], midashi) | |
357 | 416 | pass |
417 | + else: | |
418 | + tdb[1][1][midashi.encode(self._DBM_CHARSET)] = u''.encode(self._DBM_CHARSET) | |
419 | + pass | |
358 | 420 | pass |
359 | 421 | pass |
360 | - line = fobj.readline() | |
422 | + except: | |
423 | + logging.exception('%s' % uline) | |
424 | + pass | |
361 | 425 | pass |
362 | - db.close() | |
363 | - db2.close() | |
364 | - self._db = anydbm.open(u'%s.db' % name, u'r') | |
365 | - self._dbmidashi = anydbm.open(u'%s.midashi.db' % name, u'r') | |
366 | - #entries = [k.decode(self._DBM_CHARSET) for k in self._db.keys()] | |
367 | - #self._entries = tuple(entries) | |
368 | - logging.info(u'%s used %s' % (name, whichdb.whichdb(u'%s.db' % name))) | |
426 | + # 全行読み取り完了 | |
427 | + for db in tdb: | |
428 | + db[1].close() | |
429 | + pass | |
430 | + # 一時ファイルとして作成したDBMを目的の名前に変更 | |
431 | + dbs = ((tdb[0], u'%s.db' % name), (tdb[1], u'%s.okuri-nasi.db' % name)) | |
432 | + import os, shutil | |
433 | + for t in dbs: | |
434 | + try: | |
435 | + try: | |
436 | + os.remove(u'%s.db' % name) | |
437 | + except: | |
438 | + pass | |
439 | + shutil.move(t[0][1], t[1]) | |
440 | + except: | |
441 | + logging.exception(_('DBM %(name)s rename failed') % {name:t[1]}) | |
442 | + pass | |
443 | + # 作成したDBMを読み込み | |
444 | + self.__loadDBM(name) | |
369 | 445 | logging.debug(u'jisyo read to DBM complete') |
370 | 446 | pass |
371 | 447 |
@@ -390,10 +466,7 @@ | ||
390 | 466 | import urllib, gzip, StringIO, shutil |
391 | 467 | try: |
392 | 468 | import anydbm, whichdb |
393 | - self._db = anydbm.open(u'%s.db' % name, u'r') | |
394 | - self._dbmidashi = anydbm.open(u'%s.midashi.db' % name, u'r') | |
395 | - #entries = [k.decode(self._DBM_CHARSET) for k in self._db.keys()] | |
396 | - #self._entries = tuple(entries) | |
469 | + self.__loadDBM(name) | |
397 | 470 | #logging.info(u'%s has %d entries.' % (name, len(self._entries))) |
398 | 471 | logging.info(u'%s used %s' % (name, whichdb.whichdb(u'%s.db' % name))) |
399 | 472 | return |
@@ -400,10 +473,8 @@ | ||
400 | 473 | except: |
401 | 474 | logging.exception(u'SKK JISYO(%s) dbm open failed' % name) |
402 | 475 | pass |
403 | - try: | |
404 | - fdict = file(name, u'r') | |
405 | - fdict.close() | |
406 | - except IOError, e: | |
476 | + if not os.path.exists(name): | |
477 | + # SKK辞書が存在していなかったらダウンロードを試みる | |
407 | 478 | try: |
408 | 479 | logging.info(u'SKK JISYO(%s) not found. Trying download' % name) |
409 | 480 | data = urllib.urlopen(self._JISYOHOST % name) |
@@ -422,8 +493,10 @@ | ||
422 | 493 | except: |
423 | 494 | pass |
424 | 495 | import os |
425 | - os.unlink(name) | |
496 | + os.remove(name) | |
426 | 497 | pass |
498 | + pass | |
499 | + # SKK辞書を変換して読み込み | |
427 | 500 | try: |
428 | 501 | fdict = file(name, u'r') |
429 | 502 | # in memory |
@@ -435,7 +508,7 @@ | ||
435 | 508 | logging.exception(u'SKK JISYO(%s) open failed.' % name) |
436 | 509 | logging.exception(u'SKK JISYO(%s) conversion not served.' % name) |
437 | 510 | pass |
438 | - pass | |
511 | + return | |
439 | 512 | |
440 | 513 | def request(self, arg): |
441 | 514 | status = self.SERVER_NOT_FOUND |
@@ -445,7 +518,7 @@ | ||
445 | 518 | # dbm |
446 | 519 | candidates = self._db[arg.encode(self._DBM_CHARSET)].decode(self._DBM_CHARSET).split(u'\t') |
447 | 520 | status = self.SERVER_FOUND |
448 | - logging.debug(u'%s -> %s' % (arg, candidates)) | |
521 | + logging.debug(u'%s -> [%s]' % (arg, '/'.join(candidates))) | |
449 | 522 | except: |
450 | 523 | candidates = [arg] |
451 | 524 | pass |
@@ -457,28 +530,21 @@ | ||
457 | 530 | if arg[-1] == u'~': |
458 | 531 | arg = arg[:-1] |
459 | 532 | pass |
460 | - # in memory | |
461 | - # d = [k for k in self._entries.keys() if k.startswith(arg) and self.isKana(k)] | |
462 | - # dbm | |
463 | - # これだとえらく遅い | |
464 | - # SKK-JISYO.Lで約1.3秒(PentiumD 3.4GHz) | |
465 | - # d = [k.decode(self._DBM_CHARSET) for k in self._db.keys() if k.decode(self._DBM_CHARSET).startswith(arg) and (self.isKana(k.decode(self._DBM_CHARSET)) or self.isalpha(k.decode(self._DBM_CHARSET)))] | |
466 | - # 文字コード変換量を抑えてみる。しかし遅い | |
467 | - # SKK-JISYO.Lで約0.7秒(PentiumD 3.4GHz) | |
468 | - # midashi = arg.encode(self._DBM_CHARSET) | |
469 | - # d = [k.decode(self._DBM_CHARSET) for k in self._db.keys() if k.startswith(midashi) and (self.isKana(k.decode(self._DBM_CHARSET)) or self.isalpha(k.decode(self._DBM_CHARSET)))] | |
470 | - # 文字コード変換量を抑え、イテレータで回してみる。恐ろしく遅い | |
471 | - # SKK-JISYO.Lで約2.0秒(PentiumD 3.4GHz) | |
472 | - # midashi = arg.encode(self._DBM_CHARSET) | |
473 | - # d = [k.decode(self._DBM_CHARSET) for k in self._db.iterkeys() if k.startswith(midashi) and (self.isKana(k.decode(self._DBM_CHARSET)) or self.isalpha(k.decode(self._DBM_CHARSET)))] | |
474 | - # このようにすると8倍程早くなるが見出し一覧を常にメモリに持つのでちょっとメモリ消費が大きい | |
475 | - # SKK-JISYO.Lで約0.1秒(PentiumD 3.4GHz) | |
533 | + # 二分探索 見出し辞書を作成した場合よりは遅いようだが非常に速い | |
534 | + import bisect | |
476 | 535 | if self._entries: |
477 | - d = [k for k in self._entries if k.startswith(arg) and (self.isKana(k) or self.isalpha(k))] | |
478 | - # 交ぜ書き辞書無しと見做してみる。ほとんど変わらない | |
479 | - # SKK-JISYO.Lで約0.1秒(PentiumD 3.4GHz) | |
480 | - # d = [k for k in self._entries if k.startswith(arg)] | |
481 | - # みだし辞書を作ってみたぞ | |
536 | + idx = bisect.bisect_left(self._entries, arg) | |
537 | + d = [] | |
538 | + while True: | |
539 | + if self._entries[idx].startswith(arg): | |
540 | + d.append(self._entries[idx]) | |
541 | + idx +=1 | |
542 | + pass | |
543 | + else: | |
544 | + break | |
545 | + pass | |
546 | + pass | |
547 | + # 見出し辞書を作ってみたぞ | |
482 | 548 | # これだととっても速い |
483 | 549 | if self._dbmidashi: |
484 | 550 | v = self._dbmidashi.get(arg.encode(self._DBM_CHARSET), '').decode(self._DBM_CHARSET) |
@@ -489,12 +555,113 @@ | ||
489 | 555 | d = [] |
490 | 556 | pass |
491 | 557 | pass |
492 | - logging.debug(u'%s -> %s' % (arg, d)) | |
558 | + logging.debug(u'%s -> [%s]' % (arg, '/'.join(d))) | |
493 | 559 | if d: |
494 | 560 | status = self.SERVER_FOUND |
495 | 561 | pass |
496 | 562 | return (status, d) |
497 | 563 | |
564 | +class ProxyConverter(Converter): | |
565 | + ''' | |
566 | + 他のSKKServを使用して変換する | |
567 | + ''' | |
568 | + __server = () | |
569 | + __svrsok = None | |
570 | + __completion = True | |
571 | + __discon = True | |
572 | + | |
573 | + def __reconnect(self): | |
574 | + import socket | |
575 | + if self.__discon: | |
576 | + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) | |
577 | + #logging.debug("%s [%s]" % (type(self.__server), self.__server)) | |
578 | + s.connect(self.__server) | |
579 | + s.settimeout(3.0) | |
580 | + self.__svrsok = s | |
581 | + self.__discon = False | |
582 | + pass | |
583 | + return | |
584 | + | |
585 | + def __init__(self, server=('127.0.0.1', 1178)): | |
586 | + #logging.debug("[%s]" % server) | |
587 | + self.__server = server | |
588 | + self.__reconnect() | |
589 | + pass | |
590 | + | |
591 | + def request(self, arg): | |
592 | + self.__reconnect() | |
593 | + smsg = (u'1' + arg + u' ').encode(self.SERVER_CHARSET) | |
594 | + rmsg = '' | |
595 | + try: | |
596 | + self.__svrsok.sendall(smsg) | |
597 | + r = self.__svrsok.recv(1024) | |
598 | + if r: | |
599 | + rmsg += r | |
600 | + pass | |
601 | + pass | |
602 | + except: | |
603 | + self.__discon = True | |
604 | + pass | |
605 | + rmsg = rmsg.decode(self.SERVER_CHARSET) | |
606 | + logging.debug(u'Proxy [%s]' % rmsg) | |
607 | + if rmsg: | |
608 | + status = rmsg[0] | |
609 | + sep = rmsg[1] | |
610 | + if status == '1' or (status == 4 and len(rmsg) > 2): | |
611 | + candidates = rmsg.strip()[1:].strip(sep).split(sep) | |
612 | + logging.debug(u'Proxy %s -> [%s]' % (arg, '/'.join(candidates))) | |
613 | + return (status, candidates) | |
614 | + else: | |
615 | + return (status, []) | |
616 | + pass | |
617 | + else: | |
618 | + return (self.SERVER_ERROR, []) | |
619 | + | |
620 | + def server_completion(self, arg): | |
621 | + if not self.__completion: | |
622 | + return (self.SERVER_ERROR, []) | |
623 | + self.__reconnect() | |
624 | + smsg = (u'4' + arg + u' ').encode(self.SERVER_CHARSET) | |
625 | + rmsg = '' | |
626 | + try: | |
627 | + self.__svrsok.sendall(smsg) | |
628 | + r = self.__svrsok.recv(1024) | |
629 | + if r: | |
630 | + rmsg += r | |
631 | + pass | |
632 | + pass | |
633 | + except: | |
634 | + logging.exception('Proxy discon') | |
635 | + try: | |
636 | + self.__svrsok.close() | |
637 | + except: | |
638 | + logging.exception('Proxy close') | |
639 | + pass | |
640 | + logging.exception('Proxy server completion not supported maybe.') | |
641 | + self.__completion = False | |
642 | + self.__discon = True | |
643 | + pass | |
644 | + rmsg = rmsg.decode(self.SERVER_CHARSET) | |
645 | + logging.debug(u'Proxy [%s]' % rmsg) | |
646 | + if rmsg: | |
647 | + status = rmsg[0] | |
648 | + sep = rmsg[1] | |
649 | + if status == '1' or (status == 4 and len(rmsg) > 2): | |
650 | + candidates = rmsg.strip()[1:].strip(sep).split(sep) | |
651 | + return (status, candidates) | |
652 | + else: | |
653 | + if status == '0': | |
654 | + logging.exception('Proxy server completion not supported maybe.') | |
655 | + self.__completion = False | |
656 | + return (status, []) | |
657 | + pass | |
658 | + else: | |
659 | + logging.exception('Proxy server completion not supported maybe.') | |
660 | + self.__completion = False | |
661 | + return (self.SERVER_ERROR, []) | |
662 | + | |
663 | + pass | |
664 | + | |
498 | 665 | # END 変換クラス定義 |
499 | 666 | ###################################################################### |
500 | 667 |