| 160 |
//================================================== |
//================================================== |
| 161 |
|
|
| 162 |
uses |
uses |
| 163 |
SysUtils, Math, Windows; |
SysUtils, Math, Windows, |
| 164 |
|
MojuUtils; |
| 165 |
|
|
| 166 |
const |
const |
| 167 |
GIKO_BAYESIAN_FILE_VERSION = '1.0'; |
GIKO_BAYESIAN_FILE_VERSION = '1.0'; |
| 400 |
const text : string; |
const text : string; |
| 401 |
wordCount : TWordCount ); |
wordCount : TWordCount ); |
| 402 |
type |
type |
| 403 |
Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum, |
Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeNum, ModeHanKana, |
| 404 |
ModeWGraph, ModeWAlpha, ModeWNum, |
ModeWGraph, ModeWAlpha, ModeWNum, |
| 405 |
ModeWHira, ModeWKata, ModeWKanji); |
ModeWHira, ModeWKata, ModeWKanji); |
| 406 |
var |
var |
| 407 |
p, tail, last : PChar; |
p, tail, last : PChar; |
| 408 |
mode, newMode : Modes; |
mode, newMode : Modes; |
| 409 |
aWord : string; |
ch : Longword; |
| 410 |
ch : Longword; |
chSize : Integer; |
| 411 |
chSize : Integer; |
wHiraDelimiter : TStringList; |
| 412 |
delimiter : TStringList; |
wHiraFinalDelimiter : TStringList; |
| 413 |
delimited : Boolean; |
wKanjiDelimiter : TStringList; |
| 414 |
i, idx : Integer; |
words : TStringList; |
| 415 |
countInfo : TWordCountInfo; |
aWord : string; |
| 416 |
|
countInfo : TWordCountInfo; |
| 417 |
|
|
| 418 |
|
// Surrounds every delimiter string found in _aWord with line feeds.
//   _aWord : text to process (passed by value; the caller's string is untouched)
//   _delim : list of delimiter strings, applied in list order
// Returns the text after each delimiter has been replaced by
// #10 + delimiter + #10.
// NOTE(review): relies on CustomStringReplace (not defined here) replacing
// every occurrence, with the final False presumably meaning "do not ignore
// case" - confirm against its declaration.
function cutBoth( _aWord : string; _delim : TStringList ) : string;
var
  _n : Integer;
  _token : string;
begin
  Result := _aWord;
  for _n := 0 to _delim.Count - 1 do begin
    _token := _delim[ _n ];
    Result := CustomStringReplace( Result, _token, #10 + _token + #10, False );
  end;
end;
| 430 |
|
|
| 431 |
|
// Puts a line feed in front of every delimiter string found in _aWord.
//   _aWord : text to process (passed by value; the caller's string is untouched)
//   _delim : list of delimiter strings, applied in list order
// Returns the text after each delimiter has been replaced by #10 + delimiter.
// NOTE(review): relies on CustomStringReplace (not defined here) replacing
// every occurrence - confirm against its declaration.
function cutFirst( _aWord : string; _delim : TStringList ) : string;
var
  _n : Integer;
  _token : string;
begin
  Result := _aWord;
  for _n := 0 to _delim.Count - 1 do begin
    _token := _delim[ _n ];
    Result := CustomStringReplace( Result, _token, #10 + _token, False );
  end;
end;
| 443 |
|
|
| 444 |
|
// Puts a line feed after every delimiter string found in _aWord.
//   _aWord : text to process (passed by value; the caller's string is untouched)
//   _delim : list of delimiter strings, applied in list order
// Returns the text after each delimiter has been replaced by delimiter + #10.
// NOTE(review): relies on CustomStringReplace (not defined here) replacing
// every occurrence - confirm against its declaration.
function cutFinal( _aWord : string; _delim : TStringList ) : string;
var
  _n : Integer;
  _token : string;
begin
  Result := _aWord;
  for _n := 0 to _delim.Count - 1 do begin
    _token := _delim[ _n ];
    Result := CustomStringReplace( Result, _token, _token + #10, False );
  end;
end;
| 456 |
|
|
| 457 |
|
// Registers every non-empty entry of _words in _dst and bumps its counter.
//   _dst   : word table; an entry missing from it is added together with a
//            freshly created TWordCountInfo object
//   _words : the words to register; empty strings are skipped
// For each word, the WordCount of its TWordCountInfo is increased by one.
// NOTE(review): _dst.Find is presumably the sorted-list lookup inherited from
// TStringList - confirm that TWordCount is kept sorted.
procedure addWord( _dst : TWordCount; _words : TStringList );
var
  _n, _pos : Integer;
  _entry : string;
  _info : TWordCountInfo;
begin
  for _n := 0 to _words.Count - 1 do begin
    _entry := _words[ _n ];
    if _entry = '' then
      Continue;

    if not _dst.Find( _entry, _pos ) then begin
      // First time this word is seen: create its counter object
      _info := TWordCountInfo.Create;
      _dst.AddObject( _entry, _info );
    end else
      _info := TWordCountInfo( _dst.Objects[ _pos ] );

    _info.WordCount := _info.WordCount + 1;
  end;
end;
| 476 |
|
|
| 477 |
|
// Breaks one extracted word into sub-words, returned as a single string in
// which the sub-words are separated by #10.
//   _aWord : the word to split (passed by value)
//   _mode  : character mode of the word
// Modes below ModeWGraph are returned unchanged.  For ModeWGraph and above
// the spaces are removed first; then
//   ModeWHira  : #10 is inserted after each wHiraFinalDelimiter entry and on
//                both sides of each wHiraDelimiter entry;
//   ModeWKanji : #10 is inserted on both sides of each wKanjiDelimiter entry
//                and every run is further cut into 4-byte pieces;
//   otherwise  : the space-stripped word is returned as is.
// The three delimiter lists belong to the enclosing routine.
function changeMode( _aWord : string; _mode : Modes ) : string;
var
  _i : Integer;
  _aWord2 : string;
  _pWord, _pWord2 : PChar;
  _pWordTail, _pFound : PChar;
const
  _delim : string = #10;
begin
  if Ord( _mode ) >= Ord( ModeWGraph ) then begin
    // Japanese text
    // Squeeze out the spaces
    // NOTE(review): the two calls below look identical in this view;
    // presumably one of them targets the full-width space - confirm.
    _aWord := CustomStringReplace( _aWord, ' ', '', False );
    _aWord := CustomStringReplace( _aWord, ' ', '', False );

    // Split into words at the delimiters.
    // Dispatch on the parameter _mode (previously the enclosing routine's
    // variable "mode" was used here; every visible caller passes that same
    // variable, so the result is unchanged for them).
    case _mode of
      ModeWHira:
        begin
          _aWord := cutFinal( _aWord, wHiraFinalDelimiter );
          Result := cutBoth( _aWord, wHiraDelimiter );
        end;

      ModeWKanji:
        begin
          // Split into words at the delimiters
          _aWord := cutBoth( _aWord, wKanjiDelimiter );
          // Split into words of 4 bytes (2 characters) each
          _pWord := PChar( _aWord );
          _i := Length( _aWord );
          _pWordTail := _pWord + _i;
          // Room for one extra #10 per 4 input bytes
          SetLength( _aWord2, _i + (_i shr 2) );
          _pWord2 := PChar( _aWord2 );

          while _pWord < _pWordTail do begin
            // Locate the end of the current run: the next #10, or the end
            // of the string when there is none
            _pFound := AnsiStrPos( _pWord, PChar( _delim ) );
            if _pFound = nil then
              _pFound := _pWordTail;
            // Last position at which a full 4-byte piece still fits
            _pFound := _pFound - 3;

            // Emit 4-byte pieces, each followed by #10
            while _pWord <= _pFound do begin
              CopyMemory( _pWord2, _pWord, 4 ); _pWord2[ 4 ] := #10;
              _pWord2 := _pWord2 + 5; _pWord := _pWord + 4;
            end;
            // Copy what is left of the run plus the one byte that ends it.
            // NOTE(review): when no #10 was found, that extra byte is the one
            // at _pWordTail (presumably the string's terminating #0), and it
            // becomes part of the result - confirm this is intended.
            _i := _pFound + 4 - _pWord; // 4 = 3 + #10
            CopyMemory( _pWord2, _pWord, _i );
            _pWord2 := _pWord2 + _i; _pWord := _pWord + _i;
          end;
          // Copy any bytes still pending after the loop
          if _pWord < _pWordTail then begin
            _i := _pWordTail - _pWord;
            CopyMemory( _pWord2, _pWord, _i );
            _pWord2 := _pWord2 + _i;
          end;
          // Trim the buffer to the number of bytes actually written
          SetLength( _aWord2, _pWord2 - PChar( _aWord2 ) );

          Result := _aWord2;
        end;

      else
        Result := _aWord;
    end;
  end else begin
    Result := _aWord;
  end;
end;
| 542 |
const |
const |
| 543 |
KAKUJOSI = 'を' + #10 + 'に' + #10 + 'が' + #10 + 'と' + #10 + 'から' |
WHIRA_DELIMITER = 'を' + #10 + 'に' + #10 + 'が' + #10 + 'と' + #10 + 'から' |
| 544 |
+ #10 + 'で' + #10 + 'へ' + #10 + 'より' + #10 + 'まで' |
+ #10 + 'へ' + #10 + 'より' + #10 + 'まで'+ #10 + 'で' |
| 545 |
;{ |
+ #10 + 'ここ' + #10 + 'そこ' + #10 + 'どこ' |
| 546 |
+ #10 + 'これ' + #10 + 'それ' + #10 + 'あれ' + #10 + 'どれ' |
+ #10 + 'これ' + #10 + 'それ' + #10 + 'あれ' + #10 + 'どれ' |
| 547 |
+ #10 + 'この' + #10 + 'その' + #10 + 'あの' + #10 + 'どの' |
+ #10 + 'この' + #10 + 'その' + #10 + 'あの' + #10 + 'どの' |
| 548 |
+ #10 + 'こう' + #10 + 'そう' + #10 + 'ああ' + #10 + 'どう' |
+ #10 + 'こう' + #10 + 'そう' + #10 + 'ああ' + #10 + 'どう' |
| 549 |
+ #10 + 'こんな' + #10 + 'そんな' + #10 + 'あんな' + #10 + 'どんな' |
+ #10 + 'こんな' + #10 + 'そんな' + #10 + 'あんな' + #10 + 'どんな' |
| 550 |
+ #10 + '的' + #10 + '性' + #10 + '式' + #10 + '化' + #10 + '法' |
+ #10 + 'れた' + #10 + 'れて' + #10 + 'れれ' + #10 + 'れろ' |
| 551 |
+ #10 + '不' + #10 + '無' + #10 + '非' |
+ #10 + 'れる' + #10 + 'られる' |
| 552 |
|
+ #10 + 'です' + #10 + 'ます' + #10 + 'ません' |
| 553 |
|
+ #10 + 'でした' + #10 + 'ました' |
| 554 |
|
+ #10 + 'する' + #10 + 'しない' + #10 + 'される' + #10 + 'されない' |
| 555 |
|
; |
| 556 |
|
WKANJI_DELIMITER = '的' + #10 + '性' + #10 + '式' + #10 + '化' + #10 + '法' |
| 557 |
|
+ #10 + '不' + #10 + '無' + #10 + '非' + #10 + '反' |
| 558 |
|
; |
| 559 |
|
WHIRA_FINAL_DELIMITER = 'った' + #10 + 'って' |
| 560 |
|
;{ |
| 561 |
|
+ #10 + 'よって' + #10 + 'したがって' + #10 + 'なので' |
| 562 |
|
+ #10 + 'だから' + #10 + 'ですから' |
| 563 |
+ #10 + 'また' |
+ #10 + 'また' |
| 564 |
+ #10 + 'しかし' + #10 + 'だが' + #10 + 'けど' + #10 + 'けれど' |
+ #10 + 'しかし' + #10 + 'だが' + #10 + 'けど' + #10 + 'けれど' |
| 565 |
+ #10 + 'やはり' + #10 + 'やっぱり' |
+ #10 + 'やはり' + #10 + 'やっぱり' |
| 566 |
+ #10 + 'です' + #10 + 'ます' + #10 + 'でし' + #10 + 'だろ' |
+ #10 + 'でし' + #10 + 'だろ' |
| 567 |
+ #10 + 'する' + #10 + 'しない' + #10 + 'した' + #10 + 'しない' |
+ #10 + 'する' + #10 + 'しない' + #10 + 'した' + #10 + 'しない' |
| 568 |
;} |
;} |
| 569 |
|
// 'ー' を 'ぁぃぅぇぉ' に。 |
| 570 |
|
HA_LINE = 'あかさたなはまやらわがざだばぱぁゎ'; |
| 571 |
|
HI_LINE = 'いきしちにひみりゐぎじびぴぃ'; |
| 572 |
|
HU_LINE = 'うくすつぬふむゆるぐぶぷぅ'; |
| 573 |
|
HE_LINE = 'えけせてねへめれゑげべぺぇ'; |
| 574 |
|
HO_LINE = 'おこそとのほもよろをごぼぽぉ'; |
| 575 |
|
KA_LINE = 'アカサタナハマヤラワガザダバパァヵヮ'; |
| 576 |
|
KI_LINE = 'イキシチニヒミリヰギジビピィ'; |
| 577 |
|
KU_LINE = 'ウクスツヌフムユルグブプゥヴ'; |
| 578 |
|
KE_LINE = 'エケセテネヘメレヱゲベペェヶ'; |
| 579 |
|
KO_LINE = 'オコソトノホモヨロヲゴボポォ'; |
| 580 |
kKanji = [$80..$A0, $E0..$ff]; |
kKanji = [$80..$A0, $E0..$ff]; |
| 581 |
begin |
begin |
| 582 |
|
|
| 583 |
delimiter := TStringList.Create; |
wHiraDelimiter := TStringList.Create; |
| 584 |
|
wHiraFinalDelimiter := TStringList.Create; |
| 585 |
|
wKanjiDelimiter := TStringList.Create; |
| 586 |
|
words := TStringList.Create; |
| 587 |
try |
try |
| 588 |
mode := ModeWhite; |
mode := ModeWhite; |
| 589 |
delimiter.Text := KAKUJOSI; |
wHiraDelimiter.Text := WHIRA_DELIMITER; |
| 590 |
|
wHiraFinalDelimiter.Text := WHIRA_FINAL_DELIMITER; |
| 591 |
|
wKanjiDelimiter.Text := WKANJI_DELIMITER; |
| 592 |
p := PChar( text ); |
p := PChar( text ); |
| 593 |
tail := p + Length( text ); |
tail := p + Length( text ); |
| 594 |
last := p; |
last := p; |
| 595 |
|
|
| 596 |
while p < tail do begin |
while p < tail do begin |
|
delimited := False; |
|
| 597 |
// 文字のタイプを判別 |
// 文字のタイプを判別 |
| 598 |
// ※句読点は ModeGraph になるので個別に対応しなくてもいい |
// ※句読点は ModeGraph になるので個別に対応しなくてもいい |
| 599 |
// if Byte(Byte( p^ ) - $a1) < $5e then begin |
// if Byte(Byte( p^ ) - $a1) < $5e then begin |
| 601 |
if p + 1 < tail then begin |
if p + 1 < tail then begin |
| 602 |
ch := (PByte( p )^ shl 8) or PByte( p + 1 )^; |
ch := (PByte( p )^ shl 8) or PByte( p + 1 )^; |
| 603 |
case ch of |
case ch of |
| 604 |
$8140: newMode := ModeWhite; |
// スペースで単語分けせずに詰める |
| 605 |
|
//$8140: newMode := ModeWhite; |
| 606 |
$8141..$824e: newMode := ModeWGraph; |
$8141..$824e: newMode := ModeWGraph; |
| 607 |
$824f..$8258: newMode := ModeWNum; |
$824f..$8258: newMode := ModeWNum; |
| 608 |
$8260..$829a: newMode := ModeWAlpha; |
$8260..$829a: newMode := ModeWAlpha; |
| 619 |
end; |
end; |
| 620 |
|
|
| 621 |
chSize := 2; |
chSize := 2; |
|
|
|
|
// 区切りになる文字があるか検査する |
|
|
if p + 3 < tail then begin // 3 = delimiter の最大字数 - 1 |
|
|
for i := 0 to delimiter.Count - 1 do begin |
|
|
if CompareMem( |
|
|
p, PChar( delimiter[ i ] ), Length( delimiter[ i ] ) ) then begin |
|
|
delimited := True; |
|
|
chSize := Length( delimiter[ i ] ); |
|
|
Break; |
|
|
end; |
|
|
end; |
|
|
end; |
|
| 622 |
end else begin |
end else begin |
| 623 |
newMode := Modes( CharMode1[ Byte( p^ ) ] ); |
newMode := Modes( CharMode1[ Byte( p^ ) ] ); |
| 624 |
|
if (p^ = ' ') and (Ord( mode ) >= Ord( ModeWGraph )) then begin |
| 625 |
|
// 今まで日本語で今スペース |
| 626 |
|
// 単語を繋げて後でスペースを詰める |
| 627 |
|
// ※半角カナは通常スペースで区切るだろうから詰めない |
| 628 |
|
newMode := mode; |
| 629 |
|
end; |
| 630 |
|
|
| 631 |
chSize := 1; |
chSize := 1; |
| 632 |
end; |
end; |
| 633 |
|
|
| 634 |
if (mode <> newMode) or delimited then begin |
if mode <> newMode then begin |
| 635 |
|
|
| 636 |
// 文字のタイプが変更された |
// 文字のタイプが変更された |
| 637 |
if mode <> ModeWhite then begin |
SetLength( aWord, p - last ); |
| 638 |
SetLength( aWord, p - last ); |
CopyMemory( PChar( aWord ), last, p - last ); |
|
CopyMemory( PChar( aWord ), last, p - last ); |
|
|
//aWord := Copy( last, 0, p - last ); |
|
|
if wordCount.Find( aWord, idx ) then begin |
|
|
countInfo := TWordCountInfo( wordCount.Objects[ idx ] ); |
|
|
end else begin |
|
|
countInfo := TWordCountInfo.Create; |
|
|
wordCount.AddObject( aWord, countInfo ); |
|
|
end; |
|
|
countInfo.WordCount := countInfo.WordCount + 1; |
|
|
end; |
|
| 639 |
|
|
| 640 |
last := p; |
words.Text := changeMode( aWord, mode ); |
| 641 |
|
|
| 642 |
// 区切りになる文字に遭遇した |
// 単語登録 |
| 643 |
if delimited then begin |
addWord( wordCount, words ); |
|
SetLength( aWord, chSize ); |
|
|
CopyMemory( PChar( aWord ), last, chSize ); |
|
|
//aWord := Copy( last, 0, p - last ); |
|
|
if wordCount.Find( aWord, idx ) then begin |
|
|
countInfo := TWordCountInfo( wordCount.Objects[ idx ] ); |
|
|
end else begin |
|
|
countInfo := TWordCountInfo.Create; |
|
|
wordCount.AddObject( aWord, countInfo ); |
|
|
end; |
|
|
countInfo.WordCount := countInfo.WordCount + 1; |
|
|
last := last + chSize; |
|
|
end; |
|
| 644 |
|
|
| 645 |
|
last := p; |
| 646 |
mode := newMode; |
mode := newMode; |
| 647 |
|
|
| 648 |
end; |
end; |
| 652 |
|
|
| 653 |
if mode <> ModeWhite then begin |
if mode <> ModeWhite then begin |
| 654 |
aWord := Copy( last, 0, p - last ); |
aWord := Copy( last, 0, p - last ); |
| 655 |
if wordCount.Find( aWord, idx ) then begin |
words.Text := changeMode( aWord, mode ); |
| 656 |
countInfo := TWordCountInfo( wordCount.Objects[ idx ] ); |
|
| 657 |
end else begin |
// 単語登録 |
| 658 |
countInfo := TWordCountInfo.Create; |
addWord( wordCount, words ); |
|
wordCount.AddObject( aWord, countInfo ); |
|
|
end; |
|
|
countInfo.WordCount := countInfo.WordCount + 1; |
|
| 659 |
end; |
end; |
| 660 |
finally |
finally |
| 661 |
delimiter.Free; |
words.Free; |
| 662 |
|
wKanjiDelimiter.Free; |
| 663 |
|
wHiraFinalDelimiter.Free; |
| 664 |
|
wHiraDelimiter.Free; |
| 665 |
end; |
end; |
| 666 |
|
|
| 667 |
end; |
end; |