| 50 |
\brief 解析済み単語リスト |
\brief 解析済み単語リスト |
| 51 |
************************************************************} |
************************************************************} |
| 52 |
// TWordCount = class( THashedStringList ) // 激遅 |
// TWordCount = class( THashedStringList ) // 激遅 |
| 53 |
TWordCount = class( TStringList ) // 遅 |
TWordCount = class( TStringList ) |
| 54 |
public |
public |
| 55 |
|
constructor Create; |
| 56 |
destructor Destroy; override; |
destructor Destroy; override; |
| 57 |
end; |
end; |
| 58 |
|
|
| 65 |
{!*********************************************************** |
{!*********************************************************** |
| 66 |
\brief ベイジアンフィルタ |
\brief ベイジアンフィルタ |
| 67 |
************************************************************} |
************************************************************} |
| 68 |
TGikoBayesian = class( THashedStringList ) |
// TGikoBayesian = class( THashedStringList ) // 激遅 |
| 69 |
|
TGikoBayesian = class( TStringList ) |
| 70 |
private |
private |
| 71 |
FFilePath : string; //!< 読み込んだファイルパス |
FFilePath : string; //!< 読み込んだファイルパス |
| 72 |
function GetObject( const name : string ) : TWordInfo; |
function GetObject( const name : string ) : TWordInfo; |
| 156 |
//================================================== |
//================================================== |
| 157 |
|
|
| 158 |
uses |
uses |
| 159 |
SysUtils, Math |
SysUtils, Math; |
|
{$IFDEF BENCHMARK} |
|
|
, Windows, Dialogs |
|
|
{$ENDIF} |
|
|
; |
|
|
|
|
|
{$IFDEF BENCHMARK} |
|
|
var |
|
|
b1, b2, b3, b4, b5, b6, b7, b8, b9, b10 : Int64; // benchmark |
|
|
{$ENDIF} |
|
| 160 |
|
|
| 161 |
const |
const |
| 162 |
GIKO_BAYESIAN_FILE_VERSION = '1.0'; |
GIKO_BAYESIAN_FILE_VERSION = '1.0'; |
| 163 |
kYofKanji : TSysCharSet = [#$80..#$A0, #$E0..#$ff]; |
{ |
| 164 |
|
Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum, |
| 165 |
|
ModeWGraph, ModeWAlpha, ModeWNum, |
| 166 |
|
ModeWHira, ModeWKata, ModeWKanji); |
| 167 |
|
} |
| 168 |
|
{!***********************************************************
\brief Character-class lookup table for single-byte characters.

Indexed by the byte value of a character; CountWord casts the
stored value to its local Modes type instead of running a
case statement for every byte.

Classes stored in the table:
  0 : control characters, space, DEL and bytes with no class
  1 : punctuation / other printable symbols
  2 : ASCII digits '0'..'9'
  3 : ASCII letters 'A'..'Z', 'a'..'z'
  4 : bytes $A5..$DD (half-width katakana range)

NOTE(review): the numeric codes do not line up with the names in
the Modes enumeration as declared in CountWord (there 2 is
ModeAlpha, 3 is ModeHanKana and 4 is ModeNum).  Word splitting in
the visible code compares modes with each other and with
ModeWhite, so the grouping holds, but confirm before any code
relies on a specific named mode for single-byte text.
************************************************************}
CharMode1 : array [ 0..255 ] of Byte =
(
  // $00..$1F : control characters
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // $20..$2F : space, then punctuation
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // $30..$3F : '0'..'9' (all ten digits share one class), then : ; < = > ?
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
  // $40..$5F : '@', 'A'..'Z', then [ \ ] ^ _
  1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
  // $60..$7F : '`', 'a'..'z', then { | } ~ and DEL
  1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,

  // $80..$9F : no class assigned
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // $A0..$DF : $A1..$A4 symbols, $A5..$DD class 4, $DE..$DF symbols
  0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 1,
  // $E0..$FF : no class assigned
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
);
| 188 |
|
|
| 189 |
//************************************************************ |
//************************************************************ |
| 190 |
// misc |
// misc |
| 227 |
//************************************************************ |
//************************************************************ |
| 228 |
// TWordCount class |
// TWordCount class |
| 229 |
//************************************************************ |
//************************************************************ |
| 230 |
|
{!***********************************************************
\brief Creates an empty word list configured for fast lookup.

The list is kept sorted, compares words case-sensitively and
ignores attempts to add a word that is already present.
************************************************************}
constructor TWordCount.Create;
begin

  // Run the ancestor constructor first so that any initialisation done
  // by the base string-list classes is not skipped.
  inherited Create;

  // Duplicates only takes effect on a sorted list; dupIgnore makes adding
  // an existing word a no-op instead of raising or storing a second copy.
  Duplicates := dupIgnore;
  // Words are compared byte-for-byte, without case folding.
  CaseSensitive := True;
  // A sorted TStringList lets IndexOf search by binary search.
  Sorted := True;

end;
| 238 |
|
|
| 239 |
destructor TWordCount.Destroy; |
destructor TWordCount.Destroy; |
| 240 |
var |
var |
| 241 |
i : Integer; |
i : Integer; |
| 259 |
constructor TGikoBayesian.Create; |
constructor TGikoBayesian.Create; |
| 260 |
begin |
begin |
| 261 |
|
|
| 262 |
{$IFDEF BENCHMARK} |
Duplicates := dupIgnore; |
| 263 |
b1:=0; b2:=0; b3:=0; b4:=0; b5:=0; b6:=0; b7:=0; b8:=0; b9:=0; b10:=0; |
CaseSensitive := True; |
| 264 |
{$ENDIF} |
Sorted := True; |
|
|
|
|
Duplicates := dupIgnore; |
|
|
Sorted := True; |
|
| 265 |
|
|
| 266 |
end; |
end; |
| 267 |
|
|
| 324 |
info : TWordInfo; |
info : TWordInfo; |
| 325 |
begin |
begin |
| 326 |
|
|
|
{$IFDEF BENCHMARK} |
|
|
ShowMessage(IntToStr(b1)+'/'+IntToStr(b2)+'/'+IntToStr(b3)+'/'+IntToStr(b4)+ |
|
|
'/'+IntToStr(b5)+'/'+IntToStr(b6)); |
|
|
{$ENDIF} |
|
|
|
|
| 327 |
FFilePath := filePath; |
FFilePath := filePath; |
| 328 |
|
|
| 329 |
sl := TStringList.Create; |
sl := TStringList.Create; |
| 365 |
idx : Integer; |
idx : Integer; |
| 366 |
begin |
begin |
| 367 |
|
|
| 368 |
idx := IndexOf( name ); |
idx := IndexOf( name ); // 激遅 |
| 369 |
if idx < 0 then |
if idx < 0 then |
| 370 |
Result := nil |
Result := nil |
| 371 |
else |
else |
| 397 |
const text : string; |
const text : string; |
| 398 |
wordCount : TWordCount ); |
wordCount : TWordCount ); |
| 399 |
type |
type |
| 400 |
Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeNum, ModeHanKana, |
Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum, |
| 401 |
ModeWGraph, ModeWAlpha, ModeWNum, |
ModeWGraph, ModeWAlpha, ModeWNum, |
| 402 |
ModeWHira, ModeWKata, ModeWKanji); |
ModeWHira, ModeWKata, ModeWKanji); |
| 403 |
var |
var |
| 410 |
delimited : Boolean; |
delimited : Boolean; |
| 411 |
i, idx : Integer; |
i, idx : Integer; |
| 412 |
countInfo : TWordCountInfo; |
countInfo : TWordCountInfo; |
|
{$IFDEF BENCHMARK} |
|
|
t1, t2 : Int64; |
|
|
{$ENDIF} |
|
| 413 |
const |
const |
| 414 |
KAKUJOSI = 'を' + #10 + 'に' + #10 + 'が' + #10 + 'と' + #10 + 'から' + |
KAKUJOSI = 'を' + #10 + 'に' + #10 + 'が' + #10 + 'と' + #10 + 'から' + |
| 415 |
#10 + 'で' + #10 + 'へ' + #10 + 'より' + #10 + 'まで'; |
#10 + 'で' + #10 + 'へ' + #10 + 'より' + #10 + 'まで'; |
| 417 |
|
|
| 418 |
delimiter := TStringList.Create; |
delimiter := TStringList.Create; |
| 419 |
try |
try |
|
//*** 速度テスト中 |
|
|
wordCount.Duplicates := dupIgnore; |
|
|
wordCount.CaseSensitive := True; |
|
|
wordCount.Capacity := 1000; |
|
|
wordCount.Sorted := True; |
|
|
//*** |
|
|
|
|
| 420 |
mode := ModeWhite; |
mode := ModeWhite; |
| 421 |
delimiter.Text := KAKUJOSI; |
delimiter.Text := KAKUJOSI; |
|
SetLength( aWord, 256 ); |
|
| 422 |
p := PChar( text ); |
p := PChar( text ); |
| 423 |
tail := p + Length( text ); |
tail := p + Length( text ); |
| 424 |
last := p; |
last := p; |
| 425 |
|
|
| 426 |
while p < tail do begin |
while p < tail do begin |
|
{$IFDEF BENCHMARK} |
|
|
QueryPerformanceCounter( t1 ); |
|
|
{$ENDIF} |
|
| 427 |
delimited := False; |
delimited := False; |
| 428 |
// 文字のタイプを判別 |
// 文字のタイプを判別 |
| 429 |
// ※句読点は ModeGraph になるので個別に対応しなくてもいい |
// ※句読点は ModeGraph になるので個別に対応しなくてもいい |
| 430 |
if p^ in kYofKanji then begin |
if Byte(Byte( p^ ) - $a1) < $5e then begin |
| 431 |
if p + 1 < tail then begin |
if p + 1 < tail then begin |
| 432 |
ch := (PByte( p )^ shl 8) or PByte( p + 1 )^; |
ch := (PByte( p )^ shl 8) or PByte( p + 1 )^; |
| 433 |
case ch of |
case ch of |
| 457 |
end; |
end; |
| 458 |
end; |
end; |
| 459 |
end else begin |
end else begin |
| 460 |
case p^ of |
// ↑↓変わらず |
| 461 |
#$0..#$20, #$7f: newMode := ModeWhite; |
newMode := Modes( CharMode1[ Byte( p^ ) ] ); |
|
'0'..'9': newMode := ModeNum; |
|
|
'a'..'z', 'A'..'Z': newMode := ModeAlpha; |
|
|
#$A6..#$DD: newMode := ModeHanKana; |
|
|
else newMode := ModeGraph; |
|
|
end; |
|
| 462 |
|
|
| 463 |
chSize := 1; |
chSize := 1; |
| 464 |
end; |
end; |
|
{$IFDEF BENCHMARK} |
|
|
QueryPerformanceCounter( t2 ); b1 := b1 + (t2 - t1); |
|
|
{$ENDIF} |
|
| 465 |
|
|
| 466 |
if (mode <> newMode) or delimited then begin |
if (mode <> newMode) or delimited then begin |
| 467 |
|
|
| 468 |
// 文字のタイプが変更された |
// 文字のタイプが変更された |
| 469 |
// もしくは区切りになる文字に遭遇した |
// もしくは区切りになる文字に遭遇した |
| 470 |
if mode <> ModeWhite then begin |
if mode <> ModeWhite then begin |
| 471 |
{$IFDEF BENCHMARK} |
SetLength( aWord, p - last ); |
| 472 |
QueryPerformanceCounter( t1 ); |
CopyMemory( PChar( aWord ), last, p - last ); |
| 473 |
{$ENDIF} |
idx := wordCount.IndexOf( aWord ); // 遅 |
|
aWord := Copy( last, 0, p - last ); // 激遅 |
|
|
// SetLength( aWord, p - last ); |
|
|
// CopyMemory( PChar( aWord ), last, p - last ); |
|
|
{$IFDEF BENCHMARK} |
|
|
QueryPerformanceCounter( t2 ); b2 := b2 + (t2 - t1); |
|
|
{$ENDIF} |
|
|
idx := wordCount.IndexOf( aWord ); // 激遅 |
|
|
{$IFDEF BENCHMARK} |
|
|
QueryPerformanceCounter( t1 ); b3 := b3 + (t1 - t2); |
|
|
{$ENDIF} |
|
| 474 |
if idx < 0 then begin |
if idx < 0 then begin |
| 475 |
countInfo := TWordCountInfo.Create; |
countInfo := TWordCountInfo.Create; |
| 476 |
wordCount.AddObject( aWord, countInfo ); |
wordCount.AddObject( aWord, countInfo ); |
| 478 |
countInfo := TWordCountInfo( wordCount.Objects[ idx ] ); |
countInfo := TWordCountInfo( wordCount.Objects[ idx ] ); |
| 479 |
end; |
end; |
| 480 |
countInfo.WordCount := countInfo.WordCount + 1; |
countInfo.WordCount := countInfo.WordCount + 1; |
|
{$IFDEF BENCHMARK} |
|
|
QueryPerformanceCounter( t2 ); b4 := b4 + (t2 - t1); |
|
|
{$ENDIF} |
|
| 481 |
end; |
end; |
| 482 |
|
|
| 483 |
last := p; |
last := p; |
| 646 |
wordCount : TWordCount; |
wordCount : TWordCount; |
| 647 |
algorithm : TGikoBayesianAlgorithm = gbaGaryRonbinson |
algorithm : TGikoBayesianAlgorithm = gbaGaryRonbinson |
| 648 |
) : Extended; |
) : Extended; |
|
{$IFDEF BENCHMARK} |
|
|
var |
|
|
t1, t2 : Int64; |
|
|
{$ENDIF} |
|
| 649 |
begin |
begin |
| 650 |
|
|
|
{$IFDEF BENCHMARK} |
|
|
QueryPerformanceCounter( t1 ); |
|
|
{$ENDIF} |
|
| 651 |
CountWord( text, wordCount ); |
CountWord( text, wordCount ); |
|
{$IFDEF BENCHMARK} |
|
|
QueryPerformanceCounter( t2 ); b5 := b5 + (t2 - t1); |
|
|
{$ENDIF} |
|
| 652 |
case algorithm of |
case algorithm of |
| 653 |
gbaPaulGraham: Result := CalcPaulGraham( wordCount ); |
gbaPaulGraham: Result := CalcPaulGraham( wordCount ); |
| 654 |
gbaGaryRonbinson: Result := CalcGaryRobinson( wordCount ); |
gbaGaryRonbinson: Result := CalcGaryRobinson( wordCount ); |
| 655 |
else Result := 0; |
else Result := 0; |
| 656 |
end; |
end; |
|
{$IFDEF BENCHMARK} |
|
|
QueryPerformanceCounter( t1 ); b6 := b6 + (t1 - t2); |
|
|
{$ENDIF} |
|
| 657 |
|
|
| 658 |
end; |
end; |
| 659 |
|
|
| 667 |
aWord : string; |
aWord : string; |
| 668 |
wordinfo : TWordInfo; |
wordinfo : TWordInfo; |
| 669 |
countinfo : TWordCountInfo; |
countinfo : TWordCountInfo; |
| 670 |
i : Integer; |
i : Integer; |
| 671 |
begin |
begin |
| 672 |
|
|
| 673 |
for i := 0 to wordCount.Count - 1 do begin |
for i := 0 to wordCount.Count - 1 do begin |
| 674 |
aWord := wordCount[ i ]; |
aWord := wordCount[ i ]; |
| 675 |
wordinfo := Objects[ aWord ]; |
wordinfo := Objects[ aWord ]; |
| 676 |
|
countinfo := TWordCountInfo( wordCount.Objects[ i ] ); |
| 677 |
if wordinfo = nil then begin |
if wordinfo = nil then begin |
| 678 |
wordinfo := TWordInfo.Create; |
wordinfo := TWordInfo.Create; |
| 679 |
Objects[ aWord ] := wordinfo; |
Objects[ aWord ] := wordinfo; |
| 680 |
end; |
end; |
| 681 |
|
|
|
countinfo := TWordCountInfo( wordCount.Objects[ i ] ); |
|
| 682 |
if isImportant then begin |
if isImportant then begin |
| 683 |
wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount; |
wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount; |
| 684 |
wordinfo.ImportantText := wordinfo.ImportantText + 1; |
wordinfo.ImportantText := wordinfo.ImportantText + 1; |