| 154 |
//================================================== |
//================================================== |
| 155 |
|
|
| 156 |
uses |
uses |
| 157 |
SysUtils, Math; |
SysUtils, Math |
| 158 |
|
{$IFDEF BENCHMARK} |
| 159 |
|
, Windows, Dialogs |
| 160 |
|
{$ENDIF} |
| 161 |
|
; |
| 162 |
|
|
| 163 |
|
{$IFDEF BENCHMARK} |
| 164 |
|
var |
| 165 |
|
b1, b2, b3, b4, b5, b6, b7, b8, b9, b10 : Int64; // benchmark |
| 166 |
|
{$ENDIF} |
| 167 |
|
|
| 168 |
const |
const |
| 169 |
GIKO_BAYESIAN_FILE_VERSION = '1.0'; |
GIKO_BAYESIAN_FILE_VERSION = '1.0'; |
| 233 |
constructor TGikoBayesian.Create; |
constructor TGikoBayesian.Create; |
| 234 |
begin |
begin |
| 235 |
|
|
| 236 |
|
{$IFDEF BENCHMARK} |
| 237 |
|
b1:=0; b2:=0; b3:=0; b4:=0; b5:=0; b6:=0; b7:=0; b8:=0; b9:=0; b10:=0; |
| 238 |
|
{$ENDIF} |
| 239 |
|
|
| 240 |
Duplicates := dupIgnore; |
Duplicates := dupIgnore; |
| 241 |
Sorted := True; |
Sorted := True; |
| 242 |
|
|
| 267 |
info : TWordInfo; |
info : TWordInfo; |
| 268 |
begin |
begin |
| 269 |
|
|
| 270 |
|
FFilePath := filePath; |
| 271 |
|
|
| 272 |
if not FileExists( filePath ) then |
if not FileExists( filePath ) then |
| 273 |
Exit; |
Exit; |
| 274 |
|
|
| 275 |
sl := TStringList.Create; |
sl := TStringList.Create; |
| 276 |
try |
try |
| 277 |
sl.LoadFromFile( filePath ); |
sl.LoadFromFile( filePath ); |
| 301 |
info : TWordInfo; |
info : TWordInfo; |
| 302 |
begin |
begin |
| 303 |
|
|
| 304 |
|
{$IFDEF BENCHMARK} |
| 305 |
|
ShowMessage(IntToStr(b1)+'/'+IntToStr(b2)+'/'+IntToStr(b3)+'/'+IntToStr(b4)+ |
| 306 |
|
'/'+IntToStr(b5)+'/'+IntToStr(b6)); |
| 307 |
|
{$ENDIF} |
| 308 |
|
|
| 309 |
|
FFilePath := filePath; |
| 310 |
|
|
| 311 |
sl := TStringList.Create; |
sl := TStringList.Create; |
| 312 |
try |
try |
| 313 |
sl.BeginUpdate; |
sl.BeginUpdate; |
| 392 |
delimited : Boolean; |
delimited : Boolean; |
| 393 |
i, idx : Integer; |
i, idx : Integer; |
| 394 |
countInfo : TWordCountInfo; |
countInfo : TWordCountInfo; |
| 395 |
|
{$IFDEF BENCHMARK} |
| 396 |
|
t1, t2 : Int64; |
| 397 |
|
{$ENDIF} |
| 398 |
const |
const |
| 399 |
KAKUJOSI = 'を' + #10 + 'に' + #10 + 'が' + #10 + 'と' + #10 + 'から' + |
KAKUJOSI = 'を' + #10 + 'に' + #10 + 'が' + #10 + 'と' + #10 + 'から' + |
| 400 |
#10 + 'で' + #10 + 'へ' + #10 + 'より' + #10 + 'まで'; |
#10 + 'で' + #10 + 'へ' + #10 + 'より' + #10 + 'まで'; |
| 417 |
last := p; |
last := p; |
| 418 |
|
|
| 419 |
while p < tail do begin |
while p < tail do begin |
| 420 |
|
{$IFDEF BENCHMARK} |
| 421 |
|
QueryPerformanceCounter( t1 ); |
| 422 |
|
{$ENDIF} |
| 423 |
delimited := False; |
delimited := False; |
| 424 |
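// Determine the character type |
// Determine the character type |
| 425 |
// Note: punctuation falls under ModeGraph, so it needs no separate handling |
// Note: punctuation falls under ModeGraph, so it needs no separate handling |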
| 463 |
|
|
| 464 |
chSize := 1; |
chSize := 1; |
| 465 |
end; |
end; |
| 466 |
|
{$IFDEF BENCHMARK} |
| 467 |
|
QueryPerformanceCounter( t2 ); b1 := b1 + (t2 - t1); |
| 468 |
|
{$ENDIF} |
| 469 |
|
|
| 470 |
if (mode <> newMode) or delimited then begin |
if (mode <> newMode) or delimited then begin |
| 471 |
|
|
| 472 |
// The character type has changed, |
// The character type has changed, |
| 473 |
// or a delimiter character was encountered |
// or a delimiter character was encountered |
| 474 |
if mode <> ModeWhite then begin |
if mode <> ModeWhite then begin |
| 475 |
|
{$IFDEF BENCHMARK} |
| 476 |
|
QueryPerformanceCounter( t1 ); |
| 477 |
|
{$ENDIF} |
| 478 |
aWord := Copy( last, 0, p - last ); // 激遅 |
aWord := Copy( last, 0, p - last ); // 激遅 |
| 479 |
// SetLength( aWord, p - last ); |
// SetLength( aWord, p - last ); |
| 480 |
// CopyMemory( PChar( aWord ), last, p - last ); |
// CopyMemory( PChar( aWord ), last, p - last ); |
| 481 |
|
{$IFDEF BENCHMARK} |
| 482 |
|
QueryPerformanceCounter( t2 ); b2 := b2 + (t2 - t1); |
| 483 |
|
{$ENDIF} |
| 484 |
idx := wordCount.IndexOf( aWord ); // 激遅 |
idx := wordCount.IndexOf( aWord ); // 激遅 |
| 485 |
|
{$IFDEF BENCHMARK} |
| 486 |
|
QueryPerformanceCounter( t1 ); b3 := b3 + (t1 - t2); |
| 487 |
|
{$ENDIF} |
| 488 |
if idx < 0 then begin |
if idx < 0 then begin |
| 489 |
countInfo := TWordCountInfo.Create; |
countInfo := TWordCountInfo.Create; |
| 490 |
wordCount.AddObject( aWord, countInfo ); |
wordCount.AddObject( aWord, countInfo ); |
| 492 |
countInfo := TWordCountInfo( wordCount.Objects[ idx ] ); |
countInfo := TWordCountInfo( wordCount.Objects[ idx ] ); |
| 493 |
end; |
end; |
| 494 |
countInfo.WordCount := countInfo.WordCount + 1; |
countInfo.WordCount := countInfo.WordCount + 1; |
| 495 |
|
{$IFDEF BENCHMARK} |
| 496 |
|
QueryPerformanceCounter( t2 ); b4 := b4 + (t2 - t1); |
| 497 |
|
{$ENDIF} |
| 498 |
end; |
end; |
| 499 |
|
|
| 500 |
last := p; |
last := p; |
| 663 |
wordCount : TWordCount; |
wordCount : TWordCount; |
| 664 |
algorithm : TGikoBayesianAlgorithm = gbaGaryRonbinson |
algorithm : TGikoBayesianAlgorithm = gbaGaryRonbinson |
| 665 |
) : Extended; |
) : Extended; |
| 666 |
|
{$IFDEF BENCHMARK} |
| 667 |
|
var |
| 668 |
|
t1, t2 : Int64; |
| 669 |
|
{$ENDIF} |
| 670 |
begin |
begin |
| 671 |
|
|
| 672 |
|
{$IFDEF BENCHMARK} |
| 673 |
|
QueryPerformanceCounter( t1 ); |
| 674 |
|
{$ENDIF} |
| 675 |
CountWord( text, wordCount ); |
CountWord( text, wordCount ); |
| 676 |
|
{$IFDEF BENCHMARK} |
| 677 |
|
QueryPerformanceCounter( t2 ); b5 := b5 + (t2 - t1); |
| 678 |
|
{$ENDIF} |
| 679 |
case algorithm of |
case algorithm of |
| 680 |
gbaPaulGraham: Result := CalcPaulGraham( wordCount ); |
gbaPaulGraham: Result := CalcPaulGraham( wordCount ); |
| 681 |
gbaGaryRonbinson: Result := CalcGaryRobinson( wordCount ); |
gbaGaryRonbinson: Result := CalcGaryRobinson( wordCount ); |
| 682 |
else Result := 0; |
else Result := 0; |
| 683 |
end; |
end; |
| 684 |
|
{$IFDEF BENCHMARK} |
| 685 |
|
QueryPerformanceCounter( t1 ); b6 := b6 + (t1 - t2); |
| 686 |
|
{$ENDIF} |
| 687 |
|
|
| 688 |
end; |
end; |
| 689 |
|
|