Develop and Download Open Source Software

Browse CVS Repository

Contents of /gikonavigoeson/gikonavi/GikoBayesian.pas

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.2 - (show annotations) (download) (as text)
Thu Oct 21 01:20:34 2004 UTC (19 years, 6 months ago) by yoffy
Branch: MAIN
Changes since 1.1: +59 -3 lines
File MIME type: text/x-pascal
- 正しくファイル名がセットされないバグを修正。
- 初回学習かどうかの判定を誤っていたバグを修正。

1 unit GikoBayesian;
2
3 {!
4 \file GikoBayesian.pas
5 \brief ???ゃ?吾?≪?潟???c????/span>
6
7 $Id: GikoBayesian.pas,v 1.1 2004/10/20 18:25:00 yoffy Exp $
8 }
9
10 interface
11
12 //==================================================
13 uses
14 //==================================================
15 Classes, IniFiles;
16
17 //==================================================
18 type
19 //==================================================
20
21 {!***********************************************************
22 \brief ??茯???????????/span>
23 ************************************************************}
24 TWordInfo = class( TObject )
25 private
26 FNormalWord : Integer; //!< ??絽吾????茯????????糸?眼????????/span>
27 FImportantWord : Integer; //!< 羈?????茯????????糸?眼????????/span>
28 FNormalText : Integer; //!< ??絽吾????茯??????????障??????????腴?????/span>
29 FImportantText : Integer; //!< 羈?????茯??????????障??????????腴?????/span>
30
31 public
32 property NormalWord : Integer read FNormalWord write FNormalWord;
33 property ImportantWord : Integer read FImportantWord write FImportantWord;
34 property NormalText : Integer read FNormalText write FNormalText;
35 property ImportantText : Integer read FImportantText write FImportantText;
36 end;
37
38 {!***********************************************************
39 \brief 茹f??羝??水??茯???????????/span>
40 ************************************************************}
41 TWordCountInfo = class( TObject )
42 private
43 FWordCount : Integer; //!< ??茯???/span>
44
45 public
46 property WordCount : Integer read FWordCount write FWordCount;
47 end;
48
49 {!***********************************************************
50 \brief 茹f??羝??水??茯????鴻??
51 ************************************************************}
52 // TWordCount = class( THashedStringList ) // 羶???
53 TWordCount = class( TStringList ) // ??
54 public
55 destructor Destroy; override;
56 end;
57
58 {!***********************************************************
59 \brief ???c???帥?≪???眼???冴??
60 ************************************************************}
61 TGikoBayesianAlgorithm =
62 (gbaPaulGraham, gbaGaryRonbinson{, gbaGaryRonbinsonFisher});
63
64 {!***********************************************************
65 \brief ???ゃ?吾?≪?潟???c????/span>
66 ************************************************************}
67 TGikoBayesian = class( THashedStringList )
68 private
69 FFilePath : string; //!< 茯??粋昭???????<?ゃ??????/span>
70 function GetObject( const name : string ) : TWordInfo;
71 procedure SetObject( const name : string; value : TWordInfo );
72
73 public
74 constructor Create;
75 destructor Destroy; override;
76
77 //! ???<?ゃ??????絖??絮ユ???茯??水?冴???障??
78 procedure LoadFromFile( const filePath : string );
79
80 //! ???<?ゃ?????膺?絮ユ???篆?絖????障??
81 procedure SaveToFile( const filePath : string );
82
83 //! ???<?ゃ?????膺?絮ユ???篆?絖????障??
84 procedure Save;
85
86 //! ??茯???????????宴????緇????障??
87 property Objects[ const name : string ] : TWordInfo
88 read GetObject write SetObject; default;
89
90 //! ??腴??????障??????茯????????潟?????障??
91 procedure CountWord(
92 const text : string;
93 wordCount : TWordCount );
94
95 {!
96 \brief Paul Graham 羈????冴?ャ??????腴???絵??墾??羆阪????障??
97 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
98 }
99 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
100
101 {!
102 \brief GaryRobinson 羈????冴?ャ??????腴???絵??墾??羆阪????障??
103 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
104 }
105 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
106
107 // function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
108
109 {!
110 \brief ??腴???茹f??
111 \param text 茹f????????腴?
112 \param wordCount 茹f??????????茯????鴻????菴???
113 \param algorithm 羈???墾??浦絎??????????≪???眼???冴??????絎????障??
114 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
115
116 CountWord ? Calcxxxxx ???障???????茵??????????с????
117 }
118 function Parse(
119 const text : string;
120 wordCount : TWordCount;
121 algorithm : TGikoBayesianAlgorithm = gbaGaryRonbinson
122 ) : Extended;
123
124 {!
125 \brief 絖??????
126 \param wordCount Parse ?цВ??????????茯????鴻??
127 \param isImportant 羈??????鴻????腴???????????????? True
128 }
129 procedure Learn(
130 wordCount : TWordCount;
131 isImportant : Boolean );
132
133 {!
134 \brief 絖??腟?????綽?????
135 \param wordCount Parse ?цВ??????????茯????鴻??
136 \param isImportant 羈??????鴻????腴???????????????????????? True
137 \warning 絖??羝??帥????腴???????????∈茯??堺?ャ?障??????<br>
138 Learn ????????????腴??? isImportant ???????c????????腴???
139 Forget ?????????若?帥???若?鴻???贋?????障????<br>
140 絖??羝??帥????????????????????????????????
141
142 ???????膺?腟??????????≪????????с???????障??????<br>
143 wordCount ??緇?????腴? (Parse ? text 綣??? ???膺?腟??????帥?????≪???障????<br><br>
144
145 筝祉??絵????腴?????羈?????腴????????帥????????? Forget -> Learn ?????т戎?????障????
146 }
147 procedure Forget(
148 wordCount : TWordCount;
149 isImportant : Boolean );
150 end;
151
152 //==================================================
153 implementation
154 //==================================================
155
156 uses
157 SysUtils, Math
158 {$IFDEF BENCHMARK}
159 , Windows, Dialogs
160 {$ENDIF}
161 ;
162
163 {$IFDEF BENCHMARK}
164 var
165 b1, b2, b3, b4, b5, b6, b7, b8, b9, b10 : Int64; // benchmark
166 {$ENDIF}
167
168 const
169 GIKO_BAYESIAN_FILE_VERSION = '1.0';
170 kYofKanji : TSysCharSet = [#$80..#$A0, #$E0..#$ff];
171
172 //************************************************************
173 // misc
174 //************************************************************
175
176 //==============================
177 // RemoveToken
178 //==============================
179 function RemoveToken(var s: string;const delimiter: string): string;
180 var
181 p: Integer;
182 begin
183 p := AnsiPos(delimiter, s);
184 if p = 0 then
185 Result := s
186 else
187 Result := Copy(s, 1, p - 1);
188 s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
189 end;
190
191 //==============================
192 // AbsSort
193 //==============================
194 function AbsSort( p1, p2 : Pointer ) : Integer;
195 var
196 v1, v2 : Single;
197 begin
198
199 v1 := Abs( Single( p1 ) - 0.5 );
200 v2 := Abs( Single( p2 ) - 0.5 );
201 if v1 > v2 then
202 Result := -1
203 else if v1 = v2 then
204 Result := 0
205 else
206 Result := 1;
207
208 end;
209
210 //************************************************************
211 // TWordCount class
212 //************************************************************
213 destructor TWordCount.Destroy;
214 var
215 i : Integer;
216 begin
217
218 for i := Count - 1 downto 0 do
219 if Objects[ i ] <> nil then
220 Objects[ i ].Free;
221
222 inherited;
223
224 end;
225
226 //************************************************************
227 // TGikoBayesian class
228 //************************************************************
229
230 //==============================
231 // Create
232 //==============================
233 constructor TGikoBayesian.Create;
234 begin
235
236 {$IFDEF BENCHMARK}
237 b1:=0; b2:=0; b3:=0; b4:=0; b5:=0; b6:=0; b7:=0; b8:=0; b9:=0; b10:=0;
238 {$ENDIF}
239
240 Duplicates := dupIgnore;
241 Sorted := True;
242
243 end;
244
245 //==============================
246 // Destroy
247 //==============================
248 destructor TGikoBayesian.Destroy;
249 var
250 i : Integer;
251 begin
252
253 for i := Count - 1 downto 0 do
254 if inherited Objects[ i ] <> nil then
255 inherited Objects[ i ].Free;
256
257 inherited;
258
259 end;
260
261 procedure TGikoBayesian.LoadFromFile( const filePath : string );
262 var
263 i : Integer;
264 sl : TStringList;
265 s : string;
266 name : string;
267 info : TWordInfo;
268 begin
269
270 FFilePath := filePath;
271
272 if not FileExists( filePath ) then
273 Exit;
274
275 sl := TStringList.Create;
276 try
277 sl.LoadFromFile( filePath );
278
279 for i := 1 to sl.Count - 1 do begin
280 s := sl[ i ];
281 name := RemoveToken( s, #1 );
282 info := TWordInfo.Create;
283 info.NormalWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
284 info.ImportantWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
285 info.NormalText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
286 info.ImportantText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
287
288 AddObject( name, info );
289 end;
290 finally
291 sl.Free;
292 end;
293
294 end;
295
296 procedure TGikoBayesian.SaveToFile( const filePath : string );
297 var
298 i : Integer;
299 sl : TStringList;
300 s : string;
301 info : TWordInfo;
302 begin
303
304 {$IFDEF BENCHMARK}
305 ShowMessage(IntToStr(b1)+'/'+IntToStr(b2)+'/'+IntToStr(b3)+'/'+IntToStr(b4)+
306 '/'+IntToStr(b5)+'/'+IntToStr(b6));
307 {$ENDIF}
308
309 FFilePath := filePath;
310
311 sl := TStringList.Create;
312 try
313 sl.BeginUpdate;
314 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
315
316 for i := 0 to Count - 1 do begin
317 info := TWordInfo( inherited Objects[ i ] );
318 s := Strings[ i ] + #1
319 + Format('%x', [info.NormalWord]) + #1
320 + Format('%x', [info.ImportantWord]) + #1
321 + Format('%x', [info.NormalText]) + #1
322 + Format('%x', [info.ImportantText]);
323
324 sl.Add(s);
325 end;
326 sl.EndUpdate;
327 sl.SaveToFile( filePath );
328 finally
329 sl.Free;
330 end;
331
332 end;
333
334 procedure TGikoBayesian.Save;
335 begin
336
337 if FFilePath <> '' then
338 SaveToFile( FFilePath );
339
340 end;
341
342 //==============================
343 // GetObject
344 //==============================
345 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
346 var
347 idx : Integer;
348 begin
349
350 idx := IndexOf( name );
351 if idx < 0 then
352 Result := nil
353 else
354 Result := TWordInfo( inherited Objects[ idx ] );
355
356 end;
357
358 //==============================
359 // SetObject
360 //==============================
361 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
362 var
363 idx : Integer;
364 begin
365
366 idx := IndexOf( name );
367 if idx < 0 then
368 AddObject( name, value )
369 else
370 inherited Objects[ idx ] := value;
371
372 end;
373
374
375 //==============================
376 // CountWord
377 //==============================
378 procedure TGikoBayesian.CountWord(
379 const text : string;
380 wordCount : TWordCount );
381 type
382 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeNum, ModeHanKana,
383 ModeWGraph, ModeWAlpha, ModeWNum,
384 ModeWHira, ModeWKata, ModeWKanji);
385 var
386 p, tail, last : PChar;
387 mode, newMode : Modes;
388 aWord : string;
389 ch : Longword;
390 chSize : Integer;
391 delimiter : TStringList;
392 delimited : Boolean;
393 i, idx : Integer;
394 countInfo : TWordCountInfo;
395 {$IFDEF BENCHMARK}
396 t1, t2 : Int64;
397 {$ENDIF}
398 const
399 KAKUJOSI = '??' + #10 + '??#39; + #10 + '??' + #10 + '??#39; + #10 + '????' +
400 #10 + '??#39; + #10 + '??#39; + #10 + '????' + #10 + '?障??#39;;
401 begin
402
403 delimiter := TStringList.Create;
404 try
405 //*** ??綺????鴻??筝?/span>
406 wordCount.Duplicates := dupIgnore;
407 wordCount.CaseSensitive := True;
408 wordCount.Capacity := 1000;
409 wordCount.Sorted := True;
410 //***
411
412 mode := ModeWhite;
413 delimiter.Text := KAKUJOSI;
414 SetLength( aWord, 256 );
415 p := PChar( text );
416 tail := p + Length( text );
417 last := p;
418
419 while p < tail do begin
420 {$IFDEF BENCHMARK}
421 QueryPerformanceCounter( t1 );
422 {$ENDIF}
423 delimited := False;
424 // ??絖????帥?ゃ?????ゅ??/span>
425 // ?糸???鴻? ModeGraph ?????????у???ャ???綽??????????????
426 if p^ in kYofKanji then begin
427 if p + 1 < tail then begin
428 ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
429 case ch of
430 $8140: newMode := ModeWhite;
431 $8141..$824e: newMode := ModeWGraph;
432 $824f..$8258: newMode := ModeWNum;
433 $8260..$829a: newMode := ModeWAlpha;
434 $829f..$82f1: newMode := ModeWHira;
435 $8340..$8396: newMode := ModeWKata;
436 else newMode := ModeWKanji;
437 end;
438 end else begin
439 newMode := ModeWhite;
440 end;
441
442 chSize := 2;
443
444 // ?阪????????????絖?????????罎??祉????
445 if p + 3 < tail then begin // 3 = delimiter ????紊у?? - 1
446 for i := 0 to delimiter.Count - 1 do begin
447 if CompareMem(
448 p, PChar( delimiter[ i ] ), Length( delimiter[ i ] ) ) then begin
449 delimited := True;
450 chSize := Length( delimiter[ i ] );
451 Break;
452 end;
453 end;
454 end;
455 end else begin
456 case p^ of
457 #$0..#$20, #$7f: newMode := ModeWhite;
458 '0'..'9': newMode := ModeNum;
459 'a'..'z', 'A'..'Z': newMode := ModeAlpha;
460 #$A6..#$DD: newMode := ModeHanKana;
461 else newMode := ModeGraph;
462 end;
463
464 chSize := 1;
465 end;
466 {$IFDEF BENCHMARK}
467 QueryPerformanceCounter( t2 ); b1 := b1 + (t2 - t1);
468 {$ENDIF}
469
470 if (mode <> newMode) or delimited then begin
471
472 // ??絖????帥?ゃ????紊??眼??????
473 // ????????阪????????????絖???????????
474 if mode <> ModeWhite then begin
475 {$IFDEF BENCHMARK}
476 QueryPerformanceCounter( t1 );
477 {$ENDIF}
478 aWord := Copy( last, 0, p - last ); // 羶???
479 // SetLength( aWord, p - last );
480 // CopyMemory( PChar( aWord ), last, p - last );
481 {$IFDEF BENCHMARK}
482 QueryPerformanceCounter( t2 ); b2 := b2 + (t2 - t1);
483 {$ENDIF}
484 idx := wordCount.IndexOf( aWord ); // 羶???
485 {$IFDEF BENCHMARK}
486 QueryPerformanceCounter( t1 ); b3 := b3 + (t1 - t2);
487 {$ENDIF}
488 if idx < 0 then begin
489 countInfo := TWordCountInfo.Create;
490 wordCount.AddObject( aWord, countInfo );
491 end else begin
492 countInfo := TWordCountInfo( wordCount.Objects[ idx ] );
493 end;
494 countInfo.WordCount := countInfo.WordCount + 1;
495 {$IFDEF BENCHMARK}
496 QueryPerformanceCounter( t2 ); b4 := b4 + (t2 - t1);
497 {$ENDIF}
498 end;
499
500 last := p;
501 mode := newMode;
502
503 end;
504
505 p := p + chSize;
506 end; // while
507
508 if mode <> ModeWhite then begin
509 aWord := Copy( last, 0, p - last );
510 idx := wordCount.IndexOf( aWord );
511 if idx < 0 then begin
512 countInfo := TWordCountInfo.Create;
513 wordCount.AddObject( aWord, countInfo );
514 end else begin
515 countInfo := TWordCountInfo( wordCount.Objects[ idx ] );
516 end;
517 countInfo.WordCount := countInfo.WordCount + 1;
518 end;
519 finally
520 delimiter.Free;
521 end;
522
523 end;
524
525 //==============================
526 // CalcPaulGraham
527 //==============================
528 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
529
530 function p( const aWord : string ) : Single;
531 var
532 info : TWordInfo;
533 begin
534 info := Objects[ aWord ];
535 if info = nil then
536 Result := 0.4
537 else if info.NormalWord = 0 then
538 Result := 0.99
539 else if info.ImportantWord = 0 then
540 Result := 0.01
541 else
542 Result := ( info.ImportantWord / info.ImportantText ) /
543 ((info.NormalWord * 2 / info.NormalText ) +
544 (info.ImportantWord / info.ImportantText));
545 end;
546
547 var
548 s, q : Extended;
549 i : Integer;
550 narray : TList;
551 const
552 SAMPLE_COUNT = 15;
553 begin
554
555 Result := 1;
556 if wordCount.Count = 0 then
557 Exit;
558
559 narray := TList.Create;
560 try
561 for i := 0 to wordCount.Count - 1 do begin
562 narray.Add( Pointer( p( wordCount[ i ] ) ) );
563 end;
564
565 narray.Sort( AbsSort );
566
567 s := 1;
568 q := 1;
569 i := min( SAMPLE_COUNT, narray.Count );
570 while i > 0 do begin
571 Dec( i );
572 s := s * Single( narray[ i ] );
573 q := q * (1 - Single( narray[ i ] ));
574 end;
575
576 Result := s / (s + q);
577 finally
578 narray.Free;
579 end;
580
581 end;
582
583 //==============================
584 // CalcGaryRobinson
585 //==============================
586 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
587
588 function p( const aWord : string ) : Single;
589 var
590 info : TWordInfo;
591 begin
592 info := Objects[ aWord ];
593 if info = nil then
594 Result := 0.415
595 else if info.ImportantWord = 0 then
596 Result := 0.0001
597 else if info.NormalWord = 0 then
598 Result := 0.9999
599 else
600 Result := ( info.ImportantWord / info.ImportantText ) /
601 ((info.NormalWord / info.NormalText ) +
602 (info.ImportantWord / info.ImportantText));
603 end;
604
605 function f( cnt : Integer; n, mean : Single ) : Extended;
606 const
607 k = 0.00001;
608 begin
609 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
610 end;
611
612 var
613 n : Extended;
614 narray : array of Single;
615 mean : Extended;
616 countInfo : TWordCountInfo;
617 i : Integer;
618 normal : Extended;
619 important : Extended;
620 cnt : Extended;
621 begin
622
623 if wordCount.Count = 0 then begin
624 Result := 1;
625 Exit;
626 end;
627
628 SetLength( narray, wordCount.Count );
629 mean := 0;
630 for i := 0 to wordCount.Count - 1 do begin
631 n := p( wordCount[ i ] );
632 narray[ i ] := n;
633 mean := mean + n;
634 end;
635 mean := mean / wordCount.Count;
636
637 cnt := 0;
638 normal := 1;
639 important := 1;
640 for i := 0 to wordCount.Count - 1 do begin
641 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
642 n := f( countInfo.WordCount, narray[ i ], mean );
643 normal := normal * n;
644 important := important * (1 - n);
645 if countInfo <> nil then
646 cnt := cnt + countInfo.WordCount;
647 end;
648 if cnt = 0 then
649 cnt := 1;
650 normal := 1 - Exp( Ln( normal ) * (1 / cnt) );
651 important := 1 - Exp( Ln( important ) * (1 / cnt) );
652
653 n := (important - normal+ 0.00001) / (important + normal + 0.00001);
654 Result := (1 + n) / 2;
655
656 end;
657
658 //==============================
659 // Parse
660 //==============================
661 function TGikoBayesian.Parse(
662 const text : string;
663 wordCount : TWordCount;
664 algorithm : TGikoBayesianAlgorithm = gbaGaryRonbinson
665 ) : Extended;
666 {$IFDEF BENCHMARK}
667 var
668 t1, t2 : Int64;
669 {$ENDIF}
670 begin
671
672 {$IFDEF BENCHMARK}
673 QueryPerformanceCounter( t1 );
674 {$ENDIF}
675 CountWord( text, wordCount );
676 {$IFDEF BENCHMARK}
677 QueryPerformanceCounter( t2 ); b5 := b5 + (t2 - t1);
678 {$ENDIF}
679 case algorithm of
680 gbaPaulGraham: Result := CalcPaulGraham( wordCount );
681 gbaGaryRonbinson: Result := CalcGaryRobinson( wordCount );
682 else Result := 0;
683 end;
684 {$IFDEF BENCHMARK}
685 QueryPerformanceCounter( t1 ); b6 := b6 + (t1 - t2);
686 {$ENDIF}
687
688 end;
689
690 //==============================
691 // Learn
692 //==============================
693 procedure TGikoBayesian.Learn(
694 wordCount : TWordCount;
695 isImportant : Boolean );
696 var
697 aWord : string;
698 wordinfo : TWordInfo;
699 countinfo : TWordCountInfo;
700 i : Integer;
701 begin
702
703 for i := 0 to wordCount.Count - 1 do begin
704 aWord := wordCount[ i ];
705 wordinfo := Objects[ aWord ];
706 if wordinfo = nil then begin
707 wordinfo := TWordInfo.Create;
708 Objects[ aWord ] := wordinfo;
709 end;
710
711 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
712 if isImportant then begin
713 wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
714 wordinfo.ImportantText := wordinfo.ImportantText + 1;
715 end else begin
716 wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
717 wordinfo.NormalText := wordinfo.NormalText + 1;
718 end;
719 end;
720
721 end;
722
723 //==============================
724 // Forget
725 //==============================
726 procedure TGikoBayesian.Forget(
727 wordCount : TWordCount;
728 isImportant : Boolean );
729 var
730 aWord : string;
731 wordinfo : TWordInfo;
732 countinfo : TWordCountInfo;
733 i : Integer;
734 begin
735
736 for i := 0 to wordCount.Count - 1 do begin
737 aWord := wordCount[ i ];
738 wordinfo := Objects[ aWord ];
739 if wordinfo = nil then
740 Continue;
741
742 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
743 if isImportant then begin
744 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
745 wordinfo.ImportantText := wordinfo.ImportantText - 1;
746 end else begin
747 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;
748 wordinfo.NormalText := wordinfo.NormalText - 1;
749 end;
750 end;
751
752 end;
753
754 end.

Back to OSDN">Back to OSDN
ViewVC Help
Powered by ViewVC 1.1.26