Develop and Download Open Source Software

Browse CVS Repository

Contents of /gikonavigoeson/gikonavi/GikoBayesian.pas

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.10 - (show annotations) (download) (as text)
Sun Oct 31 12:24:33 2004 UTC (19 years, 5 months ago) by yoffy
Branch: MAIN
Changes since 1.9: +10 -11 lines
File MIME type: text/x-pascal
- DEBUG を定義してビルドした場合に、
 スパムフィルタのアルゴリズムを選択できるようになった。
- スパムフィルタの Gary Robinson 法を教科書通りに実装してみた。
 ※Gary Robinson-Fisher 法は未だ機能せず。

1 unit GikoBayesian;
2
3 {!
4 \file GikoBayesian.pas
5 \brief ???ゃ?吾?≪?潟???c????/span>
6
7 $Id: GikoBayesian.pas,v 1.9 2004/10/27 00:10:12 yoffy Exp $
8 }
9
10 interface
11
12 //==================================================
13 uses
14 //==================================================
15 Classes;
16
17 //==================================================
18 type
19 //==================================================
20
21 {!***********************************************************
22 \brief ??茯???????????/span>
23 ************************************************************}
24 TWordInfo = class( TObject )
25 private
26 FNormalWord : Integer; //!< ??絽吾????茯????????糸?眼????????/span>
27 FImportantWord : Integer; //!< 羈?????茯????????糸?眼????????/span>
28 FNormalText : Integer; //!< ??絽吾????茯??????????障??????????腴?????/span>
29 FImportantText : Integer; //!< 羈?????茯??????????障??????????腴?????/span>
30
31 public
32 property NormalWord : Integer read FNormalWord write FNormalWord;
33 property ImportantWord : Integer read FImportantWord write FImportantWord;
34 property NormalText : Integer read FNormalText write FNormalText;
35 property ImportantText : Integer read FImportantText write FImportantText;
36 end;
37
38 {!***********************************************************
39 \brief 茹f??羝??水??茯???????????/span>
40 ************************************************************}
41 TWordCountInfo = class( TObject )
42 private
43 FWordCount : Integer; //!< ??茯???/span>
44
45 public
46 property WordCount : Integer read FWordCount write FWordCount;
47 end;
48
49 {!***********************************************************
50 \brief 茹f??羝??水??茯????鴻??
51 ************************************************************}
52 // TWordCount = class( THashedStringList ) // 羶???
53 TWordCount = class( TStringList )
54 public
55 constructor Create;
56 destructor Destroy; override;
57 end;
58
59 {!***********************************************************
60 \brief ???c???帥?≪???眼???冴??
61 ************************************************************}
62 TGikoBayesianAlgorithm =
63 (gbaPaulGraham, gbaGaryRobinson, gbaGaryRobinsonFisher);
64
65 {!***********************************************************
66 \brief ???ゃ?吾?≪?潟???c????/span>
67 ************************************************************}
68 // TGikoBayesian = class( THashedStringList ) // 羶???
69 TGikoBayesian = class( TStringList )
70 private
71 FFilePath : string; //!< 茯??粋昭???????<?ゃ??????/span>
72 function GetObject( const name : string ) : TWordInfo;
73 procedure SetObject( const name : string; value : TWordInfo );
74
75 public
76 constructor Create;
77 destructor Destroy; override;
78
79 //! ???<?ゃ??????絖??絮ユ???茯??水?冴???障??
80 procedure LoadFromFile( const filePath : string );
81
82 //! ???<?ゃ?????膺?絮ユ???篆?絖????障??
83 procedure SaveToFile( const filePath : string );
84
85 //! ???<?ゃ?????膺?絮ユ???篆?絖????障??
86 procedure Save;
87
88 //! ??茯???????????宴????緇????障??
89 property Objects[ const name : string ] : TWordInfo
90 read GetObject write SetObject; default;
91
92 //! ??腴??????障??????茯????????潟?????障??
93 procedure CountWord(
94 const text : string;
95 wordCount : TWordCount );
96
97 {!
98 \brief Paul Graham 羈????冴?ャ??????腴???絵??墾??羆阪????障??
99 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
100 }
101 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
102
103 {!
104 \brief GaryRobinson 羈????冴?ャ??????腴???絵??墾??羆阪????障??
105 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
106 }
107 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
108
109 {!
110 \brief GaryRobinson-Fisher 羈????冴?ャ??????腴???絵??墾??羆阪????障??
111 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
112 }
113 function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
114
115 {!
116 \brief ??腴???茹f??
117 \param text 茹f????????腴?
118 \param wordCount 茹f??????????茯????鴻????菴???
119 \param algorithm 羈???墾??浦絎??????????≪???眼???冴??????絎????障??
120 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
121
122 CountWord ? Calcxxxxx ???障???????茵??????????с????
123 }
124 function Parse(
125 const text : string;
126 wordCount : TWordCount;
127 algorithm : TGikoBayesianAlgorithm = gbaGaryRobinsonFisher
128 ) : Extended;
129
130 {!
131 \brief 絖??????
132 \param wordCount Parse ?цВ??????????茯????鴻??
133 \param isImportant 羈??????鴻????腴???????????????? True
134 }
135 procedure Learn(
136 wordCount : TWordCount;
137 isImportant : Boolean );
138
139 {!
140 \brief 絖??腟?????綽?????
141 \param wordCount Parse ?цВ??????????茯????鴻??
142 \param isImportant 羈??????鴻????腴???????????????????????? True
143 \warning 絖??羝??帥????腴???????????∈茯??堺?ャ?障??????<br>
144 Learn ????????????腴??? isImportant ???????c????????腴???
145 Forget ?????????若?帥???若?鴻???贋?????障????<br>
146 絖??羝??帥????????????????????????????????
147
148 ???????膺?腟??????????≪????????с???????障??????<br>
149 wordCount ??緇?????腴? (Parse ? text 綣??? ???膺?腟??????帥?????≪???障????<br><br>
150
151 筝祉??絵????腴?????羈?????腴????????帥????????? Forget -> Learn ?????т戎?????障????
152 }
153 procedure Forget(
154 wordCount : TWordCount;
155 isImportant : Boolean );
156 end;
157
158 //==================================================
159 implementation
160 //==================================================
161
162 uses
163 SysUtils, Math, Windows;
164
165 const
166 GIKO_BAYESIAN_FILE_VERSION = '1.0';
167 {
168 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
169 ModeWGraph, ModeWAlpha, ModeWNum,
170 ModeWHira, ModeWKata, ModeWKanji);
171 }
172 CharMode1 : array [ 0..255 ] of Byte =
173 (
174 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
175 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
176 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
177 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
178 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
179 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
180 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
181 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
182
183 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
184 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
185 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
186 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
187 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
188 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
189 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
190 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
191 );
192
193 //************************************************************
194 // misc
195 //************************************************************
196
197 //==============================
198 // RemoveToken
199 //==============================
200 function RemoveToken(var s: string;const delimiter: string): string;
201 var
202 p: Integer;
203 begin
204 p := AnsiPos(delimiter, s);
205 if p = 0 then
206 Result := s
207 else
208 Result := Copy(s, 1, p - 1);
209 s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
210 end;
211
212 //==============================
213 // AbsSort
214 //==============================
215 function AbsSort( p1, p2 : Pointer ) : Integer;
216 var
217 v1, v2 : Single;
218 begin
219
220 v1 := Abs( Single( p1 ) - 0.5 );
221 v2 := Abs( Single( p2 ) - 0.5 );
222 if v1 > v2 then
223 Result := -1
224 else if v1 = v2 then
225 Result := 0
226 else
227 Result := 1;
228
229 end;
230
231 //************************************************************
232 // TWordCount class
233 //************************************************************
234 constructor TWordCount.Create;
235 begin
236
237 Duplicates := dupIgnore;
238 CaseSensitive := True;
239 Sorted := True;
240
241 end;
242
243 destructor TWordCount.Destroy;
244 var
245 i : Integer;
246 begin
247
248 for i := Count - 1 downto 0 do
249 if Objects[ i ] <> nil then
250 Objects[ i ].Free;
251
252 inherited;
253
254 end;
255
256 //************************************************************
257 // TGikoBayesian class
258 //************************************************************
259
260 //==============================
261 // Create
262 //==============================
263 constructor TGikoBayesian.Create;
264 begin
265
266 Duplicates := dupIgnore;
267 CaseSensitive := True;
268 Sorted := True;
269
270 end;
271
272 //==============================
273 // Destroy
274 //==============================
275 destructor TGikoBayesian.Destroy;
276 var
277 i : Integer;
278 begin
279
280 for i := Count - 1 downto 0 do
281 if inherited Objects[ i ] <> nil then
282 inherited Objects[ i ].Free;
283
284 inherited;
285
286 end;
287
288 procedure TGikoBayesian.LoadFromFile( const filePath : string );
289 var
290 i : Integer;
291 sl : TStringList;
292 s : string;
293 name : string;
294 info : TWordInfo;
295 begin
296
297 FFilePath := filePath;
298
299 if not FileExists( filePath ) then
300 Exit;
301
302 sl := TStringList.Create;
303 try
304 sl.LoadFromFile( filePath );
305
306 for i := 1 to sl.Count - 1 do begin
307 s := sl[ i ];
308 name := RemoveToken( s, #1 );
309 info := TWordInfo.Create;
310 info.NormalWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
311 info.ImportantWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
312 info.NormalText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
313 info.ImportantText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
314
315 AddObject( name, info );
316 end;
317 finally
318 sl.Free;
319 end;
320
321 end;
322
323 procedure TGikoBayesian.SaveToFile( const filePath : string );
324 var
325 i : Integer;
326 sl : TStringList;
327 s : string;
328 info : TWordInfo;
329 begin
330
331 FFilePath := filePath;
332
333 sl := TStringList.Create;
334 try
335 sl.BeginUpdate;
336 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
337
338 for i := 0 to Count - 1 do begin
339 info := TWordInfo( inherited Objects[ i ] );
340 s := Strings[ i ] + #1
341 + Format('%x', [info.NormalWord]) + #1
342 + Format('%x', [info.ImportantWord]) + #1
343 + Format('%x', [info.NormalText]) + #1
344 + Format('%x', [info.ImportantText]);
345
346 sl.Add(s);
347 end;
348 sl.EndUpdate;
349 sl.SaveToFile( filePath );
350 finally
351 sl.Free;
352 end;
353
354 end;
355
356 procedure TGikoBayesian.Save;
357 begin
358
359 if FFilePath <> '' then
360 SaveToFile( FFilePath );
361
362 end;
363
364 //==============================
365 // GetObject
366 //==============================
367 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
368 var
369 idx : Integer;
370 begin
371
372 idx := IndexOf( name ); // 羶???
373 if idx < 0 then
374 Result := nil
375 else
376 Result := TWordInfo( inherited Objects[ idx ] );
377
378 end;
379
380 //==============================
381 // SetObject
382 //==============================
383 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
384 var
385 idx : Integer;
386 begin
387
388 idx := IndexOf( name );
389 if idx < 0 then
390 AddObject( name, value )
391 else
392 inherited Objects[ idx ] := value;
393
394 end;
395
396
397 //==============================
398 // CountWord
399 //==============================
400 procedure TGikoBayesian.CountWord(
401 const text : string;
402 wordCount : TWordCount );
403 type
404 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
405 ModeWGraph, ModeWAlpha, ModeWNum,
406 ModeWHira, ModeWKata, ModeWKanji);
407 var
408 p, tail, last : PChar;
409 mode, newMode : Modes;
410 aWord : string;
411 ch : Longword;
412 chSize : Integer;
413 delimiter : TStringList;
414 delimited : Boolean;
415 i, idx : Integer;
416 countInfo : TWordCountInfo;
417 const
418 KAKUJOSI = '??' + #10 + '??#39; + #10 + '??' + #10 + '??#39; + #10 + '????' +
419 #10 + '??#39; + #10 + '??#39; + #10 + '????' + #10 + '?障??#39;;
420 kKanji = [$80..$A0, $E0..$ff];
421 begin
422
423 delimiter := TStringList.Create;
424 try
425 mode := ModeWhite;
426 delimiter.Text := KAKUJOSI;
427 p := PChar( text );
428 tail := p + Length( text );
429 last := p;
430
431 while p < tail do begin
432 delimited := False;
433 // ??絖????帥?ゃ?????ゅ??/span>
434 // ?糸???鴻? ModeGraph ?????????у???ャ???綽??????????????
435 // if Byte(Byte( p^ ) - $a1) < $5e then begin
436 if Byte( p^ ) in kKanji then begin
437 if p + 1 < tail then begin
438 ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
439 case ch of
440 $8140: newMode := ModeWhite;
441 $8141..$824e: newMode := ModeWGraph;
442 $824f..$8258: newMode := ModeWNum;
443 $8260..$829a: newMode := ModeWAlpha;
444 $829f..$82f1: newMode := ModeWHira;
445 $8340..$8396: newMode := ModeWKata;
446 else newMode := ModeWKanji;
447 end;
448 // '??????#39; ??抗篁??????障???????帥?????????障????
449 if (mode = ModeWHira) or (mode = ModeWKata) then
450 if (ch = $814a) or (ch = $814b) or (ch = $815b) then
451 newMode := mode;
452 end else begin
453 newMode := ModeWhite;
454 end;
455
456 chSize := 2;
457
458 // ?阪????????????絖?????????罎??祉????
459 if p + 3 < tail then begin // 3 = delimiter ????紊у?? - 1
460 for i := 0 to delimiter.Count - 1 do begin
461 if CompareMem(
462 p, PChar( delimiter[ i ] ), Length( delimiter[ i ] ) ) then begin
463 delimited := True;
464 chSize := Length( delimiter[ i ] );
465 Break;
466 end;
467 end;
468 end;
469 end else begin
470 newMode := Modes( CharMode1[ Byte( p^ ) ] );
471
472 chSize := 1;
473 end;
474
475 if (mode <> newMode) or delimited then begin
476
477 // ??絖????帥?ゃ????紊??眼??????
478 // ????????阪????????????絖???????????
479 if mode <> ModeWhite then begin
480 SetLength( aWord, p - last );
481 CopyMemory( PChar( aWord ), last, p - last );
482 //aWord := Copy( last, 0, p - last );
483 idx := wordCount.IndexOf( aWord ); // ??
484 if idx < 0 then begin
485 countInfo := TWordCountInfo.Create;
486 wordCount.AddObject( aWord, countInfo );
487 end else begin
488 countInfo := TWordCountInfo( wordCount.Objects[ idx ] );
489 end;
490 countInfo.WordCount := countInfo.WordCount + 1;
491 end;
492
493 last := p;
494 mode := newMode;
495
496 end;
497
498 p := p + chSize;
499 end; // while
500
501 if mode <> ModeWhite then begin
502 aWord := Copy( last, 0, p - last );
503 idx := wordCount.IndexOf( aWord );
504 if idx < 0 then begin
505 countInfo := TWordCountInfo.Create;
506 wordCount.AddObject( aWord, countInfo );
507 end else begin
508 countInfo := TWordCountInfo( wordCount.Objects[ idx ] );
509 end;
510 countInfo.WordCount := countInfo.WordCount + 1;
511 end;
512 finally
513 delimiter.Free;
514 end;
515
516 end;
517
518 //==============================
519 // CalcPaulGraham
520 //==============================
521 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
522
523 function p( const aWord : string ) : Single;
524 var
525 info : TWordInfo;
526 begin
527 info := Objects[ aWord ];
528 if info = nil then
529 Result := 0.415
530 else if info.NormalWord = 0 then
531 Result := 0.99
532 else if info.ImportantWord = 0 then
533 Result := 0.01
534 else if info.ImportantWord + info.NormalWord * 2 < 5 then
535 Result := 0.5
536 else
537 Result := ( info.ImportantWord / info.ImportantText ) /
538 ((info.NormalWord * 2 / info.NormalText ) +
539 (info.ImportantWord / info.ImportantText));
540 end;
541
542 var
543 s, q : Extended;
544 i : Integer;
545 narray : TList;
546 const
547 SAMPLE_COUNT = 15;
548 begin
549
550 Result := 1;
551 if wordCount.Count = 0 then
552 Exit;
553
554 narray := TList.Create;
555 try
556 for i := 0 to wordCount.Count - 1 do begin
557 narray.Add( Pointer( p( wordCount[ i ] ) ) );
558 end;
559
560 narray.Sort( AbsSort );
561
562 s := 1;
563 q := 1;
564 i := min( SAMPLE_COUNT, narray.Count );
565 while i > 0 do begin
566 Dec( i );
567
568 s := s * Single( narray[ i ] );
569 q := q * (1 - Single( narray[ i ] ));
570 end;
571
572 Result := s / (s + q);
573 finally
574 narray.Free;
575 end;
576
577 end;
578
579 //==============================
580 // CalcGaryRobinson
581 //==============================
582 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
583
584 function p( const aWord : string ) : Single;
585 var
586 info : TWordInfo;
587 begin
588 info := Objects[ aWord ];
589 if info = nil then
590 Result := 0.415
591 else if info.ImportantWord = 0 then
592 Result := 0.01
593 else if info.NormalWord = 0 then
594 Result := 0.99
595 else
596 Result := ( info.ImportantWord / info.ImportantText ) /
597 ((info.NormalWord / info.NormalText ) +
598 (info.ImportantWord / info.ImportantText));
599 end;
600
601 function f( cnt : Integer; n, mean : Single ) : Extended;
602 const
603 k = 0.00001;
604 begin
605 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
606 end;
607
608 var
609 n : Extended;
610 narray : array of Single;
611 mean : Extended;
612 countInfo : TWordCountInfo;
613 i : Integer;
614 P1, Q1, R1 : Extended;
615 cnt : Extended;
616 begin
617
618 if wordCount.Count = 0 then begin
619 Result := 1;
620 Exit;
621 end;
622
623 SetLength( narray, wordCount.Count );
624 mean := 0;
625 for i := 0 to wordCount.Count - 1 do begin
626 n := p( wordCount[ i ] );
627 narray[ i ] := n;
628 mean := mean + n;
629 end;
630 mean := mean / wordCount.Count;
631
632 P1 := 1;
633 Q1 := 1;
634 for i := 0 to wordCount.Count - 1 do begin
635 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
636 n := f( countInfo.WordCount, narray[ i ], mean );
637 P1 := P1 * ( 1 - n );
638 Q1 := Q1 * n;
639 end;
640 cnt := wordCount.Count;
641 if cnt = 0 then
642 cnt := 1
643 else
644 P1 := 1 - Power( P1, 1 / cnt );
645 Q1 := 1 - Power( Q1, 1 / cnt );
646
647 if P1 + Q1 = 0 then begin
648 Result := 0.5
649 end else begin
650 n := (P1 - Q1) / (P1 + Q1);
651 Result := (1 + n) / 2;
652 end;
653
654 end;
655
656 //==============================
657 // CalcGaryRobinsonFisher
658 //==============================
659 function TGikoBayesian.CalcGaryRobinsonFisher(
660 wordCount : TWordCount
661 ) : Extended;
662
663 function p( const aWord : string ) : Single;
664 var
665 info : TWordInfo;
666 begin
667 info := Objects[ aWord ];
668 if info = nil then
669 Result := 0.415
670 else if info.ImportantWord = 0 then
671 Result := 0.01
672 else if info.NormalWord = 0 then
673 Result := 0.99
674 else
675 Result := info.ImportantWord /
676 (info.ImportantWord + info.NormalWord *
677 info.ImportantText / info.NormalText);
678 end;
679
680 function f( cnt : Integer; n, mean : Single ) : Extended;
681 const
682 k = 0.00001;
683 begin
684 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
685 end;
686
687 function prbx( x2, degree : Extended ) : Extended;
688 var
689 m : Extended;
690 sum : Extended;
691 term : Extended;
692 i : extended;
693 begin
694
695 m := x2 / 2;
696 sum := exp( -m );
697 term := -m;
698
699 i := 1;
700 while i < (degree / 2 - 1) do begin
701 term := term + ln( m / i );
702 sum := sum + exp( term );
703 i := i + 1;
704 end;
705
706 if sum < 1 then
707 Result := sum
708 else
709 Result := 1.0;
710
711 end;
712
713 var
714 n : Extended;
715 narray : array of Single;
716 mean : Extended;
717 countInfo : TWordCountInfo;
718 i : Integer;
719 normal : Extended;
720 important : Extended;
721 P1, Q1 : Extended;
722 cnt : Extended;
723 begin
724
725 if wordCount.Count = 0 then begin
726 Result := 1;
727 Exit;
728 end;
729
730 SetLength( narray, wordCount.Count );
731 mean := 0;
732 for i := 0 to wordCount.Count - 1 do begin
733 n := p( wordCount[ i ] );
734 narray[ i ] := n;
735 mean := mean + n;
736 end;
737 mean := mean / wordCount.Count;
738
739 cnt := 0;
740 (*
741 P1 := 1;
742 Q1 := 1;
743 (*)
744 P1 := 0;
745 Q1 := 0;
746 //*
747 for i := 0 to wordCount.Count - 1 do begin
748 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
749 n := f( countInfo.WordCount, narray[ i ], mean );
750 if countInfo <> nil then
751 cnt := cnt + countInfo.WordCount;
752 (*
753 P1 := P1 + Ln( 1 - n ) * countInfo.WordCount;
754 Q1 := Q1 + Ln( n ) * countInfo.WordCount;
755 (*)
756 P1 := P1 + Ln( 1 - n );
757 Q1 := Q1 + Ln( n );
758 //*)
759 end;
760 if cnt = 0 then
761 cnt := 1;
762 //(*
763 P1 := prbx( -2 * P1, 2 * cnt );
764 Q1 := prbx( -2 * Q1, 2 * cnt );
765 (*)
766 P1 := prbx( -2 * Ln( P1 ), 2 * cnt );
767 Q1 := prbx( -2 * Ln( Q1 ), 2 * cnt );
768 //*)
769 if P1 + Q1 = 0 then begin
770 Result := 0.5
771 end else begin
772 Result := (1 + Q1 + P1) / 2;
773 end;
774
775 end;
776
777 //==============================
778 // Parse
779 //==============================
780 function TGikoBayesian.Parse(
781 const text : string;
782 wordCount : TWordCount;
783 algorithm : TGikoBayesianAlgorithm
784 ) : Extended;
785 begin
786
787 CountWord( text, wordCount );
788 case algorithm of
789 gbaPaulGraham: Result := CalcPaulGraham( wordCount );
790 gbaGaryRobinson: Result := CalcGaryRobinson( wordCount );
791 gbaGaryRobinsonFisher:
792 Result := CalcGaryRobinsonFisher( wordCount );
793 else Result := 0;
794 end;
795
796 end;
797
798 //==============================
799 // Learn
800 //==============================
801 procedure TGikoBayesian.Learn(
802 wordCount : TWordCount;
803 isImportant : Boolean );
804 var
805 aWord : string;
806 wordinfo : TWordInfo;
807 countinfo : TWordCountInfo;
808 i : Integer;
809 begin
810
811 for i := 0 to wordCount.Count - 1 do begin
812 aWord := wordCount[ i ];
813 wordinfo := Objects[ aWord ];
814 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
815 if wordinfo = nil then begin
816 wordinfo := TWordInfo.Create;
817 Objects[ aWord ] := wordinfo;
818 end;
819
820 if isImportant then begin
821 wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
822 wordinfo.ImportantText := wordinfo.ImportantText + 1;
823 end else begin
824 wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
825 wordinfo.NormalText := wordinfo.NormalText + 1;
826 end;
827 end;
828
829 end;
830
831 //==============================
832 // Forget
833 //==============================
834 procedure TGikoBayesian.Forget(
835 wordCount : TWordCount;
836 isImportant : Boolean );
837 var
838 aWord : string;
839 wordinfo : TWordInfo;
840 countinfo : TWordCountInfo;
841 i : Integer;
842 begin
843
844 for i := 0 to wordCount.Count - 1 do begin
845 aWord := wordCount[ i ];
846 wordinfo := Objects[ aWord ];
847 if wordinfo = nil then
848 Continue;
849
850 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
851 if isImportant then begin
852 if wordInfo.ImportantText > 0 then begin
853 wordinfo.ImportantText := wordinfo.ImportantText - 1;
854 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
855 end;
856 end else begin
857 if wordinfo.NormalText > 0 then begin
858 wordinfo.NormalText := wordinfo.NormalText - 1;
859 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;
860 end;
861 end;
862 end;
863
864 end;
865
866 end.

Back to OSDN">Back to OSDN
ViewVC Help
Powered by ViewVC 1.1.26