Develop and Download Open Source Software

Browse CVS Repository

Contents of /gikonavigoeson/gikonavi/GikoBayesian.pas

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1 - (show annotations) (download) (as text)
Wed Oct 20 18:25:00 2004 UTC (19 years, 6 months ago) by yoffy
Branch: MAIN
File MIME type: text/x-pascal
スパムフィルタの導入。

1 unit GikoBayesian;
2
3 {!
4 \file GikoBayesian.pas
5 \brief ???ゃ?吾?≪?潟???c????/span>
6
7 $Id$
8 }
9
10 interface
11
12 //==================================================
13 uses
14 //==================================================
15 Classes, IniFiles;
16
17 //==================================================
18 type
19 //==================================================
20
21 {!***********************************************************
22 \brief ??茯???????????/span>
23 ************************************************************}
24 TWordInfo = class( TObject )
25 private
26 FNormalWord : Integer; //!< ??絽吾????茯????????糸?眼????????/span>
27 FImportantWord : Integer; //!< 羈?????茯????????糸?眼????????/span>
28 FNormalText : Integer; //!< ??絽吾????茯??????????障??????????腴?????/span>
29 FImportantText : Integer; //!< 羈?????茯??????????障??????????腴?????/span>
30
31 public
32 property NormalWord : Integer read FNormalWord write FNormalWord;
33 property ImportantWord : Integer read FImportantWord write FImportantWord;
34 property NormalText : Integer read FNormalText write FNormalText;
35 property ImportantText : Integer read FImportantText write FImportantText;
36 end;
37
38 {!***********************************************************
39 \brief 茹f??羝??水??茯???????????/span>
40 ************************************************************}
41 TWordCountInfo = class( TObject )
42 private
43 FWordCount : Integer; //!< ??茯???/span>
44
45 public
46 property WordCount : Integer read FWordCount write FWordCount;
47 end;
48
49 {!***********************************************************
50 \brief 茹f??羝??水??茯????鴻??
51 ************************************************************}
52 // TWordCount = class( THashedStringList ) // 羶???
53 TWordCount = class( TStringList ) // ??
54 public
55 destructor Destroy; override;
56 end;
57
58 {!***********************************************************
59 \brief ???c???帥?≪???眼???冴??
60 ************************************************************}
61 TGikoBayesianAlgorithm =
62 (gbaPaulGraham, gbaGaryRonbinson{, gbaGaryRonbinsonFisher});
63
64 {!***********************************************************
65 \brief ???ゃ?吾?≪?潟???c????/span>
66 ************************************************************}
67 TGikoBayesian = class( THashedStringList )
68 private
69 FFilePath : string; //!< 茯??粋昭???????<?ゃ??????/span>
70 function GetObject( const name : string ) : TWordInfo;
71 procedure SetObject( const name : string; value : TWordInfo );
72
73 public
74 constructor Create;
75 destructor Destroy; override;
76
77 //! ???<?ゃ??????絖??絮ユ???茯??水?冴???障??
78 procedure LoadFromFile( const filePath : string );
79
80 //! ???<?ゃ?????膺?絮ユ???篆?絖????障??
81 procedure SaveToFile( const filePath : string );
82
83 //! ???<?ゃ?????膺?絮ユ???篆?絖????障??
84 procedure Save;
85
86 //! ??茯???????????宴????緇????障??
87 property Objects[ const name : string ] : TWordInfo
88 read GetObject write SetObject; default;
89
90 //! ??腴??????障??????茯????????潟?????障??
91 procedure CountWord(
92 const text : string;
93 wordCount : TWordCount );
94
95 {!
96 \brief Paul Graham 羈????冴?ャ??????腴???絵??墾??羆阪????障??
97 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
98 }
99 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
100
101 {!
102 \brief GaryRobinson 羈????冴?ャ??????腴???絵??墾??羆阪????障??
103 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
104 }
105 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
106
107 // function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
108
109 {!
110 \brief ??腴???茹f??
111 \param text 茹f????????腴?
112 \param wordCount 茹f??????????茯????鴻????菴???
113 \param algorithm 羈???墾??浦絎??????????≪???眼???冴??????絎????障??
114 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
115
116 CountWord ? Calcxxxxx ???障???????茵??????????с????
117 }
118 function Parse(
119 const text : string;
120 wordCount : TWordCount;
121 algorithm : TGikoBayesianAlgorithm = gbaGaryRonbinson
122 ) : Extended;
123
124 {!
125 \brief 絖??????
126 \param wordCount Parse ?цВ??????????茯????鴻??
127 \param isImportant 羈??????鴻????腴???????????????? True
128 }
129 procedure Learn(
130 wordCount : TWordCount;
131 isImportant : Boolean );
132
133 {!
134 \brief 絖??腟?????綽?????
135 \param wordCount Parse ?цВ??????????茯????鴻??
136 \param isImportant 羈??????鴻????腴???????????????????????? True
137 \warning 絖??羝??帥????腴???????????∈茯??堺?ャ?障??????<br>
138 Learn ????????????腴??? isImportant ???????c????????腴???
139 Forget ?????????若?帥???若?鴻???贋?????障????<br>
140 絖??羝??帥????????????????????????????????
141
142 ???????膺?腟??????????≪????????с???????障??????<br>
143 wordCount ??緇?????腴? (Parse ? text 綣??? ???膺?腟??????帥?????≪???障????<br><br>
144
145 筝祉??絵????腴?????羈?????腴????????帥????????? Forget -> Learn ?????т戎?????障????
146 }
147 procedure Forget(
148 wordCount : TWordCount;
149 isImportant : Boolean );
150 end;
151
152 //==================================================
153 implementation
154 //==================================================
155
156 uses
157 SysUtils, Math;
158
159 const
160 GIKO_BAYESIAN_FILE_VERSION = '1.0';
161 kYofKanji : TSysCharSet = [#$80..#$A0, #$E0..#$ff];
162
163 //************************************************************
164 // misc
165 //************************************************************
166
167 //==============================
168 // RemoveToken
169 //==============================
170 function RemoveToken(var s: string;const delimiter: string): string;
171 var
172 p: Integer;
173 begin
174 p := AnsiPos(delimiter, s);
175 if p = 0 then
176 Result := s
177 else
178 Result := Copy(s, 1, p - 1);
179 s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
180 end;
181
182 //==============================
183 // AbsSort
184 //==============================
185 function AbsSort( p1, p2 : Pointer ) : Integer;
186 var
187 v1, v2 : Single;
188 begin
189
190 v1 := Abs( Single( p1 ) - 0.5 );
191 v2 := Abs( Single( p2 ) - 0.5 );
192 if v1 > v2 then
193 Result := -1
194 else if v1 = v2 then
195 Result := 0
196 else
197 Result := 1;
198
199 end;
200
201 //************************************************************
202 // TWordCount class
203 //************************************************************
204 destructor TWordCount.Destroy;
205 var
206 i : Integer;
207 begin
208
209 for i := Count - 1 downto 0 do
210 if Objects[ i ] <> nil then
211 Objects[ i ].Free;
212
213 inherited;
214
215 end;
216
217 //************************************************************
218 // TGikoBayesian class
219 //************************************************************
220
221 //==============================
222 // Create
223 //==============================
224 constructor TGikoBayesian.Create;
225 begin
226
227 Duplicates := dupIgnore;
228 Sorted := True;
229
230 end;
231
232 //==============================
233 // Destroy
234 //==============================
235 destructor TGikoBayesian.Destroy;
236 var
237 i : Integer;
238 begin
239
240 for i := Count - 1 downto 0 do
241 if inherited Objects[ i ] <> nil then
242 inherited Objects[ i ].Free;
243
244 inherited;
245
246 end;
247
248 procedure TGikoBayesian.LoadFromFile( const filePath : string );
249 var
250 i : Integer;
251 sl : TStringList;
252 s : string;
253 name : string;
254 info : TWordInfo;
255 begin
256
257 if not FileExists( filePath ) then
258 Exit;
259
260 sl := TStringList.Create;
261 try
262 sl.LoadFromFile( filePath );
263
264 for i := 1 to sl.Count - 1 do begin
265 s := sl[ i ];
266 name := RemoveToken( s, #1 );
267 info := TWordInfo.Create;
268 info.NormalWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
269 info.ImportantWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
270 info.NormalText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
271 info.ImportantText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
272
273 AddObject( name, info );
274 end;
275 finally
276 sl.Free;
277 end;
278
279 end;
280
281 procedure TGikoBayesian.SaveToFile( const filePath : string );
282 var
283 i : Integer;
284 sl : TStringList;
285 s : string;
286 info : TWordInfo;
287 begin
288
289 sl := TStringList.Create;
290 try
291 sl.BeginUpdate;
292 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
293
294 for i := 0 to Count - 1 do begin
295 info := TWordInfo( inherited Objects[ i ] );
296 s := Strings[ i ] + #1
297 + Format('%x', [info.NormalWord]) + #1
298 + Format('%x', [info.ImportantWord]) + #1
299 + Format('%x', [info.NormalText]) + #1
300 + Format('%x', [info.ImportantText]);
301
302 sl.Add(s);
303 end;
304 sl.EndUpdate;
305 sl.SaveToFile( filePath );
306 finally
307 sl.Free;
308 end;
309
310 end;
311
312 procedure TGikoBayesian.Save;
313 begin
314
315 if FFilePath <> '' then
316 SaveToFile( FFilePath );
317
318 end;
319
320 //==============================
321 // GetObject
322 //==============================
323 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
324 var
325 idx : Integer;
326 begin
327
328 idx := IndexOf( name );
329 if idx < 0 then
330 Result := nil
331 else
332 Result := TWordInfo( inherited Objects[ idx ] );
333
334 end;
335
336 //==============================
337 // SetObject
338 //==============================
339 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
340 var
341 idx : Integer;
342 begin
343
344 idx := IndexOf( name );
345 if idx < 0 then
346 AddObject( name, value )
347 else
348 inherited Objects[ idx ] := value;
349
350 end;
351
352
353 //==============================
354 // CountWord
355 //==============================
356 procedure TGikoBayesian.CountWord(
357 const text : string;
358 wordCount : TWordCount );
359 type
360 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeNum, ModeHanKana,
361 ModeWGraph, ModeWAlpha, ModeWNum,
362 ModeWHira, ModeWKata, ModeWKanji);
363 var
364 p, tail, last : PChar;
365 mode, newMode : Modes;
366 aWord : string;
367 ch : Longword;
368 chSize : Integer;
369 delimiter : TStringList;
370 delimited : Boolean;
371 i, idx : Integer;
372 countInfo : TWordCountInfo;
373 const
374 KAKUJOSI = '??' + #10 + '??#39; + #10 + '??' + #10 + '??#39; + #10 + '????' +
375 #10 + '??#39; + #10 + '??#39; + #10 + '????' + #10 + '?障??#39;;
376 begin
377
378 delimiter := TStringList.Create;
379 try
380 //*** ??綺????鴻??筝?/span>
381 wordCount.Duplicates := dupIgnore;
382 wordCount.CaseSensitive := True;
383 wordCount.Capacity := 1000;
384 wordCount.Sorted := True;
385 //***
386
387 mode := ModeWhite;
388 delimiter.Text := KAKUJOSI;
389 SetLength( aWord, 256 );
390 p := PChar( text );
391 tail := p + Length( text );
392 last := p;
393
394 while p < tail do begin
395 delimited := False;
396 // ??絖????帥?ゃ?????ゅ??/span>
397 // ?糸???鴻? ModeGraph ?????????у???ャ???綽??????????????
398 if p^ in kYofKanji then begin
399 if p + 1 < tail then begin
400 ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
401 case ch of
402 $8140: newMode := ModeWhite;
403 $8141..$824e: newMode := ModeWGraph;
404 $824f..$8258: newMode := ModeWNum;
405 $8260..$829a: newMode := ModeWAlpha;
406 $829f..$82f1: newMode := ModeWHira;
407 $8340..$8396: newMode := ModeWKata;
408 else newMode := ModeWKanji;
409 end;
410 end else begin
411 newMode := ModeWhite;
412 end;
413
414 chSize := 2;
415
416 // ?阪????????????絖?????????罎??祉????
417 if p + 3 < tail then begin // 3 = delimiter ????紊у?? - 1
418 for i := 0 to delimiter.Count - 1 do begin
419 if CompareMem(
420 p, PChar( delimiter[ i ] ), Length( delimiter[ i ] ) ) then begin
421 delimited := True;
422 chSize := Length( delimiter[ i ] );
423 Break;
424 end;
425 end;
426 end;
427 end else begin
428 case p^ of
429 #$0..#$20, #$7f: newMode := ModeWhite;
430 '0'..'9': newMode := ModeNum;
431 'a'..'z', 'A'..'Z': newMode := ModeAlpha;
432 #$A6..#$DD: newMode := ModeHanKana;
433 else newMode := ModeGraph;
434 end;
435
436 chSize := 1;
437 end;
438
439 if (mode <> newMode) or delimited then begin
440
441 // ??絖????帥?ゃ????紊??眼??????
442 // ????????阪????????????絖???????????
443 if mode <> ModeWhite then begin
444 aWord := Copy( last, 0, p - last ); // 羶???
445 // SetLength( aWord, p - last );
446 // CopyMemory( PChar( aWord ), last, p - last );
447 idx := wordCount.IndexOf( aWord ); // 羶???
448 if idx < 0 then begin
449 countInfo := TWordCountInfo.Create;
450 wordCount.AddObject( aWord, countInfo );
451 end else begin
452 countInfo := TWordCountInfo( wordCount.Objects[ idx ] );
453 end;
454 countInfo.WordCount := countInfo.WordCount + 1;
455 end;
456
457 last := p;
458 mode := newMode;
459
460 end;
461
462 p := p + chSize;
463 end; // while
464
465 if mode <> ModeWhite then begin
466 aWord := Copy( last, 0, p - last );
467 idx := wordCount.IndexOf( aWord );
468 if idx < 0 then begin
469 countInfo := TWordCountInfo.Create;
470 wordCount.AddObject( aWord, countInfo );
471 end else begin
472 countInfo := TWordCountInfo( wordCount.Objects[ idx ] );
473 end;
474 countInfo.WordCount := countInfo.WordCount + 1;
475 end;
476 finally
477 delimiter.Free;
478 end;
479
480 end;
481
482 //==============================
483 // CalcPaulGraham
484 //==============================
485 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
486
487 function p( const aWord : string ) : Single;
488 var
489 info : TWordInfo;
490 begin
491 info := Objects[ aWord ];
492 if info = nil then
493 Result := 0.4
494 else if info.NormalWord = 0 then
495 Result := 0.99
496 else if info.ImportantWord = 0 then
497 Result := 0.01
498 else
499 Result := ( info.ImportantWord / info.ImportantText ) /
500 ((info.NormalWord * 2 / info.NormalText ) +
501 (info.ImportantWord / info.ImportantText));
502 end;
503
504 var
505 s, q : Extended;
506 i : Integer;
507 narray : TList;
508 const
509 SAMPLE_COUNT = 15;
510 begin
511
512 Result := 1;
513 if wordCount.Count = 0 then
514 Exit;
515
516 narray := TList.Create;
517 try
518 for i := 0 to wordCount.Count - 1 do begin
519 narray.Add( Pointer( p( wordCount[ i ] ) ) );
520 end;
521
522 narray.Sort( AbsSort );
523
524 s := 1;
525 q := 1;
526 i := min( SAMPLE_COUNT, narray.Count );
527 while i > 0 do begin
528 Dec( i );
529 s := s * Single( narray[ i ] );
530 q := q * (1 - Single( narray[ i ] ));
531 end;
532
533 Result := s / (s + q);
534 finally
535 narray.Free;
536 end;
537
538 end;
539
540 //==============================
541 // CalcGaryRobinson
542 //==============================
543 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
544
545 function p( const aWord : string ) : Single;
546 var
547 info : TWordInfo;
548 begin
549 info := Objects[ aWord ];
550 if info = nil then
551 Result := 0.415
552 else if info.ImportantWord = 0 then
553 Result := 0.0001
554 else if info.NormalWord = 0 then
555 Result := 0.9999
556 else
557 Result := ( info.ImportantWord / info.ImportantText ) /
558 ((info.NormalWord / info.NormalText ) +
559 (info.ImportantWord / info.ImportantText));
560 end;
561
562 function f( cnt : Integer; n, mean : Single ) : Extended;
563 const
564 k = 0.00001;
565 begin
566 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
567 end;
568
569 var
570 n : Extended;
571 narray : array of Single;
572 mean : Extended;
573 countInfo : TWordCountInfo;
574 i : Integer;
575 normal : Extended;
576 important : Extended;
577 cnt : Extended;
578 begin
579
580 if wordCount.Count = 0 then begin
581 Result := 1;
582 Exit;
583 end;
584
585 SetLength( narray, wordCount.Count );
586 mean := 0;
587 for i := 0 to wordCount.Count - 1 do begin
588 n := p( wordCount[ i ] );
589 narray[ i ] := n;
590 mean := mean + n;
591 end;
592 mean := mean / wordCount.Count;
593
594 cnt := 0;
595 normal := 1;
596 important := 1;
597 for i := 0 to wordCount.Count - 1 do begin
598 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
599 n := f( countInfo.WordCount, narray[ i ], mean );
600 normal := normal * n;
601 important := important * (1 - n);
602 if countInfo <> nil then
603 cnt := cnt + countInfo.WordCount;
604 end;
605 if cnt = 0 then
606 cnt := 1;
607 normal := 1 - Exp( Ln( normal ) * (1 / cnt) );
608 important := 1 - Exp( Ln( important ) * (1 / cnt) );
609
610 n := (important - normal+ 0.00001) / (important + normal + 0.00001);
611 Result := (1 + n) / 2;
612
613 end;
614
615 //==============================
616 // Parse
617 //==============================
618 function TGikoBayesian.Parse(
619 const text : string;
620 wordCount : TWordCount;
621 algorithm : TGikoBayesianAlgorithm = gbaGaryRonbinson
622 ) : Extended;
623 begin
624
625 CountWord( text, wordCount );
626 case algorithm of
627 gbaPaulGraham: Result := CalcPaulGraham( wordCount );
628 gbaGaryRonbinson: Result := CalcGaryRobinson( wordCount );
629 else Result := 0;
630 end;
631
632 end;
633
634 //==============================
635 // Learn
636 //==============================
637 procedure TGikoBayesian.Learn(
638 wordCount : TWordCount;
639 isImportant : Boolean );
640 var
641 aWord : string;
642 wordinfo : TWordInfo;
643 countinfo : TWordCountInfo;
644 i : Integer;
645 begin
646
647 for i := 0 to wordCount.Count - 1 do begin
648 aWord := wordCount[ i ];
649 wordinfo := Objects[ aWord ];
650 if wordinfo = nil then begin
651 wordinfo := TWordInfo.Create;
652 Objects[ aWord ] := wordinfo;
653 end;
654
655 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
656 if isImportant then begin
657 wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
658 wordinfo.ImportantText := wordinfo.ImportantText + 1;
659 end else begin
660 wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
661 wordinfo.NormalText := wordinfo.NormalText + 1;
662 end;
663 end;
664
665 end;
666
667 //==============================
668 // Forget
669 //==============================
670 procedure TGikoBayesian.Forget(
671 wordCount : TWordCount;
672 isImportant : Boolean );
673 var
674 aWord : string;
675 wordinfo : TWordInfo;
676 countinfo : TWordCountInfo;
677 i : Integer;
678 begin
679
680 for i := 0 to wordCount.Count - 1 do begin
681 aWord := wordCount[ i ];
682 wordinfo := Objects[ aWord ];
683 if wordinfo = nil then
684 Continue;
685
686 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
687 if isImportant then begin
688 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
689 wordinfo.ImportantText := wordinfo.ImportantText - 1;
690 end else begin
691 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;
692 wordinfo.NormalText := wordinfo.NormalText - 1;
693 end;
694 end;
695
696 end;
697
698 end.

Back to OSDN">Back to OSDN
ViewVC Help
Powered by ViewVC 1.1.26