Develop and Download Open Source Software

Browse CVS Repository

Contents of /gikonavigoeson/gikonavi/GikoBayesian.pas

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.8 - (show annotations) (download) (as text)
Thu Oct 21 05:59:39 2004 UTC (19 years, 6 months ago) by yoffy
Branch: MAIN
Changes since 1.7: +9 -5 lines
File MIME type: text/x-pascal
辞書が壊れていてもいいように、値がマイナス値を取れないよう修正。

1 unit GikoBayesian;
2
3 {!
4 \file GikoBayesian.pas
5 \brief ???ゃ?吾?≪?潟???c????/span>
6
7 $Id: GikoBayesian.pas,v 1.7 2004/10/21 04:28:15 yoffy Exp $
8 }
9
10 interface
11
12 //==================================================
13 uses
14 //==================================================
15 Classes;
16
17 //==================================================
18 type
19 //==================================================
20
21 {!***********************************************************
22 \brief ??茯???????????/span>
23 ************************************************************}
24 TWordInfo = class( TObject )
25 private
26 FNormalWord : Integer; //!< ??絽吾????茯????????糸?眼????????/span>
27 FImportantWord : Integer; //!< 羈?????茯????????糸?眼????????/span>
28 FNormalText : Integer; //!< ??絽吾????茯??????????障??????????腴?????/span>
29 FImportantText : Integer; //!< 羈?????茯??????????障??????????腴?????/span>
30
31 public
32 property NormalWord : Integer read FNormalWord write FNormalWord;
33 property ImportantWord : Integer read FImportantWord write FImportantWord;
34 property NormalText : Integer read FNormalText write FNormalText;
35 property ImportantText : Integer read FImportantText write FImportantText;
36 end;
37
38 {!***********************************************************
39 \brief 茹f??羝??水??茯???????????/span>
40 ************************************************************}
41 TWordCountInfo = class( TObject )
42 private
43 FWordCount : Integer; //!< ??茯???/span>
44
45 public
46 property WordCount : Integer read FWordCount write FWordCount;
47 end;
48
49 {!***********************************************************
50 \brief 茹f??羝??水??茯????鴻??
51 ************************************************************}
52 // TWordCount = class( THashedStringList ) // 羶???
53 TWordCount = class( TStringList )
54 public
55 constructor Create;
56 destructor Destroy; override;
57 end;
58
59 {!***********************************************************
60 \brief ???c???帥?≪???眼???冴??
61 ************************************************************}
62 TGikoBayesianAlgorithm =
63 (gbaPaulGraham, gbaGaryRonbinson{, gbaGaryRonbinsonFisher});
64
65 {!***********************************************************
66 \brief ???ゃ?吾?≪?潟???c????/span>
67 ************************************************************}
68 // TGikoBayesian = class( THashedStringList ) // 羶???
69 TGikoBayesian = class( TStringList )
70 private
71 FFilePath : string; //!< 茯??粋昭???????<?ゃ??????/span>
72 function GetObject( const name : string ) : TWordInfo;
73 procedure SetObject( const name : string; value : TWordInfo );
74
75 public
76 constructor Create;
77 destructor Destroy; override;
78
79 //! ???<?ゃ??????絖??絮ユ???茯??水?冴???障??
80 procedure LoadFromFile( const filePath : string );
81
82 //! ???<?ゃ?????膺?絮ユ???篆?絖????障??
83 procedure SaveToFile( const filePath : string );
84
85 //! ???<?ゃ?????膺?絮ユ???篆?絖????障??
86 procedure Save;
87
88 //! ??茯???????????宴????緇????障??
89 property Objects[ const name : string ] : TWordInfo
90 read GetObject write SetObject; default;
91
92 //! ??腴??????障??????茯????????潟?????障??
93 procedure CountWord(
94 const text : string;
95 wordCount : TWordCount );
96
97 {!
98 \brief Paul Graham 羈????冴?ャ??????腴???絵??墾??羆阪????障??
99 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
100 }
101 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
102
103 {!
104 \brief GaryRobinson 羈????冴?ャ??????腴???絵??墾??羆阪????障??
105 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
106 }
107 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
108
109 // function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
110
111 {!
112 \brief ??腴???茹f??
113 \param text 茹f????????腴?
114 \param wordCount 茹f??????????茯????鴻????菴???
115 \param algorithm 羈???墾??浦絎??????????≪???眼???冴??????絎????障??
116 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
117
118 CountWord ? Calcxxxxx ???障???????茵??????????с????
119 }
120 function Parse(
121 const text : string;
122 wordCount : TWordCount;
123 algorithm : TGikoBayesianAlgorithm = gbaGaryRonbinson
124 ) : Extended;
125
126 {!
127 \brief 絖??????
128 \param wordCount Parse ?цВ??????????茯????鴻??
129 \param isImportant 羈??????鴻????腴???????????????? True
130 }
131 procedure Learn(
132 wordCount : TWordCount;
133 isImportant : Boolean );
134
135 {!
136 \brief 絖??腟?????綽?????
137 \param wordCount Parse ?цВ??????????茯????鴻??
138 \param isImportant 羈??????鴻????腴???????????????????????? True
139 \warning 絖??羝??帥????腴???????????∈茯??堺?ャ?障??????<br>
140 Learn ????????????腴??? isImportant ???????c????????腴???
141 Forget ?????????若?帥???若?鴻???贋?????障????<br>
142 絖??羝??帥????????????????????????????????
143
144 ???????膺?腟??????????≪????????с???????障??????<br>
145 wordCount ??緇?????腴? (Parse ? text 綣??? ???膺?腟??????帥?????≪???障????<br><br>
146
147 筝祉??絵????腴?????羈?????腴????????帥????????? Forget -> Learn ?????т戎?????障????
148 }
149 procedure Forget(
150 wordCount : TWordCount;
151 isImportant : Boolean );
152 end;
153
154 //==================================================
155 implementation
156 //==================================================
157
158 uses
159 SysUtils, Math, Windows;
160
161 const
162 GIKO_BAYESIAN_FILE_VERSION = '1.0';
163 {
164 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
165 ModeWGraph, ModeWAlpha, ModeWNum,
166 ModeWHira, ModeWKata, ModeWKanji);
167 }
168 CharMode1 : array [ 0..255 ] of Byte =
169 (
170 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
171 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
172 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
173 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
174 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
175 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
176 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
177 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
178
179 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
181 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
182 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
183 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
184 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
185 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
186 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
187 );
188
189 //************************************************************
190 // misc
191 //************************************************************
192
193 //==============================
194 // RemoveToken
195 //==============================
196 function RemoveToken(var s: string;const delimiter: string): string;
197 var
198 p: Integer;
199 begin
200 p := AnsiPos(delimiter, s);
201 if p = 0 then
202 Result := s
203 else
204 Result := Copy(s, 1, p - 1);
205 s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
206 end;
207
208 //==============================
209 // AbsSort
210 //==============================
211 function AbsSort( p1, p2 : Pointer ) : Integer;
212 var
213 v1, v2 : Single;
214 begin
215
216 v1 := Abs( Single( p1 ) - 0.5 );
217 v2 := Abs( Single( p2 ) - 0.5 );
218 if v1 > v2 then
219 Result := -1
220 else if v1 = v2 then
221 Result := 0
222 else
223 Result := 1;
224
225 end;
226
227 //************************************************************
228 // TWordCount class
229 //************************************************************
230 constructor TWordCount.Create;
231 begin
232
233 Duplicates := dupIgnore;
234 CaseSensitive := True;
235 Sorted := True;
236
237 end;
238
239 destructor TWordCount.Destroy;
240 var
241 i : Integer;
242 begin
243
244 for i := Count - 1 downto 0 do
245 if Objects[ i ] <> nil then
246 Objects[ i ].Free;
247
248 inherited;
249
250 end;
251
252 //************************************************************
253 // TGikoBayesian class
254 //************************************************************
255
256 //==============================
257 // Create
258 //==============================
259 constructor TGikoBayesian.Create;
260 begin
261
262 Duplicates := dupIgnore;
263 CaseSensitive := True;
264 Sorted := True;
265
266 end;
267
268 //==============================
269 // Destroy
270 //==============================
271 destructor TGikoBayesian.Destroy;
272 var
273 i : Integer;
274 begin
275
276 for i := Count - 1 downto 0 do
277 if inherited Objects[ i ] <> nil then
278 inherited Objects[ i ].Free;
279
280 inherited;
281
282 end;
283
284 procedure TGikoBayesian.LoadFromFile( const filePath : string );
285 var
286 i : Integer;
287 sl : TStringList;
288 s : string;
289 name : string;
290 info : TWordInfo;
291 begin
292
293 FFilePath := filePath;
294
295 if not FileExists( filePath ) then
296 Exit;
297
298 sl := TStringList.Create;
299 try
300 sl.LoadFromFile( filePath );
301
302 for i := 1 to sl.Count - 1 do begin
303 s := sl[ i ];
304 name := RemoveToken( s, #1 );
305 info := TWordInfo.Create;
306 info.NormalWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
307 info.ImportantWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
308 info.NormalText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
309 info.ImportantText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
310
311 AddObject( name, info );
312 end;
313 finally
314 sl.Free;
315 end;
316
317 end;
318
319 procedure TGikoBayesian.SaveToFile( const filePath : string );
320 var
321 i : Integer;
322 sl : TStringList;
323 s : string;
324 info : TWordInfo;
325 begin
326
327 FFilePath := filePath;
328
329 sl := TStringList.Create;
330 try
331 sl.BeginUpdate;
332 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
333
334 for i := 0 to Count - 1 do begin
335 info := TWordInfo( inherited Objects[ i ] );
336 s := Strings[ i ] + #1
337 + Format('%x', [info.NormalWord]) + #1
338 + Format('%x', [info.ImportantWord]) + #1
339 + Format('%x', [info.NormalText]) + #1
340 + Format('%x', [info.ImportantText]);
341
342 sl.Add(s);
343 end;
344 sl.EndUpdate;
345 sl.SaveToFile( filePath );
346 finally
347 sl.Free;
348 end;
349
350 end;
351
352 procedure TGikoBayesian.Save;
353 begin
354
355 if FFilePath <> '' then
356 SaveToFile( FFilePath );
357
358 end;
359
360 //==============================
361 // GetObject
362 //==============================
363 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
364 var
365 idx : Integer;
366 begin
367
368 idx := IndexOf( name ); // 羶???
369 if idx < 0 then
370 Result := nil
371 else
372 Result := TWordInfo( inherited Objects[ idx ] );
373
374 end;
375
376 //==============================
377 // SetObject
378 //==============================
379 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
380 var
381 idx : Integer;
382 begin
383
384 idx := IndexOf( name );
385 if idx < 0 then
386 AddObject( name, value )
387 else
388 inherited Objects[ idx ] := value;
389
390 end;
391
392
393 //==============================
394 // CountWord
395 //==============================
396 procedure TGikoBayesian.CountWord(
397 const text : string;
398 wordCount : TWordCount );
399 type
400 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
401 ModeWGraph, ModeWAlpha, ModeWNum,
402 ModeWHira, ModeWKata, ModeWKanji);
403 var
404 p, tail, last : PChar;
405 mode, newMode : Modes;
406 aWord : string;
407 ch : Longword;
408 chSize : Integer;
409 delimiter : TStringList;
410 delimited : Boolean;
411 i, idx : Integer;
412 countInfo : TWordCountInfo;
413 const
414 KAKUJOSI = '??' + #10 + '??#39; + #10 + '??' + #10 + '??#39; + #10 + '????' +
415 #10 + '??#39; + #10 + '??#39; + #10 + '????' + #10 + '?障??#39;;
416 kKanji = [$80..$A0, $E0..$ff];
417 begin
418
419 delimiter := TStringList.Create;
420 try
421 mode := ModeWhite;
422 delimiter.Text := KAKUJOSI;
423 p := PChar( text );
424 tail := p + Length( text );
425 last := p;
426
427 while p < tail do begin
428 delimited := False;
429 // ??絖????帥?ゃ?????ゅ??/span>
430 // ?糸???鴻? ModeGraph ?????????у???ャ???綽??????????????
431 // if Byte(Byte( p^ ) - $a1) < $5e then begin
432 if Byte( p^ ) in kKanji then begin
433 if p + 1 < tail then begin
434 ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
435 case ch of
436 $8140: newMode := ModeWhite;
437 $8141..$824e: newMode := ModeWGraph;
438 $824f..$8258: newMode := ModeWNum;
439 $8260..$829a: newMode := ModeWAlpha;
440 $829f..$82f1: newMode := ModeWHira;
441 $8340..$8396: newMode := ModeWKata;
442 else newMode := ModeWKanji;
443 end;
444 // '??????#39; ??抗篁??????障???????帥?????????障????
445 if (mode = ModeWHira) or (mode = ModeWKata) then
446 if (ch = $814a) or (ch = $814b) or (ch = $815b) then
447 newMode := mode;
448 end else begin
449 newMode := ModeWhite;
450 end;
451
452 chSize := 2;
453
454 // ?阪????????????絖?????????罎??祉????
455 if p + 3 < tail then begin // 3 = delimiter ????紊у?? - 1
456 for i := 0 to delimiter.Count - 1 do begin
457 if CompareMem(
458 p, PChar( delimiter[ i ] ), Length( delimiter[ i ] ) ) then begin
459 delimited := True;
460 chSize := Length( delimiter[ i ] );
461 Break;
462 end;
463 end;
464 end;
465 end else begin
466 newMode := Modes( CharMode1[ Byte( p^ ) ] );
467
468 chSize := 1;
469 end;
470
471 if (mode <> newMode) or delimited then begin
472
473 // ??絖????帥?ゃ????紊??眼??????
474 // ????????阪????????????絖???????????
475 if mode <> ModeWhite then begin
476 SetLength( aWord, p - last );
477 CopyMemory( PChar( aWord ), last, p - last );
478 //aWord := Copy( last, 0, p - last );
479 idx := wordCount.IndexOf( aWord ); // ??
480 if idx < 0 then begin
481 countInfo := TWordCountInfo.Create;
482 wordCount.AddObject( aWord, countInfo );
483 end else begin
484 countInfo := TWordCountInfo( wordCount.Objects[ idx ] );
485 end;
486 countInfo.WordCount := countInfo.WordCount + 1;
487 end;
488
489 last := p;
490 mode := newMode;
491
492 end;
493
494 p := p + chSize;
495 end; // while
496
497 if mode <> ModeWhite then begin
498 aWord := Copy( last, 0, p - last );
499 idx := wordCount.IndexOf( aWord );
500 if idx < 0 then begin
501 countInfo := TWordCountInfo.Create;
502 wordCount.AddObject( aWord, countInfo );
503 end else begin
504 countInfo := TWordCountInfo( wordCount.Objects[ idx ] );
505 end;
506 countInfo.WordCount := countInfo.WordCount + 1;
507 end;
508 finally
509 delimiter.Free;
510 end;
511
512 end;
513
514 //==============================
515 // CalcPaulGraham
516 //==============================
517 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
518
519 function p( const aWord : string ) : Single;
520 var
521 info : TWordInfo;
522 begin
523 info := Objects[ aWord ];
524 if info = nil then
525 Result := 0.4
526 else if info.NormalWord = 0 then
527 Result := 0.99
528 else if info.ImportantWord = 0 then
529 Result := 0.01
530 else
531 Result := ( info.ImportantWord / info.ImportantText ) /
532 ((info.NormalWord * 2 / info.NormalText ) +
533 (info.ImportantWord / info.ImportantText));
534 end;
535
536 var
537 s, q : Extended;
538 i : Integer;
539 narray : TList;
540 const
541 SAMPLE_COUNT = 15;
542 begin
543
544 Result := 1;
545 if wordCount.Count = 0 then
546 Exit;
547
548 narray := TList.Create;
549 try
550 for i := 0 to wordCount.Count - 1 do begin
551 narray.Add( Pointer( p( wordCount[ i ] ) ) );
552 end;
553
554 narray.Sort( AbsSort );
555
556 s := 1;
557 q := 1;
558 i := min( SAMPLE_COUNT, narray.Count );
559 while i > 0 do begin
560 Dec( i );
561 s := s * Single( narray[ i ] );
562 q := q * (1 - Single( narray[ i ] ));
563 end;
564
565 Result := s / (s + q);
566 finally
567 narray.Free;
568 end;
569
570 end;
571
572 //==============================
573 // CalcGaryRobinson
574 //==============================
575 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
576
577 function p( const aWord : string ) : Single;
578 var
579 info : TWordInfo;
580 begin
581 info := Objects[ aWord ];
582 if info = nil then
583 Result := 0.415
584 else if info.ImportantWord = 0 then
585 Result := 0.0001
586 else if info.NormalWord = 0 then
587 Result := 0.9999
588 else
589 Result := ( info.ImportantWord / info.ImportantText ) /
590 ((info.NormalWord / info.NormalText ) +
591 (info.ImportantWord / info.ImportantText));
592 end;
593
594 function f( cnt : Integer; n, mean : Single ) : Extended;
595 const
596 k = 0.00001;
597 begin
598 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
599 end;
600
601 var
602 n : Extended;
603 narray : array of Single;
604 mean : Extended;
605 countInfo : TWordCountInfo;
606 i : Integer;
607 normal : Extended;
608 important : Extended;
609 cnt : Extended;
610 begin
611
612 if wordCount.Count = 0 then begin
613 Result := 1;
614 Exit;
615 end;
616
617 SetLength( narray, wordCount.Count );
618 mean := 0;
619 for i := 0 to wordCount.Count - 1 do begin
620 n := p( wordCount[ i ] );
621 narray[ i ] := n;
622 mean := mean + n;
623 end;
624 mean := mean / wordCount.Count;
625
626 cnt := 0;
627 normal := 1;
628 important := 1;
629 for i := 0 to wordCount.Count - 1 do begin
630 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
631 n := f( countInfo.WordCount, narray[ i ], mean );
632 normal := normal * n;
633 important := important * (1 - n);
634 if countInfo <> nil then
635 cnt := cnt + countInfo.WordCount;
636 end;
637 if cnt = 0 then
638 cnt := 1;
639 normal := 1 - Exp( Ln( normal ) * (1 / cnt) );
640 important := 1 - Exp( Ln( important ) * (1 / cnt) );
641
642 n := (important - normal+ 0.00001) / (important + normal + 0.00001);
643 Result := (1 + n) / 2;
644
645 end;
646
647 //==============================
648 // Parse
649 //==============================
650 function TGikoBayesian.Parse(
651 const text : string;
652 wordCount : TWordCount;
653 algorithm : TGikoBayesianAlgorithm = gbaGaryRonbinson
654 ) : Extended;
655 begin
656
657 CountWord( text, wordCount );
658 case algorithm of
659 gbaPaulGraham: Result := CalcPaulGraham( wordCount );
660 gbaGaryRonbinson: Result := CalcGaryRobinson( wordCount );
661 else Result := 0;
662 end;
663
664 end;
665
666 //==============================
667 // Learn
668 //==============================
669 procedure TGikoBayesian.Learn(
670 wordCount : TWordCount;
671 isImportant : Boolean );
672 var
673 aWord : string;
674 wordinfo : TWordInfo;
675 countinfo : TWordCountInfo;
676 i : Integer;
677 begin
678
679 for i := 0 to wordCount.Count - 1 do begin
680 aWord := wordCount[ i ];
681 wordinfo := Objects[ aWord ];
682 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
683 if wordinfo = nil then begin
684 wordinfo := TWordInfo.Create;
685 Objects[ aWord ] := wordinfo;
686 end;
687
688 if isImportant then begin
689 wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
690 wordinfo.ImportantText := wordinfo.ImportantText + 1;
691 end else begin
692 wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
693 wordinfo.NormalText := wordinfo.NormalText + 1;
694 end;
695 end;
696
697 end;
698
699 //==============================
700 // Forget
701 //==============================
702 procedure TGikoBayesian.Forget(
703 wordCount : TWordCount;
704 isImportant : Boolean );
705 var
706 aWord : string;
707 wordinfo : TWordInfo;
708 countinfo : TWordCountInfo;
709 i : Integer;
710 begin
711
712 for i := 0 to wordCount.Count - 1 do begin
713 aWord := wordCount[ i ];
714 wordinfo := Objects[ aWord ];
715 if wordinfo = nil then
716 Continue;
717
718 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
719 if isImportant then begin
720 if wordInfo.ImportantText > 0 then begin
721 wordinfo.ImportantText := wordinfo.ImportantText - 1;
722 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
723 end;
724 end else begin
725 if wordinfo.NormalText > 0 then begin
726 wordinfo.NormalText := wordinfo.NormalText - 1;
727 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;
728 end;
729 end;
730 end;
731
732 end;
733
734 end.

Back to OSDN">Back to OSDN
ViewVC Help
Powered by ViewVC 1.1.26