Develop and Download Open Source Software

Browse CVS Repository

Contents of /gikonavigoeson/gikonavi/GikoBayesian.pas

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.5 - (show annotations) (download) (as text)
Thu Oct 21 03:46:57 2004 UTC (19 years, 6 months ago) by yoffy
Branch: MAIN
CVS Tags: v1_50_0_561
Changes since 1.4: +5 -3 lines
File MIME type: text/x-pascal
漢字の取り扱いがおかしいので戻した。

1 unit GikoBayesian;
2
3 {!
4 \file GikoBayesian.pas
5 \brief ???ゃ?吾?≪?潟???c????/span>
6
7 $Id: GikoBayesian.pas,v 1.4 2004/10/21 03:18:44 yoffy Exp $
8 }
9
10 interface
11
12 //==================================================
13 uses
14 //==================================================
15 Classes;
16
17 //==================================================
18 type
19 //==================================================
20
21 {!***********************************************************
22 \brief ??茯???????????/span>
23 ************************************************************}
24 TWordInfo = class( TObject )
25 private
26 FNormalWord : Integer; //!< ??絽吾????茯????????糸?眼????????/span>
27 FImportantWord : Integer; //!< 羈?????茯????????糸?眼????????/span>
28 FNormalText : Integer; //!< ??絽吾????茯??????????障??????????腴?????/span>
29 FImportantText : Integer; //!< 羈?????茯??????????障??????????腴?????/span>
30
31 public
32 property NormalWord : Integer read FNormalWord write FNormalWord;
33 property ImportantWord : Integer read FImportantWord write FImportantWord;
34 property NormalText : Integer read FNormalText write FNormalText;
35 property ImportantText : Integer read FImportantText write FImportantText;
36 end;
37
38 {!***********************************************************
39 \brief 茹f??羝??水??茯???????????/span>
40 ************************************************************}
41 TWordCountInfo = class( TObject )
42 private
43 FWordCount : Integer; //!< ??茯???/span>
44
45 public
46 property WordCount : Integer read FWordCount write FWordCount;
47 end;
48
49 {!***********************************************************
50 \brief 茹f??羝??水??茯????鴻??
51 ************************************************************}
52 // TWordCount = class( THashedStringList ) // 羶???
53 TWordCount = class( TStringList )
54 public
55 constructor Create;
56 destructor Destroy; override;
57 end;
58
59 {!***********************************************************
60 \brief ???c???帥?≪???眼???冴??
61 ************************************************************}
62 TGikoBayesianAlgorithm =
63 (gbaPaulGraham, gbaGaryRonbinson{, gbaGaryRonbinsonFisher});
64
65 {!***********************************************************
66 \brief ???ゃ?吾?≪?潟???c????/span>
67 ************************************************************}
68 // TGikoBayesian = class( THashedStringList ) // 羶???
69 TGikoBayesian = class( TStringList )
70 private
71 FFilePath : string; //!< 茯??粋昭???????<?ゃ??????/span>
72 function GetObject( const name : string ) : TWordInfo;
73 procedure SetObject( const name : string; value : TWordInfo );
74
75 public
76 constructor Create;
77 destructor Destroy; override;
78
79 //! ???<?ゃ??????絖??絮ユ???茯??水?冴???障??
80 procedure LoadFromFile( const filePath : string );
81
82 //! ???<?ゃ?????膺?絮ユ???篆?絖????障??
83 procedure SaveToFile( const filePath : string );
84
85 //! ???<?ゃ?????膺?絮ユ???篆?絖????障??
86 procedure Save;
87
88 //! ??茯???????????宴????緇????障??
89 property Objects[ const name : string ] : TWordInfo
90 read GetObject write SetObject; default;
91
92 //! ??腴??????障??????茯????????潟?????障??
93 procedure CountWord(
94 const text : string;
95 wordCount : TWordCount );
96
97 {!
98 \brief Paul Graham 羈????冴?ャ??????腴???絵??墾??羆阪????障??
99 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
100 }
101 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
102
103 {!
104 \brief GaryRobinson 羈????冴?ャ??????腴???絵??墾??羆阪????障??
105 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
106 }
107 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
108
109 // function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
110
111 {!
112 \brief ??腴???茹f??
113 \param text 茹f????????腴?
114 \param wordCount 茹f??????????茯????鴻????菴???
115 \param algorithm 羈???墾??浦絎??????????≪???眼???冴??????絎????障??
116 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
117
118 CountWord ? Calcxxxxx ???障???????茵??????????с????
119 }
120 function Parse(
121 const text : string;
122 wordCount : TWordCount;
123 algorithm : TGikoBayesianAlgorithm = gbaGaryRonbinson
124 ) : Extended;
125
126 {!
127 \brief 絖??????
128 \param wordCount Parse ?цВ??????????茯????鴻??
129 \param isImportant 羈??????鴻????腴???????????????? True
130 }
131 procedure Learn(
132 wordCount : TWordCount;
133 isImportant : Boolean );
134
135 {!
136 \brief 絖??腟?????綽?????
137 \param wordCount Parse ?цВ??????????茯????鴻??
138 \param isImportant 羈??????鴻????腴???????????????????????? True
139 \warning 絖??羝??帥????腴???????????∈茯??堺?ャ?障??????<br>
140 Learn ????????????腴??? isImportant ???????c????????腴???
141 Forget ?????????若?帥???若?鴻???贋?????障????<br>
142 絖??羝??帥????????????????????????????????
143
144 ???????膺?腟??????????≪????????с???????障??????<br>
145 wordCount ??緇?????腴? (Parse ? text 綣??? ???膺?腟??????帥?????≪???障????<br><br>
146
147 筝祉??絵????腴?????羈?????腴????????帥????????? Forget -> Learn ?????т戎?????障????
148 }
149 procedure Forget(
150 wordCount : TWordCount;
151 isImportant : Boolean );
152 end;
153
154 //==================================================
155 implementation
156 //==================================================
157
158 uses
159 SysUtils, Math, Windows;
160
161 const
162 GIKO_BAYESIAN_FILE_VERSION = '1.0';
163 {
164 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
165 ModeWGraph, ModeWAlpha, ModeWNum,
166 ModeWHira, ModeWKata, ModeWKanji);
167 }
168 CharMode1 : array [ 0..255 ] of Byte =
169 (
170 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
171 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
172 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
173 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
174 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
175 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
176 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
177 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
178
179 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
181 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
182 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
183 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
184 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 1,
185 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
186 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
187 );
188
189 //************************************************************
190 // misc
191 //************************************************************
192
193 //==============================
194 // RemoveToken
195 //==============================
196 function RemoveToken(var s: string;const delimiter: string): string;
197 var
198 p: Integer;
199 begin
200 p := AnsiPos(delimiter, s);
201 if p = 0 then
202 Result := s
203 else
204 Result := Copy(s, 1, p - 1);
205 s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
206 end;
207
208 //==============================
209 // AbsSort
210 //==============================
211 function AbsSort( p1, p2 : Pointer ) : Integer;
212 var
213 v1, v2 : Single;
214 begin
215
216 v1 := Abs( Single( p1 ) - 0.5 );
217 v2 := Abs( Single( p2 ) - 0.5 );
218 if v1 > v2 then
219 Result := -1
220 else if v1 = v2 then
221 Result := 0
222 else
223 Result := 1;
224
225 end;
226
227 //************************************************************
228 // TWordCount class
229 //************************************************************
230 constructor TWordCount.Create;
231 begin
232
233 Duplicates := dupIgnore;
234 CaseSensitive := True;
235 Sorted := True;
236
237 end;
238
239 destructor TWordCount.Destroy;
240 var
241 i : Integer;
242 begin
243
244 for i := Count - 1 downto 0 do
245 if Objects[ i ] <> nil then
246 Objects[ i ].Free;
247
248 inherited;
249
250 end;
251
252 //************************************************************
253 // TGikoBayesian class
254 //************************************************************
255
256 //==============================
257 // Create
258 //==============================
259 constructor TGikoBayesian.Create;
260 begin
261
262 Duplicates := dupIgnore;
263 CaseSensitive := True;
264 Sorted := True;
265
266 end;
267
268 //==============================
269 // Destroy
270 //==============================
271 destructor TGikoBayesian.Destroy;
272 var
273 i : Integer;
274 begin
275
276 for i := Count - 1 downto 0 do
277 if inherited Objects[ i ] <> nil then
278 inherited Objects[ i ].Free;
279
280 inherited;
281
282 end;
283
284 procedure TGikoBayesian.LoadFromFile( const filePath : string );
285 var
286 i : Integer;
287 sl : TStringList;
288 s : string;
289 name : string;
290 info : TWordInfo;
291 begin
292
293 FFilePath := filePath;
294
295 if not FileExists( filePath ) then
296 Exit;
297
298 sl := TStringList.Create;
299 try
300 sl.LoadFromFile( filePath );
301
302 for i := 1 to sl.Count - 1 do begin
303 s := sl[ i ];
304 name := RemoveToken( s, #1 );
305 info := TWordInfo.Create;
306 info.NormalWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
307 info.ImportantWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
308 info.NormalText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
309 info.ImportantText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
310
311 AddObject( name, info );
312 end;
313 finally
314 sl.Free;
315 end;
316
317 end;
318
319 procedure TGikoBayesian.SaveToFile( const filePath : string );
320 var
321 i : Integer;
322 sl : TStringList;
323 s : string;
324 info : TWordInfo;
325 begin
326
327 FFilePath := filePath;
328
329 sl := TStringList.Create;
330 try
331 sl.BeginUpdate;
332 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
333
334 for i := 0 to Count - 1 do begin
335 info := TWordInfo( inherited Objects[ i ] );
336 s := Strings[ i ] + #1
337 + Format('%x', [info.NormalWord]) + #1
338 + Format('%x', [info.ImportantWord]) + #1
339 + Format('%x', [info.NormalText]) + #1
340 + Format('%x', [info.ImportantText]);
341
342 sl.Add(s);
343 end;
344 sl.EndUpdate;
345 sl.SaveToFile( filePath );
346 finally
347 sl.Free;
348 end;
349
350 end;
351
352 procedure TGikoBayesian.Save;
353 begin
354
355 if FFilePath <> '' then
356 SaveToFile( FFilePath );
357
358 end;
359
360 //==============================
361 // GetObject
362 //==============================
363 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
364 var
365 idx : Integer;
366 begin
367
368 idx := IndexOf( name ); // 羶???
369 if idx < 0 then
370 Result := nil
371 else
372 Result := TWordInfo( inherited Objects[ idx ] );
373
374 end;
375
376 //==============================
377 // SetObject
378 //==============================
379 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
380 var
381 idx : Integer;
382 begin
383
384 idx := IndexOf( name );
385 if idx < 0 then
386 AddObject( name, value )
387 else
388 inherited Objects[ idx ] := value;
389
390 end;
391
392
393 //==============================
394 // CountWord
395 //==============================
396 procedure TGikoBayesian.CountWord(
397 const text : string;
398 wordCount : TWordCount );
399 type
400 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
401 ModeWGraph, ModeWAlpha, ModeWNum,
402 ModeWHira, ModeWKata, ModeWKanji);
403 var
404 p, tail, last : PChar;
405 mode, newMode : Modes;
406 aWord : string;
407 ch : Longword;
408 chSize : Integer;
409 delimiter : TStringList;
410 delimited : Boolean;
411 i, idx : Integer;
412 countInfo : TWordCountInfo;
413 const
414 KAKUJOSI = '??' + #10 + '??#39; + #10 + '??' + #10 + '??#39; + #10 + '????' +
415 #10 + '??#39; + #10 + '??#39; + #10 + '????' + #10 + '?障??#39;;
416 kKanji = [$80..$A0, $E0..$ff];
417 begin
418
419 delimiter := TStringList.Create;
420 try
421 mode := ModeWhite;
422 delimiter.Text := KAKUJOSI;
423 p := PChar( text );
424 tail := p + Length( text );
425 last := p;
426
427 while p < tail do begin
428 delimited := False;
429 // ??絖????帥?ゃ?????ゅ??/span>
430 // ?糸???鴻? ModeGraph ?????????у???ャ???綽??????????????
431 // if Byte(Byte( p^ ) - $a1) < $5e then begin
432 if Byte( p^ ) in kKanji then begin
433 if p + 1 < tail then begin
434 ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
435 case ch of
436 $8140: newMode := ModeWhite;
437 $8141..$824e: newMode := ModeWGraph;
438 $824f..$8258: newMode := ModeWNum;
439 $8260..$829a: newMode := ModeWAlpha;
440 $829f..$82f1: newMode := ModeWHira;
441 $8340..$8396: newMode := ModeWKata;
442 else newMode := ModeWKanji;
443 end;
444 end else begin
445 newMode := ModeWhite;
446 end;
447
448 chSize := 2;
449
450 // ?阪????????????絖?????????罎??祉????
451 if p + 3 < tail then begin // 3 = delimiter ????紊у?? - 1
452 for i := 0 to delimiter.Count - 1 do begin
453 if CompareMem(
454 p, PChar( delimiter[ i ] ), Length( delimiter[ i ] ) ) then begin
455 delimited := True;
456 chSize := Length( delimiter[ i ] );
457 Break;
458 end;
459 end;
460 end;
461 end else begin
462 newMode := Modes( CharMode1[ Byte( p^ ) ] );
463
464 chSize := 1;
465 end;
466
467 if (mode <> newMode) or delimited then begin
468
469 // ??絖????帥?ゃ????紊??眼??????
470 // ????????阪????????????絖???????????
471 if mode <> ModeWhite then begin
472 SetLength( aWord, p - last );
473 CopyMemory( PChar( aWord ), last, p - last );
474 //aWord := Copy( last, 0, p - last );
475 idx := wordCount.IndexOf( aWord ); // ??
476 if idx < 0 then begin
477 countInfo := TWordCountInfo.Create;
478 wordCount.AddObject( aWord, countInfo );
479 end else begin
480 countInfo := TWordCountInfo( wordCount.Objects[ idx ] );
481 end;
482 countInfo.WordCount := countInfo.WordCount + 1;
483 end;
484
485 last := p;
486 mode := newMode;
487
488 end;
489
490 p := p + chSize;
491 end; // while
492
493 if mode <> ModeWhite then begin
494 aWord := Copy( last, 0, p - last );
495 idx := wordCount.IndexOf( aWord );
496 if idx < 0 then begin
497 countInfo := TWordCountInfo.Create;
498 wordCount.AddObject( aWord, countInfo );
499 end else begin
500 countInfo := TWordCountInfo( wordCount.Objects[ idx ] );
501 end;
502 countInfo.WordCount := countInfo.WordCount + 1;
503 end;
504 finally
505 delimiter.Free;
506 end;
507
508 end;
509
510 //==============================
511 // CalcPaulGraham
512 //==============================
513 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
514
515 function p( const aWord : string ) : Single;
516 var
517 info : TWordInfo;
518 begin
519 info := Objects[ aWord ];
520 if info = nil then
521 Result := 0.4
522 else if info.NormalWord = 0 then
523 Result := 0.99
524 else if info.ImportantWord = 0 then
525 Result := 0.01
526 else
527 Result := ( info.ImportantWord / info.ImportantText ) /
528 ((info.NormalWord * 2 / info.NormalText ) +
529 (info.ImportantWord / info.ImportantText));
530 end;
531
532 var
533 s, q : Extended;
534 i : Integer;
535 narray : TList;
536 const
537 SAMPLE_COUNT = 15;
538 begin
539
540 Result := 1;
541 if wordCount.Count = 0 then
542 Exit;
543
544 narray := TList.Create;
545 try
546 for i := 0 to wordCount.Count - 1 do begin
547 narray.Add( Pointer( p( wordCount[ i ] ) ) );
548 end;
549
550 narray.Sort( AbsSort );
551
552 s := 1;
553 q := 1;
554 i := min( SAMPLE_COUNT, narray.Count );
555 while i > 0 do begin
556 Dec( i );
557 s := s * Single( narray[ i ] );
558 q := q * (1 - Single( narray[ i ] ));
559 end;
560
561 Result := s / (s + q);
562 finally
563 narray.Free;
564 end;
565
566 end;
567
568 //==============================
569 // CalcGaryRobinson
570 //==============================
571 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
572
573 function p( const aWord : string ) : Single;
574 var
575 info : TWordInfo;
576 begin
577 info := Objects[ aWord ];
578 if info = nil then
579 Result := 0.415
580 else if info.ImportantWord = 0 then
581 Result := 0.0001
582 else if info.NormalWord = 0 then
583 Result := 0.9999
584 else
585 Result := ( info.ImportantWord / info.ImportantText ) /
586 ((info.NormalWord / info.NormalText ) +
587 (info.ImportantWord / info.ImportantText));
588 end;
589
590 function f( cnt : Integer; n, mean : Single ) : Extended;
591 const
592 k = 0.00001;
593 begin
594 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
595 end;
596
597 var
598 n : Extended;
599 narray : array of Single;
600 mean : Extended;
601 countInfo : TWordCountInfo;
602 i : Integer;
603 normal : Extended;
604 important : Extended;
605 cnt : Extended;
606 begin
607
608 if wordCount.Count = 0 then begin
609 Result := 1;
610 Exit;
611 end;
612
613 SetLength( narray, wordCount.Count );
614 mean := 0;
615 for i := 0 to wordCount.Count - 1 do begin
616 n := p( wordCount[ i ] );
617 narray[ i ] := n;
618 mean := mean + n;
619 end;
620 mean := mean / wordCount.Count;
621
622 cnt := 0;
623 normal := 1;
624 important := 1;
625 for i := 0 to wordCount.Count - 1 do begin
626 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
627 n := f( countInfo.WordCount, narray[ i ], mean );
628 normal := normal * n;
629 important := important * (1 - n);
630 if countInfo <> nil then
631 cnt := cnt + countInfo.WordCount;
632 end;
633 if cnt = 0 then
634 cnt := 1;
635 normal := 1 - Exp( Ln( normal ) * (1 / cnt) );
636 important := 1 - Exp( Ln( important ) * (1 / cnt) );
637
638 n := (important - normal+ 0.00001) / (important + normal + 0.00001);
639 Result := (1 + n) / 2;
640
641 end;
642
643 //==============================
644 // Parse
645 //==============================
646 function TGikoBayesian.Parse(
647 const text : string;
648 wordCount : TWordCount;
649 algorithm : TGikoBayesianAlgorithm = gbaGaryRonbinson
650 ) : Extended;
651 begin
652
653 CountWord( text, wordCount );
654 case algorithm of
655 gbaPaulGraham: Result := CalcPaulGraham( wordCount );
656 gbaGaryRonbinson: Result := CalcGaryRobinson( wordCount );
657 else Result := 0;
658 end;
659
660 end;
661
662 //==============================
663 // Learn
664 //==============================
665 procedure TGikoBayesian.Learn(
666 wordCount : TWordCount;
667 isImportant : Boolean );
668 var
669 aWord : string;
670 wordinfo : TWordInfo;
671 countinfo : TWordCountInfo;
672 i : Integer;
673 begin
674
675 for i := 0 to wordCount.Count - 1 do begin
676 aWord := wordCount[ i ];
677 wordinfo := Objects[ aWord ];
678 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
679 if wordinfo = nil then begin
680 wordinfo := TWordInfo.Create;
681 Objects[ aWord ] := wordinfo;
682 end;
683
684 if isImportant then begin
685 wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
686 wordinfo.ImportantText := wordinfo.ImportantText + 1;
687 end else begin
688 wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
689 wordinfo.NormalText := wordinfo.NormalText + 1;
690 end;
691 end;
692
693 end;
694
695 //==============================
696 // Forget
697 //==============================
698 procedure TGikoBayesian.Forget(
699 wordCount : TWordCount;
700 isImportant : Boolean );
701 var
702 aWord : string;
703 wordinfo : TWordInfo;
704 countinfo : TWordCountInfo;
705 i : Integer;
706 begin
707
708 for i := 0 to wordCount.Count - 1 do begin
709 aWord := wordCount[ i ];
710 wordinfo := Objects[ aWord ];
711 if wordinfo = nil then
712 Continue;
713
714 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
715 if isImportant then begin
716 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
717 wordinfo.ImportantText := wordinfo.ImportantText - 1;
718 end else begin
719 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;
720 wordinfo.NormalText := wordinfo.NormalText - 1;
721 end;
722 end;
723
724 end;
725
726 end.

Back to OSDN">Back to OSDN
ViewVC Help
Powered by ViewVC 1.1.26