Develop and Download Open Source Software

Browse CVS Repository

Contents of /gikonavigoeson/gikonavi/GikoBayesian.pas

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.13 - (show annotations) (download) (as text)
Mon Nov 1 05:18:21 2004 UTC (19 years, 5 months ago) by yoffy
Branch: MAIN
Changes since 1.12: +8 -6 lines
File MIME type: text/x-pascal
前回のコミットで ModeWhite のチェックが抜けてしまっていたので修正。

1 unit GikoBayesian;
2
3 {!
4 \file GikoBayesian.pas
5 \brief ???ゃ?吾?≪?潟???c????/span>
6
7 $Id: GikoBayesian.pas,v 1.12 2004/11/01 04:45:25 yoffy Exp $
8 }
9
10 interface
11
12 //==================================================
13 uses
14 //==================================================
15 Classes;
16
17 //==================================================
18 type
19 //==================================================
20
21 {!***********************************************************
22 \brief ??茯???????????/span>
23 ************************************************************}
24 TWordInfo = class( TObject )
25 private
26 FNormalWord : Integer; //!< ??絽吾????茯????????糸?眼????????/span>
27 FImportantWord : Integer; //!< 羈?????茯????????糸?眼????????/span>
28 FNormalText : Integer; //!< ??絽吾????茯??????????障??????????腴?????/span>
29 FImportantText : Integer; //!< 羈?????茯??????????障??????????腴?????/span>
30
31 public
32 property NormalWord : Integer read FNormalWord write FNormalWord;
33 property ImportantWord : Integer read FImportantWord write FImportantWord;
34 property NormalText : Integer read FNormalText write FNormalText;
35 property ImportantText : Integer read FImportantText write FImportantText;
36 end;
37
38 {!***********************************************************
39 \brief 茹f??羝??水??茯???????????/span>
40 ************************************************************}
41 TWordCountInfo = class( TObject )
42 private
43 FWordCount : Integer; //!< ??茯???/span>
44
45 public
46 property WordCount : Integer read FWordCount write FWordCount;
47 end;
48
49 {!***********************************************************
50 \brief 茹f??羝??水??茯????鴻??
51 ************************************************************}
52 // TWordCount = class( THashedStringList ) // 羶???
53 TWordCount = class( TStringList )
54 public
55 constructor Create;
56 destructor Destroy; override;
57 end;
58
59 {!***********************************************************
60 \brief ???c???帥?≪???眼???冴??
61 ************************************************************}
62 TGikoBayesianAlgorithm =
63 (gbaPaulGraham, gbaGaryRobinson, gbaGaryRobinsonFisher);
64
65 {!***********************************************************
66 \brief ???ゃ?吾?≪?潟???c????/span>
67 ************************************************************}
68 // TGikoBayesian = class( THashedStringList ) // 羶???
69 TGikoBayesian = class( TStringList )
70 private
71 FFilePath : string; //!< 茯??粋昭???????<?ゃ??????/span>
72 function GetObject( const name : string ) : TWordInfo;
73 procedure SetObject( const name : string; value : TWordInfo );
74
75 public
76 constructor Create;
77 destructor Destroy; override;
78
79 //! ???<?ゃ??????絖??絮ユ???茯??水?冴???障??
80 procedure LoadFromFile( const filePath : string );
81
82 //! ???<?ゃ?????膺?絮ユ???篆?絖????障??
83 procedure SaveToFile( const filePath : string );
84
85 //! ???<?ゃ?????膺?絮ユ???篆?絖????障??
86 procedure Save;
87
88 //! ??茯???????????宴????緇????障??
89 property Objects[ const name : string ] : TWordInfo
90 read GetObject write SetObject; default;
91
92 //! ??腴??????障??????茯????????潟?????障??
93 procedure CountWord(
94 const text : string;
95 wordCount : TWordCount );
96
97 {!
98 \brief Paul Graham 羈????冴?ャ??????腴???絵??墾??羆阪????障??
99 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
100 }
101 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
102
103 {!
104 \brief GaryRobinson 羈????冴?ャ??????腴???絵??墾??羆阪????障??
105 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
106 }
107 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
108
109 {!
110 \brief GaryRobinson-Fisher 羈????冴?ャ??????腴???絵??墾??羆阪????障??
111 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
112 }
113 function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
114
115 {!
116 \brief ??腴???茹f??
117 \param text 茹f????????腴?
118 \param wordCount 茹f??????????茯????鴻????菴???
119 \param algorithm 羈???墾??浦絎??????????≪???眼???冴??????絎????障??
120 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
121
122 CountWord ? Calcxxxxx ???障???????茵??????????с????
123 }
124 function Parse(
125 const text : string;
126 wordCount : TWordCount;
127 algorithm : TGikoBayesianAlgorithm = gbaGaryRobinsonFisher
128 ) : Extended;
129
130 {!
131 \brief 絖??????
132 \param wordCount Parse ?цВ??????????茯????鴻??
133 \param isImportant 羈??????鴻????腴???????????????? True
134 }
135 procedure Learn(
136 wordCount : TWordCount;
137 isImportant : Boolean );
138
139 {!
140 \brief 絖??腟?????綽?????
141 \param wordCount Parse ?цВ??????????茯????鴻??
142 \param isImportant 羈??????鴻????腴???????????????????????? True
143 \warning 絖??羝??帥????腴???????????∈茯??堺?ャ?障??????<br>
144 Learn ????????????腴??? isImportant ???????c????????腴???
145 Forget ?????????若?帥???若?鴻???贋?????障????<br>
146 絖??羝??帥????????????????????????????????
147
148 ???????膺?腟??????????≪????????с???????障??????<br>
149 wordCount ??緇?????腴? (Parse ? text 綣??? ???膺?腟??????帥?????≪???障????<br><br>
150
151 筝祉??絵????腴?????羈?????腴????????帥????????? Forget -> Learn ?????т戎?????障????
152 }
153 procedure Forget(
154 wordCount : TWordCount;
155 isImportant : Boolean );
156 end;
157
158 //==================================================
159 implementation
160 //==================================================
161
162 uses
163 SysUtils, Math, Windows,
164 MojuUtils;
165
166 const
167 GIKO_BAYESIAN_FILE_VERSION = '1.0';
168 {
169 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
170 ModeWGraph, ModeWAlpha, ModeWNum,
171 ModeWHira, ModeWKata, ModeWKanji);
172 }
173 CharMode1 : array [ 0..255 ] of Byte =
174 (
175 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
176 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
177 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
178 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
179 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
180 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
181 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
182 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
183
184 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
185 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
186 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
187 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
188 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
189 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
190 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
191 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
192 );
193
194 //************************************************************
195 // misc
196 //************************************************************
197
198 //==============================
199 // RemoveToken
200 //==============================
201 function RemoveToken(var s: string;const delimiter: string): string;
202 var
203 p: Integer;
204 begin
205 p := AnsiPos(delimiter, s);
206 if p = 0 then
207 Result := s
208 else
209 Result := Copy(s, 1, p - 1);
210 s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
211 end;
212
213 //==============================
214 // AbsSort
215 //==============================
216 function AbsSort( p1, p2 : Pointer ) : Integer;
217 var
218 v1, v2 : Single;
219 begin
220
221 v1 := Abs( Single( p1 ) - 0.5 );
222 v2 := Abs( Single( p2 ) - 0.5 );
223 if v1 > v2 then
224 Result := -1
225 else if v1 = v2 then
226 Result := 0
227 else
228 Result := 1;
229
230 end;
231
232 //************************************************************
233 // TWordCount class
234 //************************************************************
235 constructor TWordCount.Create;
236 begin
237
238 Duplicates := dupIgnore;
239 CaseSensitive := True;
240 Sorted := True;
241
242 end;
243
244 destructor TWordCount.Destroy;
245 var
246 i : Integer;
247 begin
248
249 for i := Count - 1 downto 0 do
250 if Objects[ i ] <> nil then
251 Objects[ i ].Free;
252
253 inherited;
254
255 end;
256
257 //************************************************************
258 // TGikoBayesian class
259 //************************************************************
260
261 //==============================
262 // Create
263 //==============================
264 constructor TGikoBayesian.Create;
265 begin
266
267 Duplicates := dupIgnore;
268 CaseSensitive := True;
269 Sorted := True;
270
271 end;
272
273 //==============================
274 // Destroy
275 //==============================
276 destructor TGikoBayesian.Destroy;
277 var
278 i : Integer;
279 begin
280
281 for i := Count - 1 downto 0 do
282 if inherited Objects[ i ] <> nil then
283 inherited Objects[ i ].Free;
284
285 inherited;
286
287 end;
288
289 procedure TGikoBayesian.LoadFromFile( const filePath : string );
290 var
291 i : Integer;
292 sl : TStringList;
293 s : string;
294 name : string;
295 info : TWordInfo;
296 begin
297
298 FFilePath := filePath;
299
300 if not FileExists( filePath ) then
301 Exit;
302
303 sl := TStringList.Create;
304 try
305 sl.LoadFromFile( filePath );
306
307 for i := 1 to sl.Count - 1 do begin
308 s := sl[ i ];
309 name := RemoveToken( s, #1 );
310 info := TWordInfo.Create;
311 info.NormalWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
312 info.ImportantWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
313 info.NormalText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
314 info.ImportantText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
315
316 AddObject( name, info );
317 end;
318 finally
319 sl.Free;
320 end;
321
322 end;
323
324 procedure TGikoBayesian.SaveToFile( const filePath : string );
325 var
326 i : Integer;
327 sl : TStringList;
328 s : string;
329 info : TWordInfo;
330 begin
331
332 FFilePath := filePath;
333
334 sl := TStringList.Create;
335 try
336 sl.BeginUpdate;
337 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
338
339 for i := 0 to Count - 1 do begin
340 info := TWordInfo( inherited Objects[ i ] );
341 s := Strings[ i ] + #1
342 + Format('%x', [info.NormalWord]) + #1
343 + Format('%x', [info.ImportantWord]) + #1
344 + Format('%x', [info.NormalText]) + #1
345 + Format('%x', [info.ImportantText]);
346
347 sl.Add(s);
348 end;
349 sl.EndUpdate;
350 sl.SaveToFile( filePath );
351 finally
352 sl.Free;
353 end;
354
355 end;
356
357 procedure TGikoBayesian.Save;
358 begin
359
360 if FFilePath <> '' then
361 SaveToFile( FFilePath );
362
363 end;
364
365 //==============================
366 // GetObject
367 //==============================
368 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
369 var
370 idx : Integer;
371 begin
372
373 if Find( name, idx ) then
374 Result := TWordInfo( inherited Objects[ idx ] )
375 else
376 Result := nil;
377
378 end;
379
380 //==============================
381 // SetObject
382 //==============================
383 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
384 var
385 idx : Integer;
386 begin
387
388 if Find( name, idx ) then
389 inherited Objects[ idx ] := value
390 else
391 AddObject( name, value );
392
393 end;
394
395
396 //==============================
397 // CountWord
398 //==============================
399 procedure TGikoBayesian.CountWord(
400 const text : string;
401 wordCount : TWordCount );
402 type
403 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeNum, ModeHanKana,
404 ModeWGraph, ModeWAlpha, ModeWNum,
405 ModeWHira, ModeWKata, ModeWKanji);
406 var
407 p, tail, last : PChar;
408 mode, newMode : Modes;
409 ch : Longword;
410 chSize : Integer;
411 wHiraDelimiter : TStringList;
412 wHiraFinalDelimiter : TStringList;
413 wKanjiDelimiter : TStringList;
414 words : TStringList;
415 aWord : string;
416 countInfo : TWordCountInfo;
417
418 function cutBoth( _aWord : string; _delim : TStringList ) : string;
419 var
420 _i : Integer;
421 begin
422 for _i := 0 to _delim.Count - 1 do begin
423 _aWord := CustomStringReplace(
424 _aWord,
425 _delim[ _i ],
426 #10 + _delim[ _i ] + #10, False );
427 end;
428 Result := _aWord;
429 end;
430
431 function cutFirst( _aWord : string; _delim : TStringList ) : string;
432 var
433 _i : Integer;
434 begin
435 for _i := 0 to _delim.Count - 1 do begin
436 _aWord := CustomStringReplace(
437 _aWord,
438 _delim[ _i ],
439 #10 + _delim[ _i ], False );
440 end;
441 Result := _aWord;
442 end;
443
444 function cutFinal( _aWord : string; _delim : TStringList ) : string;
445 var
446 _i : Integer;
447 begin
448 for _i := 0 to _delim.Count - 1 do begin
449 _aWord := CustomStringReplace(
450 _aWord,
451 _delim[ _i ],
452 _delim[ _i ] + #10, False );
453 end;
454 Result := _aWord;
455 end;
456
457 procedure addWord( _dst : TWordCount; _words : TStringList );
458 var
459 _aWord : string;
460 _i, _idx : Integer;
461 _countInfo : TWordCountInfo;
462 begin
463 for _i := 0 to _words.Count - 1 do begin
464 _aWord := _words[ _i ];
465 if Length( _aWord ) > 0 then begin
466 if _dst.Find( _aWord, _idx ) then begin
467 _countInfo := TWordCountInfo( _dst.Objects[ _idx ] );
468 end else begin
469 _countInfo := TWordCountInfo.Create;
470 _dst.AddObject( _aWord, _countInfo );
471 end;
472 _countInfo.WordCount := _countInfo.WordCount + 1;
473 end;
474 end;
475 end;
476
477 function changeMode( _aWord : string; _mode : Modes ) : string;
478 var
479 _i : Integer;
480 _aWord2 : string;
481 _pWord, _pWord2 : PChar;
482 _pWordTail, _pFound : PChar;
483 const
484 _delim : string = #10;
485 begin
486 if Ord( _mode ) >= Ord( ModeWGraph ) then begin
487 // ?ユ???
488 // ?鴻???若?鴻??荅違????
489 _aWord := CustomStringReplace( _aWord, ' ', '', False );
490 _aWord := CustomStringReplace( _aWord, '??', '', False );
491
492 // ???????帥?у??茯?????
493 case mode of
494 ModeWHira:
495 begin
496 _aWord := cutFinal( _aWord, wHiraFinalDelimiter );
497 Result := cutBoth( _aWord, wHiraDelimiter );
498 end;
499
500 ModeWKanji:
501 begin
502 // ???????帥?у??茯?????
503 _aWord := cutBoth( _aWord, wKanjiDelimiter );
504 // 4 byte (2 絖?) ???ゃ?у??茯?????
505 _pWord := PChar( _aWord );
506 _i := Length( _aWord );
507 _pWordTail := _pWord + _i;
508 SetLength( _aWord2, _i + (_i shr 2) );
509 _pWord2 := PChar( _aWord2 );
510
511 while _pWord < _pWordTail do begin
512 _pFound := AnsiStrPos( _pWord, PChar( _delim ) );
513 if _pFound = nil then
514 _pFound := _pWordTail;
515 _pFound := _pFound - 3;
516
517 while _pWord <= _pFound do begin
518 CopyMemory( _pWord2, _pWord, 4 ); _pWord2[ 4 ] := #10;
519 _pWord2 := _pWord2 + 5; _pWord := _pWord + 4;
520 end;
521 _i := _pFound + 4 - _pWord; // 4 = 3 + #10
522 CopyMemory( _pWord2, _pWord, _i );
523 _pWord2 := _pWord2 + _i; _pWord := _pWord + _i;
524 end;
525 if _pWord < _pWordTail then begin
526 _i := _pWordTail - _pWord;
527 CopyMemory( _pWord2, _pWord, _i );
528 _pWord2 := _pWord2 + _i;
529 end;
530 SetLength( _aWord2, _pWord2 - PChar( _aWord2 ) );
531
532 Result := _aWord2;
533 end;
534
535 else
536 Result := _aWord;
537 end;
538 end else begin
539 Result := _aWord;
540 end;
541 end;
542 const
543 WHIRA_DELIMITER = '??' + #10 + '??#39; + #10 + '??' + #10 + '??#39; + #10 + '????'
544 + #10 + '??#39; + #10 + '????' + #10 + '?障??#39;+ #10 + '??#39;
545 + #10 + '????' + #10 + '????' + #10 + '????'
546 + #10 + '????' + #10 + '????' + #10 + '????' + #10 + '????'
547 + #10 + '????#39; + #10 + '????#39; + #10 + '????#39; + #10 + '????#39;
548 + #10 + '????' + #10 + '????' + #10 + '????' + #10 + '????'
549 + #10 + '??????#39; + #10 + '??????#39; + #10 + '??????#39; + #10 + '??????#39;
550 + #10 + '????' + #10 + '????#39; + #10 + '????' + #10 + '????'
551 + #10 + '????' + #10 + '??????'
552 + #10 + '?с??' + #10 + '?障??' + #10 + '?障????'
553 + #10 + '?с????' + #10 + '?障????'
554 + #10 + '????' + #10 + '??????' + #10 + '??????' + #10 + '????????'
555 ;
556 WKANJI_DELIMITER = '??' + #10 + '??#39; + #10 + '綣?39; + #10 + '??' + #10 + '羈?'
557 + #10 + '筝?' + #10 + '??#39; + #10 + '??' + #10 + '??'
558 ;
559 WHIRA_FINAL_DELIMITER = '?c??' + #10 + '?c??#39;
560 ;{
561 + #10 + '???c??#39; + #10 + '???????c??#39; + #10 + '??????#39;
562 + #10 + '??????' + #10 + '?с??????'
563 + #10 + '?障??'
564 + #10 + '??????' + #10 + '????' + #10 + '????#39; + #10 + '??????#39;
565 + #10 + '??????' + #10 + '???c?宴??'
566 + #10 + '?с??' + #10 + '????'
567 + #10 + '????' + #10 + '??????' + #10 + '????' + #10 + '??????'
568 ;}
569 // '??#39; ?? '??????????' ????
570 HA_LINE = '?????????????障????????????違?宴????#39;;
571 HI_LINE = '???????<???蚊?帥?????????潟?眼??';
572 HU_LINE = '??????ゃ???泣?????????吟?激??';
573 HE_LINE = '???????????吾?????????鴻?冴??';
574 HO_LINE = '???????????祉???????????若?純??';
575 KA_LINE = '?≪???泣?帥??????ゃ???????吟???????<?泣??#39;;
576 KI_LINE = '?ゃ???激???????????違???吾??????#39;;
577 KU_LINE = '?????鴻?????????????違?????ャ??#39;;
578 KE_LINE = '???宴?祉???????<???宴?蚊?????с??#39;;
579 KO_LINE = '???潟?純???????≪?????蚊?眼??????#39;;
580 kKanji = [$80..$A0, $E0..$ff];
581 begin
582
583 wHiraDelimiter := TStringList.Create;
584 wHiraFinalDelimiter := TStringList.Create;
585 wKanjiDelimiter := TStringList.Create;
586 words := TStringList.Create;
587 try
588 mode := ModeWhite;
589 wHiraDelimiter.Text := WHIRA_DELIMITER;
590 wHiraFinalDelimiter.Text := WHIRA_FINAL_DELIMITER;
591 wKanjiDelimiter.Text := WKANJI_DELIMITER;
592 p := PChar( text );
593 tail := p + Length( text );
594 last := p;
595
596 while p < tail do begin
597 // ??絖????帥?ゃ?????ゅ??/span>
598 // ?糸???鴻? ModeGraph ?????????у???ャ???綽??????????????
599 // if Byte(Byte( p^ ) - $a1) < $5e then begin
600 if Byte( p^ ) in kKanji then begin
601 if p + 1 < tail then begin
602 ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
603 case ch of
604 // ?鴻???若?鴻?у??茯????????????????
605 //$8140: newMode := ModeWhite;
606 $8141..$824e: newMode := ModeWGraph;
607 $824f..$8258: newMode := ModeWNum;
608 $8260..$829a: newMode := ModeWAlpha;
609 $829f..$82f1: newMode := ModeWHira;
610 $8340..$8396: newMode := ModeWKata;
611 else newMode := ModeWKanji;
612 end;
613 // '??????#39; ??抗篁??????障???????帥?????????障????
614 if (mode = ModeWHira) or (mode = ModeWKata) then
615 if (ch = $814a) or (ch = $814b) or (ch = $815b) then
616 newMode := mode;
617 end else begin
618 newMode := ModeWhite;
619 end;
620
621 chSize := 2;
622 end else begin
623 newMode := Modes( CharMode1[ Byte( p^ ) ] );
624 if (p^ = ' ') and (Ord( mode ) >= Ord( ModeWGraph )) then begin
625 // 篁??障?ф?ユ????т??鴻???若??/span>
626 // ??茯???膵???????с?鴻???若?鴻??荅違????
627 // ?糸??茹?????????絽吾?鴻???若?鴻?у?阪??????????????荅違??????
628 newMode := mode;
629 end;
630
631 chSize := 1;
632 end;
633
634 if mode <> newMode then begin
635
636 // ??絖????帥?ゃ????紊??眼??????
637 if mode <> ModeWhite then begin
638 SetLength( aWord, p - last );
639 CopyMemory( PChar( aWord ), last, p - last );
640
641 words.Text := changeMode( aWord, mode );
642
643 // ??茯??脂??/span>
644 addWord( wordCount, words );
645 end;
646
647 last := p;
648 mode := newMode;
649
650 end;
651
652 p := p + chSize;
653 end; // while
654
655 if mode <> ModeWhite then begin
656 aWord := Copy( last, 0, p - last );
657 words.Text := changeMode( aWord, mode );
658
659 // ??茯??脂??/span>
660 addWord( wordCount, words );
661 end;
662 finally
663 words.Free;
664 wKanjiDelimiter.Free;
665 wHiraFinalDelimiter.Free;
666 wHiraDelimiter.Free;
667 end;
668
669 end;
670
671 //==============================
672 // CalcPaulGraham
673 //==============================
674 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
675
676 function p( const aWord : string ) : Single;
677 var
678 info : TWordInfo;
679 begin
680 info := Objects[ aWord ];
681 if info = nil then
682 Result := 0.415
683 else if info.NormalWord = 0 then
684 Result := 0.99
685 else if info.ImportantWord = 0 then
686 Result := 0.01
687 else if info.ImportantWord + info.NormalWord * 2 < 5 then
688 Result := 0.5
689 else
690 Result := ( info.ImportantWord / info.ImportantText ) /
691 ((info.NormalWord * 2 / info.NormalText ) +
692 (info.ImportantWord / info.ImportantText));
693 end;
694
695 var
696 s, q : Extended;
697 i : Integer;
698 narray : TList;
699 const
700 SAMPLE_COUNT = 15;
701 begin
702
703 Result := 1;
704 if wordCount.Count = 0 then
705 Exit;
706
707 narray := TList.Create;
708 try
709 for i := 0 to wordCount.Count - 1 do begin
710 narray.Add( Pointer( p( wordCount[ i ] ) ) );
711 end;
712
713 narray.Sort( AbsSort );
714
715 s := 1;
716 q := 1;
717 i := min( SAMPLE_COUNT, narray.Count );
718 while i > 0 do begin
719 Dec( i );
720
721 s := s * Single( narray[ i ] );
722 q := q * (1 - Single( narray[ i ] ));
723 end;
724
725 Result := s / (s + q);
726 finally
727 narray.Free;
728 end;
729
730 end;
731
732 //==============================
733 // CalcGaryRobinson
734 //==============================
735 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
736
737 function p( const aWord : string ) : Single;
738 var
739 info : TWordInfo;
740 begin
741 info := Objects[ aWord ];
742 if info = nil then
743 Result := 0.415
744 else if info.ImportantWord = 0 then
745 Result := 0.01
746 else if info.NormalWord = 0 then
747 Result := 0.99
748 else
749 Result := ( info.ImportantWord / info.ImportantText ) /
750 ((info.NormalWord / info.NormalText ) +
751 (info.ImportantWord / info.ImportantText));
752 end;
753
754 function f( cnt : Integer; n, mean : Single ) : Extended;
755 const
756 k = 0.00001;
757 begin
758 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
759 end;
760
761 var
762 n : Extended;
763 narray : array of Single;
764 mean : Extended;
765 countInfo : TWordCountInfo;
766 i : Integer;
767 P1, Q1, R1 : Extended;
768 cnt : Extended;
769 begin
770
771 if wordCount.Count = 0 then begin
772 Result := 1;
773 Exit;
774 end;
775
776 SetLength( narray, wordCount.Count );
777 mean := 0;
778 for i := 0 to wordCount.Count - 1 do begin
779 n := p( wordCount[ i ] );
780 narray[ i ] := n;
781 mean := mean + n;
782 end;
783 mean := mean / wordCount.Count;
784
785 P1 := 1;
786 Q1 := 1;
787 for i := 0 to wordCount.Count - 1 do begin
788 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
789 n := f( countInfo.WordCount, narray[ i ], mean );
790 P1 := P1 * ( 1 - n );
791 Q1 := Q1 * n;
792 end;
793 cnt := wordCount.Count;
794 if cnt = 0 then
795 cnt := 1
796 else
797 P1 := 1 - Power( P1, 1 / cnt );
798 Q1 := 1 - Power( Q1, 1 / cnt );
799
800 if P1 + Q1 = 0 then begin
801 Result := 0.5
802 end else begin
803 n := (P1 - Q1) / (P1 + Q1);
804 Result := (1 + n) / 2;
805 end;
806
807 end;
808
809 //==============================
810 // CalcGaryRobinsonFisher
811 //==============================
812 function TGikoBayesian.CalcGaryRobinsonFisher(
813 wordCount : TWordCount
814 ) : Extended;
815
816 function p( const aWord : string ) : Single;
817 var
818 info : TWordInfo;
819 begin
820 info := Objects[ aWord ];
821 if info = nil then
822 Result := 0.415
823 else if info.ImportantWord = 0 then
824 Result := 0.01
825 else if info.NormalWord = 0 then
826 Result := 0.99
827 else
828 Result := info.ImportantWord /
829 (info.ImportantWord + info.NormalWord *
830 info.ImportantText / info.NormalText);
831 end;
832
833 function f( cnt : Integer; n, mean : Single ) : Extended;
834 const
835 k = 0.00001;
836 begin
837 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
838 end;
839
840 function prbx( x2, degree : Extended ) : Extended;
841 var
842 m : Extended;
843 sum : Extended;
844 term : Extended;
845 i : extended;
846 begin
847
848 m := x2 / 2;
849 sum := exp( -m );
850 term := -m;
851
852 i := 1;
853 while i < (degree / 2 - 1) do begin
854 term := term + ln( m / i );
855 sum := sum + exp( term );
856 i := i + 1;
857 end;
858
859 if sum < 1 then
860 Result := sum
861 else
862 Result := 1.0;
863
864 end;
865
866 var
867 n : Extended;
868 narray : array of Single;
869 mean : Extended;
870 countInfo : TWordCountInfo;
871 i : Integer;
872 normal : Extended;
873 important : Extended;
874 P1, Q1 : Extended;
875 cnt : Extended;
876 begin
877
878 if wordCount.Count = 0 then begin
879 Result := 1;
880 Exit;
881 end;
882
883 SetLength( narray, wordCount.Count );
884 mean := 0;
885 for i := 0 to wordCount.Count - 1 do begin
886 n := p( wordCount[ i ] );
887 narray[ i ] := n;
888 mean := mean + n;
889 end;
890 mean := mean / wordCount.Count;
891
892 cnt := 0;
893 (*
894 P1 := 1;
895 Q1 := 1;
896 (*)
897 P1 := 0;
898 Q1 := 0;
899 //*
900 for i := 0 to wordCount.Count - 1 do begin
901 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
902 n := f( countInfo.WordCount, narray[ i ], mean );
903 if countInfo <> nil then
904 cnt := cnt + countInfo.WordCount;
905 (*
906 P1 := P1 + Ln( 1 - n ) * countInfo.WordCount;
907 Q1 := Q1 + Ln( n ) * countInfo.WordCount;
908 (*)
909 P1 := P1 + Ln( 1 - n );
910 Q1 := Q1 + Ln( n );
911 //*)
912 end;
913 if cnt = 0 then
914 cnt := 1;
915 //(*
916 P1 := prbx( -2 * P1, 2 * cnt );
917 Q1 := prbx( -2 * Q1, 2 * cnt );
918 (*)
919 P1 := prbx( -2 * Ln( P1 ), 2 * cnt );
920 Q1 := prbx( -2 * Ln( Q1 ), 2 * cnt );
921 //*)
922 if P1 + Q1 = 0 then begin
923 Result := 0.5
924 end else begin
925 Result := (1 + Q1 + P1) / 2;
926 end;
927
928 end;
929
930 //==============================
931 // Parse
932 //==============================
933 function TGikoBayesian.Parse(
934 const text : string;
935 wordCount : TWordCount;
936 algorithm : TGikoBayesianAlgorithm
937 ) : Extended;
938 begin
939
940 CountWord( text, wordCount );
941 case algorithm of
942 gbaPaulGraham: Result := CalcPaulGraham( wordCount );
943 gbaGaryRobinson: Result := CalcGaryRobinson( wordCount );
944 gbaGaryRobinsonFisher:
945 Result := CalcGaryRobinsonFisher( wordCount );
946 else Result := 0;
947 end;
948
949 end;
950
951 //==============================
952 // Learn
953 //==============================
954 procedure TGikoBayesian.Learn(
955 wordCount : TWordCount;
956 isImportant : Boolean );
957 var
958 aWord : string;
959 wordinfo : TWordInfo;
960 countinfo : TWordCountInfo;
961 i : Integer;
962 begin
963
964 for i := 0 to wordCount.Count - 1 do begin
965 aWord := wordCount[ i ];
966 wordinfo := Objects[ aWord ];
967 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
968 if wordinfo = nil then begin
969 wordinfo := TWordInfo.Create;
970 Objects[ aWord ] := wordinfo;
971 end;
972
973 if isImportant then begin
974 wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
975 wordinfo.ImportantText := wordinfo.ImportantText + 1;
976 end else begin
977 wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
978 wordinfo.NormalText := wordinfo.NormalText + 1;
979 end;
980 end;
981
982 end;
983
984 //==============================
985 // Forget
986 //==============================
987 procedure TGikoBayesian.Forget(
988 wordCount : TWordCount;
989 isImportant : Boolean );
990 var
991 aWord : string;
992 wordinfo : TWordInfo;
993 countinfo : TWordCountInfo;
994 i : Integer;
995 begin
996
997 for i := 0 to wordCount.Count - 1 do begin
998 aWord := wordCount[ i ];
999 wordinfo := Objects[ aWord ];
1000 if wordinfo = nil then
1001 Continue;
1002
1003 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
1004 if isImportant then begin
1005 if wordInfo.ImportantText > 0 then begin
1006 wordinfo.ImportantText := wordinfo.ImportantText - 1;
1007 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
1008 end;
1009 end else begin
1010 if wordinfo.NormalText > 0 then begin
1011 wordinfo.NormalText := wordinfo.NormalText - 1;
1012 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;
1013 end;
1014 end;
1015 end;
1016
1017 end;
1018
1019 end.

Back to OSDN">Back to OSDN
ViewVC Help
Powered by ViewVC 1.1.26