Develop and Download Open Source Software

Browse CVS Repository

Contents of /gikonavigoeson/gikonavi/GikoBayesian.pas

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.12 - (show annotations) (download) (as text)
Mon Nov 1 04:45:25 2004 UTC (19 years, 5 months ago) by yoffy
Branch: MAIN
Changes since 1.11: +197 -70 lines
File MIME type: text/x-pascal
- 平仮名の単語分けワードを増やしてより細分化。
- 漢字にも単語分けワードを用意。
- 漢字は最長 2 字で切るようになった。

1 unit GikoBayesian;
2
3 {!
4 \file GikoBayesian.pas
5 \brief ???ゃ?吾?≪?潟???c????/span>
6
7 $Id: GikoBayesian.pas,v 1.11 2004/10/31 16:48:44 yoffy Exp $
8 }
9
10 interface
11
12 //==================================================
13 uses
14 //==================================================
15 Classes;
16
17 //==================================================
18 type
19 //==================================================
20
21 {!***********************************************************
22 \brief ??茯???????????/span>
23 ************************************************************}
24 TWordInfo = class( TObject )
25 private
26 FNormalWord : Integer; //!< ??絽吾????茯????????糸?眼????????/span>
27 FImportantWord : Integer; //!< 羈?????茯????????糸?眼????????/span>
28 FNormalText : Integer; //!< ??絽吾????茯??????????障??????????腴?????/span>
29 FImportantText : Integer; //!< 羈?????茯??????????障??????????腴?????/span>
30
31 public
32 property NormalWord : Integer read FNormalWord write FNormalWord;
33 property ImportantWord : Integer read FImportantWord write FImportantWord;
34 property NormalText : Integer read FNormalText write FNormalText;
35 property ImportantText : Integer read FImportantText write FImportantText;
36 end;
37
38 {!***********************************************************
39 \brief 茹f??羝??水??茯???????????/span>
40 ************************************************************}
41 TWordCountInfo = class( TObject )
42 private
43 FWordCount : Integer; //!< ??茯???/span>
44
45 public
46 property WordCount : Integer read FWordCount write FWordCount;
47 end;
48
49 {!***********************************************************
50 \brief 茹f??羝??水??茯????鴻??
51 ************************************************************}
52 // TWordCount = class( THashedStringList ) // 羶???
53 TWordCount = class( TStringList )
54 public
55 constructor Create;
56 destructor Destroy; override;
57 end;
58
59 {!***********************************************************
60 \brief ???c???帥?≪???眼???冴??
61 ************************************************************}
62 TGikoBayesianAlgorithm =
63 (gbaPaulGraham, gbaGaryRobinson, gbaGaryRobinsonFisher);
64
65 {!***********************************************************
66 \brief ???ゃ?吾?≪?潟???c????/span>
67 ************************************************************}
68 // TGikoBayesian = class( THashedStringList ) // 羶???
69 TGikoBayesian = class( TStringList )
70 private
71 FFilePath : string; //!< 茯??粋昭???????<?ゃ??????/span>
72 function GetObject( const name : string ) : TWordInfo;
73 procedure SetObject( const name : string; value : TWordInfo );
74
75 public
76 constructor Create;
77 destructor Destroy; override;
78
79 //! ???<?ゃ??????絖??絮ユ???茯??水?冴???障??
80 procedure LoadFromFile( const filePath : string );
81
82 //! ???<?ゃ?????膺?絮ユ???篆?絖????障??
83 procedure SaveToFile( const filePath : string );
84
85 //! ???<?ゃ?????膺?絮ユ???篆?絖????障??
86 procedure Save;
87
88 //! ??茯???????????宴????緇????障??
89 property Objects[ const name : string ] : TWordInfo
90 read GetObject write SetObject; default;
91
92 //! ??腴??????障??????茯????????潟?????障??
93 procedure CountWord(
94 const text : string;
95 wordCount : TWordCount );
96
97 {!
98 \brief Paul Graham 羈????冴?ャ??????腴???絵??墾??羆阪????障??
99 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
100 }
101 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
102
103 {!
104 \brief GaryRobinson 羈????冴?ャ??????腴???絵??墾??羆阪????障??
105 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
106 }
107 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
108
109 {!
110 \brief GaryRobinson-Fisher 羈????冴?ャ??????腴???絵??墾??羆阪????障??
111 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
112 }
113 function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
114
115 {!
116 \brief ??腴???茹f??
117 \param text 茹f????????腴?
118 \param wordCount 茹f??????????茯????鴻????菴???
119 \param algorithm 羈???墾??浦絎??????????≪???眼???冴??????絎????障??
120 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
121
122 CountWord ? Calcxxxxx ???障???????茵??????????с????
123 }
124 function Parse(
125 const text : string;
126 wordCount : TWordCount;
127 algorithm : TGikoBayesianAlgorithm = gbaGaryRobinsonFisher
128 ) : Extended;
129
130 {!
131 \brief 絖??????
132 \param wordCount Parse ?цВ??????????茯????鴻??
133 \param isImportant 羈??????鴻????腴???????????????? True
134 }
135 procedure Learn(
136 wordCount : TWordCount;
137 isImportant : Boolean );
138
139 {!
140 \brief 絖??腟?????綽?????
141 \param wordCount Parse ?цВ??????????茯????鴻??
142 \param isImportant 羈??????鴻????腴???????????????????????? True
143 \warning 絖??羝??帥????腴???????????∈茯??堺?ャ?障??????<br>
144 Learn ????????????腴??? isImportant ???????c????????腴???
145 Forget ?????????若?帥???若?鴻???贋?????障????<br>
146 絖??羝??帥????????????????????????????????
147
148 ???????膺?腟??????????≪????????с???????障??????<br>
149 wordCount ??緇?????腴? (Parse ? text 綣??? ???膺?腟??????帥?????≪???障????<br><br>
150
151 筝祉??絵????腴?????羈?????腴????????帥????????? Forget -> Learn ?????т戎?????障????
152 }
153 procedure Forget(
154 wordCount : TWordCount;
155 isImportant : Boolean );
156 end;
157
158 //==================================================
159 implementation
160 //==================================================
161
162 uses
163 SysUtils, Math, Windows,
164 MojuUtils;
165
166 const
167 GIKO_BAYESIAN_FILE_VERSION = '1.0';
168 {
169 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
170 ModeWGraph, ModeWAlpha, ModeWNum,
171 ModeWHira, ModeWKata, ModeWKanji);
172 }
173 CharMode1 : array [ 0..255 ] of Byte =
174 (
175 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
176 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
177 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
178 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
179 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
180 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
181 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
182 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
183
184 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
185 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
186 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
187 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
188 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
189 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
190 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
191 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
192 );
193
194 //************************************************************
195 // misc
196 //************************************************************
197
198 //==============================
199 // RemoveToken
200 //==============================
201 function RemoveToken(var s: string;const delimiter: string): string;
202 var
203 p: Integer;
204 begin
205 p := AnsiPos(delimiter, s);
206 if p = 0 then
207 Result := s
208 else
209 Result := Copy(s, 1, p - 1);
210 s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
211 end;
212
213 //==============================
214 // AbsSort
215 //==============================
216 function AbsSort( p1, p2 : Pointer ) : Integer;
217 var
218 v1, v2 : Single;
219 begin
220
221 v1 := Abs( Single( p1 ) - 0.5 );
222 v2 := Abs( Single( p2 ) - 0.5 );
223 if v1 > v2 then
224 Result := -1
225 else if v1 = v2 then
226 Result := 0
227 else
228 Result := 1;
229
230 end;
231
232 //************************************************************
233 // TWordCount class
234 //************************************************************
235 constructor TWordCount.Create;
236 begin
237
238 Duplicates := dupIgnore;
239 CaseSensitive := True;
240 Sorted := True;
241
242 end;
243
244 destructor TWordCount.Destroy;
245 var
246 i : Integer;
247 begin
248
249 for i := Count - 1 downto 0 do
250 if Objects[ i ] <> nil then
251 Objects[ i ].Free;
252
253 inherited;
254
255 end;
256
257 //************************************************************
258 // TGikoBayesian class
259 //************************************************************
260
261 //==============================
262 // Create
263 //==============================
264 constructor TGikoBayesian.Create;
265 begin
266
267 Duplicates := dupIgnore;
268 CaseSensitive := True;
269 Sorted := True;
270
271 end;
272
273 //==============================
274 // Destroy
275 //==============================
276 destructor TGikoBayesian.Destroy;
277 var
278 i : Integer;
279 begin
280
281 for i := Count - 1 downto 0 do
282 if inherited Objects[ i ] <> nil then
283 inherited Objects[ i ].Free;
284
285 inherited;
286
287 end;
288
289 procedure TGikoBayesian.LoadFromFile( const filePath : string );
290 var
291 i : Integer;
292 sl : TStringList;
293 s : string;
294 name : string;
295 info : TWordInfo;
296 begin
297
298 FFilePath := filePath;
299
300 if not FileExists( filePath ) then
301 Exit;
302
303 sl := TStringList.Create;
304 try
305 sl.LoadFromFile( filePath );
306
307 for i := 1 to sl.Count - 1 do begin
308 s := sl[ i ];
309 name := RemoveToken( s, #1 );
310 info := TWordInfo.Create;
311 info.NormalWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
312 info.ImportantWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
313 info.NormalText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
314 info.ImportantText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
315
316 AddObject( name, info );
317 end;
318 finally
319 sl.Free;
320 end;
321
322 end;
323
324 procedure TGikoBayesian.SaveToFile( const filePath : string );
325 var
326 i : Integer;
327 sl : TStringList;
328 s : string;
329 info : TWordInfo;
330 begin
331
332 FFilePath := filePath;
333
334 sl := TStringList.Create;
335 try
336 sl.BeginUpdate;
337 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
338
339 for i := 0 to Count - 1 do begin
340 info := TWordInfo( inherited Objects[ i ] );
341 s := Strings[ i ] + #1
342 + Format('%x', [info.NormalWord]) + #1
343 + Format('%x', [info.ImportantWord]) + #1
344 + Format('%x', [info.NormalText]) + #1
345 + Format('%x', [info.ImportantText]);
346
347 sl.Add(s);
348 end;
349 sl.EndUpdate;
350 sl.SaveToFile( filePath );
351 finally
352 sl.Free;
353 end;
354
355 end;
356
357 procedure TGikoBayesian.Save;
358 begin
359
360 if FFilePath <> '' then
361 SaveToFile( FFilePath );
362
363 end;
364
365 //==============================
366 // GetObject
367 //==============================
368 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
369 var
370 idx : Integer;
371 begin
372
373 if Find( name, idx ) then
374 Result := TWordInfo( inherited Objects[ idx ] )
375 else
376 Result := nil;
377
378 end;
379
380 //==============================
381 // SetObject
382 //==============================
383 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
384 var
385 idx : Integer;
386 begin
387
388 if Find( name, idx ) then
389 inherited Objects[ idx ] := value
390 else
391 AddObject( name, value );
392
393 end;
394
395
396 //==============================
397 // CountWord
398 //==============================
399 procedure TGikoBayesian.CountWord(
400 const text : string;
401 wordCount : TWordCount );
402 type
403 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeNum, ModeHanKana,
404 ModeWGraph, ModeWAlpha, ModeWNum,
405 ModeWHira, ModeWKata, ModeWKanji);
406 var
407 p, tail, last : PChar;
408 mode, newMode : Modes;
409 ch : Longword;
410 chSize : Integer;
411 wHiraDelimiter : TStringList;
412 wHiraFinalDelimiter : TStringList;
413 wKanjiDelimiter : TStringList;
414 words : TStringList;
415 aWord : string;
416 countInfo : TWordCountInfo;
417
418 function cutBoth( _aWord : string; _delim : TStringList ) : string;
419 var
420 _i : Integer;
421 begin
422 for _i := 0 to _delim.Count - 1 do begin
423 _aWord := CustomStringReplace(
424 _aWord,
425 _delim[ _i ],
426 #10 + _delim[ _i ] + #10, False );
427 end;
428 Result := _aWord;
429 end;
430
431 function cutFirst( _aWord : string; _delim : TStringList ) : string;
432 var
433 _i : Integer;
434 begin
435 for _i := 0 to _delim.Count - 1 do begin
436 _aWord := CustomStringReplace(
437 _aWord,
438 _delim[ _i ],
439 #10 + _delim[ _i ], False );
440 end;
441 Result := _aWord;
442 end;
443
444 function cutFinal( _aWord : string; _delim : TStringList ) : string;
445 var
446 _i : Integer;
447 begin
448 for _i := 0 to _delim.Count - 1 do begin
449 _aWord := CustomStringReplace(
450 _aWord,
451 _delim[ _i ],
452 _delim[ _i ] + #10, False );
453 end;
454 Result := _aWord;
455 end;
456
457 procedure addWord( _dst : TWordCount; _words : TStringList );
458 var
459 _aWord : string;
460 _i, _idx : Integer;
461 _countInfo : TWordCountInfo;
462 begin
463 for _i := 0 to _words.Count - 1 do begin
464 _aWord := _words[ _i ];
465 if Length( _aWord ) > 0 then begin
466 if _dst.Find( _aWord, _idx ) then begin
467 _countInfo := TWordCountInfo( _dst.Objects[ _idx ] );
468 end else begin
469 _countInfo := TWordCountInfo.Create;
470 _dst.AddObject( _aWord, _countInfo );
471 end;
472 _countInfo.WordCount := _countInfo.WordCount + 1;
473 end;
474 end;
475 end;
476
477 function changeMode( _aWord : string; _mode : Modes ) : string;
478 var
479 _i : Integer;
480 _aWord2 : string;
481 _pWord, _pWord2 : PChar;
482 _pWordTail, _pFound : PChar;
483 const
484 _delim : string = #10;
485 begin
486 if Ord( _mode ) >= Ord( ModeWGraph ) then begin
487 // ?ユ???
488 // ?鴻???若?鴻??荅違????
489 _aWord := CustomStringReplace( _aWord, ' ', '', False );
490 _aWord := CustomStringReplace( _aWord, '??', '', False );
491
492 // ???????帥?у??茯?????
493 case mode of
494 ModeWHira:
495 begin
496 _aWord := cutFinal( _aWord, wHiraFinalDelimiter );
497 Result := cutBoth( _aWord, wHiraDelimiter );
498 end;
499
500 ModeWKanji:
501 begin
502 // ???????帥?у??茯?????
503 _aWord := cutBoth( _aWord, wKanjiDelimiter );
504 // 4 byte (2 絖?) ???ゃ?у??茯?????
505 _pWord := PChar( _aWord );
506 _i := Length( _aWord );
507 _pWordTail := _pWord + _i;
508 SetLength( _aWord2, _i + (_i shr 2) );
509 _pWord2 := PChar( _aWord2 );
510
511 while _pWord < _pWordTail do begin
512 _pFound := AnsiStrPos( _pWord, PChar( _delim ) );
513 if _pFound = nil then
514 _pFound := _pWordTail;
515 _pFound := _pFound - 3;
516
517 while _pWord <= _pFound do begin
518 CopyMemory( _pWord2, _pWord, 4 ); _pWord2[ 4 ] := #10;
519 _pWord2 := _pWord2 + 5; _pWord := _pWord + 4;
520 end;
521 _i := _pFound + 4 - _pWord; // 4 = 3 + #10
522 CopyMemory( _pWord2, _pWord, _i );
523 _pWord2 := _pWord2 + _i; _pWord := _pWord + _i;
524 end;
525 if _pWord < _pWordTail then begin
526 _i := _pWordTail - _pWord;
527 CopyMemory( _pWord2, _pWord, _i );
528 _pWord2 := _pWord2 + _i;
529 end;
530 SetLength( _aWord2, _pWord2 - PChar( _aWord2 ) );
531
532 Result := _aWord2;
533 end;
534
535 else
536 Result := _aWord;
537 end;
538 end else begin
539 Result := _aWord;
540 end;
541 end;
542 const
543 WHIRA_DELIMITER = '??' + #10 + '??#39; + #10 + '??' + #10 + '??#39; + #10 + '????'
544 + #10 + '??#39; + #10 + '????' + #10 + '?障??#39;+ #10 + '??#39;
545 + #10 + '????' + #10 + '????' + #10 + '????'
546 + #10 + '????' + #10 + '????' + #10 + '????' + #10 + '????'
547 + #10 + '????#39; + #10 + '????#39; + #10 + '????#39; + #10 + '????#39;
548 + #10 + '????' + #10 + '????' + #10 + '????' + #10 + '????'
549 + #10 + '??????#39; + #10 + '??????#39; + #10 + '??????#39; + #10 + '??????#39;
550 + #10 + '????' + #10 + '????#39; + #10 + '????' + #10 + '????'
551 + #10 + '????' + #10 + '??????'
552 + #10 + '?с??' + #10 + '?障??' + #10 + '?障????'
553 + #10 + '?с????' + #10 + '?障????'
554 + #10 + '????' + #10 + '??????' + #10 + '??????' + #10 + '????????'
555 ;
556 WKANJI_DELIMITER = '??' + #10 + '??#39; + #10 + '綣?39; + #10 + '??' + #10 + '羈?'
557 + #10 + '筝?' + #10 + '??#39; + #10 + '??' + #10 + '??'
558 ;
559 WHIRA_FINAL_DELIMITER = '?c??' + #10 + '?c??#39;
560 ;{
561 + #10 + '???c??#39; + #10 + '???????c??#39; + #10 + '??????#39;
562 + #10 + '??????' + #10 + '?с??????'
563 + #10 + '?障??'
564 + #10 + '??????' + #10 + '????' + #10 + '????#39; + #10 + '??????#39;
565 + #10 + '??????' + #10 + '???c?宴??'
566 + #10 + '?с??' + #10 + '????'
567 + #10 + '????' + #10 + '??????' + #10 + '????' + #10 + '??????'
568 ;}
569 // '??#39; ?? '??????????' ????
570 HA_LINE = '?????????????障????????????違?宴????#39;;
571 HI_LINE = '???????<???蚊?帥?????????潟?眼??';
572 HU_LINE = '??????ゃ???泣?????????吟?激??';
573 HE_LINE = '???????????吾?????????鴻?冴??';
574 HO_LINE = '???????????祉???????????若?純??';
575 KA_LINE = '?≪???泣?帥??????ゃ???????吟???????<?泣??#39;;
576 KI_LINE = '?ゃ???激???????????違???吾??????#39;;
577 KU_LINE = '?????鴻?????????????違?????ャ??#39;;
578 KE_LINE = '???宴?祉???????<???宴?蚊?????с??#39;;
579 KO_LINE = '???潟?純???????≪?????蚊?眼??????#39;;
580 kKanji = [$80..$A0, $E0..$ff];
581 begin
582
583 wHiraDelimiter := TStringList.Create;
584 wHiraFinalDelimiter := TStringList.Create;
585 wKanjiDelimiter := TStringList.Create;
586 words := TStringList.Create;
587 try
588 mode := ModeWhite;
589 wHiraDelimiter.Text := WHIRA_DELIMITER;
590 wHiraFinalDelimiter.Text := WHIRA_FINAL_DELIMITER;
591 wKanjiDelimiter.Text := WKANJI_DELIMITER;
592 p := PChar( text );
593 tail := p + Length( text );
594 last := p;
595
596 while p < tail do begin
597 // ??絖????帥?ゃ?????ゅ??/span>
598 // ?糸???鴻? ModeGraph ?????????у???ャ???綽??????????????
599 // if Byte(Byte( p^ ) - $a1) < $5e then begin
600 if Byte( p^ ) in kKanji then begin
601 if p + 1 < tail then begin
602 ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
603 case ch of
604 // ?鴻???若?鴻?у??茯????????????????
605 //$8140: newMode := ModeWhite;
606 $8141..$824e: newMode := ModeWGraph;
607 $824f..$8258: newMode := ModeWNum;
608 $8260..$829a: newMode := ModeWAlpha;
609 $829f..$82f1: newMode := ModeWHira;
610 $8340..$8396: newMode := ModeWKata;
611 else newMode := ModeWKanji;
612 end;
613 // '??????#39; ??抗篁??????障???????帥?????????障????
614 if (mode = ModeWHira) or (mode = ModeWKata) then
615 if (ch = $814a) or (ch = $814b) or (ch = $815b) then
616 newMode := mode;
617 end else begin
618 newMode := ModeWhite;
619 end;
620
621 chSize := 2;
622 end else begin
623 newMode := Modes( CharMode1[ Byte( p^ ) ] );
624 if (p^ = ' ') and (Ord( mode ) >= Ord( ModeWGraph )) then begin
625 // 篁??障?ф?ユ????т??鴻???若??/span>
626 // ??茯???膵???????с?鴻???若?鴻??荅違????
627 // ?糸??茹?????????絽吾?鴻???若?鴻?у?阪??????????????荅違??????
628 newMode := mode;
629 end;
630
631 chSize := 1;
632 end;
633
634 if mode <> newMode then begin
635
636 // ??絖????帥?ゃ????紊??眼??????
637 SetLength( aWord, p - last );
638 CopyMemory( PChar( aWord ), last, p - last );
639
640 words.Text := changeMode( aWord, mode );
641
642 // ??茯??脂??/span>
643 addWord( wordCount, words );
644
645 last := p;
646 mode := newMode;
647
648 end;
649
650 p := p + chSize;
651 end; // while
652
653 if mode <> ModeWhite then begin
654 aWord := Copy( last, 0, p - last );
655 words.Text := changeMode( aWord, mode );
656
657 // ??茯??脂??/span>
658 addWord( wordCount, words );
659 end;
660 finally
661 words.Free;
662 wKanjiDelimiter.Free;
663 wHiraFinalDelimiter.Free;
664 wHiraDelimiter.Free;
665 end;
666
667 end;
668
669 //==============================
670 // CalcPaulGraham
671 //==============================
672 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
673
674 function p( const aWord : string ) : Single;
675 var
676 info : TWordInfo;
677 begin
678 info := Objects[ aWord ];
679 if info = nil then
680 Result := 0.415
681 else if info.NormalWord = 0 then
682 Result := 0.99
683 else if info.ImportantWord = 0 then
684 Result := 0.01
685 else if info.ImportantWord + info.NormalWord * 2 < 5 then
686 Result := 0.5
687 else
688 Result := ( info.ImportantWord / info.ImportantText ) /
689 ((info.NormalWord * 2 / info.NormalText ) +
690 (info.ImportantWord / info.ImportantText));
691 end;
692
693 var
694 s, q : Extended;
695 i : Integer;
696 narray : TList;
697 const
698 SAMPLE_COUNT = 15;
699 begin
700
701 Result := 1;
702 if wordCount.Count = 0 then
703 Exit;
704
705 narray := TList.Create;
706 try
707 for i := 0 to wordCount.Count - 1 do begin
708 narray.Add( Pointer( p( wordCount[ i ] ) ) );
709 end;
710
711 narray.Sort( AbsSort );
712
713 s := 1;
714 q := 1;
715 i := min( SAMPLE_COUNT, narray.Count );
716 while i > 0 do begin
717 Dec( i );
718
719 s := s * Single( narray[ i ] );
720 q := q * (1 - Single( narray[ i ] ));
721 end;
722
723 Result := s / (s + q);
724 finally
725 narray.Free;
726 end;
727
728 end;
729
730 //==============================
731 // CalcGaryRobinson
732 //==============================
733 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
734
735 function p( const aWord : string ) : Single;
736 var
737 info : TWordInfo;
738 begin
739 info := Objects[ aWord ];
740 if info = nil then
741 Result := 0.415
742 else if info.ImportantWord = 0 then
743 Result := 0.01
744 else if info.NormalWord = 0 then
745 Result := 0.99
746 else
747 Result := ( info.ImportantWord / info.ImportantText ) /
748 ((info.NormalWord / info.NormalText ) +
749 (info.ImportantWord / info.ImportantText));
750 end;
751
752 function f( cnt : Integer; n, mean : Single ) : Extended;
753 const
754 k = 0.00001;
755 begin
756 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
757 end;
758
759 var
760 n : Extended;
761 narray : array of Single;
762 mean : Extended;
763 countInfo : TWordCountInfo;
764 i : Integer;
765 P1, Q1, R1 : Extended;
766 cnt : Extended;
767 begin
768
769 if wordCount.Count = 0 then begin
770 Result := 1;
771 Exit;
772 end;
773
774 SetLength( narray, wordCount.Count );
775 mean := 0;
776 for i := 0 to wordCount.Count - 1 do begin
777 n := p( wordCount[ i ] );
778 narray[ i ] := n;
779 mean := mean + n;
780 end;
781 mean := mean / wordCount.Count;
782
783 P1 := 1;
784 Q1 := 1;
785 for i := 0 to wordCount.Count - 1 do begin
786 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
787 n := f( countInfo.WordCount, narray[ i ], mean );
788 P1 := P1 * ( 1 - n );
789 Q1 := Q1 * n;
790 end;
791 cnt := wordCount.Count;
792 if cnt = 0 then
793 cnt := 1
794 else
795 P1 := 1 - Power( P1, 1 / cnt );
796 Q1 := 1 - Power( Q1, 1 / cnt );
797
798 if P1 + Q1 = 0 then begin
799 Result := 0.5
800 end else begin
801 n := (P1 - Q1) / (P1 + Q1);
802 Result := (1 + n) / 2;
803 end;
804
805 end;
806
807 //==============================
808 // CalcGaryRobinsonFisher
809 //==============================
810 function TGikoBayesian.CalcGaryRobinsonFisher(
811 wordCount : TWordCount
812 ) : Extended;
813
814 function p( const aWord : string ) : Single;
815 var
816 info : TWordInfo;
817 begin
818 info := Objects[ aWord ];
819 if info = nil then
820 Result := 0.415
821 else if info.ImportantWord = 0 then
822 Result := 0.01
823 else if info.NormalWord = 0 then
824 Result := 0.99
825 else
826 Result := info.ImportantWord /
827 (info.ImportantWord + info.NormalWord *
828 info.ImportantText / info.NormalText);
829 end;
830
831 function f( cnt : Integer; n, mean : Single ) : Extended;
832 const
833 k = 0.00001;
834 begin
835 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
836 end;
837
838 function prbx( x2, degree : Extended ) : Extended;
839 var
840 m : Extended;
841 sum : Extended;
842 term : Extended;
843 i : extended;
844 begin
845
846 m := x2 / 2;
847 sum := exp( -m );
848 term := -m;
849
850 i := 1;
851 while i < (degree / 2 - 1) do begin
852 term := term + ln( m / i );
853 sum := sum + exp( term );
854 i := i + 1;
855 end;
856
857 if sum < 1 then
858 Result := sum
859 else
860 Result := 1.0;
861
862 end;
863
864 var
865 n : Extended;
866 narray : array of Single;
867 mean : Extended;
868 countInfo : TWordCountInfo;
869 i : Integer;
870 normal : Extended;
871 important : Extended;
872 P1, Q1 : Extended;
873 cnt : Extended;
874 begin
875
876 if wordCount.Count = 0 then begin
877 Result := 1;
878 Exit;
879 end;
880
881 SetLength( narray, wordCount.Count );
882 mean := 0;
883 for i := 0 to wordCount.Count - 1 do begin
884 n := p( wordCount[ i ] );
885 narray[ i ] := n;
886 mean := mean + n;
887 end;
888 mean := mean / wordCount.Count;
889
890 cnt := 0;
891 (*
892 P1 := 1;
893 Q1 := 1;
894 (*)
895 P1 := 0;
896 Q1 := 0;
897 //*
898 for i := 0 to wordCount.Count - 1 do begin
899 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
900 n := f( countInfo.WordCount, narray[ i ], mean );
901 if countInfo <> nil then
902 cnt := cnt + countInfo.WordCount;
903 (*
904 P1 := P1 + Ln( 1 - n ) * countInfo.WordCount;
905 Q1 := Q1 + Ln( n ) * countInfo.WordCount;
906 (*)
907 P1 := P1 + Ln( 1 - n );
908 Q1 := Q1 + Ln( n );
909 //*)
910 end;
911 if cnt = 0 then
912 cnt := 1;
913 //(*
914 P1 := prbx( -2 * P1, 2 * cnt );
915 Q1 := prbx( -2 * Q1, 2 * cnt );
916 (*)
917 P1 := prbx( -2 * Ln( P1 ), 2 * cnt );
918 Q1 := prbx( -2 * Ln( Q1 ), 2 * cnt );
919 //*)
920 if P1 + Q1 = 0 then begin
921 Result := 0.5
922 end else begin
923 Result := (1 + Q1 + P1) / 2;
924 end;
925
926 end;
927
928 //==============================
929 // Parse
930 //==============================
931 function TGikoBayesian.Parse(
932 const text : string;
933 wordCount : TWordCount;
934 algorithm : TGikoBayesianAlgorithm
935 ) : Extended;
936 begin
937
938 CountWord( text, wordCount );
939 case algorithm of
940 gbaPaulGraham: Result := CalcPaulGraham( wordCount );
941 gbaGaryRobinson: Result := CalcGaryRobinson( wordCount );
942 gbaGaryRobinsonFisher:
943 Result := CalcGaryRobinsonFisher( wordCount );
944 else Result := 0;
945 end;
946
947 end;
948
949 //==============================
950 // Learn
951 //==============================
952 procedure TGikoBayesian.Learn(
953 wordCount : TWordCount;
954 isImportant : Boolean );
955 var
956 aWord : string;
957 wordinfo : TWordInfo;
958 countinfo : TWordCountInfo;
959 i : Integer;
960 begin
961
962 for i := 0 to wordCount.Count - 1 do begin
963 aWord := wordCount[ i ];
964 wordinfo := Objects[ aWord ];
965 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
966 if wordinfo = nil then begin
967 wordinfo := TWordInfo.Create;
968 Objects[ aWord ] := wordinfo;
969 end;
970
971 if isImportant then begin
972 wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
973 wordinfo.ImportantText := wordinfo.ImportantText + 1;
974 end else begin
975 wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
976 wordinfo.NormalText := wordinfo.NormalText + 1;
977 end;
978 end;
979
980 end;
981
982 //==============================
983 // Forget
984 //==============================
985 procedure TGikoBayesian.Forget(
986 wordCount : TWordCount;
987 isImportant : Boolean );
988 var
989 aWord : string;
990 wordinfo : TWordInfo;
991 countinfo : TWordCountInfo;
992 i : Integer;
993 begin
994
995 for i := 0 to wordCount.Count - 1 do begin
996 aWord := wordCount[ i ];
997 wordinfo := Objects[ aWord ];
998 if wordinfo = nil then
999 Continue;
1000
1001 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
1002 if isImportant then begin
1003 if wordInfo.ImportantText > 0 then begin
1004 wordinfo.ImportantText := wordinfo.ImportantText - 1;
1005 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
1006 end;
1007 end else begin
1008 if wordinfo.NormalText > 0 then begin
1009 wordinfo.NormalText := wordinfo.NormalText - 1;
1010 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;
1011 end;
1012 end;
1013 end;
1014
1015 end;
1016
1017 end.

Back to OSDN">Back to OSDN
ViewVC Help
Powered by ViewVC 1.1.26