Develop and Download Open Source Software

Browse CVS Repository

Contents of /gikonavigoeson/gikonavi/GikoBayesian.pas

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.17.4.1 - (show annotations) (download) (as text)
Sun Jul 10 04:16:46 2005 UTC (18 years, 9 months ago) by h677
Branch: Bb50
CVS Tags: v1_50_2_606, v1_50_0_598, v1_50_0_603, v1_50_0_604, v1_50_0_606, b50, v1_50_0_600, v1_50_2_604, v1_50_0_602, v1_50_0_601, v1_50_0_605, v1_50_0_599
Changes since 1.17: +29 -10 lines
File MIME type: text/x-pascal
例外を吐きそうな計算部に例外処理を適当に追加

1 unit GikoBayesian;
2
3 {!
4 \file GikoBayesian.pas
5 \brief ???ゃ?吾?≪?潟???c????/span>
6
7 $Id: GikoBayesian.pas,v 1.17 2004/11/05 14:24:26 h677 Exp $
8 }
9
10 //! 綛割皿????莨??吾??????????
11 {$DEFINE GIKO_BAYESIAN_NO_HIRAGANA_DIC}
12
13 interface
14
15 //==================================================
16 uses
17 //==================================================
18 Classes;
19
20 //==================================================
21 type
22 //==================================================
23
24 {!***********************************************************
25 \brief ??茯???????????/span>
26 ************************************************************}
27 TWordInfo = class( TObject )
28 private
29 FNormalWord : Integer; //!< ??絽吾????茯????????糸?眼????????/span>
30 FImportantWord : Integer; //!< 羈?????茯????????糸?眼????????/span>
31 FNormalText : Integer; //!< ??絽吾????茯??????????障??????????腴?????/span>
32 FImportantText : Integer; //!< 羈?????茯??????????障??????????腴?????/span>
33
34 public
35 property NormalWord : Integer read FNormalWord write FNormalWord;
36 property ImportantWord : Integer read FImportantWord write FImportantWord;
37 property NormalText : Integer read FNormalText write FNormalText;
38 property ImportantText : Integer read FImportantText write FImportantText;
39 end;
40
41 {!***********************************************************
42 \brief 茹f??羝??水??茯???????????/span>
43 ************************************************************}
44 TWordCountInfo = class( TObject )
45 private
46 FWordCount : Integer; //!< ??茯???/span>
47
48 public
49 property WordCount : Integer read FWordCount write FWordCount;
50 end;
51
52 {!***********************************************************
53 \brief 茹f??羝??水??茯????鴻??
54 ************************************************************}
55 // TWordCount = class( THashedStringList ) // 羶???
56 TWordCount = class( TStringList )
57 public
58 constructor Create;
59 destructor Destroy; override;
60 end;
61
62 {!***********************************************************
63 \brief ???c???帥?≪???眼???冴??
64 ************************************************************}
65 TGikoBayesianAlgorithm =
66 (gbaPaulGraham, gbaGaryRobinson, gbaGaryRobinsonFisher);
67
68 {!***********************************************************
69 \brief ???ゃ?吾?≪?潟???c????/span>
70 ************************************************************}
71 // TGikoBayesian = class( THashedStringList ) // 羶???
72 TGikoBayesian = class( TStringList )
73 private
74 FFilePath : string; //!< 茯??粋昭???????<?ゃ??????/span>
75 function GetObject( const name : string ) : TWordInfo;
76 procedure SetObject( const name : string; value : TWordInfo );
77
78 public
79 constructor Create;
80 destructor Destroy; override;
81
82 //! ???<?ゃ??????絖??絮ユ???茯??水?冴???障??
83 procedure LoadFromFile( const filePath : string );
84
85 //! ???<?ゃ?????膺?絮ユ???篆?絖????障??
86 procedure SaveToFile( const filePath : string );
87
88 //! ???<?ゃ?????膺?絮ユ???篆?絖????障??
89 procedure Save;
90
91 //! ??茯???????????宴????緇????障??
92 property Objects[ const name : string ] : TWordInfo
93 read GetObject write SetObject; default;
94
95 //! ??腴??????障??????茯????????潟?????障??
96 procedure CountWord(
97 const text : string;
98 wordCount : TWordCount );
99
100 {!
101 \brief Paul Graham 羈????冴?ャ??????腴???絵??墾??羆阪????障??
102 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
103 }
104 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
105
106 {!
107 \brief GaryRobinson 羈????冴?ャ??????腴???絵??墾??羆阪????障??
108 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
109 }
110 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
111
112 {!
113 \brief GaryRobinson-Fisher 羈????冴?ャ??????腴???絵??墾??羆阪????障??
114 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
115 }
116 function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
117
118 {!
119 \brief ??腴???茹f??
120 \param text 茹f????????腴?
121 \param wordCount 茹f??????????茯????鴻????菴???
122 \param algorithm 羈???墾??浦絎??????????≪???眼???冴??????絎????障??
123 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
124
125 CountWord ? Calcxxxxx ???障???????茵??????????с????
126 }
127 function Parse(
128 const text : string;
129 wordCount : TWordCount;
130 algorithm : TGikoBayesianAlgorithm = gbaGaryRobinsonFisher
131 ) : Extended;
132
133 {!
134 \brief 絖??????
135 \param wordCount Parse ?цВ??????????茯????鴻??
136 \param isImportant 羈??????鴻????腴???????????????? True
137 }
138 procedure Learn(
139 wordCount : TWordCount;
140 isImportant : Boolean );
141
142 {!
143 \brief 絖??腟?????綽?????
144 \param wordCount Parse ?цВ??????????茯????鴻??
145 \param isImportant 羈??????鴻????腴???????????????????????? True
146 \warning 絖??羝??帥????腴???????????∈茯??堺?ャ?障??????<br>
147 Learn ????????????腴??? isImportant ???????c????????腴???
148 Forget ?????????若?帥???若?鴻???贋?????障????<br>
149 絖??羝??帥????????????????????????????????
150
151 ???????膺?腟??????????≪????????с???????障??????<br>
152 wordCount ??緇?????腴? (Parse ? text 綣??? ???膺?腟??????帥?????≪???障????<br><br>
153
154 筝祉??絵????腴?????羈?????腴????????帥????????? Forget -> Learn ?????т戎?????障????
155 }
156 procedure Forget(
157 wordCount : TWordCount;
158 isImportant : Boolean );
159 end;
160
161 //==================================================
162 implementation
163 //==================================================
164
165 uses
166 SysUtils, Math, Windows,
167 MojuUtils;
168
169 const
170 GIKO_BAYESIAN_FILE_VERSION = '1.0';
171 {
172 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
173 ModeWGraph, ModeWAlpha, ModeWNum,
174 ModeWHira, ModeWKata, ModeWKanji);
175 }
176 CharMode1 : array [ 0..255 ] of Byte =
177 (
178 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
179 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
181 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
182 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
183 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
184 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
185 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
186
187 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
188 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
189 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
190 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
191 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
192 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
194 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
195 );
196
197 //************************************************************
198 // misc
199 //************************************************************
200
201 //==============================
202 // RemoveToken
203 //==============================
204 function RemoveToken(var s: string;const delimiter: string): string;
205 var
206 p: Integer;
207 begin
208 p := AnsiPos(delimiter, s);
209 if p = 0 then
210 Result := s
211 else
212 Result := Copy(s, 1, p - 1);
213 s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
214 end;
215
216 //==============================
217 // AbsSort
218 //==============================
219 function AbsSort( p1, p2 : Pointer ) : Integer;
220 var
221 v1, v2 : Single;
222 begin
223
224 v1 := Abs( Single( p1 ) - 0.5 );
225 v2 := Abs( Single( p2 ) - 0.5 );
226 if v1 > v2 then
227 Result := -1
228 else if v1 = v2 then
229 Result := 0
230 else
231 Result := 1;
232
233 end;
234
235 //************************************************************
236 // TWordCount class
237 //************************************************************
238 constructor TWordCount.Create;
239 begin
240
241 Duplicates := dupIgnore;
242 CaseSensitive := True;
243 Sorted := True;
244
245 end;
246
247 destructor TWordCount.Destroy;
248 var
249 i : Integer;
250 begin
251
252 for i := Count - 1 downto 0 do
253 if Objects[ i ] <> nil then
254 Objects[ i ].Free;
255
256 inherited;
257
258 end;
259
260 //************************************************************
261 // TGikoBayesian class
262 //************************************************************
263
264 //==============================
265 // Create
266 //==============================
267 constructor TGikoBayesian.Create;
268 begin
269
270 Duplicates := dupIgnore;
271 CaseSensitive := True;
272 Sorted := True;
273
274 end;
275
276 //==============================
277 // Destroy
278 //==============================
279 destructor TGikoBayesian.Destroy;
280 var
281 i : Integer;
282 begin
283
284 for i := Count - 1 downto 0 do
285 if inherited Objects[ i ] <> nil then
286 inherited Objects[ i ].Free;
287
288 inherited;
289
290 end;
291
292 procedure TGikoBayesian.LoadFromFile( const filePath : string );
293 var
294 i : Integer;
295 sl : TStringList;
296 s : string;
297 name : string;
298 info : TWordInfo;
299 begin
300
301 FFilePath := filePath;
302
303 if not FileExists( filePath ) then
304 Exit;
305
306 sl := TStringList.Create;
307 try
308 sl.LoadFromFile( filePath );
309
310 for i := 1 to sl.Count - 1 do begin
311 s := sl[ i ];
312 name := RemoveToken( s, #1 );
313 info := TWordInfo.Create;
314 info.NormalWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
315 info.ImportantWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
316 info.NormalText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
317 info.ImportantText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
318
319 AddObject( name, info );
320 end;
321 finally
322 sl.Free;
323 end;
324
325 end;
326
327 procedure TGikoBayesian.SaveToFile( const filePath : string );
328 var
329 i : Integer;
330 sl : TStringList;
331 s : string;
332 info : TWordInfo;
333 begin
334
335 FFilePath := filePath;
336
337 sl := TStringList.Create;
338 try
339 sl.BeginUpdate;
340 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
341
342 for i := 0 to Count - 1 do begin
343 info := TWordInfo( inherited Objects[ i ] );
344 s := Strings[ i ] + #1
345 + Format('%x', [info.NormalWord]) + #1
346 + Format('%x', [info.ImportantWord]) + #1
347 + Format('%x', [info.NormalText]) + #1
348 + Format('%x', [info.ImportantText]);
349
350 sl.Add(s);
351 end;
352 sl.EndUpdate;
353 sl.SaveToFile( filePath );
354 finally
355 sl.Free;
356 end;
357
358 end;
359
360 procedure TGikoBayesian.Save;
361 begin
362
363 if FFilePath <> '' then
364 SaveToFile( FFilePath );
365
366 end;
367
368 //==============================
369 // GetObject
370 //==============================
371 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
372 var
373 idx : Integer;
374 begin
375
376 if Find( name, idx ) then
377 Result := TWordInfo( inherited Objects[ idx ] )
378 else
379 Result := nil;
380
381 end;
382
383 //==============================
384 // SetObject
385 //==============================
386 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
387 var
388 idx : Integer;
389 begin
390
391 if Find( name, idx ) then
392 inherited Objects[ idx ] := value
393 else
394 AddObject( name, value );
395
396 end;
397
398
399 //==============================
400 // CountWord
401 //==============================
402 procedure TGikoBayesian.CountWord(
403 const text : string;
404 wordCount : TWordCount );
405 type
406 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeNum, ModeHanKana,
407 ModeWGraph, ModeWAlpha, ModeWNum,
408 ModeWHira, ModeWKata, ModeWKanji);
409 var
410 p, tail, last : PChar;
411 mode, newMode : Modes;
412 ch : Longword;
413 chSize : Integer;
414 wHiraDelimiter : TStringList;
415 wHiraFinalDelimiter : TStringList;
416 wKanjiDelimiter : TStringList;
417 words : TStringList;
418 aWord : string;
419 countInfo : TWordCountInfo;
420
421 function cutBoth( _aWord : string; _delim : TStringList ) : string;
422 var
423 _i : Integer;
424 begin
425 for _i := 0 to _delim.Count - 1 do begin
426 _aWord := CustomStringReplace(
427 _aWord,
428 _delim[ _i ],
429 #10 + _delim[ _i ] + #10, False );
430 end;
431 Result := _aWord;
432 end;
433
434 function cutFirst( _aWord : string; _delim : TStringList ) : string;
435 var
436 _i : Integer;
437 begin
438 for _i := 0 to _delim.Count - 1 do begin
439 _aWord := CustomStringReplace(
440 _aWord,
441 _delim[ _i ],
442 #10 + _delim[ _i ], False );
443 end;
444 Result := _aWord;
445 end;
446
447 function cutFinal( _aWord : string; _delim : TStringList ) : string;
448 var
449 _i : Integer;
450 begin
451 for _i := 0 to _delim.Count - 1 do begin
452 _aWord := CustomStringReplace(
453 _aWord,
454 _delim[ _i ],
455 _delim[ _i ] + #10, False );
456 end;
457 Result := _aWord;
458 end;
459
460 procedure addWord( _dst : TWordCount; _words : TStringList );
461 var
462 _aWord : string;
463 _i, _idx : Integer;
464 _countInfo : TWordCountInfo;
465 begin
466 for _i := 0 to _words.Count - 1 do begin
467 _aWord := _words[ _i ];
468 if Length( _aWord ) > 0 then begin
469 if _dst.Find( _aWord, _idx ) then begin
470 _countInfo := TWordCountInfo( _dst.Objects[ _idx ] );
471 end else begin
472 _countInfo := TWordCountInfo.Create;
473 _dst.AddObject( _aWord, _countInfo );
474 end;
475 _countInfo.WordCount := _countInfo.WordCount + 1;
476 end;
477 end;
478 end;
479
480 function changeMode( _aWord : string; _mode : Modes ) : string;
481 var
482 _i : Integer;
483 _aWord2 : string;
484 _pWord, _pWord2 : PChar;
485 _pWordTail, _pFound : PChar;
486 const
487 _delim : string = #10;
488 begin
489 {$IFDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
490 if mode = ModeWHira then begin
491 Result := '';
492 Exit;
493 end;
494 {$ENDIF}
495 if Ord( _mode ) >= Ord( ModeWGraph ) then begin
496 // ?ユ???
497 // ?鴻???若?鴻??荅違????
498 _aWord := CustomStringReplace( _aWord, ' ', '', False );
499 _aWord := CustomStringReplace( _aWord, '??', '', False );
500
501 // ???????帥?у??茯?????
502 case mode of
503 ModeWHira:
504 begin
505 _aWord := cutFinal( _aWord, wHiraFinalDelimiter );
506 Result := cutBoth( _aWord, wHiraDelimiter );
507 end;
508
509 ModeWKanji:
510 begin
511 // ???????帥?у??茯?????
512 _aWord := cutBoth( _aWord, wKanjiDelimiter );
513 // 4 byte (2 絖?) ???ゃ?у??茯?????
514 _pWord := PChar( _aWord );
515 _i := Length( _aWord );
516 _pWordTail := _pWord + _i;
517 SetLength( _aWord2, _i + (_i shr 2) );
518 _pWord2 := PChar( _aWord2 );
519
520 while _pWord < _pWordTail do begin
521 _pFound := AnsiStrPos( _pWord, PChar( _delim ) );
522 if _pFound = nil then
523 _pFound := _pWordTail;
524 _pFound := _pFound - 3;
525
526 while _pWord <= _pFound do begin
527 CopyMemory( _pWord2, _pWord, 4 ); _pWord2[ 4 ] := #10;
528 _pWord2 := _pWord2 + 5; _pWord := _pWord + 4;
529 end;
530 _i := _pFound + 4 - _pWord; // 4 = 3 + #10
531 CopyMemory( _pWord2, _pWord, _i );
532 _pWord2 := _pWord2 + _i; _pWord := _pWord + _i;
533 end;
534 if _pWord < _pWordTail then begin
535 _i := _pWordTail - _pWord;
536 CopyMemory( _pWord2, _pWord, _i );
537 _pWord2 := _pWord2 + _i;
538 end;
539 SetLength( _aWord2, _pWord2 - PChar( _aWord2 ) );
540
541 Result := _aWord2;
542 end;
543
544 else
545 Result := _aWord;
546 end;
547 end else begin
548 Result := _aWord;
549 end;
550 end;
551 const
552 WHIRA_DELIMITER = '??' + #10 + '??#39; + #10 + '??' + #10 + '??#39; + #10 + '????'
553 + #10 + '??#39; + #10 + '????' + #10 + '?障??#39;+ #10 + '??#39;
554 + #10 + '????' + #10 + '????' + #10 + '????'
555 + #10 + '????' + #10 + '????' + #10 + '????' + #10 + '????'
556 + #10 + '????#39; + #10 + '????#39; + #10 + '????#39; + #10 + '????#39;
557 + #10 + '????' + #10 + '????' + #10 + '????' + #10 + '????'
558 + #10 + '??????#39; + #10 + '??????#39; + #10 + '??????#39; + #10 + '??????#39;
559 + #10 + '????' + #10 + '????#39; + #10 + '????' + #10 + '????'
560 + #10 + '????' + #10 + '??????'
561 + #10 + '?с??' + #10 + '?障??' + #10 + '?障????'
562 + #10 + '?с????' + #10 + '?障????'
563 + #10 + '????' + #10 + '??????' + #10 + '??????' + #10 + '????????'
564 ;
565 WKANJI_DELIMITER = '??' + #10 + '??#39; + #10 + '綣?39; + #10 + '??' + #10 + '羈?'
566 + #10 + '筝?' + #10 + '??#39; + #10 + '??' + #10 + '??'
567 ;
568 WHIRA_FINAL_DELIMITER = '?c??' + #10 + '?c??#39;
569 ;{
570 + #10 + '???c??#39; + #10 + '???????c??#39; + #10 + '??????#39;
571 + #10 + '??????' + #10 + '?с??????'
572 + #10 + '?障??'
573 + #10 + '??????' + #10 + '????' + #10 + '????#39; + #10 + '??????#39;
574 + #10 + '??????' + #10 + '???c?宴??'
575 + #10 + '?с??' + #10 + '????'
576 + #10 + '????' + #10 + '??????' + #10 + '????' + #10 + '??????'
577 ;}
578 // '??#39; ?? '??????????' ????
579 HA_LINE = '?????????????障????????????違?宴????#39;;
580 HI_LINE = '???????<???蚊?帥?????????潟?眼??';
581 HU_LINE = '??????ゃ???泣?????????吟?激??';
582 HE_LINE = '???????????吾?????????鴻?冴??';
583 HO_LINE = '???????????祉???????????若?純??';
584 KA_LINE = '?≪???泣?帥??????ゃ???????吟???????<?泣??#39;;
585 KI_LINE = '?ゃ???激???????????違???吾??????#39;;
586 KU_LINE = '?????鴻?????????????違?????ャ??#39;;
587 KE_LINE = '???宴?祉???????<???宴?蚊?????с??#39;;
588 KO_LINE = '???潟?純???????≪?????蚊?眼??????#39;;
589 kKanji = [$80..$A0, $E0..$ff];
590 begin
591
592 wHiraDelimiter := TStringList.Create;
593 wHiraFinalDelimiter := TStringList.Create;
594 wKanjiDelimiter := TStringList.Create;
595 words := TStringList.Create;
596 try
597 mode := ModeWhite;
598 {$IFNDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
599 wHiraDelimiter.Text := WHIRA_DELIMITER;
600 wHiraFinalDelimiter.Text := WHIRA_FINAL_DELIMITER;
601 {$ENDIF}
602 wKanjiDelimiter.Text := WKANJI_DELIMITER;
603 p := PChar( text );
604 tail := p + Length( text );
605 last := p;
606
607 while p < tail do begin
608 // ??絖????帥?ゃ?????ゅ??/span>
609 // ?糸???鴻? ModeGraph ?????????у???ャ???綽??????????????
610 // if Byte(Byte( p^ ) - $a1) < $5e then begin
611 if Byte( p^ ) in kKanji then begin
612 if p + 1 < tail then begin
613 ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
614 case ch of
615 // ?鴻???若?鴻?у??茯????????????????
616 //$8140: newMode := ModeWhite;
617 $8141..$824e: newMode := ModeWGraph;
618 $824f..$8258: newMode := ModeWNum;
619 $8260..$829a: newMode := ModeWAlpha;
620 $829f..$82f1: newMode := ModeWHira;
621 $8340..$8396: newMode := ModeWKata;
622 else newMode := ModeWKanji;
623 end;
624 // '??????#39; ??抗篁??????障???????帥?????????障????
625 if (mode = ModeWHira) or (mode = ModeWKata) then
626 if (ch = $814a) or (ch = $814b) or (ch = $815b) then
627 newMode := mode;
628 end else begin
629 newMode := ModeWhite;
630 end;
631
632 chSize := 2;
633 end else begin
634 newMode := Modes( CharMode1[ Byte( p^ ) ] );
635 if (p^ = ' ') and (Ord( mode ) >= Ord( ModeWGraph )) then begin
636 // 篁??障?ф?ユ????т??鴻???若??/span>
637 // ??茯???膵???????с?鴻???若?鴻??荅違????
638 // ?糸??茹?????????絽吾?鴻???若?鴻?у?阪??????????????荅違??????
639 newMode := mode;
640 end;
641
642 chSize := 1;
643 end;
644
645 if mode <> newMode then begin
646
647 // ??絖????帥?ゃ????紊??眼??????
648 if mode <> ModeWhite then begin
649 SetLength( aWord, p - last );
650 CopyMemory( PChar( aWord ), last, p - last );
651
652 words.Text := changeMode( aWord, mode );
653
654 // ??茯??脂??/span>
655 addWord( wordCount, words );
656 end;
657
658 last := p;
659 mode := newMode;
660
661 end;
662
663 p := p + chSize;
664 end; // while
665
666 if mode <> ModeWhite then begin
667 SetLength( aWord, p - last );
668 CopyMemory( PChar( aWord ), last, p - last );
669
670 words.Text := changeMode( aWord, mode );
671
672 // ??茯??脂??/span>
673 addWord( wordCount, words );
674 end;
675 finally
676 words.Free;
677 wKanjiDelimiter.Free;
678 wHiraFinalDelimiter.Free;
679 wHiraDelimiter.Free;
680 end;
681
682 end;
683
684 //==============================
685 // CalcPaulGraham
686 //==============================
687 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
688
689 function p( const aWord : string ) : Single;
690 var
691 info : TWordInfo;
692 begin
693 info := Objects[ aWord ];
694 if info = nil then
695 Result := 0.415
696 else if info.NormalWord = 0 then
697 Result := 0.99
698 else if info.ImportantWord = 0 then
699 Result := 0.01
700 else if info.ImportantWord + info.NormalWord * 2 < 5 then
701 Result := 0.5
702 else begin
703 try
704 Result := ( info.ImportantWord / info.ImportantText ) /
705 ((info.NormalWord * 2 / info.NormalText ) +
706 (info.ImportantWord / info.ImportantText));
707 except
708 on EZeroDivide do Result := 0.99;
709 end;
710 end;
711 end;
712
713 var
714 s, q : Extended;
715 i : Integer;
716 narray : TList;
717 const
718 SAMPLE_COUNT = 15;
719 begin
720
721 Result := 1;
722 if wordCount.Count = 0 then
723 Exit;
724
725 narray := TList.Create;
726 try
727 for i := 0 to wordCount.Count - 1 do begin
728 narray.Add( Pointer( p( wordCount[ i ] ) ) );
729 end;
730
731 narray.Sort( AbsSort );
732
733 s := 1;
734 q := 1;
735 i := min( SAMPLE_COUNT, narray.Count );
736 while i > 0 do begin
737 Dec( i );
738
739 s := s * Single( narray[ i ] );
740 q := q * (1 - Single( narray[ i ] ));
741 end;
742 try
743 Result := s / (s + q);
744 except
745 Result := 0.5;
746 end;
747 finally
748 narray.Free;
749 end;
750
751 end;
752
753 //==============================
754 // CalcGaryRobinson
755 //==============================
756 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
757
758 function p( const aWord : string ) : Single;
759 var
760 info : TWordInfo;
761 begin
762 info := Objects[ aWord ];
763 if info = nil then
764 Result := 0.415
765 else if info.ImportantWord = 0 then
766 Result := 0.01
767 else if info.NormalWord = 0 then
768 Result := 0.99
769 else
770 {
771 Result := ( info.ImportantWord / info.ImportantText ) /
772 ((info.NormalWord / info.NormalText ) +
773 (info.ImportantWord / info.ImportantText));
774 }
775 try
776 Result := (info.ImportantWord * info.NormalText) /
777 (info.NormalWord * info.ImportantText +
778 info.ImportantWord * info.NormalText);
779 except
780 Result := 0.5;
781 end;
782 end;
783
784 function f( cnt : Integer; n, mean : Single ) : Extended;
785 const
786 k = 0.001;
787 begin
788 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
789 end;
790
791 var
792 n : Extended;
793 narray : array of Single;
794 mean : Extended;
795 countInfo : TWordCountInfo;
796 i : Integer;
797 P1, Q1, R1 : Extended;
798 cnt : Extended;
799 begin
800
801 if wordCount.Count = 0 then begin
802 Result := 1;
803 Exit;
804 end;
805
806 SetLength( narray, wordCount.Count );
807 mean := 0;
808 for i := 0 to wordCount.Count - 1 do begin
809 n := p( wordCount[ i ] );
810 narray[ i ] := n;
811 mean := mean + n;
812 end;
813 mean := mean / wordCount.Count;
814
815 P1 := 1;
816 Q1 := 1;
817 for i := 0 to wordCount.Count - 1 do begin
818 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
819 n := f( countInfo.WordCount, narray[ i ], mean );
820 P1 := P1 * ( 1 - n );
821 Q1 := Q1 * n;
822 end;
823 cnt := wordCount.Count;
824 if cnt = 0 then
825 cnt := 1;
826 try
827 P1 := 1 - Power( P1, 1 / cnt );
828 except
829 end;
830 try
831 Q1 := 1 - Power( Q1, 1 / cnt );
832 except
833 end;
834
835 if P1 + Q1 = 0 then begin
836 Result := 0.5
837 end else begin
838 n := (P1 - Q1) / (P1 + Q1);
839 Result := (1 + n) / 2;
840 end;
841
842 end;
843
844 //==============================
845 // CalcGaryRobinsonFisher
846 //==============================
847 function TGikoBayesian.CalcGaryRobinsonFisher(
848 wordCount : TWordCount
849 ) : Extended;
850
851 function p( const aWord : string ) : Single;
852 var
853 info : TWordInfo;
854 begin
855 info := Objects[ aWord ];
856 if info = nil then
857 Result := 0.415
858 else if info.ImportantWord = 0 then
859 Result := 0.01
860 else if info.NormalWord = 0 then
861 Result := 0.99
862 else
863 {
864 Result := ( info.ImportantWord / info.ImportantText ) /
865 ((info.NormalWord / info.NormalText ) +
866 (info.ImportantWord / info.ImportantText));
867 }
868 Result := (info.ImportantWord * info.NormalText) /
869 (info.NormalWord * info.ImportantText +
870 info.ImportantWord * info.NormalText);
871 end;
872
873 function f( cnt : Integer; n, mean : Single ) : Extended;
874 const
875 k = 0.001;
876 begin
877 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
878 end;
879
880 function prbx( x2, degree : Extended ) : Extended;
881 begin
882
883 Result := 0.5;
884
885 end;
886
887 var
888 n : Extended;
889 narray : array of Single;
890 mean : Extended;
891 countInfo : TWordCountInfo;
892 i : Integer;
893 normal : Extended;
894 important : Extended;
895 P1, Q1 : Extended;
896 cnt : Extended;
897 begin
898
899 if wordCount.Count = 0 then begin
900 Result := 1;
901 Exit;
902 end;
903
904 SetLength( narray, wordCount.Count );
905 mean := 0;
906 for i := 0 to wordCount.Count - 1 do begin
907 n := p( wordCount[ i ] );
908 narray[ i ] := n;
909 mean := mean + n;
910 end;
911 mean := mean / wordCount.Count;
912
913 P1 := 1;
914 Q1 := 1;
915 for i := 0 to wordCount.Count - 1 do begin
916 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
917 n := f( countInfo.WordCount, narray[ i ], mean );
918 P1 := P1 * ( 1 - n );
919 Q1 := Q1 * n;
920 end;
921 cnt := wordCount.Count;
922 if cnt = 0 then
923 cnt := 1;
924 try
925 P1 := Power( P1, 1 / cnt );
926 except
927 end;
928 try
929 Q1 := Power( Q1, 1 / cnt );
930 except
931 end;
932
933 P1 := 1 - prbx( -2 * Ln( P1 ), 2 * cnt );
934 Q1 := 1 - prbx( -2 * Ln( Q1 ), 2 * cnt );
935
936 Result := (1 + P1 - Q1) / 2;
937
938 end;
939
940 //==============================
941 // Parse
942 //==============================
943 function TGikoBayesian.Parse(
944 const text : string;
945 wordCount : TWordCount;
946 algorithm : TGikoBayesianAlgorithm
947 ) : Extended;
948 begin
949
950 CountWord( text, wordCount );
951 case algorithm of
952 gbaPaulGraham: Result := CalcPaulGraham( wordCount );
953 gbaGaryRobinson: Result := CalcGaryRobinson( wordCount );
954 gbaGaryRobinsonFisher:
955 Result := CalcGaryRobinsonFisher( wordCount );
956 else Result := 0;
957 end;
958
959 end;
960
961 //==============================
962 // Learn
963 //==============================
964 procedure TGikoBayesian.Learn(
965 wordCount : TWordCount;
966 isImportant : Boolean );
967 var
968 aWord : string;
969 wordinfo : TWordInfo;
970 countinfo : TWordCountInfo;
971 i : Integer;
972 begin
973
974 for i := 0 to wordCount.Count - 1 do begin
975 aWord := wordCount[ i ];
976 wordinfo := Objects[ aWord ];
977 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
978 if wordinfo = nil then begin
979 wordinfo := TWordInfo.Create;
980 Objects[ aWord ] := wordinfo;
981 end;
982
983 if isImportant then begin
984 wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
985 wordinfo.ImportantText := wordinfo.ImportantText + 1;
986 end else begin
987 wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
988 wordinfo.NormalText := wordinfo.NormalText + 1;
989 end;
990 end;
991
992 end;
993
994 //==============================
995 // Forget
996 //==============================
997 procedure TGikoBayesian.Forget(
998 wordCount : TWordCount;
999 isImportant : Boolean );
1000 var
1001 aWord : string;
1002 wordinfo : TWordInfo;
1003 countinfo : TWordCountInfo;
1004 i : Integer;
1005 begin
1006
1007 for i := 0 to wordCount.Count - 1 do begin
1008 aWord := wordCount[ i ];
1009 wordinfo := Objects[ aWord ];
1010 if wordinfo = nil then
1011 Continue;
1012
1013 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
1014 if isImportant then begin
1015 if wordInfo.ImportantText > 0 then begin
1016 wordinfo.ImportantText := wordinfo.ImportantText - 1;
1017 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
1018 end;
1019 end else begin
1020 if wordinfo.NormalText > 0 then begin
1021 wordinfo.NormalText := wordinfo.NormalText - 1;
1022 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;
1023 end;
1024 end;
1025 end;
1026
1027 end;
1028
1029 end.

Back to OSDN">Back to OSDN
ViewVC Help
Powered by ViewVC 1.1.26