Develop and Download Open Source Software

Browse CVS Repository

Contents of /gikonavigoeson/gikonavi/GikoBayesian.pas

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.15 - (show annotations) (download) (as text)
Mon Nov 1 10:28:24 2004 UTC (19 years, 5 months ago) by yoffy
Branch: MAIN
Changes since 1.14: +29 -54 lines
File MIME type: text/x-pascal
Gary Robinson-Fisher をバグバグのまま放置しておくのは危険なので
とりあえず必ず 50% になるように変更。

1 unit GikoBayesian;
2
3 {!
4 \file GikoBayesian.pas
5 \brief ???ゃ?吾?≪?潟???c????/span>
6
7 $Id: GikoBayesian.pas,v 1.14 2004/11/01 09:51:57 yoffy Exp $
8 }
9
10 //! 綛割皿????莨??吾??????????
11 {$DEFINE GIKO_BAYESIAN_NO_HIRAGANA_DIC}
12
13 interface
14
15 //==================================================
16 uses
17 //==================================================
18 Classes;
19
20 //==================================================
21 type
22 //==================================================
23
24 {!***********************************************************
25 \brief ??茯???????????/span>
26 ************************************************************}
27 TWordInfo = class( TObject )
28 private
29 FNormalWord : Integer; //!< ??絽吾????茯????????糸?眼????????/span>
30 FImportantWord : Integer; //!< 羈?????茯????????糸?眼????????/span>
31 FNormalText : Integer; //!< ??絽吾????茯??????????障??????????腴?????/span>
32 FImportantText : Integer; //!< 羈?????茯??????????障??????????腴?????/span>
33
34 public
35 property NormalWord : Integer read FNormalWord write FNormalWord;
36 property ImportantWord : Integer read FImportantWord write FImportantWord;
37 property NormalText : Integer read FNormalText write FNormalText;
38 property ImportantText : Integer read FImportantText write FImportantText;
39 end;
40
41 {!***********************************************************
42 \brief 茹f??羝??水??茯???????????/span>
43 ************************************************************}
44 TWordCountInfo = class( TObject )
45 private
46 FWordCount : Integer; //!< ??茯???/span>
47
48 public
49 property WordCount : Integer read FWordCount write FWordCount;
50 end;
51
52 {!***********************************************************
53 \brief 茹f??羝??水??茯????鴻??
54 ************************************************************}
55 // TWordCount = class( THashedStringList ) // 羶???
56 TWordCount = class( TStringList )
57 public
58 constructor Create;
59 destructor Destroy; override;
60 end;
61
62 {!***********************************************************
63 \brief ???c???帥?≪???眼???冴??
64 ************************************************************}
65 TGikoBayesianAlgorithm =
66 (gbaPaulGraham, gbaGaryRobinson, gbaGaryRobinsonFisher);
67
68 {!***********************************************************
69 \brief ???ゃ?吾?≪?潟???c????/span>
70 ************************************************************}
71 // TGikoBayesian = class( THashedStringList ) // 羶???
72 TGikoBayesian = class( TStringList )
73 private
74 FFilePath : string; //!< 茯??粋昭???????<?ゃ??????/span>
75 function GetObject( const name : string ) : TWordInfo;
76 procedure SetObject( const name : string; value : TWordInfo );
77
78 public
79 constructor Create;
80 destructor Destroy; override;
81
82 //! ???<?ゃ??????絖??絮ユ???茯??水?冴???障??
83 procedure LoadFromFile( const filePath : string );
84
85 //! ???<?ゃ?????膺?絮ユ???篆?絖????障??
86 procedure SaveToFile( const filePath : string );
87
88 //! ???<?ゃ?????膺?絮ユ???篆?絖????障??
89 procedure Save;
90
91 //! ??茯???????????宴????緇????障??
92 property Objects[ const name : string ] : TWordInfo
93 read GetObject write SetObject; default;
94
95 //! ??腴??????障??????茯????????潟?????障??
96 procedure CountWord(
97 const text : string;
98 wordCount : TWordCount );
99
100 {!
101 \brief Paul Graham 羈????冴?ャ??????腴???絵??墾??羆阪????障??
102 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
103 }
104 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
105
106 {!
107 \brief GaryRobinson 羈????冴?ャ??????腴???絵??墾??羆阪????障??
108 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
109 }
110 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
111
112 {!
113 \brief GaryRobinson-Fisher 羈????冴?ャ??????腴???絵??墾??羆阪????障??
114 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
115 }
116 function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
117
118 {!
119 \brief ??腴???茹f??
120 \param text 茹f????????腴?
121 \param wordCount 茹f??????????茯????鴻????菴???
122 \param algorithm 羈???墾??浦絎??????????≪???眼???冴??????絎????障??
123 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
124
125 CountWord ? Calcxxxxx ???障???????茵??????????с????
126 }
127 function Parse(
128 const text : string;
129 wordCount : TWordCount;
130 algorithm : TGikoBayesianAlgorithm = gbaGaryRobinsonFisher
131 ) : Extended;
132
133 {!
134 \brief 絖??????
135 \param wordCount Parse ?цВ??????????茯????鴻??
136 \param isImportant 羈??????鴻????腴???????????????? True
137 }
138 procedure Learn(
139 wordCount : TWordCount;
140 isImportant : Boolean );
141
142 {!
143 \brief 絖??腟?????綽?????
144 \param wordCount Parse ?цВ??????????茯????鴻??
145 \param isImportant 羈??????鴻????腴???????????????????????? True
146 \warning 絖??羝??帥????腴???????????∈茯??堺?ャ?障??????<br>
147 Learn ????????????腴??? isImportant ???????c????????腴???
148 Forget ?????????若?帥???若?鴻???贋?????障????<br>
149 絖??羝??帥????????????????????????????????
150
151 ???????膺?腟??????????≪????????с???????障??????<br>
152 wordCount ??緇?????腴? (Parse ? text 綣??? ???膺?腟??????帥?????≪???障????<br><br>
153
154 筝祉??絵????腴?????羈?????腴????????帥????????? Forget -> Learn ?????т戎?????障????
155 }
156 procedure Forget(
157 wordCount : TWordCount;
158 isImportant : Boolean );
159 end;
160
161 //==================================================
162 implementation
163 //==================================================
164
165 uses
166 SysUtils, Math, Windows,
167 MojuUtils;
168
169 const
170 GIKO_BAYESIAN_FILE_VERSION = '1.0';
171 {
172 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
173 ModeWGraph, ModeWAlpha, ModeWNum,
174 ModeWHira, ModeWKata, ModeWKanji);
175 }
176 CharMode1 : array [ 0..255 ] of Byte =
177 (
178 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
179 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
181 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
182 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
183 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
184 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
185 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
186
187 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
188 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
189 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
190 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
191 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
192 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
194 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
195 );
196
197 //************************************************************
198 // misc
199 //************************************************************
200
201 //==============================
202 // RemoveToken
203 //==============================
204 function RemoveToken(var s: string;const delimiter: string): string;
205 var
206 p: Integer;
207 begin
208 p := AnsiPos(delimiter, s);
209 if p = 0 then
210 Result := s
211 else
212 Result := Copy(s, 1, p - 1);
213 s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
214 end;
215
216 //==============================
217 // AbsSort
218 //==============================
219 function AbsSort( p1, p2 : Pointer ) : Integer;
220 var
221 v1, v2 : Single;
222 begin
223
224 v1 := Abs( Single( p1 ) - 0.5 );
225 v2 := Abs( Single( p2 ) - 0.5 );
226 if v1 > v2 then
227 Result := -1
228 else if v1 = v2 then
229 Result := 0
230 else
231 Result := 1;
232
233 end;
234
235 //************************************************************
236 // TWordCount class
237 //************************************************************
238 constructor TWordCount.Create;
239 begin
240
241 Duplicates := dupIgnore;
242 CaseSensitive := True;
243 Sorted := True;
244
245 end;
246
247 destructor TWordCount.Destroy;
248 var
249 i : Integer;
250 begin
251
252 for i := Count - 1 downto 0 do
253 if Objects[ i ] <> nil then
254 Objects[ i ].Free;
255
256 inherited;
257
258 end;
259
260 //************************************************************
261 // TGikoBayesian class
262 //************************************************************
263
264 //==============================
265 // Create
266 //==============================
267 constructor TGikoBayesian.Create;
268 begin
269
270 Duplicates := dupIgnore;
271 CaseSensitive := True;
272 Sorted := True;
273
274 end;
275
276 //==============================
277 // Destroy
278 //==============================
279 destructor TGikoBayesian.Destroy;
280 var
281 i : Integer;
282 begin
283
284 for i := Count - 1 downto 0 do
285 if inherited Objects[ i ] <> nil then
286 inherited Objects[ i ].Free;
287
288 inherited;
289
290 end;
291
292 procedure TGikoBayesian.LoadFromFile( const filePath : string );
293 var
294 i : Integer;
295 sl : TStringList;
296 s : string;
297 name : string;
298 info : TWordInfo;
299 begin
300
301 FFilePath := filePath;
302
303 if not FileExists( filePath ) then
304 Exit;
305
306 sl := TStringList.Create;
307 try
308 sl.LoadFromFile( filePath );
309
310 for i := 1 to sl.Count - 1 do begin
311 s := sl[ i ];
312 name := RemoveToken( s, #1 );
313 info := TWordInfo.Create;
314 info.NormalWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
315 info.ImportantWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
316 info.NormalText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
317 info.ImportantText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
318
319 AddObject( name, info );
320 end;
321 finally
322 sl.Free;
323 end;
324
325 end;
326
327 procedure TGikoBayesian.SaveToFile( const filePath : string );
328 var
329 i : Integer;
330 sl : TStringList;
331 s : string;
332 info : TWordInfo;
333 begin
334
335 FFilePath := filePath;
336
337 sl := TStringList.Create;
338 try
339 sl.BeginUpdate;
340 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
341
342 for i := 0 to Count - 1 do begin
343 info := TWordInfo( inherited Objects[ i ] );
344 s := Strings[ i ] + #1
345 + Format('%x', [info.NormalWord]) + #1
346 + Format('%x', [info.ImportantWord]) + #1
347 + Format('%x', [info.NormalText]) + #1
348 + Format('%x', [info.ImportantText]);
349
350 sl.Add(s);
351 end;
352 sl.EndUpdate;
353 sl.SaveToFile( filePath );
354 finally
355 sl.Free;
356 end;
357
358 end;
359
360 procedure TGikoBayesian.Save;
361 begin
362
363 if FFilePath <> '' then
364 SaveToFile( FFilePath );
365
366 end;
367
368 //==============================
369 // GetObject
370 //==============================
371 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
372 var
373 idx : Integer;
374 begin
375
376 if Find( name, idx ) then
377 Result := TWordInfo( inherited Objects[ idx ] )
378 else
379 Result := nil;
380
381 end;
382
383 //==============================
384 // SetObject
385 //==============================
386 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
387 var
388 idx : Integer;
389 begin
390
391 if Find( name, idx ) then
392 inherited Objects[ idx ] := value
393 else
394 AddObject( name, value );
395
396 end;
397
398
399 //==============================
400 // CountWord
401 //==============================
402 procedure TGikoBayesian.CountWord(
403 const text : string;
404 wordCount : TWordCount );
405 type
406 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeNum, ModeHanKana,
407 ModeWGraph, ModeWAlpha, ModeWNum,
408 ModeWHira, ModeWKata, ModeWKanji);
409 var
410 p, tail, last : PChar;
411 mode, newMode : Modes;
412 ch : Longword;
413 chSize : Integer;
414 wHiraDelimiter : TStringList;
415 wHiraFinalDelimiter : TStringList;
416 wKanjiDelimiter : TStringList;
417 words : TStringList;
418 aWord : string;
419 countInfo : TWordCountInfo;
420
421 function cutBoth( _aWord : string; _delim : TStringList ) : string;
422 var
423 _i : Integer;
424 begin
425 for _i := 0 to _delim.Count - 1 do begin
426 _aWord := CustomStringReplace(
427 _aWord,
428 _delim[ _i ],
429 #10 + _delim[ _i ] + #10, False );
430 end;
431 Result := _aWord;
432 end;
433
434 function cutFirst( _aWord : string; _delim : TStringList ) : string;
435 var
436 _i : Integer;
437 begin
438 for _i := 0 to _delim.Count - 1 do begin
439 _aWord := CustomStringReplace(
440 _aWord,
441 _delim[ _i ],
442 #10 + _delim[ _i ], False );
443 end;
444 Result := _aWord;
445 end;
446
447 function cutFinal( _aWord : string; _delim : TStringList ) : string;
448 var
449 _i : Integer;
450 begin
451 for _i := 0 to _delim.Count - 1 do begin
452 _aWord := CustomStringReplace(
453 _aWord,
454 _delim[ _i ],
455 _delim[ _i ] + #10, False );
456 end;
457 Result := _aWord;
458 end;
459
460 procedure addWord( _dst : TWordCount; _words : TStringList );
461 var
462 _aWord : string;
463 _i, _idx : Integer;
464 _countInfo : TWordCountInfo;
465 begin
466 for _i := 0 to _words.Count - 1 do begin
467 _aWord := _words[ _i ];
468 if Length( _aWord ) > 0 then begin
469 if _dst.Find( _aWord, _idx ) then begin
470 _countInfo := TWordCountInfo( _dst.Objects[ _idx ] );
471 end else begin
472 _countInfo := TWordCountInfo.Create;
473 _dst.AddObject( _aWord, _countInfo );
474 end;
475 _countInfo.WordCount := _countInfo.WordCount + 1;
476 end;
477 end;
478 end;
479
480 function changeMode( _aWord : string; _mode : Modes ) : string;
481 var
482 _i : Integer;
483 _aWord2 : string;
484 _pWord, _pWord2 : PChar;
485 _pWordTail, _pFound : PChar;
486 const
487 _delim : string = #10;
488 begin
489 {$IFDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
490 if mode = ModeWHira then begin
491 Result := '';
492 Exit;
493 end;
494 {$ENDIF}
495 if Ord( _mode ) >= Ord( ModeWGraph ) then begin
496 // ?ユ???
497 // ?鴻???若?鴻??荅違????
498 _aWord := CustomStringReplace( _aWord, ' ', '', False );
499 _aWord := CustomStringReplace( _aWord, '??', '', False );
500
501 // ???????帥?у??茯?????
502 case mode of
503 ModeWHira:
504 begin
505 _aWord := cutFinal( _aWord, wHiraFinalDelimiter );
506 Result := cutBoth( _aWord, wHiraDelimiter );
507 end;
508
509 ModeWKanji:
510 begin
511 // ???????帥?у??茯?????
512 _aWord := cutBoth( _aWord, wKanjiDelimiter );
513 // 4 byte (2 絖?) ???ゃ?у??茯?????
514 _pWord := PChar( _aWord );
515 _i := Length( _aWord );
516 _pWordTail := _pWord + _i;
517 SetLength( _aWord2, _i + (_i shr 2) );
518 _pWord2 := PChar( _aWord2 );
519
520 while _pWord < _pWordTail do begin
521 _pFound := AnsiStrPos( _pWord, PChar( _delim ) );
522 if _pFound = nil then
523 _pFound := _pWordTail;
524 _pFound := _pFound - 3;
525
526 while _pWord <= _pFound do begin
527 CopyMemory( _pWord2, _pWord, 4 ); _pWord2[ 4 ] := #10;
528 _pWord2 := _pWord2 + 5; _pWord := _pWord + 4;
529 end;
530 _i := _pFound + 4 - _pWord; // 4 = 3 + #10
531 CopyMemory( _pWord2, _pWord, _i );
532 _pWord2 := _pWord2 + _i; _pWord := _pWord + _i;
533 end;
534 if _pWord < _pWordTail then begin
535 _i := _pWordTail - _pWord;
536 CopyMemory( _pWord2, _pWord, _i );
537 _pWord2 := _pWord2 + _i;
538 end;
539 SetLength( _aWord2, _pWord2 - PChar( _aWord2 ) );
540
541 Result := _aWord2;
542 end;
543
544 else
545 Result := _aWord;
546 end;
547 end else begin
548 Result := _aWord;
549 end;
550 end;
551 const
552 WHIRA_DELIMITER = '??' + #10 + '??#39; + #10 + '??' + #10 + '??#39; + #10 + '????'
553 + #10 + '??#39; + #10 + '????' + #10 + '?障??#39;+ #10 + '??#39;
554 + #10 + '????' + #10 + '????' + #10 + '????'
555 + #10 + '????' + #10 + '????' + #10 + '????' + #10 + '????'
556 + #10 + '????#39; + #10 + '????#39; + #10 + '????#39; + #10 + '????#39;
557 + #10 + '????' + #10 + '????' + #10 + '????' + #10 + '????'
558 + #10 + '??????#39; + #10 + '??????#39; + #10 + '??????#39; + #10 + '??????#39;
559 + #10 + '????' + #10 + '????#39; + #10 + '????' + #10 + '????'
560 + #10 + '????' + #10 + '??????'
561 + #10 + '?с??' + #10 + '?障??' + #10 + '?障????'
562 + #10 + '?с????' + #10 + '?障????'
563 + #10 + '????' + #10 + '??????' + #10 + '??????' + #10 + '????????'
564 ;
565 WKANJI_DELIMITER = '??' + #10 + '??#39; + #10 + '綣?39; + #10 + '??' + #10 + '羈?'
566 + #10 + '筝?' + #10 + '??#39; + #10 + '??' + #10 + '??'
567 ;
568 WHIRA_FINAL_DELIMITER = '?c??' + #10 + '?c??#39;
569 ;{
570 + #10 + '???c??#39; + #10 + '???????c??#39; + #10 + '??????#39;
571 + #10 + '??????' + #10 + '?с??????'
572 + #10 + '?障??'
573 + #10 + '??????' + #10 + '????' + #10 + '????#39; + #10 + '??????#39;
574 + #10 + '??????' + #10 + '???c?宴??'
575 + #10 + '?с??' + #10 + '????'
576 + #10 + '????' + #10 + '??????' + #10 + '????' + #10 + '??????'
577 ;}
578 // '??#39; ?? '??????????' ????
579 HA_LINE = '?????????????障????????????違?宴????#39;;
580 HI_LINE = '???????<???蚊?帥?????????潟?眼??';
581 HU_LINE = '??????ゃ???泣?????????吟?激??';
582 HE_LINE = '???????????吾?????????鴻?冴??';
583 HO_LINE = '???????????祉???????????若?純??';
584 KA_LINE = '?≪???泣?帥??????ゃ???????吟???????<?泣??#39;;
585 KI_LINE = '?ゃ???激???????????違???吾??????#39;;
586 KU_LINE = '?????鴻?????????????違?????ャ??#39;;
587 KE_LINE = '???宴?祉???????<???宴?蚊?????с??#39;;
588 KO_LINE = '???潟?純???????≪?????蚊?眼??????#39;;
589 kKanji = [$80..$A0, $E0..$ff];
590 begin
591
592 wHiraDelimiter := TStringList.Create;
593 wHiraFinalDelimiter := TStringList.Create;
594 wKanjiDelimiter := TStringList.Create;
595 words := TStringList.Create;
596 try
597 mode := ModeWhite;
598 {$IFNDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
599 wHiraDelimiter.Text := WHIRA_DELIMITER;
600 wHiraFinalDelimiter.Text := WHIRA_FINAL_DELIMITER;
601 {$ENDIF}
602 wKanjiDelimiter.Text := WKANJI_DELIMITER;
603 p := PChar( text );
604 tail := p + Length( text );
605 last := p;
606
607 while p < tail do begin
608 // ??絖????帥?ゃ?????ゅ??/span>
609 // ?糸???鴻? ModeGraph ?????????у???ャ???綽??????????????
610 // if Byte(Byte( p^ ) - $a1) < $5e then begin
611 if Byte( p^ ) in kKanji then begin
612 if p + 1 < tail then begin
613 ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
614 case ch of
615 // ?鴻???若?鴻?у??茯????????????????
616 //$8140: newMode := ModeWhite;
617 $8141..$824e: newMode := ModeWGraph;
618 $824f..$8258: newMode := ModeWNum;
619 $8260..$829a: newMode := ModeWAlpha;
620 $829f..$82f1: newMode := ModeWHira;
621 $8340..$8396: newMode := ModeWKata;
622 else newMode := ModeWKanji;
623 end;
624 // '??????#39; ??抗篁??????障???????帥?????????障????
625 if (mode = ModeWHira) or (mode = ModeWKata) then
626 if (ch = $814a) or (ch = $814b) or (ch = $815b) then
627 newMode := mode;
628 end else begin
629 newMode := ModeWhite;
630 end;
631
632 chSize := 2;
633 end else begin
634 newMode := Modes( CharMode1[ Byte( p^ ) ] );
635 if (p^ = ' ') and (Ord( mode ) >= Ord( ModeWGraph )) then begin
636 // 篁??障?ф?ユ????т??鴻???若??/span>
637 // ??茯???膵???????с?鴻???若?鴻??荅違????
638 // ?糸??茹?????????絽吾?鴻???若?鴻?у?阪??????????????荅違??????
639 newMode := mode;
640 end;
641
642 chSize := 1;
643 end;
644
645 if mode <> newMode then begin
646
647 // ??絖????帥?ゃ????紊??眼??????
648 if mode <> ModeWhite then begin
649 SetLength( aWord, p - last );
650 CopyMemory( PChar( aWord ), last, p - last );
651
652 words.Text := changeMode( aWord, mode );
653
654 // ??茯??脂??/span>
655 addWord( wordCount, words );
656 end;
657
658 last := p;
659 mode := newMode;
660
661 end;
662
663 p := p + chSize;
664 end; // while
665
666 if mode <> ModeWhite then begin
667 SetLength( aWord, p - last );
668 CopyMemory( PChar( aWord ), last, p - last );
669
670 words.Text := changeMode( aWord, mode );
671
672 // ??茯??脂??/span>
673 addWord( wordCount, words );
674 end;
675 finally
676 words.Free;
677 wKanjiDelimiter.Free;
678 wHiraFinalDelimiter.Free;
679 wHiraDelimiter.Free;
680 end;
681
682 end;
683
684 //==============================
685 // CalcPaulGraham
686 //==============================
687 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
688
689 function p( const aWord : string ) : Single;
690 var
691 info : TWordInfo;
692 begin
693 info := Objects[ aWord ];
694 if info = nil then
695 Result := 0.415
696 else if info.NormalWord = 0 then
697 Result := 0.99
698 else if info.ImportantWord = 0 then
699 Result := 0.01
700 else if info.ImportantWord + info.NormalWord * 2 < 5 then
701 Result := 0.5
702 else
703 Result := ( info.ImportantWord / info.ImportantText ) /
704 ((info.NormalWord * 2 / info.NormalText ) +
705 (info.ImportantWord / info.ImportantText));
706 end;
707
708 var
709 s, q : Extended;
710 i : Integer;
711 narray : TList;
712 const
713 SAMPLE_COUNT = 15;
714 begin
715
716 Result := 1;
717 if wordCount.Count = 0 then
718 Exit;
719
720 narray := TList.Create;
721 try
722 for i := 0 to wordCount.Count - 1 do begin
723 narray.Add( Pointer( p( wordCount[ i ] ) ) );
724 end;
725
726 narray.Sort( AbsSort );
727
728 s := 1;
729 q := 1;
730 i := min( SAMPLE_COUNT, narray.Count );
731 while i > 0 do begin
732 Dec( i );
733
734 s := s * Single( narray[ i ] );
735 q := q * (1 - Single( narray[ i ] ));
736 end;
737
738 Result := s / (s + q);
739 finally
740 narray.Free;
741 end;
742
743 end;
744
745 //==============================
746 // CalcGaryRobinson
747 //==============================
748 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
749
750 function p( const aWord : string ) : Single;
751 var
752 info : TWordInfo;
753 begin
754 info := Objects[ aWord ];
755 if info = nil then
756 Result := 0.415
757 else if info.ImportantWord = 0 then
758 Result := 0.01
759 else if info.NormalWord = 0 then
760 Result := 0.99
761 else
762 {
763 Result := ( info.ImportantWord / info.ImportantText ) /
764 ((info.NormalWord / info.NormalText ) +
765 (info.ImportantWord / info.ImportantText));
766 }
767 Result := (info.ImportantWord * info.NormalText) /
768 (info.NormalWord * info.ImportantText +
769 info.ImportantWord * info.NormalText);
770 end;
771
772 function f( cnt : Integer; n, mean : Single ) : Extended;
773 const
774 k = 0.001;
775 begin
776 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
777 end;
778
779 var
780 n : Extended;
781 narray : array of Single;
782 mean : Extended;
783 countInfo : TWordCountInfo;
784 i : Integer;
785 P1, Q1, R1 : Extended;
786 cnt : Extended;
787 begin
788
789 if wordCount.Count = 0 then begin
790 Result := 1;
791 Exit;
792 end;
793
794 SetLength( narray, wordCount.Count );
795 mean := 0;
796 for i := 0 to wordCount.Count - 1 do begin
797 n := p( wordCount[ i ] );
798 narray[ i ] := n;
799 mean := mean + n;
800 end;
801 mean := mean / wordCount.Count;
802
803 P1 := 1;
804 Q1 := 1;
805 for i := 0 to wordCount.Count - 1 do begin
806 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
807 n := f( countInfo.WordCount, narray[ i ], mean );
808 P1 := P1 * ( 1 - n );
809 Q1 := Q1 * n;
810 end;
811 cnt := wordCount.Count;
812 if cnt = 0 then
813 cnt := 1
814 else
815 P1 := 1 - Power( P1, 1 / cnt );
816 Q1 := 1 - Power( Q1, 1 / cnt );
817
818 if P1 + Q1 = 0 then begin
819 Result := 0.5
820 end else begin
821 n := (P1 - Q1) / (P1 + Q1);
822 Result := (1 + n) / 2;
823 end;
824
825 end;
826
827 //==============================
828 // CalcGaryRobinsonFisher
829 //==============================
830 function TGikoBayesian.CalcGaryRobinsonFisher(
831 wordCount : TWordCount
832 ) : Extended;
833
834 function p( const aWord : string ) : Single;
835 var
836 info : TWordInfo;
837 begin
838 info := Objects[ aWord ];
839 if info = nil then
840 Result := 0.415
841 else if info.ImportantWord = 0 then
842 Result := 0.01
843 else if info.NormalWord = 0 then
844 Result := 0.99
845 else
846 {
847 Result := ( info.ImportantWord / info.ImportantText ) /
848 ((info.NormalWord / info.NormalText ) +
849 (info.ImportantWord / info.ImportantText));
850 }
851 Result := (info.ImportantWord * info.NormalText) /
852 (info.NormalWord * info.ImportantText +
853 info.ImportantWord * info.NormalText);
854 end;
855
856 function f( cnt : Integer; n, mean : Single ) : Extended;
857 const
858 k = 0.001;
859 begin
860 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
861 end;
862
863 function prbx( x2, degree : Extended ) : Extended;
864 begin
865
866 Result := 0.5;
867
868 end;
869
870 var
871 n : Extended;
872 narray : array of Single;
873 mean : Extended;
874 countInfo : TWordCountInfo;
875 i : Integer;
876 normal : Extended;
877 important : Extended;
878 P1, Q1 : Extended;
879 cnt : Extended;
880 begin
881
882 if wordCount.Count = 0 then begin
883 Result := 1;
884 Exit;
885 end;
886
887 SetLength( narray, wordCount.Count );
888 mean := 0;
889 for i := 0 to wordCount.Count - 1 do begin
890 n := p( wordCount[ i ] );
891 narray[ i ] := n;
892 mean := mean + n;
893 end;
894 mean := mean / wordCount.Count;
895
896 P1 := 1;
897 Q1 := 1;
898 for i := 0 to wordCount.Count - 1 do begin
899 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
900 n := f( countInfo.WordCount, narray[ i ], mean );
901 P1 := P1 * ( 1 - n );
902 Q1 := Q1 * n;
903 end;
904 cnt := wordCount.Count;
905 if cnt = 0 then
906 cnt := 1
907 else
908 P1 := Power( P1, 1 / cnt );
909 Q1 := Power( Q1, 1 / cnt );
910
911 P1 := 1 - prbx( -2 * Ln( P1 ), 2 * cnt );
912 Q1 := 1 - prbx( -2 * Ln( Q1 ), 2 * cnt );
913
914 Result := (1 + P1 - Q1) / 2;
915
916 end;
917
918 //==============================
919 // Parse
920 //==============================
921 function TGikoBayesian.Parse(
922 const text : string;
923 wordCount : TWordCount;
924 algorithm : TGikoBayesianAlgorithm
925 ) : Extended;
926 begin
927
928 CountWord( text, wordCount );
929 case algorithm of
930 gbaPaulGraham: Result := CalcPaulGraham( wordCount );
931 gbaGaryRobinson: Result := CalcGaryRobinson( wordCount );
932 gbaGaryRobinsonFisher:
933 Result := CalcGaryRobinsonFisher( wordCount );
934 else Result := 0;
935 end;
936
937 end;
938
939 //==============================
940 // Learn
941 //==============================
942 procedure TGikoBayesian.Learn(
943 wordCount : TWordCount;
944 isImportant : Boolean );
945 var
946 aWord : string;
947 wordinfo : TWordInfo;
948 countinfo : TWordCountInfo;
949 i : Integer;
950 begin
951
952 for i := 0 to wordCount.Count - 1 do begin
953 aWord := wordCount[ i ];
954 wordinfo := Objects[ aWord ];
955 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
956 if wordinfo = nil then begin
957 wordinfo := TWordInfo.Create;
958 Objects[ aWord ] := wordinfo;
959 end;
960
961 if isImportant then begin
962 wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
963 wordinfo.ImportantText := wordinfo.ImportantText + 1;
964 end else begin
965 wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
966 wordinfo.NormalText := wordinfo.NormalText + 1;
967 end;
968 end;
969
970 end;
971
972 //==============================
973 // Forget
974 //==============================
975 procedure TGikoBayesian.Forget(
976 wordCount : TWordCount;
977 isImportant : Boolean );
978 var
979 aWord : string;
980 wordinfo : TWordInfo;
981 countinfo : TWordCountInfo;
982 i : Integer;
983 begin
984
985 for i := 0 to wordCount.Count - 1 do begin
986 aWord := wordCount[ i ];
987 wordinfo := Objects[ aWord ];
988 if wordinfo = nil then
989 Continue;
990
991 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
992 if isImportant then begin
993 if wordInfo.ImportantText > 0 then begin
994 wordinfo.ImportantText := wordinfo.ImportantText - 1;
995 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
996 end;
997 end else begin
998 if wordinfo.NormalText > 0 then begin
999 wordinfo.NormalText := wordinfo.NormalText - 1;
1000 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;
1001 end;
1002 end;
1003 end;
1004
1005 end;
1006
1007 end.

Back to OSDN">Back to OSDN
ViewVC Help
Powered by ViewVC 1.1.26