Develop and Download Open Source Software

Browse CVS Repository

Contents of /gikonavigoeson/gikonavi/GikoBayesian.pas

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.16 - (show annotations) (download) (as text)
Mon Nov 1 10:32:02 2004 UTC (19 years, 5 months ago) by yoffy
Branch: MAIN
CVS Tags: bv1_49_0_563
Changes since 1.15: +3 -5 lines
File MIME type: text/x-pascal
else が紛れ込んでたので修正。

1 unit GikoBayesian;
2
3 {!
4 \file GikoBayesian.pas
5 \brief ???ゃ?吾?≪?潟???c????/span>
6
7 $Id: GikoBayesian.pas,v 1.15 2004/11/01 10:28:24 yoffy Exp $
8 }
9
10 //! 綛割皿????莨??吾??????????
11 {$DEFINE GIKO_BAYESIAN_NO_HIRAGANA_DIC}
12
13 interface
14
15 //==================================================
16 uses
17 //==================================================
18 Classes;
19
20 //==================================================
21 type
22 //==================================================
23
24 {!***********************************************************
25 \brief ??茯???????????/span>
26 ************************************************************}
27 TWordInfo = class( TObject )
28 private
29 FNormalWord : Integer; //!< ??絽吾????茯????????糸?眼????????/span>
30 FImportantWord : Integer; //!< 羈?????茯????????糸?眼????????/span>
31 FNormalText : Integer; //!< ??絽吾????茯??????????障??????????腴?????/span>
32 FImportantText : Integer; //!< 羈?????茯??????????障??????????腴?????/span>
33
34 public
35 property NormalWord : Integer read FNormalWord write FNormalWord;
36 property ImportantWord : Integer read FImportantWord write FImportantWord;
37 property NormalText : Integer read FNormalText write FNormalText;
38 property ImportantText : Integer read FImportantText write FImportantText;
39 end;
40
41 {!***********************************************************
42 \brief 茹f??羝??水??茯???????????/span>
43 ************************************************************}
44 TWordCountInfo = class( TObject )
45 private
46 FWordCount : Integer; //!< ??茯???/span>
47
48 public
49 property WordCount : Integer read FWordCount write FWordCount;
50 end;
51
52 {!***********************************************************
53 \brief 茹f??羝??水??茯????鴻??
54 ************************************************************}
55 // TWordCount = class( THashedStringList ) // 羶???
56 TWordCount = class( TStringList )
57 public
58 constructor Create;
59 destructor Destroy; override;
60 end;
61
62 {!***********************************************************
63 \brief ???c???帥?≪???眼???冴??
64 ************************************************************}
65 TGikoBayesianAlgorithm =
66 (gbaPaulGraham, gbaGaryRobinson, gbaGaryRobinsonFisher);
67
68 {!***********************************************************
69 \brief ???ゃ?吾?≪?潟???c????/span>
70 ************************************************************}
71 // TGikoBayesian = class( THashedStringList ) // 羶???
72 TGikoBayesian = class( TStringList )
73 private
74 FFilePath : string; //!< 茯??粋昭???????<?ゃ??????/span>
75 function GetObject( const name : string ) : TWordInfo;
76 procedure SetObject( const name : string; value : TWordInfo );
77
78 public
79 constructor Create;
80 destructor Destroy; override;
81
82 //! ???<?ゃ??????絖??絮ユ???茯??水?冴???障??
83 procedure LoadFromFile( const filePath : string );
84
85 //! ???<?ゃ?????膺?絮ユ???篆?絖????障??
86 procedure SaveToFile( const filePath : string );
87
88 //! ???<?ゃ?????膺?絮ユ???篆?絖????障??
89 procedure Save;
90
91 //! ??茯???????????宴????緇????障??
92 property Objects[ const name : string ] : TWordInfo
93 read GetObject write SetObject; default;
94
95 //! ??腴??????障??????茯????????潟?????障??
96 procedure CountWord(
97 const text : string;
98 wordCount : TWordCount );
99
100 {!
101 \brief Paul Graham 羈????冴?ャ??????腴???絵??墾??羆阪????障??
102 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
103 }
104 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
105
106 {!
107 \brief GaryRobinson 羈????冴?ャ??????腴???絵??墾??羆阪????障??
108 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
109 }
110 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
111
112 {!
113 \brief GaryRobinson-Fisher 羈????冴?ャ??????腴???絵??墾??羆阪????障??
114 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
115 }
116 function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
117
118 {!
119 \brief ??腴???茹f??
120 \param text 茹f????????腴?
121 \param wordCount 茹f??????????茯????鴻????菴???
122 \param algorithm 羈???墾??浦絎??????????≪???眼???冴??????絎????障??
123 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
124
125 CountWord ? Calcxxxxx ???障???????茵??????????с????
126 }
127 function Parse(
128 const text : string;
129 wordCount : TWordCount;
130 algorithm : TGikoBayesianAlgorithm = gbaGaryRobinsonFisher
131 ) : Extended;
132
133 {!
134 \brief 絖??????
135 \param wordCount Parse ?цВ??????????茯????鴻??
136 \param isImportant 羈??????鴻????腴???????????????? True
137 }
138 procedure Learn(
139 wordCount : TWordCount;
140 isImportant : Boolean );
141
142 {!
143 \brief 絖??腟?????綽?????
144 \param wordCount Parse ?цВ??????????茯????鴻??
145 \param isImportant 羈??????鴻????腴???????????????????????? True
146 \warning 絖??羝??帥????腴???????????∈茯??堺?ャ?障??????<br>
147 Learn ????????????腴??? isImportant ???????c????????腴???
148 Forget ?????????若?帥???若?鴻???贋?????障????<br>
149 絖??羝??帥????????????????????????????????
150
151 ???????膺?腟??????????≪????????с???????障??????<br>
152 wordCount ??緇?????腴? (Parse ? text 綣??? ???膺?腟??????帥?????≪???障????<br><br>
153
154 筝祉??絵????腴?????羈?????腴????????帥????????? Forget -> Learn ?????т戎?????障????
155 }
156 procedure Forget(
157 wordCount : TWordCount;
158 isImportant : Boolean );
159 end;
160
161 //==================================================
162 implementation
163 //==================================================
164
165 uses
166 SysUtils, Math, Windows,
167 MojuUtils;
168
169 const
170 GIKO_BAYESIAN_FILE_VERSION = '1.0';
171 {
172 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
173 ModeWGraph, ModeWAlpha, ModeWNum,
174 ModeWHira, ModeWKata, ModeWKanji);
175 }
176 CharMode1 : array [ 0..255 ] of Byte =
177 (
178 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
179 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
181 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
182 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
183 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
184 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
185 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
186
187 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
188 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
189 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
190 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
191 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
192 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
194 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
195 );
196
197 //************************************************************
198 // misc
199 //************************************************************
200
201 //==============================
202 // RemoveToken
203 //==============================
204 function RemoveToken(var s: string;const delimiter: string): string;
205 var
206 p: Integer;
207 begin
208 p := AnsiPos(delimiter, s);
209 if p = 0 then
210 Result := s
211 else
212 Result := Copy(s, 1, p - 1);
213 s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
214 end;
215
216 //==============================
217 // AbsSort
218 //==============================
219 function AbsSort( p1, p2 : Pointer ) : Integer;
220 var
221 v1, v2 : Single;
222 begin
223
224 v1 := Abs( Single( p1 ) - 0.5 );
225 v2 := Abs( Single( p2 ) - 0.5 );
226 if v1 > v2 then
227 Result := -1
228 else if v1 = v2 then
229 Result := 0
230 else
231 Result := 1;
232
233 end;
234
235 //************************************************************
236 // TWordCount class
237 //************************************************************
238 constructor TWordCount.Create;
239 begin
240
241 Duplicates := dupIgnore;
242 CaseSensitive := True;
243 Sorted := True;
244
245 end;
246
247 destructor TWordCount.Destroy;
248 var
249 i : Integer;
250 begin
251
252 for i := Count - 1 downto 0 do
253 if Objects[ i ] <> nil then
254 Objects[ i ].Free;
255
256 inherited;
257
258 end;
259
260 //************************************************************
261 // TGikoBayesian class
262 //************************************************************
263
264 //==============================
265 // Create
266 //==============================
267 constructor TGikoBayesian.Create;
268 begin
269
270 Duplicates := dupIgnore;
271 CaseSensitive := True;
272 Sorted := True;
273
274 end;
275
276 //==============================
277 // Destroy
278 //==============================
279 destructor TGikoBayesian.Destroy;
280 var
281 i : Integer;
282 begin
283
284 for i := Count - 1 downto 0 do
285 if inherited Objects[ i ] <> nil then
286 inherited Objects[ i ].Free;
287
288 inherited;
289
290 end;
291
292 procedure TGikoBayesian.LoadFromFile( const filePath : string );
293 var
294 i : Integer;
295 sl : TStringList;
296 s : string;
297 name : string;
298 info : TWordInfo;
299 begin
300
301 FFilePath := filePath;
302
303 if not FileExists( filePath ) then
304 Exit;
305
306 sl := TStringList.Create;
307 try
308 sl.LoadFromFile( filePath );
309
310 for i := 1 to sl.Count - 1 do begin
311 s := sl[ i ];
312 name := RemoveToken( s, #1 );
313 info := TWordInfo.Create;
314 info.NormalWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
315 info.ImportantWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
316 info.NormalText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
317 info.ImportantText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
318
319 AddObject( name, info );
320 end;
321 finally
322 sl.Free;
323 end;
324
325 end;
326
327 procedure TGikoBayesian.SaveToFile( const filePath : string );
328 var
329 i : Integer;
330 sl : TStringList;
331 s : string;
332 info : TWordInfo;
333 begin
334
335 FFilePath := filePath;
336
337 sl := TStringList.Create;
338 try
339 sl.BeginUpdate;
340 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
341
342 for i := 0 to Count - 1 do begin
343 info := TWordInfo( inherited Objects[ i ] );
344 s := Strings[ i ] + #1
345 + Format('%x', [info.NormalWord]) + #1
346 + Format('%x', [info.ImportantWord]) + #1
347 + Format('%x', [info.NormalText]) + #1
348 + Format('%x', [info.ImportantText]);
349
350 sl.Add(s);
351 end;
352 sl.EndUpdate;
353 sl.SaveToFile( filePath );
354 finally
355 sl.Free;
356 end;
357
358 end;
359
360 procedure TGikoBayesian.Save;
361 begin
362
363 if FFilePath <> '' then
364 SaveToFile( FFilePath );
365
366 end;
367
368 //==============================
369 // GetObject
370 //==============================
371 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
372 var
373 idx : Integer;
374 begin
375
376 if Find( name, idx ) then
377 Result := TWordInfo( inherited Objects[ idx ] )
378 else
379 Result := nil;
380
381 end;
382
383 //==============================
384 // SetObject
385 //==============================
386 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
387 var
388 idx : Integer;
389 begin
390
391 if Find( name, idx ) then
392 inherited Objects[ idx ] := value
393 else
394 AddObject( name, value );
395
396 end;
397
398
399 //==============================
400 // CountWord
401 //==============================
402 procedure TGikoBayesian.CountWord(
403 const text : string;
404 wordCount : TWordCount );
405 type
406 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeNum, ModeHanKana,
407 ModeWGraph, ModeWAlpha, ModeWNum,
408 ModeWHira, ModeWKata, ModeWKanji);
409 var
410 p, tail, last : PChar;
411 mode, newMode : Modes;
412 ch : Longword;
413 chSize : Integer;
414 wHiraDelimiter : TStringList;
415 wHiraFinalDelimiter : TStringList;
416 wKanjiDelimiter : TStringList;
417 words : TStringList;
418 aWord : string;
419 countInfo : TWordCountInfo;
420
421 function cutBoth( _aWord : string; _delim : TStringList ) : string;
422 var
423 _i : Integer;
424 begin
425 for _i := 0 to _delim.Count - 1 do begin
426 _aWord := CustomStringReplace(
427 _aWord,
428 _delim[ _i ],
429 #10 + _delim[ _i ] + #10, False );
430 end;
431 Result := _aWord;
432 end;
433
434 function cutFirst( _aWord : string; _delim : TStringList ) : string;
435 var
436 _i : Integer;
437 begin
438 for _i := 0 to _delim.Count - 1 do begin
439 _aWord := CustomStringReplace(
440 _aWord,
441 _delim[ _i ],
442 #10 + _delim[ _i ], False );
443 end;
444 Result := _aWord;
445 end;
446
447 function cutFinal( _aWord : string; _delim : TStringList ) : string;
448 var
449 _i : Integer;
450 begin
451 for _i := 0 to _delim.Count - 1 do begin
452 _aWord := CustomStringReplace(
453 _aWord,
454 _delim[ _i ],
455 _delim[ _i ] + #10, False );
456 end;
457 Result := _aWord;
458 end;
459
460 procedure addWord( _dst : TWordCount; _words : TStringList );
461 var
462 _aWord : string;
463 _i, _idx : Integer;
464 _countInfo : TWordCountInfo;
465 begin
466 for _i := 0 to _words.Count - 1 do begin
467 _aWord := _words[ _i ];
468 if Length( _aWord ) > 0 then begin
469 if _dst.Find( _aWord, _idx ) then begin
470 _countInfo := TWordCountInfo( _dst.Objects[ _idx ] );
471 end else begin
472 _countInfo := TWordCountInfo.Create;
473 _dst.AddObject( _aWord, _countInfo );
474 end;
475 _countInfo.WordCount := _countInfo.WordCount + 1;
476 end;
477 end;
478 end;
479
480 function changeMode( _aWord : string; _mode : Modes ) : string;
481 var
482 _i : Integer;
483 _aWord2 : string;
484 _pWord, _pWord2 : PChar;
485 _pWordTail, _pFound : PChar;
486 const
487 _delim : string = #10;
488 begin
489 {$IFDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
490 if mode = ModeWHira then begin
491 Result := '';
492 Exit;
493 end;
494 {$ENDIF}
495 if Ord( _mode ) >= Ord( ModeWGraph ) then begin
496 // ?ユ???
497 // ?鴻???若?鴻??荅違????
498 _aWord := CustomStringReplace( _aWord, ' ', '', False );
499 _aWord := CustomStringReplace( _aWord, '??', '', False );
500
501 // ???????帥?у??茯?????
502 case mode of
503 ModeWHira:
504 begin
505 _aWord := cutFinal( _aWord, wHiraFinalDelimiter );
506 Result := cutBoth( _aWord, wHiraDelimiter );
507 end;
508
509 ModeWKanji:
510 begin
511 // ???????帥?у??茯?????
512 _aWord := cutBoth( _aWord, wKanjiDelimiter );
513 // 4 byte (2 絖?) ???ゃ?у??茯?????
514 _pWord := PChar( _aWord );
515 _i := Length( _aWord );
516 _pWordTail := _pWord + _i;
517 SetLength( _aWord2, _i + (_i shr 2) );
518 _pWord2 := PChar( _aWord2 );
519
520 while _pWord < _pWordTail do begin
521 _pFound := AnsiStrPos( _pWord, PChar( _delim ) );
522 if _pFound = nil then
523 _pFound := _pWordTail;
524 _pFound := _pFound - 3;
525
526 while _pWord <= _pFound do begin
527 CopyMemory( _pWord2, _pWord, 4 ); _pWord2[ 4 ] := #10;
528 _pWord2 := _pWord2 + 5; _pWord := _pWord + 4;
529 end;
530 _i := _pFound + 4 - _pWord; // 4 = 3 + #10
531 CopyMemory( _pWord2, _pWord, _i );
532 _pWord2 := _pWord2 + _i; _pWord := _pWord + _i;
533 end;
534 if _pWord < _pWordTail then begin
535 _i := _pWordTail - _pWord;
536 CopyMemory( _pWord2, _pWord, _i );
537 _pWord2 := _pWord2 + _i;
538 end;
539 SetLength( _aWord2, _pWord2 - PChar( _aWord2 ) );
540
541 Result := _aWord2;
542 end;
543
544 else
545 Result := _aWord;
546 end;
547 end else begin
548 Result := _aWord;
549 end;
550 end;
551 const
552 WHIRA_DELIMITER = '??' + #10 + '??#39; + #10 + '??' + #10 + '??#39; + #10 + '????'
553 + #10 + '??#39; + #10 + '????' + #10 + '?障??#39;+ #10 + '??#39;
554 + #10 + '????' + #10 + '????' + #10 + '????'
555 + #10 + '????' + #10 + '????' + #10 + '????' + #10 + '????'
556 + #10 + '????#39; + #10 + '????#39; + #10 + '????#39; + #10 + '????#39;
557 + #10 + '????' + #10 + '????' + #10 + '????' + #10 + '????'
558 + #10 + '??????#39; + #10 + '??????#39; + #10 + '??????#39; + #10 + '??????#39;
559 + #10 + '????' + #10 + '????#39; + #10 + '????' + #10 + '????'
560 + #10 + '????' + #10 + '??????'
561 + #10 + '?с??' + #10 + '?障??' + #10 + '?障????'
562 + #10 + '?с????' + #10 + '?障????'
563 + #10 + '????' + #10 + '??????' + #10 + '??????' + #10 + '????????'
564 ;
565 WKANJI_DELIMITER = '??' + #10 + '??#39; + #10 + '綣?39; + #10 + '??' + #10 + '羈?'
566 + #10 + '筝?' + #10 + '??#39; + #10 + '??' + #10 + '??'
567 ;
568 WHIRA_FINAL_DELIMITER = '?c??' + #10 + '?c??#39;
569 ;{
570 + #10 + '???c??#39; + #10 + '???????c??#39; + #10 + '??????#39;
571 + #10 + '??????' + #10 + '?с??????'
572 + #10 + '?障??'
573 + #10 + '??????' + #10 + '????' + #10 + '????#39; + #10 + '??????#39;
574 + #10 + '??????' + #10 + '???c?宴??'
575 + #10 + '?с??' + #10 + '????'
576 + #10 + '????' + #10 + '??????' + #10 + '????' + #10 + '??????'
577 ;}
578 // '??#39; ?? '??????????' ????
579 HA_LINE = '?????????????障????????????違?宴????#39;;
580 HI_LINE = '???????<???蚊?帥?????????潟?眼??';
581 HU_LINE = '??????ゃ???泣?????????吟?激??';
582 HE_LINE = '???????????吾?????????鴻?冴??';
583 HO_LINE = '???????????祉???????????若?純??';
584 KA_LINE = '?≪???泣?帥??????ゃ???????吟???????<?泣??#39;;
585 KI_LINE = '?ゃ???激???????????違???吾??????#39;;
586 KU_LINE = '?????鴻?????????????違?????ャ??#39;;
587 KE_LINE = '???宴?祉???????<???宴?蚊?????с??#39;;
588 KO_LINE = '???潟?純???????≪?????蚊?眼??????#39;;
589 kKanji = [$80..$A0, $E0..$ff];
590 begin
591
592 wHiraDelimiter := TStringList.Create;
593 wHiraFinalDelimiter := TStringList.Create;
594 wKanjiDelimiter := TStringList.Create;
595 words := TStringList.Create;
596 try
597 mode := ModeWhite;
598 {$IFNDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
599 wHiraDelimiter.Text := WHIRA_DELIMITER;
600 wHiraFinalDelimiter.Text := WHIRA_FINAL_DELIMITER;
601 {$ENDIF}
602 wKanjiDelimiter.Text := WKANJI_DELIMITER;
603 p := PChar( text );
604 tail := p + Length( text );
605 last := p;
606
607 while p < tail do begin
608 // ??絖????帥?ゃ?????ゅ??/span>
609 // ?糸???鴻? ModeGraph ?????????у???ャ???綽??????????????
610 // if Byte(Byte( p^ ) - $a1) < $5e then begin
611 if Byte( p^ ) in kKanji then begin
612 if p + 1 < tail then begin
613 ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
614 case ch of
615 // ?鴻???若?鴻?у??茯????????????????
616 //$8140: newMode := ModeWhite;
617 $8141..$824e: newMode := ModeWGraph;
618 $824f..$8258: newMode := ModeWNum;
619 $8260..$829a: newMode := ModeWAlpha;
620 $829f..$82f1: newMode := ModeWHira;
621 $8340..$8396: newMode := ModeWKata;
622 else newMode := ModeWKanji;
623 end;
624 // '??????#39; ??抗篁??????障???????帥?????????障????
625 if (mode = ModeWHira) or (mode = ModeWKata) then
626 if (ch = $814a) or (ch = $814b) or (ch = $815b) then
627 newMode := mode;
628 end else begin
629 newMode := ModeWhite;
630 end;
631
632 chSize := 2;
633 end else begin
634 newMode := Modes( CharMode1[ Byte( p^ ) ] );
635 if (p^ = ' ') and (Ord( mode ) >= Ord( ModeWGraph )) then begin
636 // 篁??障?ф?ユ????т??鴻???若??/span>
637 // ??茯???膵???????с?鴻???若?鴻??荅違????
638 // ?糸??茹?????????絽吾?鴻???若?鴻?у?阪??????????????荅違??????
639 newMode := mode;
640 end;
641
642 chSize := 1;
643 end;
644
645 if mode <> newMode then begin
646
647 // ??絖????帥?ゃ????紊??眼??????
648 if mode <> ModeWhite then begin
649 SetLength( aWord, p - last );
650 CopyMemory( PChar( aWord ), last, p - last );
651
652 words.Text := changeMode( aWord, mode );
653
654 // ??茯??脂??/span>
655 addWord( wordCount, words );
656 end;
657
658 last := p;
659 mode := newMode;
660
661 end;
662
663 p := p + chSize;
664 end; // while
665
666 if mode <> ModeWhite then begin
667 SetLength( aWord, p - last );
668 CopyMemory( PChar( aWord ), last, p - last );
669
670 words.Text := changeMode( aWord, mode );
671
672 // ??茯??脂??/span>
673 addWord( wordCount, words );
674 end;
675 finally
676 words.Free;
677 wKanjiDelimiter.Free;
678 wHiraFinalDelimiter.Free;
679 wHiraDelimiter.Free;
680 end;
681
682 end;
683
684 //==============================
685 // CalcPaulGraham
686 //==============================
687 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
688
689 function p( const aWord : string ) : Single;
690 var
691 info : TWordInfo;
692 begin
693 info := Objects[ aWord ];
694 if info = nil then
695 Result := 0.415
696 else if info.NormalWord = 0 then
697 Result := 0.99
698 else if info.ImportantWord = 0 then
699 Result := 0.01
700 else if info.ImportantWord + info.NormalWord * 2 < 5 then
701 Result := 0.5
702 else
703 Result := ( info.ImportantWord / info.ImportantText ) /
704 ((info.NormalWord * 2 / info.NormalText ) +
705 (info.ImportantWord / info.ImportantText));
706 end;
707
708 var
709 s, q : Extended;
710 i : Integer;
711 narray : TList;
712 const
713 SAMPLE_COUNT = 15;
714 begin
715
716 Result := 1;
717 if wordCount.Count = 0 then
718 Exit;
719
720 narray := TList.Create;
721 try
722 for i := 0 to wordCount.Count - 1 do begin
723 narray.Add( Pointer( p( wordCount[ i ] ) ) );
724 end;
725
726 narray.Sort( AbsSort );
727
728 s := 1;
729 q := 1;
730 i := min( SAMPLE_COUNT, narray.Count );
731 while i > 0 do begin
732 Dec( i );
733
734 s := s * Single( narray[ i ] );
735 q := q * (1 - Single( narray[ i ] ));
736 end;
737
738 Result := s / (s + q);
739 finally
740 narray.Free;
741 end;
742
743 end;
744
745 //==============================
746 // CalcGaryRobinson
747 //==============================
748 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
749
750 function p( const aWord : string ) : Single;
751 var
752 info : TWordInfo;
753 begin
754 info := Objects[ aWord ];
755 if info = nil then
756 Result := 0.415
757 else if info.ImportantWord = 0 then
758 Result := 0.01
759 else if info.NormalWord = 0 then
760 Result := 0.99
761 else
762 {
763 Result := ( info.ImportantWord / info.ImportantText ) /
764 ((info.NormalWord / info.NormalText ) +
765 (info.ImportantWord / info.ImportantText));
766 }
767 Result := (info.ImportantWord * info.NormalText) /
768 (info.NormalWord * info.ImportantText +
769 info.ImportantWord * info.NormalText);
770 end;
771
772 function f( cnt : Integer; n, mean : Single ) : Extended;
773 const
774 k = 0.001;
775 begin
776 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
777 end;
778
779 var
780 n : Extended;
781 narray : array of Single;
782 mean : Extended;
783 countInfo : TWordCountInfo;
784 i : Integer;
785 P1, Q1, R1 : Extended;
786 cnt : Extended;
787 begin
788
789 if wordCount.Count = 0 then begin
790 Result := 1;
791 Exit;
792 end;
793
794 SetLength( narray, wordCount.Count );
795 mean := 0;
796 for i := 0 to wordCount.Count - 1 do begin
797 n := p( wordCount[ i ] );
798 narray[ i ] := n;
799 mean := mean + n;
800 end;
801 mean := mean / wordCount.Count;
802
803 P1 := 1;
804 Q1 := 1;
805 for i := 0 to wordCount.Count - 1 do begin
806 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
807 n := f( countInfo.WordCount, narray[ i ], mean );
808 P1 := P1 * ( 1 - n );
809 Q1 := Q1 * n;
810 end;
811 cnt := wordCount.Count;
812 if cnt = 0 then
813 cnt := 1;
814 P1 := 1 - Power( P1, 1 / cnt );
815 Q1 := 1 - Power( Q1, 1 / cnt );
816
817 if P1 + Q1 = 0 then begin
818 Result := 0.5
819 end else begin
820 n := (P1 - Q1) / (P1 + Q1);
821 Result := (1 + n) / 2;
822 end;
823
824 end;
825
826 //==============================
827 // CalcGaryRobinsonFisher
828 //==============================
829 function TGikoBayesian.CalcGaryRobinsonFisher(
830 wordCount : TWordCount
831 ) : Extended;
832
833 function p( const aWord : string ) : Single;
834 var
835 info : TWordInfo;
836 begin
837 info := Objects[ aWord ];
838 if info = nil then
839 Result := 0.415
840 else if info.ImportantWord = 0 then
841 Result := 0.01
842 else if info.NormalWord = 0 then
843 Result := 0.99
844 else
845 {
846 Result := ( info.ImportantWord / info.ImportantText ) /
847 ((info.NormalWord / info.NormalText ) +
848 (info.ImportantWord / info.ImportantText));
849 }
850 Result := (info.ImportantWord * info.NormalText) /
851 (info.NormalWord * info.ImportantText +
852 info.ImportantWord * info.NormalText);
853 end;
854
855 function f( cnt : Integer; n, mean : Single ) : Extended;
856 const
857 k = 0.001;
858 begin
859 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
860 end;
861
862 function prbx( x2, degree : Extended ) : Extended;
863 begin
864
865 Result := 0.5;
866
867 end;
868
869 var
870 n : Extended;
871 narray : array of Single;
872 mean : Extended;
873 countInfo : TWordCountInfo;
874 i : Integer;
875 normal : Extended;
876 important : Extended;
877 P1, Q1 : Extended;
878 cnt : Extended;
879 begin
880
881 if wordCount.Count = 0 then begin
882 Result := 1;
883 Exit;
884 end;
885
886 SetLength( narray, wordCount.Count );
887 mean := 0;
888 for i := 0 to wordCount.Count - 1 do begin
889 n := p( wordCount[ i ] );
890 narray[ i ] := n;
891 mean := mean + n;
892 end;
893 mean := mean / wordCount.Count;
894
895 P1 := 1;
896 Q1 := 1;
897 for i := 0 to wordCount.Count - 1 do begin
898 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
899 n := f( countInfo.WordCount, narray[ i ], mean );
900 P1 := P1 * ( 1 - n );
901 Q1 := Q1 * n;
902 end;
903 cnt := wordCount.Count;
904 if cnt = 0 then
905 cnt := 1;
906 P1 := Power( P1, 1 / cnt );
907 Q1 := Power( Q1, 1 / cnt );
908
909 P1 := 1 - prbx( -2 * Ln( P1 ), 2 * cnt );
910 Q1 := 1 - prbx( -2 * Ln( Q1 ), 2 * cnt );
911
912 Result := (1 + P1 - Q1) / 2;
913
914 end;
915
916 //==============================
917 // Parse
918 //==============================
919 function TGikoBayesian.Parse(
920 const text : string;
921 wordCount : TWordCount;
922 algorithm : TGikoBayesianAlgorithm
923 ) : Extended;
924 begin
925
926 CountWord( text, wordCount );
927 case algorithm of
928 gbaPaulGraham: Result := CalcPaulGraham( wordCount );
929 gbaGaryRobinson: Result := CalcGaryRobinson( wordCount );
930 gbaGaryRobinsonFisher:
931 Result := CalcGaryRobinsonFisher( wordCount );
932 else Result := 0;
933 end;
934
935 end;
936
937 //==============================
938 // Learn
939 //==============================
940 procedure TGikoBayesian.Learn(
941 wordCount : TWordCount;
942 isImportant : Boolean );
943 var
944 aWord : string;
945 wordinfo : TWordInfo;
946 countinfo : TWordCountInfo;
947 i : Integer;
948 begin
949
950 for i := 0 to wordCount.Count - 1 do begin
951 aWord := wordCount[ i ];
952 wordinfo := Objects[ aWord ];
953 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
954 if wordinfo = nil then begin
955 wordinfo := TWordInfo.Create;
956 Objects[ aWord ] := wordinfo;
957 end;
958
959 if isImportant then begin
960 wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
961 wordinfo.ImportantText := wordinfo.ImportantText + 1;
962 end else begin
963 wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
964 wordinfo.NormalText := wordinfo.NormalText + 1;
965 end;
966 end;
967
968 end;
969
970 //==============================
971 // Forget
972 //==============================
973 procedure TGikoBayesian.Forget(
974 wordCount : TWordCount;
975 isImportant : Boolean );
976 var
977 aWord : string;
978 wordinfo : TWordInfo;
979 countinfo : TWordCountInfo;
980 i : Integer;
981 begin
982
983 for i := 0 to wordCount.Count - 1 do begin
984 aWord := wordCount[ i ];
985 wordinfo := Objects[ aWord ];
986 if wordinfo = nil then
987 Continue;
988
989 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
990 if isImportant then begin
991 if wordInfo.ImportantText > 0 then begin
992 wordinfo.ImportantText := wordinfo.ImportantText - 1;
993 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
994 end;
995 end else begin
996 if wordinfo.NormalText > 0 then begin
997 wordinfo.NormalText := wordinfo.NormalText - 1;
998 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;
999 end;
1000 end;
1001 end;
1002
1003 end;
1004
1005 end.

Back to OSDN">Back to OSDN
ViewVC Help
Powered by ViewVC 1.1.26