Develop and Download Open Source Software

Browse CVS Repository

Contents of /gikonavigoeson/gikonavi/GikoBayesian.pas

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.21 - (show annotations) (download) (as text)
Mon Jun 26 14:57:15 2006 UTC (17 years, 9 months ago) by h677
Branch: MAIN
CVS Tags: v1_59_0_771, v1_59_0_770, v1_59_0_773, v1_59_0_772, v1_59_0_775, v1_59_0_774, v1_59_0_777, v1_59_0_776, v1_56_0_715, v1_59_0_778, v1_57_0_737, v1_57_0_735, v1_57_0_734, v1_57_0_733, v1_57_0_732, v1_57_0_731, v1_57_0_730, v1_57_0_739, v1_57_0_738, v1_59_1_765, v1_53_0_671, v1_54_0_677, v1_54_0_676, v1_58_0_748, v1_58_0_745, v1_58_0_746, v1_54_0_678, v1_56_0_707, v1_59_0_767, v1_56_0_705, v1_56_0_704, v1_56_0_703, v1_56_0_702, v1_56_0_701, v1_56_0_700, v1_59_1_778, v1_57_0_723, v1_59_0_768, v1_59_0_769, v1_57_0_725, v1_57_0_726, v1_57_0_727, v1_57_0_720, v1_57_0_722, v1_57_1_744, v1_57_0_728, v1_57_0_729, v1_57_0_736, v1_58_0_752, v1_58_0_750, v1_58_0_751, v1_58_0_756, v1_58_0_757, v1_58_0_754, v1_58_0_755, v1_58_0_759, v1_57_0_719, v1_57_0_718, v1_56_0_716, v1_54_0_687, v1_56_0_710, v1_56_0_711, v1_56_1_717, v1_56_1_716, v1_54_0_688, v1_54_0_689, v1_54_0_684, v1_54_0_685, v1_53_0_664, v1_53_0_661, v1_53_0_663, v1_53_0_662, v1_53_0_665, v1_53_0_667, v1_53_0_666, v1_53_0_669, v1_53_0_668, v1_55_0_692, v1_55_0_693, v1_55_0_696, v1_55_0_697, v1_55_0_694, v1_55_0_695, v1_54_1_691, v1_56_2_724, v1_56_2_722, v1_55_1_697, v1_56_0_714, v1_53_0_672, v1_53_0_670, v1_54_0_686, v1_54_0_680, v1_54_0_681, v1_56_0_712, v1_56_0_713, v1_54_0_682, v1_56_0_721, v1_54_0_683, v1_58_0_747, v1_54_0_679, v1_59_2_785, marged_bRESPOPUP, v1_54_0_675, v1_56_0_706, root-of-Bb53, v1_54_0_674, v1_54_0_690, v1_58_0_763, v1_58_0_762, v1_58_0_761, v1_58_0_760, v1_57_2_749, v1_57_0_742, v1_57_0_743, v1_57_0_740, v1_57_0_741, v1_56_0_709, v1_57_0_744, v1_56_0_708, v1_53_1_673, v1_56_0_699, v1_56_0_698
Branch point for: Bb57, Bb56, Bb55, Bb53, Bb59, Bb58, bRESPOPUP, bListSU, Bb54
Changes since 1.20: +5 -5 lines
File MIME type: text/x-pascal
未使用変数の削除

1 unit GikoBayesian;
2
3 {!
4 \file GikoBayesian.pas
5 \brief ???ゃ?吾?≪?潟???c????/span>
6
7
8 $Id: GikoBayesian.pas,v 1.20 2005/08/18 14:17:47 h677 Exp $
9 }
10
11 //! 綛割皿????莨??吾??????????
12 {$DEFINE GIKO_BAYESIAN_NO_HIRAGANA_DIC}
13
14 interface
15
16 //==================================================
17 uses
18 //==================================================
19 Classes;
20
21 //==================================================
22 type
23 //==================================================
24
25 {!***********************************************************
26 \brief ??茯???????????/span>
27 ************************************************************}
28 TWordInfo = class( TObject )
29 private
30 FNormalWord : Integer; //!< ??絽吾????茯????????糸?眼????????/span>
31 FImportantWord : Integer; //!< 羈?????茯????????糸?眼????????/span>
32 FNormalText : Integer; //!< ??絽吾????茯??????????障??????????腴?????/span>
33 FImportantText : Integer; //!< 羈?????茯??????????障??????????腴?????/span>
34
35 public
36 property NormalWord : Integer read FNormalWord write FNormalWord;
37 property ImportantWord : Integer read FImportantWord write FImportantWord;
38 property NormalText : Integer read FNormalText write FNormalText;
39 property ImportantText : Integer read FImportantText write FImportantText;
40 end;
41
42 {!***********************************************************
43 \brief 茹f??羝??水??茯???????????/span>
44 ************************************************************}
45 TWordCountInfo = class( TObject )
46 private
47 FWordCount : Integer; //!< ??茯???/span>
48
49 public
50 property WordCount : Integer read FWordCount write FWordCount;
51 end;
52
53 {!***********************************************************
54 \brief 茹f??羝??水??茯????鴻??
55 ************************************************************}
56 // TWordCount = class( THashedStringList ) // 羶???
57 TWordCount = class( TStringList )
58 public
59 constructor Create;
60 destructor Destroy; override;
61 end;
62
63 {!***********************************************************
64 \brief ???c???帥?≪???眼???冴??
65 ************************************************************}
66 TGikoBayesianAlgorithm =
67 (gbaPaulGraham, gbaGaryRobinson, gbaGaryRobinsonFisher);
68
69 {!***********************************************************
70 \brief ???ゃ?吾?≪?潟???c????/span>
71 ************************************************************}
72 // TGikoBayesian = class( THashedStringList ) // 羶???
73 TGikoBayesian = class( TStringList )
74 private
75 FFilePath : string; //!< 茯??粋昭???????<?ゃ??????/span>
76 function GetObject( const name : string ) : TWordInfo;
77 procedure SetObject( const name : string; value : TWordInfo );
78
79 public
80 constructor Create;
81 destructor Destroy; override;
82
83 //! ???<?ゃ??????絖??絮ユ???茯??水?冴???障??
84 procedure LoadFromFile( const filePath : string );
85
86 //! ???<?ゃ?????膺?絮ユ???篆?絖????障??
87 procedure SaveToFile( const filePath : string );
88
89 //! ???<?ゃ?????膺?絮ユ???篆?絖????障??
90 procedure Save;
91
92 //! ??茯???????????宴????緇????障??
93 property Objects[ const name : string ] : TWordInfo
94 read GetObject write SetObject; default;
95
96 //! ??腴??????障??????茯????????潟?????障??
97 procedure CountWord(
98 const text : string;
99 wordCount : TWordCount );
100
101 {!
102 \brief Paul Graham 羈????冴?ャ??????腴???絵??墾??羆阪????障??
103 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
104 }
105 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
106
107 {!
108 \brief GaryRobinson 羈????冴?ャ??????腴???絵??墾??羆阪????障??
109 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
110 }
111 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
112
113 {!
114 \brief GaryRobinson-Fisher 羈????冴?ャ??????腴???絵??墾??羆阪????障??
115 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
116 }
117 function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
118
119 {!
120 \brief ??腴???茹f??
121 \param text 茹f????????腴?
122 \param wordCount 茹f??????????茯????鴻????菴???
123 \param algorithm 羈???墾??浦絎??????????≪???眼???冴??????絎????障??
124 \return ??腴???絵??墾 (羈??????ゃ?????? 0.0??1.0 羈??????鴻??)
125
126 CountWord ? Calcxxxxx ???障???????茵??????????с????
127 }
128 function Parse(
129 const text : string;
130 wordCount : TWordCount;
131 algorithm : TGikoBayesianAlgorithm = gbaGaryRobinsonFisher
132 ) : Extended;
133
134 {!
135 \brief 絖??????
136 \param wordCount Parse ?цВ??????????茯????鴻??
137 \param isImportant 羈??????鴻????腴???????????????? True
138 }
139 procedure Learn(
140 wordCount : TWordCount;
141 isImportant : Boolean );
142
143 {!
144 \brief 絖??腟?????綽?????
145 \param wordCount Parse ?цВ??????????茯????鴻??
146 \param isImportant 羈??????鴻????腴???????????????????????? True
147 \warning 絖??羝??帥????腴???????????∈茯??堺?ャ?障??????<br>
148 Learn ????????????腴??? isImportant ???????c????????腴???
149 Forget ?????????若?帥???若?鴻???贋?????障????<br>
150 絖??羝??帥????????????????????????????????
151
152 ???????膺?腟??????????≪????????с???????障??????<br>
153 wordCount ??緇?????腴? (Parse ? text 綣??? ???膺?腟??????帥?????≪???障????<br><br>
154
155 筝祉??絵????腴?????羈?????腴????????帥????????? Forget -> Learn ?????т戎?????障????
156 }
157 procedure Forget(
158 wordCount : TWordCount;
159 isImportant : Boolean );
160 end;
161
162 //==================================================
163 implementation
164 //==================================================
165
166 uses
167 SysUtils, Math, Windows,
168 MojuUtils;
169
170 const
171 GIKO_BAYESIAN_FILE_VERSION = '1.0';
172 {
173 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
174 ModeWGraph, ModeWAlpha, ModeWNum,
175 ModeWHira, ModeWKata, ModeWKanji);
176 }
177 CharMode1 : array [ 0..255 ] of Byte =
178 (
179 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
181 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
182 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
183 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
184 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
185 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
186 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
187
188 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
189 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
190 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
191 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
192 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
193 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
194 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
195 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
196 );
197
198 //************************************************************
199 // misc
200 //************************************************************
201
202 //==============================
203 // RemoveToken
204 //==============================
205 function RemoveToken(var s: string;const delimiter: string): string;
206 var
207 p: Integer;
208 begin
209 p := AnsiPos(delimiter, s);
210 if p = 0 then
211 Result := s
212 else
213 Result := Copy(s, 1, p - 1);
214 s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
215 end;
216
217 //==============================
218 // AbsSort
219 //==============================
220 function AbsSort( p1, p2 : Pointer ) : Integer;
221 var
222 v1, v2 : Single;
223 begin
224
225 v1 := Abs( Single( p1 ) - 0.5 );
226 v2 := Abs( Single( p2 ) - 0.5 );
227 if v1 > v2 then
228 Result := -1
229 else if v1 = v2 then
230 Result := 0
231 else
232 Result := 1;
233
234 end;
235
236 //************************************************************
237 // TWordCount class
238 //************************************************************
239 constructor TWordCount.Create;
240 begin
241
242 Duplicates := dupIgnore;
243 CaseSensitive := True;
244 Sorted := True;
245
246 end;
247
248 destructor TWordCount.Destroy;
249 var
250 i : Integer;
251 begin
252
253 for i := Count - 1 downto 0 do
254 if Objects[ i ] <> nil then
255 Objects[ i ].Free;
256
257 inherited;
258
259 end;
260
261 //************************************************************
262 // TGikoBayesian class
263 //************************************************************
264
265 //==============================
266 // Create
267 //==============================
268 constructor TGikoBayesian.Create;
269 begin
270
271 Duplicates := dupIgnore;
272 CaseSensitive := True;
273 Sorted := True;
274
275 end;
276
277 //==============================
278 // Destroy
279 //==============================
280 destructor TGikoBayesian.Destroy;
281 var
282 i : Integer;
283 begin
284
285 for i := Count - 1 downto 0 do
286 if inherited Objects[ i ] <> nil then
287 inherited Objects[ i ].Free;
288
289 inherited;
290
291 end;
292
293 procedure TGikoBayesian.LoadFromFile( const filePath : string );
294 var
295 i : Integer;
296 sl : TStringList;
297 s : string;
298 name : string;
299 info : TWordInfo;
300 begin
301
302 FFilePath := filePath;
303
304 if not FileExists( filePath ) then
305 Exit;
306
307 sl := TStringList.Create;
308 try
309 sl.LoadFromFile( filePath );
310
311 for i := 1 to sl.Count - 1 do begin
312 s := sl[ i ];
313 name := RemoveToken( s, #1 );
314 info := TWordInfo.Create;
315 info.NormalWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
316 info.ImportantWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
317 info.NormalText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
318 info.ImportantText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
319
320 AddObject( name, info );
321 end;
322 finally
323 sl.Free;
324 end;
325
326 end;
327
328 procedure TGikoBayesian.SaveToFile( const filePath : string );
329 var
330 i : Integer;
331 sl : TStringList;
332 s : string;
333 info : TWordInfo;
334 begin
335
336 FFilePath := filePath;
337
338 sl := TStringList.Create;
339 try
340 sl.BeginUpdate;
341 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
342
343 for i := 0 to Count - 1 do begin
344 info := TWordInfo( inherited Objects[ i ] );
345 s := Strings[ i ] + #1
346 + Format('%x', [info.NormalWord]) + #1
347 + Format('%x', [info.ImportantWord]) + #1
348 + Format('%x', [info.NormalText]) + #1
349 + Format('%x', [info.ImportantText]);
350
351 sl.Add(s);
352 end;
353 sl.EndUpdate;
354 sl.SaveToFile( filePath );
355 finally
356 sl.Free;
357 end;
358
359 end;
360
361 procedure TGikoBayesian.Save;
362 begin
363
364 if FFilePath <> '' then
365 SaveToFile( FFilePath );
366
367 end;
368
369 //==============================
370 // GetObject
371 //==============================
372 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
373 var
374 idx : Integer;
375 begin
376
377 if Find( name, idx ) then
378 Result := TWordInfo( inherited Objects[ idx ] )
379 else
380 Result := nil;
381
382 end;
383
384 //==============================
385 // SetObject
386 //==============================
387 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
388 var
389 idx : Integer;
390 begin
391
392 if Find( name, idx ) then
393 inherited Objects[ idx ] := value
394 else
395 AddObject( name, value );
396
397 end;
398
399
400 //==============================
401 // CountWord
402 //==============================
403 procedure TGikoBayesian.CountWord(
404 const text : string;
405 wordCount : TWordCount );
406 type
407 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeNum, ModeHanKana,
408 ModeWGraph, ModeWAlpha, ModeWNum,
409 ModeWHira, ModeWKata, ModeWKanji);
410 var
411 p, tail, last : PChar;
412 mode, newMode : Modes;
413 ch : Longword;
414 chSize : Integer;
415 wHiraDelimiter : TStringList;
416 wHiraFinalDelimiter : TStringList;
417 wKanjiDelimiter : TStringList;
418 words : TStringList;
419 aWord : string;
420 // countInfo : TWordCountInfo;
421
422 function cutBoth( _aWord : string; _delim : TStringList ) : string;
423 var
424 _i : Integer;
425 begin
426 for _i := 0 to _delim.Count - 1 do begin
427 _aWord := CustomStringReplace(
428 _aWord,
429 _delim[ _i ],
430 #10 + _delim[ _i ] + #10, False );
431 end;
432 Result := _aWord;
433 end;
434
435 function cutFirst( _aWord : string; _delim : TStringList ) : string;
436 var
437 _i : Integer;
438 begin
439 for _i := 0 to _delim.Count - 1 do begin
440 _aWord := CustomStringReplace(
441 _aWord,
442 _delim[ _i ],
443 #10 + _delim[ _i ], False );
444 end;
445 Result := _aWord;
446 end;
447
448 function cutFinal( _aWord : string; _delim : TStringList ) : string;
449 var
450 _i : Integer;
451 begin
452 for _i := 0 to _delim.Count - 1 do begin
453 _aWord := CustomStringReplace(
454 _aWord,
455 _delim[ _i ],
456 _delim[ _i ] + #10, False );
457 end;
458 Result := _aWord;
459 end;
460
461 procedure addWord( _dst : TWordCount; _words : TStringList );
462 var
463 _aWord : string;
464 _i, _idx : Integer;
465 _countInfo : TWordCountInfo;
466 begin
467 for _i := 0 to _words.Count - 1 do begin
468 _aWord := _words[ _i ];
469 if Length( _aWord ) > 0 then begin
470 if _dst.Find( _aWord, _idx ) then begin
471 _countInfo := TWordCountInfo( _dst.Objects[ _idx ] );
472 end else begin
473 _countInfo := TWordCountInfo.Create;
474 _dst.AddObject( _aWord, _countInfo );
475 end;
476 _countInfo.WordCount := _countInfo.WordCount + 1;
477 end;
478 end;
479 end;
480
481 function changeMode( _aWord : string; _mode : Modes ) : string;
482 var
483 _i : Integer;
484 _aWord2 : string;
485 _pWord, _pWord2 : PChar;
486 _pWordTail, _pFound : PChar;
487 const
488 _delim : string = #10;
489 begin
490 {$IFDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
491 if mode = ModeWHira then begin
492 Result := '';
493 Exit;
494 end;
495 {$ENDIF}
496 if Ord( _mode ) >= Ord( ModeWGraph ) then begin
497 // ?ユ???
498 // ?鴻???若?鴻??荅違????
499 _aWord := CustomStringReplace( _aWord, ' ', '', False );
500 _aWord := CustomStringReplace( _aWord, '??', '', False );
501
502 // ???????帥?у??茯?????
503 case mode of
504 ModeWHira:
505 begin
506 _aWord := cutFinal( _aWord, wHiraFinalDelimiter );
507 Result := cutBoth( _aWord, wHiraDelimiter );
508 end;
509
510 ModeWKanji:
511 begin
512 // ???????帥?у??茯?????
513 _aWord := cutBoth( _aWord, wKanjiDelimiter );
514 // 4 byte (2 絖?) ???ゃ?у??茯?????
515 _pWord := PChar( _aWord );
516 _i := Length( _aWord );
517 _pWordTail := _pWord + _i;
518 SetLength( _aWord2, _i + (_i shr 2) );
519 _pWord2 := PChar( _aWord2 );
520
521 while _pWord < _pWordTail do begin
522 _pFound := AnsiStrPos( _pWord, PChar( _delim ) );
523 if _pFound = nil then
524 _pFound := _pWordTail;
525 _pFound := _pFound - 3;
526
527 while _pWord <= _pFound do begin
528 CopyMemory( _pWord2, _pWord, 4 ); _pWord2[ 4 ] := #10;
529 _pWord2 := _pWord2 + 5; _pWord := _pWord + 4;
530 end;
531 _i := _pFound + 4 - _pWord; // 4 = 3 + #10
532 CopyMemory( _pWord2, _pWord, _i );
533 _pWord2 := _pWord2 + _i; _pWord := _pWord + _i;
534 end;
535 if _pWord < _pWordTail then begin
536 _i := _pWordTail - _pWord;
537 CopyMemory( _pWord2, _pWord, _i );
538 _pWord2 := _pWord2 + _i;
539 end;
540 SetLength( _aWord2, _pWord2 - PChar( _aWord2 ) );
541
542 Result := _aWord2;
543 end;
544
545 else
546 Result := _aWord;
547 end;
548 end else begin
549 Result := _aWord;
550 end;
551 end;
552 const
553 WHIRA_DELIMITER = '??' + #10 + '??#39; + #10 + '??' + #10 + '??#39; + #10 + '????'
554 + #10 + '??#39; + #10 + '????' + #10 + '?障??#39;+ #10 + '??#39;
555 + #10 + '????' + #10 + '????' + #10 + '????'
556 + #10 + '????' + #10 + '????' + #10 + '????' + #10 + '????'
557 + #10 + '????#39; + #10 + '????#39; + #10 + '????#39; + #10 + '????#39;
558 + #10 + '????' + #10 + '????' + #10 + '????' + #10 + '????'
559 + #10 + '??????#39; + #10 + '??????#39; + #10 + '??????#39; + #10 + '??????#39;
560 + #10 + '????' + #10 + '????#39; + #10 + '????' + #10 + '????'
561 + #10 + '????' + #10 + '??????'
562 + #10 + '?с??' + #10 + '?障??' + #10 + '?障????'
563 + #10 + '?с????' + #10 + '?障????'
564 + #10 + '????' + #10 + '??????' + #10 + '??????' + #10 + '????????'
565 ;
566 WKANJI_DELIMITER = '??' + #10 + '??#39; + #10 + '綣?39; + #10 + '??' + #10 + '羈?'
567 + #10 + '筝?' + #10 + '??#39; + #10 + '??' + #10 + '??'
568 ;
569 WHIRA_FINAL_DELIMITER = '?c??' + #10 + '?c??#39;
570 ;{
571 + #10 + '???c??#39; + #10 + '???????c??#39; + #10 + '??????#39;
572 + #10 + '??????' + #10 + '?с??????'
573 + #10 + '?障??'
574 + #10 + '??????' + #10 + '????' + #10 + '????#39; + #10 + '??????#39;
575 + #10 + '??????' + #10 + '???c?宴??'
576 + #10 + '?с??' + #10 + '????'
577 + #10 + '????' + #10 + '??????' + #10 + '????' + #10 + '??????'
578 ;}
579 // '??#39; ?? '??????????' ????
580 HA_LINE = '?????????????障????????????違?宴????#39;;
581 HI_LINE = '???????<???蚊?帥?????????潟?眼??';
582 HU_LINE = '??????ゃ???泣?????????吟?激??';
583 HE_LINE = '???????????吾?????????鴻?冴??';
584 HO_LINE = '???????????祉???????????若?純??';
585 KA_LINE = '?≪???泣?帥??????ゃ???????吟???????<?泣??#39;;
586 KI_LINE = '?ゃ???激???????????違???吾??????#39;;
587 KU_LINE = '?????鴻?????????????違?????ャ??#39;;
588 KE_LINE = '???宴?祉???????<???宴?蚊?????с??#39;;
589 KO_LINE = '???潟?純???????≪?????蚊?眼??????#39;;
590 kKanji = [$80..$A0, $E0..$ff];
591 begin
592
593 wHiraDelimiter := TStringList.Create;
594 wHiraFinalDelimiter := TStringList.Create;
595 wKanjiDelimiter := TStringList.Create;
596 words := TStringList.Create;
597 try
598 mode := ModeWhite;
599 {$IFNDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
600 wHiraDelimiter.Text := WHIRA_DELIMITER;
601 wHiraFinalDelimiter.Text := WHIRA_FINAL_DELIMITER;
602 {$ENDIF}
603 wKanjiDelimiter.Text := WKANJI_DELIMITER;
604 p := PChar( text );
605 tail := p + Length( text );
606 last := p;
607
608 while p < tail do begin
609 // ??絖????帥?ゃ?????ゅ??/span>
610 // ?糸???鴻? ModeGraph ?????????у???ャ???綽??????????????
611 // if Byte(Byte( p^ ) - $a1) < $5e then begin
612 if Byte( p^ ) in kKanji then begin
613 if p + 1 < tail then begin
614 ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
615 case ch of
616 // ?鴻???若?鴻?у??茯????????????????
617 //$8140: newMode := ModeWhite;
618 $8141..$824e: newMode := ModeWGraph;
619 $824f..$8258: newMode := ModeWNum;
620 $8260..$829a: newMode := ModeWAlpha;
621 $829f..$82f1: newMode := ModeWHira;
622 $8340..$8396: newMode := ModeWKata;
623 else newMode := ModeWKanji;
624 end;
625 // '??????#39; ??抗篁??????障???????帥?????????障????
626 if (mode = ModeWHira) or (mode = ModeWKata) then
627 if (ch = $814a) or (ch = $814b) or (ch = $815b) then
628 newMode := mode;
629 end else begin
630 newMode := ModeWhite;
631 end;
632
633 chSize := 2;
634 end else begin
635 newMode := Modes( CharMode1[ Byte( p^ ) ] );
636 if (p^ = ' ') and (Ord( mode ) >= Ord( ModeWGraph )) then begin
637 // 篁??障?ф?ユ????т??鴻???若??/span>
638 // ??茯???膵???????с?鴻???若?鴻??荅違????
639 // ?糸??茹?????????絽吾?鴻???若?鴻?у?阪??????????????荅違??????
640 newMode := mode;
641 end;
642
643 chSize := 1;
644 end;
645
646 if mode <> newMode then begin
647
648 // ??絖????帥?ゃ????紊??眼??????
649 if mode <> ModeWhite then begin
650 SetLength( aWord, p - last );
651 CopyMemory( PChar( aWord ), last, p - last );
652
653 words.Text := changeMode( aWord, mode );
654
655 // ??茯??脂??/span>
656 addWord( wordCount, words );
657 end;
658
659 last := p;
660 mode := newMode;
661
662 end;
663
664 p := p + chSize;
665 end; // while
666
667 if mode <> ModeWhite then begin
668 SetLength( aWord, p - last );
669 CopyMemory( PChar( aWord ), last, p - last );
670
671 words.Text := changeMode( aWord, mode );
672
673 // ??茯??脂??/span>
674 addWord( wordCount, words );
675 end;
676 finally
677 words.Free;
678 wKanjiDelimiter.Free;
679 wHiraFinalDelimiter.Free;
680 wHiraDelimiter.Free;
681 end;
682
683 end;
684
685 //==============================
686 // CalcPaulGraham
687 //==============================
688 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
689
690 function p( const aWord : string ) : Single;
691 var
692 info : TWordInfo;
693 begin
694 info := Objects[ aWord ];
695 if info = nil then
696 Result := 0.415
697 else if info.NormalWord = 0 then
698 Result := 0.99
699 else if info.ImportantWord = 0 then
700 Result := 0.01
701 else if info.ImportantWord + info.NormalWord * 2 < 5 then
702 Result := 0.5
703 else begin
704 try
705 Result := ( info.ImportantWord / info.ImportantText ) /
706 ((info.NormalWord * 2 / info.NormalText ) +
707 (info.ImportantWord / info.ImportantText));
708 except
709 on EZeroDivide do Result := 0.99;
710 end;
711 end;
712 end;
713
714 var
715 s, q : Extended;
716 i : Integer;
717 narray : TList;
718 const
719 SAMPLE_COUNT = 15;
720 begin
721
722 Result := 1;
723 if wordCount.Count = 0 then
724 Exit;
725
726 narray := TList.Create;
727 try
728 for i := 0 to wordCount.Count - 1 do begin
729 narray.Add( Pointer( p( wordCount[ i ] ) ) );
730 end;
731
732 narray.Sort( AbsSort );
733
734 s := 1;
735 q := 1;
736 i := min( SAMPLE_COUNT, narray.Count );
737 while i > 0 do begin
738 Dec( i );
739
740 s := s * Single( narray[ i ] );
741 q := q * (1 - Single( narray[ i ] ));
742 end;
743 try
744 Result := s / (s + q);
745 except
746 Result := 0.5;
747 end;
748 finally
749 narray.Free;
750 end;
751
752 end;
753
754 //==============================
755 // CalcGaryRobinson
756 //==============================
757 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
758
759 function p( const aWord : string ) : Single;
760 var
761 info : TWordInfo;
762 begin
763 info := Objects[ aWord ];
764 if info = nil then
765 Result := 0.415
766 else if info.ImportantWord = 0 then
767 Result := 0.01
768 else if info.NormalWord = 0 then
769 Result := 0.99
770 else
771 {
772 Result := ( info.ImportantWord / info.ImportantText ) /
773 ((info.NormalWord / info.NormalText ) +
774 (info.ImportantWord / info.ImportantText));
775 }
776 try
777 Result := (info.ImportantWord * info.NormalText) /
778 (info.NormalWord * info.ImportantText +
779 info.ImportantWord * info.NormalText);
780 except
781 Result := 0.5;
782 end;
783 end;
784
785 function f( cnt : Integer; n, mean : Single ) : Extended;
786 const
787 k = 0.001;
788 begin
789 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
790 end;
791
792 var
793 n : Extended;
794 narray : array of Single;
795 mean : Extended;
796 countInfo : TWordCountInfo;
797 i : Integer;
798 P1, Q1{, R1} : Extended;
799 cnt : Extended;
800 begin
801
802 if wordCount.Count = 0 then begin
803 Result := 1;
804 Exit;
805 end;
806
807 SetLength( narray, wordCount.Count );
808 mean := 0;
809 for i := 0 to wordCount.Count - 1 do begin
810 n := p( wordCount[ i ] );
811 narray[ i ] := n;
812 mean := mean + n;
813 end;
814 mean := mean / wordCount.Count;
815
816 P1 := 1;
817 Q1 := 1;
818 for i := 0 to wordCount.Count - 1 do begin
819 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
820 n := f( countInfo.WordCount, narray[ i ], mean );
821 P1 := P1 * ( 1 - n );
822 Q1 := Q1 * n;
823 end;
824 cnt := wordCount.Count;
825 if cnt = 0 then
826 cnt := 1;
827 try
828 P1 := 1 - Power( P1, 1 / cnt );
829 except
830 end;
831 try
832 Q1 := 1 - Power( Q1, 1 / cnt );
833 except
834 end;
835
836 if P1 + Q1 = 0 then begin
837 Result := 0.5
838 end else begin
839 n := (P1 - Q1) / (P1 + Q1);
840 Result := (1 + n) / 2;
841 end;
842
843 end;
844
845 //==============================
846 // CalcGaryRobinsonFisher
847 //==============================
848 function TGikoBayesian.CalcGaryRobinsonFisher(
849 wordCount : TWordCount
850 ) : Extended;
851
852 function p( const aWord : string ) : Single;
853 var
854 info : TWordInfo;
855 begin
856 info := Objects[ aWord ];
857 if info = nil then
858 Result := 0.415
859 else if info.ImportantWord = 0 then
860 Result := 0.01
861 else if info.NormalWord = 0 then
862 Result := 0.99
863 else
864 {
865 Result := ( info.ImportantWord / info.ImportantText ) /
866 ((info.NormalWord / info.NormalText ) +
867 (info.ImportantWord / info.ImportantText));
868 }
869 Result := (info.ImportantWord * info.NormalText) /
870 (info.NormalWord * info.ImportantText +
871 info.ImportantWord * info.NormalText);
872 end;
873
874 function f( cnt : Integer; n, mean : Single ) : Extended;
875 const
876 k = 0.001;
877 begin
878 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
879 end;
880
881 function prbx( x2, degree : Extended ) : Extended;
882 begin
883
884 Result := 0.5;
885
886 end;
887
888 var
889 n : Extended;
890 narray : array of Single;
891 mean : Extended;
892 countInfo : TWordCountInfo;
893 i : Integer;
894 // normal : Extended;
895 // important : Extended;
896 P1, Q1 : Extended;
897 cnt : Extended;
898 begin
899
900 if wordCount.Count = 0 then begin
901 Result := 1;
902 Exit;
903 end;
904
905 SetLength( narray, wordCount.Count );
906 mean := 0;
907 for i := 0 to wordCount.Count - 1 do begin
908 n := p( wordCount[ i ] );
909 narray[ i ] := n;
910 mean := mean + n;
911 end;
912 mean := mean / wordCount.Count;
913
914 P1 := 1;
915 Q1 := 1;
916 for i := 0 to wordCount.Count - 1 do begin
917 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
918 n := f( countInfo.WordCount, narray[ i ], mean );
919 P1 := P1 * ( 1 - n );
920 Q1 := Q1 * n;
921 end;
922 cnt := wordCount.Count;
923 if cnt = 0 then
924 cnt := 1;
925 try
926 P1 := Power( P1, 1 / cnt );
927 except
928 end;
929 try
930 Q1 := Power( Q1, 1 / cnt );
931 except
932 end;
933
934 P1 := 1 - prbx( -2 * Ln( P1 ), 2 * cnt );
935 Q1 := 1 - prbx( -2 * Ln( Q1 ), 2 * cnt );
936
937 Result := (1 + P1 - Q1) / 2;
938
939 end;
940
941 //==============================
942 // Parse
943 //==============================
944 function TGikoBayesian.Parse(
945 const text : string;
946 wordCount : TWordCount;
947 algorithm : TGikoBayesianAlgorithm
948 ) : Extended;
949 begin
950
951 CountWord( text, wordCount );
952 case algorithm of
953 gbaPaulGraham: Result := CalcPaulGraham( wordCount );
954 gbaGaryRobinson: Result := CalcGaryRobinson( wordCount );
955 gbaGaryRobinsonFisher:
956 Result := CalcGaryRobinsonFisher( wordCount );
957 else Result := 0;
958 end;
959
960 end;
961
962 //==============================
963 // Learn
964 //==============================
965 procedure TGikoBayesian.Learn(
966 wordCount : TWordCount;
967 isImportant : Boolean );
968 var
969 aWord : string;
970 wordinfo : TWordInfo;
971 countinfo : TWordCountInfo;
972 i : Integer;
973 begin
974
975 for i := 0 to wordCount.Count - 1 do begin
976 aWord := wordCount[ i ];
977 wordinfo := Objects[ aWord ];
978 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
979 if wordinfo = nil then begin
980 wordinfo := TWordInfo.Create;
981 Objects[ aWord ] := wordinfo;
982 end;
983
984 if isImportant then begin
985 wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
986 wordinfo.ImportantText := wordinfo.ImportantText + 1;
987 end else begin
988 wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
989 wordinfo.NormalText := wordinfo.NormalText + 1;
990 end;
991 end;
992
993 end;
994
995 //==============================
996 // Forget
997 //==============================
998 procedure TGikoBayesian.Forget(
999 wordCount : TWordCount;
1000 isImportant : Boolean );
1001 var
1002 aWord : string;
1003 wordinfo : TWordInfo;
1004 countinfo : TWordCountInfo;
1005 i : Integer;
1006 begin
1007
1008 for i := 0 to wordCount.Count - 1 do begin
1009 aWord := wordCount[ i ];
1010 wordinfo := Objects[ aWord ];
1011 if wordinfo = nil then
1012 Continue;
1013
1014 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
1015 if isImportant then begin
1016 if wordInfo.ImportantText > 0 then begin
1017 wordinfo.ImportantText := wordinfo.ImportantText - 1;
1018 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
1019 end;
1020 end else begin
1021 if wordinfo.NormalText > 0 then begin
1022 wordinfo.NormalText := wordinfo.NormalText - 1;
1023 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;
1024 end;
1025 end;
1026 end;
1027
1028 end;
1029
1030 end.

Back to OSDN">Back to OSDN
ViewVC Help
Powered by ViewVC 1.1.26