Develop and Download Open Source Software

Browse CVS Repository

Contents of /bsfilter/bsfilter/bsfilter

Parent Directory | Revision Log | View Revision Graph


Revision 1.78.4.1 - (show annotations) (download)
Sun Apr 16 05:56:30 2006 UTC (18 years, 1 month ago) by nabeken
Branch: r178_imap
Changes since 1.78: +10 -5 lines
verbose message added for imap-fetch-unflagged

1 #! /usr/bin/env ruby
2 ## -*-Ruby-*- $Id: bsfilter,v 1.78 2006/04/02 04:40:09 nabeken Exp $
3
4 ## Copyright (C) 2003, 2004, 2005, 2006 NABEYA Kenichi
5 ##
6 ## This program is free software; you can redistribute it and/or modify
7 ## it under the terms of the GNU General Public License as published by
8 ## the Free Software Foundation; either version 2 of the License, or
9 ## (at your option) any later version.
10 ##
11 ## This program is distributed in the hope that it will be useful,
12 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ## GNU General Public License for more details.
15 ##
16 ## You should have received a copy of the GNU General Public License
17 ## along with this program; if not, write to the Free Software
18 ## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
20 require 'getoptlong'
21 require 'nkf'
22
23 class Bsfilter
## Create a filter with empty bookkeeping. Nothing is configured here:
## @options starts as an empty hash and @jtokenizer / @token_dbs stay nil
## until something else assigns them.
def initialize
  @jtokenizer = nil
  @token_dbs = nil          # exposed through attr_accessor :token_dbs
  @threads = []
  @options = {}
  @db_hash = {}
end
31 attr_accessor :token_dbs
32
## Release/revision strings derived from CVS keywords. With an unexpanded
## "$Name: $" keyword the release collapses to "" and is shown as "-".
Release = "$Name: $".split[1].sub(/\A[^\d]*/, '').gsub(/_/, '.')
Release.concat("-") if (Release == "")
Revision = "$Revision: 1.78 $".gsub(/[^\.\d]/, '')
Languages = ["C", "ja"]

##  Options = Hash::new           # used like a global variable
##  DB = Hash::new

## Defaults for header decoration.
Default_header_prefix = "Spam"
Default_spam_subject_prefix = "[SPAM] "
Default_refer_header =
  ["Ufrom", "From", "To", "Cc", "Subject", "Reply-to", "Return-path", "Received",
   "Content-Transfer-Encoding", "Content-Type", "charset", "Content-Disposition"].join(",")

## Defaults for tokenizing and file locations.
Default_jtokenizer = "bigram"
Default_mark_in_token = "|!*'"
Default_homedir = ".bsfilter"
Default_conf_file = "bsfilter.conf"
Default_pid_file = "bsfilter.pid"

Default_method = "rf"             # Robinson Fisher
Default_db = "sdbm"
Default_max_mail = 10000
Default_min_mail = 8000
Default_max_line = 500

## Defaults for the POP proxy.
Default_pop_proxy_if = "0.0.0.0"
Default_pop_port = "110"
Default_pop_proxy_port = "10110"
Default_pop_max_size = 50000

## Defaults for IMAP.
Default_imap_port = "143"
Default_imap_auth = "auto"
Default_imap_auth_preference = ["cram-md5", "login", "loginc"]

Default_icon_number = 32512

## File name extensions.
Clean_ext = ".clean"
Spam_ext = ".spam"
Prob_ext = ".prob"
Lock_ext = ".lock"

SDBM_ext = ".sdbm"
GDBM_ext = ".gdbm"
BDB1_ext = ".bdb1"
BDB_ext = ".bdb"
QDBM_ext = ".qdbm"

## Status codes.
EXIT_NORMAL = 0
CODE_NORMAL = true
CODE_SPAM = true
CODE_CLEAN = false

## Character set names and a pattern for two consecutive 3-byte UTF-8
## sequences.
CODESET_EUCJP = "eucJP"
CODESET_LATIN = "ISO8859-1"
CODESET_GB18030 = "GB18030"
CODESET_UTF8 = "UTF-8"
PATTERN_UTF8 = '[\xe0-\xef][\x80-\xbf][\x80-\xbf][\xe0-\xef][\x80-\xbf][\x80-\xbf]'
## The original passed 'n' as the second argument of Regexp.new, which is the
## options slot (any non-integer true value means ignore-case), not the kcode.
## A /n literal requests byte-wise matching directly.
RE_UTF8 = /#{PATTERN_UTF8}/n

## HTML element names recognized at the start of a tag. Fixed typos in the
## list: "span" (was "spam"), "sup" (was a second "sub") and "noframes"
## (was "nofrmaes").
ALL_TAGS = ["html", "head", "title", "meta", "body", "div", "span",
            "h1", "h2", "h3", "h4", "h5", "h6",
            "em", "strong", "font", "basefont", "big", "small",
            "b", "i", "s", "u", "tt", "sub", "sup",
            "rb", "rp", "rt", "ruby",
            "blink", "marquee",
            "dfn", "cite", "abbr", "acronym",
            "blockquote", "q",
            "br", "pre", "ins", "del", "center", "style", "hr",
            "ul", "ol", "li", "dl", "dt", "dd",
            "table", "caption", "thead", "tbody", "tfoot",
            "colgroup", "col", "tr", "td", "th",
            "a", "link", "base", "img", "address",
            "form", "input", "select", "option", "textarea", "label",
            "fieldset", "legend", "optgroup",
            "frameset", "frame", "noframes", "iframe"].join('|')

SPACE_TAGS = "br|p|td|tr|table|ul|ol|dl|li|dt|dd"

## Case-insensitive, byte-wise matchers for "<tagname" at the start of a
## string. Literals with the /n flag replace the three-argument
## Regexp::compile form, whose kcode argument newer Ruby versions reject.
RE_ALL_TAGS = /\A<(#{ALL_TAGS})\b/in
RE_SPACE_TAGS = /\A<(#{SPACE_TAGS})\b/in

SOCKET_TIMEOUT = 30               # for single socket operation
116
## Helpers that decorate a message, held as an array of lines, with the
## filter's result headers. The including object must provide @options.
module Bsutil
  ## Set +header+ (a field name including the trailing ":") to +content+.
  ## An existing field with the same name is overwritten in place; otherwise
  ## the field is inserted just before the first empty line. Nothing is
  ## inserted when +buf+ has neither. The line ending is copied from buf[0].
  ##
  ## Field names are compared without regard to case, as append_header!
  ## already does, so "x-spam-flag:" written by another tool is replaced
  ## rather than duplicated.
  def insert_header!(buf, header, content)
    buf[0] =~ /([\r\n]*)\z/
    eol = $1
    wanted = header.downcase

    buf.each_index do |i|
      line = buf[i]
      if (line =~ /\A(.*?:)/)
        if ($1.downcase == wanted)
          buf[i] = "#{header} #{content}#{eol}"
          break
        end
      elsif (line =~ /\A[\r\n]*\z/)     # end of the header section
        buf[i, 0] = "#{header} #{content}#{eol}"
        break
      end
    end
  end

  ## Put +prefix+ in front of the current value of +header+. When the field
  ## is missing, a new one holding only the prefix is inserted before the
  ## first empty line.
  def append_header!(buf, header, prefix)
    buf[0] =~ /([\r\n]*)\z/
    eol = $1
    wanted = header.downcase

    buf.each_index do |i|
      line = buf[i]
      if (line =~ /\A(.*?:)(\s*)(.*?)([\r\n]*)\z/)
        name = $1
        org_content = $3
        if (name.downcase == wanted)
          buf[i] = "#{header} #{prefix}#{org_content}#{eol}"
          break
        end
      elsif (line =~ /\A[\r\n]*\z/)
        buf[i, 0] = "#{header} #{prefix}#{eol}"
        break
      end
    end
  end

  ## Field names built from the "header-prefix" option.
  def x_spam_flag
    return sprintf("X-%s-Flag:", @options["header-prefix"])
  end

  def x_spam_probability
    return sprintf("X-%s-Probability:", @options["header-prefix"])
  end

  def x_spam_revision
    return sprintf("X-%s-Revision:", @options["header-prefix"])
  end

  ## Apply every decoration enabled in @options to +buf+.
  ## Returns true when at least one decoration option is enabled. Note that
  ## "mark-spam-subject" counts as enabled even for a clean message whose
  ## Subject is left untouched.
  def insert_headers!(buf, spam_flag, probability=nil)
    updated = false
    if (@options["insert-revision"])
      insert_header!(buf, x_spam_revision, "bsfilter release #{Release} revision #{Revision}")
      updated = true
    end
    if (@options["insert-flag"])
      insert_header!(buf, x_spam_flag, spam_flag ? "Yes" : "No")
      updated = true
    end
    if (@options["insert-probability"] && probability)
      insert_header!(buf, x_spam_probability, sprintf("%f", probability))
      updated = true
    end
    if (@options["mark-spam-subject"])
      append_header!(buf, "Subject:", @options["spam-subject-prefix"]) if (spam_flag)
      updated = true
    end
    return updated
  end
end # end of module
193
194 include Bsutil
195
## Output sink that accepts the calls used on a message file handle
## (sync=, print, printf) and discards everything.
class DevNull
  def sync=(*args); end

  def print(*args); end

  def printf(*args); end
end
204
## Hash of counters that may nest (category => DBHash of token => count).
class DBHash < Hash
  ## Walk the leaves, yielding (key, value) where the key of a nested entry
  ## is the path joined with +magic+, e.g. "body###token".
  def flatten(magic="###", head="", &block)
    self.each do |k, v|
      key = (head == "") ? k : head + magic + k
      if (v.class == DBHash)
        v.flatten(magic, key, &block)
      else
        yield key, v
      end
    end
  end

  ## Add every counter of +hash+ to this one.
  ##
  ## A nested hash for a key that is not present yet is copied instead of
  ## stored by reference. Previously the receiver and +hash+ shared the
  ## nested object, so a later add/sub on the receiver silently changed the
  ## hash it had been filled from.
  def add(hash)
    hash.each do |k, v|
      mine = self[k]
      if (mine)
        if (mine.is_a?(DBHash) && v.is_a?(Hash))
          mine.add(v)
        else
          self[k] = mine + v
        end
      elsif (v.is_a?(Hash))
        copy = DBHash::new(v.default)   # keep the source's default value
        copy.add(v)
        self[k] = copy
      else
        self[k] = v
      end
    end
  end

  ## Subtract every counter of +hash+. A counter that would drop to zero or
  ## below is deleted, and so is a nested hash that becomes empty. Keys
  ## missing from the receiver are ignored. The nested argument may be a
  ## plain Hash as well as a DBHash.
  def sub(hash)
    hash.each do |k, v|
      mine = self[k]
      next unless (mine)
      if (mine.is_a?(DBHash) && v.is_a?(Hash))
        mine.sub(v)
        self.delete(k) if (mine.empty?)
      elsif (mine > v)
        self[k] = mine - v
      else
        self.delete(k)
      end
    end
  end
end
258
## Try to load an optional library.
## Returns true when the require succeeded and false on LoadError, so the
## caller can fall back instead of aborting.
def safe_require(file)
  require file
  true
rescue LoadError
  false
end
267
## Map ISO-8859-1 / CP1252 bytes to plain ASCII: the quote bytes 0x92-0x94
## become "'", and the accented A/E/I/O/U ranges lose their accents. All
## other bytes pass through. Returns a new string.
##
## The translation tables are built with Array#pack and the work is done on
## the raw bytes. String literals holding bare high bytes are rejected by
## String#tr on Ruby versions with per-string encodings; this form behaves
## the same on old and new interpreters. The caller's encoding tag, where
## the interpreter has one, is put back on the result.
def latin2ascii(str)
  span = lambda { |lo, hi| [lo].pack("C") + "-" + [hi].pack("C") }
  quotes = [0x92, 0x93, 0x94].pack("C*")
  upper = span.call(0xc0, 0xc5) + span.call(0xc8, 0xcb) + span.call(0xcc, 0xcf) +
          span.call(0xd2, 0xd6) + span.call(0xd9, 0xdc)
  lower = span.call(0xe0, 0xe5) + span.call(0xe8, 0xeb) + span.call(0xec, 0xef) +
          span.call(0xf2, 0xf6) + span.call(0xf9, 0xfc)

  bytes = str.dup
  bytes.force_encoding("ASCII-8BIT") if (bytes.respond_to?(:force_encoding))

  newstr = bytes.tr(quotes, "'''")
  newstr.tr!(upper, "AAAAAAEEEEIIIIOOOOOUUUU")
  newstr.tr!(lower, "aaaaaaeeeeiiiiooooouuuu")

  newstr.force_encoding(str.encoding) if (newstr.respond_to?(:force_encoding))
  return newstr
end
274
## Add conversion helpers to Iconv as singleton methods. Called once the
## iconv library is known to be loaded.
def define_safe_iconv
  ## Convert each of +strs+ word by word and return one converted string
  ## per input. A word that Iconv rejects is replaced by a single space
  ## instead of aborting the whole conversion.
  ##
  ## The original looped over all of +strs+ again inside the per-string map,
  ## so with several inputs every result was the concatenation of all of
  ## them. Each input is now converted on its own.
  def Iconv.safe_iconv(tocode, fromcode, *strs)
    converted = strs.map do |str|
      words = str.split(/(\s+)/).map do |word|    # separators are kept
        begin
          Iconv.iconv(tocode, fromcode, word)[0]
        rescue
          ' '
        end
      end
      words.join
    end
    return converted
  end

  ## UTF-8 to EUC-JP, then normalized by NKF.
  def Iconv.u2eucjp(str)
    return NKF::nkf('-e -E -X -Z0', (Iconv.safe_iconv(CODESET_EUCJP, CODESET_UTF8, str))[0])
  end

  ## UTF-8 to ISO-8859-1.
  def Iconv.u2latin(str)
    return (Iconv.safe_iconv(CODESET_LATIN, CODESET_UTF8, str))[0]
  end

  ## GB18030 to EUC-JP.
  def Iconv.gb180302eucjp(str)
    return (Iconv.safe_iconv(CODESET_EUCJP, CODESET_GB18030, str))[0]
  end
end
301
## Yield a readable line source for +file+:
##   "-"       STDIN
##   an Array  the array itself, given gets / readlines / eof? so it can be
##             read like a file handle
##   a path    the file opened in binary mode; raises unless it is a regular
##             file
## The file handle is closed even when the block raises; previously an
## exception in the block leaked it.
def open_ro(file)
  if (file == "-")
    yield STDIN
  elsif (file.class == Array)
    file.instance_eval <<-EOM
      @eof = false
      def gets
        @n = 0 if (! @n)
        if (@n >= self.length)
          nil
        else
          @n = @n + 1
          self[@n - 1]
        end
      end
      def readlines
        @eof = true
        self
      end
      def eof?
        (@eof || empty?)
      end
    EOM
    yield file
  else
    if (! FileTest::file?(file))
      raise sprintf("%s is not file", file)
    end
    fh = open(file, "rb")
    begin
      yield fh
    ensure
      fh.close
    end
    nil                         # same return value as before for this branch
  end
end
336
## Open +file+ for binary writing; "-" means STDOUT.
## With a block the handle is yielded and then closed (STDOUT is left open),
## also when the block raises -- previously an exception leaked the handle.
## Without a block the open handle is returned and the caller closes it.
def open_wo(file, &block)
  fh = (file == "-") ? STDOUT : open(file, "wb")
  return fh unless (block)

  begin
    yield fh
  ensure
    fh.close if (file != "-")
  end
  nil
end
352
## Number kept as sign (@mant: -1, 0 or 1 after set_f) times e**@exp, so
## that long products are carried as sums of logarithms.
class FLOAT
  attr_accessor :mant, :exp

  ## Represents f ** power (with the sign of f); zero by default.
  def initialize(f=0, power=1)
    @mant = 0
    @exp = 0
    set_f(f, power)
  end

  ## Ordinary Float value.
  def to_f
    @mant * Math::exp(@exp)
  end

  ## Natural logarithm of the value.
  def ln
    Math::log(@mant) + @exp
  end

  ## Product with another FLOAT (exponents add) or with a plain number
  ## (scales the mantissa). Returns a new FLOAT.
  def *(a)
    product = FLOAT::new
    if (a.class == FLOAT)
      product.mant = @mant * a.mant
      product.exp = @exp + a.exp
    else
      product.mant = @mant * a
      product.exp = @exp
    end
    product
  end

  ## Load a ** power (keeping the sign of a); returns self.
  def set_f(a, power=1)
    if ((a > 0) || (a < 0))
      @mant = (a > 0) ? 1 : -1
      @exp = Math::log((a > 0) ? a : -a) * power
    else
      @mant = 0
      @exp = 0
    end
    self
  end
end
395
396
## Operations shared by the in-memory and the DBM-backed token stores.
## The including class supplies each_ct, value, set and sub_scalar, and the
## instance variables @file_count, @options, @filename, @language, @dirty.
module TokenAccess
  ## Shrink the store once it holds more than +max_size+ mails: every count
  ## outside ".internal" is reduced in proportion and the mail count is set
  ## to +min_size+. Returns true when a reduction happened, false when the
  ## store is small enough or either limit is <= 0.
  def check_size(max_size, min_size)
    if ((@file_count <= max_size) || (max_size <= 0) || (min_size <= 0))
      return false
    end
    old_count = @file_count
    if (@options["verbose"])
      @options["message-fh"].printf("reduce token database %s from %d to %d\n", @filename, old_count, min_size)
    end

    key_cts.each do |(category, token)|
      next if (category == ".internal")
      v = value(category, token) || 0
      sub_scalar(category, token, (v * (old_count - min_size).to_f / old_count.to_f).ceil)
      if (@options["debug"] && ! value(category, token))
        @options["message-fh"].printf("deleted %s %s\n", category, token)
      end
    end
    @file_count = min_size      # sub_scalar changed it on every call
    @dirty = true
    return true
  end

  ## Look up a count; with the "degeneration" option, fall back step by step
  ## to the token without its last character, without marks, downcased,
  ## upcased and capitalized. Returns nil when nothing matches.
  def value_with_degene(category, token)
    exact = value(category, token)
    return exact if (exact)
    return nil if (! @options["degeneration"])

    if (v = value(category, token[0 .. -2]))      # cut last char
      return v
    end
    token = token.gsub(Regexp::compile("[#{@options['mark-in-token']}]"), '')
    if (v = value(category, token))
      return v
    end
    token = token.downcase
    if (v = value(category, token))
      return v
    end
    token = token.upcase
    if (v = value(category, token))
      return v
    end
    token = token.capitalize
    if (v = value(category, token))
      return v
    end
    return nil
  end

  ## Overwrite one count. Also bumps the mail counter.
  def set_scalar(category, token, val)
    @dirty = true
    @file_count += 1
    set(category, token, val)
  end

  ## Increase one count by +val+. Also bumps the mail counter.
  def add_scalar(category, token, val)
    @dirty = true
    @file_count += 1
    if (v = value(category, token))
      set(category, token, v + val)
    else
      set(category, token, val)
    end
  end

  ## Report the tokens of +db+ that this store has no (or a zero) count for.
  ## each_ct yields two values; they are taken as two block parameters,
  ## because the old |(category, token)| form leaves token nil on Ruby
  ## versions that destructure only the first yielded value.
  def show_new_token(db)
    db.each_ct do |category, token|
      if (! value(category, token) || (value(category, token) == 0))
        @options["message-fh"].printf("new %s %s\n", category, token)
      end
    end
  end

  ## All stored counts.
  def values
    array = Array::new
    each_ct do |c, t|
      array.push(value(c, t))
    end
    return array
  end

  ## All [category, token] pairs.
  def key_cts
    array = Array::new
    each_ct do |c, t|
      array.push([c, t])
    end
    return array
  end

  ## Write "language category token count" lines to +fh+.
  def export(fh)
    each_ct do |category, token|
      fh.printf("%s %s %s %g\n", @language, category, token, value(category, token)) if (value(category, token))
    end
  end
end
495
## In-memory token store: @hash maps category => DBHash of token => count.
class TokenDB
  include TokenAccess

  def initialize(language=nil)
    @hash = DBHash::new
    @file_count = 0
    @language = language
    @message_id = "-"
    @probability = nil
    @spam_flag = nil
    @dirty = false
    @time = nil
    @filename = "-"
  end
  attr_accessor :hash, :file_count, :probability, :language, :spam_flag, :message_id, :time, :filename

  ## Number of categories.
  def size
    @hash.size
  end

  ## Yield every (category, token) pair.
  def each_ct
    @hash.each do |category, tokens|
      tokens.each_key do |token|
        yield(category, token)
      end
    end
  end

  ## Count stored for the pair, or nil.
  def value(category, token)
    tokens = @hash[category]
    return nil if (! tokens)
    return (tokens[token] || nil)
  end

  def set(category, token, v)
    @dirty = true
    @hash[category] = DBHash::new if (! @hash[category])
    @hash[category][token] = v
  end

  ## For each key in sorted order, print the separator followed by the key
  ## repeated count times (joined by the separator).
  def print_keys_to_str(hash, separator, fh=STDOUT)
    hash.keys.sort.each do |k|
      count = hash[k].to_i
      fh.print separator
      fh.print(([k] * count).join(separator))
    end
  end

  def clear
    @dirty = true
    @file_count = 0
    @hash = DBHash::new
  end

  ## Merge another TokenDB; adopts its language when none is set yet.
  def add_db(db)
    @dirty = true
    @file_count += db.file_count
    if (! @language && db.language)
      @language = db.language
    end
    @hash.add(db.hash)
  end

  def add_hash(hash)
    @dirty = true
    @file_count += 1
    @hash.add(hash)
  end

  ## Lower one count by +val+, deleting it when it would reach zero.
  ## The delta is handed to DBHash#sub as a nested DBHash: with the plain
  ## Hash literal used before, DBHash#sub did not recognize the inner level
  ## as nested and tried to compare/subtract whole hashes. The store is now
  ## also marked dirty, like the other mutators.
  def sub_scalar(category, token, val)
    @dirty = true
    if (@file_count > 0)
      @file_count -= 1
    end
    delta = DBHash::new
    delta[token] = val
    @hash.sub({category => delta})
  end

  def sub_hash(hash)
    @dirty = true
    if (@file_count > 0)
      @file_count -= 1
    end
    @hash.sub(hash)
  end

  ## Remove another TokenDB's counts; the mail count never drops below 1.
  def sub_db(db)
    @dirty = true
    @file_count -= db.file_count
    if (@file_count < 1)
      @file_count = 1
    end
    @hash.sub(db.hash)
  end
end
594
## Token store kept in a DBM-style file. Keys are "category###token", values
## are counts stored as strings. Subclasses set @filename / @lockfile before
## calling super and implement open_dbm(filename, mode).
class TokenDBM
  include TokenAccess
  MAGIC = "###"

  ## +ext+ is not used here; the subclasses use it to build the file names.
  def initialize(options, language, ext)
    @options = options
    @dbm = nil                  # SDBM not Hash
    @dirty = nil                # not used. for TokenAccess
    @lockfh = nil
    @file_count = nil
    @language = language
  end
  attr_accessor :file_count

  def size
    @dbm.size
  end

  ## Copy the whole database into a TokenDB. Values are copied as stored.
  def to_db
    token_db = TokenDB::new(@language)
    @dbm.each do |ct, v|
      (category, token) = ct.split(MAGIC, 2)
      token_db.set(category, token, v)
    end
    token_db.file_count = @file_count
    return token_db
  end

  def clear
    @dbm.clear
    @file_count = 0
    set(".internal", "file_count", 0)
  end

  def each_ct
    @dbm.each_key do |ct|
      (category, token) = ct.split(MAGIC, 2)
      yield(category, token)
    end
  end

  def add_db(token_db)
    add_hash(token_db.hash)
    @file_count += token_db.file_count
  end

  def add_hash(hash)
    @dirty = true
    hash.flatten(MAGIC) do |k, v|
      if (@dbm[k])
        @dbm[k] = (@dbm[k].to_f + v.to_f).to_s
      else
        @dbm[k] = v.to_s
      end
    end
  end

  ## Remove a TokenDB's counts; the mail count stops at 0.
  def sub_db(token_db)
    sub_hash(token_db.hash)
    if (@file_count > token_db.file_count)
      @file_count -= token_db.file_count
    else
      @file_count = 0
    end
  end

  def sub_hash(hash)
    @dirty = true
    hash.flatten(MAGIC) do |k, v|
      if (@dbm[k])
        if (@dbm[k].to_f > v.to_f)
          @dbm[k] = (@dbm[k].to_f - v.to_f).to_s
        else
          @dbm.delete(k)
        end
      end
    end
  end

  ## Stored count as a Float, or nil.
  def value(category, token)
    v = @dbm[category + MAGIC + token]
    return (v ? v.to_f : nil)
  end

  def set(category, token, v)
    @dirty = true
    @dbm[category + MAGIC + token] = v.to_s
  end

  def sub_scalar(category, token, v)
    @dirty = true
    if (@file_count > 0)
      @file_count -= 1
    end
    oldv = value(category, token)
    if (oldv)
      if (oldv > v)
        set(category, token, oldv - v)
      else
        @dbm.delete(category + MAGIC + token)
      end
    end
  end

  ## Lock and open the database. "r" takes a shared lock; "w", "wr" and "rw"
  ## take an exclusive one. Any other mode raises RuntimeError -- now before
  ## the lock file is opened, so a bad mode no longer leaves an open handle
  ## behind, and with a message naming the mode.
  def open(mode="r")
    case mode
    when "r"
      exclusive = false
    when "w", "wr", "rw"
      exclusive = true
    else
      raise sprintf("unknown open mode %s", mode.inspect)
    end

    @lockfh = File::open(@lockfile, "w+")
    if (exclusive)
      @lockfh.flock(File::LOCK_EX)
    else
      begin
        @lockfh.flock(File::LOCK_SH)
      rescue Errno::EINVAL      ## Win9x doesn't support LOCK_SH
        @lockfh.flock(File::LOCK_EX)
      end
    end

    @dbm = open_dbm(@filename, 0600)

    if (v = value(".internal", "file_count"))
      @file_count = v.to_i
    else
      @file_count = 0
      set(".internal", "file_count", @file_count)
    end
    if (@options["verbose"])
      @options["message-fh"].printf("open %s %d tokens %d mails by %d.\n", @filename, @dbm.length, @file_count, Process::pid)
    end
    @dirty = false
  end

  ## Store the mail count if anything changed, close the database and
  ## release the lock. The lock is released even when closing the database
  ## raises.
  def close
    dirty = @dirty
    set(".internal", "file_count", @file_count) if (dirty)
    if (@options["verbose"])
      @options["message-fh"].printf("close %s %d tokens %d mails by %d.\n", @filename, @dbm.length, @file_count, Process::pid)
    end
    if (@options["debug"] && dirty)
      key_cts.sort.each do |(c, t)|
        @options["message-fh"].printf("%s %s %s %f\n", @filename, c, t, value(c, t))
      end
    end

    begin
      @dbm.close
    ensure
      @lockfh.flock(File::LOCK_UN)
      @lockfh.close
      begin
        File::unlink(@lockfile)
      rescue
        # best effort: the lock file may already be gone
      end
    end
    @dirty = false
  end
end
752
## TokenDBM stored with SDBM (a ".dir" and a ".pag" file per database).
class TokenSDBM < TokenDBM
  def initialize(options, language, ext)
    @filename = options["homedir"] + language + ext + SDBM_ext
    @lockfile = options["homedir"] + language + ext + SDBM_ext + Lock_ext
    super
  end

  ## Empty the database by deleting its files and reopening it.
  ## Each file is removed independently: previously a failure to delete the
  ## ".dir" file skipped the ".pag" file, leaving old data behind.
  def clear
    @file_count = 0
    @dbm.close
    [".dir", ".pag"].each do |suffix|
      begin
        File::unlink(@filename + suffix)
      rescue
        # best effort: the file may not exist
      end
    end
    @dbm = open_dbm(@filename, 0600)
    if (@options["verbose"])
      @options["message-fh"].printf("reopen %s by %d.\n", @filename, Process::pid)
    end
  end

  def open_dbm(filename, mode)
    SDBM::open(filename, mode)
  end
end
776
## TokenDBM stored with GDBM (single file, opened with GDBM::NOLOCK; the
## separate lock file is used instead).
class TokenGDBM < TokenDBM
  def initialize(options, language, ext)
    path = options["homedir"] + language + ext + GDBM_ext
    @filename = path
    @lockfile = path + Lock_ext
    super                       # stores options in @options
  end

  ## Empty the database by deleting its file and reopening it.
  def clear
    @file_count = 0
    @dbm.close
    begin
      File::unlink(@filename)
    rescue
    end
    @dbm = open_dbm(@filename, 0600)
    @options["message-fh"].printf("reopen %s by %d.\n", @filename, Process::pid) if (@options["verbose"])
  end

  def open_dbm(filename, mode)
    GDBM::open(filename, mode, GDBM::NOLOCK)
  end
end
800
## TokenDBM stored as a BDB1 hash file.
class TokenBDB1 < TokenDBM
  def initialize(options, language, ext)
    path = options["homedir"] + language + ext + BDB1_ext
    @filename = path
    @lockfile = path + Lock_ext
    super
  end

  ## Empty the database by deleting its file and reopening it.
  def clear
    @file_count = 0
    @dbm.close
    begin
      File::unlink(@filename)
    rescue
    end
    @dbm = open_dbm(@filename, 0600)
    @options["message-fh"].printf("reopen %s by %d.\n", @filename, Process::pid) if (@options["verbose"])
  end

  def open_dbm(filename, mode)
    flags = BDB1::CREATE | BDB1::WRITE
    BDB1::Hash.open(filename, flags, mode)
  end
end
823
## TokenDBM stored as a BDB hash file.
class TokenBDB < TokenDBM
  def initialize(options, language, ext)
    path = options["homedir"] + language + ext + BDB_ext
    @filename = path
    @lockfile = path + Lock_ext
    super
  end

  ## Empty the database by deleting its file and reopening it.
  def clear
    @file_count = 0
    @dbm.close
    begin
      File::unlink(@filename)
    rescue
    end
    @dbm = open_dbm(@filename, 0600)
    @options["message-fh"].printf("reopen %s by %d.\n", @filename, Process::pid) if (@options["verbose"])
  end

  def open_dbm(filename, mode)
    BDB::Hash.open(filename, nil, BDB::CREATE, mode)
  end
end
846
## TokenDBM stored with QDBM's Depot. Reading a missing key raises
## DepotError_ENOITEM instead of returning nil, so every method that reads
## @dbm[...] directly is overridden here.
class TokenQDBM < TokenDBM
  def initialize(options, language, ext)
    @filename = options["homedir"] + language + ext + QDBM_ext
    @lockfile = options["homedir"] + language + ext + QDBM_ext + Lock_ext
    super
  end

  ## Stored count as a Float, or nil when the key does not exist.
  def value(category, token)
    begin
      v = @dbm[category + MAGIC + token]
    rescue DepotError_ENOITEM
      return nil
    else
      return v.to_f
    end
  end

  def add_hash(hash)
    @dirty = true
    hash.flatten(MAGIC) do |k, v|
      begin
        if (@dbm[k])
          @dbm[k] = (@dbm[k].to_f + v.to_f).to_s
        else
          ## never reached. DepotError_ENOITEM asserted when @dbm[k] is nil
          @dbm[k] = v.to_s
        end
      rescue DepotError_ENOITEM
        @dbm[k] = v.to_s
      end
    end
  end

  ## Same contract as TokenDBM#sub_hash. The inherited version reads
  ## @dbm[k] without a rescue, so subtracting a token that is not stored
  ## raised DepotError_ENOITEM; such tokens are now skipped.
  def sub_hash(hash)
    @dirty = true
    hash.flatten(MAGIC) do |k, v|
      begin
        current = @dbm[k]
      rescue DepotError_ENOITEM
        current = nil
      end
      next if (! current)
      if (current.to_f > v.to_f)
        @dbm[k] = (current.to_f - v.to_f).to_s
      else
        @dbm.delete(k)
      end
    end
  end

  ## Empty the database by deleting its file and reopening it.
  def clear
    @file_count = 0
    @dbm.close
    begin
      File::unlink(@filename)
    rescue
    end
    @dbm = open_dbm(@filename, 0600)
    if (@options["verbose"])
      @options["message-fh"].printf("reopen %s by %d.\n", @filename, Process::pid)
    end
  end

  ## +mode+ is accepted for interface compatibility and not passed on.
  def open_dbm(filename, mode)
    Depot::open(filename, Depot::OWRITER | Depot::OCREAT)
  end
end
893
## Guess the language from the From and Subject header values.
## Returns ["ja", nil] when an encoded-word names a Japanese charset
## (including the misspelling "iso-2202-jp"), ["ja", "jis"] when a JIS X 0208
## escape sequence is present, and nil when neither header gives a hint.
##
## The patterns are /n literals: equivalent to the former
## Regexp::compile(..., 'n') calls, but accepted by Ruby versions that
## dropped the kcode argument, and compiled once instead of on every call.
def get_lang_from_headers(headers)
  reg_char_ja = /\?(iso-2022-jp|iso-2202-jp|x.sjis|shift.jis|euc.jp)\?/in
  reg_jis = /\x1b\x24[\x42\x40]/n     # escape sequence to jisx0208 new and old

  [headers["from"], headers["subject"]].each do |str|
    next if (! str)
    case str
    when reg_char_ja
      @options["message-fh"].printf("lang ja header char_ja\n") if (@options["debug"])
      return ["ja", nil]
    when reg_jis
      @options["message-fh"].printf("lang ja header jis\n") if (@options["debug"])
      return ["ja", "jis"]
    end
  end
  return nil
end
911
## Guess the language from body lines; thin alias for get_lang with the
## HTML flag made mandatory.
def get_lang_from_buf(buf, html_flag)
  get_lang(buf, html_flag)
end
915
## Scan +buf+ line by line for Japanese text and return [lang, encoding] for
## the first line that matches: "utf8", "jis", "sjis", or -- for an EUC kana
## match -- "gb18030" if a byte in 0x80-0x9f was seen up to that line, else
## "euc". Returns [nil, nil] when no line matches. With +html_flag+, numeric
## character references are decoded before matching. Whitespace is removed
## from a line before the kana patterns are tried.
def get_lang(buf, html_flag=false)
  reg_euc = Regexp::compile("[\xa1\xa2-\xa1\xbc\xa4\xa1-\xa4\xf3\xa5\xa1-\xa5\xf6]{4}", nil, 'e') # kana in euc-jp without zenkaku-space
  reg_sjis = Regexp::compile("[\x81\x40-\x81\x5b\x82\x9f-\x82\xf1\x83\x40-\x83\x96]{2}", nil, 's') # kana in shift-jis
  reg_utf8 = Regexp::compile("[\xe3\x80\x80-\xe3\x80\x82\xe3\x81\x81-\xe3\x82\x93\xe3\x82\xa1-\xe3\x83\xb6]{4}", nil, 'u') # kana in utf8
  reg_jis = Regexp::compile("\\x1b\\x24[\\x42\\x40]", nil, 'n') # escape sequence to jisx0208 new and old
  reg_gb18030_possible = Regexp::compile('[\x80-\x9f]', nil, 'n')

  ##    reg_char_utf8 = Regexp::compile('(^\w+: .*|charset="?)(utf-8)', Regexp::IGNORECASE, 'n')
  ##    reg_cte_bin = Regexp::compile('\Acontent-transfer-encoding\s*:\s*(base64|quoted-printable)', Regexp::IGNORECASE, 'n')
  ##    reg_c = Regexp::compile('(^\w+: .*|charset="?)(ks_c_5601|euc-kr|big5|gb2312)', Regexp::IGNORECASE, 'n')

  debug = @options["debug"]
  gb18030_possible = false

  buf.each do |line|
    line = decode_character_reference2u(line) if (html_flag)
    gb18030_possible = true if (line =~ reg_gb18030_possible)

    compact = line.gsub(/\s/, '')
    if (compact =~ reg_utf8)
      @options["message-fh"].printf("lang ja utf8\n") if (debug)
      return ["ja", "utf8"]
    elsif (compact =~ reg_jis)
      @options["message-fh"].printf("lang ja jis\n") if (debug)
      return ["ja", "jis"]
    elsif (compact =~ reg_sjis)
      @options["message-fh"].printf("lang ja sjis\n") if (debug)
      return ["ja", "sjis"]
    elsif (compact =~ reg_euc)
      if (gb18030_possible)
        @options["message-fh"].printf("lang ja gb18030\n") if (debug)
        return ["ja", "gb18030"]
      end
      @options["message-fh"].printf("lang ja euc\n") if (debug)
      return ["ja", "euc"]
    end
  end
  return [nil, nil]
end
958
## Split the header section off a message.
##
## buf  - array of lines; it is not modified (a copy is consumed).
## lang - passed through unchanged.
##
## Returns [headers, rest, lang]. +headers+ is a DBHash keyed by downcased
## field name. Repeated fields are concatenated, except "received", where
## the last one wins. The Unix "From " line is stored under "ufrom", and
## "boundary" / "charset" are extracted from Content-Type, which is then cut
## at its first ";". +rest+ holds the lines after the header section,
## limited to the first "max-line" lines unless that option is <= 0.
## When the first line is neither "From " nor "Name:", the input is not
## treated as mail: headers stay empty and +rest+ is the limited input.
##
## The limit used to be applied as buf[0 .. max], which returned one line
## more than requested; it is now exactly "max-line" lines.
def get_headers(buf, lang)
  headers = DBHash::new
  buf = buf.dup
  max_line = @options["max-line"]
  limit = lambda { |lines| (max_line <= 0) ? lines : lines[0, max_line] }

  if ((buf[0] !~ /\Afrom\s+(\S+)/i) &&  # this isn't mail
      (buf[0] !~ /\A(\S+):/))
    return [headers, limit.call(buf), lang]
  end

  current = nil
  while (str = buf.shift)
    str = str.chomp
    if (str =~ /\A(\S+?):\s*(.*)/)
      current = $1.downcase
      if (current == "received")
        headers[current] = $2.sub(/[\r\n]*\z/, '')
      else
        headers[current] = (headers[current] || "") + " " + $2.sub(/[\r\n]*\z/, '')
      end
    elsif (str =~ /\Afrom\s+(\S+)/i)
      headers["ufrom"] = $1
    elsif (str =~ /\A\r*\z/)            # empty line ends the header section
      break
    elsif (! current)                   # continuation without a field
      break
    else
      text = str.sub(/[\r\n]*\z/, '')
      if (str =~ /\A\s*=\?/)
        # a continued encoded-word is joined without a space
        headers[current] += text.sub(/\A\s*/, '')
      else
        headers[current] += text.sub(/\A\s*/, ' ')
      end
    end
  end

  if ((headers["content-type"] =~ /\bboundary=\s*"(.*?)"/i) ||
      (headers["content-type"] =~ /\bboundary=\s*'(.*?)'/i) ||
      (headers["content-type"] =~ /\bboundary=([^\s;]+)/i))
    headers["boundary"] = $1
  end
  if (headers["content-type"] =~ /charset=([\'\"]*)([^\s\1\;]+)\1/i)
    headers["charset"] = $2
  end
  if (headers["content-type"] =~ /\A([^;]+)/)
    headers["content-type"] = $1
  end

  return [headers, limit.call(buf), lang]
end
1015
1016
## Splits EUC-JP text into Japanese tokens with one of several strategies:
## "bigram", "block", "mecab", "chasen" or "kakasi". Any other name raises.
class Jtokenizer
  def initialize(method)
    @method =
      case method
      when "bigram"
        Proc::new { |text| bigram(text) }
      when "block"
        Proc::new { |text| block(text) }
      when "mecab"
        splitter = Proc::new { |text| mecab(text) }
        @m = MeCab::Tagger.new([$0, "-Ochasen"])
        splitter
      when "chasen"
        Chasen.getopt("-F", '%H %m\n', "-j")
        Proc::new { |text| chasen(text) }
      when "kakasi"
        Proc::new { |text| kakasi(text) }
      else
        raise
      end
  end

  ## Tokenize +str+ with the strategy chosen at construction.
  def split(str)
    @method.call(str)
  end

  Reg_kanji = Regexp::compile("[\xb0\xa1-\xf4\xa4]+", nil, 'e')
  Reg_katakana = Regexp::compile("[\xa1\xbc\xa5\xa1-\xa5\xf6]+", nil, 'e')
  Reg_kanji_katakana = Regexp::compile("[\xb0\xa1-\xf4\xa4\xa1\xbc\xa5\xa1-\xa5\xf6]", nil, 'e')
  Reg_not_kanji_katakana = Regexp::compile("[^\xb0\xa1-\xf4\xa4\xa1\xbc\xa5\xa1-\xa5\xf6]", nil, 'e')

  ## Words from Kakasi's word splitter, reduced to kanji/katakana.
  def kakasi(str)
    str = str.gsub(/[\x00-\x7f]/, ' ')
    return [] if (str =~ /\A +\z/)

    array = Array::new
    Kakasi::kakasi("-oeuc -w", str).scan(/\S+/).each do |token|
      keep_token(array, nil, token)
    end
    return array
  end

  ## Words from MeCab; the first feature field is the part of speech.
  def mecab(str)
    str = str.gsub(/[\x00-\x7f]/, ' ')
    return [] if (str =~ /\A +\z/)

    array = Array::new
    node = @m.parseToNode(str)
    while (node.hasNode == 1)
      token = node.getSurface
      hinshi = node.getFeature.split(/,/)[0]
      ##      print token, hinshi, "\n"
      keep_token(array, hinshi, token)
      node = node.next
    end
    return array
  end

  ## Words from ChaSen, which prints "part-of-speech token" per line.
  def chasen(str)
    str = str.gsub(/[\x00-\x7f]/, ' ')
    return [] if (str =~ /\A +\z/)

    array = Array::new
    Chasen.sparse(str).split("\n").each do |hinshi_token|
      next unless (hinshi_token =~ /(.*) (.*)/)
      hinshi = $1
      token = $2
      keep_token(array, hinshi, token)
    end
    return array
  end

  ## Every kanji run followed by every katakana run.
  def block(str)
    tokens = str.scan(Reg_kanji)
    tokens.concat(str.scan(Reg_katakana))
    return tokens
  end

  ## Kanji runs cut into overlapping 4-byte pieces (runs of 2 or 4 bytes are
  ## kept whole), followed by every katakana run.
  def bigram(str)
    tokens = Array::new

    str.scan(Reg_kanji).each do |token|
      if ((token.length == 2) || (token.length == 4))
        tokens.push(token)
      else
        last = token.length / 2 - 2
        (0 .. last).each do |i|
          tokens.push(token[i * 2, 4])
        end
      end
    end
    tokens.concat(str.scan(Reg_katakana))
    return tokens
  end

  private

  ## Shared filter of the dictionary-based tokenizers. A token tagged with
  ## the part of speech "\xcc\xbe\xbb\xec" is kept as is when it contains
  ## kanji/katakana or is longer than 2; any other token is first stripped
  ## (in place) of everything but kanji/katakana and kept when kanji remain
  ## or it is still longer than 2.
  def keep_token(array, hinshi, token)
    if (hinshi == "\xcc\xbe\xbb\xec")
      if ((token =~ Reg_kanji_katakana) || (token.length > 2))
        array.push(token)
      end
    else
      token.gsub!(Reg_not_kanji_katakana, '')
      if ((token =~ Reg_kanji) || (token.length > 2))
        array.push(token)
      end
    end
  end
end
1136
## Build a TokenDB from the header fields selected by the "refer-header"
## option. The language is guessed from the headers when +lang+ is nil.
## The Received value is first cut down to the part before "id"/";" plus the
## envelope-from and "for" addresses. Note that header values are modified
## in place.
##
## Decoded UTF-8 encoded-words are taken with Array#join. The former
## unpack(...).to_s is the same on Ruby 1.8 but yields the array's inspect
## text (brackets, quotes, escapes) on later versions.
def tokenize_headers(lang, headers)
  (lang, _code) = get_lang_from_headers(headers) if (! lang)

  head_db = TokenDB::new(lang)
  reg_token = Regexp::compile("\\b\\d[\\d\\.]+\\d\\b|[\\w#{@options['mark-in-token']}]+")

  if (headers["received"])
    str = headers["received"]
    str =~ /envelope\-from\s+([\w@\.\-]+)/
    efrom = $1
    str =~ /for\s+<([\w@\.\-]+)>/
    foraddress = $1
    str.sub!(/(\bid|;).*/im, '')
    str.sub!(/\(qmail[^\)]*\)/, '')
    str += " " + efrom if efrom
    str += " " + foraddress if foraddress
    headers["received"] = str
  end

  #    if (headers["domainkey-signature"])
  #      headers["domainkey-signature"] = headers["domainkey-signature"].sub(/b=[^:;\s]+/, '')
  #    end

  #            "authentication-results", "domainkey-signature"
  headers.each do |header, content|
    next if (! @options["refer-header"][header])

    if (lang == "ja")
      # UTF-8 encoded-words are converted to EUC-JP here (or dropped when
      # the "utf-8" option is off); NKF handles the remaining charsets.
      content.gsub!(/=\?utf\-8\?([bq])\?(\S*)\?=/i) do |s|
        b_or_q = $1
        encoded_str = $2
        if (@options["utf-8"])
          if (b_or_q =~ /q/i)
            decoded_str = encoded_str.unpack("M*").join
          else
            decoded_str = encoded_str.unpack("m*").join
          end
          Iconv.u2eucjp(decoded_str)
        else
          ""
        end
      end
      content = NKF::nkf('-e -X -Z0', content.gsub(/\?(iso-2202-jp|shift-jis)\?/i, '?ISO-2022-JP?'))
    else
      content = latin2ascii(content)
    end

    content.scan(reg_token).each do |token|
      head_db.add_scalar(header, token, 1) if (token.length < 20)
      @options["message-fh"].printf("tokenizer %s %s\n", header, token) if (@options["debug"])
    end
    if (lang == "ja")
      @jtokenizer.split(content.gsub(/\s+/, '')).each do |token|
        head_db.add_scalar(header, token, 1)
        @options["message-fh"].printf("tokenizer %s %s\n", header, token) if (@options["debug"])
      end
    end
  end
  return head_db
end
1196
## Tokenize a whole message (array of lines) into one TokenDB.
## Input without headers is tokenized as a single body. Otherwise each part
## is tokenized in turn -- parts whose Content-Type mentions rfc822 only
## contribute their headers -- and the result combines HTML parts, plain
## parts (skipped when "ignore-plain-text-part" is set and an HTML part
## exists), the main headers and the headers of the sub parts. The returned
## db has file_count 1, the current time, and language "C" unless a
## language was detected.
def tokenize_buf(buf)
  separators = []
  delimiters = []
  (headers, buf, lang) = get_headers(buf, nil)   # lang is unknown at first

  if (headers.empty?)                   # this is not a mail
    (db, buf) = tokenize_body(lang, headers, buf, separators, delimiters)
    db.time = Time::new
    db.language = "C" if (! db.language)
    return db
  end

  body_db = TokenDB::new(lang)
  body_db.message_id = headers["message-id"] || "-"

  sub_head_db = TokenDB::new(lang)
  main_head_db = tokenize_headers(lang, headers)
  lang = main_head_db.language if main_head_db

  found_html_part = false
  plain_bodies = []
  html_bodies = []

  until (buf.empty?)
    boundary = headers["boundary"]
    if (boundary)
      separators.push("--" + boundary)
      delimiters.push("--" + boundary + "--")
    end

    unless (headers["content-type"] && (headers["content-type"] =~ /rfc822/i))
      (db, buf) = tokenize_body(lang, headers, buf, separators, delimiters)
      lang = db.language
      if (headers["content-type"] =~ /html/i)
        found_html_part = true
        html_bodies.push(db)
      else
        plain_bodies.push(db)
      end
    end

    # headers of the next part
    (headers, buf, lang) = get_headers(buf, lang)
    sub_head_db.add_db(tokenize_headers(lang, headers))
  end

  parts = html_bodies
  parts = parts + plain_bodies unless (@options["ignore-plain-text-part"] && found_html_part)
  parts.each do |part|
    body_db.add_db(part)
  end

  body_db.add_db(main_head_db)
  body_db.add_db(sub_head_db)
  body_db.file_count = 1
  body_db.time = Time::new
  body_db.language = "C" if (! body_db.language)
  return body_db
end
1261
## Convert the code point i to a string via Iconv.u2eucjp
## (the code point is first packed as a UTF-8 character).
def i2eucjp(i)
  utf8_char = [i].pack("U")
  Iconv.u2eucjp(utf8_char)
end
1265
## Convert the code point i to a string by packing it as a UTF-8 character
## and passing it through Iconv.u2latin and then latin2ascii.
def i2ascii(i)
  utf8_char = [i].pack("U")
  latin = Iconv.u2latin(utf8_char)
  latin2ascii(latin)
end
1269
## Return the code point i as a one-character UTF-8 string.
def i2u(i)
  code_points = [i]
  code_points.pack("U")
end
1273
## Replace numeric character references (&#NNN; and &#xHHHH;) in str.
## With the "utf-8" option each reference becomes the UTF-8 character
## returned by i2u; without it the references are simply removed.
## Returns a new string.
def decode_character_reference2u(str)
  reference = /\&\#(\d{1,5}|x[\da-f]{1,4});/i
  return str.gsub(reference, "") unless (@options["utf-8"])

  newstr = str.gsub(reference) do
    digits = $1
    # a leading "x" marks a hexadecimal reference
    code = (digits =~ /^x(.*)/i) ? $1.hex : digits.to_i
    i2u(code)
  end
  return newstr
end
1290
## Replace numeric character references (&#NNN; and &#xHHHH;) in str.
## With the "utf-8" option each reference is converted with i2eucjp when
## lang is "ja" and with i2ascii otherwise; without the option the
## references are simply removed.  Returns a new string.
def decode_character_reference(str, lang)
  reference = /\&\#(\d{1,5}|x[\da-f]{1,4});/i
  unless (@options["utf-8"])
    return str.gsub(reference, "")
  end

  newstr = str.gsub(reference) do
    digits = $1
    # a leading "x" marks a hexadecimal reference
    code = (digits =~ /^x(.*)/i) ? $1.hex : digits.to_i
    if (lang == "ja")
      i2eucjp(code)
    else
      i2ascii(code)
    end
  end
  return newstr
end
1315
## Split str into tokens and count them.
##
## Returns [body_hash, url_hash], two DBHash counters (default 0):
##  * a token that looks like a URL or mail address (matches reg_url) is
##    split again and its pieces are counted in url_hash
##  * every other token is counted in body_hash
##  * tokens or pieces of 20 characters or more are dropped
## When lang is "ja" the text is additionally cleaned (cite marks, leading
## white space, newlines) and passed to @jtokenizer; those tokens are
## counted in body_hash as well.
## The caller's string is never modified.
def tokenize_str(str, lang)
  body_hash = DBHash::new(0)
  url_hash = DBHash::new(0)

  reg_token = Regexp::compile("(?:http:|www)[\\w\\-\\.\\/@%:\?=]+|[\\w\\-\\.]+@[\\w\\-\\.]+|\\b\\d[\\d\\.]+\\d\\b|[\\w#{@options['mark-in-token']}]+")
  reg_url = Regexp::compile('(^http:|https:|^www|@)')
  reg_token2 = Regexp::compile('\b\d[\d\.]+\d\b|[\w%]+')

  str.scan(reg_token).each do |token|
    if (token =~ reg_url)
      token.scan(reg_token2).each do |token2|
        next unless (token2.length < 20)
        url_hash[token2] += 1
        @options["message-fh"].printf("tokenizer %s %s\n", "url", token2) if (@options["debug"])
      end
    elsif (token.length < 20)
      body_hash[token] += 1
      @options["message-fh"].printf("tokenizer C %s %s\n", "body", token) if (@options["debug"])
    end
  end

  if (lang == "ja")
    # work on a private copy: the gsub! calls below must not alter the
    # string owned by the caller
    text = str.dup
    text.gsub!(Regexp::compile("^[ -\\~]*[\|\>]+", nil, 'e'), '')  # delete cite mark
    text.gsub!(Regexp::compile("^[ \\t\xa1\xa1]+", nil, 'e'), '')  # delete white space
    text.gsub!(Regexp::compile("(\\r?\\n){2,}", nil, 'e'), ' ')    # keep multiple newline as space
    text.gsub!(Regexp::compile("[\\r\\n]+", nil, 'e'), '')         # delete newline
    text.split.each do |s|
      @jtokenizer.split(s).each do |token|
        body_hash[token] += 1
        @options["message-fh"].printf("tokenizer ja %s %s\n", "body", token) if (@options["debug"])
      end
    end
  end
  return [body_hash, url_hash]
end
1353
## Guess whether buf (an Array of lines) holds base64 text.
## Only the first and the last non-blank lines are inspected: each of them
## must consist of characters of the base64 alphabet (A-Z, a-z, 0-9, "+",
## "/" and the "=" padding) followed by optional white space.
## A buffer without any non-blank line counts as base64.
## buf itself is not modified.
def base64_encoded?(buf)
  blank_line = /\A[\s\r\n]*\z/
  # A-Za-z, not A-z: the latter would also accept [ \ ] ^ _ and `
  base64_line = /\A[A-Za-z0-9=+\/]+[\s\r\n]*\z/

  [buf, buf.reverse].each do |lines|
    first_text = lines.find {|line| line !~ blank_line}
    return false if (first_text && (first_text !~ base64_line))
  end
  return true
end
1368
# Tokenize one body part.
# lang       : language assumed so far; used when none is detected in the part
# headers    : headers of this part ("content-type" and
#              "content-transfer-encoding" are consulted)
# body       : Array of lines starting at this part; it is copied, not modified
# separators : stack of "--boundary" strings
# delimiters : stack of "--boundary--" strings
#              (both stacks are popped in place when a closing delimiter is read)
# Returns [db, rest]: a TokenDB with the tokens of the part and the lines
# that follow the part.
1369 def tokenize_body(lang, headers, body, separators, delimiters)
1370 reg_return_codes = Regexp::compile('[\r\n]*\z')
1371
1372 db = TokenDB::new(lang)
1373 body = body.dup
1374
1375 buf = Array::new
1376
1377 delimiter = delimiters.last
1378 separator = separators.last
1379
# Without any boundary the whole remaining text is the part.  Otherwise
# lines are consumed up to the next separator; a closing delimiter pops one
# nesting level and reading continues with the enclosing level's markers.
1380 if (separators.empty?)
1381 buf = body
1382 body = Array::new
1383 else
1384 while (str = body.shift)
1385 str_noret = str.sub(reg_return_codes, '')
1386 case str_noret
1387 when separator
1388 break
1389 when delimiter
1390 delimiters.pop
1391 separators.pop
1392 delimiter = delimiters.last
1393 separator = separators.last
1394 else
1395 buf.push(str)
1396 end
1397 end
1398 end
1399
1400 if (headers["content-type"] && headers["content-type"] !~ /text/i)
1401 return [db, body] # skip non-text body
1402 end
1403
# Undo the transfer encoding line by line.  base64 is decoded only when
# base64_encoded? accepts the buffer.
1404 case headers["content-transfer-encoding"]
1405 when /base64/i
1406 if (base64_encoded?(buf))
1407 buf.map! {|str| str.unpack("m*").to_s}
1408 end
1409 when /quoted-printable/i
1410 buf.map! {|str| str.unpack("M*").to_s}
1411 end
1412
# Detect language and character code from the decoded text (the second
# argument of get_lang_from_buf is true for html parts); fall back to the
# language passed in when nothing is detected.
1413 lang_backup = lang
1414 if (headers["content-type"] =~ /html/i)
1415 (lang, code) = get_lang_from_buf(buf, true)
1416 else
1417 (lang, code) = get_lang_from_buf(buf, false)
1418 end
1419 if (! lang)
1420 lang = lang_backup
1421 end
1422
1423 str = buf.join
1424 str.gsub!(/^begin[^\r\n]+(([\r\n]+M)([^\r\n]+))*/, '') # remove uuencoded lines
1425
# Normalize the character code.  Japanese text is converted with the
# u2eucjp / gb180302eucjp helpers (only when the "utf-8" option is on;
# otherwise the part is handled as "C") or with NKF; any other text goes
# through latin2ascii.
1426 if (lang == "ja")
1427 if (code == "utf8")
1428 if (@options["utf-8"])
1429 str = Iconv.u2eucjp(str)
1430 else
1431 lang = "C" # can't use iconv / stop ja tokenizer
1432 end
1433 elsif (code == "gb18030")
1434 if (@options["utf-8"])
1435 str = Iconv.gb180302eucjp(str)
1436 else
1437 lang = "C"
1438 end
1439 else
1440 str = NKF::nkf('-e -X -Z0', str)
1441 end
1442 else
1443 str = latin2ascii(str)
1444 end
1445
# For html parts: strip text that is outside the document or hidden, then
# separate markup (collected in tags) from visible text.
1446 tags = Array::new
1447 if (headers["content-type"] =~ /html/i)
1448 # remove salad at head of part
1449 if (str =~ Regexp::compile('\A[^<>]*?(<(\?xml|!doctype|html|body)\b.*)\z', Regexp::MULTILINE | Regexp::IGNORECASE, 'n'))
1450 str = $1
1451 end
1452
1453 # remove salad in head, except style
1454 if (str =~ /\A(.*?)(<body.*)\z/im)
1455 before_body_tag = $1
1456 after_body_tag = $2
1457 before_body_tag.gsub!(/>[^<>]*<(?!\/style)/im, '><')
1458 str = before_body_tag + after_body_tag
1459 end
1460
1461 # remove <p style="font-size:0px..>
1462 str.gsub!(/(<p[^<>]*font-size\s*:\s*[01]\s*(;|px)[^<>]*>)([^<>]*)(<\/p>)/im, '')
1463 str.gsub!(/(<font[^<>]*font-size\s*:\s*[01]\s*(;|px)[^<>]*>)([^<>]*)(<\/font>)/im, '')
1464
1465 # remove <span style="DISPLAY: none..>
1466 str.gsub!(/(<span[^<>]*display\s*:\s*none[^>]*>)([^<>]*)(<\/span>)/im, '')
1467
1468 if (@options["ignore-after-last-atag"])
1469 if (str =~ /\A(.*)<\/a>/im)
1470 str = $1
1471 end
1472 end
1473
1474
1475 # remove salad after body or html
1476 if (str =~ Regexp::compile('\A(.*)</html>[^<>]*?\z', Regexp::MULTILINE | Regexp::IGNORECASE, 'n'))
1477 str = $1
1478 end
1479 if (str =~ Regexp::compile('\A(.*)</body>[^<>]*?\z', Regexp::MULTILINE | Regexp::IGNORECASE, 'n'))
1480 str = $1
1481 end
# Every tag is cut out of the text: tags matching RE_ALL_TAGS are kept in
# tags, and a tag matching RE_SPACE_TAGS leaves a space behind.
1482 str.gsub!(Regexp::compile('<[^>]*>', Regexp::MULTILINE, 'n')) do |t|
1483 t = t.gsub(/\n/, '')
1484 if (t =~ RE_ALL_TAGS) # end tags are thrown away
1485 tags.push(t)
1486 end
1487
1488 if (t =~ RE_SPACE_TAGS)
1489 " "
1490 else
1491 ""
1492 end
1493 end
1494 body_str = decode_character_reference(str, lang) # out of tags
1495 tag_str = decode_character_reference(tags.join, lang) # in tags
1496 else # if plain text
1497 body_str = str
1498 tag_str = ""
1499 end
1500 (body_hash, url_body_hash) = tokenize_str(body_str, lang)
1501 (tag_hash, url_tag_hash) = tokenize_str(tag_str, lang)
1502
# Body tokens are stored only with the "use-body" option; tag tokens and
# URL tokens (from text and from tags) are stored whenever there are any.
1503 if (! body_hash.empty? && @options["use-body"])
1504 db.add_hash({"body" => body_hash})
1505 end
1506 if (! tag_hash.empty?)
1507 db.add_hash({"tag" => tag_hash})
1508 end
1509 if (! url_body_hash.empty?)
1510 db.add_hash({"url" => url_body_hash})
1511 end
1512 if (! url_tag_hash.empty?)
1513 db.add_hash({"url" => url_tag_hash})
1514 end
1515 db.file_count = 1
1516 db.language = lang
1517 return [db, body]
1518 end
1519
## Base class of the filtering methods; one instance exists per language.
## It owns the clean, spam and probability databases of that language.
class Probability # for each lang
  def initialize(options, lang)
    @options = options
    @filename = @options["homedir"] + lang + Prob_ext

    # choose the storage class named by the "db" option; with any other
    # value the three databases stay unset
    backend = case (@options["db"])
              when "sdbm" then TokenSDBM
              when "gdbm" then TokenGDBM
              when "bdb1" then TokenBDB1
              when "bdb" then TokenBDB
              when "qdbm" then TokenQDBM
              end
    if (backend)
      @clean = backend.new(@options, lang, Clean_ext)
      @spam = backend.new(@options, lang, Spam_ext)
      @prob = backend.new(@options, lang, Prob_ext)
    end

    @language = lang
  end

  attr_accessor :prob, :clean, :spam, :spam_cutoff, :language

  ## Merge those of token_dbs whose language equals this instance's
  ## language into one new TokenDB and return it.
  def merge_dbs_of_lang(token_dbs)
    merged = TokenDB::new
    token_dbs.each do |db|
      merged.add_db(db) if (@language == db.language)
    end
    return merged
  end
end
1562
## Paul Graham method.
class Graham < Probability
  def initialize(options, lang)
    @spam_cutoff = 0.9
    @default_probability = 0.4
    super
  end

  ## Product of the elements of a; elements equal to 0 are skipped.
  ## An empty list gives 1.
  def product(a)
    a.inject(1) do |acc, v|
      (v != 0) ? acc * v : acc
    end
  end

  ## Set probability and spam_flag of token_db from the 15 token
  ## probabilities that are farthest from 0.5.  Tokens without a stored
  ## probability count as @default_probability.  Returns token_db.
  def get_combined_probability(token_db)
    prob_db = TokenDB::new                # temporary

    token_db.each_ct do |(category, token)|
      known = @prob.value_with_degene(category, token)
      prob_db.set_scalar(category, token, known || @default_probability)
    end

    probs = prob_db.values.sort {|a, b| (b - 0.5).abs <=> (a - 0.5).abs}[0, 15]

    if (@options["debug"])
      entries = Array::new
      prob_db.each_ct do |c, t|
        entries.push([[c, t], prob_db.value(c, t)])
      end
      # same 15 entries as above, printed from the highest probability down
      most_extreme = entries.sort {|a, b| (b[1] - 0.5).abs <=> (a[1] - 0.5).abs}[0, 15]
      most_extreme.sort {|a, b| b[1] <=> a[1]}.each do |k, v|
        @options["message-fh"].printf("word probability %s %s %f\n", k[0], k[1], v)
      end
    end

    prod = product(probs)
    token_db.probability = prod / (prod + product(probs.map {|x| 1 - x}))
    token_db.spam_flag = (token_db.probability > @spam_cutoff) ? true : false
    return token_db
  end

  ## Rebuild the probability table (token_dbs empty) or update only the
  ## tokens contained in token_dbs (incremental mode).
  def update_probability(token_dbs)
    # numbers of stored mails, at least 1
    cnum = [@clean.file_count, 1].max.to_f
    snum = [@spam.file_count, 1].max.to_f

    incremental = (! token_dbs.empty?)
    if (incremental)
      target_cts = merge_dbs_of_lang(token_dbs).key_cts
      return if (target_cts.empty?)
      @prob.open("rw")
    else
      target_cts = @clean.key_cts | @spam.key_cts
      @prob.open("w")
      @prob.clear
    end
    old_file_count = @prob.file_count
    new_file_count = 0

    target_cts.each do |(category, token)|
      clean_hits = @clean.value(category, token) || 0
      spam_hits = @spam.value(category, token) || 0

      if (incremental && @prob.value(category, token))
        @prob.sub_scalar(category, token, 1.0) # 1.0 is big enough for delete
        new_file_count -= 1
      end

      # decide the new probability; nil means the token is not stored
      new_probability = nil
      if (clean_hits == 0)
        if (spam_hits > 10)
          new_probability = 0.9999
        elsif (spam_hits > 5)
          new_probability = 0.9998
        end
      elsif (spam_hits == 0)
        if (clean_hits > 10)
          new_probability = 0.0001
        elsif (clean_hits > 5)
          new_probability = 0.0002
        end
      elsif (clean_hits + spam_hits > 5)
        # clean occurrences are counted twice
        clean_ratio = [(clean_hits * 2) / cnum, 1.0].min
        spam_ratio = [spam_hits / snum, 1.0].min
        new_probability = [[spam_ratio / (clean_ratio + spam_ratio), 0.9999].min, 0.0001].max
      end

      if (new_probability)
        new_file_count += 1
        @prob.set_scalar(category, token, new_probability)
      end
    end
    @prob.file_count = new_file_count + old_file_count if (incremental)
    @prob.close
  end
end
1675
## Gary Robinson method.
class Robinson < Probability
  def initialize(options, lang)
    @robx_max = 1
    @min_dev = 0.1
    @spam_cutoff = 0.582
    @center = 0.5
    @robs = 0.001             # from bogofilter/robinson.h
    @default_robx = 0.415     # from bogofilter/robinson.h / used when no robx is stored or computable
    super
  end

  ## NOTE(review): `pw` is neither a local variable nor a method visible in
  ## this class, so calling get_pw presumably raises NameError.  It looks
  ## like an unused leftover; kept unchanged for interface compatibility.
  def get_pw(category, token, g, b)
    return pw
  end

  ## Rebuild the probability table (token_dbs empty) or update only the
  ## tokens contained in token_dbs (incremental mode).
  ## loop1 computes pw for each token and robx, the average pw of the tokens
  ## seen at most @robx_max times; loop2 stores fw derived from pw and robx.
  def update_probability(token_dbs)
    pwdb = TokenDB::new
    c_count = [@clean.file_count, 1].max
    s_count = [@spam.file_count, 1].max

    if (token_dbs.empty?)
      incremental = false
      target_cts = @clean.key_cts | @spam.key_cts
    else
      incremental = true
      merged_db = merge_dbs_of_lang(token_dbs)
      target_cts = merged_db.key_cts
      return if (target_cts.empty?)
    end

    ## loop1
    ## get pw and robx(average of pw)
    count = 0
    pw_sum = 0.0

    good_mail = c_count.to_f
    bad_mail = s_count.to_f
    target_cts.each do |(category, token)|
      g = [@clean.value(category, token) || 0, c_count].min
      b = [@spam.value(category, token) || 0, s_count].min
      n = g + b
      if (n == 0)
        pwdb.set_scalar(category, token, nil) # need to delete this token from prob.db
      else
        pw = (b / bad_mail) / (b / bad_mail + g / good_mail)
        if ((@robx_max == 0) || (n <= @robx_max))
          pw_sum += pw
          count += 1
        end
        pwdb.set_scalar(category, token, pw)
      end
    end

    robs = @robs
    if (incremental)
      @prob.open("rw")
      old_file_count = @prob.file_count
      old_robx = @prob.value(".internal", "robx") || @default_robx
      samples = count + old_file_count
      if (samples != 0)
        robx = (pw_sum + old_file_count * old_robx) / samples
      else
        # nothing to average: keep the previous estimate instead of
        # computing 0.0 / 0, which would store NaN as robx and in every fw
        robx = old_robx
      end
    else
      @prob.open("w")
      @prob.clear
      if (count != 0)
        robx = pw_sum / count
      else
        robx = @default_robx
      end
    end

    ## loop2
    ## get fw from pw
    new_file_count = 0
    pwdb.key_cts.each do |(category, token)|
      g = [@clean.value(category, token) || 0, c_count].min
      b = [@spam.value(category, token) || 0, s_count].min
      n = g + b
      pw = pwdb.value(category, token)
      if (incremental && @prob.value(category, token))
        new_file_count -= 1
        @prob.sub_scalar(category, token, 1.0) # 1.0 is big enough for delete
      end
      if (pw)
        new_file_count += 1
        @prob.set_scalar(category, token, (robs * robx + n * pw) / (robs + n)) # fw
      end
    end
    @prob.set_scalar(".internal", "robx", robx)
    @prob.file_count = new_file_count + old_file_count if (incremental)
    @prob.close
  end

  ## Combine the two products into one value.  pminus and qminus are FLOAT
  ## objects (their ln is used); count is the number of factors.
  def get_probability(pminus, qminus, count)
    r = 1.0 / [1, count].max
    p = 1.0 - Math::exp(pminus.ln * r)
    q = 1.0 - Math::exp(qminus.ln * r)
    s = (1.0 + (p - q) / (p + q)) / 2.0
    return s
  end

  ## Set probability and spam_flag of token_db.  Only tokens whose
  ## probability differs from @center by more than @min_dev take part;
  ## tokens without a stored probability use robx.  Returns token_db.
  def get_combined_probability(token_db)
    robx = @prob.value(".internal", "robx") || @default_robx

    count = 0
    pminus = FLOAT::new(1)
    qminus = FLOAT::new(1)
    token_db.each_ct do |(category, token)|
      probability = @prob.value_with_degene(category, token) || robx
      if ((probability - @center).abs > @min_dev)
        # keep the value strictly between 0 and 1
        if (probability <= 0.0)
          probability = 0.0000001
        elsif (probability >= 1.0)
          probability = 0.9999999
        end
        c = token_db.value(category, token)
        count += c
        pminus = pminus * FLOAT::new(1.0 - probability, c)
        qminus = qminus * FLOAT::new(probability, c)
        @options["message-fh"].printf("word probability %s %s %d %f\n", category, token, c, probability) if (@options["debug"])
      end
    end

    if (count == 0)
      token_db.probability = 0.0
    else
      token_db.probability = get_probability(pminus, qminus, count)
    end
    if (token_db.probability > @spam_cutoff)
      token_db.spam_flag = true
    else
      token_db.spam_flag = false
    end
    return token_db
  end
end
1811
1812
## Robinson-Fisher method: Robinson's token probabilities combined with
## the chi2q function below.
class RobinsonFisher < Robinson
  def initialize(options, lang)
    super
    @spam_cutoff = 0.95
  end

  ## Sum of exp(-m) * m**i / i! for i = 0 .. v/2 - 1, with m = x2 / 2,
  ## capped at 1.0.  The terms are built up with FLOAT objects.
  def chi2q(x2, v)
    m = x2 / 2.0
    total = Math::exp(0.0 - m)

    term = FLOAT::new
    term.exp = 0.0 - m
    term.mant = 1

    1.upto((v / 2) - 1) do |i|
      term = term * FLOAT::new(m / i)
      total += term.to_f
    end

    if (total < 1.0)
      return total
    else
      return 1.0
    end
  end

  ## Combine the two products (FLOAT objects) of count factors each.
  def get_probability(pminus, qminus, count)
    degrees = 2 * count
    p = 1 - chi2q(-2.0 * pminus.ln, degrees)
    q = 1 - chi2q(-2.0 * qminus.ln, degrees)
    return (1.0 + p - q) / 2.0
  end
end
1840
## Create the directory dir with mode 0700 unless it already exists.
def init_dir(dir)
  return if (FileTest::directory?(dir))
  Dir.mkdir(dir, 0700)
end
1846
# Print the built-in help text to standard output.
# The text is a single heredoc; the default values shown in it are
# interpolated from the Default_* constants, and the last two sections
# show Release and Revision.
1847 def usage
1848
1849 print <<EOM
1850
1851 NAME
1852 bsfilter - bayesian spam filter
1853
1854 SYNOPSIS
1855 bsfilter [options] [commands] < MAIL
1856 bsfilter [options] [commands] MAIL ...
1857
1858 DESCRIPTION
1859 filter spam.
1860 If commands are specified, bsfilter is in maintenance mode, otherwise it is in filtering mode.
1861 If bsfilter does not find spam in filtering mode, exit status is 1.
1862 If bsfilter runs with --pipe option or finds spam, exit status is 0.
1863
1864 COMMANDS
1865 --add-clean|-c
1866 add mails into the clean token database
1867
1868 --add-spam|-s
1869 add mails into the spam token database
1870
1871 --sub-clean|-C
1872 subtract mails from the clean token database
1873
1874 --sub-spam|-S
1875 subtract mails from the spam token database
1876
1877 --update|-u
1878 update the probability table from clean and spam token databases
1879
1880 --export-clean
1881 export the clean token database
1882
1883 --export-spam
1884 export the spam token database
1885
1886 --import-clean
1887 import the clean token database
1888
1889 --import-spam
1890 import the spam token database
1891
1892 --export-probability
1893 export the probability database (for debugging purpose)
1894 OPTIONS
1895 --homedir directory
1896 specify the name of the bsfilter\'s home directory
1897 If this option is not used, a directory specified with the environment variable "BSFILTERHOME" is used
1898 If the variable "BSFILTERHOME" is not defined, ".bsfilter" directory under your home is used
1899 If the variable "HOME" is not defined, a directory which bsfilter is located at is used
1900
1901 --config-file file
1902 specify the name of the bsfilter\'s configuration file
1903 "bsfilter.conf" in bsfilter\'s home directory is used by default
1904
1905 --max-line number
1906 check and/or study the first number of lines
1907 default is #{Default_max_line}. 0 means all
1908
1909 --db sdbm|gdbm|bdb1|bdb|qdbm
1910 specify the name of database type
1911 "sdbm" by default
1912
1913 --jtokenizer|-j bigram|block|mecab|chasen|kakasi
1914 specify algorithm of a tokenizer for Japanese language
1915 "bigram" by default
1916
1917 --list-clean
1918 print filename of clean mail
1919
1920 --list-spam
1921 print filename of spam
1922
1923 --imap
1924 access IMAP server
1925
1926 --imap-server hostname
1927 specify hostname of IMAP server
1928
1929 --imap-port number
1930 specify port number of IMAP server. default is #{Default_imap_port}
1931
1932 --imap-auth method
1933 specify authorization method. default is "auto"
1934 "cram-md5" use "AUTHENTICATE CRAM-MD5" command
1935 "login" use "AUTHENTICATE LOGIN" command
1936 "loginc" use "LOGIN" command
1937 "auto" try #{Default_imap_auth_preference.join(', ')} in this order.
1938
1939 --imap-user name
1940 specify user name of IMAP server
1941
1942 --imap-password password
1943 specify password of imap-user
1944
1945 --imap-folder-clean folder
1946 specify destination folder for clean mails. "inbox.clean" for example
1947
1948 --imap-folder-spam folder
1949 specify destination folder for spams. "inbox.spam" for example
1950
1951 --imap-fetch-unseen
1952 filter or study mails without SEEN flag
1953
1954 --imap-fetch-unflagged
1955 filter or study mails without "X-Spam-Flag" header
1956
1957 --imap-reset-seen-flag
1958 reset SEEN flag when bsfilter moves or modifies mails
1959
1960 --pop
1961 work as POP proxy
1962
1963 --pid-file file
1964 specify filename for logging process ID of bsfilter
1965 "bsfilter.pid" in bsfilter\'s home directory is used by default
1966 this function is valid when "--pop" is specified
1967
1968 --tasktray
1969 sit in tasktray
1970 this is valid with "--pop" on VisualuRuby
1971
1972 --pop-server hostname
1973 specify hostname of POP server
1974
1975 --pop-port number
1976 specify port number of POP server. default is #{Default_pop_port}
1977
1978 --pop-proxy-if address
1979 specify address of interface which bsfilter listens at
1980 default is 0.0.0.0 and all interfaces are active
1981
1982 --pop-proxy-port number
1983 specify port number which bsfilter listens at. default is #{Default_pop_proxy_port}
1984
1985 --pop-user name
1986 optional. specify username of POP server.
1987 bsfilter checks match between value of this options and a name which MUA sends.
1988 in case of mismatch, bsfilter closes sockets.
1989
1990 --pop-proxy-set set[,set...]
1991 specify rules of pop proxy.
1992 alternative way of pop-server, pop-port, pop-proxy-port and pop-user option.
1993 format of "set" is "pop-server:pop-port:[proxy-interface]:proxy-port[:pop-user]"
1994 If proxy-interface is specified and isn\'t 0.0.0.0 , other interfaces are not used.
1995 "--pop-proxy-set 192.168.1.1:110::10110" is equivalent with
1996 "--pop-server 192.168.1.1 --pop-port 110 --pop-proxy-port 10110"
1997
1998 --pop-max-size number
1999 When mail is longer than the specified number, the mail is not filtered.
2000 When 0 is specified, all mails are tested and filtered.
2001 unit is byte. default is #{Default_pop_max_size}
2002
2003 --ssl
2004 use POP over SSL with --pop option
2005 use IMAP over SSL with --imap option
2006
2007 --ssl-cert filename|dirname
2008 specify a filename of a certificate of a trusted CA or
2009 a name of a directory of certificates
2010
2011 --method|-m g|r|rf
2012 specify filtering method. "rf" by default
2013 "g" means Paul Graham method,
2014 "r" means Gary Robinson method,
2015 and "rf" means Robinson-Fisher method
2016
2017 --spam-cutoff number
2018 specify spam-cutoff value
2019 0.9 by default for Paul Graham method
2020 0.582 by default for Gary Robinson method
2021 0.95 by default for Robinson-Fisher method
2022
2023 --auto-update|-a
2024 recognize mails, add them into clean or spam token database
2025 and update the probability table
2026
2027 --disable-degeneration|-D
2028 disable degeneration during probability table lookup
2029
2030 --disable-utf-8
2031 disable utf-8 support
2032
2033 --refer-header header[,header...]
2034 refer specified headers of mails
2035 "#{Default_refer_header}"
2036 by default
2037
2038 --ignore-header|-H
2039 ignore headers of mails
2040 same as --refer-header ""
2041
2042 --ignore-body|-B
2043 ignore body of mails, except URL or mail address
2044
2045 --ignore-plain-text-part
2046 ignore plain text part if html part is included in the mail
2047
2048 --ignore-after-last-atag
2049 ignore text after last "A" tag
2050
2051 --mark-in-token "characters"
2052 specify characters which are allowable in a token
2053 "#{Default_mark_in_token}" by default
2054
2055 --show-process
2056 show summary of execution
2057
2058 --show-new-token
2059 show tokens which are newly added into the token database
2060
2061 --mbox
2062 use "unix from" to divide mbox format file
2063
2064 --max-mail number
2065 reduce token database when the number of stored mails is larger than this one
2066 #{Default_max_mail} by default
2067
2068 --min-mail number
2069 reduce token database as if this number of mails are stored
2070 #{Default_min_mail} by default
2071
2072 --pipe
2073 write a mail to stdout.
2074 this options is invalid when "--imap" or "--pop" is specified
2075
2076 --insert-revision
2077 insert "X-#{Default_header_prefix}-Revision: bsfilter release..." into a mail
2078
2079 --insert-flag
2080 insert "X-#{Default_header_prefix}-Flag: Yes" or "X-#{Default_header_prefix}-Flag: No" into a mail
2081
2082 --insert-probability
2083 insert "X-#{Default_header_prefix}-Probability: number" into a mail
2084
2085 --header-prefix string
2086 valid with --insert-flag and/or --insert-probability option
2087 insert "X-specified_string-..." headers, instead of "#{Default_header_prefix}"
2088
2089 --mark-spam-subject
2090 insert "#{Default_spam_subject_prefix}" at the beginning of Subject header
2091
2092 --spam-subject-prefix string
2093 valid with --mark-spam-subject option
2094 insert specified string, instead of "#{Default_spam_subject_prefix}"
2095
2096 --show-db-status
2097 show numbers of tokens and mails in databases and quit
2098
2099 --help|-h
2100 help
2101
2102 --quiet|-q
2103 quiet mode
2104
2105 --verbose|-v
2106 verbose mode
2107
2108 --debug|-d
2109 debug mode
2110
2111 EXAMPLES
2112
2113 % bsfilter -s ~/Mail/spam/* ## add spam
2114 % bsfilter -u -c ~/Mail/job/* ~/Mail/private/* ## add clean mails and update probability table
2115 % bsfilter ~/Mail/inbox/1 ## show spam probability
2116
2117 ## recipe of procmail (1)
2118 :0 HB
2119 * ? bsfilter -a
2120 spam/.
2121
2122 ## recipe of procmail (2)
2123 :0 fw
2124 | bsfilter -a --pipe --insert-flag --insert-probability
2125
2126 :0
2127 * ^X-Spam-Flag: Yes
2128 spam/.
2129
2130 LICENSE
2131 this file is distributed under GPL version2 and might be compiled by Exerb with VisualuRuby
2132
2133 SEE ALSO
2134 http://bsfilter.org/
2135 http://sourceforge.jp/projects/bsfilter/
2136 http://exerb.sourceforge.jp/
2137 http://www.osk.3web.ne.jp/~nyasu/software/vrproject.html
2138 http://www.ruby-lang.org/
2139
2140 RELEASE
2141 #{Release}
2142
2143 REVISION
2144 #{Revision}
2145 EOM
2146 end
2147
## Reader that hands out one message at a time from an open file handle.
## Without the "mbox" option the whole input is one message; with it the
## input is divided at every line that starts with "From ".
class Mbox
  def initialize(options, fh)
    @buf = Array::new      # lines of the message being collected (mbox mode)
    @options = options
    @fh = fh
  end

  ## Return the next message as an Array of lines, or nil when the input
  ## is exhausted.
  def read
    if (! @options["mbox"])
      return nil if (@fh.eof?)
      lines = @fh.readlines
      # input that uses a bare CR as line terminator is read as one single
      # "line": split it at every CR
      if ((lines.length == 1) && (lines.last =~ /\r\z/))
        return lines.last.scan(/.*?\r/)
      end
      return lines
    end

    ## reg_ufrom = Regexp::compile('^From .*@.* \d{2}:\d{2}:\d{2} ')
    reg_ufrom = Regexp::compile('^From ')
    while (str = @fh.gets)
      # a "From " line closes the message collected so far, if any
      if ((str =~ reg_ufrom) && (! @buf.empty?))
        ret_buf = @buf
        @buf = [str]
        return ret_buf
      end
      @buf.push(str)
    end

    # end of input: return what is pending.  nil (not an empty message) is
    # returned when nothing is pending, so an empty input yields no message
    # at all, exactly as in non-mbox mode.
    ret_buf = @buf
    @buf = Array::new
    return ret_buf.empty? ? nil : ret_buf
  end
end
2189
## Apply one tokenized mail (db) to the clean/spam token databases of its
## language, as selected by the add-*/sub-*/import-* entries of command
## (the global options by default).
def update_token_db_one(db, command=@options)
  # one letter per active add/sub command, "-" when there is none
  letters = [["add-clean", "c"], ["add-spam", "s"], ["sub-clean", "C"], ["sub-spam", "S"]]
  maintenance_command = letters.map {|(name, letter)| command[name] ? letter : ""}.join
  maintenance_command = "-" if (maintenance_command.empty?)

  show_process(db, maintenance_command) if (@options["show-process"])

  if (command["add-clean"] || command["import-clean"])
    clean = @db_hash[db.language].clean
    clean.show_new_token(db) if (@options["show-new-token"])
    clean.add_db(db)
  end
  if (command["add-spam"] || command["import-spam"])
    spam = @db_hash[db.language].spam
    spam.show_new_token(db) if (@options["show-new-token"])
    spam.add_db(db)
  end
  @db_hash[db.language].clean.sub_db(db) if (command["sub-clean"])
  @db_hash[db.language].spam.sub_db(db) if (command["sub-spam"])
end
2215
## Read an exported token database from fh.
## Each data line holds four white-space separated fields:
##   language category token value
## Lines starting with "#" and blank lines are ignored.
## Returns a DBHash that maps every language of Languages to a TokenDB.
def read_exported_text(fh)
  dbs = DBHash::new
  Languages.each do |lang|
    dbs[lang] = TokenDB::new(lang)
    dbs[lang].time = Time::new
  end

  while (str = fh.gets)
    str.chomp!
    next if (str =~ /^\s*#/)     # comment
    next if (str =~ /\A\s*\z/)   # blank line: nothing to import

    (lang, category, token, val) = str.split
    val = val.to_f.to_i
    if (category == ".internal")
      # of the internal entries only the mail counter is imported
      if (token == "file_count")
        dbs[lang].file_count = dbs[lang].file_count + val
      end
    else
      dbs[lang].add_scalar(category, token, val)
      # NOTE(review): presumably undoes a file_count change made by
      # add_scalar, so that only ".internal file_count" lines count mails;
      # confirm against TokenDB#add_scalar
      dbs[lang].file_count = dbs[lang].file_count - 1
    end
  end
  return dbs
end
2240
## Maintenance mode: tokenize the given mails and add them to / subtract
## them from the token databases.
## files holds IMAP mailbox names when the "imap" option is set, file names
## otherwise.  Returns the TokenDBs of the mails read from files (used for
## an incremental probability update); the list is emptied when a database
## was reduced by check_size.
def update_token_dbs(files)
  dbs = Array::new
  Languages.each do |lang|
    @db_hash[lang].clean.open("rw")
    @db_hash[lang].spam.open("rw")
  end

  if (@options["imap"])
    if (@options["ssl"])
      # the peer is verified only when a certificate is configured
      verify_mode = @options["ssl-cert"] ? OpenSSL::SSL::VERIFY_PEER : nil
      imap = Net::IMAP::new(@options["imap-server"], @options["imap-port"], @options["ssl"], @options["ssl-cert"], verify_mode)
    else
      imap = Net::IMAP::new(@options["imap-server"], @options["imap-port"])
    end
    imap.auto_authenticate(@options, @options["imap-auth"], @options["imap-user"], @options["imap-password"], @options["imap-auth-preference"])

    files.each do |mailbox|
      # mails stay in their mailbox unless a destination folder is configured
      destination = mailbox
      destination = @options["imap-folder-clean"] if (@options["add-clean"] && @options["imap-folder-clean"])
      destination = @options["imap-folder-spam"] if (@options["add-spam"] && @options["imap-folder-spam"])

      imap_get_target_uids(imap, mailbox).each do |uid|
        message = IMAPMessage::new(@options, imap, uid)
        message.fetch_rfc822
        db = tokenize_buf(message.buf)
        db.filename = uid
        update_token_db_one(db)

        rewritten = message.insert_rfc822_headers!((@options["add-spam"] || @options["sub-clean"]), nil)
        if (rewritten)
          message.append(destination)
        elsif (destination != mailbox)
          message.copy(destination)
        else
          next                            # nothing changed, nothing to move
        end
        message.set_delete_flag
      end
      imap.close
    end
    imap.logout
  else
    files.each do |file|
      open_ro(file) do |fh|
        if (@options["import-clean"] || @options["import-spam"])
          read_exported_text(fh).each do |_lang, db|
            update_token_db_one(db)
          end
        else
          mbox = Mbox::new(@options, fh)
          while (buf = mbox.read)
            db = tokenize_buf(buf)
            db.filename = file
            dbs.push(db)
            if (@options["pipe"])
              insert_headers!(buf, (@options["add-spam"] || @options["sub-clean"]), nil)
              @options["pipe-fh"].print buf
            end
            update_token_db_one(db)
          end
        end
      end
    end
  end

  slimed = false
  Languages.each do |lang|
    clean_reduced = @db_hash[lang].clean.check_size(@options["max-mail"], @options["min-mail"])
    spam_reduced = @db_hash[lang].spam.check_size(@options["max-mail"], @options["min-mail"])
    slimed = true if (clean_reduced || spam_reduced)
    @db_hash[lang].clean.close
    @db_hash[lang].spam.close
  end
  dbs.clear if (slimed)                   # disable incremental
  return dbs
end
2319
## Auto-update: add every judged mail of token_dbs to the spam or the clean
## token database according to its spam_flag, then update the probability
## tables of the languages involved.
## token_dbs is emptied when a database was reduced by check_size, which
## turns the probability update into a full one.
def auto_update(token_dbs)
  langs = token_dbs.map {|token_db| token_db.language}.uniq

  langs.each do |lang|
    @db_hash[lang].clean.open("rw")
    @db_hash[lang].spam.open("rw")
  end

  command = Hash::new
  ["sub-clean", "sub-spam", "import-clean", "import-spam"].each do |name|
    command[name] = false
  end

  token_dbs.each do |token_db|
    is_spam = token_db.spam_flag ? true : false
    command["add-clean"] = (! is_spam)
    command["add-spam"] = is_spam
    update_token_db_one(token_db, command)
  end

  slimed = false
  langs.each do |lang|
    clean_reduced = @db_hash[lang].clean.check_size(@options["max-mail"], @options["min-mail"])
    spam_reduced = @db_hash[lang].spam.check_size(@options["max-mail"], @options["min-mail"])
    slimed = true if (clean_reduced || spam_reduced)
  end
  token_dbs.clear if (slimed)             # can't use incremental mode

  langs.each do |lang|
    @db_hash[lang].update_probability(token_dbs)
  end

  langs.each do |lang|
    @db_hash[lang].clean.close
    @db_hash[lang].spam.close
  end
end
2363
## Read a configuration file and return its contents as a list of command
## line arguments.
## Each non-blank line that is not a comment ("#") has the form
##   name [value]
## and yields "--name" followed by the value, if there is one.  The value
## is the rest of the line and may contain white space.
def read_config_file(file)
  configs = Array::new

  # File.open, not Kernel#open: the name must always be taken as a path,
  # never as a "|command" to be executed
  File.open(file) do |fh|
    while (str = fh.gets)
      next if ((str =~ /\A\s*#/) || (str =~ /\A\s*\z/))

      str.chomp!
      str.sub!(/\s+\z/, '')
      str.sub!(/\A\s+/, '')
      tokens = str.split(/\s+/, 2)
      if (! tokens.empty?)
        tokens[0] = "--" + tokens[0]
        configs.concat(tokens)
      end
    end
  end
  return configs
end
2384
# Select a mailbox and return the UIDs of the messages to be filtered.
#
# imap::    object responding to +select+ and +uid_search+.
# mailbox:: mailbox name, optionally followed by "/" and a search key
#           (e.g. a sequence set) that is handed to +uid_search+.
#
# With the "imap-fetch-unseen" option only UNSEEN messages are searched.
# With "imap-fetch-unflagged", messages for which a HEADER search on the
# spam-flag header name succeeds are removed from the result.
#
# Returns an array of UIDs.
def imap_get_target_uids(imap, mailbox)
  seqs = nil
  if (mailbox =~ /(.*)\/(.*)/)
    mailbox = $1
    seqs = $2
  end
  imap.select(mailbox)

  criteria = Array::new
  criteria.push("UNSEEN") if (@options["imap-fetch-unseen"])
  criteria.push(seqs) if (seqs)
  criteria.push("ALL") if (criteria.empty?)
  uids = imap.uid_search(criteria)

  if (@options["imap-fetch-unflagged"])
    header = x_spam_flag.sub(/:$/, '')
    # one search is enough; its result is reused for the message and the filter
    flagged = imap.uid_search(["HEADER", header, ""])
    if (@options["verbose"])
      # the Yes/No breakdown is only reported, so only query it when reporting
      yes = imap.uid_search(["HEADER", header, "Yes"])
      no = imap.uid_search(["HEADER", header, "No"])
      @options["message-fh"].printf("imap-fetch-unflagged working original %d null %d Yes %d No %d\n",
                                    uids.length, flagged.length, yes.length, no.length)
    end
    uids = uids - flagged
    @options["message-fh"].printf("imap-fetch-unflagged worked %d\n",
                                  uids.length) if (@options["verbose"])
  end
  return uids
end
2419
# One message on an IMAP server, addressed by UID, together with a local
# line-by-line copy of its RFC822 text.
class IMAPMessage
  include Bsutil

  attr_accessor :seqno, :uid, :imap, :buf, :seen

  # options:: option hash of the running filter
  # imap::    connection object used for every server operation
  # uid::     UID of the message (may be set later)
  def initialize(options, imap, uid=nil)
    @options, @imap, @uid = options, imap, uid
    @seqno = nil
    @seen = nil
    @buf = Array::new
  end

  # Fetch text and flags of the message.  The text is stored in +buf+
  # split at "\n".  If the message was not flagged \Seen, the \Seen flag
  # is removed again afterwards.
  def fetch_rfc822
    message = @imap.uid_fetch(@uid, ["RFC822", "FLAGS"])[0]
    @seqno = message.seqno
    @buf = message.attr["RFC822"].split("\n")
    @seen = message.attr["FLAGS"].include?(:Seen)
    @imap.uid_store(@uid, "-FLAGS", [:Seen]) unless (@seen)
  end

  # Delegate to insert_headers! on the local copy; returns its result.
  def insert_rfc822_headers!(*args)
    return insert_headers!(@buf, *args)
  end

  # Delegate to insert_header! on the local copy.
  def insert_rfc822_header!(header, content)
    insert_header!(@buf, header, content)
  end

  # Upload the local copy to +mailbox+.  Every line is terminated with
  # CRLF first; the \Seen flag is carried over.
  def append(mailbox)
    @buf.map! { |line| line.sub(/[\r\n]*\z/, "\r\n") }
    flags = @seen ? [:Seen] : []
    @imap.append(mailbox, @buf.join, flags)
  end

  # Server-side copy of the message to +mailbox+.
  def copy(mailbox)
    @imap.uid_copy(@uid, mailbox)
  end

  # Mark the message \Deleted.
  def set_delete_flag
    @imap.uid_store(@uid, "+FLAGS", [:Deleted])
  end

  # Forget the \Seen state locally and remove the flag on the server.
  def reset_seen_flag
    @seen = false
    @imap.uid_store(@uid, "-FLAGS", [:Seen])
  end
end # end of class IMAPMessage
2480
# Send an optional POP3 command and collect the reply.
#
# command:: command line to send, or nil to only read one line
# socket::  object responding to +write_timeout+ and +gets_timeout+
#
# Returns an array of the lines read.  The first element is nil when the
# peer has closed the connection.  For a "+OK" reply to RETR, TOP, CAPA,
# or to UIDL/LIST without a message number, the following lines up to
# and including the terminating ".\r\n" are collected as well.
def socket_send_rec(command, socket)
  buf = Array::new
  if (command)
    @options["message-fh"].printf("send %s %s", socket, command.sub(/\APASS.*/i, "PASS ********")) if (@options["debug"])
    socket.write_timeout(command) # pass command to pop-server
  end
  response = socket.gets_timeout # get response from pop-server
  buf.push(response)
  return buf if (! response) # connection closed; nothing more to read
  @options["message-fh"].printf("resp %s %s", socket, response.sub(/\APASS.*/i, "PASS ********")) if (@options["debug"])

  # The keyword must be at the start of the line: an argument such as
  # "PASS topsecret" or "USER list" is not a multi-line command.
  if ((response =~ /\A\+OK/) &&
      ((command =~ /\A\s*(RETR|TOP|CAPA)\b/i) ||
       (command =~ /\A\s*(UIDL|LIST)[^\d]*\z/i)))
    while (response != ".\r\n")
      response = socket.gets_timeout
      break if (! response) # server went away in the middle of the body
      buf.push(response)
    end
  end
  return buf
end
2500
# Start one proxy thread per proxy set and wait for all of them.
#
# pop_proxy_sets:: comma separated list of
#                  "server:port:proxy_if:proxy_port:user"; missing or
#                  empty port, interface and proxy port fall back to the
#                  Default_pop_* constants.
#
# A SIGINT handler is installed that kills the threads kept in @threads.
# Returns 0 after every thread has been joined.
def pop_proxy_multi(pop_proxy_sets)
  trap("SIGINT") do
    @options["message-fh"].printf("SIGINT received\n") if (@options["verbose"])
    @threads.each { |thread| Thread::kill(thread) } # kill child threads
  end

  pop_proxy_sets.split(/,/).each do |pop_proxy_set|
    server, port, proxy_if, proxy_port, user = pop_proxy_set.split(/:/)
    port = Default_pop_port if (port.nil? || port == '')
    proxy_if = Default_pop_proxy_if if (proxy_if.nil? || proxy_if == '')
    proxy_port = Default_pop_proxy_port if (proxy_port.nil? || proxy_port == '')
    child = Thread::start do # start child threads
      pop_proxy_one(server, port, proxy_if, proxy_port, user)
    end
    @threads.push(child)
  end

  @threads.each { |child| child.join } # join child threads

  Thread::list.each do |thread| # join grandchild threads
    thread.join unless (thread == Thread::current)
  end
  return 0
end
2528
# Relay a RETR for a message that is too large to be filtered: the
# command goes to the server and the reply is forwarded to the client
# line by line without being inspected.
#
# command::          the RETR line received from the client
# pop_socket::       connection to the POP3 server
# pop_proxy_socket:: connection to the mail client
#
# Both sockets must respond to +write_timeout+ and +gets_timeout+.
def pop_bypass_large_mail(command, pop_socket, pop_proxy_socket)
  pop_socket.write_timeout(command) # RETR to server
  str = pop_socket.gets_timeout # response from server
  return if (! str) # server closed the connection
  pop_proxy_socket.write_timeout(str) # forward
  # only a positive reply is followed by a message body ("-ERR" is not)
  return if (str !~ /\A\+OK/i)

  while (str != ".\r\n")
    str = pop_socket.gets_timeout
    break if (! str) # server went away in the middle of the body
    pop_proxy_socket.write_timeout(str) # forward
  end
  return
end
2542
# Extract message sizes from the reply to a POP3 LIST command.
#
# strs:: array of reply lines as returned by socket_send_rec
#
# Handles the single-line form ("+OK <number> <size>") as well as the
# multi-line listing ("<number> <size>" per line).
#
# Returns a DBHash mapping message number (String) to size (Integer).
def snoop_list_response(strs)
  h = DBHash::new
  # status indicators are upper case on the wire, hence the /i
  if (strs[0] =~ /\A\+OK\s*(\d+)\s+(\d+)/i)
    h[$1] = $2.to_i
  end
  # listing lines, if any, take precedence over the status line
  strs.each do |str|
    if (str =~ /^(\d+)\s+(\d+)/)
      h[$1] = $2.to_i
    end
  end
  return h
end
2556
# Run one POP3 proxy: listen on pop_proxy_if:pop_proxy_port and relay
# every accepted client to pop_server:pop_port, each in its own thread.
#
# pop_server, pop_port::           upstream POP3 server
# pop_proxy_if, pop_proxy_port::   local interface and port to listen on
# pop_user::                       if set, only this user name is accepted
#
# Replies to TOP and RETR are tokenized, judged and get the bsfilter
# headers inserted before they are returned to the client.  RETR of a
# message larger than the "pop-max-size" option (sizes are learned from
# LIST replies) is relayed unfiltered.  This method loops forever.
def pop_proxy_one(pop_server, pop_port, pop_proxy_if, pop_proxy_port, pop_user)
  gs = TCPServer.open(pop_proxy_if, pop_proxy_port)
  addr = gs.addr
  addr.shift
  @options["message-fh"].printf("pop_proxy is on %s\n", addr.join(":")) if (@options["verbose"])
  while true
    Thread::start(gs.accept) do |pop_proxy_socket| # start grandchild threads
      @options["message-fh"].print(pop_proxy_socket, " is accepted\n") if (@options["verbose"])
      begin
        pop_socket = nil
        Timeout.timeout(SOCKET_TIMEOUT) do
          pop_socket = TCPSocket.open(pop_server, pop_port)
        end
        @options["message-fh"].print(pop_socket, " is connected\n") if (@options["verbose"])

        pop_socket = get_ssl_socket(pop_socket, @options["ssl-cert"]) if (@options["ssl"])

        # forward the server greeting, tagged with the proxy's name
        hello = socket_send_rec(nil, pop_socket)[0]
        hello.sub!(/(.*)\r/, "\\1(pop_proxy by bsfilter)\r")
        pop_proxy_socket.write(hello)

        sizes = DBHash::new # message number => size, learned from LIST
        while (command = socket_send_rec(nil, pop_proxy_socket)[0]) # get command from MUA
          if (command =~ /\ARETR\s+(\d+)/i)
            n = $1
            if (sizes[n] &&
                (0 < @options["pop-max-size"]) && (@options["pop-max-size"] < sizes[n]))
              pop_bypass_large_mail(command, pop_socket, pop_proxy_socket)
              next
            end
          end
          response = socket_send_rec(command, pop_socket)
          if (command =~ /\ALIST/i)
            sizes.update(snoop_list_response(response))
          elsif ((command =~ /\A(TOP|RETR)/i) && (response[0] =~ /\A\+OK/))
            buf = response[1..-1].dup
            token_db = tokenize_buf(buf)
            @db_hash[token_db.language].prob.open("r")
            @db_hash[token_db.language].get_combined_probability(token_db)
            @db_hash[token_db.language].prob.close
            if (@options["auto-update"])
              auto_update([token_db])
            elsif (@options["show-process"])
              show_process(token_db, "-")
            end
            @options["message-fh"].printf("combined probability %f\n", token_db.probability) if (@options["verbose"])
            insert_headers!(buf, token_db.spam_flag, token_db.probability)
            response[1..-1] = buf
          end
          # don't use elsif
          # QUIT must be the command itself, not part of an argument
          if (command =~ /\AQUIT\b/i)
            pop_proxy_socket.write(response) # return response to MUA
            break
          # POP3 keywords are case-insensitive; "user x" must be checked too
          elsif ((command =~ /\AUSER\s*(\S*)\r/i) &&
                 (pop_user && pop_user != $1))
            @options["message-fh"].printf("username unmatch error\n")
            pop_proxy_socket.write("-ERR unregistered user\r\n") # return response to MUA
            break
          else
            pop_proxy_socket.write(response) # return response to MUA
          end
        end
      rescue Timeout::Error
        @options["message-fh"].printf("Timeout error %s %s %s\n", pop_server, pop_port, pop_proxy_port) if (@options["verbose"])
      rescue
        @options["message-fh"].printf("pop exception caught %s %s %s\n", pop_server, pop_port, pop_proxy_port) if (@options["verbose"])
        p "#{$!}" if (@options["verbose"])
        p "#{$@}" if (@options["debug"])
      ensure
        # both ends are closed whatever happened above
        if (pop_proxy_socket && ! pop_proxy_socket.closed?)
          @options["message-fh"].print(pop_proxy_socket, " is gone\n") if (@options["verbose"])
          pop_proxy_socket.close
        end
        if (pop_socket && ! pop_socket.closed?)
          @options["message-fh"].print(pop_socket, " is gone\n") if (@options["verbose"])
          pop_socket.close
        end
      end
    end # thread end
  end
end
2638
# Fill in defaults for the POP proxy options and validate them.
#
# options:: option hash, modified in place.  "icon_number" and
#           "pop-max-size" are stored as integers.
#
# Loads the tasktray libraries when "tasktray" is set.  Unless
# "pop-proxy-set" is given, "pop-server" is mandatory.
#
# Raises RuntimeError when a mandatory option is missing.
def check_options_for_pop!(options)
  options["icon_number"] = (options["icon-number"] || Default_icon_number).to_i
  options["pop-port"] ||= Default_pop_port
  options["pop-proxy-if"] ||= Default_pop_proxy_if
  options["pop-proxy-port"] ||= Default_pop_proxy_port
  options["pop-max-size"] = (options["pop-max-size"] || Default_pop_max_size).to_i

  if (options["tasktray"])
    require('vr/vrcontrol')
    require('vr/vrtray')
  end

  missing = Array::new
  missing.push("pop-server") unless (options["pop-proxy-set"] || options["pop-server"])
  missing.each { |name| printf("specify %s\n", name) }

  raise "error found in pop options" unless (missing.empty?)
  return
end
2664
# Fill in the default IMAP port and make sure the mandatory IMAP options
# are present.
#
# options:: option hash, modified in place.
#
# Prints one "specify ..." line per missing option and raises
# RuntimeError if any of them is missing.
def check_options_for_imap!(options)
  options["imap-port"] ||= Default_imap_port

  mandatory = ["imap-server", "imap-auth", "imap-user", "imap-password"]
  missing = mandatory.reject { |name| options[name] }
  missing.each { |name| printf("specify %s\n", name) }

  raise "error found in imap options" unless (missing.empty?)
  return
end
2678
# Filter the messages of the given IMAP mailboxes.
#
# command_line_args:: mailbox specifications as accepted by
#                     imap_get_target_uids
# token_dbs::         array that receives the token database of every
#                     processed message
#
# Each message is fetched, tokenized and judged.  When header insertion
# changed the message it is re-uploaded (append) to the target mailbox;
# otherwise it is copied, but only if the target differs from the source.
# In both cases the original is flagged \Deleted.  The target is the
# "imap-folder-spam" / "imap-folder-clean" option if set, else the
# source mailbox.
#
# Returns CODE_SPAM if at least one message was judged spam, otherwise
# CODE_CLEAN.
def do_imap(command_line_args, token_dbs)
  ret_code = CODE_CLEAN
  server = @options["imap-server"]
  port = @options["imap-port"]
  if (@options["ssl"])
    verify_mode = @options["ssl-cert"] ? OpenSSL::SSL::VERIFY_PEER : nil
    imap = Net::IMAP::new(server, port, @options["ssl"], @options["ssl-cert"], verify_mode)
  else
    imap = Net::IMAP::new(server, port)
  end
  imap.auto_authenticate(@options, @options["imap-auth"], @options["imap-user"], @options["imap-password"], @options["imap-auth-preference"])

  # only for check: selecting fails early if a target folder is unusable
  ["imap-folder-clean", "imap-folder-spam"].each do |key|
    imap.select(@options[key]) if (@options[key])
  end

  command_line_args.each do |mailbox|
    imap_get_target_uids(imap, mailbox).each do |uid|
      message = IMAPMessage::new(@options, imap, uid)
      message.fetch_rfc822
      token_db = tokenize_buf(message.buf)
      token_db.filename = uid
      @db_hash[token_db.language].get_combined_probability(token_db)
      token_dbs.push(token_db)
      @options["message-fh"].printf("combined probability %s %d %f\n", mailbox, message.seqno, token_db.probability) if (@options["verbose"])

      if (token_db.spam_flag)
        destination = @options["imap-folder-spam"] || mailbox
        ret_code = CODE_SPAM
      else
        destination = @options["imap-folder-clean"] || mailbox
      end

      rewritten = message.insert_rfc822_headers!(token_db.spam_flag, token_db.probability)
      next unless (rewritten || (destination != mailbox))

      message.reset_seen_flag if (@options["imap-reset-seen-flag"])
      if (rewritten)
        message.append(destination)
      else
        message.copy(destination)
      end
      message.set_delete_flag
    end
    imap.close
  end
  imap.logout
  return ret_code
end
2730
2731
# Dump token databases in export format.
#
# command_line_args:: the first element names the output file; "-" is
#                     used when the array is empty.
#
# For each of the options "export-clean", "export-spam" and
# "export-probability" that is set, the output is opened with open_wo
# and the matching database of every language is exported, provided its
# file_count is positive.
def do_export(command_line_args)
  file = command_line_args.empty? ? "-" : command_line_args[0]

  exports = [["export-clean", :clean],
             ["export-spam", :spam],
             ["export-probability", :prob]]

  result = nil
  exports.each do |option, table|
    result = nil
    next if (! @options[option])
    result = open_wo(file) do |fh|
      Languages.each do |lang|
        db = @db_hash[lang].send(table)
        db.open("r")
        db.export(fh) if (db.file_count > 0)
        db.close
      end
    end
  end
  return result
end
2766
# Add Net::IMAP#auto_authenticate, which picks the login method.
#
# auto_authenticate(options, auth, user, password, auth_list=[])
#   auth "loginc":: use the LOGIN command
#   auth "auto"::   walk auth_list in order and use the first entry that
#                   is either "loginc" or advertised by the server as
#                   "AUTH=<ENTRY>"; raises RuntimeError if none fits
#   anything else:: AUTHENTICATE with that mechanism
#   With options["verbose"] the chosen method is reported on
#   options["message-fh"].
def setup_imap
  Net::IMAP.class_eval do
    def auto_authenticate(options, auth, user, password, auth_list=[])
      case auth.downcase
      when "loginc"
        if (options["verbose"])
          options["message-fh"].printf("try to login imap server for %s with login command\n", user)
        end
        return login(user, password)
      when "auto"
        capa = capability
        # block variable deliberately not called "auth": it must not
        # shadow or clobber the method argument
        auth_list.each do |candidate|
          if (candidate == "loginc")
            return auto_authenticate(options, "loginc", user, password)
          elsif (capa.include?("AUTH=" + candidate.upcase))
            return auto_authenticate(options, candidate, user, password)
          end
        end
        raise sprintf("can't login imap server for %s with %s", user, auth_list.join(","))
      else
        if (options["verbose"])
          options["message-fh"].printf("try to login imap server for %s with authenticate %s\n", user, auth)
        end
        return authenticate(auth, user, password)
      end
    end
  end
end
2795
# Add TCPSocket#write_timeout and TCPSocket#gets_timeout: +write+ and
# +gets+ wrapped in a timeout of SOCKET_TIMEOUT seconds.  Each returns
# the result of the wrapped call.
def setup_socket_timeout
  TCPSocket.class_eval do
    # Timeout.timeout is spelled out: the bare Kernel-level +timeout+ is
    # only a legacy alias.
    def write_timeout(str)
      Timeout.timeout(SOCKET_TIMEOUT) do
        return self.write(str)
      end
    end

    def gets_timeout
      Timeout.timeout(SOCKET_TIMEOUT) do
        return self.gets
      end
    end
  end
end
2810
# Add write_timeout and gets_timeout to OpenSSL::SSL::SSLSocket: +write+
# and +gets+ wrapped in a timeout of SOCKET_TIMEOUT seconds.  Each
# returns the result of the wrapped call.
def setup_ssl_socket_timeout
  OpenSSL::SSL::SSLSocket.class_eval do
    # Timeout.timeout is spelled out: the bare Kernel-level +timeout+ is
    # only a legacy alias.
    def write_timeout(str)
      Timeout.timeout(SOCKET_TIMEOUT) do
        return self.write(str)
      end
    end

    def gets_timeout
      Timeout.timeout(SOCKET_TIMEOUT) do
        return self.gets
      end
    end
  end
end
2825
# Wrap an already connected socket in SSL and perform the handshake.
#
# socket:: connected socket
# cert::   optional path.  A file is used as ca_file, a directory as
#          ca_path; whenever cert is given, peer verification is enabled.
#
# Returns the connected OpenSSL::SSL::SSLSocket.
def get_ssl_socket(socket, cert=nil)
  context = OpenSSL::SSL::SSLContext::new()

  if (cert)
    if (FileTest::file?(cert))
      @options["message-fh"].print(cert, " is used for SSL ca_file\n") if (@options["verbose"])
      context.ca_file = cert
    elsif (FileTest::directory?(cert))
      @options["message-fh"].print(cert, " is used for SSL ca_path\n") if (@options["verbose"])
      context.ca_path = cert
    end
    context.verify_mode = OpenSSL::SSL::VERIFY_PEER
  end
  ssl = OpenSSL::SSL::SSLSocket::new(socket, context)
  ssl.connect
  # report on the message handle like every other message, not on stdout
  @options["message-fh"].print(ssl, " is connected\n") if (@options["verbose"])
  return ssl
end
2844
# Show a tray icon with an "exit" popup menu and run the GUI message
# loop.  The form class is defined through eval, so VRForm and the other
# GUI constants are only needed once this method runs.  After the
# message loop returns, the threads kept in @threads are killed.
def setup_tasktray
  eval <<EOM
class MyForm < VRForm
  include VRTrayiconFeasible
  include VRMenuUseable
  LoadIcon = Win32API.new("user32", "LoadIcon", "II", "I")

  def construct
    @traymenu = newPopupMenu
    @traymenu.set([
      ["exit", "exit"]
    ])
    @mytrayicon=0
  end
  def self_trayrbuttonup(iconid)
    showPopup @traymenu
  end
  def into_trayicon(icon_number)
    create_trayicon(LoadIcon.call(0, icon_number),
      "bsfilter release #{Release} revision #{Revision}", @mytrayicon)
    myexstyle = self.exwinstyle
    myexstyle.ws_ex_toolwindow = true
    myexstyle.ws_ex_appwindow = false
  end

  def exit_clicked
    delete_trayicon(@mytrayicon)
    self.close
  end
end
EOM
  form = VRLocalScreen.newform(nil, nil, MyForm)
  form.create
  form.into_trayicon(@options["icon_number"])
  VRLocalScreen.messageloop

  # the message loop has ended: kill child threads
  @threads.each { |child| Thread::kill(child) }
end
2884
# Entry point of POP proxy mode.
#
# Exceptions in any thread abort the process.  The tray icon, if
# requested, runs in its own thread.  The proxy sets come from the
# "pop-proxy-set" option (whitespace removed) or are assembled from the
# individual pop-* options.
#
# Returns the value of pop_proxy_multi.
def do_pop
  Thread.abort_on_exception = true
  log = @options["message-fh"]
  log.print("pop mode start ", Time::new.to_s, "\n") if (@options["verbose"])

  Thread::start { setup_tasktray } if (@options["tasktray"])

  if (@options["pop-proxy-set"])
    pop_proxy_sets = @options["pop-proxy-set"].gsub(/\s/, '')
  else
    fields = ["pop-server", "pop-port", "pop-proxy-if", "pop-proxy-port", "pop-user"]
    pop_proxy_sets = fields.map { |key| @options[key] }.join(":")
  end
  ret_code = pop_proxy_multi(pop_proxy_sets)

  # never reached
  log.print("pop mode end ", Time::new.to_s, "\n") if (@options["verbose"])
  return ret_code
end
2907
# Write the id of the current process, followed by a newline, to +file+.
# An existing file is overwritten.
def write_pid_file(file)
  # File.open, not Kernel#open: a name starting with "|" must be taken
  # as a file name, never as a command.
  File.open(file, "w") do |fh|
    fh.print Process::pid, "\n"
  end
end
2913
# Parse the options at the front of ARGV (parsing stops at the first
# non-option argument; parsed options are removed from ARGV).
#
# Returns a DBHash mapping option names without the leading "--" to
# their argument ("" for options that take none).  "pop-proxy-set" may
# be given several times; its values are joined with ",".
#
# On a parse error the usage is shown and RuntimeError is raised with
# the parser's error message.
def parse_command_line
  required = GetoptLong::REQUIRED_ARGUMENT
  flag = GetoptLong::NO_ARGUMENT

  parser = GetoptLong.new
  parser.ordering = GetoptLong::REQUIRE_ORDER
  parser.set_options(
    ["--icon-number", required],
    ["--ssl", flag],
    ["--ssl-cert", required],
    ["--pop", flag],
    ["--tasktray", flag],
    ["--pop-proxy-set", required],
    ["--pop-server", required],
    ["--pop-port", required],
    ["--pop-proxy-if", required],
    ["--pop-proxy-port", required],
    ["--pop-user", required],
    ["--pop-max-size", required],
    ["--imap", flag],
    ["--imap-server", required],
    ["--imap-port", required],
    ["--imap-auth", required],
    ["--imap-user", required],
    ["--imap-password", required],
    ["--imap-folder-clean", required],
    ["--imap-folder-spam", required],
    ["--imap-fetch-unseen", flag],
    ["--imap-fetch-unflagged", flag],
    ["--imap-reset-seen-flag", flag],
    ["--homedir", required],
    ["--config-file", required],
    ["--pid-file", required],
    ["--db", required],
    ["--max-line", required],
    ["--export-clean", flag],
    ["--export-spam", flag],
    ["--export-probability", flag],
    ["--import-clean", flag],
    ["--import-spam", flag],
    ["--mbox", flag],
    ["--jtokenizer", "-j", required],
    ["--method", "-m", required],
    ["--spam-cutoff", required],
    ["--mark-in-token", required],
    ["--max-mail", required],
    ["--min-mail", required],
    ["--show-new-token", flag],
    ["--auto-update", "-a", flag],
    ["--update", "-u", flag],
    ["--add-clean", "-c", flag],
    ["--add-spam", "-s", flag],
    ["--sub-clean", "-C", flag],
    ["--sub-spam", "-S", flag],
    ["--disable-degeneration", "-D", flag],
    ["--disable-utf-8", flag],
    ["--ignore-body", "-B", flag],
    ["--refer-header", required],
    ["--ignore-header", "-H", flag],
    ["--ignore-plain-text-part", flag],
    ["--ignore-after-last-atag", flag],
    ["--pipe", flag],
    ["--insert-revision", flag],
    ["--insert-flag", flag],
    ["--insert-probability", flag],
    ["--header-prefix", required],
    ["--mark-spam-subject", flag],
    ["--spam-subject-prefix", required],
    ["--list-clean", flag],
    ["--list-spam", flag],
    ["--show-db-status", flag],
    ["--show-process", flag],
    ["--help", "-h", flag],
    ["--revision", flag],
    ["--quiet", "-q", flag],
    ["--debug", "-d", flag],
    ["--verbose", "-v", flag])
  parser.quiet = true

  # options that accumulate instead of being overwritten
  multi_valued = {"pop-proxy-set" => true}

  options = DBHash::new
  begin
    parser.each_option do |name, arg|
      key = name.sub(/^--/, '')
      if (options[key] && multi_valued[key])
        options[key] = options[key] + "," + arg
      else
        options[key] = arg.dup
      end
    end
  rescue
    usage
    raise parser.error_message
  end
  return options
end
3009
3010
# Build the complete option hash for this run.
#
# Steps, in order:
# * parse the command line;
# * determine the home directory (--homedir, $BSFILTERHOME,
#   $HOME/Default_homedir, the Exerb executable's directory, or the
#   script's directory);
# * if a configuration file exists, re-parse with its contents placed in
#   front of the original arguments;
# * handle --help and --revision (both exit);
# * validate and default the remaining options, loading the libraries
#   needed for the chosen db, tokenizer, pop/imap/ssl modes.
#
# Side effects: ARGV is consumed/rebuilt, @jtokenizer is set, libraries
# are required, socket classes get their *_timeout helpers.
#
# Returns the option hash.  Raises RuntimeError for a missing explicit
# config file and for unsupported method, db or jtokenizer values.
def get_options
  argv_backup = Marshal::load(Marshal::dump(ARGV)) # shallow copy is enough?
  options = parse_command_line

  # an explicitly named config file must exist
  if (options["config-file"] && (! File::file?(options["config-file"])))
    raise sprintf("can't find config file %s\n", options["config-file"])
  end

  if (! options["homedir"])
    if (ENV["BSFILTERHOME"])
      options["homedir"] = ENV["BSFILTERHOME"]
    elsif (ENV["HOME"])
      options["homedir"] = ENV["HOME"] + "/" + Default_homedir
    elsif (defined?(Exerb) && Exerb.runtime?)
      options["homedir"] = File.dirname(Exerb.filepath)
    else
      options["homedir"] = File.dirname($0)
    end
  end

  if (! options["config-file"])
    options["config-file"] = options["homedir"] + "/" + Default_conf_file
  end
  if (options["config-file"] && File::file?(options["config-file"]))
    # config file entries first, so that the command line overrides them
    ARGV.clear
    argv_config = read_config_file(options["config-file"])
    (argv_config + argv_backup).reverse.each do |argv|
      ARGV.unshift(argv)
    end
    options.update(parse_command_line)
  end

  if (options["help"])
    usage
    exit 0
  end
  if (options["revision"])
    print "bsfilter release #{Release} revision #{Revision}\n"
    exit 0
  end

  # exactly one trailing slash
  options["homedir"] = options["homedir"].sub(/\/*$/, '') + "/"

  if (options["method"])
    if (options["method"] !~ /\A(g|r|rf)\z/)
      usage
      raise sprintf("unsupported method %s\n", options["method"])
    end
  else
    options["method"] = Default_method
  end

  options["header-prefix"] = Default_header_prefix if (! options["header-prefix"])
  options["spam-subject-prefix"] = Default_spam_subject_prefix if (! options["spam-subject-prefix"])

  options["db"] = Default_db if (! options["db"])
  case options["db"]
  when "sdbm"
    require 'sdbm'
  when "gdbm"
    require 'gdbm'
  when "bdb1"
    require 'bdb1'
  when "bdb"
    require 'bdb'
  when "qdbm"
    require 'depot'
  else
    raise sprintf("unsupported db %s\n", options["db"])
  end

  if (options["jtokenizer"])
    options["jtokenizer"].downcase!
  else
    options["jtokenizer"] = Default_jtokenizer
  end
  case options["jtokenizer"]
  when "bigram"
  when "block"
  when "mecab"
    require 'MeCab'
  when "chasen"
    require 'chasen.o'
  when "kakasi"
    require 'kakasi'
  else
    raise sprintf("unsupported jtokenizer %s\n", options["jtokenizer"])
  end
  @jtokenizer = Jtokenizer::new(options["jtokenizer"])

  # non-destructive gsub: when the default is in use, the string is the
  # Default_mark_in_token constant itself and must not be modified
  options['mark-in-token'] = (options['mark-in-token'] || Default_mark_in_token).gsub(/\s/, '')
  options["max-line"] = (options["max-line"] || Default_max_line).to_i
  options["max-mail"] = (options["max-mail"] || Default_max_mail).to_i
  options["min-mail"] = (options["min-mail"] || Default_min_mail).to_i

  options["degeneration"] = options["disable-degeneration"] ? false : true

  # "refer-header" ends up as a hash: lower-case header name => true
  if (options["refer-header"])
    array = options["refer-header"].downcase.split(',')
  elsif (options["ignore-header"])
    array = Array::new
  else
    array = Default_refer_header.downcase.split(',')
  end
  options["refer-header"] = Hash::new
  array.each do |header|
    options["refer-header"][header] = true
  end

  options["use-body"] = options["ignore-body"] ? false : true

  options["pid-file"] = options["homedir"] + Default_pid_file if (! options["pid-file"])

  options["imap-auth"] = options["imap-auth"] || Default_imap_auth
  options["imap-auth-preference"] = Default_imap_auth_preference # can't modify with command line option

  if ((! options["disable-utf-8"]) &&
      safe_require("iconv"))
    options["utf-8"] = true
    define_safe_iconv if (! defined?(Iconv.safe_iconv))
  else
    options["utf-8"] = false
  end

  if (options["pop"])
    check_options_for_pop!(options)
    require 'timeout'
    require 'socket'
    setup_socket_timeout
  end
  if (options["imap"])
    check_options_for_imap!(options)
    require 'net/imap'
    setup_imap
  end
  if (options["ssl"])
    if (options["ssl-cert"])
      if (! File::readable?(options["ssl-cert"]))
        raise sprintf("can't read %s. check --ssl-cert option", options["ssl-cert"])
      end
    end
    require "openssl"
    setup_ssl_socket_timeout
  end
  return options
end
3158
3159 def show_db_status
3160 Languages.each do |lang|
3161 @db_hash[lang].clean.open("r")
3162 @db_hash[lang].spam.open("r")
3163 @db_hash[lang].prob.open("r")
3164 @options["