#! /usr/bin/env ruby
## -*-Ruby-*- $Id: bsfilter,v 1.78 2006/04/02 04:40:09 nabeken Exp $

## Copyright (C) 2003, 2004, 2005, 2006 NABEYA Kenichi
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software
## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
| 19 |
|
| 20 |
require 'getoptlong' |
| 21 |
require 'nkf' |
| 22 |
|
| 23 |
class Bsfilter |
| 24 |
## Give every instance variable its initial value: empty containers for
## @threads, @options and @db_hash, and nil for @token_dbs and @jtokenizer.
def initialize
  @threads = Array::new
  @token_dbs = nil
  @options = Hash::new
  @db_hash = Hash::new
  @jtokenizer = nil
end
| 31 |
## Public reader/writer for @token_dbs (nil until assigned).
attr_accessor :token_dbs
| 32 |
|
| 33 |
## Release and Revision are extracted from the CVS keyword strings.
## When "$Name$" carries no tag the release becomes "-".
Release = "$Name: $".split[1].sub(/\A[^\d]*/, '').gsub(/_/, '.')
Release.concat("-") if (Release == "")
Revision = "$Revision: 1.78 $".gsub(/[^\.\d]/, '')
Languages = ["C", "ja"]

## Options = Hash::new # used like a global variable
## DB = Hash::new

## Defaults for inserted headers and for the headers that are examined.
Default_header_prefix = "Spam"
Default_spam_subject_prefix = "[SPAM] "
Default_refer_header =
  ["Ufrom", "From", "To", "Cc", "Subject", "Reply-to", "Return-path", "Received",
   "Content-Transfer-Encoding", "Content-Type", "charset", "Content-Disposition"].join(",")

## Defaults for tokenizing and for files kept in the home directory.
Default_jtokenizer = "bigram"
Default_mark_in_token = "|!*'"
Default_homedir = ".bsfilter"
Default_conf_file = "bsfilter.conf"
Default_pid_file = "bsfilter.pid"

Default_method = "rf" # Robinson Fisher
Default_db = "sdbm"
Default_max_mail = 10000
Default_min_mail = 8000
Default_max_line = 500

## POP proxy defaults.
Default_pop_proxy_if = "0.0.0.0"
Default_pop_port = "110"
Default_pop_proxy_port = "10110"
Default_pop_max_size = 50000

## IMAP defaults.
Default_imap_port = "143"
Default_imap_auth = "auto"
Default_imap_auth_preference = ["cram-md5", "login", "loginc"]

Default_icon_number = 32512

## File name extensions for token files and their lock files.
Clean_ext = ".clean"
Spam_ext = ".spam"
Prob_ext = ".prob"
Lock_ext = ".lock"

## File name extensions, one per supported database backend.
SDBM_ext = ".sdbm"
GDBM_ext = ".gdbm"
BDB1_ext = ".bdb1"
BDB_ext = ".bdb"
QDBM_ext = ".qdbm"

## Result codes. Note that CODE_NORMAL and CODE_SPAM are both true.
EXIT_NORMAL = 0
CODE_NORMAL = true
CODE_SPAM = true
CODE_CLEAN = false

## Character set names.
CODESET_EUCJP = "eucJP"
CODESET_LATIN = "ISO8859-1"
CODESET_GB18030 = "GB18030"
CODESET_UTF8 = "UTF-8"
| 90 |
## Byte pattern: two consecutive 3-byte sequences whose lead byte is
## 0xe0-0xef and whose trailing bytes are 0x80-0xbf.
PATTERN_UTF8 = '[\xe0-\xef][\x80-\xbf][\x80-\xbf][\xe0-\xef][\x80-\xbf][\x80-\xbf]'

## Compile PATTERN_UTF8 as a byte-oriented ('n') regexp.
## The 'n' used to be passed in the options position, where it does not
## select the character code.  Interpreters that define
## Regexp::NOENCODING take that flag; older ones take 'n' as the third
## argument, as the other regexps in this file do.
RE_UTF8 = if (defined?(Regexp::NOENCODING))
            Regexp.new(PATTERN_UTF8, Regexp::NOENCODING)
          else
            Regexp.new(PATTERN_UTF8, nil, 'n')
          end
| 92 |
|
| 93 |
## Alternation of the HTML element names matched by RE_ALL_TAGS.
## Fixed entries: "span" (was "spam"), "sup" (was a second "sub") and
## "noframes" (was "nofrmaes").
ALL_TAGS = ["html", "head", "title", "meta", "body", "div", "span",
            "h1", "h2", "h3", "h4", "h5", "h6",
            "em", "strong", "font", "basefont", "big", "small",
            "b", "i", "s", "u", "tt", "sub", "sup",
            "rb", "rp", "rt", "ruby",
            "blink", "marquee",
            "dfn", "cite", "abbr", "acronym",
            "blockquote", "q",
            "br", "pre", "ins", "del", "center", "style", "hr",
            "ul", "ol", "li", "dl", "dt", "dd",
            "table", "caption", "thead", "tbody", "tfoot",
            "colgroup", "col", "tr", "td", "th",
            "a", "link", "base", "img", "address",
            "form", "input", "select", "option", "textarea", "label",
            "fieldset", "legend", "optgroup",
            "frameset", "frame", "noframes", "iframe"].join('|')

## Alternation of the element names matched by RE_SPACE_TAGS.
SPACE_TAGS = "br|p|td|tr|table|ul|ol|dl|li|dt|dd"

## Case-insensitive, byte-oriented regexps that match an opening "<name"
## at the start of a string.  Regexp::NOENCODING is used where the
## interpreter defines it; otherwise the 'n' language argument is used.
if (defined?(Regexp::NOENCODING))
  RE_ALL_TAGS = Regexp::compile('\A<(' + ALL_TAGS + ')\b', Regexp::IGNORECASE | Regexp::NOENCODING)
  RE_SPACE_TAGS = Regexp::compile('\A<(' + SPACE_TAGS + ')\b', Regexp::IGNORECASE | Regexp::NOENCODING)
else
  RE_ALL_TAGS = Regexp::compile('\A<(' + ALL_TAGS + ')\b', Regexp::IGNORECASE, 'n')
  RE_SPACE_TAGS = Regexp::compile('\A<(' + SPACE_TAGS + ')\b', Regexp::IGNORECASE, 'n')
end

SOCKET_TIMEOUT = 30 # for single socket operation
| 116 |
|
| 117 |
## Helpers that edit the header part of a message held as an array of
## lines, plus the names of the headers derived from
## @options["header-prefix"].
module Bsutil
  ## Set HEADER (field name including the trailing colon) to CONTENT.
  ## An existing line with the same field name is replaced; otherwise
  ## the new line is inserted in front of the first empty line.  If
  ## neither is found BUF is left untouched.  The line terminator is
  ## copied from the first line of BUF.
  ##
  ## Field names are compared case-insensitively, as append_header!
  ## does, so that "x-spam-flag:" is replaced instead of duplicated.
  def insert_header!(buf, header, content)
    eol = buf[0].to_s[/[\r\n]*\z/]
    new_line = "#{header} #{content}#{eol}"
    wanted = header.downcase

    buf.each_index do |i|
      if (buf[i] =~ /\A(.*?:)/)
        if ($1.downcase == wanted)
          buf[i] = new_line
          break
        end
      elsif (buf[i] =~ /\A[\r\n]*\z/) # end of the header part
        buf[i, 0] = new_line
        break
      end
    end
  end

  ## Put PREFIX in front of the content of HEADER.  When the header does
  ## not exist, a line holding only PREFIX is inserted in front of the
  ## first empty line.
  def append_header!(buf, header, prefix)
    eol = buf[0].to_s[/[\r\n]*\z/]
    wanted = header.downcase

    buf.each_index do |i|
      if (buf[i] =~ /\A(.*?:)(\s*)(.*?)([\r\n]*)\z/)
        name = $1
        org_content = $3
        if (name.downcase == wanted)
          buf[i] = "#{header} #{prefix}#{org_content}#{eol}"
          break
        end
      elsif (buf[i] =~ /\A[\r\n]*\z/) # end of the header part
        buf[i, 0] = "#{header} #{prefix}#{eol}"
        break
      end
    end
  end

  ## Name of the flag header, e.g. "X-Spam-Flag:".
  def x_spam_flag
    return sprintf("X-%s-Flag:", @options["header-prefix"])
  end

  ## Name of the probability header, e.g. "X-Spam-Probability:".
  def x_spam_probability
    return sprintf("X-%s-Probability:", @options["header-prefix"])
  end

  ## Name of the revision header, e.g. "X-Spam-Revision:".
  def x_spam_revision
    return sprintf("X-%s-Revision:", @options["header-prefix"])
  end

  ## Apply every header edit enabled in @options to BUF.
  ## SPAM_FLAG selects "Yes"/"No" for the flag header and decides
  ## whether the subject is marked; PROBABILITY is written with "%f"
  ## when given.  Returns true when at least one of the options
  ## insert-revision, insert-flag, insert-probability (with a
  ## probability) or mark-spam-subject is set.
  def insert_headers!(buf, spam_flag, probability=nil)
    updated = false

    if (@options["insert-revision"])
      updated = true
      insert_header!(buf, x_spam_revision, "bsfilter release #{Release} revision #{Revision}")
    end

    if (@options["insert-flag"])
      updated = true
      insert_header!(buf, x_spam_flag, spam_flag ? "Yes" : "No")
    end

    if (@options["insert-probability"] && probability)
      updated = true
      insert_header!(buf, x_spam_probability, sprintf("%f", probability))
    end

    if (@options["mark-spam-subject"])
      updated = true
      append_header!(buf, "Subject:", @options["spam-subject-prefix"]) if (spam_flag)
    end

    return updated
  end
end # end of module

include Bsutil
| 195 |
|
| 196 |
## Output sink that accepts sync=, print and printf and discards
## whatever it is given.
class DevNull
  def sync=(*args)
  end

  def print(*args)
  end

  def printf(*args)
  end
end
| 204 |
|
| 205 |
## Hash whose values are either numbers or nested DBHash objects
## (category => token => count).
class DBHash < Hash
  ## Yield every leaf as (path, value), where path is the chain of keys
  ## joined with MAGIC.  HEAD is the path accumulated so far.
  ## Note that this replaces Hash#flatten for DBHash objects.
  def flatten(magic="###", head="", &block)
    self.each do |k, v|
      path = (head == "") ? k : head + magic + k
      if (v.is_a?(DBHash))
        v.flatten(magic, path, &block)
      else
        yield path, v
      end
    end
  end

  ## Add the counts of HASH to self, recursing into nested hashes.
  ## Nested hashes that are new to self are copied into fresh DBHash
  ## objects, so later changes to self never reach back into HASH, and
  ## a plain nested Hash is accepted as well as a DBHash.
  def add(hash)
    hash.each do |k, v|
      mine = self[k]
      if (! mine)
        if (v.is_a?(Hash))
          copy = DBHash::new
          copy.add(v)
          self[k] = copy
        else
          self[k] = v
        end
      elsif (mine.is_a?(DBHash) && v.is_a?(Hash))
        mine.add(v)
      else
        self[k] = mine + v
      end
    end
  end

  ## Subtract the counts of HASH from self.  A count that would not
  ## stay positive is deleted, and so is a nested hash that becomes
  ## empty.  Keys missing from self are ignored.  Nested values of HASH
  ## may be plain Hash objects.
  def sub(hash)
    hash.each do |k, v|
      mine = self[k]
      next if (! mine)
      if (mine.is_a?(DBHash) && v.is_a?(Hash))
        mine.sub(v)
        self.delete(k) if (mine.empty?)
      elsif (mine > v)
        self[k] = mine - v
      else
        self.delete(k)
      end
    end
  end
end
| 258 |
|
| 259 |
## Require FILE and report whether it could be loaded.
## Returns true on success (also when it was loaded before) and false
## when require raises LoadError.
def safe_require(file)
  require file
  return true
rescue LoadError
  return false
end
| 267 |
|
| 268 |
## Return a copy of STR in which the bytes 0x92-0x94 are replaced by an
## apostrophe and the accented A/E/I/O/U letters listed below (upper
## and lower case) are replaced by the plain letter.
##
## The tables are byte values, so on interpreters with string encodings
## both the copy and the tables are switched to ASCII-8BIT first;
## otherwise String#tr rejects the byte strings.  STR itself is never
## modified.
def latin2ascii(str)
  tables = [
    ["\x92\x93\x94", "'''"],
    ["\xc0-\xc5\xc8-\xcb\xcc-\xcf\xd2-\xd6\xd9-\xdc", "AAAAAAEEEEIIIIOOOOOUUUU"],
    ["\xe0-\xe5\xe8-\xeb\xec-\xef\xf2-\xf6\xf9-\xfc", "aaaaaaeeeeiiiiooooouuuu"]
  ]

  newstr = str.dup
  has_encoding = newstr.respond_to?(:force_encoding)
  newstr.force_encoding("ASCII-8BIT") if (has_encoding)

  tables.each do |from, to|
    from = from.dup.force_encoding("ASCII-8BIT") if (has_encoding)
    newstr.tr!(from, to)
  end
  return newstr
end
| 274 |
|
| 275 |
## Add the conversion helpers below to Iconv as singleton methods.
## Iconv is only referenced when this method runs.
def define_safe_iconv
  ## Convert each of STRS from FROMCODE to TOCODE and return the array
  ## of converted strings.  Each string is converted word by word
  ## (split on whitespace, whitespace kept); a word that Iconv rejects
  ## is replaced by a single space.
  ##
  ## Every input is now converted on its own.  An inner loop over all of
  ## STRS used to make each result the concatenation of all inputs.
  def Iconv.safe_iconv(tocode, fromcode, *strs)
    return strs.map {|str|
      words = str.split(/(\s+)/).map {|word|
        begin
          Iconv.iconv(tocode, fromcode, word)[0]
        rescue
          ' '
        end
      }
      words.join
    }
  end

  ## UTF-8 to EUC-JP, then passed through NKF with '-e -E -X -Z0'.
  def Iconv.u2eucjp(str)
    return NKF::nkf('-e -E -X -Z0', (Iconv.safe_iconv(CODESET_EUCJP, CODESET_UTF8, str))[0])
  end

  ## UTF-8 to ISO8859-1.
  def Iconv.u2latin(str)
    return (Iconv.safe_iconv(CODESET_LATIN, CODESET_UTF8, str))[0]
  end

  ## GB18030 to EUC-JP.
  def Iconv.gb180302eucjp(str)
    return (Iconv.safe_iconv(CODESET_EUCJP, CODESET_GB18030, str))[0]
  end
end
| 301 |
|
| 302 |
## Yield a readable object for FILE.
##   "-"       yields STDIN.
##   an Array  yields the array itself after giving it gets, readlines
##             and eof? so that it can be read like a file handle.
##   otherwise FILE must name a regular file; it is opened in binary
##             mode and closed again when the block is left, also when
##             the block raises (the handle used to leak in that case).
## Raises RuntimeError when FILE is not a regular file.
## Returns the value of the block for "-" and for arrays, nil for files.
def open_ro(file)
  if (file == "-")
    yield STDIN
  elsif (file.class == Array)
    file.instance_variable_set(:@eof, false)
    class << file
      ## next unread line, or nil when all lines have been handed out
      def gets
        @n = 0 if (! @n)
        if (@n >= self.length)
          nil
        else
          @n = @n + 1
          self[@n - 1]
        end
      end

      ## all lines; marks the array as read to the end
      def readlines
        @eof = true
        self
      end

      def eof?
        (@eof || empty?)
      end
    end
    yield file
  else
    if (! FileTest::file?(file))
      raise sprintf("%s is not file", file)
    end
    fh = open(file, "rb")
    begin
      yield fh
    ensure
      fh.close if (! fh.closed?)
    end
    return nil
  end
end
| 336 |
|
| 337 |
## Open FILE for writing in binary mode; "-" means STDOUT.
## Without a block the handle is returned and the caller closes it.
## With a block the handle is yielded and nil is returned; a handle
## opened here is closed when the block is left, also when the block
## raises (it used to leak in that case).  STDOUT is never closed.
def open_wo(file, &block)
  to_stdout = (file == "-")
  fh = to_stdout ? STDOUT : open(file, "wb")
  return fh if (! block)

  begin
    yield fh
  ensure
    fh.close if (! to_stdout && ! fh.closed?)
  end
  return nil
end
| 352 |
|
| 353 |
## Number stored as mant * e**exp, so that products of many small
## factors can be accumulated by adding exponents.
class FLOAT
  ## The new object represents f ** power (0 when f is 0).
  def initialize(f=0, power=1)
    @mant = 0
    @exp = 0
    set_f(f, power)
  end
  attr_accessor :mant, :exp

  ## Value as an ordinary Float.
  def to_f
    return @mant * Math::exp(@exp)
  end

  ## Natural logarithm of the value, computed without forming the value.
  def ln
    return Math::log(@mant) + @exp
  end

  ## Product with another FLOAT or with a plain number; returns a new FLOAT.
  def *(a)
    n = FLOAT::new
    if (a.class == FLOAT)
      n.mant = @mant * a.mant
      n.exp = @exp + a.exp
    else
      n.mant = @mant * a
      n.exp = @exp
    end
    return n
  end

  ## Store a ** power.  The sign of A goes into the mantissa (1, -1 or
  ## 0) and log(|a|) * power into the exponent.  Returns self.
  def set_f(a, power=1)
    if (a > 0)
      @mant = 1
      @exp = Math::log(a) * power
    elsif (a < 0)
      @mant = -1
      @exp = Math::log(-a) * power
    else
      @mant = 0
      @exp = 0
    end
    self
  end
end
| 395 |
|
| 396 |
|
| 397 |
## Operations shared by the token stores.  The including class supplies
## value, set, sub_scalar and each_ct and the instance variables
## @options, @file_count, @dirty, @filename and @language.
module TokenAccess
  ## Scale the store down when it holds more than MAX_SIZE mails.
  ## Every count outside the ".internal" category is reduced by the
  ## fraction (old_count - min_size) / old_count, rounded up, and the
  ## mail count becomes MIN_SIZE.  Returns true when a reduction was
  ## done, false when the store is small enough or a limit is <= 0.
  def check_size(max_size, min_size)
    if ((@file_count <= max_size) || (max_size <= 0) || (min_size <= 0))
      return false
    end
    old_count = @file_count
    if (@options["verbose"])
      @options["message-fh"].printf("reduce token database %s from %d to %d\n", @filename, old_count, min_size)
    end

    key_cts.each do |category, token|
      next if (category == ".internal")
      v = value(category, token) || 0
      sub_scalar(category, token, (v * (old_count - min_size).to_f / old_count.to_f).ceil)
      if (@options["debug"] && ! value(category, token))
        @options["message-fh"].printf("deleted %s %s\n", category, token)
      end
    end
    @file_count = min_size # sub_scalar changed it as a side effect
    @dirty = true
    return true
  end

  ## Look TOKEN up and, when @options["degeneration"] is set, fall back
  ## to less specific forms in this order: last character removed,
  ## marks removed, lower case, upper case, capitalized.  Returns the
  ## first value found or nil.
  ##
  ## The marks from @options["mark-in-token"] are escaped before they
  ## go into the character class, so "^", "-" or "]" among them are
  ## taken literally; an empty mark list skips that step.
  def value_with_degene(category, token)
    v = value(category, token)
    return v if (v)
    return nil if (! @options["degeneration"]) # no degeneration

    v = value(category, token[0 .. -2]) # cut last char
    return v if (v)

    marks = @options['mark-in-token'].to_s
    if (marks != "")
      token = token.gsub(Regexp::compile("[" + Regexp::quote(marks) + "]"), '')
    end
    v = value(category, token)
    return v if (v)

    token = token.downcase
    v = value(category, token)
    return v if (v)

    token = token.upcase
    v = value(category, token)
    return v if (v)

    token = token.capitalize
    v = value(category, token)
    return v if (v)

    return nil
  end

  ## Store VAL for the token; also counts one more mail.
  def set_scalar(category, token, val)
    @dirty = true
    @file_count += 1
    set(category, token, val)
  end

  ## Add VAL to the token's count; also counts one more mail.
  def add_scalar(category, token, val)
    @dirty = true
    @file_count += 1
    old = value(category, token)
    set(category, token, old ? old + val : val)
  end

  ## Report every token of DB that is missing here or has the value 0.
  def show_new_token(db)
    db.each_ct do |category, token|
      v = value(category, token)
      if (! v || (v == 0))
        @options["message-fh"].printf("new %s %s\n", category, token)
      end
    end
  end

  ## Array of all stored values, in each_ct order.
  def values
    array = Array::new
    each_ct do |c, t|
      array.push(value(c, t))
    end
    return array
  end

  ## Array of all [category, token] pairs, in each_ct order.
  def key_cts
    array = Array::new
    each_ct do |c, t|
      array.push([c, t])
    end
    return array
  end

  ## Write one "language category token value" line per stored token to FH.
  def export(fh)
    each_ct do |category, token|
      v = value(category, token)
      fh.printf("%s %s %s %g\n", @language, category, token, v) if (v)
    end
  end
end
| 495 |
|
| 496 |
## In-memory token store: a DBHash of category => token => count plus
## the number of mails it was built from and per-message attributes.
## Note that the :hash accessor replaces Object#hash for instances of
## this class.
class TokenDB
  include TokenAccess

  def initialize(language=nil)
    @hash = DBHash::new
    @file_count = 0
    @language = language
    @message_id = "-"
    @probability = nil
    @spam_flag = nil
    @dirty = false
    @time = nil
    @filename = "-"
  end
  attr_accessor :hash, :file_count, :probability, :language, :spam_flag, :message_id, :time, :filename

  ## Number of categories.
  def size
    @hash.size
  end

  ## Yield (category, token) for every stored token.
  def each_ct
    @hash.each_key do |category|
      @hash[category].each_key do |token|
        yield(category, token)
      end
    end
  end

  ## Count stored for the token, or nil.
  def value(category, token)
    tokens = @hash[category]
    return nil if (! tokens)
    return (tokens[token] || nil)
  end

  ## Store V for the token, creating the category when needed.
  def set(category, token, v)
    @dirty = true
    @hash[category] = DBHash::new if (! @hash[category])
    @hash[category][token] = v
  end

  ## For every key of HASH in sorted order print SEPARATOR followed by
  ## the key repeated value.to_i times, joined with SEPARATOR.
  def print_keys_to_str(hash, separator, fh=STDOUT)
    hash.keys.sort.each do |k|
      times = hash[k].to_i
      fh.print separator
      fh.print(([k] * times).join(separator))
    end
  end

  ## Drop all tokens and reset the mail count.
  def clear
    @dirty = true
    @file_count = 0
    @hash = DBHash::new
  end

  ## Merge another TokenDB: counts and mail count are added, and the
  ## language is taken over when none is set here.
  def add_db(db)
    @dirty = true
    @file_count += db.file_count
    if (! @language && db.language)
      @language = db.language
    end
    @hash.add(db.hash)
  end

  ## Add the counts of one mail given as a category => token => count hash.
  def add_hash(hash)
    @dirty = true
    @file_count += 1
    @hash.add(hash)
  end

  ## Subtract VAL from one token, deleting it when the count would not
  ## stay positive, and drop its category when that becomes empty.
  ## The mail count is decremented by one but not below 0.
  ##
  ## The update is done directly on the category's hash.  It used to go
  ## through DBHash#sub with a plain nested Hash, which that method did
  ## not treat as a nested table, and @dirty was not set.
  def sub_scalar(category, token, val)
    @dirty = true
    if (@file_count > 0)
      @file_count -= 1
    end
    tokens = @hash[category]
    return if (! tokens)
    old = tokens[token]
    return if (! old)
    if (old > val)
      tokens[token] = old - val
    else
      tokens.delete(token)
      @hash.delete(category) if (tokens.empty?)
    end
  end

  ## Subtract the counts of one mail; the mail count is decremented by
  ## one but not below 0.
  def sub_hash(hash)
    @dirty = true
    if (@file_count > 0)
      @file_count -= 1
    end
    @hash.sub(hash)
  end

  ## Subtract another TokenDB.  The mail count never drops below 1 here.
  def sub_db(db)
    @dirty = true
    @file_count -= db.file_count
    if (@file_count < 1)
      @file_count = 1
    end
    @hash.sub(db.hash)
  end
end
| 594 |
|
| 595 |
## Token store kept in a DBM-style file.  Keys are
## "category###token" strings and values are numbers stored as strings.
## Subclasses set @filename and @lockfile and implement open_dbm.
class TokenDBM
  include TokenAccess
  MAGIC = "###"

  ## EXT is not used here; the subclasses build their file names from it.
  def initialize(options, language, ext)
    @options = options
    @dbm = nil # SDBM not Hash
    @dirty = nil # not used. for TokenAccess
    @lockfh = nil
    @file_count = nil
    @language = language
  end
  attr_accessor :file_count

  ## Number of keys in the file.
  def size
    @dbm.size
  end

  ## Copy the whole file into a new TokenDB.  Values are converted with
  ## to_f, the same conversion value applies, so the copy holds numbers
  ## rather than the stored strings.
  def to_db
    token_db = TokenDB::new(@language)
    token_db.file_count = @file_count
    @dbm.each do |ct, v|
      (category, token) = ct.split(Regexp.new(MAGIC), 2)
      token_db.set(category, token, v.to_f)
    end
    return token_db
  end

  ## Remove all keys and store a mail count of 0.
  def clear
    @dbm.clear
    @file_count = 0
    set(".internal", "file_count", 0)
  end

  ## Yield (category, token) for every key.
  def each_ct
    @dbm.each_key do |ct|
      (category, token) = ct.split(Regexp.new(MAGIC), 2)
      yield(category, token)
    end
  end

  ## Add the counts and the mail count of a TokenDB.
  def add_db(token_db)
    add_hash(token_db.hash)
    @file_count += token_db.file_count
  end

  ## Add the counts of a category => token => count DBHash.
  def add_hash(hash)
    @dirty = true
    hash.flatten(MAGIC) do |k, v|
      if (@dbm[k])
        @dbm[k] = (@dbm[k].to_f + v.to_f).to_s
      else
        @dbm[k] = v.to_s
      end
    end
  end

  ## Subtract the counts and the mail count of a TokenDB; the mail
  ## count does not go below 0.
  def sub_db(token_db)
    sub_hash(token_db.hash)
    if (@file_count > token_db.file_count)
      @file_count -= token_db.file_count
    else
      @file_count = 0
    end
  end

  ## Subtract the counts of a DBHash; a count that would not stay
  ## positive is deleted.
  def sub_hash(hash)
    @dirty = true
    hash.flatten(MAGIC) do |k, v|
      if (@dbm[k])
        if (@dbm[k].to_f > v.to_f)
          @dbm[k] = (@dbm[k].to_f - v.to_f).to_s
        else
          @dbm.delete(k)
        end
      end
    end
  end

  ## Count stored for the token as a Float, or nil.
  def value(category, token)
    v = @dbm[category + MAGIC + token]
    if (v)
      return v.to_f
    else
      return nil
    end
  end

  ## Store V (converted with to_s) for the token.
  def set(category, token, v)
    @dirty = true
    @dbm[category + MAGIC + token] = v.to_s
  end

  ## Subtract V from one token, deleting it when the count would not
  ## stay positive.  The mail count is decremented by one but not
  ## below 0.
  def sub_scalar(category, token, v)
    @dirty = true
    if (@file_count > 0)
      @file_count -= 1
    end
    oldv = value(category, token)
    if (oldv)
      if (oldv > v)
        set(category, token, oldv - v)
      else
        @dbm.delete(category + MAGIC + token)
      end
    end
  end

  ## Lock the lock file, open the database and load the mail count.
  ## MODE "r" takes a shared lock; "w", "wr" and "rw" take an exclusive
  ## lock.  Any other mode raises RuntimeError.  The mode is checked
  ## before the lock file is created, so a bad mode leaves no open
  ## handle behind.
  def open(mode="r")
    case mode
    when "r"
      shared = true
    when "w", "wr", "rw"
      shared = false
    else
      raise sprintf("unknown open mode %s", mode.inspect)
    end

    @lockfh = File::open(@lockfile, "w+")
    if (shared)
      begin
        @lockfh.flock(File::LOCK_SH)
      rescue Errno::EINVAL ## Win9x doesn't support LOCK_SH
        @lockfh.flock(File::LOCK_EX)
      end
    else
      @lockfh.flock(File::LOCK_EX)
    end

    @dbm = open_dbm(@filename, 0600)

    if (v = value(".internal", "file_count"))
      @file_count = v.to_i
    else
      @file_count = 0
      set(".internal", "file_count", @file_count)
    end
    if (@options["verbose"])
      @options["message-fh"].printf("open %s %d tokens %d mails by %d.\n", @filename, @dbm.length, @file_count, Process::pid)
    end
    @dirty = false
  end

  ## Write the mail count back when the store was modified, close the
  ## database, release the lock and remove the lock file.  The lock is
  ## released even when writing or closing the database fails.
  def close
    dirty = @dirty
    begin
      set(".internal", "file_count", @file_count) if (dirty)
      if (@options["verbose"])
        @options["message-fh"].printf("close %s %d tokens %d mails by %d.\n", @filename, @dbm.length, @file_count, Process::pid)
      end
      if (@options["debug"] && dirty)
        key_cts.sort.each do |(c, t)|
          @options["message-fh"].printf("%s %s %s %f\n", @filename, c, t, value(c, t))
        end
      end
      @dbm.close
    ensure
      if (@lockfh)
        @lockfh.flock(File::LOCK_UN)
        @lockfh.close
        begin
          File::unlink(@lockfile)
        rescue
          ## best effort: the lock file may be gone or still in use
        end
      end
    end
    @dirty = false
  end
end
| 752 |
|
| 753 |
## Token store backed by SDBM (files <homedir><language><ext>.sdbm.dir
## and .pag).
class TokenSDBM < TokenDBM
  def initialize(options, language, ext)
    @filename = options["homedir"] + language + ext + SDBM_ext
    @lockfile = options["homedir"] + language + ext + SDBM_ext + Lock_ext
    super
  end

  ## Close the database, delete its files and open a fresh one.
  ## Each file is removed on its own, so a missing ".dir" no longer
  ## leaves the ".pag" file behind.
  def clear
    @file_count = 0
    @dbm.close
    [".dir", ".pag"].each do |suffix|
      begin
        File::unlink(@filename + suffix)
      rescue
        ## best effort: the file may not exist
      end
    end
    @dbm = open_dbm(@filename, 0600)
    if (@options["verbose"])
      @options["message-fh"].printf("reopen %s by %d.\n", @filename, Process::pid)
    end
  end

  def open_dbm(filename, mode)
    SDBM::open(filename, mode)
  end
end
| 776 |
|
| 777 |
## Token store backed by GDBM (file <homedir><language><ext>.gdbm).
class TokenGDBM < TokenDBM
  ## @options is assigned by the superclass, as in the sibling classes.
  def initialize(options, language, ext)
    @filename = options["homedir"] + language + ext + GDBM_ext
    @lockfile = options["homedir"] + language + ext + GDBM_ext + Lock_ext
    super
  end

  ## Close the database, delete its file and open a fresh one.
  def clear
    @file_count = 0
    @dbm.close
    begin
      File::unlink(@filename)
    rescue
      ## best effort: the file may not exist
    end
    @dbm = open_dbm(@filename, 0600)
    if (@options["verbose"])
      @options["message-fh"].printf("reopen %s by %d.\n", @filename, Process::pid)
    end
  end

  ## Opened with GDBM::NOLOCK.
  def open_dbm(filename, mode)
    GDBM::open(filename, mode, GDBM::NOLOCK)
  end
end
| 800 |
|
| 801 |
## Token store backed by BDB1 (file <homedir><language><ext>.bdb1).
class TokenBDB1 < TokenDBM
  def initialize(options, language, ext)
    @filename = options["homedir"] + language + ext + BDB1_ext
    @lockfile = options["homedir"] + language + ext + BDB1_ext + Lock_ext
    super
  end

  ## Close the database, delete its file and open a fresh one.
  def clear
    @file_count = 0
    @dbm.close
    begin
      File::unlink(@filename)
    rescue
      ## best effort: the file may not exist
    end
    @dbm = open_dbm(@filename, 0600)
    if (@options["verbose"])
      @options["message-fh"].printf("reopen %s by %d.\n", @filename, Process::pid)
    end
  end

  def open_dbm(filename, mode)
    BDB1::Hash.open(filename, BDB1::CREATE | BDB1::WRITE, mode)
  end
end
| 823 |
|
| 824 |
## Token store backed by BDB (file <homedir><language><ext>.bdb).
class TokenBDB < TokenDBM
  def initialize(options, language, ext)
    @filename = options["homedir"] + language + ext + BDB_ext
    @lockfile = options["homedir"] + language + ext + BDB_ext + Lock_ext
    super
  end

  ## Close the database, delete its file and open a fresh one.
  def clear
    @file_count = 0
    @dbm.close
    begin
      File::unlink(@filename)
    rescue
      ## best effort: the file may not exist
    end
    @dbm = open_dbm(@filename, 0600)
    if (@options["verbose"])
      @options["message-fh"].printf("reopen %s by %d.\n", @filename, Process::pid)
    end
  end

  def open_dbm(filename, mode)
    BDB::Hash.open(filename, nil, BDB::CREATE, mode)
  end
end
| 846 |
|
| 847 |
## Token store backed by QDBM's Depot (file <homedir><language><ext>.qdbm).
## Reading a missing key is handled by rescuing DepotError_ENOITEM, so
## every method that reads @dbm[k] is overridden here.
class TokenQDBM < TokenDBM
  def initialize(options, language, ext)
    @filename = options["homedir"] + language + ext + QDBM_ext
    @lockfile = options["homedir"] + language + ext + QDBM_ext + Lock_ext
    super
  end

  ## Count stored for the token as a Float, or nil when the key is missing.
  def value(category, token)
    begin
      v = @dbm[category + MAGIC + token]
    rescue DepotError_ENOITEM
      return nil
    else
      return v.to_f
    end
  end

  ## Add the counts of a category => token => count DBHash.
  def add_hash(hash)
    @dirty = true
    hash.flatten(MAGIC) do |k, v|
      begin
        if (@dbm[k])
          @dbm[k] = (@dbm[k].to_f + v.to_f).to_s
        else
          ## never reached. DepotError_ENOITEM asserted when @dbm[k] is nil
          @dbm[k] = v.to_s
        end
      rescue DepotError_ENOITEM
        @dbm[k] = v.to_s
      end
    end
  end

  ## Subtract the counts of a DBHash; a count that would not stay
  ## positive is deleted.  The inherited version reads @dbm[k] without
  ## a rescue; this override ignores missing keys the way value and
  ## add_hash do.
  def sub_hash(hash)
    @dirty = true
    hash.flatten(MAGIC) do |k, v|
      begin
        old = @dbm[k]
      rescue DepotError_ENOITEM
        old = nil
      end
      if (old)
        if (old.to_f > v.to_f)
          @dbm[k] = (old.to_f - v.to_f).to_s
        else
          @dbm.delete(k)
        end
      end
    end
  end

  ## Close the database, delete its file and open a fresh one.
  def clear
    @file_count = 0
    @dbm.close
    begin
      File::unlink(@filename)
    rescue
      ## best effort: the file may not exist
    end
    @dbm = open_dbm(@filename, 0600)
    if (@options["verbose"])
      @options["message-fh"].printf("reopen %s by %d.\n", @filename, Process::pid)
    end
  end

  ## MODE is accepted for interface compatibility and not used.
  def open_dbm(filename, mode)
    Depot::open(filename, Depot::OWRITER | Depot::OCREAT)
  end
end
| 893 |
|
| 894 |
## Guess the language from the "from" and "subject" entries of HEADERS.
## Returns ["ja", nil] when one of them names a Japanese charset in an
## encoded word ("?iso-2022-jp?" and the other names in the pattern),
## ["ja", "jis"] when one contains the escape sequence ESC $ B or
## ESC $ @, and nil otherwise.  "from" is examined before "subject".
##
## The regexps are byte-oriented ('n').  Regexp::NOENCODING is used
## where the interpreter defines it, because the three-argument form of
## Regexp::compile is not accepted by current Ruby.
def get_lang_from_headers(headers)
  pattern_char_ja = '\?(iso-2022-jp|iso-2202-jp|x.sjis|shift.jis|euc.jp)\?'
  pattern_jis = "\\x1b\\x24[\\x42\\x40]" # escape sequence to jisx0208 new and old
  if (defined?(Regexp::NOENCODING))
    reg_char_ja = Regexp::compile(pattern_char_ja, Regexp::IGNORECASE | Regexp::NOENCODING)
    reg_jis = Regexp::compile(pattern_jis, Regexp::NOENCODING)
  else
    reg_char_ja = Regexp::compile(pattern_char_ja, Regexp::IGNORECASE, 'n')
    reg_jis = Regexp::compile(pattern_jis, nil, 'n')
  end

  [headers["from"], headers["subject"]].each do |str|
    next if (! str)
    if (reg_char_ja =~ str)
      @options["message-fh"].printf("lang ja header char_ja\n") if (@options["debug"])
      return ["ja", nil]
    elsif (reg_jis =~ str)
      @options["message-fh"].printf("lang ja header jis\n") if (@options["debug"])
      return ["ja", "jis"]
    end
  end
  return nil
end
| 911 |
|
| 912 |
## Guess the language of the lines in BUF; delegates to get_lang.
def get_lang_from_buf(buf, html_flag)
  return get_lang(buf, html_flag)
end
| 915 |
|
| 916 |
## Guess language and character code from the lines in BUF.
## Each line has its whitespace removed and is tested, in this order,
## against the UTF-8 kana pattern, the JIS escape sequence, the
## Shift_JIS kana pattern and the EUC-JP kana pattern.  The first hit
## returns ["ja", code] with code "utf8", "jis", "sjis" or "euc"; an
## EUC-JP hit is reported as "gb18030" when a byte in 0x80-0x9f was
## seen in this or an earlier line.  Returns [nil, nil] when no line
## matches.  With HTML_FLAG each line first goes through
## decode_character_reference2u.
##
## The patterns are raw byte strings compiled with the 'e', 's', 'u'
## and 'n' language arguments of Regexp::compile.
def get_lang(buf, html_flag=false)
  reg_euc = Regexp::compile("[\xa1\xa2-\xa1\xbc\xa4\xa1-\xa4\xf3\xa5\xa1-\xa5\xf6]{4}", nil, 'e') # kana in euc-jp without zenkaku-space
  reg_sjis = Regexp::compile("[\x81\x40-\x81\x5b\x82\x9f-\x82\xf1\x83\x40-\x83\x96]{2}", nil, 's') # kana in shift-jis
  reg_utf8 = Regexp::compile("[\xe3\x80\x80-\xe3\x80\x82\xe3\x81\x81-\xe3\x82\x93\xe3\x82\xa1-\xe3\x83\xb6]{4}", nil, 'u') # kana in utf8
  reg_jis = Regexp::compile("\\x1b\\x24[\\x42\\x40]", nil, 'n') # escape sequence to jisx0208 new and old
  reg_gb18030_possible = Regexp::compile('[\x80-\x9f]', nil, 'n')

  ## reg_char_utf8 = Regexp::compile('(^\w+: .*|charset="?)(utf-8)', Regexp::IGNORECASE, 'n')
  ## reg_cte_bin = Regexp::compile('\Acontent-transfer-encoding\s*:\s*(base64|quoted-printable)', Regexp::IGNORECASE, 'n')
  ## reg_c = Regexp::compile('(^\w+: .*|charset="?)(ks_c_5601|euc-kr|big5|gb2312)', Regexp::IGNORECASE, 'n')

  gb18030_possible = false
  buf.each do |str|
    str = decode_character_reference2u(str) if (html_flag)
    gb18030_possible = true if (str =~ reg_gb18030_possible)

    squeezed = str.gsub(/\s/, '')
    code = if (reg_utf8 =~ squeezed)
             "utf8"
           elsif (reg_jis =~ squeezed)
             "jis"
           elsif (reg_sjis =~ squeezed)
             "sjis"
           elsif (reg_euc =~ squeezed)
             gb18030_possible ? "gb18030" : "euc"
           else
             nil
           end
    if (code)
      @options["message-fh"].printf("lang ja %s\n", code) if (@options["debug"])
      return ["ja", code]
    end
  end
  return [nil, nil]
end
| 958 |
|
| 959 |
## Split a message held as an array of lines into headers and body.
## Returns [headers, body, lang].
##
## headers maps lower-cased field names to their content.  Repeated
## fields are joined with a space, except "received", which keeps only
## the last one.  Continuation lines are appended to the current field.
## A "From " envelope line is stored as "ufrom".  "boundary" and
## "charset" are extracted from Content-Type, and "content-type" itself
## is cut at the first ";".
##
## body holds the lines after the empty line that ends the headers,
## limited to the first @options["max-line"] lines when that is
## positive.  (The limit used to be applied with an inclusive range and
## returned one line too many.)  When the first line is neither an
## envelope line nor a header field, BUF is not a mail: headers stays
## empty and the lines of BUF are the body.
##
## BUF is not modified, and LANG is returned unchanged.
def get_headers(buf, lang)
  headers = DBHash::new
  buf = buf.dup
  is_mail = ((buf[0] =~ /\Afrom\s+(\S+)/i) || (buf[0] =~ /\A(\S+):/))

  if (is_mail)
    current = nil
    while (str = buf.shift)
      str = str.chomp
      if (str =~ /\A(\S+?):\s*(.*)/)
        current = $1.downcase
        content = $2.sub(/[\r\n]*\z/, '')
        if (current == "received")
          headers[current] = content
        else
          headers[current] = (headers[current] || "") + " " + content
        end
      elsif (str =~ /\Afrom\s+(\S+)/i)
        headers["ufrom"] = $1
      elsif (str =~ /\A\r*\z/) # end of the header part
        break
      elsif (! current) # continuation line without a field to continue
        break
      else
        ## an encoded word continues without a space, other text with one
        joiner = (str =~ /\A\s*=\?/) ? '' : ' '
        headers[current] += str.sub(/[\r\n]*\z/, '').sub(/\A\s*/, joiner)
      end
    end

    content_type = headers["content-type"]
    if ((content_type =~ /\bboundary=\s*"(.*?)"/i) ||
        (content_type =~ /\bboundary=\s*'(.*?)'/i) ||
        (content_type =~ /\bboundary=([^\s;]+)/i))
      headers["boundary"] = $1
    end
    ## optional quotes around the value; "\1" inside a character class
    ## is not a backreference, so the quotes are excluded explicitly
    if (content_type =~ /charset=["']*([^\s"';]+)/i)
      headers["charset"] = $1
    end
    if (content_type =~ /\A([^;]+)/)
      headers["content-type"] = $1
    end
  end

  max_line = @options["max-line"]
  if (max_line <= 0)
    return [headers, buf, lang]
  else
    return [headers, buf[0, max_line], lang]
  end
end
| 1015 |
|
| 1016 |
|
| 1017 |
class Jtokenizer |
| 1018 |
def initialize(method) |
| 1019 |
case method |
| 1020 |
when "bigram" |
| 1021 |
@method = Proc::new {|s| bigram(s)} |
| 1022 |
when "block" |
| 1023 |
@method = Proc::new {|s| block(s)} |
| 1024 |
when "mecab" |
| 1025 |
@method = Proc::new {|s| mecab(s)} |
| 1026 |
@m = MeCab::Tagger.new([$0, "-Ochasen"]) |
| 1027 |
when "chasen" |
| 1028 |
Chasen.getopt("-F", '%H %m\n', "-j") |
| 1029 |
@method = Proc::new {|s| chasen(s)} |
| 1030 |
when "kakasi" |
| 1031 |
@method = Proc::new {|s| kakasi(s)} |
| 1032 |
else |
| 1033 |
raise |
| 1034 |
end |
| 1035 |
end |
| 1036 |
|
| 1037 |
## Tokenize STR with the Proc stored in @method and return its result.
def split(str)
  @method.call(str)
end
| 1040 |
|
| 1041 |
## Character classes given as raw byte pairs and compiled with the 'e'
## (EUC-JP) language argument.  Going by the constant names these
## cover kanji and katakana - NOTE(review): confirm the byte ranges
## against an EUC-JP table.
Reg_kanji = Regexp::compile("[\xb0\xa1-\xf4\xa4]+", nil, 'e')
Reg_katakana = Regexp::compile("[\xa1\xbc\xa5\xa1-\xa5\xf6]+", nil, 'e')
Reg_kanji_katakana = Regexp::compile("[\xb0\xa1-\xf4\xa4\xa1\xbc\xa5\xa1-\xa5\xf6]", nil, 'e')
Reg_not_kanji_katakana = Regexp::compile("[^\xb0\xa1-\xf4\xa4\xa1\xbc\xa5\xa1-\xa5\xf6]", nil, 'e')
| 1045 |
|
| 1046 |
def kakasi(str) |
| 1047 |
str = str.gsub(/[\x00-\x7f]/, ' ') |
| 1048 |
if (str =~ /\A +\z/) |
| 1049 |
return [] |
| 1050 |
end |
| 1051 |
array = Array::new |
| 1052 |
Kakasi::kakasi("-oeuc -w", str).scan(/\S+/).each do |token| |
| 1053 |
token.gsub!(Reg_not_kanji_katakana, '') |
| 1054 |
if ((token =~ Reg_kanji) || (token.length > 2)) |
| 1055 |
array.push(token) |
| 1056 |
end |
| 1057 |
end |
| 1058 |
return array |
| 1059 |
end |
| 1060 |
|
| 1061 |
def mecab(str) |
| 1062 |
str = str.gsub(/[\x00-\x7f]/, ' ') |
| 1063 |
if (str =~ /\A +\z/) |
| 1064 |
return [] |
| 1065 |
end |
| 1066 |
array = Array::new |
| 1067 |
node = @m.parseToNode(str) |
| 1068 |
while (node.hasNode == 1) |
| 1069 |
token = node.getSurface |
| 1070 |
hinshi = node.getFeature.split(/,/)[0] |
| 1071 |
## print token, hinshi, "\n" |
| 1072 |
if (hinshi == "\xcc\xbe\xbb\xec") |
| 1073 |
if ((token =~ Reg_kanji_katakana) || (token.length > 2)) |
| 1074 |
array.push(token) |
| 1075 |
end |
| 1076 |
else |
| 1077 |
token.gsub!(Reg_not_kanji_katakana, '') |
| 1078 |
if ((token =~ Reg_kanji) || (token.length > 2)) |
| 1079 |
array.push(token) |
| 1080 |
end |
| 1081 |
end |
| 1082 |
node = node.next |
| 1083 |
end |
| 1084 |
return array |
| 1085 |
end |
| 1086 |
|
| 1087 |
def chasen(str) |
| 1088 |
str = str.gsub(/[\x00-\x7f]/, ' ') |
| 1089 |
if (str =~ /\A +\z/) |
| 1090 |
return [] |
| 1091 |
end |
| 1092 |
array = Array::new |
| 1093 |
Chasen.sparse(str).split("\n").each do |hinshi_token| |
| 1094 |
if (hinshi_token =~ /(.*) (.*)/) |
| 1095 |
hinshi = $1 |
| 1096 |
token = $2 |
| 1097 |
if (hinshi == "\xcc\xbe\xbb\xec") |
| 1098 |
if ((token =~ Reg_kanji_katakana) || (token.length > 2)) |
| 1099 |
array.push(token) |
| 1100 |
end |
| 1101 |
else |
| 1102 |
token.gsub!(Reg_not_kanji_katakana, '') |
| 1103 |
if ((token =~ Reg_kanji) || (token.length > 2)) |
| 1104 |
array.push(token) |
| 1105 |
end |
| 1106 |
end |
| 1107 |
end |
| 1108 |
end |
| 1109 |
return array |
| 1110 |
end |
| 1111 |
|
| 1112 |
def block(str) |
| 1113 |
tokens = str.scan(Reg_kanji) |
| 1114 |
tokens.concat(str.scan(Reg_katakana)) |
| 1115 |
return tokens |
| 1116 |
end |
| 1117 |
|
| 1118 |
def bigram(str) |
| 1119 |
tokens = Array::new |
| 1120 |
|
| 1121 |
str.scan(Reg_kanji).each do |token| |
| 1122 |
case token.length |
| 1123 |
when 2, 4 |
| 1124 |
tokens.push(token) |
| 1125 |
else |
| 1126 |
l = token.length / 2 - 2 |
| 1127 |
for i in (0 .. l) |
| 1128 |
tokens.push(token[i * 2, 4]) |
| 1129 |
end |
| 1130 |
end |
| 1131 |
end |
| 1132 |
tokens.concat(str.scan(Reg_katakana)) |
| 1133 |
return tokens |
| 1134 |
end |
| 1135 |
end |
| 1136 |
|
| 1137 |
## Build a TokenDB out of the headers enabled in @options["refer-header"].
##
## lang::    language of the mail, or nil to ask get_lang_from_headers
## headers:: hash of header name => content; the "received" entry is
##           rewritten, and for "ja" mails utf-8 encoded-words are replaced
##           inside the stored strings
##
## Returns the TokenDB; each token is filed under its header name.
def tokenize_headers(lang, headers)
  lang, _code = get_lang_from_headers(headers) unless lang

  head_db = TokenDB::new(lang)
  reg_token = Regexp::compile("\\b\\d[\\d\\.]+\\d\\b|[\\w#{@options['mark-in-token']}]+")
  debug = @options["debug"]

  received = headers["received"]
  if received
    # remember the addresses before the tail of the line is cut away
    efrom = received[/envelope\-from\s+([\w@\.\-]+)/, 1]
    foraddress = received[/for\s+<([\w@\.\-]+)>/, 1]
    received.sub!(/(\bid|;).*/im, '')
    received.sub!(/\(qmail[^\)]*\)/, '')
    trimmed = received
    trimmed += " " + efrom if efrom
    trimmed += " " + foraddress if foraddress
    headers["received"] = trimmed
  end

  # if (headers["domainkey-signature"])
  #   headers["domainkey-signature"] = headers["domainkey-signature"].sub(/b=[^:;\s]+/, '')
  # end

  # "authentication-results", "domainkey-signature"
  headers.each do |header, content|
    next unless @options["refer-header"][header]

    if lang == "ja"
      # utf-8 encoded-words: decode to EUC-JP when utf-8 support is on, drop otherwise
      content.gsub!(/=\?utf\-8\?([bq])\?(\S*)\?=/i) do
        encoding = $1
        payload = $2
        if @options["utf-8"]
          template = (encoding =~ /q/i) ? "M*" : "m*"
          Iconv.u2eucjp(payload.unpack(template).to_s)
        else
          ""
        end
      end
      relabeled = content.gsub(/\?(iso-2202-jp|shift-jis)\?/i, '?ISO-2022-JP?')
      content = NKF::nkf('-e -X -Z0', relabeled)
    else
      content = latin2ascii(content)
    end

    content.scan(reg_token).each do |token|
      head_db.add_scalar(header, token, 1) if token.length < 20
      @options["message-fh"].printf("tokenizer %s %s\n", header, token) if debug
    end

    next unless lang == "ja"
    @jtokenizer.split(content.gsub(/\s+/, '')).each do |token|
      head_db.add_scalar(header, token, 1)
      @options["message-fh"].printf("tokenizer %s %s\n", header, token) if debug
    end
  end
  head_db
end
| 1196 |
|
| 1197 |
## Tokenize one message given as an array of lines.
##
## Headers and body parts are tokenized separately (walking multipart
## boundaries through get_headers / tokenize_body) and merged into a single
## TokenDB, which is returned.  Input without any header is tokenized as one
## plain body.
def tokenize_buf(buf)
  lang = nil # unknown until headers or body reveal it
  separators = []
  delimiters = []

  headers, buf, lang = get_headers(buf, lang)
  if headers.empty? # this is not a mail
    db, buf = tokenize_body(lang, headers, buf, separators, delimiters)
    db.time = Time::new
    db.language = "C" unless db.language
    return db
  end

  body_db = TokenDB::new(lang)
  body_db.message_id = headers["message-id"] || "-"

  sub_head_db = TokenDB::new(lang)
  main_head_db = tokenize_headers(lang, headers)
  lang = main_head_db.language if main_head_db

  found_html_part = false
  plain_bodies = []
  html_bodies = []

  until buf.empty?
    boundary = headers["boundary"]
    if boundary
      separators.push("--" + boundary)
      delimiters.push("--" + boundary + "--")
    end

    content_type = headers["content-type"]
    # an embedded rfc822 message has no body of its own: go straight to its headers
    unless content_type && content_type =~ /rfc822/i
      db, buf = tokenize_body(lang, headers, buf, separators, delimiters)
      lang = db.language
      if content_type =~ /html/i
        found_html_part = true
        html_bodies.push(db)
      else
        plain_bodies.push(db)
      end
    end

    headers, buf, lang = get_headers(buf, lang)
    sub_head_db.add_db(tokenize_headers(lang, headers))
  end

  # html parts always count; plain parts are dropped only when requested
  # and an html alternative was actually seen
  html_bodies.each {|part_db| body_db.add_db(part_db)}
  unless @options["ignore-plain-text-part"] && found_html_part
    plain_bodies.each {|part_db| body_db.add_db(part_db)}
  end

  body_db.add_db(main_head_db)
  body_db.add_db(sub_head_db)
  body_db.file_count = 1
  body_db.time = Time::new
  body_db.language = "C" unless body_db.language
  body_db
end
| 1261 |
|
| 1262 |
## Encode code point +i+ as UTF-8 and pass it through Iconv.u2eucjp.
def i2eucjp(i)
  utf8_char = [i].pack("U")
  return Iconv.u2eucjp(utf8_char)
end
| 1265 |
|
| 1266 |
## Encode code point +i+ as UTF-8, convert it with Iconv.u2latin and reduce
## the result with latin2ascii.
def i2ascii(i)
  utf8_char = [i].pack("U")
  latin_char = Iconv.u2latin(utf8_char)
  return latin2ascii(latin_char)
end
| 1269 |
|
| 1270 |
## Return the UTF-8 encoding of the Unicode code point +i+.
def i2u(i)
  utf8_char = [i].pack("U")
  return utf8_char
end
| 1273 |
|
| 1274 |
## Replace numeric character references ("&#1234;" / "&#x4e9c;") in +str+
## by the UTF-8 string returned by i2u.  When @options["utf-8"] is off the
## references are simply removed.  Returns a new string.
def decode_character_reference2u(str)
  unless @options["utf-8"]
    return str.gsub(/\&\#(\d{1,5}|x[\da-f]{1,4});/i, "")
  end

  str.gsub(/\&\#(\d{1,5}|x[\da-f]{1,4});/i) do
    digits = $1 # saved: the next match overwrites $1
    code = (digits =~ /^x(.*)/i) ? $1.hex : digits.to_i
    i2u(code)
  end
end
| 1290 |
|
| 1291 |
## Replace numeric character references ("&#1234;" / "&#x4e9c;") in +str+.
## For lang "ja" each code point goes through i2eucjp, for any other
## language through i2ascii.  When @options["utf-8"] is off the references
## are simply removed.  Returns a new string.
def decode_character_reference(str, lang)
  unless @options["utf-8"]
    return str.gsub(/\&\#(\d{1,5}|x[\da-f]{1,4});/i, "")
  end

  str.gsub(/\&\#(\d{1,5}|x[\da-f]{1,4});/i) do
    digits = $1 # saved: the next match overwrites $1
    code = (digits =~ /^x(.*)/i) ? $1.hex : digits.to_i
    (lang == "ja") ? i2eucjp(code) : i2ascii(code)
  end
end
| 1315 |
|
| 1316 |
## Split +str+ into word tokens and URL/mail-address tokens.
##
## str::  text to tokenize; for lang "ja" it is MODIFIED IN PLACE (cite
##        marks, leading white space and newlines are removed) before being
##        handed to @jtokenizer
## lang:: "ja" enables the additional Japanese pass
##
## Returns [body_hash, url_hash], two DBHash counters keyed by token.
## Tokens of 20 or more characters are dropped, except those produced by
## the Japanese tokenizer.
def tokenize_str(str, lang)
  body_hash = DBHash::new(0)
  url_hash = DBHash::new(0)

  reg_token = Regexp::compile("(?:http:|www)[\\w\\-\\.\\/@%:\?=]+|[\\w\\-\\.]+@[\\w\\-\\.]+|\\b\\d[\\d\\.]+\\d\\b|[\\w#{@options['mark-in-token']}]+")
  reg_url = Regexp::compile('(^http:|https:|^www|@)')
  reg_token2 = Regexp::compile('\b\d[\d\.]+\d\b|[\w%]+')

  str.scan(reg_token).each do |token|
    if (token =~ reg_url)
      ## URLs and addresses are broken into their components
      token.scan(reg_token2).each do |token2|
        if (token2.length < 20)
          url_hash[token2] += 1
          @options["message-fh"].printf("tokenizer %s %s\n", "url", token2) if (@options["debug"])
        end
      end
    elsif (token.length < 20)
      body_hash[token] += 1
      @options["message-fh"].printf("tokenizer C %s %s\n", "body", token) if (@options["debug"])
    end
  end

  if (lang == "ja")
    str.gsub!(Regexp::compile("^[ -\\~]*[\|\>]+", nil, 'e'), '') # delete cite mark
    str.gsub!(Regexp::compile("^[ \\t\xa1\xa1]+", nil, 'e'), '') # delete white space
    str.gsub!(Regexp::compile("(\\r?\\n){2,}", nil, 'e'), ' ') # keep multiple newline as space
    str.gsub!(Regexp::compile("[\\r\\n]+", nil, 'e'), '') # delete newline
    str.split.each do |s|
      @jtokenizer.split(s).each do |token|
        body_hash[token] += 1
        @options["message-fh"].printf("tokenizer ja %s %s\n", "body", token) if (@options["debug"])
      end
    end
  end
  return [body_hash, url_hash]
end
| 1353 |
|
| 1354 |
## Heuristic check that the lines in +buf+ are base64 text.
##
## Only the first and the last non-blank lines are inspected: each must
## consist solely of base64 alphabet characters (A-Z, a-z, 0-9, "+", "/",
## "=") followed by optional white space.  A buffer without any non-blank
## line passes.  +buf+ is not modified.
##
## Returns true or false.
def base64_encoded?(buf)
  reg_blank = /\A[\s\r\n]*\z/
  ## "A-Za-z", not "A-z": the latter also admits [ \ ] ^ _ and backtick,
  ## which let ordinary text such as "foo_bar" pass as base64
  reg_base64 = /\A[A-Za-z0-9=+\/]+[\s\r\n]*\z/

  [buf, buf.reverse].each do |lines|
    edge = lines.find {|line| line !~ reg_blank}
    return false if (edge && edge !~ reg_base64)
  end
  return true
end
| 1368 |
|
| 1369 |
## Take the next part off +body+ and tokenize it.
##
## lang::       language known so far; replaced by the result of
##              get_lang_from_buf when that returns one
## headers::    headers of this part ("content-type" and
##              "content-transfer-encoding" are read)
## body::       array of lines; it is duplicated, the caller's array is
##              left alone
## separators:: stack of "--boundary" lines, innermost last
## delimiters:: stack of "--boundary--" lines; both stacks are popped when
##              the innermost closing delimiter is read
##
## Returns [db, rest]: the TokenDB of this part and the lines not consumed.
## Parts whose content-type does not mention "text" yield an empty db.
def tokenize_body(lang, headers, body, separators, delimiters)
  db = TokenDB::new(lang)
  rest = body.dup
  part = []

  if separators.empty?
    # not inside a multipart: everything belongs to this part
    part = rest
    rest = []
  else
    reg_return_codes = Regexp::compile('[\r\n]*\z')
    separator = separators.last
    delimiter = delimiters.last
    while (line = rest.shift)
      bare = line.sub(reg_return_codes, '')
      if bare == separator
        break
      elsif bare == delimiter
        # innermost multipart is closed: fall back to the enclosing boundary
        delimiters.pop
        separators.pop
        delimiter = delimiters.last
        separator = separators.last
      else
        part.push(line)
      end
    end
  end

  content_type = headers["content-type"]
  return [db, rest] if content_type && content_type !~ /text/i # skip non-text body

  case headers["content-transfer-encoding"]
  when /base64/i
    part.map! {|encoded| encoded.unpack("m*").to_s} if base64_encoded?(part)
  when /quoted-printable/i
    part.map! {|encoded| encoded.unpack("M*").to_s}
  end

  html = (content_type =~ /html/i) ? true : false
  detected, code = get_lang_from_buf(part, html)
  lang = detected if detected # otherwise keep the caller's guess

  str = part.join
  str.gsub!(/^begin[^\r\n]+(([\r\n]+M)([^\r\n]+))*/, '') # remove uuencoded lines

  if lang != "ja"
    str = latin2ascii(str)
  elsif code == "utf8" || code == "gb18030"
    if @options["utf-8"]
      str = (code == "utf8") ? Iconv.u2eucjp(str) : Iconv.gb180302eucjp(str)
    else
      lang = "C" # can't use iconv / stop ja tokenizer
    end
  else
    str = NKF::nkf('-e -X -Z0', str)
  end

  tags = []
  if html
    # remove salad at head of part
    str = $1 if str =~ Regexp::compile('\A[^<>]*?(<(\?xml|!doctype|html|body)\b.*)\z', Regexp::MULTILINE | Regexp::IGNORECASE, 'n')

    # remove salad in head, except style
    if str =~ /\A(.*?)(<body.*)\z/im
      before_body_tag, after_body_tag = $1, $2
      before_body_tag.gsub!(/>[^<>]*<(?!\/style)/im, '><')
      str = before_body_tag + after_body_tag
    end

    # remove <p style="font-size:0px..>
    str.gsub!(/(<p[^<>]*font-size\s*:\s*[01]\s*(;|px)[^<>]*>)([^<>]*)(<\/p>)/im, '')
    str.gsub!(/(<font[^<>]*font-size\s*:\s*[01]\s*(;|px)[^<>]*>)([^<>]*)(<\/font>)/im, '')

    # remove <span style="DISPLAY: none..>
    str.gsub!(/(<span[^<>]*display\s*:\s*none[^>]*>)([^<>]*)(<\/span>)/im, '')

    if @options["ignore-after-last-atag"]
      str = $1 if str =~ /\A(.*)<\/a>/im
    end

    # remove salad after body or html
    str = $1 if str =~ Regexp::compile('\A(.*)</html>[^<>]*?\z', Regexp::MULTILINE | Regexp::IGNORECASE, 'n')
    str = $1 if str =~ Regexp::compile('\A(.*)</body>[^<>]*?\z', Regexp::MULTILINE | Regexp::IGNORECASE, 'n')

    # pull every tag out of the text; known tags are collected for their own pass
    str.gsub!(Regexp::compile('<[^>]*>', Regexp::MULTILINE, 'n')) do |tag|
      tag = tag.gsub(/\n/, '')
      tags.push(tag) if tag =~ RE_ALL_TAGS # end tags are thrown away
      (tag =~ RE_SPACE_TAGS) ? " " : ""
    end
    body_str = decode_character_reference(str, lang) # out of tags
    tag_str = decode_character_reference(tags.join, lang) # in tags
  else # if plain text
    body_str = str
    tag_str = ""
  end

  body_hash, url_body_hash = tokenize_str(body_str, lang)
  tag_hash, url_tag_hash = tokenize_str(tag_str, lang)

  db.add_hash({"body" => body_hash}) if !body_hash.empty? && @options["use-body"]
  db.add_hash({"tag" => tag_hash}) unless tag_hash.empty?
  db.add_hash({"url" => url_body_hash}) unless url_body_hash.empty?
  db.add_hash({"url" => url_tag_hash}) unless url_tag_hash.empty?
  db.file_count = 1
  db.language = lang
  [db, rest]
end
| 1519 |
|
| 1520 |
## Base class of the scoring methods; one instance per language.
## It owns the clean, spam and probability token databases of that language.
class Probability # for each lang
  attr_accessor :prob, :clean, :spam, :spam_cutoff, :language

  ## options:: option hash; "homedir" and "db" are read here
  ## lang::    language the three databases belong to
  ##
  ## When options["db"] names none of the known back ends, no database
  ## object is created.
  def initialize(options, lang)
    @options = options
    @filename = @options["homedir"] + lang + Prob_ext

    # only the selected back-end constant is ever referenced
    db_class = case @options["db"]
               when "sdbm" then TokenSDBM
               when "gdbm" then TokenGDBM
               when "bdb1" then TokenBDB1
               when "bdb"  then TokenBDB
               when "qdbm" then TokenQDBM
               end
    if db_class
      @clean = db_class.new(@options, lang, Clean_ext)
      @spam = db_class.new(@options, lang, Spam_ext)
      @prob = db_class.new(@options, lang, Prob_ext)
    end

    @language = lang
  end

  ## Merge those members of +token_dbs+ whose language equals this
  ## instance's language into a fresh TokenDB and return it.
  def merge_dbs_of_lang(token_dbs)
    merged = TokenDB::new
    token_dbs.each do |db|
      merged.add_db(db) if @language == db.language
    end
    merged
  end
end
| 1562 |
|
| 1563 |
## Scoring after Paul Graham: the (at most) 15 token probabilities farthest
## from 0.5 are combined into one spam probability.
class Graham < Probability
  def initialize(options, lang)
    @spam_cutoff = 0.9
    @default_probability = 0.4 # used for tokens missing from the probability db
    super
  end

  ## Product of the elements of +a+, skipping zeros.  Returns 1 for [].
  def product(a)
    n = 1
    a.each do |v|
      n = n * v if (v != 0)
    end
    return n
  end

  ## Score +token_db+: sets its probability and spam_flag and returns it.
  def get_combined_probability(token_db)
    prob_db = TokenDB::new # temporary

    token_db.each_ct do |(category, token)|
      probability = @prob.value_with_degene(category, token) || @default_probability
      prob_db.set_scalar(category, token, probability)
    end

    ## the 15 most decisive probabilities, i.e. farthest from 0.5
    probs = prob_db.values.sort {|a, b| (b - 0.5).abs <=> (a - 0.5).abs}[0, 15]

    if (@options["debug"])
      prob_array = Array::new
      prob_db.each_ct do |c, t|
        prob_array.push([[c, t], prob_db.value(c, t)])
      end
      prob_array.sort! {|a, b| (b[1] - 0.5).abs <=> (a[1] - 0.5).abs}
      prob_array = prob_array[0, 15]
      prob_array.sort! {|a, b| b[1] <=> a[1]}
      prob_array.each do |k, v|
        @options["message-fh"].printf("word probability %s %s %f\n", k[0], k[1], v)
      end
    end

    prod = product(probs)
    token_db.probability = prod / (prod + product(probs.map {|x| 1 - x}))
    token_db.spam_flag = (token_db.probability > @spam_cutoff)
    return token_db
  end

  ## Recompute token probabilities in the probability db.
  ##
  ## token_dbs:: empty => rebuild everything from the clean and spam
  ##             databases; otherwise only the tokens of the given dbs that
  ##             match this language are refreshed (incremental mode).
  def update_probability(token_dbs)
    ## number of clean / spam mails, at least 1 to keep the divisions defined
    cnum = [@clean.file_count, 1].max.to_f
    snum = [@spam.file_count, 1].max.to_f

    if (token_dbs.empty?)
      incremental = false
      target_cts = @clean.key_cts | @spam.key_cts
      @prob.open("w")
      @prob.clear
    else
      incremental = true
      merged_db = merge_dbs_of_lang(token_dbs)
      target_cts = merged_db.key_cts
      return if (target_cts.empty?)
      @prob.open("rw")
    end
    old_file_count = @prob.file_count
    new_file_count = 0

    target_cts.each do |(category, token)|
      c_hits = @clean.value(category, token) || 0
      s_hits = @spam.value(category, token) || 0
      if (incremental && @prob.value(category, token))
        @prob.sub_scalar(category, token, 1.0) # 1.0 is big enough for delete
        new_file_count -= 1
      end

      p = nil # stays nil when the token is too rare to be rated
      if (c_hits == 0)
        if (s_hits > 10)
          p = 0.9999
        elsif (s_hits > 5)
          p = 0.9998
        end
      elsif (s_hits == 0)
        if (c_hits > 10)
          p = 0.0001
        elsif (c_hits > 5)
          p = 0.0002
        end
      elsif (c_hits + s_hits > 5)
        ## clean occurrences weigh double; result clamped to [0.0001, 0.9999]
        clean_ratio = [(c_hits * 2) / cnum, 1.0].min
        spam_ratio = [s_hits / snum, 1.0].min
        p = [[spam_ratio / (clean_ratio + spam_ratio), 0.9999].min, 0.0001].max
      end

      if (p)
        new_file_count += 1
        @prob.set_scalar(category, token, p)
      end
    end
    @prob.file_count = new_file_count + old_file_count if (incremental)
    @prob.close
  end
end
| 1675 |
|
| 1676 |
## Scoring after Gary Robinson: per-token pw values are smoothed towards
## their average (robx) and combined through geometric means.
class Robinson < Probability
  def initialize(options, lang)
    @robx_max = 1       # only tokens seen at most this often contribute to robx (0 = all)
    @min_dev = 0.1      # tokens closer than this to @center are ignored when scoring
    @spam_cutoff = 0.582
    @center = 0.5
    @robs = 0.001       # from bogofilter/robinson.h
    @default_robx = 0.415 # from bogofilter/robinson.h; fallback when no robx is stored or computable
    super
  end

  ## NOTE(review): unfinished stub.  +pw+ is defined neither as a local nor
  ## as a method of this class, so calling get_pw raises NameError.  Nothing
  ## in this class calls it; intended behavior unknown -- confirm before use.
  def get_pw(category, token, g, b)
    return pw
  end


  ## Recompute the smoothed token probabilities (fw) in the probability db.
  ##
  ## token_dbs:: empty => rebuild everything from the clean and spam
  ##             databases; otherwise only the tokens of the given dbs that
  ##             match this language are refreshed (incremental mode).
  def update_probability(token_dbs)
    pwdb = TokenDB::new
    c_count = [@clean.file_count, 1].max
    s_count = [@spam.file_count, 1].max

    if (token_dbs.empty?)
      incremental = false
      target_cts = @clean.key_cts | @spam.key_cts
    else
      incremental = true
      merged_db = merge_dbs_of_lang(token_dbs)
      target_cts = merged_db.key_cts
      return if (target_cts.empty?)
    end

    ## loop1
    ## get pw and robx(average of pw)
    count = 0
    pw_sum = 0.0

    good_mail = c_count.to_f
    bad_mail = s_count.to_f
    target_cts.each do |(category, token)|
      g = [@clean.value(category, token) || 0, c_count].min
      b = [@spam.value(category, token) || 0, s_count].min
      n = g + b
      if (n == 0)
        pwdb.set_scalar(category, token, nil) # need to delete this token from prob.db
      else
        pw = (b / bad_mail) / (b / bad_mail + g / good_mail)
        if ((@robx_max == 0) || (n <= @robx_max))
          pw_sum += pw
          count += 1
        end
        pwdb.set_scalar(category, token, pw)
      end
    end

    robs = @robs
    if (incremental)
      @prob.open("rw")
      old_file_count = @prob.file_count
      old_robx = @prob.value(".internal", "robx") || @default_robx
      samples = count + old_file_count
      if (samples != 0)
        robx = (pw_sum + old_file_count * old_robx) / samples
      else
        ## nothing to average: keep the previous value instead of storing
        ## 0.0 / 0 (NaN), mirroring the guard of the rebuild branch below
        robx = old_robx
      end
    else
      @prob.open("w")
      @prob.clear
      if (count != 0)
        robx = pw_sum / count
      else
        robx = @default_robx
      end
    end

    ## loop2
    ## get fw from pw
    new_file_count = 0
    pwdb.key_cts.each do |(category, token)|
      g = [@clean.value(category, token) || 0, c_count].min
      b = [@spam.value(category, token) || 0, s_count].min
      n = g + b
      pw = pwdb.value(category, token)
      if (incremental && @prob.value(category, token))
        new_file_count -= 1
        @prob.sub_scalar(category, token, 1.0) # 1.0 is big enough for delete
      end
      if (pw)
        new_file_count += 1
        @prob.set_scalar(category, token, (robs * robx + n * pw) / (robs + n)) # fw
      end
    end
    @prob.set_scalar(".internal", "robx", robx)
    @prob.file_count = new_file_count + old_file_count if (incremental)
    @prob.close
  end

  ## Combine the accumulated products into one score.
  ##
  ## pminus:: product of (1 - p) over the scored tokens (responds to #ln)
  ## qminus:: product of p over the scored tokens (responds to #ln)
  ## count::  number of factors in each product
  def get_probability(pminus, qminus, count)
    r = 1.0 / [1, count].max
    p = 1.0 - Math::exp(pminus.ln * r)
    q = 1.0 - Math::exp(qminus.ln * r)
    s = (1.0 + (p - q) / (p + q)) / 2.0
    return s
  end

  ## Score +token_db+: sets its probability and spam_flag and returns it.
  ## Tokens unknown to the probability db are rated robx.
  def get_combined_probability(token_db)
    robx = @prob.value(".internal", "robx") || @default_robx

    count = 0
    pminus = FLOAT::new(1)
    qminus = FLOAT::new(1)
    token_db.each_ct do |(category, token)|
      probability = @prob.value_with_degene(category, token) || robx
      if ((probability - @center).abs > @min_dev)
        ## keep both p and 1 - p strictly positive
        if (probability <= 0.0)
          probability = 0.0000001
        elsif (probability >= 1.0)
          probability = 0.9999999
        end
        c = token_db.value(category, token)
        count += c
        pminus = pminus * FLOAT::new(1.0 - probability, c)
        qminus = qminus * FLOAT::new(probability, c)
        @options["message-fh"].printf("word probability %s %s %d %f\n", category, token, c, probability) if (@options["debug"])
      end
    end

    if (count == 0)
      token_db.probability = 0.0
    else
      token_db.probability = get_probability(pminus, qminus, count)
    end
    if (token_db.probability > @spam_cutoff)
      token_db.spam_flag = true
    else
      token_db.spam_flag = false
    end
    return token_db
  end
end
| 1811 |
|
| 1812 |
|
| 1813 |
## Robinson scoring with the chi-square based combination in place of the
## geometric-mean one.
class RobinsonFisher < Robinson
  def initialize(options, lang)
    super
    @spam_cutoff = 0.95
  end

  ## Sum exp(-m) * m**i / i! for i = 0 .. v/2 - 1, with m = x2 / 2, capped
  ## at 1.0.  The running term is kept in a FLOAT object.
  def chi2q(x2, v)
    half = x2 / 2.0
    total = Math::exp(0.0 - half)
    term = FLOAT::new
    term.exp = 0.0 - half
    term.mant = 1

    1.upto((v / 2) - 1) do |i|
      term = term * FLOAT::new(half / i)
      total += term.to_f
    end

    if total < 1.0
      total
    else
      1.0
    end
  end

  ## Combine the products accumulated by get_combined_probability.
  ## pminus / qminus respond to #ln; count is the number of factors.
  def get_probability(pminus, qminus, count)
    degrees = 2 * count
    p = 1 - chi2q(-2.0 * pminus.ln, degrees)
    q = 1 - chi2q(-2.0 * qminus.ln, degrees)
    (1.0 + p - q) / 2.0
  end
end
| 1840 |
|
| 1841 |
## Create directory +dir+ with mode 0700 unless it already exists as a
## directory.
def init_dir(dir)
  Dir.mkdir(dir, 0700) unless File.directory?(dir)
end
| 1846 |
|
| 1847 |
def usage |
| 1848 |
|
| 1849 |
print <<EOM |
| 1850 |
|
| 1851 |
NAME |
| 1852 |
bsfilter - bayesian spam filter |
| 1853 |
|
| 1854 |
SYNOPSIS |
| 1855 |
bsfilter [options] [commands] < MAIL |
| 1856 |
bsfilter [options] [commands] MAIL ... |
| 1857 |
|
| 1858 |
DESCRIPTION |
| 1859 |
filter spam. |
| 1860 |
If commands are specified, bsfilter is in maintenance mode, otherwise it is in filtering mode. |
| 1861 |
If bsfilter does not find spam in filtering mode, exit status is 1. |
| 1862 |
If bsfilter runs with --pipe option or finds spam, exit status is 0. |
| 1863 |
|
| 1864 |
COMMANDS |
| 1865 |
--add-clean|-c |
| 1866 |
add mails into the clean token database |
| 1867 |
|
| 1868 |
--add-spam|-s |
| 1869 |
add mails into the spam token database |
| 1870 |
|
| 1871 |
--sub-clean|-C |
| 1872 |
subtract mails from the clean token database |
| 1873 |
|
| 1874 |
--sub-spam|-S |
| 1875 |
subtract mails from the spam token database |
| 1876 |
|
| 1877 |
--update|-u |
| 1878 |
update the probability table from clean and spam token databases |
| 1879 |
|
| 1880 |
--export-clean |
| 1881 |
export the clean token database |
| 1882 |
|
| 1883 |
--export-spam |
| 1884 |
export the spam token database |
| 1885 |
|
| 1886 |
--import-clean |
| 1887 |
import the clean token database |
| 1888 |
|
| 1889 |
--import-spam |
| 1890 |
import the spam token database |
| 1891 |
|
| 1892 |
--export-probability |
| 1893 |
export the probability database (for debugging purpose) |
| 1894 |
OPTIONS |
| 1895 |
--homedir directory |
| 1896 |
specify the name of the bsfilter\'s home directory |
| 1897 |
If this option is not used, a directory specified with the environment variable "BSFILTERHOME" is used |
| 1898 |
If the variable "BSFILTERHOME" is not defined, ".bsfilter" directory under your home is used |
| 1899 |
If the variable "HOME" is not defined, a directory which bsfilter is located at is used |
| 1900 |
|
| 1901 |
--config-file file |
| 1902 |
specify the name of the bsfilter\'s configuration file |
| 1903 |
"bsfilter.conf" in bsfilter\'s home directory is used by default |
| 1904 |
|
| 1905 |
--max-line number |
| 1906 |
check and/or study the first number of lines |
| 1907 |
default is #{Default_max_line}. 0 means all |
| 1908 |
|
| 1909 |
--db sdbm|gdbm|bdb1|bdb|qdbm |
| 1910 |
specify the name of database type |
| 1911 |
"sdbm" by default |
| 1912 |
|
| 1913 |
--jtokenizer|-j bigram|block|mecab|chasen|kakasi |
| 1914 |
specify algorithm of a tokenizer for Japanese language |
| 1915 |
"bigram" by default |
| 1916 |
|
| 1917 |
--list-clean |
| 1918 |
print filename of clean mail |
| 1919 |
|
| 1920 |
--list-spam |
| 1921 |
print filename of spam |
| 1922 |
|
| 1923 |
--imap |
| 1924 |
access IMAP server |
| 1925 |
|
| 1926 |
--imap-server hostname |
| 1927 |
specify hostname of IMAP server |
| 1928 |
|
| 1929 |
--imap-port number |
| 1930 |
specify port number of IMAP server. default is #{Default_imap_port} |
| 1931 |
|
| 1932 |
--imap-auth method |
| 1933 |
specify authorization method. default is "auto" |
| 1934 |
"cram-md5" use "AUTHENTICATE CRAM-MD5" command |
| 1935 |
"login" use "AUTHENTICATE LOGIN" command |
| 1936 |
"loginc" use "LOGIN" command |
| 1937 |
"auto" try #{Default_imap_auth_preference.join(', ')} in this order. |
| 1938 |
|
| 1939 |
--imap-user name |
| 1940 |
specify user name of IMAP server |
| 1941 |
|
| 1942 |
--imap-password password |
| 1943 |
specify password of imap-user |
| 1944 |
|
| 1945 |
--imap-folder-clean folder |
| 1946 |
specify destination folder for clean mails. "inbox.clean" for example |
| 1947 |
|
| 1948 |
--imap-folder-spam folder |
| 1949 |
specify destination folder for spams. "inbox.spam" for example |
| 1950 |
|
| 1951 |
--imap-fetch-unseen |
| 1952 |
filter or study mails without SEEN flag |
| 1953 |
|
| 1954 |
--imap-fetch-unflagged |
| 1955 |
filter or study mails without "X-Spam-Flag" header |
| 1956 |
|
| 1957 |
--imap-reset-seen-flag |
| 1958 |
reset SEEN flag when bsfilter moves or modifies mails |
| 1959 |
|
| 1960 |
--pop |
| 1961 |
work as POP proxy |
| 1962 |
|
| 1963 |
--pid-file file |
| 1964 |
specify filename for logging process ID of bsfilter |
| 1965 |
"bsfilter.pid" in bsfilter\'s home directory is used by default |
| 1966 |
this function is valid when "--pop" is specified |
| 1967 |
|
| 1968 |
--tasktray |
| 1969 |
sit in tasktray |
| 1970 |
this is valid with "--pop" on VisualuRuby |
| 1971 |
|
| 1972 |
--pop-server hostname |
| 1973 |
specify hostname of POP server |
| 1974 |
|
| 1975 |
--pop-port number |
| 1976 |
specify port number of POP server. default is #{Default_pop_port} |
| 1977 |
|
| 1978 |
--pop-proxy-if address |
| 1979 |
specify address of interface which bsfilter listens at |
| 1980 |
default is 0.0.0.0 and all interfaces are active |
| 1981 |
|
| 1982 |
--pop-proxy-port number |
| 1983 |
specify port number which bsfilter listens at. default is #{Default_pop_proxy_port} |
| 1984 |
|
| 1985 |
--pop-user name |
| 1986 |
optional. specify username of POP server. |
| 1987 |
bsfilter checks match between value of this options and a name which MUA sends. |
| 1988 |
in case of mismatch, bsfilter closes sockets. |
| 1989 |
|
| 1990 |
--pop-proxy-set set[,set...] |
| 1991 |
specify rules of pop proxy. |
| 1992 |
alternative way of pop-server, pop-port, pop-proxy-port and pop-user option. |
| 1993 |
format of "set" is "pop-server:pop-port:[proxy-interface]:proxy-port[:pop-user]" |
| 1994 |
If proxy-interface is specified and isn\'t 0.0.0.0 , other interfaces are not used. |
| 1995 |
"--pop-proxy-set 192.168.1.1:110::10110" is equivalent with |
| 1996 |
"--pop-server 192.168.1.1 --pop-port 110 --pop-proxy-port 10110" |
| 1997 |
|
| 1998 |
--pop-max-size number |
| 1999 |
When mail is longer than the specified number, the mail is not filtered. |
| 2000 |
When 0 is specified, all mails are tested and filtered. |
| 2001 |
unit is byte. default is #{Default_pop_max_size} |
| 2002 |
|
| 2003 |
--ssl |
| 2004 |
use POP over SSL with --pop option |
| 2005 |
use IMAP over SSL with --imap option |
| 2006 |
|
| 2007 |
--ssl-cert filename|dirname |
| 2008 |
specify a filename of a certificate of a trusted CA or |
| 2009 |
a name of a directory of certificates |
| 2010 |
|
| 2011 |
--method|-m g|r|rf |
| 2012 |
specify filtering method. "rf" by default |
| 2013 |
"g" means Paul Graham method, |
| 2014 |
"r" means Gary Robinson method, |
| 2015 |
and "rf" means Robinson-Fisher method |
| 2016 |
|
| 2017 |
--spam-cutoff number |
| 2018 |
specify spam-cutoff value |
| 2019 |
0.9 by default for Paul Graham method |
| 2020 |
0.582 by default for Gary Robinson method |
| 2021 |
0.95 by default for Robinson-Fisher method |
| 2022 |
|
| 2023 |
--auto-update|-a |
| 2024 |
recognize mails, add them into clean or spam token database |
| 2025 |
and update the probability table |
| 2026 |
|
| 2027 |
--disable-degeneration|-D |
| 2028 |
disable degeneration during probability table lookup |
| 2029 |
|
| 2030 |
--disable-utf-8 |
| 2031 |
disable utf-8 support |
| 2032 |
|
| 2033 |
--refer-header header[,header...] |
| 2034 |
refer specified headers of mails |
| 2035 |
"#{Default_refer_header}" |
| 2036 |
by default |
| 2037 |
|
| 2038 |
--ignore-header|-H |
| 2039 |
ignore headers of mails |
| 2040 |
same as --refer-header "" |
| 2041 |
|
| 2042 |
--ignore-body|-B |
| 2043 |
ignore body of mails, except URL or mail address |
| 2044 |
|
| 2045 |
--ignore-plain-text-part |
| 2046 |
ignore plain text part if html part is included in the mail |
| 2047 |
|
| 2048 |
--ignore-after-last-atag |
| 2049 |
ignore text after last "A" tag |
| 2050 |
|
| 2051 |
--mark-in-token "characters" |
| 2052 |
specify characters which are allowable in a token |
| 2053 |
"#{Default_mark_in_token}" by default |
| 2054 |
|
| 2055 |
--show-process |
| 2056 |
show summary of execution |
| 2057 |
|
| 2058 |
--show-new-token |
| 2059 |
show tokens which are newly added into the token database |
| 2060 |
|
| 2061 |
--mbox |
| 2062 |
use "unix from" to divide mbox format file |
| 2063 |
|
| 2064 |
--max-mail number |
| 2065 |
reduce token database when the number of stored mails is larger than this one |
| 2066 |
#{Default_max_mail} by default |
| 2067 |
|
| 2068 |
--min-mail number |
| 2069 |
reduce token database as if this number of mails are stored |
| 2070 |
#{Default_min_mail} by default |
| 2071 |
|
| 2072 |
--pipe |
| 2073 |
write a mail to stdout. |
| 2074 |
this options is invalid when "--imap" or "--pop" is specified |
| 2075 |
|
| 2076 |
--insert-revision |
| 2077 |
insert "X-#{Default_header_prefix}-Revision: bsfilter release..." into a mail |
| 2078 |
|
| 2079 |
--insert-flag |
| 2080 |
insert "X-#{Default_header_prefix}-Flag: Yes" or "X-#{Default_header_prefix}-Flag: No" into a mail |
| 2081 |
|
| 2082 |
--insert-probability |
| 2083 |
insert "X-#{Default_header_prefix}-Probability: number" into a mail |
| 2084 |
|
| 2085 |
--header-prefix string |
| 2086 |
valid with --insert-flag and/or --insert-probability option |
| 2087 |
insert "X-specified_string-..." headers, instead of "#{Default_header_prefix}" |
| 2088 |
|
| 2089 |
--mark-spam-subject |
| 2090 |
insert "#{Default_spam_subject_prefix}" at the beginning of Subject header |
| 2091 |
|
| 2092 |
--spam-subject-prefix string |
| 2093 |
valid with --mark-spam-subject option |
| 2094 |
insert specified string, instead of "#{Default_spam_subject_prefix}" |
| 2095 |
|
| 2096 |
--show-db-status |
| 2097 |
show numbers of tokens and mails in databases and quit |
| 2098 |
|
| 2099 |
--help|-h |
| 2100 |
help |
| 2101 |
|
| 2102 |
--quiet|-q |
| 2103 |
quiet mode |
| 2104 |
|
| 2105 |
--verbose|-v |
| 2106 |
verbose mode |
| 2107 |
|
| 2108 |
--debug|-d |
| 2109 |
debug mode |
| 2110 |
|
| 2111 |
EXAMPLES |
| 2112 |
|
| 2113 |
% bsfilter -s ~/Mail/spam/* ## add spam |
| 2114 |
% bsfilter -u -c ~/Mail/job/* ~/Mail/private/* ## add clean mails and update probability table |
| 2115 |
% bsfilter ~/Mail/inbox/1 ## show spam probability |
| 2116 |
|
| 2117 |
## recipe of procmail (1) |
| 2118 |
:0 HB |
| 2119 |
* ? bsfilter -a |
| 2120 |
spam/. |
| 2121 |
|
| 2122 |
## recipe of procmail (2) |
| 2123 |
:0 fw |
| 2124 |
| bsfilter -a --pipe --insert-flag --insert-probability |
| 2125 |
|
| 2126 |
:0 |
| 2127 |
* ^X-Spam-Flag: Yes |
| 2128 |
spam/. |
| 2129 |
|
| 2130 |
LICENSE |
| 2131 |
this file is distributed under GPL version2 and might be compiled by Exerb with VisualuRuby |
| 2132 |
|
| 2133 |
SEE ALSO |
| 2134 |
http://bsfilter.org/ |
| 2135 |
http://sourceforge.jp/projects/bsfilter/ |
| 2136 |
http://exerb.sourceforge.jp/ |
| 2137 |
http://www.osk.3web.ne.jp/~nyasu/software/vrproject.html |
| 2138 |
http://www.ruby-lang.org/ |
| 2139 |
|
| 2140 |
RELEASE |
| 2141 |
#{Release} |
| 2142 |
|
| 2143 |
REVISION |
| 2144 |
#{Revision} |
| 2145 |
EOM |
| 2146 |
end |
| 2147 |
|
| 2148 |
# Cuts a mail source into individual messages.
#
# Without the "mbox" option the whole stream is treated as one message.
# With it, the stream is divided at every line that starts with "From "
# (the unix-from line of the mbox format).
class Mbox
  # options:: option table; only the "mbox" entry is consulted here
  # fh::      readable stream positioned at the start of the mail data
  def initialize(options, fh)
    @buf = Array::new               # lines of the message being collected
    @options = options
    @fh = fh
  end

  # Returns the next message as an array of lines, or nil when the
  # source holds no further message.
  def read
    return read_whole if (! @options["mbox"])
    return nil if (@buf.nil?)       # drained by an earlier call

    ## reg_ufrom = Regexp::compile('^From .*@.* \d{2}:\d{2}:\d{2} ')
    reg_ufrom = Regexp::compile('^From ')
    while (str = @fh.gets)
      if ((str =~ reg_ufrom) && (! @buf.empty?))
        # a new message begins: hand out the finished one and keep the
        # unix-from line as the first line of the next message
        ret_buf = @buf
        @buf = [str]
        return ret_buf
      end
      @buf.push(str)
    end

    # end of input: flush the pending lines and mark the source as drained
    ret_buf = @buf
    @buf = nil
    # an empty source yields nil (not an empty message), matching the
    # behaviour of the non-mbox mode
    return nil if (ret_buf.empty?)
    return ret_buf
  end

  private

  # Non-mbox mode: the rest of the stream is one message.
  def read_whole
    return nil if (@fh.eof?)
    buf = @fh.readlines
    # a file that came back as one single line ending in CR uses CR-only
    # line endings; split it at every CR
    if ((buf.length == 1) && (buf.last =~ /\r\z/))
      return buf.last.scan(/.*?\r/)
    end
    return buf
  end
end
| 2189 |
|
| 2190 |
# Applies one tokenized mail (or imported token list) to the clean and
# spam token databases of its language.
#
# db::      token database of the mail; its language selects the target
# command:: table of "add-clean", "add-spam", "sub-clean", "sub-spam",
#           "import-clean" and "import-spam" switches; defaults to the
#           global options
def update_token_db_one(db, command=@options)
  # one letter per requested operation, "-" when nothing is requested;
  # only used for the show-process line
  maintenance_command = ""
  [["add-clean", "c"], ["add-spam", "s"],
   ["sub-clean", "C"], ["sub-spam", "S"]].each do |key, letter|
    maintenance_command += letter if (command[key])
  end
  maintenance_command = "-" if (maintenance_command == "")

  show_process(db, maintenance_command) if (@options["show-process"])

  if (command["add-clean"] || command["import-clean"])
    clean = @db_hash[db.language].clean
    clean.show_new_token(db) if (@options["show-new-token"])
    clean.add_db(db)
  end
  if (command["add-spam"] || command["import-spam"])
    spam = @db_hash[db.language].spam
    spam.show_new_token(db) if (@options["show-new-token"])
    spam.add_db(db)
  end
  @db_hash[db.language].clean.sub_db(db) if (command["sub-clean"])
  if (command["sub-spam"])
    @db_hash[db.language].spam.sub_db(db)
  end
end
| 2215 |
|
| 2216 |
# Reads a token list in the exported text format and rebuilds one
# TokenDB per language from it.
#
# Every data line has four whitespace separated fields:
# language, category, token and value.  Lines of the pseudo category
# ".internal" carry bookkeeping values; only "file_count" is used.
# Comment lines (first non-blank character "#") and blank lines are
# skipped.
#
# fh:: readable stream holding the exported text
# Returns a DBHash that maps each entry of Languages to its TokenDB.
def read_exported_text(fh)
  dbs = DBHash::new
  Languages.each do |lang|
    dbs[lang] = TokenDB::new(lang)
    dbs[lang].time = Time::new
  end
  while (str = fh.gets)
    str.chomp!
    # comment line, or a blank line which has no language field and
    # would otherwise end in a lookup of dbs[nil]
    next if (str =~ /\A\s*(#|\z)/)

    (lang, category, token, val) = str.split
    val = val.to_f.to_i
    if (category == ".internal")
      if (token == "file_count")
        dbs[lang].file_count = dbs[lang].file_count + val
      end
    else
      dbs[lang].add_scalar(category, token, val)
      # the mail count comes from the ".internal file_count" line only;
      # presumably add_scalar bumps file_count itself, hence the
      # compensation -- TODO confirm against TokenDB#add_scalar
      dbs[lang].file_count = dbs[lang].file_count - 1
    end
  end
  return dbs
end
| 2240 |
|
| 2241 |
# Feeds mails (or exported token lists) into the clean/spam token
# databases as requested by the add-*/sub-*/import-* options.
#
# files:: with the "imap" option, mailbox names (optionally followed by
#         "/" and a sequence set); otherwise names handed to open_ro
# Returns the token databases built from local mail files.  The array is
# emptied when any check_size call returns a true value, so that the
# caller cannot use it for an incremental update.
def update_token_dbs(files)
  dbs = Array::new
  Languages.each do |lang|
    @db_hash[lang].clean.open("rw")
    @db_hash[lang].spam.open("rw")
  end

  if (@options["imap"])
    if (@options["ssl"])
      if (@options["ssl-cert"])
        verify_mode = OpenSSL::SSL::VERIFY_PEER
      else
        verify_mode = nil
      end
      imap = Net::IMAP::new(@options["imap-server"], @options["imap-port"], @options["ssl"], @options["ssl-cert"], verify_mode)
    else
      imap = Net::IMAP::new(@options["imap-server"], @options["imap-port"])
    end
    imap.auto_authenticate(@options, @options["imap-auth"], @options["imap-user"], @options["imap-password"], @options["imap-auth-preference"])

    files.each do |mailbox|
      # learned mails are moved to the configured clean/spam folder
      target_mailbox = mailbox
      target_mailbox = @options["imap-folder-clean"] if (@options["add-clean"] && @options["imap-folder-clean"])
      target_mailbox = @options["imap-folder-spam"] if (@options["add-spam"] && @options["imap-folder-spam"])
      uids = imap_get_target_uids(imap, mailbox)
      uids.each do |uid|
        imapm = IMAPMessage::new(@options, imap, uid)
        imapm.fetch_rfc822
        db = tokenize_buf(imapm.buf)
        db.filename = uid
        update_token_db_one(db)
        updated = imapm.insert_rfc822_headers!((@options["add-spam"] || @options["sub-clean"]), nil)
        if (updated)
          # headers changed: store the new text and drop the old message
          imapm.append(target_mailbox)
          imapm.set_delete_flag
        elsif (target_mailbox != mailbox)
          imapm.copy(target_mailbox)
          imapm.set_delete_flag
        end
      end
      imap.close
    end
    imap.logout
  else
    files.each do |file|
      open_ro(file) do |fh|
        if (@options["import-clean"] || @options["import-spam"])
          imported_dbs = read_exported_text(fh)
          imported_dbs.each do |lang, db|
            update_token_db_one(db)
          end
        else
          mbox = Mbox::new(@options, fh)
          while (buf = mbox.read)
            db = tokenize_buf(buf)
            db.filename = file
            dbs.push(db)
            if (@options["pipe"])
              insert_headers!(buf, (@options["add-spam"] || @options["sub-clean"]), nil)
              # buf is an array of lines; join it explicitly because
              # printing an Array writes its inspect form on Ruby >= 1.9
              @options["pipe-fh"].print(buf.join)
            end
            update_token_db_one(db)
          end
        end
      end
    end
  end

  slimed = false
  Languages.each do |lang|
    slimed |= @db_hash[lang].clean.check_size(@options["max-mail"], @options["min-mail"])
    slimed |= @db_hash[lang].spam.check_size(@options["max-mail"], @options["min-mail"])
    @db_hash[lang].clean.close
    @db_hash[lang].spam.close
  end
  dbs.clear if (slimed)         # disable incremental
  return dbs
end
| 2319 |
|
| 2320 |
# Learns the mails just judged: every token database flagged as spam is
# added to the spam database, every other one to the clean database,
# and the probability table of each touched language is updated.
#
# token_dbs:: array of judged token databases; it is emptied in place
#             when a check_size call returns a true value
def auto_update(token_dbs)
  langs = token_dbs.map { |token_db| token_db.language }.uniq
  langs.each do |lang|
    @db_hash[lang].clean.open("rw")
    @db_hash[lang].spam.open("rw")
  end

  # auto update only ever adds; the add-* pair is set per mail below
  command = Hash::new
  ["sub-clean", "sub-spam", "import-clean", "import-spam"].each do |key|
    command[key] = false
  end
  token_dbs.each do |token_db|
    is_spam = token_db.spam_flag ? true : false
    command["add-clean"] = (! is_spam)
    command["add-spam"] = is_spam
    update_token_db_one(token_db, command)
  end

  # both databases of every language are checked, whatever the result
  slimed = false
  langs.each do |lang|
    [@db_hash[lang].clean, @db_hash[lang].spam].each do |side|
      slimed = true if (side.check_size(@options["max-mail"], @options["min-mail"]))
    end
  end
  token_dbs.clear if (slimed)   # can't use incremental mode

  langs.each { |lang| @db_hash[lang].update_probability(token_dbs) }

  langs.each do |lang|
    @db_hash[lang].clean.close
    @db_hash[lang].spam.close
  end
end
| 2363 |
|
| 2364 |
# Reads a configuration file and converts it into a list of command
# line words.
#
# Every non-blank, non-comment line has the form "name [value]".  The
# name becomes "--name"; everything after the first run of whitespace
# is kept as a single value word.
#
# file:: path of the configuration file
# Returns an array such as ["--spam-cutoff", "0.9", "--verbose"].
def read_config_file(file)
  configs = Array::new

  # File.open rather than Kernel#open: the latter would run a command
  # when the name starts with "|"
  File.open(file) do |fh|
    while (str = fh.gets)
      next if ((str =~ /\A\s*#/) || (str =~ /\A\s*\z/))
      str.chomp!
      str.sub!(/\s+\z/, '')
      str.sub!(/\A\s+/, '')
      tokens = str.split(/\s+/, 2)
      next if (tokens.empty?)
      tokens[0] = "--" + tokens[0]
      configs.concat(tokens)
    end
  end
  return configs
end
| 2384 |
|
| 2385 |
# Selects a mailbox and returns the UIDs of the mails to be processed.
#
# imap::    connection object answering select and uid_search
# mailbox:: "folder" or "folder/sequence-set"; the part after the last
#           "/" is passed to the search as it is
#
# "imap-fetch-unseen" restricts the search to UNSEEN mails;
# "imap-fetch-unflagged" removes the mails that already carry the
# spam flag header.
def imap_get_target_uids(imap, mailbox)
  seqs = nil
  if (mailbox =~ /(.*)\/(.*)/)
    mailbox = $1
    seqs = $2
  end
  imap.select(mailbox)

  criteria = Array::new
  criteria.push("UNSEEN") if (@options["imap-fetch-unseen"])
  criteria.push(seqs) if (seqs)
  criteria.push("ALL") if (criteria.empty?)
  uids = imap.uid_search(criteria)

  if (@options["imap-fetch-unflagged"])
    header = x_spam_flag.sub(/:$/, '')
    # an empty search string matches every mail that has the header
    flagged = imap.uid_search(["HEADER", header, ""])
    if (@options["verbose"])
      # the Yes/No counts are only reported, so only query them here
      yes = imap.uid_search(["HEADER", header, "Yes"])
      no = imap.uid_search(["HEADER", header, "No"])
      @options["message-fh"].printf("imap-fetch-unflagged working original %d null %d Yes %d No %d\n",
                                    uids.length, flagged.length, yes.length, no.length)
    end
    uids = uids - flagged
    @options["message-fh"].printf("imap-fetch-unflagged worked %d\n",
                                  uids.length) if (@options["verbose"])
  end
  return uids
end
| 2419 |
|
| 2420 |
# One mail on an IMAP server, addressed by its UID, together with the
# text fetched for it.
class IMAPMessage
  include Bsutil

  attr_accessor :seqno, :uid, :imap, :buf, :seen

  # options:: option table
  # imap::    connection object used for all server commands
  # uid::     UID of the mail
  def initialize(options, imap, uid=nil)
    @options = options
    @imap = imap
    @uid = uid
    @seqno = nil
    @seen = nil
    @buf = Array::new
  end

  # Fetches text and flags of the mail.  Fills seqno, buf (text split at
  # "\n") and seen.  When the mail did not carry the Seen flag, the flag
  # is removed again after the fetch.
  def fetch_rfc822
    message = @imap.uid_fetch(@uid, ["RFC822", "FLAGS"])[0]
    @seqno = message.seqno
    @buf = message.attr["RFC822"].split("\n")
    @seen = message.attr["FLAGS"].include?(:Seen)
    @imap.uid_store(@uid, "-FLAGS", [:Seen]) if (! @seen)
  end

  # Hands the fetched text and the given arguments to insert_headers!
  # and returns its result.
  def insert_rfc822_headers!(*args)
    return insert_headers!(@buf, *args)
  end

  # Hands the fetched text, header and content to insert_header!.
  def insert_rfc822_header!(header, content)
    insert_header!(@buf, header, content)
  end

  # Stores the (possibly modified) text as a new mail in mailbox.  Every
  # line is terminated with CR LF first; the Seen flag is carried over.
  def append(mailbox)
    @buf.map! { |line| line.sub(/[\r\n]*\z/, "\r\n") }
    flags = @seen ? [:Seen] : []
    @imap.append(mailbox, @buf.join, flags)
  end

  # Copies the mail on the server into mailbox.
  def copy(mailbox)
    @imap.uid_copy(@uid, mailbox)
  end

  # Adds the Deleted flag to the mail.
  def set_delete_flag
    @imap.uid_store(@uid, "+FLAGS", [:Deleted])
  end

  # Forgets the Seen state locally and removes the flag on the server.
  def reset_seen_flag
    @seen = false
    @imap.uid_store(@uid, "-FLAGS", [:Seen])
  end
end # end of class IMAPMessage
| 2480 |
|
| 2481 |
# Sends one POP3 command (if any) and collects the answer.
#
# command:: command line including its line terminator, or nil to only
#           read one line from the socket
# socket::  object answering write_timeout and gets_timeout
#
# Returns the answer as an array of lines.  It holds one line, or for a
# positive answer to RETR, TOP, CAPA and argument-less UIDL/LIST all
# lines up to and including the terminating ".\r\n".  The single
# element is nil when the peer closed the connection.
# Raises EOFError when the connection ends inside a multi-line answer.
def socket_send_rec(command, socket)
  buf = Array::new
  if (command)
    @options["message-fh"].printf("send %s %s", socket, command.sub(/\APASS.*/i, "PASS ********")) if (@options["debug"])
    socket.write_timeout(command) # pass command to pop-server
  end
  response = socket.gets_timeout # get response from pop-server
  buf.push(response)
  return buf if (response.nil?) # peer is gone; callers test buf[0]

  @options["message-fh"].printf("resp %s %s", socket, response.sub(/\APASS.*/i, "PASS ********")) if (@options["debug"])

  # the command word is matched at the start of the line only, so that
  # an argument such as "USER stop" is not mistaken for TOP
  multi_line = (command &&
                ((command =~ /\A(RETR|TOP|CAPA)\b/i) ||
                 (command =~ /\A(UIDL|LIST)[^\d]*\z/i)))
  if ((response =~ /\A\+OK/) && multi_line)
    while (response != ".\r\n")
      response = socket.gets_timeout
      # without this check a closed connection would loop forever
      raise EOFError, "connection closed inside a multi-line response" if (response.nil?)
      buf.push(response)
    end
  end
  return buf
end
| 2500 |
|
| 2501 |
# Starts one proxy thread per rule and waits for all of them.
#
# pop_proxy_sets:: comma separated rules, each of the form
#                  "pop-server:pop-port:proxy-interface:proxy-port:pop-user";
#                  empty or missing port/interface fields fall back to
#                  the Default_pop_* constants
# Returns 0 after every thread has finished.
def pop_proxy_multi(pop_proxy_sets)
  trap("SIGINT") do
    @options["message-fh"].printf("SIGINT received\n") if (@options["verbose"])
    @threads.each { |child| Thread::kill(child) }   # kill child threads
  end

  pop_proxy_sets.split(/,/).each do |rule|
    (server, port, proxy_if, proxy_port, user) = rule.split(/:/)
    port = Default_pop_port if (port.nil? || port == '')
    proxy_if = Default_pop_proxy_if if (proxy_if.nil? || proxy_if == '')
    proxy_port = Default_pop_proxy_port if (proxy_port.nil? || proxy_port == '')
    # start child thread
    @threads.push(Thread::start { pop_proxy_one(server, port, proxy_if, proxy_port, user) })
  end

  @threads.each { |child| child.join }              # join child threads

  # join grandchild threads
  Thread::list.each do |thread|
    next if (thread == Thread::current)
    thread.join
  end
  return 0
end
| 2528 |
|
| 2529 |
# Relays a RETR command and its answer without filtering the mail.
#
# command::          the RETR command line received from the MUA
# pop_socket::       connection to the POP server
# pop_proxy_socket:: connection to the MUA
#
# Raises EOFError when the server closes the connection before the
# answer is complete.
def pop_bypass_large_mail(command, pop_socket, pop_proxy_socket)
  pop_socket.write_timeout(command)     # RETR to server
  str = pop_socket.gets_timeout         # response from server
  raise EOFError, "connection closed by POP server" if (str.nil?)
  pop_proxy_socket.write_timeout(str)   # forward
  # only a positive status line is followed by the mail; an error
  # answer ("-ERR ...") is a single line
  return if (! (str =~ /\A\+OK/i))

  while (str != ".\r\n")
    Timeout.timeout(SOCKET_TIMEOUT) do
      str = pop_socket.gets
      # a closed connection would otherwise be relayed as an endless
      # stream of empty writes
      raise EOFError, "connection closed inside a mail" if (str.nil?)
      pop_proxy_socket.write(str)       # forward
    end
  end
  return
end
| 2542 |
|
| 2543 |
# Extracts mail sizes from the answer to a LIST command.
#
# strs:: the answer as an array of lines
# Returns a DBHash mapping message number (a String) to size (an
# Integer).  It is empty when the answer carries no size.
def snoop_list_response(strs)
  h = DBHash::new
  # "LIST n" is answered by the single line "+OK n size".  The status
  # word is matched case-insensitively because servers send "+OK".
  if ((strs.length == 1) && (strs[0] =~ /\A\+OK\s*(\d+)\s+(\d+)/i))
    h[$1] = $2.to_i
  else
    # plain "LIST": one "number size" line per mail after the status line
    strs.each do |str|
      if (str =~ /^(\d+)\s+(\d+)/)
        h[$1] = $2.to_i
      end
    end
  end
  return h
end
| 2556 |
|
| 2557 |
# Runs one POP proxy: listens on the given interface and port and
# serves every accepted MUA connection in a thread of its own.  Mails
# passing through (answers to TOP and RETR) are judged and get the spam
# headers inserted.  Mails larger than "pop-max-size" are relayed
# unfiltered.  This method does not return.
#
# pop_server, pop_port::         POP server to connect to
# pop_proxy_if, pop_proxy_port:: address and port to listen at
# pop_user::                     when set, the only user name accepted
def pop_proxy_one(pop_server, pop_port, pop_proxy_if, pop_proxy_port, pop_user)
  gs = TCPServer.open(pop_proxy_if, pop_proxy_port)
  addr = gs.addr
  addr.shift
  @options["message-fh"].printf("pop_proxy is on %s\n", addr.join(":")) if (@options["verbose"])
  while true
    Thread::start(gs.accept) do |pop_proxy_socket| # start grandchild threads
      @options["message-fh"].print(pop_proxy_socket, " is accepted\n") if (@options["verbose"])
      begin
        pop_socket = nil
        Timeout.timeout(SOCKET_TIMEOUT) do
          pop_socket = TCPSocket.open(pop_server, pop_port)
        end
        @options["message-fh"].print(pop_socket, " is connected\n") if (@options["verbose"])

        pop_socket = get_ssl_socket(pop_socket, @options["ssl-cert"]) if (@options["ssl"])

        # relay the greeting of the server, marked as proxied
        hello = socket_send_rec(nil, pop_socket)[0]
        hello.sub!(/(.*)\r/, "\\1(pop_proxy by bsfilter)\r")
        pop_proxy_socket.write(hello)

        sizes = DBHash::new     # message number => size, learned from LIST
        while (command = socket_send_rec(nil, pop_proxy_socket)[0]) # get command from MUA
          if (command =~ /\ARETR\s+(\d+)/i)
            n = $1
            if (sizes[n] &&
                (0 < @options["pop-max-size"]) && (@options["pop-max-size"] < sizes[n]))
              pop_bypass_large_mail(command, pop_socket, pop_proxy_socket)
              next
            end
          end
          response = socket_send_rec(command, pop_socket)
          if (command =~ /\ALIST/i)
            sizes.update(snoop_list_response(response))
          elsif ((command =~ /\A(TOP|RETR)/i) && (response[0] =~ /\A\+OK/))
            buf = response[1..-1].dup
            token_db = tokenize_buf(buf)
            @db_hash[token_db.language].prob.open("r")
            @db_hash[token_db.language].get_combined_probability(token_db)
            @db_hash[token_db.language].prob.close
            if (@options["auto-update"])
              auto_update([token_db])
            elsif (@options["show-process"])
              show_process(token_db, "-")
            end
            @options["message-fh"].printf("combined probability %f\n", token_db.probability) if (@options["verbose"])
            insert_headers!(buf, token_db.spam_flag, token_db.probability)
            response[1..-1] = buf
          end
          # don't use elsif
          # response is an array of lines and is joined before writing:
          # writing an Array sends its inspect form on Ruby >= 1.9.
          # The command words are matched at the start of the line so
          # that e.g. a password containing "quit" does not end the session.
          if (command =~ /\AQUIT/i)
            pop_proxy_socket.write(response.join) # return response to MUA
            break
          elsif ((command =~ /\AUSER\s*(\S*)\r/i) &&
                 (pop_user && pop_user != $1))
            @options["message-fh"].printf("username unmatch error\n")
            pop_proxy_socket.write("-ERR unregistered user\r\n") # return response to MUA
            break
          else
            pop_proxy_socket.write(response.join) # return response to MUA
          end
        end
      rescue Timeout::Error
        @options["message-fh"].printf("Timeout error %s %s %s\n", pop_server, pop_port, pop_proxy_port) if (@options["verbose"])
      rescue
        @options["message-fh"].printf("pop exception caught %s %s %s\n", pop_server, pop_port, pop_proxy_port) if (@options["verbose"])
        p "#{$!}" if (@options["verbose"])
        p "#{$@}" if (@options["debug"])
      ensure
        # both ends are closed however the session ended
        if (pop_proxy_socket && ! pop_proxy_socket.closed?)
          @options["message-fh"].print(pop_proxy_socket, " is gone\n") if (@options["verbose"])
          pop_proxy_socket.close
        end
        if (pop_socket && ! pop_socket.closed?)
          @options["message-fh"].print(pop_socket, " is gone\n") if (@options["verbose"])
          pop_socket.close
        end
      end
    end # thread end
  end
end
| 2638 |
|
| 2639 |
# Completes and checks the options of the POP proxy mode.
#
# Missing port, interface and size options are filled from the
# Default_* constants; "icon_number" and "pop-max-size" are stored as
# integers.  With "tasktray" the VisualuRuby libraries are loaded.
#
# options:: option table, modified in place
# Raises RuntimeError when neither "pop-proxy-set" nor "pop-server" is
# given.
def check_options_for_pop!(options)
  options["icon_number"] = (options["icon-number"] || Default_icon_number).to_i
  options["pop-port"] ||= Default_pop_port
  options["pop-proxy-if"] ||= Default_pop_proxy_if
  options["pop-proxy-port"] ||= Default_pop_proxy_port
  options["pop-max-size"] = (options["pop-max-size"] || Default_pop_max_size).to_i

  if (options["tasktray"])
    # loaded on demand: only available with VisualuRuby
    require('vr/vrcontrol')
    require('vr/vrtray')
  end

  # a proxy rule set replaces the separate server option
  missing = Array::new
  missing.push("pop-server") if ((! options["pop-proxy-set"]) && (! options["pop-server"]))
  missing.each { |name| printf("specify %s\n", name) }

  raise "error found in pop options" if (! missing.empty?)
  return
end
| 2664 |
|
| 2665 |
# Completes and checks the options of the IMAP mode.
#
# options:: option table; "imap-port" is filled from Default_imap_port
#           when missing
# Raises RuntimeError when one of "imap-server", "imap-auth",
# "imap-user" or "imap-password" is missing; every missing name is
# reported on stdout first.
def check_options_for_imap!(options)
  options["imap-port"] ||= Default_imap_port

  required = ["imap-server", "imap-auth", "imap-user", "imap-password"]
  missing = required.reject { |name| options[name] }
  missing.each { |name| printf("specify %s\n", name) }

  raise "error found in imap options" if (! missing.empty?)
  return
end
| 2678 |
|
| 2679 |
# Judges the mails of the given IMAP mailboxes.
#
# Every mail is fetched, tokenized and rated.  The spam headers are
# inserted; mails whose text changed are appended anew, and mails that
# belong to another folder ("imap-folder-spam" / "imap-folder-clean")
# are copied there.  In both cases the original gets the Deleted flag.
#
# command_line_args:: mailbox names, see imap_get_target_uids
# token_dbs::         array that receives the token database of every mail
# Returns CODE_SPAM when at least one mail was judged spam, otherwise
# CODE_CLEAN.
def do_imap(command_line_args, token_dbs)
  ret_code = CODE_CLEAN

  server = @options["imap-server"]
  port = @options["imap-port"]
  if (@options["ssl"])
    # the server certificate is only verified when a CA is configured
    verify_mode = @options["ssl-cert"] ? OpenSSL::SSL::VERIFY_PEER : nil
    imap = Net::IMAP::new(server, port, @options["ssl"], @options["ssl-cert"], verify_mode)
  else
    imap = Net::IMAP::new(server, port)
  end
  imap.auto_authenticate(@options, @options["imap-auth"], @options["imap-user"], @options["imap-password"], @options["imap-auth-preference"])

  # only for check: selecting fails early when a folder does not exist
  ["imap-folder-clean", "imap-folder-spam"].each do |key|
    imap.select(@options[key]) if (@options[key])
  end

  command_line_args.each do |mailbox|
    imap_get_target_uids(imap, mailbox).each do |uid|
      imapm = IMAPMessage::new(@options, imap, uid)
      imapm.fetch_rfc822
      token_db = tokenize_buf(imapm.buf)
      token_db.filename = uid
      @db_hash[token_db.language].get_combined_probability(token_db)
      token_dbs.push(token_db)
      @options["message-fh"].printf("combined probability %s %d %f\n", mailbox, imapm.seqno, token_db.probability) if (@options["verbose"])

      if (token_db.spam_flag)
        folder = @options["imap-folder-spam"]
        ret_code = CODE_SPAM
      else
        folder = @options["imap-folder-clean"]
      end
      target_mailbox = folder || mailbox   # stay in place without a folder option

      updated = imapm.insert_rfc822_headers!(token_db.spam_flag, token_db.probability)
      next if ((! updated) && (target_mailbox == mailbox))   # nothing to move or rewrite

      imapm.reset_seen_flag if (@options["imap-reset-seen-flag"])
      if (updated)
        imapm.append(target_mailbox)
      else
        imapm.copy(target_mailbox)
      end
      imapm.set_delete_flag
    end
    imap.close
  end
  imap.logout
  return ret_code
end
| 2730 |
|
| 2731 |
|
| 2732 |
# Writes the databases selected by "export-clean", "export-spam" and
# "export-probability" in text format.
#
# command_line_args:: its first entry names the output file; "-" is
#                     used when it is empty
#
# The output is opened with open_wo once per selected database kind.
# Databases whose file_count is not positive are skipped.
def do_export(command_line_args)
  file = command_line_args.empty? ? "-" : command_line_args[0]

  result = nil
  [["export-clean", :clean],
   ["export-spam", :spam],
   ["export-probability", :prob]].each do |option, kind|
    if (@options[option])
      result = open_wo(file) do |fh|
        Languages.each do |lang|
          table = @db_hash[lang].__send__(kind)
          table.open("r")
          table.export(fh) if (table.file_count > 0)
          table.close
        end
      end
    else
      result = nil
    end
  end
  # as before, the value of the last (probability) step is the result
  result
end
| 2766 |
|
| 2767 |
# Adds the method auto_authenticate to Net::IMAP.
#
# auto_authenticate(options, auth, user, password, auth_list=[]) logs
# in with the mechanism named by auth:
# "loginc":: the LOGIN command
# "auto"::   the first entry of auth_list that is "loginc" or is
#            advertised by the server as "AUTH=..."; raises
#            RuntimeError when none fits
# other::    the AUTHENTICATE command with that mechanism
def setup_imap
  Net::IMAP.class_eval do
    def auto_authenticate(options, auth, user, password, auth_list=[])
      mechanism = auth.downcase
      if (mechanism == "loginc")
        if (options["verbose"])
          options["message-fh"].printf("try to login imap server for %s with login command\n", user)
        end
        return login(user, password)
      end

      if (mechanism != "auto")
        if (options["verbose"])
          options["message-fh"].printf("try to login imap server for %s with authenticate %s\n", user, auth)
        end
        return authenticate(auth, user, password)
      end

      capa = capability
      auth_list.each do |candidate|
        if (candidate == "loginc")
          return auto_authenticate(options, "loginc", user, password)
        elsif (capa.include?("AUTH=" + candidate.upcase))
          return auto_authenticate(options, candidate, user, password)
        end
      end
      raise sprintf("can't login imap server for %s with %s", user, auth_list)
    end
  end
end
| 2795 |
|
| 2796 |
# Adds write_timeout and gets_timeout to TCPSocket.  Both behave like
# write and gets but are limited to SOCKET_TIMEOUT seconds, after which
# Timeout::Error is raised.
#
# The code is evaluated from a string so that SOCKET_TIMEOUT is looked
# up when this method runs.  Timeout.timeout is called explicitly; the
# bare Kernel#timeout alias is deprecated and absent from newer Rubies.
def setup_socket_timeout
  TCPSocket.class_eval <<EOM
    def write_timeout(str)
      Timeout.timeout(SOCKET_TIMEOUT) do
        return self.write(str)
      end
    end
    def gets_timeout
      Timeout.timeout(SOCKET_TIMEOUT) do
        return self.gets
      end
    end
EOM
end
| 2810 |
|
| 2811 |
# Adds write_timeout and gets_timeout to OpenSSL::SSL::SSLSocket.  Both
# behave like write and gets but are limited to SOCKET_TIMEOUT seconds,
# after which Timeout::Error is raised.
#
# The code is evaluated from a string so that SOCKET_TIMEOUT is looked
# up when this method runs.  Timeout.timeout is called explicitly; the
# bare Kernel#timeout alias is deprecated and absent from newer Rubies.
def setup_ssl_socket_timeout
  OpenSSL::SSL::SSLSocket.class_eval <<EOM
    def write_timeout(str)
      Timeout.timeout(SOCKET_TIMEOUT) do
        return self.write(str)
      end
    end
    def gets_timeout
      Timeout.timeout(SOCKET_TIMEOUT) do
        return self.gets
      end
    end
EOM
end
| 2825 |
|
| 2826 |
# Wraps a connected socket into an SSL connection and performs the
# handshake.
#
# socket:: connected socket to the server
# cert::   optional name of a CA certificate file or of a directory of
#          certificates; when given, the peer certificate is verified
#          (VERIFY_PEER), otherwise the verify mode is left untouched
# Returns the connected OpenSSL::SSL::SSLSocket.
def get_ssl_socket(socket, cert=nil)
  context = OpenSSL::SSL::SSLContext::new()

  if (cert)
    if (FileTest::file?(cert))
      @options["message-fh"].print(cert, " is used for SSL ca_file\n") if (@options["verbose"])
      context.ca_file = cert
    elsif (FileTest::directory?(cert))
      @options["message-fh"].print(cert, " is used for SSL ca_path\n") if (@options["verbose"])
      context.ca_path = cert
    end
    context.verify_mode = OpenSSL::SSL::VERIFY_PEER
  end
  ssl = OpenSSL::SSL::SSLSocket::new(socket, context)
  # callers only keep the SSL socket; let its close also close the
  # underlying connection
  ssl.sync_close = true
  ssl.connect
  # reported on the message stream like the other verbose messages
  @options["message-fh"].print(ssl, " is connected\n") if (@options["verbose"])
  return ssl
end
| 2844 |
|
| 2845 |
# Shows a tray icon with an "exit" menu (VisualuRuby) and runs its
# message loop.  When the loop ends, all proxy threads are killed.
#
# The form class is defined through eval because VRForm is only
# available after the VisualuRuby libraries have been loaded.
def setup_tasktray
  eval <<EOM
    class MyForm < VRForm
      include VRTrayiconFeasible
      include VRMenuUseable
      LoadIcon = Win32API.new("user32", "LoadIcon", "II", "I")

      def construct
        @traymenu = newPopupMenu
        @traymenu.set([
                       ["exit", "exit"]
                      ])
        @mytrayicon=0
      end
      def self_trayrbuttonup(iconid)
        showPopup @traymenu
      end
      def into_trayicon(icon_number)
        create_trayicon(LoadIcon.call(0, icon_number),
                        "bsfilter release #{Release} revision #{Revision}", @mytrayicon)
        myexstyle = self.exwinstyle
        myexstyle.ws_ex_toolwindow = true
        myexstyle.ws_ex_appwindow = false
      end

      def exit_clicked
        delete_trayicon(@mytrayicon)
        self.close
      end
    end
EOM
  form = VRLocalScreen.newform(nil, nil, MyForm)
  form.create
  form.into_trayicon(@options["icon_number"])
  VRLocalScreen.messageloop   # returns once the form has been closed
  @threads.each { |child| Thread::kill(child) }   # kill child threads
end
| 2884 |
|
| 2885 |
## Run POP proxy mode.
##
## Turns on Thread.abort_on_exception, starts setup_tasktray in its own
## thread when @options["tasktray"] is set, builds the proxy-set
## specification and passes it to pop_proxy_multi.
##
## The specification is @options["pop-proxy-set"] with all whitespace
## removed, or, when that option is absent, the pop-server, pop-port,
## pop-proxy-if, pop-proxy-port and pop-user options joined with ":".
##
## Returns the value returned by pop_proxy_multi.
def do_pop
  Thread.abort_on_exception = true
  if (@options["verbose"])
    @options["message-fh"].print("pop mode start ", Time::new.to_s, "\n")
  end

  Thread::start { setup_tasktray } if (@options["tasktray"])

  explicit_sets = @options["pop-proxy-set"]
  pop_proxy_sets =
    if (explicit_sets)
      explicit_sets.gsub(/\s/, '')
    else
      @options.values_at("pop-server", "pop-port", "pop-proxy-if",
                         "pop-proxy-port", "pop-user").join(":")
    end
  ret_code = pop_proxy_multi(pop_proxy_sets)

  # NOTE: the original author marks everything below as "never reached".
  if (@options["verbose"])
    @options["message-fh"].print("pop mode end ", Time::new.to_s, "\n")
  end
  return ret_code
end
| 2907 |
|
| 2908 |
## Record the id of the current process in a file.
##
## file:: path of the pid file. It is opened with mode "w" and the pid
##        followed by a newline is written to it.
##
## File.open is used instead of Kernel#open so that a path beginning
## with "|" is treated as a file name and never as a command to run.
def write_pid_file(file)
  File.open(file, "w") do |fh|
    fh.print(Process::pid, "\n")
  end
end
| 2913 |
|
| 2914 |
## Parse the options found in ARGV.
##
## Parsing is done by GetoptLong in REQUIRE_ORDER mode with its own error
## output silenced (quiet). Each option is stored in a new DBHash under
## its long name without the leading "--"; the value is a copy of the
## option's argument. An option listed in allow_multi that is given
## several times has its arguments joined with ","; for every other
## option a later occurrence replaces the earlier one.
##
## Returns the DBHash of parsed options.
##
## Raises RuntimeError (after calling usage) when parsing fails; the
## message is the parser's error message.
def parse_command_line
  options = DBHash::new

  parser = GetoptLong.new
  parser.ordering = GetoptLong::REQUIRE_ORDER
  parser.set_options(
    ["--icon-number", GetoptLong::REQUIRED_ARGUMENT],
    ["--ssl", GetoptLong::NO_ARGUMENT],
    ["--ssl-cert", GetoptLong::REQUIRED_ARGUMENT],
    ["--pop", GetoptLong::NO_ARGUMENT],
    ["--tasktray", GetoptLong::NO_ARGUMENT],
    ["--pop-proxy-set", GetoptLong::REQUIRED_ARGUMENT],
    ["--pop-server", GetoptLong::REQUIRED_ARGUMENT],
    ["--pop-port", GetoptLong::REQUIRED_ARGUMENT],
    ["--pop-proxy-if", GetoptLong::REQUIRED_ARGUMENT],
    ["--pop-proxy-port", GetoptLong::REQUIRED_ARGUMENT],
    ["--pop-user", GetoptLong::REQUIRED_ARGUMENT],
    ["--pop-max-size", GetoptLong::REQUIRED_ARGUMENT],
    ["--imap", GetoptLong::NO_ARGUMENT],
    ["--imap-server", GetoptLong::REQUIRED_ARGUMENT],
    ["--imap-port", GetoptLong::REQUIRED_ARGUMENT],
    ["--imap-auth", GetoptLong::REQUIRED_ARGUMENT],
    ["--imap-user", GetoptLong::REQUIRED_ARGUMENT],
    ["--imap-password", GetoptLong::REQUIRED_ARGUMENT],
    ["--imap-folder-clean", GetoptLong::REQUIRED_ARGUMENT],
    ["--imap-folder-spam", GetoptLong::REQUIRED_ARGUMENT],
    ["--imap-fetch-unseen", GetoptLong::NO_ARGUMENT],
    ["--imap-fetch-unflagged", GetoptLong::NO_ARGUMENT],
    ["--imap-reset-seen-flag", GetoptLong::NO_ARGUMENT],
    ["--homedir", GetoptLong::REQUIRED_ARGUMENT],
    ["--config-file", GetoptLong::REQUIRED_ARGUMENT],
    ["--pid-file", GetoptLong::REQUIRED_ARGUMENT],
    ["--db", GetoptLong::REQUIRED_ARGUMENT],
    ["--max-line", GetoptLong::REQUIRED_ARGUMENT],
    ["--export-clean", GetoptLong::NO_ARGUMENT],
    ["--export-spam", GetoptLong::NO_ARGUMENT],
    ["--export-probability", GetoptLong::NO_ARGUMENT],
    ["--import-clean", GetoptLong::NO_ARGUMENT],
    ["--import-spam", GetoptLong::NO_ARGUMENT],
    ["--mbox", GetoptLong::NO_ARGUMENT],
    ["--jtokenizer", "-j", GetoptLong::REQUIRED_ARGUMENT],
    ["--method", "-m", GetoptLong::REQUIRED_ARGUMENT],
    ["--spam-cutoff", GetoptLong::REQUIRED_ARGUMENT],
    ["--mark-in-token", GetoptLong::REQUIRED_ARGUMENT],
    ["--max-mail", GetoptLong::REQUIRED_ARGUMENT],
    ["--min-mail", GetoptLong::REQUIRED_ARGUMENT],
    ["--show-new-token", GetoptLong::NO_ARGUMENT],
    ["--auto-update", "-a", GetoptLong::NO_ARGUMENT],
    ["--update", "-u", GetoptLong::NO_ARGUMENT],
    ["--add-clean", "-c", GetoptLong::NO_ARGUMENT],
    ["--add-spam", "-s", GetoptLong::NO_ARGUMENT],
    ["--sub-clean", "-C", GetoptLong::NO_ARGUMENT],
    ["--sub-spam", "-S", GetoptLong::NO_ARGUMENT],
    ["--disable-degeneration", "-D", GetoptLong::NO_ARGUMENT],
    ["--disable-utf-8", GetoptLong::NO_ARGUMENT],
    ["--ignore-body", "-B", GetoptLong::NO_ARGUMENT],
    ["--refer-header", GetoptLong::REQUIRED_ARGUMENT],
    ["--ignore-header", "-H", GetoptLong::NO_ARGUMENT],
    ["--ignore-plain-text-part", GetoptLong::NO_ARGUMENT],
    ["--ignore-after-last-atag", GetoptLong::NO_ARGUMENT],
    ["--pipe", GetoptLong::NO_ARGUMENT],
    ["--insert-revision", GetoptLong::NO_ARGUMENT],
    ["--insert-flag", GetoptLong::NO_ARGUMENT],
    ["--insert-probability", GetoptLong::NO_ARGUMENT],
    ["--header-prefix", GetoptLong::REQUIRED_ARGUMENT],
    ["--mark-spam-subject", GetoptLong::NO_ARGUMENT],
    ["--spam-subject-prefix", GetoptLong::REQUIRED_ARGUMENT],
    ["--list-clean", GetoptLong::NO_ARGUMENT],
    ["--list-spam", GetoptLong::NO_ARGUMENT],
    ["--show-db-status", GetoptLong::NO_ARGUMENT],
    ["--show-process", GetoptLong::NO_ARGUMENT],
    ["--help", "-h", GetoptLong::NO_ARGUMENT],
    ["--revision", GetoptLong::NO_ARGUMENT],
    ["--quiet", "-q", GetoptLong::NO_ARGUMENT],
    ["--debug", "-d", GetoptLong::NO_ARGUMENT],
    ["--verbose", "-v", GetoptLong::NO_ARGUMENT])

  # options whose repeated occurrences are accumulated instead of replaced
  allow_multi = {"pop-proxy-set" => true}

  parser.quiet = true
  begin
    parser.each_option do |name, arg|
      # Build a new key instead of editing name in place: the string
      # yielded here belongs to the parser (it comes from the option
      # table above) and must not be modified.
      key = name.sub(/\A--/, '')
      if (options[key] && allow_multi[key])
        options[key] += ("," + arg)
      else
        options[key] = arg.dup
      end
    end
  rescue => error
    usage
    # error_message is nil when the failure did not come from the parser
    # itself; fall back to the rescued error's own message in that case.
    raise(parser.error_message || error.message)
  end
  return options
end
| 3009 |
|
| 3010 |
|
| 3011 |
## Build the complete option table for one run.
##
## Steps, in order:
## 1. parse the command line;
## 2. pick the home directory (--homedir, $BSFILTERHOME, $HOME plus
##    Default_homedir, the Exerb executable's directory, or the
##    directory of $0);
## 3. when the config file exists, rebuild ARGV as the config file's
##    arguments followed by the original command line and parse again;
## 4. handle --help / --revision (both exit with status 0);
## 5. fill in defaults, validate --method, --db and --jtokenizer, and
##    require the libraries the selected features need;
## 6. store a Jtokenizer for the chosen tokenizer in @jtokenizer.
##
## Returns the option table from parse_command_line with defaults and
## derived entries added ("degeneration", "use-body", "utf-8",
## "imap-auth-preference", and "refer-header" replaced by a hash whose
## keys are lower-cased header names).
##
## Raises RuntimeError when --config-file names a missing file, when the
## method, db or jtokenizer is unsupported, or when --ssl-cert is not
## readable.
def get_options
  # parse_command_line works on ARGV itself, so keep an independent copy
  # of every argument for the second pass below.
  argv_backup = ARGV.map { |arg| arg.dup }
  options = parse_command_line

  if (options["config-file"] && (! File::file?(options["config-file"])))
    raise sprintf("can't find config file %s\n", options["config-file"])
  end

  if (! options["homedir"])
    if (ENV["BSFILTERHOME"])
      options["homedir"] = ENV["BSFILTERHOME"]
    elsif (ENV["HOME"])
      options["homedir"] = ENV["HOME"] + "/" + Default_homedir
    elsif (defined?(Exerb) && Exerb.runtime?)
      options["homedir"] = File.dirname(Exerb.filepath)
    else
      options["homedir"] = File.dirname($0)
    end
  end

  if (! options["config-file"])
    options["config-file"] = options["homedir"] + "/" + Default_conf_file
  end
  if (options["config-file"] && File::file?(options["config-file"]))
    # Config-file arguments go first so that, for ordinary options, a
    # value given on the command line is parsed later and replaces them.
    ARGV.clear
    argv_config = read_config_file(options["config-file"])
    ARGV.concat(argv_config + argv_backup)
    options.update(parse_command_line)
  end

  if (options["help"])
    usage
    exit 0
  end
  if (options["revision"])
    print "bsfilter release #{Release} revision #{Revision}\n"
    exit 0
  end

  # exactly one trailing slash
  options["homedir"] = options["homedir"].sub(/\/*$/, '') + "/"

  if (options["method"])
    if (options["method"] !~ /\A(g|r|rf)\z/)
      usage
      raise sprintf("unsupported method %s\n", options["method"])
    end
  else
    options["method"] = Default_method
  end

  options["header-prefix"] = Default_header_prefix if (! options["header-prefix"])
  options["spam-subject-prefix"] = Default_spam_subject_prefix if (! options["spam-subject-prefix"])

  # the database library is loaded only once the backend is known
  options["db"] = Default_db if (! options["db"])
  case options["db"]
  when "sdbm"
    require 'sdbm'
  when "gdbm"
    require 'gdbm'
  when "bdb1"
    require 'bdb1'
  when "bdb"
    require 'bdb'
  when "qdbm"
    require 'depot'
  else
    raise sprintf("unsupported db %s\n", options["db"])
  end

  if (options["jtokenizer"])
    options["jtokenizer"] = options["jtokenizer"].downcase
  else
    options["jtokenizer"] = Default_jtokenizer
  end
  case options["jtokenizer"]
  when "bigram", "block"
    # no library is required for these two
  when "mecab"
    require 'MeCab'
  when "chasen"
    require 'chasen.o'
  when "kakasi"
    require 'kakasi'
  else
    raise sprintf("unsupported jtokenizer %s\n", options["jtokenizer"])
  end
  @jtokenizer = Jtokenizer::new(options["jtokenizer"])

  # Strip whitespace into a new string: when the default is in use the
  # value is the Default_mark_in_token constant itself, which must not
  # be edited in place.
  options['mark-in-token'] = (options['mark-in-token'] || Default_mark_in_token).gsub(/\s/, '')
  options["max-line"] = (options["max-line"] || Default_max_line).to_i
  options["max-mail"] = (options["max-mail"] || Default_max_mail).to_i
  options["min-mail"] = (options["min-mail"] || Default_min_mail).to_i

  options["degeneration"] = options["disable-degeneration"] ? false : true

  # --refer-header wins over --ignore-header; with neither, the default
  # header list is used.
  if (options["refer-header"])
    array = options["refer-header"].downcase.split(',')
  elsif (options["ignore-header"])
    array = Array::new
  else
    array = Default_refer_header.downcase.split(',')
  end
  options["refer-header"] = Hash::new
  array.each do |header|
    options["refer-header"][header] = true
  end

  options["use-body"] = options["ignore-body"] ? false : true

  options["pid-file"] = options["homedir"] + Default_pid_file if (! options["pid-file"])

  options["imap-auth"] = options["imap-auth"] || Default_imap_auth
  options["imap-auth-preference"] = Default_imap_auth_preference # can't modify with command line option

  # UTF-8 support is on only when it is not disabled and iconv loads
  if ((! options["disable-utf-8"]) &&
      safe_require("iconv"))
    options["utf-8"] = true
    define_safe_iconv if (! defined?(Iconv.safe_iconv))
  else
    options["utf-8"] = false
  end

  if (options["pop"])
    check_options_for_pop!(options)
    require 'timeout'
    require 'socket'
    setup_socket_timeout
  end
  if (options["imap"])
    check_options_for_imap!(options)
    require 'net/imap'
    setup_imap
  end
  if (options["ssl"])
    if (options["ssl-cert"] && (! File::readable?(options["ssl-cert"])))
      raise sprintf("can't read %s. check --ssl-cert option", options["ssl-cert"])
    end
    require "openssl"
    setup_ssl_socket_timeout
  end
  return options
end
| 3158 |
|
| 3159 |
def show_db_status |
| 3160 |
Languages.each do |lang| |
| 3161 |
@db_hash[lang].clean.open("r") |
| 3162 |
@db_hash[lang].spam.open("r") |
| 3163 |
@db_hash[lang].prob.open("r") |
| 3164 |
@options["
| |