• R/O
  • SSH
  • HTTPS

mail2weblog: Commit


Commit MetaInfo

Revision: 561 (tree)
Time: 2012-02-27 15:24:26
Author: toy_dev

Log Message

HTML の解析機能を理想的な HTML を元にして一通り作成した。テストも一通り作成し通した。
HTML 解析機能の今後については、実際の HTML メールを解析したテストを追加し、完成度を高めていくこととする。

Change Summary

Incremental Difference

--- branches/private/toydev/htmlmail/test/m2w/test_m2w_mail_parser_html_text_type.rb (revision 560)
+++ branches/private/toydev/htmlmail/test/m2w/test_m2w_mail_parser_html_text_type.rb (revision 561)
@@ -6,51 +6,247 @@
66 $:.unshift(ROOT_PATH + "/../../main/")
77 $:.unshift(ROOT_PATH + "/../../main/lib")
88
9-# mail2weblog のルートパスを設定する
9+# mail2weblog のルートパスを設定する
1010 M2W_ROOT_PATH = ROOT_PATH + "/../../main/"
1111
1212 require 'm2w_configurator'
13+require 'm2w_mail_parser'
1314 require 'm2w_mail_parser_html_text_type'
14-require 'mailutils/mail_processor'
1515
1616 require 'minitest/unit'
1717
1818 MiniTest::Unit.autorun
1919
20-#=Mail2WeblogMailParser のテスト
20+#=Mail2WeblogMailParser のテスト
2121 #
22-# 最初の著者:: トゥイー
23-# リポジトリ情報:: $Id$
24-# 著作権:: Copyright (C) Ownway.info, 2011. All rights reserved.
25-# ライセンス:: CPL(Common Public Licence)
22+# 最初の著者:: トゥイー
23+# リポジトリ情報:: $Id$
24+# 著作権:: Copyright (C) Ownway.info, 2011. All rights reserved.
25+# ライセンス:: CPL(Common Public Licence)
2626 class TestMail2WeblogMailParserHtmlTextType < MiniTest::Unit::TestCase
2727
28+ def test_parse
29+ parser = Mail2WeblogMailParser.new(
30+ M2W_SUBJECT_HEADER,
31+ M2W_SUBJECT_SEPARATOR_REGEX)
32+
33+ mail_content = <<"END_CONTENT"
34+<div>
35+<div>
36+attr1.value1<br />
37+attr2.value2<br />
38+attr3.value3<br />
39+<br />
40+<div>本文</div>
41+</div>
42+</div>
43+END_CONTENT
44+
45+ (header, content, contents) = parser.parse_content(mail_content, {}, Mail2WeblogMailParserHtmlTextType.new)
46+ assert_equal "value1", header['attr1'], header.to_s
47+ assert_equal "value2", header['attr2'], header.to_s
48+ assert_equal "value3", header['attr3'], header.to_s
49+ assert_equal "<div>本文</div>", content, content
50+ assert_equal "<div>本文</div>", contents['description'], contents.to_s
51+ end
52+
53+ def test_delete_prefix_tags
54+ type = Mail2WeblogMailParserHtmlTextType.new
55+ mail_content = <<"END_CONTENT".gsub(/\r|\n/, '')
56+<div>
57+<div>
58+attr1.value1<br />
59+attr2.value2<br />
60+attr3.value3<br />
61+<br />
62+<div>本文</div>
63+</div>
64+</div>
65+END_CONTENT
66+
67+ expected_content = <<"END_CONTENT".gsub(/\r|\n/, '')
68+attr1.value1<br />
69+attr2.value2<br />
70+attr3.value3<br />
71+<br />
72+<div>本文</div>
73+</div>
74+</div>
75+END_CONTENT
76+
77+ (content, delete_count) = type.__delete_prefix_tags(mail_content)
78+ assert_equal expected_content, content
79+ assert_equal 2, delete_count
80+ end
81+
82+ def test_delete_suffix_tags
83+ type = Mail2WeblogMailParserHtmlTextType.new
84+ mail_content = <<"END_CONTENT".gsub(/\r|\n/, '')
85+attr1.value1<br />
86+attr2.value2<br />
87+attr3.value3<br />
88+<br />
89+<div>本文</div>
90+</div>
91+</div>
92+END_CONTENT
93+
94+ expected_content = <<"END_CONTENT".gsub(/\r|\n/, '')
95+attr1.value1<br />
96+attr2.value2<br />
97+attr3.value3<br />
98+<br />
99+<div>本文</div>
100+END_CONTENT
101+
102+ content = type.__delete_suffix_tags(mail_content, 2)
103+ assert_equal expected_content, content
104+ end
105+
106+ def test_delete_prefix_suffix_tags
107+ type = Mail2WeblogMailParserHtmlTextType.new
108+ mail_content = <<"END_CONTENT".gsub(/\r|\n/, '')
109+<div>
110+<div>
111+attr1.value1<br />
112+attr2.value2<br />
113+attr3.value3<br />
114+<br />
115+<div>本文</div>
116+</div>
117+</div>
118+END_CONTENT
119+
120+ expected_content = <<"END_CONTENT".gsub(/\r|\n/, '')
121+attr1.value1<br />
122+attr2.value2<br />
123+attr3.value3<br />
124+<br />
125+<div>本文</div>
126+END_CONTENT
127+
128+ content = type.__delete_prefix_suffix_tags(mail_content)
129+ assert_equal expected_content, content
130+ end
131+
132+ def test_clean_html_line
133+ type = Mail2WeblogMailParserHtmlTextType.new
134+
135+ assert_equal "abcdef", type.__clean_html_line(" abcdef ")
136+ end
137+
138+ def test_split
139+ type = Mail2WeblogMailParserHtmlTextType.new
140+ mail_content = <<"END_CONTENT"
141+<div>
142+<div>
143+attr1.value1<br />
144+attr2.value2<br />
145+attr3.value3<br />
146+<br />
147+<div>本文</div>
148+</div>
149+</div>
150+END_CONTENT
151+
152+ expected = [
153+ "attr1.value1<br />",
154+ "attr2.value2<br />",
155+ "attr3.value3<br />",
156+ "<br />",
157+ "<div>本文</div>",
158+ ]
159+
160+ buffers = type.split(mail_content)
161+ assert expected == buffers, buffers.to_s
162+ end
163+
28164 def test_is_space_line
29- parser = Mail2WeblogMailParserHtmlTextType.new
165+ type = Mail2WeblogMailParserHtmlTextType.new
30166
31- assert parser.is_space_line('<div></div>')
32- assert parser.is_space_line('<DIV></DIV>')
33- assert parser.is_space_line('<div></DIV>')
34- assert parser.is_space_line('<DIV></div>')
35- assert parser.is_space_line('<div attr1="value1"></div>')
36- assert !parser.is_space_line('<div>')
37- assert !parser.is_space_line('<DIV>')
167+ assert type.is_space_line('<div></div>')
168+ assert type.is_space_line('<DIV></DIV>')
169+ assert type.is_space_line('<div></DIV>')
170+ assert type.is_space_line('<DIV></div>')
171+ assert type.is_space_line('<div attr1="value1"></div>')
172+ assert !type.is_space_line('<div>')
173+ assert !type.is_space_line('<DIV>')
38174
39- assert parser.is_space_line('<p></p>')
40- assert parser.is_space_line('<P></P>')
41- assert parser.is_space_line('<p></P>')
42- assert parser.is_space_line('<P></p>')
43- assert parser.is_space_line('<p attr1="value1"></p>')
44- assert parser.is_space_line('<p>')
45- assert parser.is_space_line('<p></p>')
175+ assert type.is_space_line('<p></p>')
176+ assert type.is_space_line('<P></P>')
177+ assert type.is_space_line('<p></P>')
178+ assert type.is_space_line('<P></p>')
179+ assert type.is_space_line('<p attr1="value1"></p>')
180+ assert type.is_space_line('<p>')
181+ assert type.is_space_line('<p></p>')
46182
47- assert parser.is_space_line('<br>')
48- assert parser.is_space_line('<BR>')
183+ assert type.is_space_line('<br>')
184+ assert type.is_space_line('<BR>')
49185
50- assert parser.is_space_line('<div>&nbsp;</div>')
51- assert parser.is_space_line('<p>&nbsp;</p>')
52- assert parser.is_space_line('&nbsp;<p>')
53- assert parser.is_space_line('&nbsp;<br>')
186+ assert type.is_space_line('<div>&nbsp;</div>')
187+ assert type.is_space_line('<p>&nbsp;</p>')
188+ assert type.is_space_line('&nbsp;<p>')
189+ assert type.is_space_line('&nbsp;<br>')
54190 end
55191
192+ def test_parse_header
193+ type = Mail2WeblogMailParserHtmlTextType.new
194+
195+ datas = [
196+ [true , "<div>attr1.value1</div>"],
197+ [true , "<DIV>attr1.value1</DIV>"],
198+ [true , "<div>attr1.value1</DIV>"],
199+ [true , "<DIV>attr1.value1</div>"],
200+ [true , "<div attr='value'>attr1.value1</div>"],
201+ [true , "<p>attr1.value1</p>"],
202+ [true , "<P>attr1.value1</P>"],
203+ [true , "<p>attr1.value1</P>"],
204+ [true , "<P>attr1.value1</p>"],
205+ [true , "<p attr='value'>attr1.value1</p>"],
206+ [true , "attr1.value1<br />"],
207+ [true , "attr1.value1<p />"],
208+ [false, "attr1.value1<div />"],
209+ ]
210+
211+ datas.each do |data|
212+ (is_header, key, value) = type.parse_header(data[1], M2W_SUBJECT_SEPARATOR_REGEX)
213+ assert_equal data[0], is_header, data.to_s
214+ if data[0] then
215+ assert_equal "attr1", key, data.to_s
216+ assert_equal "value1", value, data.to_s
217+ end
218+ end
219+ end
220+
221+ def test_parse_subject_separator
222+ type = Mail2WeblogMailParserHtmlTextType.new
223+
224+ datas = [
225+ [true , "<div>more....</div>"],
226+ [true , "<div>more////</div>"],
227+ [true , "<div>more::::</div>"],
228+ [false, "<div>more.:::</div>"],
229+ [true , "<DIV>more....</DIV>"],
230+ [true , "<div>more....</DIV>"],
231+ [true , "<DIV>more....</div>"],
232+ [true , "<div attr='value'>more....</div>"],
233+ [true , "<p>more....</p>"],
234+ [true , "<P>more....</P>"],
235+ [true , "<p>more....</P>"],
236+ [true , "<P>more....</p>"],
237+ [true , "<p attr='value'>more....</p>"],
238+ [true , "more....<br />"],
239+ [true , "more....<p />"],
240+ [false, "more....<div />"],
241+ ]
242+
243+ datas.each do |data|
244+ (is_subject_separator, separating_contents_name) = type.parse_subject_separation(data[1], M2W_SUBJECT_SEPARATOR_REGEX)
245+ assert_equal data[0], is_subject_separator, data.to_s
246+ if data[0] then
247+ assert_equal "more", separating_contents_name, data.to_s
248+ end
249+ end
250+ end
251+
56252 end
--- branches/private/toydev/htmlmail/main/lib/m2w_mail_parser_html_text_type.rb (revision 560)
+++ branches/private/toydev/htmlmail/main/lib/m2w_mail_parser_html_text_type.rb (revision 561)
@@ -11,29 +11,64 @@
1111 def split(content)
1212 content = content.gsub(/\r|\n/, '')
1313
14- if content =~ %r!<body.*?>(.+)</body>!i then
15- content = $2
16- end
14+ content = __delete_prefix_suffix_tags(content)
1715
18- result = ''
16+ temp = []
1917 while true
2018 (left, sep, right) = content.partition(%r!</(div|p)>|<(br|p).*?/?>!i)
2119 if sep.length == 0 then
22- result << __clean_html_line(left)
20+ temp.push(__clean_html_line(left))
2321 break
2422 else
25- result << __clean_html_line(left) << "\n"
26- result << sep
23+ line = __clean_html_line(left)
24+ line << sep
25+ temp.push(line)
2726 content = right
2827 end
2928 end
3029
30+ result = []
31+ temp.each do |line|
32+ if line.length > 0 then
33+ result.push(line)
34+ end
35+ end
36+
3137 return result
3238 end
3339
40+ def __delete_prefix_suffix_tags(content)
41+ (content, delete_count) = __delete_prefix_tags(content)
42+ return __delete_suffix_tags(content, delete_count)
43+ end
44+
45+
46+ def __delete_prefix_tags(content)
47+ if %r!<body.*?>(.+)</body>!i =~ content then
48+ content = $2
49+ end
50+
51+ delete_count = 0
52+ while %r!^\s*?<div.*?>(.+)$!i =~ content
53+ content = $1
54+ delete_count = delete_count + 1
55+ end
56+ return [content, delete_count]
57+ end
58+
59+ def __delete_suffix_tags(content, delete_count)
60+ delete_count.times do
61+ if %r!^\s*(.+?)\s*?</div>\s*?$!i =~ content then
62+ content = $1
63+ end
64+ end
65+
66+ return content
67+ end
68+
3469 def __clean_html_line(line)
3570 # 前後のスペースを排除する
36- if /^\s*?(.+)\s*$/ =~ line then
71+ if /^\s*([^\s]*)\s*$/ =~ line then
3772 line = $1
3873 end
3974
@@ -51,9 +86,9 @@
5186 end
5287
5388 def parse_header(line, subject_separator)
54- if %r!^<(div|DIV|p|P).*?>([0-9a-zA-Z_]+?)#{subject_separator}(.*?)</(div|DIV|p|P)>$! =~ line then
89+ if %r!^<(div|p).*?>([0-9a-zA-Z_]+?)#{subject_separator}(.*?)</(div|p)>$!i =~ line then
5590 return [true, $2, $3]
56- elsif %r!^([0-9a-zA-Z_]+?)#{subject_separator}(.*?)<(br|BR|p|P).*/?>! =~ line then
91+ elsif %r!^([0-9a-zA-Z_]+?)#{subject_separator}(.*?)<(br|p).*/?>!i =~ line then
5792 return [true, $1, $2]
5893 else
5994 return [false, nil, nil]
@@ -69,8 +104,10 @@
69104 end
70105
71106 def parse_subject_separation(line, subject_separator)
72- if line =~ %r!^<(div|DIV|p|P).*?>([0-9a-zA-Z_]+?)(#{subject_separator})\2{3}</\1>$!
107+ if %r!^<(div|p).*?>([0-9a-zA-Z_]+?)(#{subject_separator})\3{3}</\1>$!i =~ line then
73108 return [true, $2]
109+ elsif %r!^([0-9a-zA-Z_]+?)(#{subject_separator})\2{3}<(br|p).*/?>!i =~ line then
110+ return [true, $1]
74111 else
75112 return [false, nil]
76113 end
Show on old repository browser