HTML の解析機能を理想的な HTML を元にして一通り作成した。テストも一通り作成し通した。
HTML 解析機能の今後については、実際の HTML メールを解析したテストを追加し、完成度を高めていくこととする。
@@ -6,51 +6,247 @@ | ||
6 | 6 | $:.unshift(ROOT_PATH + "/../../main/") |
7 | 7 | $:.unshift(ROOT_PATH + "/../../main/lib") |
8 | 8 | |
9 | -# mail2weblog のルートパスを設定する | |
9 | +# mail2weblog のルートパスを設定する | |
10 | 10 | M2W_ROOT_PATH = ROOT_PATH + "/../../main/" |
11 | 11 | |
12 | 12 | require 'm2w_configurator' |
13 | +require 'm2w_mail_parser' | |
13 | 14 | require 'm2w_mail_parser_html_text_type' |
14 | -require 'mailutils/mail_processor' | |
15 | 15 | |
16 | 16 | require 'minitest/unit' |
17 | 17 | |
18 | 18 | MiniTest::Unit.autorun |
19 | 19 | |
20 | -#=Mail2WeblogMailParser のテスト | |
20 | +#=Mail2WeblogMailParser のテスト | |
21 | 21 | # |
22 | -# 最初の著者:: トゥイー | |
23 | -# リポジトリ情報:: $Id$ | |
24 | -# 著作権:: Copyright (C) Ownway.info, 2011. All rights reserved. | |
25 | -# ライセンス:: CPL(Common Public Licence) | |
22 | +# 最初の著者:: トゥイー | |
23 | +# リポジトリ情報:: $Id$ | |
24 | +# 著作権:: Copyright (C) Ownway.info, 2011. All rights reserved. | |
25 | +# ライセンス:: CPL(Common Public Licence) | |
26 | 26 | class TestMail2WeblogMailParserHtmlTextType < MiniTest::Unit::TestCase |
27 | 27 | |
28 | + def test_parse | |
29 | + parser = Mail2WeblogMailParser.new( | |
30 | + M2W_SUBJECT_HEADER, | |
31 | + M2W_SUBJECT_SEPARATOR_REGEX) | |
32 | + | |
33 | + mail_content = <<"END_CONTENT" | |
34 | +<div> | |
35 | +<div> | |
36 | +attr1.value1<br /> | |
37 | +attr2.value2<br /> | |
38 | +attr3.value3<br /> | |
39 | +<br /> | |
40 | +<div>本文</div> | |
41 | +</div> | |
42 | +</div> | |
43 | +END_CONTENT | |
44 | + | |
45 | + (header, content, contents) = parser.parse_content(mail_content, {}, Mail2WeblogMailParserHtmlTextType.new) | |
46 | + assert_equal "value1", header['attr1'], header.to_s | |
47 | + assert_equal "value2", header['attr2'], header.to_s | |
48 | + assert_equal "value3", header['attr3'], header.to_s | |
49 | + assert_equal "<div>本文</div>", content, content | |
50 | + assert_equal "<div>本文</div>", contents['description'], contents.to_s | |
51 | + end | |
52 | + | |
53 | + def test_delete_prefix_tags | |
54 | + type = Mail2WeblogMailParserHtmlTextType.new | |
55 | + mail_content = <<"END_CONTENT".gsub(/\r|\n/, '') | |
56 | +<div> | |
57 | +<div> | |
58 | +attr1.value1<br /> | |
59 | +attr2.value2<br /> | |
60 | +attr3.value3<br /> | |
61 | +<br /> | |
62 | +<div>本文</div> | |
63 | +</div> | |
64 | +</div> | |
65 | +END_CONTENT | |
66 | + | |
67 | + expected_content = <<"END_CONTENT".gsub(/\r|\n/, '') | |
68 | +attr1.value1<br /> | |
69 | +attr2.value2<br /> | |
70 | +attr3.value3<br /> | |
71 | +<br /> | |
72 | +<div>本文</div> | |
73 | +</div> | |
74 | +</div> | |
75 | +END_CONTENT | |
76 | + | |
77 | + (content, delete_count) = type.__delete_prefix_tags(mail_content) | |
78 | + assert_equal expected_content, content | |
79 | + assert_equal 2, delete_count | |
80 | + end | |
81 | + | |
82 | + def test_delete_suffix_tags | |
83 | + type = Mail2WeblogMailParserHtmlTextType.new | |
84 | + mail_content = <<"END_CONTENT".gsub(/\r|\n/, '') | |
85 | +attr1.value1<br /> | |
86 | +attr2.value2<br /> | |
87 | +attr3.value3<br /> | |
88 | +<br /> | |
89 | +<div>本文</div> | |
90 | +</div> | |
91 | +</div> | |
92 | +END_CONTENT | |
93 | + | |
94 | + expected_content = <<"END_CONTENT".gsub(/\r|\n/, '') | |
95 | +attr1.value1<br /> | |
96 | +attr2.value2<br /> | |
97 | +attr3.value3<br /> | |
98 | +<br /> | |
99 | +<div>本文</div> | |
100 | +END_CONTENT | |
101 | + | |
102 | + content = type.__delete_suffix_tags(mail_content, 2) | |
103 | + assert_equal expected_content, content | |
104 | + end | |
105 | + | |
106 | + def test_delete_prefix_suffix_tags | |
107 | + type = Mail2WeblogMailParserHtmlTextType.new | |
108 | + mail_content = <<"END_CONTENT".gsub(/\r|\n/, '') | |
109 | +<div> | |
110 | +<div> | |
111 | +attr1.value1<br /> | |
112 | +attr2.value2<br /> | |
113 | +attr3.value3<br /> | |
114 | +<br /> | |
115 | +<div>本文</div> | |
116 | +</div> | |
117 | +</div> | |
118 | +END_CONTENT | |
119 | + | |
120 | + expected_content = <<"END_CONTENT".gsub(/\r|\n/, '') | |
121 | +attr1.value1<br /> | |
122 | +attr2.value2<br /> | |
123 | +attr3.value3<br /> | |
124 | +<br /> | |
125 | +<div>本文</div> | |
126 | +END_CONTENT | |
127 | + | |
128 | + content = type.__delete_prefix_suffix_tags(mail_content) | |
129 | + assert_equal expected_content, content | |
130 | + end | |
131 | + | |
132 | + def test_clean_html_line | |
133 | + type = Mail2WeblogMailParserHtmlTextType.new | |
134 | + | |
135 | + assert_equal "abcdef", type.__clean_html_line(" abcdef ") | |
136 | + end | |
137 | + | |
138 | + def test_split | |
139 | + type = Mail2WeblogMailParserHtmlTextType.new | |
140 | + mail_content = <<"END_CONTENT" | |
141 | +<div> | |
142 | +<div> | |
143 | +attr1.value1<br /> | |
144 | +attr2.value2<br /> | |
145 | +attr3.value3<br /> | |
146 | +<br /> | |
147 | +<div>本文</div> | |
148 | +</div> | |
149 | +</div> | |
150 | +END_CONTENT | |
151 | + | |
152 | + expected = [ | |
153 | + "attr1.value1<br />", | |
154 | + "attr2.value2<br />", | |
155 | + "attr3.value3<br />", | |
156 | + "<br />", | |
157 | + "<div>本文</div>", | |
158 | + ] | |
159 | + | |
160 | + buffers = type.split(mail_content) | |
161 | + assert expected == buffers, buffers.to_s | |
162 | + end | |
163 | + | |
28 | 164 | def test_is_space_line |
29 | - parser = Mail2WeblogMailParserHtmlTextType.new | |
165 | + type = Mail2WeblogMailParserHtmlTextType.new | |
30 | 166 | |
31 | - assert parser.is_space_line('<div></div>') | |
32 | - assert parser.is_space_line('<DIV></DIV>') | |
33 | - assert parser.is_space_line('<div></DIV>') | |
34 | - assert parser.is_space_line('<DIV></div>') | |
35 | - assert parser.is_space_line('<div attr1="value1"></div>') | |
36 | - assert !parser.is_space_line('<div>') | |
37 | - assert !parser.is_space_line('<DIV>') | |
167 | + assert type.is_space_line('<div></div>') | |
168 | + assert type.is_space_line('<DIV></DIV>') | |
169 | + assert type.is_space_line('<div></DIV>') | |
170 | + assert type.is_space_line('<DIV></div>') | |
171 | + assert type.is_space_line('<div attr1="value1"></div>') | |
172 | + assert !type.is_space_line('<div>') | |
173 | + assert !type.is_space_line('<DIV>') | |
38 | 174 | |
39 | - assert parser.is_space_line('<p></p>') | |
40 | - assert parser.is_space_line('<P></P>') | |
41 | - assert parser.is_space_line('<p></P>') | |
42 | - assert parser.is_space_line('<P></p>') | |
43 | - assert parser.is_space_line('<p attr1="value1"></p>') | |
44 | - assert parser.is_space_line('<p>') | |
45 | - assert parser.is_space_line('<p></p>') | |
175 | + assert type.is_space_line('<p></p>') | |
176 | + assert type.is_space_line('<P></P>') | |
177 | + assert type.is_space_line('<p></P>') | |
178 | + assert type.is_space_line('<P></p>') | |
179 | + assert type.is_space_line('<p attr1="value1"></p>') | |
180 | + assert type.is_space_line('<p>') | |
181 | + assert type.is_space_line('<p></p>') | |
46 | 182 | |
47 | - assert parser.is_space_line('<br>') | |
48 | - assert parser.is_space_line('<BR>') | |
183 | + assert type.is_space_line('<br>') | |
184 | + assert type.is_space_line('<BR>') | |
49 | 185 | |
50 | - assert parser.is_space_line('<div> </div>') | |
51 | - assert parser.is_space_line('<p> </p>') | |
52 | - assert parser.is_space_line(' <p>') | |
53 | - assert parser.is_space_line(' <br>') | |
186 | + assert type.is_space_line('<div> </div>') | |
187 | + assert type.is_space_line('<p> </p>') | |
188 | + assert type.is_space_line(' <p>') | |
189 | + assert type.is_space_line(' <br>') | |
54 | 190 | end |
55 | 191 | |
192 | + def test_parse_header | |
193 | + type = Mail2WeblogMailParserHtmlTextType.new | |
194 | + | |
195 | + datas = [ | |
196 | + [true , "<div>attr1.value1</div>"], | |
197 | + [true , "<DIV>attr1.value1</DIV>"], | |
198 | + [true , "<div>attr1.value1</DIV>"], | |
199 | + [true , "<DIV>attr1.value1</div>"], | |
200 | + [true , "<div attr='value'>attr1.value1</div>"], | |
201 | + [true , "<p>attr1.value1</p>"], | |
202 | + [true , "<P>attr1.value1</P>"], | |
203 | + [true , "<p>attr1.value1</P>"], | |
204 | + [true , "<P>attr1.value1</p>"], | |
205 | + [true , "<p attr='value'>attr1.value1</p>"], | |
206 | + [true , "attr1.value1<br />"], | |
207 | + [true , "attr1.value1<p />"], | |
208 | + [false, "attr1.value1<div />"], | |
209 | + ] | |
210 | + | |
211 | + datas.each do |data| | |
212 | + (is_header, key, value) = type.parse_header(data[1], M2W_SUBJECT_SEPARATOR_REGEX) | |
213 | + assert_equal data[0], is_header, data.to_s | |
214 | + if data[0] then | |
215 | + assert_equal "attr1", key, data.to_s | |
216 | + assert_equal "value1", value, data.to_s | |
217 | + end | |
218 | + end | |
219 | + end | |
220 | + | |
221 | + def test_parse_subject_separator | |
222 | + type = Mail2WeblogMailParserHtmlTextType.new | |
223 | + | |
224 | + datas = [ | |
225 | + [true , "<div>more....</div>"], | |
226 | + [true , "<div>more////</div>"], | |
227 | + [true , "<div>more::::</div>"], | |
228 | + [false, "<div>more.:::</div>"], | |
229 | + [true , "<DIV>more....</DIV>"], | |
230 | + [true , "<div>more....</DIV>"], | |
231 | + [true , "<DIV>more....</div>"], | |
232 | + [true , "<div attr='value'>more....</div>"], | |
233 | + [true , "<p>more....</p>"], | |
234 | + [true , "<P>more....</P>"], | |
235 | + [true , "<p>more....</P>"], | |
236 | + [true , "<P>more....</p>"], | |
237 | + [true , "<p attr='value'>more....</p>"], | |
238 | + [true , "more....<br />"], | |
239 | + [true , "more....<p />"], | |
240 | + [false, "more....<div />"], | |
241 | + ] | |
242 | + | |
243 | + datas.each do |data| | |
244 | + (is_subject_separator, separating_contents_name) = type.parse_subject_separation(data[1], M2W_SUBJECT_SEPARATOR_REGEX) | |
245 | + assert_equal data[0], is_subject_separator, data.to_s | |
246 | + if data[0] then | |
247 | + assert_equal "more", separating_contents_name, data.to_s | |
248 | + end | |
249 | + end | |
250 | + end | |
251 | + | |
56 | 252 | end |
@@ -11,29 +11,64 @@ | ||
11 | 11 | def split(content) |
12 | 12 | content = content.gsub(/\r|\n/, '') |
13 | 13 | |
14 | - if content =~ %r!<body.*?>(.+)</body>!i then | |
15 | - content = $2 | |
16 | - end | |
14 | + content = __delete_prefix_suffix_tags(content) | |
17 | 15 | |
18 | - result = '' | |
16 | + temp = [] | |
19 | 17 | while true |
20 | 18 | (left, sep, right) = content.partition(%r!</(div|p)>|<(br|p).*?/?>!i) |
21 | 19 | if sep.length == 0 then |
22 | - result << __clean_html_line(left) | |
20 | + temp.push(__clean_html_line(left)) | |
23 | 21 | break |
24 | 22 | else |
25 | - result << __clean_html_line(left) << "\n" | |
26 | - result << sep | |
23 | + line = __clean_html_line(left) | |
24 | + line << sep | |
25 | + temp.push(line) | |
27 | 26 | content = right |
28 | 27 | end |
29 | 28 | end |
30 | 29 | |
30 | + result = [] | |
31 | + temp.each do |line| | |
32 | + if line.length > 0 then | |
33 | + result.push(line) | |
34 | + end | |
35 | + end | |
36 | + | |
31 | 37 | return result |
32 | 38 | end |
33 | 39 | |
40 | + def __delete_prefix_suffix_tags(content) | |
41 | + (content, delete_count) = __delete_prefix_tags(content) | |
42 | + return __delete_suffix_tags(content, delete_count) | |
43 | + end | |
44 | + | |
45 | + | |
46 | + def __delete_prefix_tags(content) | |
47 | + if %r!<body.*?>(.+)</body>!i =~ content then | |
48 | + content = $2 | |
49 | + end | |
50 | + | |
51 | + delete_count = 0 | |
52 | + while %r!^\s*?<div.*?>(.+)$!i =~ content | |
53 | + content = $1 | |
54 | + delete_count = delete_count + 1 | |
55 | + end | |
56 | + return [content, delete_count] | |
57 | + end | |
58 | + | |
59 | + def __delete_suffix_tags(content, delete_count) | |
60 | + delete_count.times do | |
61 | + if %r!^\s*(.+?)\s*?</div>\s*?$!i =~ content then | |
62 | + content = $1 | |
63 | + end | |
64 | + end | |
65 | + | |
66 | + return content | |
67 | + end | |
68 | + | |
34 | 69 | def __clean_html_line(line) |
35 | 70 | # 前後のスペースを排除する |
36 | - if /^\s*?(.+)\s*$/ =~ line then | |
71 | + if /^\s*([^\s]*)\s*$/ =~ line then | |
37 | 72 | line = $1 |
38 | 73 | end |
39 | 74 |
@@ -51,9 +86,9 @@ | ||
51 | 86 | end |
52 | 87 | |
53 | 88 | def parse_header(line, subject_separator) |
54 | - if %r!^<(div|DIV|p|P).*?>([0-9a-zA-Z_]+?)#{subject_separator}(.*?)</(div|DIV|p|P)>$! =~ line then | |
89 | + if %r!^<(div|p).*?>([0-9a-zA-Z_]+?)#{subject_separator}(.*?)</(div|p)>$!i =~ line then | |
55 | 90 | return [true, $2, $3] |
56 | - elsif %r!^([0-9a-zA-Z_]+?)#{subject_separator}(.*?)<(br|BR|p|P).*/?>! =~ line then | |
91 | + elsif %r!^([0-9a-zA-Z_]+?)#{subject_separator}(.*?)<(br|p).*/?>!i =~ line then | |
57 | 92 | return [true, $1, $2] |
58 | 93 | else |
59 | 94 | return [false, nil, nil] |
@@ -69,8 +104,10 @@ | ||
69 | 104 | end |
70 | 105 | |
71 | 106 | def parse_subject_separation(line, subject_separator) |
72 | - if line =~ %r!^<(div|DIV|p|P).*?>([0-9a-zA-Z_]+?)(#{subject_separator})\2{3}</\1>$! | |
107 | + if %r!^<(div|p).*?>([0-9a-zA-Z_]+?)(#{subject_separator})\3{3}</\1>$!i =~ line then | |
73 | 108 | return [true, $2] |
109 | + elsif %r!^([0-9a-zA-Z_]+?)(#{subject_separator})\2{3}<(br|p).*/?>!i =~ line then | |
110 | + return [true, $1] | |
74 | 111 | else |
75 | 112 | return [false, nil] |
76 | 113 | end |