レコメンドIndexをScalaで
| Revision | 1059eedb662cc5e753c0f819406b29100947e5c6 (tree) |
|---|---|
| Time | 2011-03-03 16:51:34 |
| Author | tachiki <tachiki@p-wi...> |
| Committer | tachiki |
Blog本文のparserを追加
| @@ -5,7 +5,7 @@ import scala.actors.Actor | ||
| 5 | 5 | import com.parrotstudio.recommend.ro.model._ |
| 6 | 6 | |
| 7 | 7 | class RecommendWorker(val index: Int) extends Actor with RecommendMessage { |
| 8 | - val parser = """.*<title>Angel, alone.*孤独な天使.*-(.*)-</title>.*<div id="more">(.*?)</div>.*""".r | |
| 8 | + val parser = """.*<title>Angel, alone.*孤独な天使.*-(.*)-</title>.*<div class="_body">(.*?)</div>.*<div id="more">(.*?)</div>.*""".r | |
| 9 | 9 | |
| 10 | 10 | def act { |
| 11 | 11 | loop { |
| @@ -28,7 +28,7 @@ class RecommendWorker(val index: Int) extends Actor with RecommendMessage { | ||
| 28 | 28 | |
| 29 | 29 | def parse(p: Page): Article = { |
| 30 | 30 | p.page match { |
| 31 | - case parser(title, ext) => Article(normalize(title), "本文", normalize(ext)) | |
| 31 | + case parser(title, body, ext) => Article(normalize(title), normalize(body), normalize(ext)) | |
| 32 | 32 | case _ => new Article() |
| 33 | 33 | } |
| 34 | 34 | } |
| @@ -43,7 +43,33 @@ class RecommendWorker(val index: Int) extends Actor with RecommendMessage { | ||
| 43 | 43 | } |
| 44 | 44 | |
| 45 | 45 | private def normalize(text: String): String = { |
| 46 | - text.trim.replaceAll("<br.*?/?>", "\n").replaceAll("<a.+href=.*?>", "").replaceAll("</a>", "") | |
| 46 | + val funcs = List( | |
| 47 | + replace_line_tag(_), | |
| 48 | + remove_link_tag(_), | |
| 49 | + remove_img_tag(_), | |
| 50 | + remove_form_tag(_), | |
| 51 | + replace_webclap_comment(_)) | |
| 52 | + | |
| 53 | + Function.chain(funcs)(text.trim) | |
| 54 | + } | |
| 55 | + | |
| 56 | + private def replace_line_tag(text: String): String = { | |
| 57 | + text.replaceAll("<br.*?/?>", "\n") | |
| 58 | + } | |
| 59 | + | |
| 60 | + private def remove_link_tag(text: String): String = { | |
| 61 | + text.replaceAll("<a.+href=.*?>", "").replaceAll("</a>", "") | |
| 47 | 62 | } |
| 48 | 63 | |
| 64 | + private def remove_img_tag(text: String): String = { | |
| 65 | + text.replaceAll("<img.+src=.*?>", "") | |
| 66 | + } | |
| 67 | + | |
| 68 | + private def remove_form_tag(text: String): String = { | |
| 69 | + text.replaceAll("<form.*?>", "").replaceAll("</form>", "") | |
| 70 | + } | |
| 71 | + | |
| 72 | + private def replace_webclap_comment(text: String): String = { | |
| 73 | + text.replaceAll("<input type=submit value=(.*?)>", "$1") | |
| 74 | + } | |
| 49 | 75 | } |
| \ No newline at end of file |