作業部屋の使い方を試しています。
(empty log message)
| @@ -22,7 +22,7 @@ | ||
| 22 | 22 | |
| 23 | 23 | package Form; |
| 24 | 24 | |
| 25 | -import Lib.SearchData; | |
| 25 | +import WebScraping.SearchData; | |
| 26 | 26 | import java.io.BufferedReader; |
| 27 | 27 | import java.io.BufferedWriter; |
| 28 | 28 | import java.io.File; |
| @@ -21,8 +21,8 @@ | ||
| 21 | 21 | */ |
| 22 | 22 | package Form; |
| 23 | 23 | |
| 24 | -import Lib.HtmlParser; | |
| 25 | -import Lib.SearchData; | |
| 24 | +import WebScraping.HtmlParser; | |
| 25 | +import WebScraping.SearchData; | |
| 26 | 26 | import java.awt.Desktop; |
| 27 | 27 | import java.io.File; |
| 28 | 28 | import java.io.IOException; |
| @@ -32,6 +32,7 @@ | ||
| 32 | 32 | import java.util.logging.Level; |
| 33 | 33 | import java.util.logging.Logger; |
| 34 | 34 | import javax.swing.JFileChooser; |
| 35 | +import javax.swing.filechooser.FileFilter; | |
| 35 | 36 | import javax.swing.filechooser.FileNameExtensionFilter; |
| 36 | 37 | import org.jdesktop.observablecollections.ObservableCollections; |
| 37 | 38 |
| @@ -50,6 +51,12 @@ | ||
| 50 | 51 | */ |
| 51 | 52 | public HtmlSearch() { |
| 52 | 53 | initComponents(); |
| 54 | + | |
| 55 | + FileFilter filter1 = new FileNameExtensionFilter("XMLファイル", "xml"); | |
| 56 | + FileFilter filter2 = new FileNameExtensionFilter("TEXTファイル", "txt"); | |
| 57 | + jFileChooser1.addChoosableFileFilter(filter1); | |
| 58 | + jFileChooser1.addChoosableFileFilter(filter2); | |
| 59 | + jFileChooser1.setFileFilter(filter1); | |
| 53 | 60 | } |
| 54 | 61 | |
| 55 | 62 | public List getSerachDataList() { |
| @@ -93,7 +100,6 @@ | ||
| 93 | 100 | |
| 94 | 101 | jFileChooser1.setCurrentDirectory(new java.io.File("C:\\zz_work\\java")); |
| 95 | 102 | jFileChooser1.setDialogTitle(""); |
| 96 | - jFileChooser1.setFileFilter(new FileNameExtensionFilter("TEXTファイル", "txt")); | |
| 97 | 103 | |
| 98 | 104 | setDefaultCloseOperation(javax.swing.WindowConstants.EXIT_ON_CLOSE); |
| 99 | 105 | setTitle("タグ検索"); |
| @@ -0,0 +1,163 @@ | ||
| 1 | +/* | |
| 2 | + * Copyright (C) 2014 kgto. | |
| 3 | + * | |
| 4 | + * This library is free software; you can redistribute it and/or | |
| 5 | + * modify it under the terms of the GNU Lesser General Public | |
| 6 | + * License as published by the Free Software Foundation; either | |
| 7 | + * version 2.1 of the License, or (at your option) any later version. | |
| 8 | + * | |
| 9 | + * This library is distributed in the hope that it will be useful, | |
| 10 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 11 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 12 | + * Lesser General Public License for more details. | |
| 13 | + * | |
| 14 | + * You should have received a copy of the GNU Lesser General Public | |
| 15 | + * License along with this library; if not, write to the Free Software | |
| 16 | + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
| 17 | + * MA 02110-1301 USA | |
| 18 | + */ | |
| 19 | +/* | |
| 20 | + * $Id$ | |
| 21 | + */ | |
| 22 | + | |
| 23 | +package WebScraping; | |
| 24 | + | |
| 25 | +import java.util.ArrayList; | |
| 26 | +import java.util.Enumeration; | |
| 27 | +import javax.swing.text.MutableAttributeSet; | |
| 28 | +import javax.swing.text.html.HTML; | |
| 29 | + | |
/**
 * Holds the attributes of HTML tags as a flat list of
 * (tag, occurrence number, attribute name, attribute value) entries.
 * @author kgto
 */
public class AttributeData {

    /**
     * One attribute of one tag occurrence.
     */
    public class AttrData {
        /** Tag the attribute belongs to. */
        public HTML.Tag tag;
        /** Occurrence number of the tag, as assigned by {@link AttributeData#add}. */
        public int count;
        /** Attribute name ({@code toString()} of the attribute key). */
        public String attrname;
        /** Attribute value ({@code toString()} of the attribute value). */
        public String attrvalue;
    }

    /** Every stored attribute, in insertion order. */
    public ArrayList<AttrData> AttrList;

    /** Number of entries in {@link #AttrList}; refreshed by {@link #add}. */
    public int size;

    /**
     * Creates an empty attribute store.
     */
    public AttributeData() {
        AttrList = new ArrayList<>();
        size = 0;
    }

    /**
     * Stores every attribute of one tag occurrence.
     * All attributes added by one call share the same occurrence number, which
     * is one more than the highest number stored so far for the same tag.
     * A tag without attributes stores nothing and therefore does not advance
     * that number.
     * @param tag  tag the attributes belong to
     * @param attr attributes of the tag; {@code null} is treated as "no attributes"
     */
    public void add(HTML.Tag tag, MutableAttributeSet attr) {
        if (attr == null) {
            return;
        }

        int tagcount = tagcnt(tag) + 1;

        Enumeration<?> names = attr.getAttributeNames();
        while (names.hasMoreElements()) {
            Object key = names.nextElement();

            AttrData a = new AttrData();
            a.tag = tag;
            a.count = tagcount;
            a.attrname = key.toString();
            a.attrvalue = attr.getAttribute(key).toString();

            AttrList.add(a);
        }
        size = AttrList.size();
    }

    /**
     * Tests whether an attribute with the given name and value is stored for the tag.
     * @param tag       tag to look for (compared by identity)
     * @param attrname  attribute name
     * @param attrvalue attribute value
     * @return {@code true} if at least one stored entry matches all three arguments
     */
    public boolean search(HTML.Tag tag, String attrname, String attrvalue) {
        for (AttrData a : AttrList) {
            if (a.tag == tag && a.attrname.equals(attrname) && a.attrvalue.equals(attrvalue)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tests whether the tag is stored with the given {@code id} value.
     * @param tag       tag to look for
     * @param attrvalue expected value of the {@code id} attribute
     * @return {@code true} if a matching entry exists
     */
    public boolean searchId(HTML.Tag tag, String attrvalue) {
        return search(tag, "id", attrvalue);
    }

    /**
     * Tests whether the tag is stored with the given {@code class} value.
     * @param tag       tag to look for
     * @param attrvalue expected value of the {@code class} attribute
     * @return {@code true} if a matching entry exists
     */
    public boolean searchClass(HTML.Tag tag, String attrvalue) {
        return search(tag, "class", attrvalue);
    }

    /**
     * Collects the values of one attribute over all stored occurrences of a tag.
     * @param tag      tag to look for (compared by identity)
     * @param attrname attribute name
     * @return the matching values in insertion order; empty if there is none
     */
    public ArrayList<String> getvale(HTML.Tag tag, String attrname) {
        ArrayList<String> ret = new ArrayList<>();
        for (AttrData a : AttrList) {
            if (a.tag == tag && a.attrname.equals(attrname)) {
                ret.add(a.attrvalue);
            }
        }
        return ret;
    }

    /**
     * Returns the highest occurrence number stored so far for the tag.
     * @param tag tag to look for
     * @return the highest stored count, or 0 if the tag has no entries
     */
    private int tagcnt(HTML.Tag tag) {
        int wkcnt = 0;
        for (AttrData a : AttrList) {
            if (a.tag == tag && wkcnt < a.count) {
                wkcnt = a.count;
            }
        }
        return wkcnt;
    }

    /**
     * Returns the tag of the entry at the given index.
     * @param i index into {@link #AttrList}
     * @return tag of that entry
     */
    public HTML.Tag gettag(int i) {
        return AttrList.get(i).tag;
    }

    /**
     * Returns the occurrence number of the entry at the given index.
     * @param i index into {@link #AttrList}
     * @return occurrence number of that entry
     */
    public int getcount(int i) {
        return AttrList.get(i).count;
    }

    /**
     * Returns the attribute name of the entry at the given index.
     * @param i index into {@link #AttrList}
     * @return attribute name of that entry
     */
    public String getattrname(int i) {
        return AttrList.get(i).attrname;
    }

    /**
     * Returns the attribute value of the entry at the given index.
     * @param i index into {@link #AttrList}
     * @return attribute value of that entry
     */
    public String getattrvalue(int i) {
        return AttrList.get(i).attrvalue;
    }

}
| @@ -0,0 +1,268 @@ | ||
| 1 | +/* | |
| 2 | + * Copyright (C) 2014 kgto. | |
| 3 | + * | |
| 4 | + * This library is free software; you can redistribute it and/or | |
| 5 | + * modify it under the terms of the GNU Lesser General Public | |
| 6 | + * License as published by the Free Software Foundation; either | |
| 7 | + * version 2.1 of the License, or (at your option) any later version. | |
| 8 | + * | |
| 9 | + * This library is distributed in the hope that it will be useful, | |
| 10 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 11 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 12 | + * Lesser General Public License for more details. | |
| 13 | + * | |
| 14 | + * You should have received a copy of the GNU Lesser General Public | |
| 15 | + * License along with this library; if not, write to the Free Software | |
| 16 | + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
| 17 | + * MA 02110-1301 USA | |
| 18 | + */ | |
| 19 | +/* | |
| 20 | + * $Id$ | |
| 21 | + */ | |
| 22 | + | |
| 23 | +package WebScraping; | |
| 24 | + | |
| 25 | +import java.util.ArrayList; | |
| 26 | +import java.util.HashMap; | |
| 27 | +import javax.swing.text.MutableAttributeSet; | |
| 28 | +import javax.swing.text.html.HTML; | |
| 29 | +import javax.swing.text.html.HTMLEditorKit; | |
| 30 | + | |
| 31 | +/** | |
| 32 | + * HTMLパーサ部品. | |
| 33 | + * @author kgto | |
| 34 | + */ | |
| 35 | +public class HtmlParserCallback extends HTMLEditorKit.ParserCallback { | |
| 36 | + | |
| 37 | + // デバック情報表示フラグ | |
| 38 | + final boolean DEBUG = false; | |
| 39 | + //final boolean DEBUG = true; | |
| 40 | + | |
| 41 | + // Tag毎の階層 | |
| 42 | + HashMap<HTML.Tag,Integer> tagMap = new HashMap<>(); | |
| 43 | + | |
| 44 | + // serach key 情報 | |
| 45 | + String keytag; | |
| 46 | + String keyid; | |
| 47 | + String keyclass; | |
| 48 | + | |
| 49 | + // serach key と一致時の情報退避 | |
| 50 | + int bufCount = 0; | |
| 51 | + HTML.Tag bufTag = null; | |
| 52 | + // serach key と一致時の情報格納ワーク | |
| 53 | + StringBuilder bufText; | |
| 54 | + | |
| 55 | + // serach key と一致時のデータ一覧 | |
| 56 | + ArrayList sData; | |
| 57 | + | |
| 58 | + // 属性データ | |
| 59 | + AttributeData attrdata; | |
| 60 | + | |
| 61 | + public HtmlParserCallback(SearchData skey) { | |
| 62 | + | |
| 63 | + // キー情報展開 | |
| 64 | + keytag = skey.getHtmltag(); | |
| 65 | + keyid = skey.getHtmlid(); | |
| 66 | + keyclass = skey.getHtmlclass(); | |
| 67 | + | |
| 68 | + sData = new ArrayList(); | |
| 69 | + } | |
| 70 | + | |
| 71 | + public ArrayList getrtnData() { | |
| 72 | + return this.sData; | |
| 73 | + } | |
| 74 | + | |
| 75 | + @Override | |
| 76 | + public void handleStartTag(HTML.Tag tag, MutableAttributeSet attr, int pos){ | |
| 77 | + // Tag毎の階層を保持 | |
| 78 | + int count = 1; | |
| 79 | + if(tagMap.containsKey(tag)) { | |
| 80 | + count = tagMap.get(tag); | |
| 81 | + count++; | |
| 82 | + } | |
| 83 | + tagMap.put(tag, count); | |
| 84 | + | |
| 85 | + // 属性解析 | |
| 86 | + AttributeData handleStartattrdata = new AttributeData(); | |
| 87 | + handleStartattrdata.add(tag, attr); | |
| 88 | + | |
| 89 | + //--- DEBUG OUT ---- start --- | |
| 90 | + if(DEBUG) { | |
| 91 | + StringBuffer strBuf = new StringBuffer(); | |
| 92 | + // tag情報 | |
| 93 | + strBuf.append(count).append(" : F : ").append(tag.toString()); | |
| 94 | + // 属性情報 | |
| 95 | + for(int i = 0; i < handleStartattrdata.size; i++) { | |
| 96 | + strBuf.append(" [").append(handleStartattrdata.getattrname(i)).append("] ") | |
| 97 | + .append(handleStartattrdata.getattrvalue(i)); | |
| 98 | + } | |
| 99 | + // 表示 | |
| 100 | + System.out.println(strBuf); | |
| 101 | + } | |
| 102 | + //--- DEBUG OUT ---- end --- | |
| 103 | + | |
| 104 | + if(bufCount == 0) { | |
| 105 | + if(tag.toString().equals(keytag)) { | |
| 106 | + //if(serachAttribute(attr)) { | |
| 107 | + if(serachAttribute(tag, handleStartattrdata)) { | |
| 108 | + bufCount = count; | |
| 109 | + bufTag = tag; | |
| 110 | + attrdata = new AttributeData(); | |
| 111 | + bufText = new StringBuilder(); | |
| 112 | + } | |
| 113 | + } | |
| 114 | + } | |
| 115 | + if(bufCount > 0) { | |
| 116 | + attrdata.add(tag, attr); | |
| 117 | + } | |
| 118 | + } | |
| 119 | + | |
| 120 | + @Override | |
| 121 | + public void handleEndTag(HTML.Tag tag, int pos){ | |
| 122 | + // Tag毎の階層を取得 | |
| 123 | + int count = 0; | |
| 124 | + if(tagMap.containsKey(tag)) { | |
| 125 | + count = tagMap.get(tag); | |
| 126 | + } | |
| 127 | + | |
| 128 | + //--- DEBUG OUT ---- start --- | |
| 129 | + if(DEBUG) { | |
| 130 | + if(tag.equals(bufTag) && count <= bufCount) { | |
| 131 | + for(int i = 0; i < attrdata.size; i++) { | |
| 132 | + StringBuffer strBuf = new StringBuffer(); | |
| 133 | + strBuf.append(" Tag-attr : "); | |
| 134 | + strBuf.append(attrdata.gettag(i)).append(" [ "); | |
| 135 | + strBuf.append(attrdata.getcount(i)).append(" ] "); | |
| 136 | + strBuf.append(attrdata.getattrname(i)).append(" = "); | |
| 137 | + strBuf.append(attrdata.getattrvalue(i)); | |
| 138 | + System.out.println(strBuf); | |
| 139 | + } | |
| 140 | + } | |
| 141 | + StringBuffer strBuf = new StringBuffer(); | |
| 142 | + // tag情報 | |
| 143 | + strBuf.append(count).append(" : E : ").append(tag.toString()); | |
| 144 | + System.out.println(strBuf); | |
| 145 | + } | |
| 146 | + //--- DEBUG OUT ---- end --- | |
| 147 | + | |
| 148 | + if(tag.equals(bufTag) && count <= bufCount) { | |
| 149 | + | |
| 150 | + // 溜め込んだ一致情報をリストへ格納 | |
| 151 | + sData.add(bufText.toString()); | |
| 152 | + | |
| 153 | + // 退避したserach keyとの一致情報クリア | |
| 154 | + bufCount = 0; | |
| 155 | + bufTag = null; | |
| 156 | + bufText = null; | |
| 157 | + } | |
| 158 | + | |
| 159 | + // Tag毎の階層減算 | |
| 160 | + tagMap.put(tag, --count); | |
| 161 | + } | |
| 162 | + | |
| 163 | + @Override | |
| 164 | + public void handleText(char[] data, int pos){ | |
| 165 | + //--- DEBUG OUT ---- start --- | |
| 166 | + if(DEBUG) { | |
| 167 | + String dat = new String(data); | |
| 168 | + System.out.println(dat); | |
| 169 | + } | |
| 170 | + //--- DEBUG OUT ---- end --- | |
| 171 | + String splitchar = "\t"; | |
| 172 | + | |
| 173 | + //制御文字の削除 | |
| 174 | + // 0xa0 | |
| 175 | + StringBuilder buf = new StringBuilder(); | |
| 176 | + for(int i = 0; i < data.length; i++) { | |
| 177 | + if(data[i] > 0x1f && data[i] != 0x7f && data[i] != 0xa0) { | |
| 178 | + buf.append(data[i]); | |
| 179 | + } | |
| 180 | + } | |
| 181 | + | |
| 182 | + if(bufCount > 0) { | |
| 183 | + if(bufText.length() > 0) { | |
| 184 | + bufText.append(splitchar); | |
| 185 | + } | |
| 186 | + bufText.append(buf.toString()); | |
| 187 | + } | |
| 188 | + | |
| 189 | + } | |
| 190 | + | |
| 191 | + @Override | |
| 192 | + public void handleSimpleTag(HTML.Tag tag, MutableAttributeSet attr, int pos){ | |
| 193 | + | |
| 194 | + if(bufCount > 0) { | |
| 195 | + attrdata.add(tag, attr); | |
| 196 | + } | |
| 197 | + | |
| 198 | + //--- DEBUG OUT ---- start --- | |
| 199 | + if(DEBUG) { | |
| 200 | + AttributeData simpleattrdata = new AttributeData(); | |
| 201 | + simpleattrdata.add(tag, attr); | |
| 202 | + StringBuffer strBuf = new StringBuffer(); | |
| 203 | + // tag情報 | |
| 204 | + strBuf.append("x : S : ").append(tag.toString()); | |
| 205 | + // 属性情報 | |
| 206 | + for(int i = 0; i < simpleattrdata.size; i++) { | |
| 207 | + strBuf.append(" [").append(simpleattrdata.getattrname(i)).append("] ").append(simpleattrdata.getcount(i)) | |
| 208 | + .append(" = ").append(simpleattrdata.getattrvalue(i)); | |
| 209 | + } | |
| 210 | + System.out.println(strBuf); | |
| 211 | + } | |
| 212 | + //--- DEBUG OUT ---- end --- | |
| 213 | + } | |
| 214 | + | |
| 215 | + /** | |
| 216 | + * ページ内のID/CLASS値と検索キーを比較する. | |
| 217 | + * @param attr ページのMutableAttributeSet | |
| 218 | + * @return boolean 検索キーと一致の時、true | |
| 219 | + */ | |
| 220 | + public boolean serachAttribute(MutableAttributeSet attr) { | |
| 221 | + String currentID = (String)attr.getAttribute(HTML.Attribute.ID); | |
| 222 | + String currentClass = (String)attr.getAttribute(HTML.Attribute.CLASS); | |
| 223 | + | |
| 224 | + if(keyid.isEmpty() == false && keyclass.isEmpty() == false) { | |
| 225 | + if(keyid.equals(currentID) && keyclass.equals(currentClass)) { | |
| 226 | + return true; | |
| 227 | + } | |
| 228 | + } | |
| 229 | + | |
| 230 | + if(keyid.isEmpty() == false) { | |
| 231 | + if(keyid.equals(currentID)) { | |
| 232 | + return true; | |
| 233 | + } | |
| 234 | + } | |
| 235 | + | |
| 236 | + if(keyclass.isEmpty() == false) { | |
| 237 | + if(keyclass.equals(currentClass)) { | |
| 238 | + return true; | |
| 239 | + } | |
| 240 | + } | |
| 241 | + | |
| 242 | + return false; | |
| 243 | + } | |
| 244 | + | |
| 245 | + /** | |
| 246 | + * ページ内のID/CLASS値と検索キーを比較する. | |
| 247 | + * @param tag | |
| 248 | + * @param attrdata | |
| 249 | + * @return boolean 検索キーと一致の時、true | |
| 250 | + */ | |
| 251 | + public boolean serachAttribute(HTML.Tag tag, AttributeData attrdata) { | |
| 252 | + // ID と CLASS の両方にキー入力有りの場合 | |
| 253 | + if(keyid.isEmpty() == false && keyclass.isEmpty() == false) { | |
| 254 | + if(attrdata.searchId(tag, keyid) && attrdata.searchClass(tag, keyclass)) { | |
| 255 | + return true; | |
| 256 | + } | |
| 257 | + } | |
| 258 | + // ID のキーチェック | |
| 259 | + if(keyid.isEmpty() == false) { | |
| 260 | + return attrdata.searchId(tag, keyid); | |
| 261 | + } | |
| 262 | + // CLASS のキーチェック | |
| 263 | + if(keyclass.isEmpty() == false) { | |
| 264 | + return attrdata.searchClass(tag, keyclass); | |
| 265 | + } | |
| 266 | + return false; | |
| 267 | + } | |
| 268 | +} |
| @@ -0,0 +1,98 @@ | ||
| 1 | +/* | |
| 2 | + * Copyright (C) 2014 kgto. | |
| 3 | + * | |
| 4 | + * This library is free software; you can redistribute it and/or | |
| 5 | + * modify it under the terms of the GNU Lesser General Public | |
| 6 | + * License as published by the Free Software Foundation; either | |
| 7 | + * version 2.1 of the License, or (at your option) any later version. | |
| 8 | + * | |
| 9 | + * This library is distributed in the hope that it will be useful, | |
| 10 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 11 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 12 | + * Lesser General Public License for more details. | |
| 13 | + * | |
| 14 | + * You should have received a copy of the GNU Lesser General Public | |
| 15 | + * License along with this library; if not, write to the Free Software | |
| 16 | + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
| 17 | + * MA 02110-1301 USA | |
| 18 | + */ | |
| 19 | +/* | |
| 20 | + * $Id$ | |
| 21 | + */ | |
| 22 | + | |
| 23 | +package WebScraping; | |
| 24 | + | |
| 25 | +/** | |
| 26 | + * | |
| 27 | + * @author kgto | |
| 28 | + */ | |
| 29 | +public class SearchData { | |
| 30 | + | |
| 31 | + private String item; | |
| 32 | + private String htmltag; | |
| 33 | + private String htmlid; | |
| 34 | + private String htmlclass; | |
| 35 | + private String around; | |
| 36 | + private String regexp; | |
| 37 | + | |
| 38 | + public SearchData() { | |
| 39 | + } | |
| 40 | + | |
| 41 | + public SearchData(SearchData dat) { | |
| 42 | + this.item = dat.getitem(); | |
| 43 | + this.htmltag = dat.getHtmltag(); | |
| 44 | + this.htmlid = dat.getHtmlid(); | |
| 45 | + this.htmlclass = dat.getHtmlclass(); | |
| 46 | + this.around = dat.getaround(); | |
| 47 | + this.regexp = dat.getregexp(); | |
| 48 | + } | |
| 49 | + | |
| 50 | + public void setitem(String item) { | |
| 51 | + this.item = item; | |
| 52 | + } | |
| 53 | + | |
| 54 | + public void setHtmltag(String htmltag) { | |
| 55 | + this.htmltag = htmltag; | |
| 56 | + } | |
| 57 | + | |
| 58 | + public void setHtmlid(String htmlid) { | |
| 59 | + this.htmlid = htmlid; | |
| 60 | + } | |
| 61 | + | |
| 62 | + public void setHtmlclass(String htmlclass) { | |
| 63 | + this.htmlclass = htmlclass; | |
| 64 | + } | |
| 65 | + | |
| 66 | + public void setaround(String around) { | |
| 67 | + this.around = around; | |
| 68 | + } | |
| 69 | + | |
| 70 | + public void setregexp(String regexp) { | |
| 71 | + this.regexp = regexp; | |
| 72 | + } | |
| 73 | + | |
| 74 | + public String getitem() { | |
| 75 | + return item; | |
| 76 | + } | |
| 77 | + | |
| 78 | + public String getHtmltag() { | |
| 79 | + return htmltag; | |
| 80 | + } | |
| 81 | + | |
| 82 | + public String getHtmlid() { | |
| 83 | + return htmlid; | |
| 84 | + } | |
| 85 | + | |
| 86 | + public String getHtmlclass() { | |
| 87 | + return htmlclass; | |
| 88 | + } | |
| 89 | + | |
| 90 | + public String getaround() { | |
| 91 | + return around; | |
| 92 | + } | |
| 93 | + | |
| 94 | + public String getregexp() { | |
| 95 | + return regexp; | |
| 96 | + } | |
| 97 | + | |
| 98 | +} |
| @@ -0,0 +1,229 @@ | ||
| 1 | +/* | |
| 2 | + * Copyright (C) 2014 kgto. | |
| 3 | + * | |
| 4 | + * This library is free software; you can redistribute it and/or | |
| 5 | + * modify it under the terms of the GNU Lesser General Public | |
| 6 | + * License as published by the Free Software Foundation; either | |
| 7 | + * version 2.1 of the License, or (at your option) any later version. | |
| 8 | + * | |
| 9 | + * This library is distributed in the hope that it will be useful, | |
| 10 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 11 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 12 | + * Lesser General Public License for more details. | |
| 13 | + * | |
| 14 | + * You should have received a copy of the GNU Lesser General Public | |
| 15 | + * License along with this library; if not, write to the Free Software | |
| 16 | + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
| 17 | + * MA 02110-1301 USA | |
| 18 | + */ | |
| 19 | +/* | |
| 20 | + * $Id$ | |
| 21 | + */ | |
| 22 | + | |
| 23 | +package WebScraping; | |
| 24 | + | |
| 25 | +import java.io.*; | |
| 26 | +import java.net.*; | |
| 27 | +import java.util.ArrayList; | |
| 28 | +import java.util.regex.Matcher; | |
| 29 | +import java.util.regex.Pattern; | |
| 30 | +import javax.swing.text.html.parser.ParserDelegator; | |
| 31 | + | |
| 32 | +/** | |
| 33 | + * | |
| 34 | + * @author kgto | |
| 35 | + */ | |
| 36 | +public class HtmlParser { | |
| 37 | + | |
| 38 | + String UrlAdress; | |
| 39 | + String pageData; | |
| 40 | + | |
| 41 | + ArrayList sData; | |
| 42 | + | |
| 43 | + // 作業ワーク | |
| 44 | + String htmltag; | |
| 45 | + String htmlid; | |
| 46 | + String htmlclass; | |
| 47 | + | |
| 48 | + public HtmlParser() { | |
| 49 | + UrlAdress = null; | |
| 50 | + } | |
| 51 | + | |
| 52 | + public HtmlParser(String UrlAdress) { | |
| 53 | + this.UrlAdress = UrlAdress; | |
| 54 | + getpageData(); | |
| 55 | + } | |
| 56 | + | |
| 57 | + public void seturl(String UrlAdress) { | |
| 58 | + this.UrlAdress = UrlAdress; | |
| 59 | + getpageData(); | |
| 60 | + } | |
| 61 | + | |
| 62 | + /** | |
| 63 | + * HTMLページ内検索. | |
| 64 | + * 検索キーとして渡されたタグ,ID,クラスから、対象となるタグを探し出し、 | |
| 65 | + * around(タグ位置)として指定された箇所の文字列をregexp(正規表現)で指定された整形を | |
| 66 | + * 行った結果を返す。<br> | |
| 67 | + * aroundの初期値:0 検索キーとして未指定(未入力)の場合、最初(0)の文字列。<br> | |
| 68 | + * regexpが指定(入力)ありの場合、正規表現にて整形を行う。<br> | |
| 69 | + * 渡された検索キーに一致するタグが存在しなかった場合、NULLを返す。 | |
| 70 | + * @param skey 検索キーデータ(SearchData) | |
| 71 | + * @return String 検索キーに一致するデータの文字列 | |
| 72 | + */ | |
| 73 | + public String search(SearchData skey) { | |
| 74 | + | |
| 75 | + String item = skey.getitem(); | |
| 76 | + String regexp = skey.getregexp(); | |
| 77 | + | |
| 78 | + // htmlページ内を検索 | |
| 79 | + if(isHtmlkeyEq(skey) == false) { | |
| 80 | + serchpageData(skey); | |
| 81 | + } | |
| 82 | + /* | |
| 83 | + around 出現位置指定 入力有り:指定された位置の情報のみ返す。 | |
| 84 | + 入力無し:取得した全ての情報を返す。 | |
| 85 | + */ | |
| 86 | + String wkaround = skey.getaround(); | |
| 87 | + if(wkaround.length() > 0) { | |
| 88 | + byte wbAround = 0; | |
| 89 | + wbAround = Byte.parseByte(wkaround); // 検索位置を数値変換 | |
| 90 | + if(wbAround < sData.size()) { | |
| 91 | + String str = (String)sData.get(wbAround); | |
| 92 | + String rtn = RegularExpression(str, regexp); | |
| 93 | + return item + "\t" + rtn; | |
| 94 | + } | |
| 95 | + } else { | |
| 96 | + StringBuilder strbuf = new StringBuilder(); | |
| 97 | + for (Object sData1 : sData) { | |
| 98 | + String str = (String)sData1; | |
| 99 | + String rtn = RegularExpression(str, regexp); | |
| 100 | + if(strbuf.length() > 0) { | |
| 101 | + strbuf.append("\t"); | |
| 102 | + } | |
| 103 | + strbuf.append(rtn); | |
| 104 | + } | |
| 105 | + return item + "\t" + strbuf; | |
| 106 | + } | |
| 107 | + return null; | |
| 108 | + } | |
| 109 | + | |
| 110 | + /** | |
| 111 | + * 直近のHTMLタグ/ID/CLASS値と引数の値を比較する. | |
| 112 | + * @param skey HTMLタグ/ID/CLASSが格納された検索キー | |
| 113 | + * @return boolean HTMLタグ/ID/CLASS値が一致する時、true | |
| 114 | + */ | |
| 115 | + public boolean isHtmlkeyEq(SearchData skey) { | |
| 116 | + | |
| 117 | + String stag = skey.getHtmltag(); | |
| 118 | + String sid = skey.getHtmlid(); | |
| 119 | + String sclass = skey.getHtmlclass(); | |
| 120 | + | |
| 121 | + boolean rtn = true; | |
| 122 | + | |
| 123 | + // htmltag | |
| 124 | + if(htmltag == null) { | |
| 125 | + rtn = false; | |
| 126 | + } else { | |
| 127 | + if(htmltag.equals(stag) == false) { | |
| 128 | + rtn = false; | |
| 129 | + } | |
| 130 | + } | |
| 131 | + | |
| 132 | + // htmlid | |
| 133 | + if(htmlid == null) { | |
| 134 | + rtn = false; | |
| 135 | + } else { | |
| 136 | + if(htmlid.equals(sid) == false) { | |
| 137 | + rtn = false; | |
| 138 | + } | |
| 139 | + } | |
| 140 | + | |
| 141 | + // htmlclass | |
| 142 | + if(htmlclass == null) { | |
| 143 | + rtn = false; | |
| 144 | + } else { | |
| 145 | + if(htmlclass.equals(sclass) == false) { | |
| 146 | + rtn = false; | |
| 147 | + } | |
| 148 | + } | |
| 149 | + | |
| 150 | + if(!rtn) { | |
| 151 | + htmltag = stag; | |
| 152 | + htmlid = sid; | |
| 153 | + htmlclass = sclass; | |
| 154 | + } | |
| 155 | + | |
| 156 | + return rtn; | |
| 157 | + } | |
| 158 | + | |
| 159 | + /** | |
| 160 | + * 正規表現検索. | |
| 161 | + * @param strdata | |
| 162 | + * @param regexp | |
| 163 | + * @return | |
| 164 | + */ | |
| 165 | + public String RegularExpression(String strdata, String regexp) { | |
| 166 | + String expdata = null; | |
| 167 | + | |
| 168 | + //regexpのチェック | |
| 169 | + if(regexp.isEmpty()) { | |
| 170 | + expdata = strdata; | |
| 171 | + return expdata; | |
| 172 | + } | |
| 173 | + | |
| 174 | + //正規表現検索 | |
| 175 | + Pattern ptn = Pattern.compile(regexp); | |
| 176 | + Matcher matchdata = ptn.matcher(strdata); | |
| 177 | + if (matchdata.find()) { | |
| 178 | + if(matchdata.groupCount() >= 1) { | |
| 179 | + expdata = matchdata.group(1); | |
| 180 | + } | |
| 181 | + } | |
| 182 | + return expdata; | |
| 183 | + } | |
| 184 | + | |
| 185 | + /** | |
| 186 | + * インターネット接続. | |
| 187 | + */ | |
| 188 | + private void getpageData() { | |
| 189 | + try { | |
| 190 | + URL url = new URL(UrlAdress); | |
| 191 | + HttpURLConnection con = (HttpURLConnection)url.openConnection(); | |
| 192 | + con.setRequestMethod("GET"); | |
| 193 | + BufferedReader reader = new BufferedReader( | |
| 194 | + new InputStreamReader(con.getInputStream(), "utf-8")); | |
| 195 | + String wkline; | |
| 196 | + StringBuilder sb = new StringBuilder(); | |
| 197 | + while((wkline = reader.readLine()) != null) { | |
| 198 | + sb.append(wkline).append("\n"); | |
| 199 | + } | |
| 200 | + pageData = sb.toString(); | |
| 201 | + | |
| 202 | + con.disconnect(); | |
| 203 | + } | |
| 204 | + catch(IOException e) { | |
| 205 | + System.err.println(e); | |
| 206 | + } | |
| 207 | + } | |
| 208 | + | |
| 209 | + /** | |
| 210 | + * HTMLパーサ. | |
| 211 | + * @param skey | |
| 212 | + */ | |
| 213 | + public void serchpageData(SearchData skey){ | |
| 214 | + Reader reader; | |
| 215 | + try { | |
| 216 | + reader = new BufferedReader(new StringReader(pageData)); | |
| 217 | + HtmlParserCallback cb = new HtmlParserCallback(skey); | |
| 218 | + ParserDelegator pd = new ParserDelegator(); | |
| 219 | + pd.parse(reader, cb, true); | |
| 220 | + reader.close(); | |
| 221 | + | |
| 222 | + sData = cb.getrtnData(); | |
| 223 | + | |
| 224 | + } catch (IOException e) { | |
| 225 | + System.err.println(e); | |
| 226 | + } | |
| 227 | + } | |
| 228 | + | |
| 229 | +} |