作業部屋の使い方を試しています。
branches/b3/WebScraping をマージ
| @@ -1,108 +1,71 @@ | ||
| 1 | 1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?> |
| 2 | -<searchdata> | |
| 3 | - <url>http://weather.yahoo.co.jp/weather/</url> | |
| 4 | - <searchlist> | |
| 5 | - <item>天気01</item> | |
| 6 | - <htmltag>li</htmltag> | |
| 7 | - <htmlid/> | |
| 8 | - <htmlclass>point pt1400</htmlclass> | |
| 9 | - <around/> | |
| 10 | - <regexp/> | |
| 11 | - </searchlist> | |
| 12 | - <searchlist> | |
| 13 | - <item>天気02</item> | |
| 14 | - <htmltag>li</htmltag> | |
| 15 | - <htmlid/> | |
| 16 | - <htmlclass>point pt1900</htmlclass> | |
| 17 | - <around/> | |
| 18 | - <regexp/> | |
| 19 | - </searchlist> | |
| 20 | - <searchlist> | |
| 21 | - <item>天気03</item> | |
| 22 | - <htmltag>li</htmltag> | |
| 23 | - <htmlid/> | |
| 24 | - <htmlclass>point pt3410</htmlclass> | |
| 25 | - <around/> | |
| 26 | - <regexp/> | |
| 27 | - </searchlist> | |
| 28 | - <searchlist> | |
| 29 | - <item>天気04</item> | |
| 30 | - <htmltag>li</htmltag> | |
| 31 | - <htmlid/> | |
| 32 | - <htmlclass>point pt4410</htmlclass> | |
| 33 | - <around/> | |
| 34 | - <regexp/> | |
| 35 | - </searchlist> | |
| 36 | - <searchlist> | |
| 37 | - <item>天気05</item> | |
| 38 | - <htmltag>li</htmltag> | |
| 39 | - <htmlid/> | |
| 40 | - <htmlclass>point pt5110</htmlclass> | |
| 41 | - <around/> | |
| 42 | - <regexp/> | |
| 43 | - </searchlist> | |
| 44 | - <searchlist> | |
| 45 | - <item>天気06</item> | |
| 46 | - <htmltag>li</htmltag> | |
| 47 | - <htmlid/> | |
| 48 | - <htmlclass>point pt5410</htmlclass> | |
| 49 | - <around/> | |
| 50 | - <regexp/> | |
| 51 | - </searchlist> | |
| 52 | - <searchlist> | |
| 53 | - <item>天気07</item> | |
| 54 | - <htmltag>li</htmltag> | |
| 55 | - <htmlid/> | |
| 56 | - <htmlclass>point pt5610</htmlclass> | |
| 57 | - <around/> | |
| 58 | - <regexp/> | |
| 59 | - </searchlist> | |
| 60 | - <searchlist> | |
| 61 | - <item>天気08</item> | |
| 62 | - <htmltag>li</htmltag> | |
| 63 | - <htmlid/> | |
| 64 | - <htmlclass>point pt6200</htmlclass> | |
| 65 | - <around/> | |
| 66 | - <regexp/> | |
| 67 | - </searchlist> | |
| 68 | - <searchlist> | |
| 69 | - <item>天気09</item> | |
| 70 | - <htmltag>li</htmltag> | |
| 71 | - <htmlid/> | |
| 72 | - <htmlclass>point pt6710</htmlclass> | |
| 73 | - <around/> | |
| 74 | - <regexp/> | |
| 75 | - </searchlist> | |
| 76 | - <searchlist> | |
| 77 | - <item>天気10</item> | |
| 78 | - <htmltag>li</htmltag> | |
| 79 | - <htmlid/> | |
| 80 | - <htmlclass>point pt7410</htmlclass> | |
| 81 | - <around/> | |
| 82 | - <regexp/> | |
| 83 | - </searchlist> | |
| 84 | - <searchlist> | |
| 85 | - <item>天気11</item> | |
| 86 | - <htmltag>li</htmltag> | |
| 87 | - <htmlid/> | |
| 88 | - <htmlclass>point pt8210</htmlclass> | |
| 89 | - <around/> | |
| 90 | - <regexp/> | |
| 91 | - </searchlist> | |
| 92 | - <searchlist> | |
| 93 | - <item>天気12</item> | |
| 94 | - <htmltag>li</htmltag> | |
| 95 | - <htmlid/> | |
| 96 | - <htmlclass>point pt8810</htmlclass> | |
| 97 | - <around/> | |
| 98 | - <regexp/> | |
| 99 | - </searchlist> | |
| 100 | - <searchlist> | |
| 101 | - <item>天気13</item> | |
| 102 | - <htmltag>li</htmltag> | |
| 103 | - <htmlid/> | |
| 104 | - <htmlclass>point pt9110</htmlclass> | |
| 105 | - <around/> | |
| 106 | - <regexp/> | |
| 107 | - </searchlist> | |
| 108 | -</searchdata> | |
| \ No newline at end of file | ||
| 2 | +<xmlcontainer> | |
| 3 | +<webscraping> | |
| 4 | +<url>http://weather.yahoo.co.jp/weather/</url> | |
| 5 | +<searchlist listNo="1"> | |
| 6 | +<item>天気01</item> | |
| 7 | +<htmltag>li</htmltag> | |
| 8 | +<htmlclass>point pt1400</htmlclass> | |
| 9 | +</searchlist> | |
| 10 | +<searchlist listNo="2"> | |
| 11 | +<item>天気02</item> | |
| 12 | +<htmltag>li</htmltag> | |
| 13 | +<htmlclass>point pt1900</htmlclass> | |
| 14 | +</searchlist> | |
| 15 | +<searchlist listNo="3"> | |
| 16 | +<item>天気03</item> | |
| 17 | +<htmltag>li</htmltag> | |
| 18 | +<htmlclass>point pt3410</htmlclass> | |
| 19 | +</searchlist> | |
| 20 | +<searchlist listNo="4"> | |
| 21 | +<item>天気04</item> | |
| 22 | +<htmltag>li</htmltag> | |
| 23 | +<htmlclass>point pt4410</htmlclass> | |
| 24 | +</searchlist> | |
| 25 | +<searchlist listNo="5"> | |
| 26 | +<item>天気05</item> | |
| 27 | +<htmltag>li</htmltag> | |
| 28 | +<htmlclass>point pt5110</htmlclass> | |
| 29 | +</searchlist> | |
| 30 | +<searchlist listNo="6"> | |
| 31 | +<item>天気06</item> | |
| 32 | +<htmltag>li</htmltag> | |
| 33 | +<htmlclass>point pt5410</htmlclass> | |
| 34 | +</searchlist> | |
| 35 | +<searchlist listNo="7"> | |
| 36 | +<item>天気07</item> | |
| 37 | +<htmltag>li</htmltag> | |
| 38 | +<htmlclass>point pt5610</htmlclass> | |
| 39 | +</searchlist> | |
| 40 | +<searchlist listNo="8"> | |
| 41 | +<item>天気08</item> | |
| 42 | +<htmltag>li</htmltag> | |
| 43 | +<htmlclass>point pt6200</htmlclass> | |
| 44 | +</searchlist> | |
| 45 | +<searchlist listNo="9"> | |
| 46 | +<item>天気09</item> | |
| 47 | +<htmltag>li</htmltag> | |
| 48 | +<htmlclass>point pt6710</htmlclass> | |
| 49 | +</searchlist> | |
| 50 | +<searchlist listNo="10"> | |
| 51 | +<item>天気10</item> | |
| 52 | +<htmltag>li</htmltag> | |
| 53 | +<htmlclass>point pt7410</htmlclass> | |
| 54 | +</searchlist> | |
| 55 | +<searchlist listNo="11"> | |
| 56 | +<item>天気11</item> | |
| 57 | +<htmltag>li</htmltag> | |
| 58 | +<htmlclass>point pt8210</htmlclass> | |
| 59 | +</searchlist> | |
| 60 | +<searchlist listNo="12"> | |
| 61 | +<item>天気12</item> | |
| 62 | +<htmltag>li</htmltag> | |
| 63 | +<htmlclass>point pt8810</htmlclass> | |
| 64 | +</searchlist> | |
| 65 | +<searchlist listNo="13"> | |
| 66 | +<item>天気13</item> | |
| 67 | +<htmltag>li</htmltag> | |
| 68 | +<htmlclass>point pt9110</htmlclass> | |
| 69 | +</searchlist> | |
| 70 | +</webscraping> | |
| 71 | +</xmlcontainer> |
| @@ -1 +1,217 @@ | ||
| 1 | -<?xml version="1.0" encoding="UTF-8" standalone="no"?><searchdata><url>http://stocks.finance.yahoo.co.jp/stocks/detail/?code=9984.T</url><searchlist><item>銘柄コード</item><htmltag>dl</htmltag><htmlid/><htmlclass>stocksInfo clearFix</htmlclass><around/><regexp>(^\d{4})</regexp></searchlist><searchlist><item>カテゴリ</item><htmltag>div</htmltag><htmlid/><htmlclass>stockMainTabParts stockMainTabPartsCurrent</htmlclass><around/><regexp/></searchlist><searchlist><item>業種</item><htmltag>dd</htmltag><htmlid/><htmlclass>category yjSb</htmlclass><around/><regexp/></searchlist><searchlist><item>取得時間</item><htmltag>dd</htmltag><htmlid/><htmlclass>yjSb real</htmlclass><around/><regexp>^(.*)\t</regexp></searchlist><searchlist><item>銘柄名</item><htmltag>th</htmltag><htmlid/><htmlclass>symbol</htmlclass><around/><regexp/></searchlist><searchlist><item>株価</item><htmltag>td</htmltag><htmlid/><htmlclass>stoksPrice</htmlclass><around/><regexp/></searchlist><searchlist><item>前日比</item><htmltag>td</htmltag><htmlid/><htmlclass>change</htmlclass><around/><regexp>\t(.*)(.*%)</regexp></searchlist><searchlist><item>前日比%</item><htmltag>td</htmltag><htmlid/><htmlclass>change</htmlclass><around/><regexp>\t.*((.*)%)</regexp></searchlist><searchlist><item>前日終値</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi clearfix</htmlclass><around>0</around><regexp>^([,0-9]+)\t</regexp></searchlist><searchlist><item>始値</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi clearfix</htmlclass><around>1</around><regexp>^([,0-9]+|-{3})\t</regexp></searchlist><searchlist><item>高値</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi clearfix</htmlclass><around>2</around><regexp>^([,0-9]+|-{3})\t</regexp></searchlist><searchlist><item>安値</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi clearfix</htmlclass><around>3</around><regexp>^([,0-9]+|-{3})\t</regexp></searchlist><searchlist><item>出来高</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi 
clearfix</htmlclass><around>4</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>売買代金</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi clearfix</htmlclass><around>5</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>値幅制限</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi clearfix</htmlclass><around>6</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>時価総額</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>0</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>発行済株式数</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>1</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>配当利回り</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>2</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>1株配当</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>3</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>PER</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>4</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>PBR</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>5</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>EPS</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>6</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>BPS</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>7</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>最低購入代金</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>8</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>単元株数</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS 
clearfix</htmlclass><around>9</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>年初来高値</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>10</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>年初来安値</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>11</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>信用買残</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>12</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>信用買残前週比</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>13</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>信用売残</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>14</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>信用売残前週比</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>15</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>貸借倍率</item><htmltag>div</htmltag><htmlid/><htmlclass>yjMS clearfix</htmlclass><around/><regexp>^(.*?)\t</regexp></searchlist></searchdata> | |
| \ No newline at end of file | ||
| 1 | +<?xml version="1.0" encoding="UTF-8" standalone="no"?> | |
| 2 | +<xmlcontainer> | |
| 3 | +<webscraping> | |
| 4 | +<url>http://stocks.finance.yahoo.co.jp/stocks/detail/?code=5020.T</url> | |
| 5 | +<searchlist listNo="1"> | |
| 6 | +<item>銘柄コード</item> | |
| 7 | +<htmltag>dl</htmltag> | |
| 8 | +<htmlclass>stocksInfo clearFix</htmlclass> | |
| 9 | +<regexp>(^\d{4})</regexp> | |
| 10 | +</searchlist> | |
| 11 | +<searchlist listNo="2"> | |
| 12 | +<item>カテゴリ</item> | |
| 13 | +<htmltag>div</htmltag> | |
| 14 | +<htmlclass>stockMainTabParts stockMainTabPartsCurrent</htmlclass> | |
| 15 | +</searchlist> | |
| 16 | +<searchlist listNo="3"> | |
| 17 | +<item>業種</item> | |
| 18 | +<htmltag>dd</htmltag> | |
| 19 | +<htmlclass>category yjSb</htmlclass> | |
| 20 | +</searchlist> | |
| 21 | +<searchlist listNo="4"> | |
| 22 | +<item>取得時間</item> | |
| 23 | +<htmltag>dd</htmltag> | |
| 24 | +<htmlclass>yjSb real</htmlclass> | |
| 25 | +<regexp>^(.*)\t</regexp> | |
| 26 | +</searchlist> | |
| 27 | +<searchlist listNo="5"> | |
| 28 | +<item>銘柄名</item> | |
| 29 | +<htmltag>th</htmltag> | |
| 30 | +<htmlclass>symbol</htmlclass> | |
| 31 | +</searchlist> | |
| 32 | +<searchlist listNo="6"> | |
| 33 | +<item>株価</item> | |
| 34 | +<htmltag>td</htmltag> | |
| 35 | +<htmlclass>stoksPrice</htmlclass> | |
| 36 | +</searchlist> | |
| 37 | +<searchlist listNo="7"> | |
| 38 | +<item>前日比</item> | |
| 39 | +<htmltag>td</htmltag> | |
| 40 | +<htmlclass>change</htmlclass> | |
| 41 | +<regexp>\t(.*)(.*%)</regexp> | |
| 42 | +</searchlist> | |
| 43 | +<searchlist listNo="8"> | |
| 44 | +<item>前日比%</item> | |
| 45 | +<htmltag>td</htmltag> | |
| 46 | +<htmlclass>change</htmlclass> | |
| 47 | +<regexp>\t.*((.*)%)</regexp> | |
| 48 | +</searchlist> | |
| 49 | +<searchlist listNo="9"> | |
| 50 | +<item>前日終値</item> | |
| 51 | +<htmltag>div</htmltag> | |
| 52 | +<htmlclass>lineFi clearfix</htmlclass> | |
| 53 | +<around>0</around> | |
| 54 | +<regexp>^([,.0-9]+)\t</regexp> | |
| 55 | +</searchlist> | |
| 56 | +<searchlist listNo="10"> | |
| 57 | +<item>始値</item> | |
| 58 | +<htmltag>div</htmltag> | |
| 59 | +<htmlclass>lineFi clearfix</htmlclass> | |
| 60 | +<around>1</around> | |
| 61 | +<regexp>^([,.0-9]+|-{3})\t</regexp> | |
| 62 | +</searchlist> | |
| 63 | +<searchlist listNo="11"> | |
| 64 | +<item>高値</item> | |
| 65 | +<htmltag>div</htmltag> | |
| 66 | +<htmlclass>lineFi clearfix</htmlclass> | |
| 67 | +<around>2</around> | |
| 68 | +<regexp>^((ストップ高\t|ストップ安\t)?[,0-9]+|-{3})</regexp> | |
| 69 | +</searchlist> | |
| 70 | +<searchlist listNo="12"> | |
| 71 | +<item>安値</item> | |
| 72 | +<htmltag>div</htmltag> | |
| 73 | +<htmlclass>lineFi clearfix</htmlclass> | |
| 74 | +<around>3</around> | |
| 75 | +<regexp>^((ストップ高\t|ストップ安\t)?[,0-9]+|-{3})</regexp> | |
| 76 | +</searchlist> | |
| 77 | +<searchlist listNo="13"> | |
| 78 | +<item>出来高</item> | |
| 79 | +<htmltag>div</htmltag> | |
| 80 | +<htmlclass>lineFi clearfix</htmlclass> | |
| 81 | +<around>4</around> | |
| 82 | +<regexp>^(.*?)\t</regexp> | |
| 83 | +</searchlist> | |
| 84 | +<searchlist listNo="14"> | |
| 85 | +<item>売買代金</item> | |
| 86 | +<htmltag>div</htmltag> | |
| 87 | +<htmlclass>lineFi clearfix</htmlclass> | |
| 88 | +<around>5</around> | |
| 89 | +<regexp>^(.*?)\t</regexp> | |
| 90 | +</searchlist> | |
| 91 | +<searchlist listNo="15"> | |
| 92 | +<item>値幅制限</item> | |
| 93 | +<htmltag>div</htmltag> | |
| 94 | +<htmlclass>lineFi clearfix</htmlclass> | |
| 95 | +<around>6</around> | |
| 96 | +<regexp>^(.*?)\t</regexp> | |
| 97 | +</searchlist> | |
| 98 | +<searchlist listNo="16"> | |
| 99 | +<item>時価総額</item> | |
| 100 | +<htmltag>div</htmltag> | |
| 101 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
| 102 | +<around>0</around> | |
| 103 | +<regexp>^(.*?)\t</regexp> | |
| 104 | +</searchlist> | |
| 105 | +<searchlist listNo="17"> | |
| 106 | +<item>発行済株式数</item> | |
| 107 | +<htmltag>div</htmltag> | |
| 108 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
| 109 | +<around>1</around> | |
| 110 | +<regexp>^(.*?)\t</regexp> | |
| 111 | +</searchlist> | |
| 112 | +<searchlist listNo="18"> | |
| 113 | +<item>配当利回り</item> | |
| 114 | +<htmltag>div</htmltag> | |
| 115 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
| 116 | +<around>2</around> | |
| 117 | +<regexp>^(.*?)\t</regexp> | |
| 118 | +</searchlist> | |
| 119 | +<searchlist listNo="19"> | |
| 120 | +<item>1株配当</item> | |
| 121 | +<htmltag>div</htmltag> | |
| 122 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
| 123 | +<around>3</around> | |
| 124 | +<regexp>^(.*?)\t</regexp> | |
| 125 | +</searchlist> | |
| 126 | +<searchlist listNo="20"> | |
| 127 | +<item>PER</item> | |
| 128 | +<htmltag>div</htmltag> | |
| 129 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
| 130 | +<around>4</around> | |
| 131 | +<regexp>^(.*?)\t</regexp> | |
| 132 | +</searchlist> | |
| 133 | +<searchlist listNo="21"> | |
| 134 | +<item>PBR</item> | |
| 135 | +<htmltag>div</htmltag> | |
| 136 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
| 137 | +<around>5</around> | |
| 138 | +<regexp>^(.*?)\t</regexp> | |
| 139 | +</searchlist> | |
| 140 | +<searchlist listNo="22"> | |
| 141 | +<item>EPS</item> | |
| 142 | +<htmltag>div</htmltag> | |
| 143 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
| 144 | +<around>6</around> | |
| 145 | +<regexp>^(.*?)\t</regexp> | |
| 146 | +</searchlist> | |
| 147 | +<searchlist listNo="23"> | |
| 148 | +<item>BPS</item> | |
| 149 | +<htmltag>div</htmltag> | |
| 150 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
| 151 | +<around>7</around> | |
| 152 | +<regexp>^(.*?)\t</regexp> | |
| 153 | +</searchlist> | |
| 154 | +<searchlist listNo="24"> | |
| 155 | +<item>最低購入代金</item> | |
| 156 | +<htmltag>div</htmltag> | |
| 157 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
| 158 | +<around>8</around> | |
| 159 | +<regexp>^(.*?)\t</regexp> | |
| 160 | +</searchlist> | |
| 161 | +<searchlist listNo="25"> | |
| 162 | +<item>単元株数</item> | |
| 163 | +<htmltag>div</htmltag> | |
| 164 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
| 165 | +<around>9</around> | |
| 166 | +<regexp>^(.*?)\t</regexp> | |
| 167 | +</searchlist> | |
| 168 | +<searchlist listNo="26"> | |
| 169 | +<item>年初来高値</item> | |
| 170 | +<htmltag>div</htmltag> | |
| 171 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
| 172 | +<around>10</around> | |
| 173 | +<regexp>^(.*?)\t</regexp> | |
| 174 | +</searchlist> | |
| 175 | +<searchlist listNo="27"> | |
| 176 | +<item>年初来安値</item> | |
| 177 | +<htmltag>div</htmltag> | |
| 178 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
| 179 | +<around>11</around> | |
| 180 | +<regexp>^(.*?)\t</regexp> | |
| 181 | +</searchlist> | |
| 182 | +<searchlist listNo="28"> | |
| 183 | +<item>信用買残</item> | |
| 184 | +<htmltag>div</htmltag> | |
| 185 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
| 186 | +<around>12</around> | |
| 187 | +<regexp>^(.*?)\t</regexp> | |
| 188 | +</searchlist> | |
| 189 | +<searchlist listNo="29"> | |
| 190 | +<item>信用買残前週比</item> | |
| 191 | +<htmltag>div</htmltag> | |
| 192 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
| 193 | +<around>13</around> | |
| 194 | +<regexp>^(.*?)\t</regexp> | |
| 195 | +</searchlist> | |
| 196 | +<searchlist listNo="30"> | |
| 197 | +<item>信用売残</item> | |
| 198 | +<htmltag>div</htmltag> | |
| 199 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
| 200 | +<around>14</around> | |
| 201 | +<regexp>^(.*?)\t</regexp> | |
| 202 | +</searchlist> | |
| 203 | +<searchlist listNo="31"> | |
| 204 | +<item>信用売残前週比</item> | |
| 205 | +<htmltag>div</htmltag> | |
| 206 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
| 207 | +<around>15</around> | |
| 208 | +<regexp>^(.*?)\t</regexp> | |
| 209 | +</searchlist> | |
| 210 | +<searchlist listNo="32"> | |
| 211 | +<item>貸借倍率</item> | |
| 212 | +<htmltag>div</htmltag> | |
| 213 | +<htmlclass>yjMS clearfix</htmlclass> | |
| 214 | +<regexp>^(.*?)\t</regexp> | |
| 215 | +</searchlist> | |
| 216 | +</webscraping> | |
| 217 | +</xmlcontainer> |
| @@ -0,0 +1,314 @@ | ||
| 1 | +/* | |
| 2 | + * Copyright (C) 2014 kgto. | |
| 3 | + * | |
| 4 | + * This library is free software; you can redistribute it and/or | |
| 5 | + * modify it under the terms of the GNU Lesser General Public | |
| 6 | + * License as published by the Free Software Foundation; either | |
| 7 | + * version 2.1 of the License, or (at your option) any later version. | |
| 8 | + * | |
| 9 | + * This library is distributed in the hope that it will be useful, | |
| 10 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 11 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 12 | + * Lesser General Public License for more details. | |
| 13 | + * | |
| 14 | + * You should have received a copy of the GNU Lesser General Public | |
| 15 | + * License along with this library; if not, write to the Free Software | |
| 16 | + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
| 17 | + * MA 02110-1301 USA | |
| 18 | + */ | |
| 19 | +/* | |
| 20 | + * $Id$ | |
| 21 | + */ | |
| 22 | + | |
| 23 | +package utility.test1; | |
| 24 | + | |
| 25 | +import webScraping.core.SearchData; | |
| 26 | +import java.io.File; | |
| 27 | +import java.io.FileNotFoundException; | |
| 28 | +import java.io.FileOutputStream; | |
| 29 | +import java.io.IOException; | |
| 30 | +import java.util.logging.Level; | |
| 31 | +import java.util.logging.Logger; | |
| 32 | +import javax.xml.parsers.DocumentBuilder; | |
| 33 | +import javax.xml.parsers.DocumentBuilderFactory; | |
| 34 | +import javax.xml.parsers.ParserConfigurationException; | |
| 35 | +import javax.xml.transform.Transformer; | |
| 36 | +import javax.xml.transform.TransformerConfigurationException; | |
| 37 | +import javax.xml.transform.TransformerException; | |
| 38 | +import javax.xml.transform.TransformerFactory; | |
| 39 | +import javax.xml.transform.dom.DOMSource; | |
| 40 | +import javax.xml.transform.stream.StreamResult; | |
| 41 | +import org.w3c.dom.DOMImplementation; | |
| 42 | +import org.w3c.dom.Document; | |
| 43 | +import org.w3c.dom.Element; | |
| 44 | +import org.w3c.dom.Node; | |
| 45 | +import org.w3c.dom.NodeList; | |
| 46 | +import org.xml.sax.SAXException; | |
| 47 | + | |
| 48 | +/** | |
| 49 | + * 検索データ読込・保存. | |
| 50 | + * @author kgto | |
| 51 | + */ | |
| 52 | +public class SearchDataRW { | |
| 53 | + /* ---------------------------------------------------------------------- * | |
| 54 | + * フィールド | |
| 55 | + * ---------------------------------------------------------------------- */ | |
| 56 | + private String UrlAdress; | |
| 57 | + | |
| 58 | + DocumentBuilder builder; | |
| 59 | + public Document document; | |
| 60 | + Element root; | |
| 61 | + | |
| 62 | + /* ---------------------------------------------------------------------- * | |
| 63 | + * コンストラクタ | |
| 64 | + * ---------------------------------------------------------------------- */ | |
| 65 | + public SearchDataRW() { | |
| 66 | + try { | |
| 67 | + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); | |
| 68 | + builder = factory.newDocumentBuilder(); | |
| 69 | + | |
| 70 | + } catch (ParserConfigurationException ex) { | |
| 71 | + Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
| 72 | + } | |
| 73 | + } | |
| 74 | + | |
| 75 | + /* ---------------------------------------------------------------------- * | |
| 76 | + * Setter | |
| 77 | + * ---------------------------------------------------------------------- */ | |
| 78 | + public void seturl(String UrlAdress) { | |
| 79 | + this.UrlAdress = UrlAdress; | |
| 80 | + } | |
| 81 | + | |
| 82 | + /* ---------------------------------------------------------------------- * | |
| 83 | + * Getter | |
| 84 | + * ---------------------------------------------------------------------- */ | |
| 85 | + public String geturl() { | |
| 86 | + return UrlAdress; | |
| 87 | + } | |
| 88 | + | |
| 89 | + /* ---------------------------------------------------------------------- * | |
| 90 | + * メソッド | |
| 91 | + * ---------------------------------------------------------------------- */ | |
| 92 | + /** | |
| 93 | + * 保存. | |
| 94 | + * @param file | |
| 95 | + */ | |
| 96 | + public void save(File file) { | |
| 97 | + saveUrl(UrlAdress); | |
| 98 | + saveSearchList(); | |
| 99 | + write(file); | |
| 100 | + } | |
| 101 | + | |
| 102 | + /** | |
| 103 | + * 読込. | |
| 104 | + * @param file | |
| 105 | + */ | |
| 106 | + public void load(File file) { | |
| 107 | + read(file); | |
| 108 | + loadUrl(); | |
| 109 | + loadSearchList(); | |
| 110 | + } | |
| 111 | + | |
| 112 | + /* ---------------------------------------------------------------------- */ | |
| 113 | + | |
| 114 | + void loadUrl() { | |
| 115 | + NodeList nodelist = root.getElementsByTagName("url"); | |
| 116 | + Node node = nodelist.item(0); | |
| 117 | + UrlAdress = node.getFirstChild().getNodeValue(); | |
| 118 | + } | |
| 119 | + | |
| 120 | + public void loadSearchList() { | |
| 121 | + SearchData.clear(); | |
| 122 | + | |
| 123 | + NodeList nodelist = root.getElementsByTagName("searchlist"); | |
| 124 | + for(int i = 0; i < nodelist.getLength(); i++) { | |
| 125 | + Node childnode = nodelist.item(i); | |
| 126 | + | |
| 127 | + boolean sdatflg = false; | |
| 128 | + SearchData sdat = new SearchData(); | |
| 129 | + for (Node child = childnode.getFirstChild(); child != null; child = child.getNextSibling()) { | |
| 130 | + if(child.getNodeType() == Node.ELEMENT_NODE) { | |
| 131 | + String tag = child.getNodeName(); | |
| 132 | + String rtn = ""; | |
| 133 | + if(child.getFirstChild() != null) { | |
| 134 | + rtn = child.getFirstChild().getNodeValue(); | |
| 135 | + } | |
| 136 | + switch (tag) { | |
| 137 | + case "item" : | |
| 138 | + sdat.setitem(rtn); | |
| 139 | + sdatflg = true; | |
| 140 | + break; | |
| 141 | + case "htmltag" : | |
| 142 | + sdat.setHtmltag(rtn); | |
| 143 | + sdatflg = true; | |
| 144 | + break; | |
| 145 | + case "htmlid" : | |
| 146 | + sdat.setHtmlid(rtn); | |
| 147 | + sdatflg = true; | |
| 148 | + break; | |
| 149 | + case "htmlclass" : | |
| 150 | + sdat.setHtmlclass(rtn); | |
| 151 | + sdatflg = true; | |
| 152 | + break; | |
| 153 | + case "around" : | |
| 154 | + sdat.setaround(rtn); | |
| 155 | + sdatflg = true; | |
| 156 | + break; | |
| 157 | + case "regexp" : | |
| 158 | + sdat.setregexp(rtn); | |
| 159 | + sdatflg = true; | |
| 160 | + break; | |
| 161 | + } | |
| 162 | + } | |
| 163 | + } | |
| 164 | + if(sdatflg) SearchData.add(sdat); | |
| 165 | + } | |
| 166 | + } | |
| 167 | + | |
| 168 | + public String loadMsg404() { | |
| 169 | + StringBuilder strbuf = new StringBuilder(); | |
| 170 | + NodeList nodelist = root.getElementsByTagName("msg404"); | |
| 171 | + for(int i = 0; i < nodelist.getLength(); i++) { | |
| 172 | + Node childnode = nodelist.item(i); | |
| 173 | + String str = childnode.getFirstChild().getNodeValue(); | |
| 174 | + if(strbuf.length() > 0) { | |
| 175 | + strbuf.append("\n"); | |
| 176 | + } | |
| 177 | + strbuf.append(str); | |
| 178 | + } | |
| 179 | + return strbuf.toString(); | |
| 180 | + } | |
| 181 | + | |
| 182 | + public Element loadElement(String elementTagName) { | |
| 183 | + NodeList nodelist = root.getElementsByTagName(elementTagName); | |
| 184 | + Element element = (Element)nodelist.item(0); | |
| 185 | + | |
| 186 | + return element; | |
| 187 | + } | |
| 188 | + | |
| 189 | + /* ---------------------------------------------------------------------- */ | |
| 190 | + | |
| 191 | + void saveUrl(String urladdress) { | |
| 192 | + checkdoc(); | |
| 193 | + removeElement("url"); // 既にElementが存在してた場合、一度削除 | |
| 194 | + | |
| 195 | + Element url = document.createElement("url"); | |
| 196 | + url.appendChild(document.createTextNode(urladdress)); | |
| 197 | + root.appendChild(url); | |
| 198 | + } | |
| 199 | + | |
| 200 | + void saveSearchList() { | |
| 201 | + checkdoc(); | |
| 202 | + removeElement("searchlist"); // 既にElementが存在してた場合、一度削除 | |
| 203 | + | |
| 204 | + int count = 0; | |
| 205 | + for(int i = 0; i < SearchData.size(); i++) { | |
| 206 | + SearchData sdat = SearchData.get(i); | |
| 207 | + | |
| 208 | + Element cslist = document.createElement("searchlist"); | |
| 209 | + cslist.setAttribute("listNo", String.valueOf(++count)); | |
| 210 | + | |
| 211 | + addChild(cslist, "item", sdat.getitem()); | |
| 212 | + addChild(cslist, "htmltag", sdat.getHtmltag()); | |
| 213 | + addChild(cslist, "htmlid", sdat.getHtmlid()); | |
| 214 | + addChild(cslist, "htmlclass", sdat.getHtmlclass()); | |
| 215 | + addChild(cslist, "around", sdat.getaround()); | |
| 216 | + addChild(cslist, "regexp", sdat.getregexp()); | |
| 217 | + | |
| 218 | + root.appendChild(cslist); | |
| 219 | + } | |
| 220 | + } | |
| 221 | + | |
| 222 | + void saveMsg404(String msg) { | |
| 223 | + checkdoc(); | |
| 224 | + removeElement("msg404"); // 既にElementが存在してた場合、一度削除 | |
| 225 | + | |
| 226 | + String[] msgs = msg.split("\n"); | |
| 227 | + int count = 0; | |
| 228 | + for(String msgOne : msgs) { | |
| 229 | + Element msgElement = document.createElement("msg404"); | |
| 230 | + msgElement.setAttribute("No", String.valueOf(++count)); | |
| 231 | + msgElement.appendChild(document.createTextNode(msgOne)); | |
| 232 | + | |
| 233 | + root.appendChild(msgElement); | |
| 234 | + } | |
| 235 | + } | |
| 236 | + | |
| 237 | + public void saveElement(Element element) { | |
| 238 | + checkdoc(); | |
| 239 | + removeElement(element.getTagName()); // 既にElementが存在してた場合、一度削除 | |
| 240 | + | |
| 241 | + root.appendChild(element); | |
| 242 | + } | |
| 243 | + | |
| 244 | + /* ---------------------------------------------------------------------- */ | |
| 245 | + | |
| 246 | + private void addChild(Element cslist, String keyword, String data) { | |
| 247 | + if(!data.isEmpty()) { | |
| 248 | + Element element = document.createElement(keyword); | |
| 249 | + element.appendChild(document.createTextNode(data)); | |
| 250 | + cslist.appendChild(element); | |
| 251 | + } | |
| 252 | + } | |
| 253 | + | |
| 254 | + private void removeElement(String elementTagName) { | |
| 255 | + int nodeSize; | |
| 256 | + do { | |
| 257 | + NodeList nodelist = document.getElementsByTagName(elementTagName); | |
| 258 | + nodeSize = nodelist.getLength(); | |
| 259 | + for(int i = 0; i < nodelist.getLength(); i++) { | |
| 260 | + Node node = nodelist.item(i); | |
| 261 | + root.removeChild(node); | |
| 262 | + } | |
| 263 | + } while(nodeSize > 0); | |
| 264 | + } | |
| 265 | + | |
| 266 | + /** | |
| 267 | + * ドキュメントチェック. | |
| 268 | + * 新規の場合やXMLファイルの読込みが行われていない状態時、新たにルートエレメントを作成する。 | |
| 269 | + * 既読の場合、ルートエレメントの取得を行う。 | |
| 270 | + */ | |
| 271 | + public void checkdoc() { | |
| 272 | + if(document == null) { | |
| 273 | + DOMImplementation domImpl = builder.getDOMImplementation(); | |
| 274 | + document = domImpl.createDocument("","searchdata",null); | |
| 275 | + } | |
| 276 | + root = document.getDocumentElement(); | |
| 277 | + } | |
| 278 | + | |
| 279 | + /** | |
| 280 | + * XML読込み. | |
| 281 | + * @param file | |
| 282 | + */ | |
| 283 | + public void read(File file) { | |
| 284 | + try { | |
| 285 | + document = builder.parse(file); | |
| 286 | + root = document.getDocumentElement(); | |
| 287 | + | |
| 288 | + } catch (SAXException | IOException ex) { | |
| 289 | + Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
| 290 | + } | |
| 291 | + } | |
| 292 | + | |
| 293 | + /** | |
| 294 | + * XML書込み. | |
| 295 | + * @param file | |
| 296 | + */ | |
| 297 | + public void write(File file) { | |
| 298 | + try { | |
| 299 | + TransformerFactory transFactory = TransformerFactory.newInstance(); | |
| 300 | + Transformer transformer = transFactory.newTransformer(); | |
| 301 | + | |
| 302 | + DOMSource source = new DOMSource(document); | |
| 303 | + FileOutputStream os = new FileOutputStream(file); | |
| 304 | + StreamResult result = new StreamResult(os); | |
| 305 | + transformer.transform(source, result); | |
| 306 | + | |
| 307 | + } catch (TransformerConfigurationException ex) { | |
| 308 | + Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
| 309 | + } catch (FileNotFoundException | TransformerException ex) { | |
| 310 | + Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
| 311 | + } | |
| 312 | + } | |
| 313 | + | |
| 314 | +} |
| @@ -1,7 +1,6 @@ | ||
| 1 | 1 | |
| 2 | 2 | package utility.test1; |
| 3 | 3 | |
| 4 | -import webScraping.utility.SearchDataRW; | |
| 5 | 4 | import java.io.File; |
| 6 | 5 | import java.lang.reflect.InvocationTargetException; |
| 7 | 6 | import java.lang.reflect.Method; |
| @@ -1,7 +1,6 @@ | ||
| 1 | 1 | |
| 2 | 2 | package utility.test1; |
| 3 | 3 | |
| 4 | -import webScraping.utility.SearchDataRW; | |
| 5 | 4 | import java.io.File; |
| 6 | 5 | import webScraping.core.SearchData; |
| 7 | 6 |
| @@ -0,0 +1,42 @@ | ||
| 1 | + | |
| 2 | +package utility.test1; | |
| 3 | + | |
| 4 | +import java.io.File; | |
| 5 | +import webScraping.utility.ScrapingXml; | |
| 6 | + | |
| 7 | +/** | |
| 8 | + * XMLコンバータ | |
| 9 | + * 旧:SearchDataRW.java → 新:ScrapingXml.java | |
| 10 | + * @author kgto | |
| 11 | + */ | |
| 12 | +public class ConvertXml01 { | |
| 13 | + | |
| 14 | + private String UrlAdress; | |
| 15 | + File file = new File("test1.xml"); | |
| 16 | + | |
| 17 | + /** | |
| 18 | + * @param args the command line arguments | |
| 19 | + */ | |
| 20 | + public static void main(String[] args) { | |
| 21 | + ConvertXml01 conv = new ConvertXml01(); | |
| 22 | + | |
| 23 | + conv.readold(); | |
| 24 | + conv.writenew(); | |
| 25 | + | |
| 26 | + System.exit(0); | |
| 27 | + } | |
| 28 | + | |
| 29 | + void readold() { | |
| 30 | + SearchDataRW sdatrw = new SearchDataRW(); | |
| 31 | + sdatrw.load(file); | |
| 32 | + UrlAdress = sdatrw.geturl(); | |
| 33 | + } | |
| 34 | + | |
| 35 | + void writenew() { | |
| 36 | + ScrapingXml xmlwriter = new ScrapingXml(); | |
| 37 | + xmlwriter.setTestUrl(UrlAdress); | |
| 38 | + xmlwriter.setSdata(); | |
| 39 | + xmlwriter.save(file); | |
| 40 | + } | |
| 41 | + | |
| 42 | +} |
| @@ -1,16 +1,71 @@ | ||
| 1 | -<?xml version="1.0" encoding="UTF-8" standalone="no"?><searchdata> | |
| 2 | - | |
| 3 | - | |
| 4 | - | |
| 5 | - | |
| 6 | - | |
| 7 | - | |
| 8 | - | |
| 9 | - | |
| 10 | - | |
| 11 | - | |
| 12 | - | |
| 13 | - | |
| 14 | - | |
| 15 | - | |
| 16 | -<url>http://weather.yahoo.co.jp/weather/</url><searchlist listNo="1"><item>天気01</item><htmltag>li</htmltag><htmlclass>point pt1400</htmlclass></searchlist><searchlist listNo="2"><item>天気02</item><htmltag>li</htmltag><htmlclass>point pt1900</htmlclass></searchlist><searchlist listNo="3"><item>天気03</item><htmltag>li</htmltag><htmlclass>point pt3410</htmlclass></searchlist><searchlist listNo="4"><item>天気04</item><htmltag>li</htmltag><htmlclass>point pt4410</htmlclass></searchlist><searchlist listNo="5"><item>天気05</item><htmltag>li</htmltag><htmlclass>point pt5110</htmlclass></searchlist><searchlist listNo="6"><item>天気06</item><htmltag>li</htmltag><htmlclass>point pt5410</htmlclass></searchlist><searchlist listNo="7"><item>天気07</item><htmltag>li</htmltag><htmlclass>point pt5610</htmlclass></searchlist><searchlist listNo="8"><item>天気08</item><htmltag>li</htmltag><htmlclass>point pt6200</htmlclass></searchlist><searchlist listNo="9"><item>天気09</item><htmltag>li</htmltag><htmlclass>point pt6710</htmlclass></searchlist><searchlist listNo="10"><item>天気10</item><htmltag>li</htmltag><htmlclass>point pt7410</htmlclass></searchlist><searchlist listNo="11"><item>天気11</item><htmltag>li</htmltag><htmlclass>point pt8210</htmlclass></searchlist><searchlist listNo="12"><item>天気12</item><htmltag>li</htmltag><htmlclass>point pt8810</htmlclass></searchlist><searchlist listNo="13"><item>天気13</item><htmltag>li</htmltag><htmlclass>point pt9110</htmlclass></searchlist></searchdata> | |
| \ No newline at end of file | ||
| 1 | +<?xml version="1.0" encoding="UTF-8" standalone="no"?> | |
| 2 | +<xmlcontainer> | |
| 3 | +<webscraping> | |
| 4 | +<url>http://weather.yahoo.co.jp/weather/</url> | |
| 5 | +<searchlist listNo="1"> | |
| 6 | +<item>天気01</item> | |
| 7 | +<htmltag>li</htmltag> | |
| 8 | +<htmlclass>point pt1400</htmlclass> | |
| 9 | +</searchlist> | |
| 10 | +<searchlist listNo="2"> | |
| 11 | +<item>天気02</item> | |
| 12 | +<htmltag>li</htmltag> | |
| 13 | +<htmlclass>point pt1900</htmlclass> | |
| 14 | +</searchlist> | |
| 15 | +<searchlist listNo="3"> | |
| 16 | +<item>天気03</item> | |
| 17 | +<htmltag>li</htmltag> | |
| 18 | +<htmlclass>point pt3410</htmlclass> | |
| 19 | +</searchlist> | |
| 20 | +<searchlist listNo="4"> | |
| 21 | +<item>天気04</item> | |
| 22 | +<htmltag>li</htmltag> | |
| 23 | +<htmlclass>point pt4410</htmlclass> | |
| 24 | +</searchlist> | |
| 25 | +<searchlist listNo="5"> | |
| 26 | +<item>天気05</item> | |
| 27 | +<htmltag>li</htmltag> | |
| 28 | +<htmlclass>point pt5110</htmlclass> | |
| 29 | +</searchlist> | |
| 30 | +<searchlist listNo="6"> | |
| 31 | +<item>天気06</item> | |
| 32 | +<htmltag>li</htmltag> | |
| 33 | +<htmlclass>point pt5410</htmlclass> | |
| 34 | +</searchlist> | |
| 35 | +<searchlist listNo="7"> | |
| 36 | +<item>天気07</item> | |
| 37 | +<htmltag>li</htmltag> | |
| 38 | +<htmlclass>point pt5610</htmlclass> | |
| 39 | +</searchlist> | |
| 40 | +<searchlist listNo="8"> | |
| 41 | +<item>天気08</item> | |
| 42 | +<htmltag>li</htmltag> | |
| 43 | +<htmlclass>point pt6200</htmlclass> | |
| 44 | +</searchlist> | |
| 45 | +<searchlist listNo="9"> | |
| 46 | +<item>天気09</item> | |
| 47 | +<htmltag>li</htmltag> | |
| 48 | +<htmlclass>point pt6710</htmlclass> | |
| 49 | +</searchlist> | |
| 50 | +<searchlist listNo="10"> | |
| 51 | +<item>天気10</item> | |
| 52 | +<htmltag>li</htmltag> | |
| 53 | +<htmlclass>point pt7410</htmlclass> | |
| 54 | +</searchlist> | |
| 55 | +<searchlist listNo="11"> | |
| 56 | +<item>天気11</item> | |
| 57 | +<htmltag>li</htmltag> | |
| 58 | +<htmlclass>point pt8210</htmlclass> | |
| 59 | +</searchlist> | |
| 60 | +<searchlist listNo="12"> | |
| 61 | +<item>天気12</item> | |
| 62 | +<htmltag>li</htmltag> | |
| 63 | +<htmlclass>point pt8810</htmlclass> | |
| 64 | +</searchlist> | |
| 65 | +<searchlist listNo="13"> | |
| 66 | +<item>天気13</item> | |
| 67 | +<htmltag>li</htmltag> | |
| 68 | +<htmlclass>point pt9110</htmlclass> | |
| 69 | +</searchlist> | |
| 70 | +</webscraping> | |
| 71 | +</xmlcontainer> |
| @@ -1,547 +0,0 @@ | ||
| 1 | -/* | |
| 2 | - * Copyright (C) 2014 kgto. | |
| 3 | - * | |
| 4 | - * This library is free software; you can redistribute it and/or | |
| 5 | - * modify it under the terms of the GNU Lesser General Public | |
| 6 | - * License as published by the Free Software Foundation; either | |
| 7 | - * version 2.1 of the License, or (at your option) any later version. | |
| 8 | - * | |
| 9 | - * This library is distributed in the hope that it will be useful, | |
| 10 | - * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 11 | - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 12 | - * Lesser General Public License for more details. | |
| 13 | - * | |
| 14 | - * You should have received a copy of the GNU Lesser General Public | |
| 15 | - * License along with this library; if not, write to the Free Software | |
| 16 | - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
| 17 | - * MA 02110-1301 USA | |
| 18 | - */ | |
| 19 | -/* | |
| 20 | - * $Id$ | |
| 21 | - */ | |
| 22 | - | |
| 23 | -package webScraping.utility; | |
| 24 | - | |
| 25 | -import webScraping.core.SearchData; | |
| 26 | -import java.io.BufferedReader; | |
| 27 | -import java.io.BufferedWriter; | |
| 28 | -import java.io.File; | |
| 29 | -import java.io.FileInputStream; | |
| 30 | -import java.io.FileNotFoundException; | |
| 31 | -import java.io.FileOutputStream; | |
| 32 | -import java.io.IOException; | |
| 33 | -import java.io.InputStreamReader; | |
| 34 | -import java.io.OutputStreamWriter; | |
| 35 | -import java.util.ArrayList; | |
| 36 | -import java.util.logging.Level; | |
| 37 | -import java.util.logging.Logger; | |
| 38 | -import javax.xml.parsers.DocumentBuilder; | |
| 39 | -import javax.xml.parsers.DocumentBuilderFactory; | |
| 40 | -import javax.xml.parsers.ParserConfigurationException; | |
| 41 | -import javax.xml.transform.Transformer; | |
| 42 | -import javax.xml.transform.TransformerConfigurationException; | |
| 43 | -import javax.xml.transform.TransformerException; | |
| 44 | -import javax.xml.transform.TransformerFactory; | |
| 45 | -import javax.xml.transform.dom.DOMSource; | |
| 46 | -import javax.xml.transform.stream.StreamResult; | |
| 47 | -import org.w3c.dom.DOMImplementation; | |
| 48 | -import org.w3c.dom.Document; | |
| 49 | -import org.w3c.dom.Element; | |
| 50 | -import org.w3c.dom.Node; | |
| 51 | -import org.w3c.dom.NodeList; | |
| 52 | -import org.xml.sax.SAXException; | |
| 53 | - | |
| 54 | -/** | |
| 55 | - * | |
| 56 | - * @author kgto | |
| 57 | - */ | |
| 58 | -public class SearchDataRW { | |
| 59 | - | |
| 60 | - DocumentBuilder builder; | |
| 61 | - public Document document; | |
| 62 | - Element root; | |
| 63 | - | |
| 64 | - private final String splitchar = "\t"; | |
| 65 | - | |
| 66 | - private String UrlAdress; | |
| 67 | - private ArrayList<SearchData> slist = new ArrayList<>(); | |
| 68 | - | |
| 69 | - public SearchDataRW() { | |
| 70 | - try { | |
| 71 | - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); | |
| 72 | - builder = factory.newDocumentBuilder(); | |
| 73 | - | |
| 74 | - } catch (ParserConfigurationException ex) { | |
| 75 | - Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
| 76 | - } | |
| 77 | - } | |
| 78 | - | |
| 79 | - public void seturl(String UrlAdress) { | |
| 80 | - this.UrlAdress = UrlAdress; | |
| 81 | - } | |
| 82 | - | |
| 83 | - public void setslist(ArrayList slist) { | |
| 84 | - this.slist = slist; | |
| 85 | - } | |
| 86 | - | |
| 87 | - public String geturl() { | |
| 88 | - return UrlAdress; | |
| 89 | - } | |
| 90 | - | |
| 91 | - public ArrayList getslist() { | |
| 92 | - return slist; | |
| 93 | - } | |
| 94 | - | |
| 95 | - /** | |
| 96 | - * 保存. | |
| 97 | - * @param file | |
| 98 | - */ | |
| 99 | - public void save(File file) { | |
| 100 | - //saveCsv(file); | |
| 101 | - //saveXml(file); | |
| 102 | - | |
| 103 | - saveUrl(UrlAdress); | |
| 104 | - saveSearchList(slist); | |
| 105 | - write(file); | |
| 106 | - } | |
| 107 | - | |
| 108 | - /** | |
| 109 | - * 読込. | |
| 110 | - * @param file | |
| 111 | - */ | |
| 112 | - public void load(File file) { | |
| 113 | - //loadCsv(file); | |
| 114 | - //loadXml(file); | |
| 115 | - | |
| 116 | - read(file); | |
| 117 | - loadUrl(); | |
| 118 | - loadSearchList(); | |
| 119 | - } | |
| 120 | - | |
| 121 | - /* ---------------------------------------------------------------------- */ | |
| 122 | - /** | |
| 123 | - * 保存(CSV形式). | |
| 124 | - * @param file | |
| 125 | - */ | |
| 126 | - public void saveCsv(File file) { | |
| 127 | - BufferedWriter bufferedwriter = null; | |
| 128 | - try { | |
| 129 | - //空のファイルを作成 | |
| 130 | - file.createNewFile(); | |
| 131 | - FileOutputStream fileoutputstream = new FileOutputStream(file); | |
| 132 | - OutputStreamWriter outputstreamwriter = new OutputStreamWriter(fileoutputstream, "UTF-8"); | |
| 133 | - bufferedwriter = new BufferedWriter(outputstreamwriter); | |
| 134 | - | |
| 135 | - // URL | |
| 136 | - bufferedwriter.write(UrlAdress); | |
| 137 | - bufferedwriter.write("\n"); | |
| 138 | - // 検索情報 | |
| 139 | - for(Object slist1 : slist) { | |
| 140 | - SearchData sdat = (SearchData)slist1; | |
| 141 | - // | |
| 142 | - StringBuilder str = new StringBuilder(); | |
| 143 | - str.append(sdat.getitem()).append(splitchar); | |
| 144 | - str.append(sdat.getHtmltag()).append(splitchar); | |
| 145 | - str.append(sdat.getHtmlid()).append(splitchar); | |
| 146 | - str.append(sdat.getHtmlclass()).append(splitchar); | |
| 147 | - str.append(sdat.getaround()).append(splitchar); | |
| 148 | - str.append(sdat.getregexp()).append("\n"); | |
| 149 | - // 書込み | |
| 150 | - bufferedwriter.write(str.toString()); | |
| 151 | - } | |
| 152 | - | |
| 153 | - } catch (IOException ex) { | |
| 154 | - Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
| 155 | - } finally { | |
| 156 | - try { | |
| 157 | - if(bufferedwriter != null) { | |
| 158 | - bufferedwriter.close(); | |
| 159 | - } | |
| 160 | - | |
| 161 | - } catch (IOException ex) { | |
| 162 | - Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
| 163 | - } | |
| 164 | - } | |
| 165 | - } | |
| 166 | - | |
| 167 | - /** | |
| 168 | - * 読込(CSV形式). | |
| 169 | - * @param file | |
| 170 | - */ | |
| 171 | - public void loadCsv(File file) { | |
| 172 | - slist = new ArrayList(); | |
| 173 | - | |
| 174 | - BufferedReader bufferedreader = null; | |
| 175 | - try { | |
| 176 | - FileInputStream fileinputstream = new FileInputStream(file); | |
| 177 | - InputStreamReader inputstreamreader = new InputStreamReader(fileinputstream, "UTF-8"); | |
| 178 | - bufferedreader = new BufferedReader(inputstreamreader); | |
| 179 | - | |
| 180 | - // URL | |
| 181 | - UrlAdress = bufferedreader.readLine(); | |
| 182 | - // 検索情報 | |
| 183 | - String rec; | |
| 184 | - while((rec = bufferedreader.readLine()) != null) { | |
| 185 | - String[] recary = rec.split(splitchar, -1); | |
| 186 | - SearchData sdat = new SearchData(); | |
| 187 | - sdat.setitem(recary[0]); | |
| 188 | - sdat.setHtmltag(recary[1]); | |
| 189 | - sdat.setHtmlid(recary[2]); | |
| 190 | - sdat.setHtmlclass(recary[3]); | |
| 191 | - sdat.setaround(recary[4]); | |
| 192 | - sdat.setregexp(recary[5]); | |
| 193 | - | |
| 194 | - slist.add(sdat); | |
| 195 | - } | |
| 196 | - | |
| 197 | - } catch(IOException ex) { | |
| 198 | - Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
| 199 | - | |
| 200 | - } finally { | |
| 201 | - try { | |
| 202 | - if(bufferedreader != null) { | |
| 203 | - bufferedreader.close(); | |
| 204 | - } | |
| 205 | - | |
| 206 | - } catch (IOException ex) { | |
| 207 | - Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
| 208 | - } | |
| 209 | - } | |
| 210 | - } | |
| 211 | - | |
| 212 | - /* ---------------------------------------------------------------------- */ | |
| 213 | - /** | |
| 214 | - * 保存(XML形式). | |
| 215 | - * @param file | |
| 216 | - */ | |
| 217 | - public void saveXml(File file) { | |
| 218 | - try { | |
| 219 | - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); | |
| 220 | - DocumentBuilder wkBuilder = factory.newDocumentBuilder(); | |
| 221 | - DOMImplementation domImpl = wkBuilder.getDOMImplementation(); | |
| 222 | - | |
| 223 | - Document doc = domImpl.createDocument("","searchdata",null); | |
| 224 | - Element wkRoot = doc.getDocumentElement(); | |
| 225 | - | |
| 226 | - // URL | |
| 227 | - Element url = doc.createElement("url"); | |
| 228 | - url.appendChild(doc.createTextNode(UrlAdress)); | |
| 229 | - wkRoot.appendChild(url); | |
| 230 | - | |
| 231 | - // 検索情報 | |
| 232 | - for (Object slist1 : slist) { | |
| 233 | - SearchData sdat = (SearchData) slist1; | |
| 234 | - | |
| 235 | - Element cslist = doc.createElement("searchlist"); | |
| 236 | - Element item = doc.createElement("item"); | |
| 237 | - Element htmltag = doc.createElement("htmltag"); | |
| 238 | - Element htmlid = doc.createElement("htmlid"); | |
| 239 | - Element htmlclass = doc.createElement("htmlclass"); | |
| 240 | - Element around = doc.createElement("around"); | |
| 241 | - Element regexp = doc.createElement("regexp"); | |
| 242 | - | |
| 243 | - item.appendChild(doc.createTextNode(sdat.getitem())); | |
| 244 | - htmltag.appendChild(doc.createTextNode(sdat.getHtmltag())); | |
| 245 | - htmlid.appendChild(doc.createTextNode(sdat.getHtmlid())); | |
| 246 | - htmlclass.appendChild(doc.createTextNode(sdat.getHtmlclass())); | |
| 247 | - around.appendChild(doc.createTextNode(sdat.getaround())); | |
| 248 | - regexp.appendChild(doc.createTextNode(sdat.getregexp())); | |
| 249 | - | |
| 250 | - cslist.appendChild(item); | |
| 251 | - cslist.appendChild(htmltag); | |
| 252 | - cslist.appendChild(htmlid); | |
| 253 | - cslist.appendChild(htmlclass); | |
| 254 | - cslist.appendChild(around); | |
| 255 | - cslist.appendChild(regexp); | |
| 256 | - | |
| 257 | - wkRoot.appendChild(cslist); | |
| 258 | - } | |
| 259 | - // 出力 | |
| 260 | - TransformerFactory transFactory = TransformerFactory.newInstance(); | |
| 261 | - Transformer transformer = transFactory.newTransformer(); | |
| 262 | - | |
| 263 | - DOMSource source = new DOMSource(doc); | |
| 264 | - FileOutputStream os = new FileOutputStream(file); | |
| 265 | - StreamResult result = new StreamResult(os); | |
| 266 | - transformer.transform(source, result); | |
| 267 | - | |
| 268 | - } catch (ParserConfigurationException | FileNotFoundException ex) { | |
| 269 | - Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
| 270 | - } catch (TransformerConfigurationException ex) { | |
| 271 | - Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
| 272 | - } catch (TransformerException ex) { | |
| 273 | - Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
| 274 | - } | |
| 275 | - } | |
| 276 | - | |
| 277 | - /** | |
| 278 | - * 読込(XML形式). | |
| 279 | - * @param file | |
| 280 | - */ | |
| 281 | - public void loadXml(File file) { | |
| 282 | - slist = new ArrayList(); | |
| 283 | - | |
| 284 | - try { | |
| 285 | - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); | |
| 286 | - DocumentBuilder wkBuilder = factory.newDocumentBuilder(); | |
| 287 | - Document doc = wkBuilder.parse(file); | |
| 288 | - | |
| 289 | - // ルート要素の取得 | |
| 290 | - Element wkRoot = doc.getDocumentElement(); | |
| 291 | - | |
| 292 | - // URL | |
| 293 | - NodeList url = wkRoot.getElementsByTagName("url"); | |
| 294 | - Node urlnode = url.item(0); | |
| 295 | - UrlAdress = urlnode.getFirstChild().getNodeValue(); | |
| 296 | - | |
| 297 | - // 検索情報 | |
| 298 | - NodeList cslist = wkRoot.getElementsByTagName("searchlist"); | |
| 299 | - for(int i = 0; i < cslist.getLength(); i++) { | |
| 300 | - SearchData sdat = new SearchData(); | |
| 301 | - | |
| 302 | - Node slistnode = cslist.item(i); | |
| 303 | - Node child; | |
| 304 | - for (child = slistnode.getFirstChild(); child != null; child = child.getNextSibling()) { | |
| 305 | - if(child.getNodeType() == Node.ELEMENT_NODE) { | |
| 306 | - | |
| 307 | - String tag = child.getNodeName(); | |
| 308 | - String rtn = ""; | |
| 309 | - if(child.getFirstChild() != null) { | |
| 310 | - rtn = child.getFirstChild().getNodeValue(); | |
| 311 | - } | |
| 312 | - | |
| 313 | - switch (tag) { | |
| 314 | - case "item" : | |
| 315 | - sdat.setitem(rtn); | |
| 316 | - break; | |
| 317 | - case "htmltag" : | |
| 318 | - sdat.setHtmltag(rtn); | |
| 319 | - break; | |
| 320 | - case "htmlid" : | |
| 321 | - sdat.setHtmlid(rtn); | |
| 322 | - break; | |
| 323 | - case "htmlclass" : | |
| 324 | - sdat.setHtmlclass(rtn); | |
| 325 | - break; | |
| 326 | - case "around" : | |
| 327 | - sdat.setaround(rtn); | |
| 328 | - break; | |
| 329 | - case "regexp" : | |
| 330 | - sdat.setregexp(rtn); | |
| 331 | - break; | |
| 332 | - } | |
| 333 | - } | |
| 334 | - } | |
| 335 | - slist.add(sdat); | |
| 336 | - } | |
| 337 | - | |
| 338 | - } catch (ParserConfigurationException | SAXException | IOException ex) { | |
| 339 | - Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
| 340 | - } | |
| 341 | - } | |
| 342 | - | |
| 343 | - /* ---------------------------------------------------------------------- */ | |
| 344 | - | |
| 345 | - void loadUrl() { | |
| 346 | - NodeList nodelist = root.getElementsByTagName("url"); | |
| 347 | - Node node = nodelist.item(0); | |
| 348 | - UrlAdress = node.getFirstChild().getNodeValue(); | |
| 349 | - } | |
| 350 | - | |
| 351 | - public void loadSearchList() { | |
| 352 | - slist.clear(); | |
| 353 | - SearchData.clear(); | |
| 354 | - | |
| 355 | - NodeList nodelist = root.getElementsByTagName("searchlist"); | |
| 356 | - for(int i = 0; i < nodelist.getLength(); i++) { | |
| 357 | - Node childnode = nodelist.item(i); | |
| 358 | - | |
| 359 | - boolean sdatflg = false; | |
| 360 | - SearchData sdat = new SearchData(); | |
| 361 | - for (Node child = childnode.getFirstChild(); child != null; child = child.getNextSibling()) { | |
| 362 | - if(child.getNodeType() == Node.ELEMENT_NODE) { | |
| 363 | - String tag = child.getNodeName(); | |
| 364 | - String rtn = ""; | |
| 365 | - if(child.getFirstChild() != null) { | |
| 366 | - rtn = child.getFirstChild().getNodeValue(); | |
| 367 | - } | |
| 368 | - switch (tag) { | |
| 369 | - case "item" : | |
| 370 | - sdat.setitem(rtn); | |
| 371 | - sdatflg = true; | |
| 372 | - break; | |
| 373 | - case "htmltag" : | |
| 374 | - sdat.setHtmltag(rtn); | |
| 375 | - sdatflg = true; | |
| 376 | - break; | |
| 377 | - case "htmlid" : | |
| 378 | - sdat.setHtmlid(rtn); | |
| 379 | - sdatflg = true; | |
| 380 | - break; | |
| 381 | - case "htmlclass" : | |
| 382 | - sdat.setHtmlclass(rtn); | |
| 383 | - sdatflg = true; | |
| 384 | - break; | |
| 385 | - case "around" : | |
| 386 | - sdat.setaround(rtn); | |
| 387 | - sdatflg = true; | |
| 388 | - break; | |
| 389 | - case "regexp" : | |
| 390 | - sdat.setregexp(rtn); | |
| 391 | - sdatflg = true; | |
| 392 | - break; | |
| 393 | - } | |
| 394 | - } | |
| 395 | - } | |
| 396 | - if(sdatflg) slist.add(sdat); | |
| 397 | - if(sdatflg) SearchData.add(sdat); | |
| 398 | - } | |
| 399 | - } | |
| 400 | - | |
| 401 | - public String loadMsg404() { | |
| 402 | - StringBuilder strbuf = new StringBuilder(); | |
| 403 | - NodeList nodelist = root.getElementsByTagName("msg404"); | |
| 404 | - for(int i = 0; i < nodelist.getLength(); i++) { | |
| 405 | - Node childnode = nodelist.item(i); | |
| 406 | - String str = childnode.getFirstChild().getNodeValue(); | |
| 407 | - if(strbuf.length() > 0) { | |
| 408 | - strbuf.append("\n"); | |
| 409 | - } | |
| 410 | - strbuf.append(str); | |
| 411 | - } | |
| 412 | - return strbuf.toString(); | |
| 413 | - } | |
| 414 | - | |
| 415 | - public Element loadElement(String elementTagName) { | |
| 416 | - NodeList nodelist = root.getElementsByTagName(elementTagName); | |
| 417 | - Element element = (Element)nodelist.item(0); | |
| 418 | - | |
| 419 | - return element; | |
| 420 | - } | |
| 421 | - | |
| 422 | - /* ---------------------------------------------------------------------- */ | |
| 423 | - | |
| 424 | - void saveUrl(String urladdress) { | |
| 425 | - checkdoc(); | |
| 426 | - removeElement("url"); // 既にElementが存在してた場合、一度削除 | |
| 427 | - | |
| 428 | - Element url = document.createElement("url"); | |
| 429 | - url.appendChild(document.createTextNode(urladdress)); | |
| 430 | - root.appendChild(url); | |
| 431 | - } | |
| 432 | - | |
| 433 | - void saveSearchList(ArrayList slist) { | |
| 434 | - checkdoc(); | |
| 435 | - removeElement("searchlist"); // 既にElementが存在してた場合、一度削除 | |
| 436 | - | |
| 437 | - int count = 0; | |
| 438 | - for (Object slist1 : slist) { | |
| 439 | - SearchData sdat = (SearchData) slist1; | |
| 440 | - | |
| 441 | - Element cslist = document.createElement("searchlist"); | |
| 442 | - cslist.setAttribute("listNo", String.valueOf(++count)); | |
| 443 | - | |
| 444 | - addChild(cslist, "item", sdat.getitem()); | |
| 445 | - addChild(cslist, "htmltag", sdat.getHtmltag()); | |
| 446 | - addChild(cslist, "htmlid", sdat.getHtmlid()); | |
| 447 | - addChild(cslist, "htmlclass", sdat.getHtmlclass()); | |
| 448 | - addChild(cslist, "around", sdat.getaround()); | |
| 449 | - addChild(cslist, "regexp", sdat.getregexp()); | |
| 450 | - | |
| 451 | - root.appendChild(cslist); | |
| 452 | - } | |
| 453 | - } | |
| 454 | - | |
| 455 | - void saveMsg404(String msg) { | |
| 456 | - checkdoc(); | |
| 457 | - removeElement("msg404"); // 既にElementが存在してた場合、一度削除 | |
| 458 | - | |
| 459 | - String[] msgs = msg.split("\n"); | |
| 460 | - int count = 0; | |
| 461 | - for(String msgOne : msgs) { | |
| 462 | - Element msgElement = document.createElement("msg404"); | |
| 463 | - msgElement.setAttribute("No", String.valueOf(++count)); | |
| 464 | - msgElement.appendChild(document.createTextNode(msgOne)); | |
| 465 | - | |
| 466 | - root.appendChild(msgElement); | |
| 467 | - } | |
| 468 | - } | |
| 469 | - | |
| 470 | - public void saveElement(Element element) { | |
| 471 | - checkdoc(); | |
| 472 | - removeElement(element.getTagName()); // 既にElementが存在してた場合、一度削除 | |
| 473 | - | |
| 474 | - root.appendChild(element); | |
| 475 | - } | |
| 476 | - | |
| 477 | - /* ---------------------------------------------------------------------- */ | |
| 478 | - | |
| 479 | - private void addChild(Element cslist, String keyword, String data) { | |
| 480 | - if(!data.isEmpty()) { | |
| 481 | - Element element = document.createElement(keyword); | |
| 482 | - element.appendChild(document.createTextNode(data)); | |
| 483 | - cslist.appendChild(element); | |
| 484 | - } | |
| 485 | - } | |
| 486 | - | |
| 487 | - private void removeElement(String elementTagName) { | |
| 488 | - int nodeSize; | |
| 489 | - do { | |
| 490 | - NodeList nodelist = document.getElementsByTagName(elementTagName); | |
| 491 | - nodeSize = nodelist.getLength(); | |
| 492 | - for(int i = 0; i < nodelist.getLength(); i++) { | |
| 493 | - Node node = nodelist.item(i); | |
| 494 | - root.removeChild(node); | |
| 495 | - } | |
| 496 | - } while(nodeSize > 0); | |
| 497 | - } | |
| 498 | - | |
| 499 | - /** | |
| 500 | - * ドキュメントチェック. | |
| 501 | - * 新規の場合やXMLファイルの読込みが行われていない状態時、新たにルートエレメントを作成する。 | |
| 502 | - * 既読の場合、ルートエレメントの取得を行う。 | |
| 503 | - */ | |
| 504 | - public void checkdoc() { | |
| 505 | - if(document == null) { | |
| 506 | - DOMImplementation domImpl = builder.getDOMImplementation(); | |
| 507 | - document = domImpl.createDocument("","searchdata",null); | |
| 508 | - } | |
| 509 | - root = document.getDocumentElement(); | |
| 510 | - } | |
| 511 | - | |
| 512 | - /** | |
| 513 | - * XML読込み. | |
| 514 | - * @param file | |
| 515 | - */ | |
| 516 | - public void read(File file) { | |
| 517 | - try { | |
| 518 | - document = builder.parse(file); | |
| 519 | - root = document.getDocumentElement(); | |
| 520 | - | |
| 521 | - } catch (SAXException | IOException ex) { | |
| 522 | - Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
| 523 | - } | |
| 524 | - } | |
| 525 | - | |
| 526 | - /** | |
| 527 | - * XML書込み. | |
| 528 | - * @param file | |
| 529 | - */ | |
| 530 | - public void write(File file) { | |
| 531 | - try { | |
| 532 | - TransformerFactory transFactory = TransformerFactory.newInstance(); | |
| 533 | - Transformer transformer = transFactory.newTransformer(); | |
| 534 | - | |
| 535 | - DOMSource source = new DOMSource(document); | |
| 536 | - FileOutputStream os = new FileOutputStream(file); | |
| 537 | - StreamResult result = new StreamResult(os); | |
| 538 | - transformer.transform(source, result); | |
| 539 | - | |
| 540 | - } catch (TransformerConfigurationException ex) { | |
| 541 | - Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
| 542 | - } catch (FileNotFoundException | TransformerException ex) { | |
| 543 | - Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
| 544 | - } | |
| 545 | - } | |
| 546 | - | |
| 547 | -} |
| @@ -0,0 +1,142 @@ | ||
| 1 | +/* | |
| 2 | + * Copyright (C) 2014-2015 kgto. | |
| 3 | + * | |
| 4 | + * This library is free software; you can redistribute it and/or | |
| 5 | + * modify it under the terms of the GNU Lesser General Public | |
| 6 | + * License as published by the Free Software Foundation; either | |
| 7 | + * version 2.1 of the License, or (at your option) any later version. | |
| 8 | + * | |
| 9 | + * This library is distributed in the hope that it will be useful, | |
| 10 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 11 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 12 | + * Lesser General Public License for more details. | |
| 13 | + * | |
| 14 | + * You should have received a copy of the GNU Lesser General Public | |
| 15 | + * License along with this library; if not, write to the Free Software | |
| 16 | + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
| 17 | + * MA 02110-1301 USA | |
| 18 | + */ | |
| 19 | +/* | |
| 20 | + * $Id$ | |
| 21 | + */ | |
| 22 | + | |
| 23 | +package webScraping.utility; | |
| 24 | + | |
| 25 | +import java.io.File; | |
| 26 | +import java.io.FileNotFoundException; | |
| 27 | +import java.io.FileOutputStream; | |
| 28 | +import java.io.IOException; | |
| 29 | +import java.util.logging.Level; | |
| 30 | +import java.util.logging.Logger; | |
| 31 | + | |
| 32 | +import javax.xml.parsers.DocumentBuilder; | |
| 33 | +import javax.xml.parsers.DocumentBuilderFactory; | |
| 34 | +import javax.xml.parsers.ParserConfigurationException; | |
| 35 | +import javax.xml.transform.Transformer; | |
| 36 | +import javax.xml.transform.TransformerConfigurationException; | |
| 37 | +import javax.xml.transform.TransformerException; | |
| 38 | +import javax.xml.transform.TransformerFactory; | |
| 39 | +import javax.xml.transform.dom.DOMSource; | |
| 40 | +import javax.xml.transform.stream.StreamResult; | |
| 41 | + | |
| 42 | +import org.w3c.dom.DOMImplementation; | |
| 43 | +import org.w3c.dom.Document; | |
| 44 | +import org.w3c.dom.Element; | |
| 45 | +import org.w3c.dom.Node; | |
| 46 | +import org.w3c.dom.NodeList; | |
| 47 | +import org.xml.sax.SAXException; | |
| 48 | + | |
| 49 | +public class LibraryXml { | |
| 50 | + | |
| 51 | + String xmlrootname = "xmlcontainer"; | |
| 52 | + | |
| 53 | + DocumentBuilder builder; | |
| 54 | + public Document readdoc, writedoc; | |
| 55 | + Element xmlroot; | |
| 56 | + | |
| 57 | + /* ---------------------------------------------------------------------- * | |
| 58 | + * コンストラクタ | |
| 59 | + * ---------------------------------------------------------------------- */ | |
| 60 | + public LibraryXml() { | |
| 61 | + try { | |
| 62 | + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); | |
| 63 | + builder = factory.newDocumentBuilder(); | |
| 64 | + | |
| 65 | + } catch (ParserConfigurationException ex) { | |
| 66 | + Logger.getLogger(LibraryXml.class.getName()).log(Level.SEVERE, null, ex); | |
| 67 | + } | |
| 68 | + } | |
| 69 | + | |
| 70 | + /* ---------------------------------------------------------------------- * | |
| 71 | + * メソッド | |
| 72 | + * ---------------------------------------------------------------------- */ | |
| 73 | + /* 書込み処理 */ | |
| 74 | + public Element getwriteRoot(String elementName) { | |
| 75 | + mainElement(); | |
| 76 | + Element element = writedoc.createElement(elementName); | |
| 77 | + xmlroot.appendChild(element); | |
| 78 | + return element; | |
| 79 | + } | |
| 80 | + | |
| 81 | + private void mainElement() { | |
| 82 | + if(writedoc == null) { | |
| 83 | + DOMImplementation domImpl = builder.getDOMImplementation(); | |
| 84 | + writedoc = domImpl.createDocument("", xmlrootname, null); | |
| 85 | + xmlroot = writedoc.getDocumentElement(); | |
| 86 | + } | |
| 87 | + } | |
| 88 | + | |
| 89 | + /** | |
| 90 | + * XML書込み. | |
| 91 | + * @param file | |
| 92 | + */ | |
| 93 | + public void write(File file) { | |
| 94 | + try (FileOutputStream os = new FileOutputStream(file)) { | |
| 95 | + TransformerFactory transFactory = TransformerFactory.newInstance(); | |
| 96 | + Transformer transformer = transFactory.newTransformer(); | |
| 97 | + | |
| 98 | + transformer.setOutputProperty("indent", "yes"); // 改行指定 | |
| 99 | + transformer.setOutputProperty("method", "xml"); | |
| 100 | + | |
| 101 | + DOMSource source = new DOMSource(writedoc); | |
| 102 | + StreamResult result = new StreamResult(os); | |
| 103 | + transformer.transform(source, result); | |
| 104 | + | |
| 105 | + // 作成したXMLをクリア | |
| 106 | + writedoc = null; | |
| 107 | + | |
| 108 | + } catch (TransformerConfigurationException ex) { | |
| 109 | + Logger.getLogger(LibraryXml.class.getName()).log(Level.SEVERE, null, ex); | |
| 110 | + } catch (FileNotFoundException | TransformerException ex) { | |
| 111 | + Logger.getLogger(LibraryXml.class.getName()).log(Level.SEVERE, null, ex); | |
| 112 | + } catch (IOException ex) { | |
| 113 | + Logger.getLogger(LibraryXml.class.getName()).log(Level.SEVERE, null, ex); | |
| 114 | + } | |
| 115 | + } | |
| 116 | + | |
| 117 | + /* ---------------------------------------------------------------------- */ | |
| 118 | + /* 読込み処理 */ | |
| 119 | + | |
| 120 | + public Element getreadRoot(String elementName) { | |
| 121 | + NodeList nodelist = xmlroot.getElementsByTagName(elementName); | |
| 122 | + Node node = nodelist.item(0); | |
| 123 | + return (node.getNodeType() == Node.ELEMENT_NODE ? (Element)node : null); | |
| 124 | + } | |
| 125 | + | |
| 126 | + /** | |
| 127 | + * XML読込み. | |
| 128 | + * @param file | |
| 129 | + */ | |
| 130 | + public void read(File file) { | |
| 131 | + try { | |
| 132 | + readdoc = builder.parse(file); | |
| 133 | + xmlroot = readdoc.getDocumentElement(); | |
| 134 | + | |
| 135 | + } catch (SAXException | IOException ex) { | |
| 136 | + Logger.getLogger(LibraryXml.class.getName()).log(Level.SEVERE, null, ex); | |
| 137 | + } | |
| 138 | + } | |
| 139 | + | |
| 140 | + /* ---------------------------------------------------------------------- */ | |
| 141 | + | |
| 142 | +} |
| @@ -0,0 +1,198 @@ | ||
| 1 | +/* | |
| 2 | + * Copyright (C) 2014-2015 kgto. | |
| 3 | + * | |
| 4 | + * This library is free software; you can redistribute it and/or | |
| 5 | + * modify it under the terms of the GNU Lesser General Public | |
| 6 | + * License as published by the Free Software Foundation; either | |
| 7 | + * version 2.1 of the License, or (at your option) any later version. | |
| 8 | + * | |
| 9 | + * This library is distributed in the hope that it will be useful, | |
| 10 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 11 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 12 | + * Lesser General Public License for more details. | |
| 13 | + * | |
| 14 | + * You should have received a copy of the GNU Lesser General Public | |
| 15 | + * License along with this library; if not, write to the Free Software | |
| 16 | + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
| 17 | + * MA 02110-1301 USA | |
| 18 | + */ | |
| 19 | +/* | |
| 20 | + * $Id$ | |
| 21 | + */ | |
| 22 | + | |
| 23 | +package webScraping.utility; | |
| 24 | + | |
| 25 | +import webScraping.core.SearchData; | |
| 26 | +import java.io.File; | |
| 27 | +import java.util.ArrayList; | |
| 28 | +import org.w3c.dom.Element; | |
| 29 | +import org.w3c.dom.Node; | |
| 30 | +import org.w3c.dom.NodeList; | |
| 31 | + | |
| 32 | +public class ScrapingXml { | |
| 33 | + /* ---------------------------------------------------------------------- * | |
| 34 | + * フィールド | |
| 35 | + * ---------------------------------------------------------------------- */ | |
| 36 | + String rootnameScraping = "webscraping"; | |
| 37 | + | |
| 38 | + private String testUrl; | |
| 39 | + private SearchData[] sdata; | |
| 40 | + | |
| 41 | + public LibraryXml xlib = new LibraryXml(); | |
| 42 | + public Element root; | |
| 43 | + | |
| 44 | + /* ---------------------------------------------------------------------- * | |
| 45 | + * コンストラクタ | |
| 46 | + * ---------------------------------------------------------------------- */ | |
| 47 | + public ScrapingXml() { | |
| 48 | + } | |
| 49 | + | |
| 50 | + /* ---------------------------------------------------------------------- * | |
| 51 | + * Setter | |
| 52 | + * ---------------------------------------------------------------------- */ | |
| 53 | + public void setTestUrl(String testUrl) { | |
| 54 | + this.testUrl = testUrl; | |
| 55 | + } | |
| 56 | + | |
| 57 | + public void setSdata() { | |
| 58 | + this.sdata = new SearchData[SearchData.size()]; | |
| 59 | + for(int i = 0; i < SearchData.size(); i++) { | |
| 60 | + this.sdata[i] = SearchData.get(i); | |
| 61 | + } | |
| 62 | + } | |
| 63 | + | |
| 64 | + /* ---------------------------------------------------------------------- * | |
| 65 | + * Getter | |
| 66 | + * ---------------------------------------------------------------------- */ | |
| 67 | + public String getTestUrl() { | |
| 68 | + return testUrl; | |
| 69 | + } | |
| 70 | + | |
| 71 | + public void getSdata() { | |
| 72 | + SearchData.clear(); | |
| 73 | + for(SearchData sdata1 : sdata) { | |
| 74 | + SearchData.add(sdata1); | |
| 75 | + } | |
| 76 | + } | |
| 77 | + | |
| 78 | + /* ---------------------------------------------------------------------- * | |
| 79 | + * メソッド | |
| 80 | + * ---------------------------------------------------------------------- */ | |
| 81 | + public void save(File file) { | |
| 82 | + | |
| 83 | + elementset(); | |
| 84 | + | |
| 85 | + xlib.write(file); | |
| 86 | + } | |
| 87 | + | |
| 88 | + public void elementset() { | |
| 89 | + root = xlib.getwriteRoot(rootnameScraping); | |
| 90 | + elementsetUrl(); | |
| 91 | + elementsetSearchdata(); | |
| 92 | + System.out.println("elementset XmlScraping"); | |
| 93 | + } | |
| 94 | + | |
| 95 | + private void elementsetUrl() { | |
| 96 | + Element url = xlib.writedoc.createElement("url"); | |
| 97 | + url.appendChild(xlib.writedoc.createTextNode(testUrl)); | |
| 98 | + root.appendChild(url); | |
| 99 | + } | |
| 100 | + | |
| 101 | + private void elementsetSearchdata() { | |
| 102 | + int count = 0; | |
| 103 | + for(SearchData sdat : sdata) { | |
| 104 | + Element cslist = xlib.writedoc.createElement("searchlist"); | |
| 105 | + cslist.setAttribute("listNo", String.valueOf(++count)); | |
| 106 | + | |
| 107 | + addChild(cslist, "item" , sdat.getitem()); | |
| 108 | + addChild(cslist, "htmltag" , sdat.getHtmltag()); | |
| 109 | + addChild(cslist, "htmlid" , sdat.getHtmlid()); | |
| 110 | + addChild(cslist, "htmlclass", sdat.getHtmlclass()); | |
| 111 | + addChild(cslist, "around" , sdat.getaround()); | |
| 112 | + addChild(cslist, "regexp" , sdat.getregexp()); | |
| 113 | + | |
| 114 | + root.appendChild(cslist); | |
| 115 | + } | |
| 116 | + } | |
| 117 | + | |
| 118 | + private void addChild(Element cslist, String keyword, String data) { | |
| 119 | + if(!data.isEmpty()) { | |
| 120 | + Element element = xlib.writedoc.createElement(keyword); | |
| 121 | + element.appendChild(xlib.writedoc.createTextNode(data)); | |
| 122 | + cslist.appendChild(element); | |
| 123 | + } | |
| 124 | + } | |
| 125 | + | |
| 126 | + /* ---------------------------------------------------------------------- */ | |
| 127 | + | |
| 128 | + void load(File file) { | |
| 129 | + xlib.read(file); | |
| 130 | + elementget(); | |
| 131 | + } | |
| 132 | + | |
| 133 | + public void elementget() { | |
| 134 | + root = xlib.getreadRoot(rootnameScraping); | |
| 135 | + elementgetUrl(); | |
| 136 | + elementgetSearchdata(); | |
| 137 | + } | |
| 138 | + | |
| 139 | + private void elementgetUrl() { | |
| 140 | + NodeList nodelist = root.getElementsByTagName("url"); | |
| 141 | + Node node = nodelist.item(0); | |
| 142 | + testUrl = node.getFirstChild().getNodeValue(); | |
| 143 | + } | |
| 144 | + | |
| 145 | + private void elementgetSearchdata() { | |
| 146 | + ArrayList<SearchData> slist = new ArrayList<>(); | |
| 147 | + | |
| 148 | + NodeList nodelist = root.getElementsByTagName("searchlist"); | |
| 149 | + for(int i = 0; i < nodelist.getLength(); i++) { | |
| 150 | + Node childnode = nodelist.item(i); | |
| 151 | + | |
| 152 | + boolean sdatflg = false; | |
| 153 | + SearchData sdat = new SearchData(); | |
| 154 | + for (Node child = childnode.getFirstChild(); child != null; child = child.getNextSibling()) { | |
| 155 | + if(child.getNodeType() == Node.ELEMENT_NODE) { | |
| 156 | + String tag = child.getNodeName(); | |
| 157 | + String rtn = ""; | |
| 158 | + if(child.getFirstChild() != null) { | |
| 159 | + rtn = child.getFirstChild().getNodeValue(); | |
| 160 | + } | |
| 161 | + switch (tag) { | |
| 162 | + case "item" : | |
| 163 | + sdat.setitem(rtn); | |
| 164 | + sdatflg = true; | |
| 165 | + break; | |
| 166 | + case "htmltag" : | |
| 167 | + sdat.setHtmltag(rtn); | |
| 168 | + sdatflg = true; | |
| 169 | + break; | |
| 170 | + case "htmlid" : | |
| 171 | + sdat.setHtmlid(rtn); | |
| 172 | + sdatflg = true; | |
| 173 | + break; | |
| 174 | + case "htmlclass" : | |
| 175 | + sdat.setHtmlclass(rtn); | |
| 176 | + sdatflg = true; | |
| 177 | + break; | |
| 178 | + case "around" : | |
| 179 | + sdat.setaround(rtn); | |
| 180 | + sdatflg = true; | |
| 181 | + break; | |
| 182 | + case "regexp" : | |
| 183 | + sdat.setregexp(rtn); | |
| 184 | + sdatflg = true; | |
| 185 | + break; | |
| 186 | + } | |
| 187 | + } | |
| 188 | + } | |
| 189 | + if(sdatflg) slist.add(sdat); | |
| 190 | + } | |
| 191 | + // 配列化 | |
| 192 | + sdata = new SearchData[slist.size()]; | |
| 193 | + for(int i = 0; i < slist.size(); i++) { | |
| 194 | + sdata[i] = slist.get(i); | |
| 195 | + } | |
| 196 | + } | |
| 197 | + | |
| 198 | +} |
| @@ -40,7 +40,7 @@ | ||
| 40 | 40 | * @author kgto |
| 41 | 41 | */ |
| 42 | 42 | public class HtmlSearch extends javax.swing.JFrame { |
| 43 | - private final SearchDataRW sio = new SearchDataRW(); | |
| 43 | + private final ScrapingXml xmlwriter = new ScrapingXml(); | |
| 44 | 44 | |
| 45 | 45 | SearchDataTableModel sdatatblmodel; |
| 46 | 46 |
| @@ -332,8 +332,9 @@ | ||
| 332 | 332 | int selected = jFileChooser1.showOpenDialog(this); |
| 333 | 333 | if (selected == JFileChooser.APPROVE_OPTION) { |
| 334 | 334 | File file = jFileChooser1.getSelectedFile(); |
| 335 | - sio.load(file); | |
| 336 | - jTxtUrl.setText(sio.geturl()); | |
| 335 | + xmlwriter.load(file); | |
| 336 | + jTxtUrl.setText(xmlwriter.getTestUrl()); | |
| 337 | + xmlwriter.getSdata(); | |
| 337 | 338 | sdatatblmodel.setRowCount(0); |
| 338 | 339 | for(int i = 0; i < SearchData.size(); i++) { |
| 339 | 340 | SearchData sdata = SearchData.get(i); |
| @@ -347,7 +348,7 @@ | ||
| 347 | 348 | int selected = jFileChooser1.showSaveDialog(this); |
| 348 | 349 | if (selected == JFileChooser.APPROVE_OPTION) { |
| 349 | 350 | File file = jFileChooser1.getSelectedFile(); |
| 350 | - sio.seturl(jTxtUrl.getText()); | |
| 351 | + xmlwriter.setTestUrl(jTxtUrl.getText()); | |
| 351 | 352 | |
| 352 | 353 | SearchData.clear(); |
| 353 | 354 | for(int row = 0; row < sdatatblmodel.getRowCount(); row++) { |
| @@ -354,7 +355,8 @@ | ||
| 354 | 355 | SearchData sdata = sdatatblmodel.getSearchData(row); |
| 355 | 356 | SearchData.add(sdata); |
| 356 | 357 | } |
| 357 | - sio.save(file); | |
| 358 | + xmlwriter.setSdata(); | |
| 359 | + xmlwriter.save(file); | |
| 358 | 360 | } |
| 359 | 361 | }//GEN-LAST:event_jMenuSaveActionPerformed |
| 360 | 362 |
| @@ -76,7 +76,8 @@ | ||
| 76 | 76 | for (Object AttrList1 : AttrList) { |
| 77 | 77 | AttrData a = (AttrData)AttrList1; |
| 78 | 78 | if(a.tag == tag) { |
| 79 | - if(a.attrname.equals(attrname) && a.attrvalue.equals(attrvalue)) { | |
| 79 | + //if(a.attrname.equals(attrname) && a.attrvalue.equals(attrvalue)) { | |
| 80 | + if(a.attrname.equals(attrname) && a.attrvalue.startsWith(attrvalue)) { | |
| 80 | 81 | ret = true; |
| 81 | 82 | } |
| 82 | 83 | } |
| @@ -33,7 +33,9 @@ | ||
| 33 | 33 | * @author kgto |
| 34 | 34 | */ |
| 35 | 35 | class HtmlParserCallback extends HTMLEditorKit.ParserCallback { |
| 36 | - | |
| 36 | + /* ---------------------------------------------------------------------- * | |
| 37 | + * フィールド | |
| 38 | + * ---------------------------------------------------------------------- */ | |
| 37 | 39 | // Tag毎の階層 |
| 38 | 40 | HashMap<HTML.Tag,Integer> tagMap = new HashMap<>(); |
| 39 | 41 |
| @@ -54,6 +56,9 @@ | ||
| 54 | 56 | // 属性データ |
| 55 | 57 | AttributeData attrdata; |
| 56 | 58 | |
| 59 | + /* ---------------------------------------------------------------------- * | |
| 60 | + * コンストラクタ | |
| 61 | + * ---------------------------------------------------------------------- */ | |
| 57 | 62 | protected HtmlParserCallback(SearchData skey) { |
| 58 | 63 | |
| 59 | 64 | // キー情報展開 |
| @@ -64,10 +69,16 @@ | ||
| 64 | 69 | sData = new ArrayList(); |
| 65 | 70 | } |
| 66 | 71 | |
| 72 | + /* ---------------------------------------------------------------------- * | |
| 73 | + * Getter | |
| 74 | + * ---------------------------------------------------------------------- */ | |
| 67 | 75 | ArrayList getrtnData() { |
| 68 | 76 | return this.sData; |
| 69 | 77 | } |
| 70 | 78 | |
| 79 | + /* ---------------------------------------------------------------------- * | |
| 80 | + * メソッド | |
| 81 | + * ---------------------------------------------------------------------- */ | |
| 71 | 82 | @Override |
| 72 | 83 | public void handleStartTag(HTML.Tag tag, MutableAttributeSet attr, int pos){ |
| 73 | 84 | // Tag毎の階層を保持 |
| @@ -42,6 +42,28 @@ | ||
| 42 | 42 | /* ---------------------------------------------------------------------- * |
| 43 | 43 | * static 処理 |
| 44 | 44 | * ---------------------------------------------------------------------- */ |
| 45 | + public static class Context { | |
| 46 | + public Class columnClass; | |
| 47 | + public String columnName; | |
| 48 | + public String columnNameJp; | |
| 49 | + | |
| 50 | + public Context(Class columnClass, String columnName, String columnNameJp) { | |
| 51 | + this.columnClass = columnClass; | |
| 52 | + this.columnName = columnName; | |
| 53 | + this.columnNameJp = columnNameJp; | |
| 54 | + } | |
| 55 | + } | |
| 56 | + | |
| 57 | + public static final Context[] context = { | |
| 58 | + /* 0 */ new Context(String.class , "item" , "項目名"), | |
| 59 | + /* 1 */ new Context(String.class , "htmltag" , "タグ"), | |
| 60 | + /* 2 */ new Context(String.class , "htmlid" , "ID"), | |
| 61 | + /* 3 */ new Context(String.class , "htmlclass" , "クラス"), | |
| 62 | + /* 4 */ new Context(String.class , "around" , "位置"), | |
| 63 | + /* 5 */ new Context(String.class , "regexp" , "抽出条件") | |
| 64 | + }; | |
| 65 | + | |
| 66 | + /* ---------------------------------------------------------------------- */ | |
| 45 | 67 | private static ArrayList<SearchData> slist = new ArrayList<>(); |
| 46 | 68 | |
| 47 | 69 | public static void addSearchData( |
| @@ -162,5 +184,17 @@ | ||
| 162 | 184 | this.around = ""; |
| 163 | 185 | this.regexp = ""; |
| 164 | 186 | } |
| 165 | - | |
| 187 | + | |
| 188 | + public Object[] getObjData() { | |
| 189 | + Object[] obj = { | |
| 190 | + /* 0 */ getitem(), // 項目名 | |
| 191 | + /* 1 */ getHtmltag(), // タグ | |
| 192 | + /* 2 */ getHtmlid(), // ID | |
| 193 | + /* 3 */ getHtmlclass(), // クラス | |
| 194 | + /* 4 */ getaround(), // 位置 | |
| 195 | + /* 5 */ getregexp() // 抽出条件 | |
| 196 | + }; | |
| 197 | + return obj; | |
| 198 | + } | |
| 199 | + | |
| 166 | 200 | } |
| @@ -32,20 +32,25 @@ | ||
| 32 | 32 | import javax.swing.text.html.parser.ParserDelegator; |
| 33 | 33 | |
| 34 | 34 | /** |
| 35 | - * | |
| 35 | + * HTMLパーサ. | |
| 36 | 36 | * @author kgto |
| 37 | 37 | */ |
| 38 | 38 | public class HtmlParser { |
| 39 | - | |
| 39 | + /* ---------------------------------------------------------------------- * | |
| 40 | + * フィールド | |
| 41 | + * ---------------------------------------------------------------------- */ | |
| 40 | 42 | URL url; |
| 41 | 43 | String pageData; |
| 42 | 44 | ArrayList sData; |
| 43 | 45 | |
| 44 | 46 | // 作業ワーク |
| 45 | - String htmltag; | |
| 46 | - String htmlid; | |
| 47 | - String htmlclass; | |
| 47 | + private String htmltag; | |
| 48 | + private String htmlid; | |
| 49 | + private String htmlclass; | |
| 48 | 50 | |
| 51 | + /* ---------------------------------------------------------------------- * | |
| 52 | + * コンストラクタ | |
| 53 | + * ---------------------------------------------------------------------- */ | |
| 49 | 54 | public HtmlParser(URL UrlAdress) { |
| 50 | 55 | DebugProcess.debuglog_set(); |
| 51 | 56 | this.url = UrlAdress; |
| @@ -68,15 +73,24 @@ | ||
| 68 | 73 | url = null; |
| 69 | 74 | } |
| 70 | 75 | |
| 76 | + /* ---------------------------------------------------------------------- * | |
| 77 | + * Getter | |
| 78 | + * ---------------------------------------------------------------------- */ | |
| 71 | 79 | public String getStringPageData() { |
| 72 | 80 | return pageData; |
| 73 | 81 | } |
| 74 | 82 | |
| 83 | + /* ---------------------------------------------------------------------- * | |
| 84 | + * Setter | |
| 85 | + * ---------------------------------------------------------------------- */ | |
| 75 | 86 | public void seturl(URL UrlAdress) { |
| 76 | 87 | this.url = UrlAdress; |
| 77 | 88 | getPageData(); |
| 78 | 89 | } |
| 79 | 90 | |
| 91 | + /* ---------------------------------------------------------------------- * | |
| 92 | + * メソッド | |
| 93 | + * ---------------------------------------------------------------------- */ | |
| 80 | 94 | public void seturl(String UrlAdress) { |
| 81 | 95 | try { |
| 82 | 96 | url = new URL(UrlAdress); |