java代理实现爬取代理IP的示例

时间:2021-05-02

仅仅使用了一个java文件,运行main方法即可,需要依赖的jar包是com.alibaba.fastjson(版本1.2.28)和jsoup(版本1.10.2)

如果使用 Maven,在 pom.xml 中加入以下两个依赖:

<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.28</version>
</dependency>
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>

完整的代码如下:

? 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 package com.tuniu.fcm.facade.ipproxy; import com.alibaba.fastjson.jsonobject; import org.jsoup.jsoup; import org.jsoup.nodes.document; import java.util.arraylist; import java.util.hashmap; import java.util.list; import java.util.map; import java.util.regex.matcher; import java.util.regex.pattern; /** * 获取代理ip,需要 * com.alibaba.fastjson.jsonobject以及jsoup */ public class proxycralwerunusedvpn { threadlocal<integer> localwantednumber = new threadlocal<integer>(); threadlocal<list<proxyinfo>> localproxyinfos = new threadlocal<list<proxyinfo>>(); public static void main(string[] args) { proxycralwerunusedvpn proxycrawler = new proxycralwerunusedvpn(); /** * 想要获取的代理ip个数,由需求方自行指定。(如果个数太多,将导致返回变慢) */ proxycrawler.startcrawler(1); } /** * 暴露给外部模块调用的入口 * @param wantednumber 调用方期望获取到的代理ip个数 */ public string startcrawler(int wantednumber) { localwantednumber.set(wantednumber); kuaidailicom("http://www.xicidaili.com/nn/", 15); kuaidailicom("http://www.xicidaili.com/nt/", 15); kuaidailicom("http://www.xicidaili.com/wt/", 15); kuaidailicom("http://www.kuaidaili.com/free/inha/", 15); kuaidailicom("http://www.kuaidaili.com/free/intr/", 15); kuaidailicom("http://www.kuaidaili.com/free/outtr/", 15); /** * 构造返回数据 */ proxyresponse response = new proxyresponse(); response.setsuccess("true"); map<string, object> datainfomap = new hashmap<string, object>(); 
datainfomap.put("numfound", localproxyinfos.get().size()); datainfomap.put("pagenum", 1); datainfomap.put("proxy", localproxyinfos.get()); response.setdata(datainfomap); string responsestring = jsonobject.tojson(response).tostring(); system.out.println(responsestring); return responsestring; } private void kuaidailicom(string baseurl, int totalpage) { string ipreg = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3} \\d{1,6}"; pattern ipptn = pattern.compile(ipreg); for (int i = 1; i < totalpage; i++) { if (getcurrentproxynumber() >= localwantednumber.get()) { return; } try { document doc = jsoup.connect(baseurl + i + "/") .header("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8") .header("accept-encoding", "gzip, deflate, sdch") .header("accept-language", "zh-cn,zh;q=0.8,en;q=0.6") .header("cache-control", "max-age=0") .header("user-agent", "mozilla/5.0 (macintosh; intel mac os x 10_11_4) applewebkit/537.36 (khtml, like gecko) chrome/51.0.2704.103 safari/537.36") .header("cookie", "hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1462812244; _gat=1; _ga=ga1.2.1061361785.1462812244") .header("host", "www.kuaidaili.com") .header("referer", "http://www.kuaidaili.com/free/outha/") .timeout(30 * 1000) .get(); matcher m = ipptn.matcher(doc.text()); while (m.find()) { if (getcurrentproxynumber() >= localwantednumber.get()) { break; } string[] strs = m.group().split(" "); if (checkproxy(strs[0], integer.parseint(strs[1]))) { system.out.println("获取到可用代理ip\t" + strs[0] + "\t" + strs[1]); addproxy(strs[0], strs[1], "http"); } } } catch (exception e) { e.printstacktrace(); } } } private static boolean checkproxy(string ip, integer port) { try { //http://1212.ip138.com/ic.asp 可以换成任何比较快的网页 jsoup.connect("http://1212.ip138.com/ic.asp") .timeout(2 * 1000) .proxy(ip, port) .get(); return true; } catch (exception e) { return false; } } private int getcurrentproxynumber() { list<proxyinfo> proxyinfos = localproxyinfos.get(); if (proxyinfos == null) { proxyinfos = 
new arraylist<proxyinfo>(); localproxyinfos.set(proxyinfos); return 0; } else { return proxyinfos.size(); } } private void addproxy(string ip, string port, string protocol){ list<proxyinfo> proxyinfos = localproxyinfos.get(); if (proxyinfos == null) { proxyinfos = new arraylist<proxyinfo>(); proxyinfos.add(new proxyinfo(ip, port, protocol)); } else { proxyinfos.add(new proxyinfo(ip, port, protocol)); } } } class proxyinfo { private string username = ""; private string ip; private string password = ""; private string type; private string port; private int is_internet = 1; public proxyinfo(string ip, string port, string type) { this.ip = ip; this.type = type; this.port = port; } public string getusername() { return username; } public void setusername(string username) { this.username = username; } public string getip() { return ip; } public void setip(string ip) { this.ip = ip; } public string getpassword() { return password; } public void setpassword(string password) { this.password = password; } public string gettype() { return type; } public void settype(string type) { this.type = type; } public string getport() { return port; } public void setport(string port) { this.port = port; } public int getis_internet() { return is_internet; } public void setis_internet(int is_internet) { this.is_internet = is_internet; } } class proxyresponse { private string success; private map<string, object> data; public string getsuccess() { return success; } public void setsuccess(string success) { this.success = success; } public map<string, object> getdata() { return data; } public void setdata(map<string, object> data) { this.data = data; } }

以上这篇java代理实现爬取代理ip的示例就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持服务器之家。

原文链接:https://blog.csdn.net/sdfiiiiii/article/details/70432060

声明:本页内容来源网络,仅供用户参考;我单位不保证亦不表示资料全面及准确无误,也不保证亦不表示这些资料为最新信息,如因任何原因,本网内容或者用户因倚赖本网内容造成任何损失或损害,我单位将不会负任何法律责任。如涉及版权问题,请提交至online#300.cn邮箱联系删除。

相关文章