时间:2021-05-02
仅仅使用了一个java文件,运行main方法即可,需要依赖的jar包是com.alibaba.fastjson(版本1.2.28)和jsoup(版本1.10.2)
如果用了pom,那么就是以下两个:
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.28</version>
</dependency>
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>

完整的代码如下:
package com.tuniu.fcm.facade.ipproxy;

import com.alibaba.fastjson.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawls free proxy-listing pages and collects proxies that pass a
 * connectivity check.
 *
 * <p>Requires com.alibaba.fastjson (JSONObject) and jsoup on the classpath.
 *
 * <p>Per-call state is held in {@link ThreadLocal}s so that one instance can be
 * used from several threads; the state is reset at the start of every
 * {@link #startCrawler(int)} call and removed when the call finishes.
 */
public class ProxyCralwerUnusedVPN {

    /** Matches "a.b.c.d port" as it appears in the flattened page text. */
    private static final Pattern IP_PORT_PATTERN =
            Pattern.compile("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3} \\d{1,6}");

    /** Number of proxies requested by the caller of the current crawl. */
    final ThreadLocal<Integer> localWantedNumber = new ThreadLocal<Integer>();

    /** Proxies verified so far during the current crawl. */
    final ThreadLocal<List<ProxyInfo>> localProxyInfos = new ThreadLocal<List<ProxyInfo>>();

    public static void main(String[] args) {
        ProxyCralwerUnusedVPN proxyCrawler = new ProxyCralwerUnusedVPN();
        // Number of proxies wanted; chosen by the caller. A large number makes
        // the call slow because every candidate is probed over the network.
        proxyCrawler.startCrawler(1);
    }

    /**
     * Entry point for external modules.
     *
     * @param wantedNumber number of working proxies the caller would like to get
     * @return the JSON text of a {@link ProxyResponse} whose data map holds the
     *         keys {@code numfound}, {@code pagenum} and {@code proxy}
     */
    public String startCrawler(int wantedNumber) {
        localWantedNumber.set(wantedNumber);
        // Start from an empty list so results of an earlier call on this thread
        // are not counted or returned again.
        localProxyInfos.set(new ArrayList<ProxyInfo>());
        try {
            kuaidailiCom("http://www.xicidaili.com/nn/", 15);
            kuaidailiCom("http://www.xicidaili.com/nt/", 15);
            kuaidailiCom("http://www.xicidaili.com/wt/", 15);
            kuaidailiCom("http://www.kuaidaili.com/free/inha/", 15);
            kuaidailiCom("http://www.kuaidaili.com/free/intr/", 15);
            kuaidailiCom("http://www.kuaidaili.com/free/outtr/", 15);

            // Build the response payload.
            List<ProxyInfo> found = currentProxies();
            Map<String, Object> dataInfoMap = new HashMap<String, Object>();
            dataInfoMap.put("numfound", found.size());
            dataInfoMap.put("pagenum", 1);
            dataInfoMap.put("proxy", found);

            ProxyResponse response = new ProxyResponse();
            response.setSuccess("true");
            response.setData(dataInfoMap);

            String responseString = JSONObject.toJSON(response).toString();
            System.out.println(responseString);
            return responseString;
        } finally {
            // Do not leave per-call state attached to (possibly pooled) threads.
            localWantedNumber.remove();
            localProxyInfos.remove();
        }
    }

    /**
     * Crawls pages {@code baseUrl + 1 + "/"} through {@code baseUrl + totalPage + "/"}
     * and records every listed proxy that passes {@link #checkProxy(String, Integer)}.
     * Stops as soon as the wanted number of proxies has been collected. Despite
     * its name this method is used for every listing site.
     *
     * @param baseUrl   listing URL prefix, ending with a slash
     * @param totalPage number of pages to visit (inclusive)
     */
    private void kuaidailiCom(String baseUrl, int totalPage) {
        for (int page = 1; page <= totalPage; page++) {
            if (getCurrentProxyNumber() >= localWantedNumber.get()) {
                return;
            }
            String pageUrl = baseUrl + page + "/";
            try {
                // No explicit "host" header: the HTTP client derives it from the
                // URL, which keeps it correct for every listing site.
                Document doc = Jsoup.connect(pageUrl)
                        .header("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
                        .header("accept-encoding", "gzip, deflate, sdch")
                        .header("accept-language", "zh-cn,zh;q=0.8,en;q=0.6")
                        .header("cache-control", "max-age=0")
                        .header("user-agent", "mozilla/5.0 (macintosh; intel mac os x 10_11_4) applewebkit/537.36 (khtml, like gecko) chrome/51.0.2704.103 safari/537.36")
                        .header("cookie", "hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1462812244; _gat=1; _ga=ga1.2.1061361785.1462812244")
                        .header("referer", "http://www.kuaidaili.com/free/outha/")
                        .timeout(30 * 1000)
                        .get();

                Matcher matcher = IP_PORT_PATTERN.matcher(doc.text());
                while (matcher.find()) {
                    if (getCurrentProxyNumber() >= localWantedNumber.get()) {
                        break;
                    }
                    String[] parts = matcher.group().split(" ");
                    String ip = parts[0];
                    String port = parts[1];
                    if (checkProxy(ip, Integer.parseInt(port))) {
                        System.out.println("获取到可用代理ip\t" + ip + "\t" + port);
                        addProxy(ip, port, "http");
                    }
                }
            } catch (Exception e) {
                // Best effort: a page that fails to load or parse must not abort
                // the crawl, so report it and continue with the next page.
                System.err.println("Failed to crawl " + pageUrl + ": " + e);
            }
        }
    }

    /**
     * Probes a proxy by fetching a test page through it with a 2-second timeout.
     *
     * @param ip   proxy host
     * @param port proxy port
     * @return {@code true} if the request completed, {@code false} on any failure
     */
    private static boolean checkProxy(String ip, Integer port) {
        try {
            // http://1212.ip138.com/ic.asp can be replaced by any fast page.
            Jsoup.connect("http://1212.ip138.com/ic.asp")
                    .timeout(2 * 1000)
                    .proxy(ip, port)
                    .get();
            return true;
        } catch (Exception e) {
            return false;
        }
    }

    /** Returns how many proxies the current thread has collected so far. */
    private int getCurrentProxyNumber() {
        return currentProxies().size();
    }

    /** Appends a verified proxy to the current thread's result list. */
    private void addProxy(String ip, String port, String protocol) {
        currentProxies().add(new ProxyInfo(ip, port, protocol));
    }

    /**
     * Returns the current thread's result list, creating and registering it on
     * first use so that additions are never made to an unregistered list.
     */
    private List<ProxyInfo> currentProxies() {
        List<ProxyInfo> proxyInfos = localProxyInfos.get();
        if (proxyInfos == null) {
            proxyInfos = new ArrayList<ProxyInfo>();
            localProxyInfos.set(proxyInfos);
        }
        return proxyInfos;
    }
}

/** One proxy entry of the response; exposed through getters for JSON conversion. */
class ProxyInfo {
    private String username = "";
    private String ip;
    private String password = "";
    private String type;
    private String port;
    private int is_internet = 1;

    public ProxyInfo(String ip, String port, String type) {
        this.ip = ip;
        this.type = type;
        this.port = port;
    }

    public String getUsername() {
        return username;
    }

    public void setUsername(String username) {
        this.username = username;
    }

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public String getPassword() {
        return password;
    }

    public void setPassword(String password) {
        this.password = password;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getPort() {
        return port;
    }

    public void setPort(String port) {
        this.port = port;
    }

    public int getIs_internet() {
        return is_internet;
    }

    public void setIs_internet(int is_internet) {
        this.is_internet = is_internet;
    }
}

/** Response envelope: a success flag plus a free-form data map. */
class ProxyResponse {
    private String success;
    private Map<String, Object> data;

    public String getSuccess() {
        return success;
    }

    public void setSuccess(String success) {
        this.success = success;
    }

    public Map<String, Object> getData() {
        return data;
    }

    public void setData(Map<String, Object> data) {
        this.data = data;
    }
}

// Article closing note: the above is the complete example of crawling proxy IPs
// with Java; the author hopes it serves as a useful reference.
原文链接:https://blog.csdn.net/sdfiiiiii/article/details/70432060
声明:本页内容来源网络,仅供用户参考;我单位不保证亦不表示资料全面及准确无误,也不保证亦不表示这些资料为最新信息,如因任何原因,本网内容或者用户因倚赖本网内容造成任何损失或损害,我单位将不会负任何法律责任。如涉及版权问题,请提交至online#300.cn邮箱联系删除。
做爬虫的小伙伴肯定经常遇到IP被封的情况,而现在网络上免费的代理IP已经很难找了,那么现在就用Python的requests库来爬取代理IP,创建一个IP代理
使用Python asyncio实现了一个异步代理池,根据规则爬取代理网站上的免费代理,在验证其有效后存入Redis中,定期扩展代理的数量并检验池中代理的有效性,
爬取代理IP及测试是否可用:很多人在爬虫时为了防止被封IP,会去各大网站上查找免费的代理IP,由于不是每个IP地址都是有效的,如果要进去一个一个比对的话效率
本文实例讲述了python实现ip代理池功能。分享给大家供大家参考,具体如下:爬取的代理源为西刺代理。用xpath解析页面用telnet来验证ip是否可用把有效
Java反射机制与动态代理,使得Java更加强大,Spring核心概念IoC、AOP就是通过反射机制与动态代理实现的。1Java反射示例:Useruser=ne