Date: 2021-05-23
Among the many anti-crawling measures websites deploy, one limits access by IP request frequency: once an IP's request count within a given time window reaches a threshold, that IP is blacklisted and blocked for a period of time.
There are two ways to deal with this:
1. Lower the crawl rate so the IP never hits the limit. The drawback is obvious: crawling becomes much slower (a throttling sketch follows this list).
2. Build an IP proxy pool and rotate through different IPs while crawling.
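As a minimal sketch of option 1, the snippet below throttles requests with a randomized delay between them; the URLs and the 2-5 second delay range are placeholder values chosen for illustration, not part of the original project:

# throttle_demo.py -- illustrative only; URLs and delay bounds are placeholders
import random
import time

import requests

urls = ['http://example.com/page/%d' % i for i in range(1, 6)]  # placeholder URLs

for url in urls:
    response = requests.get(url, timeout=10)
    print(url, response.status_code)
    # Sleep 2-5 seconds between requests to stay under the site's frequency threshold
    time.sleep(random.uniform(2, 5))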
The basic workflow of the proxy pool is as follows (a sketch of this loop appears after the list):
1. Crawl proxy IPs from free-proxy websites (e.g. 西刺代理, 快代理, 云代理, 无忧代理);
2. Verify that each proxy IP works (send a request to a test URL through the proxy and check the response);
3. Save the usable proxy IPs to a database;
Commonly used proxy websites: 西刺代理, 云代理, IP海, 无忧代理, 飞蚁代理, 快代理.
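The sketch below outlines that crawl / verify / store loop. It is illustrative only: fetch_free_proxies() and save_proxy() are hypothetical stand-ins (each proxy site needs its own HTML parser, and the article stores proxies in Redis), while check_proxy() mirrors the _is_proxy_available() helper defined later in proxy_util.py.

# pool_workflow_sketch.py -- illustrative outline; fetch_free_proxies and
# save_proxy are hypothetical stand-ins for the project's real modules.
import requests


def fetch_free_proxies():
    """Placeholder: crawl a free-proxy listing page and return (schema, ip, port) tuples."""
    # A real implementation would request a proxy site and parse its HTML table.
    return [('http', '127.0.0.1', '8080')]  # dummy data


def check_proxy(schema, ip, port, test_url='http://icanhazip.com'):
    """Return True if a request sent through the proxy succeeds."""
    proxies = {schema: '%s://%s:%s' % (schema, ip, port)}
    try:
        return requests.get(test_url, proxies=proxies, timeout=5).status_code == 200
    except requests.RequestException:
        return False


def save_proxy(schema, ip, port):
    """Placeholder: persist the proxy (the project saves proxies to Redis)."""
    print('saving', schema, ip, port)


if __name__ == '__main__':
    for schema, ip, port in fetch_free_proxies():
        if check_proxy(schema, ip, port):
            save_proxy(schema, ip, port)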
The project structure is as follows:
ipproxy.py
The IPProxy class defines the fields of a crawled proxy and some basic methods on it.
# -*- coding: utf-8 -*-
import re
import time

from settings import PROXY_URL_FORMATTER

schema_pattern = re.compile(r'^(http|https)$', re.I)
ip_pattern = re.compile(r'^([0-9]{1,3}\.){3}[0-9]{1,3}$', re.I)
port_pattern = re.compile(r'^[0-9]{2,5}$', re.I)


class IPProxy:
    '''
    {
        "schema": "http",             # proxy scheme
        "ip": "127.0.0.1",            # proxy IP address
        "port": "8050",               # proxy port
        "used_total": 11,             # how many times the proxy has been used
        "success_times": 5,           # how many requests through the proxy succeeded
        "continuous_failed": 3,       # current number of consecutive failed requests
        "created_time": "2018-05-02"  # date the proxy was crawled
    }
    '''

    def __init__(self, schema, ip, port, used_total=0, success_times=0,
                 continuous_failed=0, created_time=None):
        """Initialize the proxy instance"""
        if schema == "" or schema is None:
            schema = "http"
        self.schema = schema.lower()
        self.ip = ip
        self.port = port
        self.used_total = used_total
        self.success_times = success_times
        self.continuous_failed = continuous_failed
        if created_time is None:
            created_time = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        self.created_time = created_time

    def _get_url(self):
        '''Return the proxy url'''
        return PROXY_URL_FORMATTER % {'schema': self.schema, 'ip': self.ip, 'port': self.port}

    def _check_format(self):
        '''Return True if the proxy fields are well-formed, otherwise return False'''
        if self.schema is not None and self.ip is not None and self.port is not None:
            if schema_pattern.match(self.schema) and ip_pattern.match(self.ip) and port_pattern.match(self.port):
                return True
        return False

    def _is_https(self):
        '''Return True if the proxy is https, otherwise return False'''
        return self.schema == 'https'

    def _update(self, successed=False):
        '''Update the proxy's statistics based on the result of a request'''
        self.used_total = self.used_total + 1
        if successed:
            self.continuous_failed = 0
            self.success_times = self.success_times + 1
        else:
            print(self.continuous_failed)
            self.continuous_failed = self.continuous_failed + 1


if __name__ == '__main__':
    proxy = IPProxy('HTTPS', '192.168.2.25', "8080")
    print(proxy._get_url())
    print(proxy._check_format())
    print(proxy._is_https())

settings.py
settings.py gathers the configuration information the project needs.
# Redis host and port
REDIS_HOST = 'localhost'
REDIS_PORT = 6379

# Format string for the Redis keys under which proxies are saved
PROXIES_REDIS_FORMATTER = 'proxies::{}'
# Set of HTTP and HTTPS proxies that already exist
PROXIES_REDIS_EXISTED = 'proxies::existed'

# Maximum number of consecutive failures allowed
MAX_CONTINUOUS_TIMES = 3

# Format string for building a proxy URL
PROXY_URL_FORMATTER = '%(schema)s://%(ip)s:%(port)s'

USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]

# Verify a crawled proxy before saving it; defaults to True
PROXY_CHECK_BEFOREADD = True

# URLs used to verify proxy availability; multiple URLs per scheme are supported
PROXY_CHECK_URLS = {'https': ['https://icanhazip.com'], 'http': ['http://icanhazip.com']}
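The article does not reproduce the project's database layer, so the following is only a rough sketch, assuming a plain redis-py client, of how a validated proxy might be saved under the keys configured above; the save_proxy() function, the Redis list plus 'existed' set layout, and the JSON serialization are assumptions for illustration, not the project's actual code:

# redis_store_sketch.py -- an assumed persistence layer, not the project's real db module
import json

import redis

from ipproxy import IPProxy
from settings import REDIS_HOST, REDIS_PORT, PROXIES_REDIS_FORMATTER, PROXIES_REDIS_EXISTED

r = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True)


def save_proxy(proxy):
    """Store the proxy as JSON in a per-scheme list and remember it in the 'existed' set."""
    key = PROXIES_REDIS_FORMATTER.format(proxy.schema)   # e.g. 'proxies::https'
    if r.sadd(PROXIES_REDIS_EXISTED, proxy._get_url()):  # sadd returns 0 if already present
        r.lpush(key, json.dumps(proxy.__dict__))


if __name__ == '__main__':
    save_proxy(IPProxy('https', '163.125.255.154', '9797'))

In the real project, proxy_to_dict() from proxy_util.py (shown next) would serialize the proxy instead of reading proxy.__dict__ directly.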
proxy_util.py

proxy_util.py defines a handful of utility functions: proxy_to_dict(proxy) converts an IPProxy instance into a dict; proxy_from_dict(d) converts a dict back into an IPProxy instance; request_page() sends an HTTP request; _is_proxy_available() checks whether a proxy IP actually works.
# -*- coding: utf-8 -*-
import random
import logging

import requests

from ipproxy import IPProxy
from settings import USER_AGENT_LIST, PROXY_CHECK_URLS

# Set the logger output format
logging.basicConfig(level=logging.INFO,
                    format='[%(asctime)-15s] [%(levelname)8s] [%(name)10s ] - %(message)s (%(filename)s:%(lineno)s)',
                    datefmt='%Y-%m-%d %T'
                    )
logger = logging.getLogger(__name__)


def proxy_to_dict(proxy):
    d = {
        "schema": proxy.schema,
        "ip": proxy.ip,
        "port": proxy.port,
        "used_total": proxy.used_total,
        "success_times": proxy.success_times,
        "continuous_failed": proxy.continuous_failed,
        "created_time": proxy.created_time
    }
    return d


def proxy_from_dict(d):
    return IPProxy(schema=d['schema'], ip=d['ip'], port=d['port'], used_total=d['used_total'],
                   success_times=d['success_times'], continuous_failed=d['continuous_failed'],
                   created_time=d['created_time'])


# Strip leading and trailing whitespace
def strip(data):
    if data is not None:
        return data.strip()
    return data


base_headers = {
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7'
}


def request_page(url, options={}, encoding='utf-8'):
    """Send a request and return the decoded response body"""
    headers = dict(base_headers, **options)
    if 'User-Agent' not in headers.keys():
        headers['User-Agent'] = random.choice(USER_AGENT_LIST)
    logger.info('Fetching: ' + url)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            logger.info('Fetched successfully: ' + url)
            return response.content.decode(encoding=encoding)
    except requests.exceptions.ConnectionError:
        logger.error('Failed to fetch: ' + url)
        return None


def _is_proxy_available(proxy, options={}):
    """Check whether the proxy is available or not"""
    headers = dict(base_headers, **options)
    if 'User-Agent' not in headers.keys():
        headers['User-Agent'] = random.choice(USER_AGENT_LIST)
    proxies = {proxy.schema: proxy._get_url()}
    check_urls = PROXY_CHECK_URLS[proxy.schema]
    for url in check_urls:
        try:
            response = requests.get(url=url, proxies=proxies, headers=headers, timeout=5)
        except BaseException:
            logger.info("< " + url + " > checked proxy < " + proxy._get_url() + " > result: unavailable")
        else:
            if response.status_code == 200:
                logger.info("< " + url + " > checked proxy < " + proxy._get_url() + " > result: available")
                return True
            else:
                logger.info("< " + url + " > checked proxy < " + proxy._get_url() + " > result: unavailable")
    return False


if __name__ == '__main__':
    headers = dict(base_headers)
    if 'User-Agent' not in headers.keys():
        headers['User-Agent'] = random.choice(USER_AGENT_LIST)
    proxies = {"https": "https://163.125.255.154:9797"}
    # The rest of this demo (a test request sent through the proxy above) is truncated in the source.

The complete source code is in the pengjunlee/ipproxy_pool.git repository. This concludes this article on how to build a proxy pool for Python crawlers; for more on the topic, please search our earlier articles, and we hope you will continue to support us.