1. 出现问题:
- 爬虫访问次数过多时,经常会出现验证码。
- 爬虫被网站识别出来。
2. 解决方案:
- 使用不同的浏览器信息(User-Agent)
- 使用不同的跳转链接信息(Referer)
- 更换不同的代理 IP(proxies)
3. 代码:
import requests
import random
# Page that change_ip_info() requests.
url = "https://www.baidu.com"

# Candidate proxies as "host:port" strings; change_ip_info() picks one at
# random for each call.
ip_list = [
    "118.70.144.77:3128",
    "113.200.105.45:8080",
    "116.207.131.19:8080",
]
def change_ip_info() -> None:
    """Request ``url`` once with a random User-Agent, Referer and proxy.

    A User-Agent and a Referer are drawn at random from the lists defined
    below, and a proxy address is drawn at random from the module-level
    ``ip_list``. The outcome is printed to stdout; nothing is returned.

    Any ``requests`` failure (connection error, proxy error, timeout) and any
    HTTP error status (4xx/5xx) is reported as a failed visit.
    """
    user_agent_list = [
        'Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0)',
        'Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.0;Trident/4.0)',
        'Mozilla/4.0(compatible;MSIE7.0;WindowsNT6.0)',
        'Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11',
        'Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
    ]
    referer_list = [
        'https://www.sogou.com/',
        'http://blog.csdn.net/',
        'https://www.baidu.com/',
    ]
    header = {
        'User-Agent': random.choice(user_agent_list),
        'Referer': random.choice(referer_list),
    }

    ip = random.choice(ip_list)
    # The scheme of a proxy URL says how to talk to the proxy itself, not to
    # the target site. An 'https://' proxy URL makes requests attempt a TLS
    # handshake with the proxy, which fails against a plain-HTTP proxy, so the
    # same 'http://' proxy URL is used for both target schemes.
    # NOTE(review): the entries in ip_list look like plain-HTTP proxies
    # (ports 3128/8080) -- confirm before relying on this.
    proxy_url = 'http://' + ip
    proxy = {'https': proxy_url, 'http': proxy_url}

    try:
        # timeout is (connect, read) in seconds.
        response = requests.get(url, headers=header, proxies=proxy, timeout=(3, 7))
        # A 4xx/5xx answer (e.g. the site blocking the crawler) is not a
        # successful visit; turn it into an HTTPError handled below.
        response.raise_for_status()
    except requests.RequestException:
        # Only request-related failures are handled here, so Ctrl-C and
        # programming errors are no longer silently swallowed.
        print("访问失败!")
    else:
        print("成功访问!")
# Run a single request attempt when the file is executed as a script.
if __name__ == "__main__":
    change_ip_info()