1. 出现问题:
  • 爬虫经常遇到当前访问次数过多,导致出现验证码的情况。
  • 被识别出为爬虫。
2. 解决方案:
  • 可以使用不同的浏览器信息(user_agent)
  • 不同的跳转链接信息(referer)
  • 更换不同的代理 IP (proxies)
3. 代码:
import requests
import random

# Page that is requested through the proxy.
url = "https://www.baidu.com"

# Candidate proxy endpoints, each written as "host:port".
ip_list = [
    "118.70.144.77:3128",
    "113.200.105.45:8080",
    "116.207.131.19:8080",
]

def change_ip_info():
    """Request ``url`` once with a randomized identity and report the outcome.

    A User-Agent, a Referer and a proxy address (taken from the module-level
    ``ip_list``) are each picked at random, then a single GET request for the
    module-level ``url`` is sent through that proxy with a 3 second connect
    timeout and a 7 second read timeout.

    Prints "成功访问!" when a response is received (whatever its status code)
    and "访问失败!" when ``requests`` raises a request error such as a
    connection failure, proxy failure or timeout.

    Returns:
        None
    """
    # Identification strings for a variety of operating systems and browsers.
    user_agent_list = [
        'Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0)',
        'Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.0;Trident/4.0)',
        'Mozilla/4.0(compatible;MSIE7.0;WindowsNT6.0)',
        'Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11',
        'Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
    ]
    # Pages the request claims to have been referred from.
    referer_list = [
        'https://www.sogou.com/',
        'http://blog.csdn.net/',
        'https://www.baidu.com/',
    ]
    # Randomize the headers a server could use to fingerprint the client.
    header = {
        'User-Agent': random.choice(user_agent_list),
        'Referer': random.choice(referer_list),
    }

    ip = random.choice(ip_list)
    # The scheme in a proxy URL selects how the client talks to the proxy
    # itself, not to the target site. An 'https://' proxy URL would make the
    # client open TLS to the proxy, which a plain HTTP proxy cannot answer,
    # so both entries use 'http://'.
    # NOTE(review): assumes the ip_list entries are plain HTTP proxies - confirm.
    proxy_url = 'http://' + ip
    proxy = {'https': proxy_url, 'http': proxy_url}

    try:
        requests.get(url, headers=header, proxies=proxy, timeout=(3, 7))
    except requests.RequestException:
        # Only request-level failures are reported here; KeyboardInterrupt
        # and programming errors are left to propagate.
        print("访问失败!")
    else:
        print("成功访问!")
# Run a single randomized request when executed as a script.
if __name__ == "__main__":
    change_ip_info()