功能:
  • 爬取西刺代理IP
  • 添加了自动检测IP是否可用功能
  • 输出到Data.txt文件中
注意:
  • 爬取西刺时,本机的真实IP有可能被封。建议先少量爬取一些可用的代理IP,填入代码中的 ip_use 列表,再进行大量爬取。
测试:

1.测试输出

2.文件输出

代码:
import requests
import traceback
import re
import random
import time

# Proxy addresses collected during the run ('host:port' strings).
ip_list = []
def main():
    """Entry point: crawl listing pages 1 through 10 of the proxy site.

    For each page a banner with the page number is printed and the page
    URL is handed to ``get_ip``.
    """
    base_url = 'https://www.xicidaili.com/nn/'
    for page in range(1, 11):
        page_no = str(page)
        print("-----" + page_no + "-----")
        get_ip(base_url + page_no)

# User-Agent strings identifying various operating systems and browsers;
# one is picked at random for each outgoing request.
user_agent_list = [
    'Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0)',
    'Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.0;Trident/4.0)',
    'Mozilla/4.0(compatible;MSIE7.0;WindowsNT6.0)',
    'Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11',
    'Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
]
# Candidate Referer header values (the page a request claims to come from);
# one is picked at random for each outgoing request.
referer_list = [
    'https://www.sogou.com/',
    'http://blog.csdn.net/',
    'https://www.baidu.com/',
]
# Seed proxies ('host:port') handed to get_html when fetching listing pages.
ip_use = [
    '47.102.216.176:3128',
    '222.95.240.79:3000',
    '117.88.176.176:3000',
    '117.88.5.65:3000',
]
# Fetch the HTML text of a page through a randomly chosen proxy
def get_html(url, ip):
    """Download ``url`` through a random proxy and return the response body.

    Args:
        url: Address of the page to download.
        ip: Sequence of ``'host:port'`` proxy addresses; one entry is
            picked at random for this request.

    Returns:
        The response body as text, or ``None`` when the request fails for
        any reason (an error line is printed in that case).
    """
    try:
        header = {
            'User-Agent': random.choice(user_agent_list),
            'Referer': random.choice(referer_list),
        }
        # Keep the chosen address in its own name instead of rebinding the
        # ``ip`` parameter from a sequence to a string.
        proxy_addr = random.choice(ip)
        proxy = {
            'https': 'https://' + proxy_addr,
            'http': 'http://' + proxy_addr,
        }
        # ``proxies=`` is what actually routes the request through the
        # proxy; without it the request leaves from the local machine's IP.
        # timeout is (connect seconds, read seconds).
        response = requests.get(url, headers=header, proxies=proxy, timeout=(3, 7))
        return response.text
    except Exception:
        # Best-effort: any failure is reported and signalled with None.
        # ``Exception`` (not a bare except) lets Ctrl-C / SystemExit through.
        print('获取ip错误!')
        #traceback.print_exc()  # uncomment to print the full exception
        return None

# Extract proxy addresses from a listing page with a regular expression
def get_ip(url):
    """Scrape one listing page and record every proxy that passes the check.

    The page is downloaded with ``get_html`` using the ``ip_use`` proxies.
    Each (host, port) pair matched by the regular expression is skipped if
    its port is ``'9999'``; otherwise it is validated with ``text_ip``.
    Addresses that pass are printed, appended to ``ip_list`` and written to
    the output file via ``write_text``.

    Args:
        url: Address of the listing page to scrape.

    Returns:
        None. If the page could not be downloaded the function returns
        without further output, because ``get_html`` has already printed
        the error.
    """
    html = get_html(url, ip_use)
    if html is None:
        # Download failed; there is nothing to run the pattern against.
        return
    try:
        pattern = r'alt="Cn" /></td>[\s]*?<td>([\d\D]*?)</td>[\s]*?<td>([\d\D]*?)</td>'
        # Two capture groups, so findall yields (host, port) tuples.
        # NOTE: an empty result means the page did not contain the expected
        # table (the original author observed this on HTTP 503 responses);
        # trying different proxies in ip_use may help.
        for host, port in re.findall(pattern, html):
            if port == '9999':
                continue
            address = host + ':' + port
            if text_ip(address):
                print(address)
                ip_list.append(address)
                write_text(address + '\n')
    except Exception:
        # Best-effort per page; ``Exception`` (not a bare except) lets
        # Ctrl-C / SystemExit stop the crawl.
        print('正则匹配错误!')
        #traceback.print_exc()  # uncomment to print the full exception

# Check whether a proxy is usable
def text_ip(ip):
    """Test a proxy by requesting https://www.baidu.com/ through it.

    Args:
        ip: Proxy address as a ``'host:port'`` string.

    Returns:
        ``1`` if the request completes with HTTP status 200, ``0`` if the
        status differs or the request fails for any reason.
    """
    try:
        url = "https://www.baidu.com/"
        header = {
            'User-Agent': random.choice(user_agent_list),
            'Referer': random.choice(referer_list),
        }
        proxy = {
            'https': 'https://' + ip,
            'http': 'http://' + ip,
        }
        # timeout is (connect seconds, read seconds).
        response = requests.get(url, headers=header, proxies=proxy, timeout=(3, 7))
        #print(response.status_code)
        return 1 if response.status_code == 200 else 0
    except Exception:
        # Any failure means "not usable". ``Exception`` (not a bare except)
        # lets Ctrl-C / SystemExit interrupt a long validation run.
        #print('测试IP错误!')
        return 0
def write_text(ip):
    """Append ``ip`` to ``Data.txt`` in the current working directory.

    Args:
        ip: Text to append; the caller supplies any trailing newline.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # The context manager flushes and closes the file even if write() raises,
    # so the handle is never leaked.
    with open("Data.txt", 'a', encoding="utf-8") as file:
        file.write(ip)


# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()