功能:
- 爬取西刺代理IP
- 添加了自动检测IP是否可用功能
- 输出到 `Data.txt` 文件中
注意:
- 爬取西刺的时候,有可能真实IP被封,可以先尝试爬取少量的代理IP,放入 `ip_use` 中。
测试:
1.测试输出
2.文件输出
代码:
import requests
import traceback
import re
import random
import time
# Proxies that passed the availability check, stored as "host:port" strings.
ip_list = []
def main(pages=10):
    """Crawl the proxy listing pages and harvest working proxies from each.

    Args:
        pages: Number of listing pages to visit, counting up from page 1.
            Defaults to 10, the value that used to be hard-coded.
    """
    for page in range(1, pages + 1):
        print("-----" + str(page) + "-----")
        get_ip('https://www.xicidaili.com/nn/' + str(page))
# User-Agent strings for a range of operating systems and browsers; the
# request helpers pick one at random for every request.
# The first five entries are written with their canonical spacing: with the
# spaces stripped (e.g. "MSIE9.0;WindowsNT6.1") they are not values any real
# browser sends.
user_agent_list = [
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
]
# Candidate values for the Referer header, i.e. the page a request claims to
# have come from; one is picked at random for every request.
referer_list = [
    "https://www.sogou.com/",
    "http://blog.csdn.net/",
    "https://www.baidu.com/",
]
# Seed pool of proxies, as "host:port" strings, handed to get_html when
# fetching the listing pages.
ip_use = [
    "47.102.216.176:3128",
    "222.95.240.79:3000",
    "117.88.176.176:3000",
    "117.88.5.65:3000",
]
def get_html(url, ip):
    """Fetch ``url`` through a randomly chosen proxy and return the page text.

    Args:
        url: Address of the page to download.
        ip: Sequence of ``"host:port"`` proxy strings to choose from. When it
            is empty the request is sent directly, without a proxy.

    Returns:
        The response body as text, or ``None`` when the request fails (an
        error message is printed in that case).
    """
    header = {
        'User-Agent': random.choice(user_agent_list),
        'Referer': random.choice(referer_list),
    }
    proxy = None
    if ip:
        # Both target schemes go through the same plain-HTTP proxy endpoint.
        # An "https://" proxy URL would make requests try a TLS handshake
        # with the proxy itself rather than tunnel through it.
        proxy_url = 'http://' + random.choice(ip)
        proxy = {'http': proxy_url, 'https': proxy_url}
    try:
        # The mapping only takes effect when passed via ``proxies=``;
        # building it alone leaves the request going out from the real IP.
        response = requests.get(url, headers=header, proxies=proxy,
                                timeout=(3, 7))
    except (requests.RequestException, ValueError):
        # ValueError is included as a guard against malformed proxy strings.
        print('获取ip错误!')
        return None
    return response.text
def get_ip(url):
    """Extract proxies from one listing page and keep the ones that work.

    The page is downloaded with ``get_html`` using the ``ip_use`` pool. Every
    address/port pair found by the regular expression is checked with
    ``text_ip``; each pair that passes is printed, appended to ``ip_list`` and
    written to the output file through ``write_text``.

    Args:
        url: Address of the listing page to scan.
    """
    html = get_html(url, ip_use)
    if not html:
        # get_html has already reported the failure. Parsing ``None`` would
        # only raise a TypeError that looks like a regex problem.
        return

    # The two <td> cells that follow the alt="Cn" marker are taken as the
    # address and the port. An empty result may mean the page was not served
    # normally (e.g. an HTTP 503 body); switching the proxy may help.
    pattern = r'alt="Cn" /></td>[\s]*?<td>([\d\D]*?)</td>[\s]*?<td>([\d\D]*?)</td>'
    for host, port in re.findall(pattern, html):
        address = host + ':' + port
        # Entries on port 9999 are skipped without being tested.
        if port == '9999' or not text_ip(address):
            continue
        print(address)
        ip_list.append(address)
        try:
            write_text(address + '\n')
        except OSError:
            # A failed write must not abort the rest of the page.
            print('写入文件错误!')
def text_ip(ip):
    """Check whether a proxy can be used to load a test page.

    Args:
        ip: Proxy address as a ``"host:port"`` string.

    Returns:
        ``1`` when the test page is fetched through the proxy with HTTP
        status 200, otherwise ``0`` (including on any request error).
    """
    url = "https://www.baidu.com/"
    header = {
        'User-Agent': random.choice(user_agent_list),
        'Referer': random.choice(referer_list),
    }
    # Use the plain-HTTP proxy URL for both schemes: with an "https://" proxy
    # URL requests would attempt TLS to the proxy itself, which a plain HTTP
    # proxy cannot answer, so the HTTPS test page could never be reached.
    proxy_url = 'http://' + ip
    proxy = {'http': proxy_url, 'https': proxy_url}
    try:
        response = requests.get(url, headers=header, proxies=proxy,
                                timeout=(3, 7))
    except (requests.RequestException, ValueError):
        # Only request-level failures count as "unusable"; a bare except
        # would also swallow KeyboardInterrupt and make the run unstoppable.
        return 0
    return 1 if response.status_code == 200 else 0
def write_text(ip, path="Data.txt"):
    """Append ``ip`` to the output file.

    Args:
        ip: Text to append; the caller supplies the trailing newline.
        path: File to append to. Defaults to ``Data.txt`` in the current
            working directory, the previously hard-coded location.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # The context manager flushes and closes the file even if write() raises.
    with open(path, 'a', encoding='utf-8') as file:
        file.write(ip)
# Start crawling only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()