网络爬取协议
获取某个网站的爬虫协议（即该网站的 robots.txt 文件）
# Download a site's robots.txt and print its contents.
import requests

robots_url = "https://www.baidu.com/robots.txt"
response = requests.get(robots_url)
print(response.text)
爬虫实例1
爬取一个固定网页的信息
# Fetch a product page and print the beginning of its HTML.
import requests

url = "https://item.jd.com"
# A specific product page: https://item.jd.com/100014323002.html
# NOTE(review): the markup below was recorded in the original notes next to
# the request -- presumably the body returned when the site redirects the
# client to its login page; confirm.
# <script>window.location.href='https://passport.jd.com/new/login.aspx?ReturnUrl=http%3A%2F%2Fitem.jd.com%2F100014323002.html'</script>
try:
    # Send a browser-like User-Agent instead of the default requests one.
    kv = {'user-agent': 'Mozilla/5.0'}
    # Build the product address from `url` so the base is actually used, and
    # set a timeout so a stalled connection cannot hang the script forever.
    r = requests.get(url + '/100014323002.html', headers=kv, timeout=30)
    print(r.status_code)  # HTTP status of the fetch
    r.raise_for_status()  # turn 4xx/5xx responses into an exception
    # Decode using the encoding detected from the body rather than the
    # one declared in the response headers.
    r.encoding = r.apparent_encoding
    print(r.text[:1000])  # only the first 1000 characters
except requests.RequestException:
    # Only request-level failures are reported; programming errors and
    # Ctrl-C are no longer silently swallowed.
    print("爬取失败")

爬虫实例2
网站阻止以爬虫形式获取信息
# Fetch a page from a site that blocks crawler-style access.
import requests

url = "https://www.amazon.cn/dp/B088BJ8HVL/ref=sr_1_1?brr=1&dchild=1&qid=1613541616&rd=1&s=digital-text&sr=1-1"
try:
    # Send a browser-like User-Agent instead of the default requests one.
    kv = {'user-agent': 'Mozilla/5.0'}
    # The timeout keeps a stalled connection from hanging the script.
    r = requests.get(url, headers=kv, timeout=30)
    # Author's note: a 503 status here means the site intercepted the
    # request as crawler traffic; the request then has to carry user
    # information so that it looks like a browser visit.
    print(r.status_code)
    r.raise_for_status()  # turn 4xx/5xx responses into an exception
    # Decode using the encoding detected from the body rather than the
    # one declared in the response headers.
    r.encoding = r.apparent_encoding
    print(r.text[:1000])  # only the first 1000 characters
except requests.RequestException:
    # Only request-level failures are reported; programming errors and
    # Ctrl-C are no longer silently swallowed.
    print("爬取失败")

爬虫实例3
根据一个关键词获取与之相关的信息
# Keyword search: submit a query string and report the size of the result page.
import requests

try:
    # `wd` is sent as the query-string parameter carrying the search keyword.
    kv = {'wd': 'Python'}
    # The timeout keeps a stalled connection from hanging the script.
    r = requests.get("http://www.baidu.com/s", params=kv, timeout=30)
    print(r.request.url)  # the URL of the request that was actually sent
    r.raise_for_status()  # turn 4xx/5xx responses into an exception
    print(len(r.text))
except requests.RequestException:
    # Only request-level failures are reported; programming errors and
    # Ctrl-C are no longer silently swallowed.
    print("爬取失败")
爬虫实例4
爬取网络上的一张图片并将其保存到本地
# Download an image from the web and save it to a local directory.
import os

import requests

url = "https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=2032926573,1024357327&fm=26&gp=0.jpg"
root = "D://pics//"
# The local file name is the last path segment of the URL.
path = root + url.split('/')[-1]
try:
    # Create the target directory if needed; exist_ok avoids the separate
    # exists() check and the race between checking and creating.
    os.makedirs(root, exist_ok=True)
    if not os.path.exists(path):
        # The timeout keeps a stalled connection from hanging the script.
        r = requests.get(url, timeout=30)
        # Without this check an HTTP error page would be written to disk
        # as if it were the image.
        r.raise_for_status()
        # The `with` block closes the file; no explicit close() is needed.
        with open(path, 'wb') as f:
            f.write(r.content)
        print("文件保存成功")
    else:
        print("文件已存在")
except (requests.RequestException, OSError):
    # Covers both download failures and local file-system errors, without
    # swallowing programming errors or Ctrl-C.
    print("爬取失败")

爬虫实例5
获取一个IP地址的归属地
# IP address location lookup.
# https://m.ip138.com
import requests

ip = "202.204.80.112"
# Build the lookup address from `ip` so the two can never drift apart.
url = "https://m.ip138.com/iplookup.asp?ip=" + ip
# Author's note: without overriding the client identity the fetch fails.
kv = {'user-agent': 'Mozilla/5.0'}
try:
    # The timeout keeps a stalled connection from hanging the script.
    r = requests.get(url, headers=kv, timeout=30)
    r.raise_for_status()  # turn 4xx/5xx responses into an exception
    # Decode using the encoding detected from the body rather than the
    # one declared in the response headers.
    r.encoding = r.apparent_encoding
    print(r.text)
except requests.RequestException:
    # Only request-level failures are reported; programming errors and
    # Ctrl-C are no longer silently swallowed.
    print("爬取失败")

京公网安备 11010502036488号