网络爬取协议
获取网站的爬虫协议文件(robots.txt),了解站点允许爬取的范围
# Fetch and print a site's crawler policy file (robots.txt).
import requests

robots_url = "https://www.baidu.com/robots.txt"
response = requests.get(robots_url)
print(response.text)
爬虫实例1
爬取一个固定网页的信息
# Example 1: scrape a fixed product page (jd.com).
# Without a browser-like User-Agent the site answers with a login redirect:
# <script>window.location.href='https://passport.jd.com/new/login.aspx?ReturnUrl=http%3A%2F%2Fitem.jd.com%2F100014323002.html'</script>
import requests

url = "https://item.jd.com"  # product pages look like https://item.jd.com/100014323002.html
try:
    kv = {'user-agent': 'Mozilla/5.0'}  # pretend to be a browser
    r = requests.get('https://item.jd.com/100014323002.html', headers=kv)
    print(r.status_code)  # HTTP status of the fetch
    r.raise_for_status()  # raise on 4xx/5xx responses
    r.encoding = r.apparent_encoding  # guess the real encoding from the content
    print(r.text[:1000])  # [:1000] shows only the first 1000 characters
except requests.RequestException:  # was a bare except: — narrowed to request errors
    print("爬取失败")
爬虫实例2
网站阻止以爬虫形式获取信息
# Example 2: scrape a page on a site that blocks crawler-looking requests.
# A 503 status means the site rejected the default requests User-Agent;
# sending a browser-like User-Agent works around it.
import requests

url = "https://www.amazon.cn/dp/B088BJ8HVL/ref=sr_1_1?brr=1&dchild=1&qid=1613541616&rd=1&s=digital-text&sr=1-1"
try:
    kv = {'user-agent': 'Mozilla/5.0'}  # pretend to be a browser
    r = requests.get(url, headers=kv)
    print(r.status_code)  # 503 here means the crawler was blocked
    r.raise_for_status()  # raise on 4xx/5xx responses
    r.encoding = r.apparent_encoding  # guess the real encoding from the content
    print(r.text[:1000])  # [:1000] shows only the first 1000 characters
except requests.RequestException:  # was a bare except: — narrowed to request errors
    print("爬取失败")
爬虫实例3
根据一个关键词获取与之相关的信息
# Example 3: fetch search results for a keyword via Baidu's query string.
import requests

try:
    kv = {'wd': 'Python'}  # 'wd' is Baidu's search-keyword parameter
    r = requests.get("http://www.baidu.com/s", params=kv)
    print(r.request.url)  # final URL with the encoded query string
    r.raise_for_status()  # raise on 4xx/5xx responses
    print(len(r.text))  # size of the result page, as a quick sanity check
except requests.RequestException:  # was a bare except: — narrowed to request errors
    print("爬取失败")
爬虫实例4
爬取网络上的一张图片并保存到本地文件
# Example 4: download an image from the web and save it locally.
import requests
import os

url = "https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=2032926573,1024357327&fm=26&gp=0.jpg"
root = "D://pics//"
path = root + url.split('/')[-1]  # file name taken from the last URL segment
try:
    # was os.mkdir: makedirs also creates missing parent directories
    os.makedirs(root, exist_ok=True)
    if not os.path.exists(path):
        r = requests.get(url)
        r.raise_for_status()  # don't save an error page as an image file
        # 'with' closes the file automatically; the original's explicit
        # f.close() inside the with-block was redundant
        with open(path, 'wb') as f:
            f.write(r.content)
        print("文件保存成功")
    else:
        print("文件已存在")
except (requests.RequestException, OSError):  # was a bare except:
    print("爬取失败")
爬虫实例5
获取一个IP地址的归属地
# Example 5: look up the geographic location of an IP address.
# https://m.ip138.com
# Query form: https://m.ip138.com/iplookup.asp?ip=<address>
import requests

url = "https://m.ip138.com/iplookup.asp?ip=202.204.80.112"
kv = {'user-agent': 'Mozilla/5.0'}  # without a browser UA the site rejects the request
# The original had two nested 'try:' statements with a single 'except'
# (a syntax error); fixed to one try/except.
try:
    r = requests.get(url, headers=kv)
    r.raise_for_status()  # raise on 4xx/5xx responses
    r.encoding = r.apparent_encoding  # guess the real encoding from the content
    print(r.text)
except requests.RequestException:  # was a bare except: — narrowed to request errors
    print("爬取失败")