网络爬取协议

获取网站的 robots.txt 爬虫协议,了解该站点允许爬取的范围

# Fetch and print a site's robots.txt (its crawler-access policy).
import requests

url = "https://www.baidu.com/robots.txt"

try:
    # timeout keeps the script from hanging forever on a dead connection
    r = requests.get(url, timeout=10)
    r.raise_for_status()  # raise on 4xx/5xx instead of printing an error page
    print(r.text)
except requests.RequestException:
    print("爬取失败")

爬虫实例1

爬取一个固定网页的信息

# Crawl a fixed JD product page.
# Without a browser-like User-Agent the site answers with a redirect
# to its login page instead of the product content:
# <script>window.location.href='https://passport.jd.com/new/login.aspx?ReturnUrl=http%3A%2F%2Fitem.jd.com%2F100014323002.html'</script>

import requests

url = "https://item.jd.com/100014323002.html"

try:
    # pretend to be a browser; a bare requests UA gets blocked/redirected
    kv = {'user-agent': 'Mozilla/5.0'}
    r = requests.get(url, headers=kv, timeout=10)
    print(r.status_code)  # HTTP status of the fetch
    r.raise_for_status()  # raise for 4xx/5xx responses
    # apparent_encoding is guessed from the body, more reliable than headers
    r.encoding = r.apparent_encoding
    print(r.text[:1000])  # only the first 1000 characters
except requests.RequestException:
    print("爬取失败")

爬虫实例2

网站阻止以爬虫形式获取信息

# Crawl a page on a site that blocks obvious crawler traffic.

import requests

url = "https://www.amazon.cn/dp/B088BJ8HVL/ref=sr_1_1?brr=1&dchild=1&qid=1613541616&rd=1&s=digital-text&sr=1-1"

try:
    # a 503 here means the site rejected the default requests User-Agent;
    # sending a browser-like UA makes the request look like a normal visit
    kv = {'user-agent': 'Mozilla/5.0'}
    r = requests.get(url, headers=kv, timeout=10)
    print(r.status_code)  # HTTP status of the fetch
    r.raise_for_status()  # raise for 4xx/5xx responses
    # guess the real encoding from the body rather than trusting headers
    r.encoding = r.apparent_encoding
    print(r.text[:1000])  # only the first 1000 characters
except requests.RequestException:
    print("爬取失败")

爬虫实例3

根据一个关键词获取与之相关的信息

# Submit a keyword to Baidu search via the `wd` query parameter
# and report the size of the result page.

import requests

try:
    kv = {'wd': 'Python'}  # search keyword; requests URL-encodes it
    r = requests.get("http://www.baidu.com/s", params=kv, timeout=10)
    print(r.request.url)  # the final URL actually requested
    r.raise_for_status()  # raise for 4xx/5xx responses
    print(len(r.text))
except requests.RequestException:
    print("爬取失败")

爬虫实例4

爬取网络上的一张图片并保存到本地磁盘

# Download an image from the web and save it under a local directory,
# skipping the download if the file already exists.

import requests
import os

url = "https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=2032926573,1024357327&fm=26&gp=0.jpg"
root = "D://pics//"
# file name is the last path segment of the URL
path = root + url.split('/')[-1]

try:
    # create the target directory (and parents) if it does not exist
    os.makedirs(root, exist_ok=True)
    if not os.path.exists(path):
        r = requests.get(url, timeout=10)
        r.raise_for_status()  # don't save an HTML error page as a .jpg
        # the with-statement closes the file; no explicit close() needed
        with open(path, 'wb') as f:
            f.write(r.content)
        print("文件保存成功")
    else:
        print("文件已存在")
except (requests.RequestException, OSError):
    print("爬取失败")

爬虫实例5

获取一个IP地址的归属地

# Look up the geographic location of an IP address via m.ip138.com.

import requests

ip = "202.204.80.112"
# build the query URL from the IP instead of hard-coding it twice
url = "https://m.ip138.com/iplookup.asp?ip=" + ip

# without a browser-like User-Agent the site rejects the request
kv = {'user-agent': 'Mozilla/5.0'}

try:  # original had a duplicated `try:` line here — a syntax error
    r = requests.get(url, headers=kv, timeout=10)
    r.raise_for_status()  # raise for 4xx/5xx responses
    r.encoding = r.apparent_encoding  # guess encoding from the body
    print(r.text)
except requests.RequestException:
    print("爬取失败")