京东商品页面的爬取
选取一个商品页面
import requests
# Fetch one JD.com product page and print its HTML.
url = "https://item.jd.com/100011333796.html#crumb-wrap"
# Override the User-Agent header so the request imitates a browser.
kv = {'user-agent': 'Mozilla/5.0'}
try:
    # Without a timeout, requests may block indefinitely on a stalled server.
    r = requests.get(url, headers=kv, timeout=10)
    r.raise_for_status()
    # Decode the body with the encoding detected from its content.
    r.encoding = r.apparent_encoding
    print(r.text)
except requests.RequestException:
    # Only network/HTTP failures are reported as a failed crawl; other
    # exceptions (e.g. KeyboardInterrupt, programming errors) propagate.
    print("爬取失败")
百度(360)关键词提交
baidu
import requests
# Submit a search keyword to Baidu and print the final request URL and the
# length of the returned page.
keyword = "Python"
# Baidu takes the search term in the "wd" query parameter.
kv = {'wd': keyword}
try:
    # Without a timeout, requests may block indefinitely on a stalled server.
    r = requests.get("http://www.baidu.com/s", params=kv, timeout=10)
    # Show the URL that was actually sent, including the encoded query string.
    print(r.request.url)
    r.raise_for_status()
    print(len(r.text))
except requests.RequestException:
    # Only network/HTTP failures are reported as a failed crawl; other
    # exceptions (e.g. KeyboardInterrupt, programming errors) propagate.
    print("爬取失败")
360
import requests
# Submit a search keyword to 360 Search (so.com) and print the final request
# URL and the length of the returned page.
keyword = "Python"
# so.com takes the search term in the "q" query parameter.
kv = {'q': keyword}
try:
    # Without a timeout, requests may block indefinitely on a stalled server.
    r = requests.get("http://www.so.com/s", params=kv, timeout=10)
    # Show the URL that was actually sent, including the encoded query string.
    print(r.request.url)
    r.raise_for_status()
    print(len(r.text))
except requests.RequestException:
    # Only network/HTTP failures are reported as a failed crawl; other
    # exceptions (e.g. KeyboardInterrupt, programming errors) propagate.
    print("爬取失败")
网络图片的爬取与存储
爬取 CSDN 上的某张图片
import requests
import os
# Download a single image and store it on disk, skipping the download when
# the target file already exists.
kv = {'user-agent': 'Mozilla/5.0'}
url = "https://imgconvert.csdnimg.cn/aHR0cHM6Ly9tbWJpei5xcGljLmNuL21tYml6X2pwZy8xaFJlSGFxYWZhZTN5bVlZanN2TWRmRnB4YUZiY3VNRFBOVXNPNzg0NWZST0V1cmVTNGdWWmliYXYyWTIzYlI1WXZmUWRJNjJVTWhLNWJyTWRpYXRDSnJnLzY0MA?x-oss-process=image/format,png"
root = "E://python爬虫学习//"
# The file extension is the text after the last comma in the URL ("png").
path = root + '不想奋斗.' + url.split(',')[-1]
try:
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(root, exist_ok=True)
    if not os.path.exists(path):
        # Without a timeout, requests may block indefinitely on a stalled server.
        r = requests.get(url, headers=kv, timeout=10)
        r.raise_for_status()
        # r.content is the raw bytes of the body, so no text encoding needs
        # to be set; the with-block closes the file on exit.
        with open(path, 'wb') as f:
            f.write(r.content)
        print("文件保存成功")
    else:
        print("文件已存在")
except (requests.RequestException, OSError):
    # Network/HTTP failures and filesystem errors are reported as a failed
    # crawl; other exceptions (e.g. KeyboardInterrupt) propagate.
    print("爬取失败")
IP地址归属地的自动查询
手动查询网址:https://www.ip138.com/
查询中北大学网站 IP 地址(202.207.177.39)的归属地
import requests
# Query ip138.com for one IP address and print a slice of the returned page.
kv = {'user-agent': 'Mozilla/5.0'}
# The lookup URL is assembled as url1 + <ip address> + url2.
url1 = "https://www.ip138.com/iplookup.asp?ip="
url2 = "&action=2"
try:
    # Without a timeout, requests may block indefinitely on a stalled server.
    r = requests.get(url1 + '202.207.177.39' + url2, headers=kv, timeout=10)
    r.raise_for_status()
    # Decode the body with the encoding detected from its content.
    r.encoding = r.apparent_encoding
    # Print only characters 1000-1799 of the page rather than the whole HTML.
    print(r.text[1000:1800])
except requests.RequestException:
    # Only network/HTTP failures are reported as a failed crawl; other
    # exceptions (e.g. KeyboardInterrupt, programming errors) propagate.
    print("爬取失败")