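# Author's note: I have not worked out how since_id pagination works yet.
# The book paginates with a `page` parameter, which makes crawling several
# pages easy, but I could not find a pattern in the since_id values.
# For now the code below therefore crawls only a single page of data.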
# The crawl target is an https URL; the author's workaround is to import the
# ssl module and disable certificate verification globally.
# NOTE(review): this replaces ssl's default HTTPS context factory with the
# unverified one for the whole process, so certificate checks are skipped
# wherever that factory is used — confirm this is acceptable.
# NOTE(review): presumably requests/urllib3 builds its own TLS context and is
# not affected by this hook — verify whether the override is still needed.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
from bs4 import BeautifulSoup
import requests
from urllib.parse import urlencode
import re
# Headers attached to every API request: a desktop Chrome User-Agent, the
# user's profile page as Referer, the XMLHttpRequest marker and the API host.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/84.0.4147.89 Safari/537.36'
    ),
    'Referer': 'https://m.weibo.cn/u/2830678474',
    'X-Requested-With': 'XMLHttpRequest',
    'Host': 'm.weibo.cn',
}

# Endpoint prefix; the url-encoded query string is appended directly after '?'.
base_url = 'https://m.weibo.cn/api/container/getIndex?'
def get_since_id(since_id, timeout=10):
    """Fetch one page of the user's timeline from the API as decoded JSON.

    Args:
        since_id: Value sent as the ``since_id`` query parameter.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the crawler forever.

    Returns:
        The decoded JSON body when the server answers with status 200 and a
        valid JSON payload; ``None`` on any request failure, non-200 status
        or undecodable body.
    """
    params = {
        'type': 'uid',
        'value': '2830678474',
        'containerid': '1076032830678474',
        'since_id': since_id,
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # RequestException is the base class, so this also covers timeouts
        # and the ConnectionError the original handled.
        print('Error', e.args)
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError as e:
        # A 200 response whose body is not JSON (e.g. an HTML error page).
        print('Error', e.args)
        return None
from pyquery import PyQuery as pq #解析
def parse_since_id(json):
    """Yield one summary dict per post found in a decoded API response.

    Args:
        json: Decoded response as returned by ``get_since_id``; may be
            ``None`` or empty, in which case nothing is yielded.

    Yields:
        Dicts with the keys ``id``, ``text`` (markup stripped via pyquery),
        ``attitudes``, ``comments`` and ``reposts``.
    """
    if not json:
        return
    # 'data' or 'cards' may be absent or null; treat both as "no cards"
    # instead of failing with AttributeError / TypeError.
    data = json.get('data') or {}
    for card in data.get('cards') or []:
        mblog = card.get('mblog')
        if not mblog:
            # Card carries no 'mblog' entry, so there is no post to extract.
            continue
        raw_text = mblog.get('text')
        yield {
            'id': mblog.get('id'),
            # Only hand non-empty markup to pyquery; empty/missing text
            # becomes an empty string.
            'text': pq(raw_text).text() if raw_text else '',
            'attitudes': mblog.get('attitudes_count'),
            'comments': mblog.get('comments_count'),
            'reposts': mblog.get('reposts_count'),
        }
from pymongo import MongoClient
# MongoClient() is created with no arguments (the driver's defaults); parsed
# posts are stored in the 'weibo' collection of the 'weibo' database.
client = MongoClient()
db = client.weibo
collection = db.weibo
def save_to_mongo(result):
    """Insert one parsed post dict into the module-level ``collection``.

    Prints a confirmation line when the value returned by ``insert_one``
    is truthy.
    """
    outcome = collection.insert_one(result)
    if not outcome:
        return
    print('Saved to mongo')
if __name__ == '__main__':
    # The range holds exactly one since_id, so a single page is fetched,
    # parsed and stored.
    first_since_id = 4527282934256454
    for since_id in range(first_since_id, first_since_id + 1):
        page = get_since_id(since_id)
        for post in parse_since_id(page):
            save_to_mongo(post)



# Web-page footer captured with this listing (not part of the script): 京公网安备 11010502036488号