since_id 这部分我还没有掌握:书里用的是 page 参数,很容易爬取多个页面;但 since_id 我找不到规律,暂时还不会处理,所以以下代码只爬取了一页的数据。
# Scrape one page of posts for a single hard-coded Weibo user from the
# m.weibo.cn container API and store each post summary in MongoDB.

import ssl

from bs4 import BeautifulSoup
import requests
from urllib.parse import urlencode
import re
from pyquery import PyQuery as pq
from pymongo import MongoClient

# The crawl target is an HTTPS link; the original workaround was to disable
# certificate verification globally through the ssl module.
# NOTE(review): this switches off certificate checks for every default HTTPS
# context created in this process, which is insecure. `requests` presumably
# does not consult this hook at all -- confirm whether the line is needed.
ssl._create_default_https_context = ssl._create_unverified_context

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
    'Referer': 'https://m.weibo.cn/u/2830678474',
    'X-Requested-With': 'XMLHttpRequest',
    'Host': 'm.weibo.cn',
}

base_url = 'https://m.weibo.cn/api/container/getIndex?'

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the crawler forever.
REQUEST_TIMEOUT = 10


def get_since_id(since_id):
    """Fetch one page of container data for the hard-coded user.

    Args:
        since_id: Value sent as the ``since_id`` query parameter.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, connection failure, timeout,
        or a body that is not valid JSON).
    """
    params = {
        'type': 'uid',
        'value': '2830678474',
        'containerid': '1076032830678474',
        'since_id': since_id,
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except (requests.ConnectionError, requests.Timeout) as e:
        print('Error', e.args)
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError as e:
        # A 200 response whose body is not JSON (e.g. an HTML error page).
        print('Error', e.args)
        return None


# Parsing
def parse_since_id(json):
    """Yield one summary dict per post contained in an API response.

    Args:
        json: Decoded response as returned by ``get_since_id``; may be
            ``None`` or empty, in which case nothing is yielded.

    Yields:
        Dicts with the keys ``id``, ``text`` (markup stripped via pyquery),
        ``attitudes``, ``comments`` and ``reposts``.
    """
    if not json:
        return
    # Tolerate a payload where 'data' or 'cards' is absent or null.
    cards = (json.get('data') or {}).get('cards') or []
    for card in cards:
        mblog = card.get('mblog')
        if not mblog:
            # A card without an 'mblog' entry carries no post data; calling
            # .get on the missing value used to raise AttributeError.
            continue
        raw_text = mblog.get('text')
        yield {
            'id': mblog.get('id'),
            # Only hand markup to pyquery when there is something to parse.
            'text': pq(raw_text).text() if raw_text else '',
            'attitudes': mblog.get('attitudes_count'),
            'comments': mblog.get('comments_count'),
            'reposts': mblog.get('reposts_count'),
        }


client = MongoClient()
db = client['weibo']
collection = db['weibo']


def save_to_mongo(result):
    """Insert one post dict into the ``weibo`` collection.

    Prints a confirmation when the server acknowledged the write. The
    previous truthiness test was on the result object returned by
    ``insert_one`` rather than on the write outcome.
    """
    outcome = collection.insert_one(result)
    if outcome.acknowledged:
        print('Saved to mongo')


def main():
    """Crawl the configured since_id range and persist every parsed post."""
    # NOTE(review): the range contains exactly one value, so only a single
    # page is fetched. The since_id for the next page presumably has to be
    # taken from the previous response instead of incrementing -- confirm
    # against the API payload before extending this loop.
    for since_id in range(4527282934256454, 4527282934256455):
        payload = get_since_id(since_id)
        for result in parse_since_id(payload):
            save_to_mongo(result)


if __name__ == '__main__':
    main()