Introduction
The nationwide fight against the epidemic is still at a tense stage. To keep infections from rebounding, nobody is allowed out on the streets and schools still cannot reopen; online classes start at home next week. I don't normally read Weibo much, but the outbreak has made me pay a bit more attention to it. I only ever read the comments and never post opinions of my own, and since I hadn't written a crawler in a long time, this seemed like good practice. The process took a few small detours: half an hour spent writing code, ten years' worth of skill spent on the analysis, as the saying goes.
Features
The script fetches Weibo's top 50 trending searches, lets you interactively pick one and pull the comments under it, and then tokenises those comments into a word cloud, so you can get a feel for the overall public attitude towards the topic. A minimal sketch of the first step is shown right below; the full script follows in the next section.
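At its core the script only needs two of the m.weibo.cn container APIs: one for the hot-search list and one for the comments under a post. A minimal, self-contained sketch of the first step (pulling the current top-50 list) could look like the following; the URL and the JSON layout are the same ones used in the full script, while the variable names and the bare User-Agent header are only illustrative, and in practice Weibo may also require the cookie and full headers set up later in the script.

import time
import requests

# Hot-search container API of m.weibo.cn, with the current timestamp filled in (same URL as in the full script)
hot_search_api = "https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30%26display_time%3D{}&luicode=10000011&lfid=231583".format(int(time.time()))

resp = requests.get(hot_search_api, headers={"user-agent": "Mozilla/5.0"})
card_group = resp.json()["data"]["cards"][0]["card_group"]

# Each entry carries the hot-search title under "desc" and its detail link under "scheme"
for rank, card in enumerate(card_group[:50]):
    print(rank, card["desc"], card["scheme"])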
Implementation
#!/usr/bin/env python
# -*- coding:utf-8 -*-
""" Author: Zhu Huaren Date: 2020/2/25 15:29 """
import requests
import urllib
import time
import json
import re
import jieba
import wordcloud
headers = {
"authority": "m.weibo.cn",
"method": "GET",
"path": "/p/index?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30%26display_time%3D1582790041&luicode=10000011&lfid=231583",
"scheme": "https",
"accept": "*/*",
"accept-encoding": "gzip, deflate, br",
"accept-language": "zh-CN,zh;q=0.9",
"cookie": "cokie",
"dnt": "1",
"referer": "https://m.weibo.cn/sw.js",
"sec-fetch-dest": "empty",
"sec-fetch-mode": "same-origin",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36ozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36",
}
# A dict literal keeps only the last duplicate key, so keep one active proxy and note the spares;
# requests routes a request through the entry that matches the target URL's scheme
proxy = {
    'http': 'http://180.125.70.78:9999',
    # spare proxies: 117.90.4.230:9999, 111.77.196.229:9999, 111.177.183.57:9999, 123.55.98.146:9999
}
# Build the hot-search front-page API URL; display_time is filled with the current timestamp
hot_search_api = "https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30%26display_time%3D{}&luicode=10000011&lfid=231583".format(int(time.time()))
# A throwaway request against one hot-search detail API, used only to confirm the connection works
res = requests.get(
    "https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26t%3D10%26q%3D%E9%92%9F%E5%8D%97%E5%B1%B1+%E8%BA%AB%E4%BD%93%E9%87%8C%E5%87%BA%E7%8E%B0%E8%B6%B3%E5%A4%9F%E6%8A%97%E4%BD%93%E4%B8%8D%E4%BC%9A%E5%86%8D%E6%84%9F%E6%9F%93&isnewpage=1&extparam=cate%3D0%26pos%3D0%26realpos%3D1%26flag%3D2%26filter_type%3Drealtimehot%26c_type%3D31%26display_time%3D1582790042&luicode=10000011&lfid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&page_type=searchall", headers=headers, proxies=proxy)
# Check the response status of the connection
print(res.status_code)
# Store the hot-search titles and links
hot_search_list = []
hot_search_content = requests.get(hot_search_api, headers=headers, proxies=proxy).json()
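# In the returned JSON the hot-search entries live at data.cards[0].card_group; each entry
# has the title under "desc" and the detail link under "scheme"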
for i in range(50):
    # Extract the text and link of each hot-search entry
    hot_search_dict = {}
    hot_search_text = hot_search_content["data"]["cards"][0]["card_group"][i]["desc"]
    hot_search_link = hot_search_content["data"]["cards"][0]["card_group"][i]["scheme"]
    hot_search_dict["TEXT"] = hot_search_text
    hot_search_dict["LINK"] = hot_search_link
    # Store the extracted text and link
    hot_search_list.append(hot_search_dict)
    print("Hot search #{}: ".format(i) + hot_search_text, end="\n")
# Enter the index of the hot search you want to dig into
search_num = int(input("Enter an index (0-49): "))
search = hot_search_list[search_num]["TEXT"]
print("The hot search you picked is: [" + search + "]")
api_url_list = []
# Build the API URLs for several result pages of the selected hot search
parse_text = urllib.parse.quote("type=60&q=#" + search + "#&t=10&title=热门-#" + search + "#")
parse_text_ = urllib.parse.quote("title=热门&mid=&q=#" + search + "#")
parse_text__ = urllib.parse.quote("type=1&t=10&q=#" + search + "#")
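# These three quoted fragments become the containerid, extparam and lfid query parameters of the page-by-page search API below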
# "20" 可以根据评论数量进行调整
for n in range(20):
api_url_dict = {
}
parse_link = "https://m.weibo.cn/api/container/getIndex?containerid=100103" + parse_text + "&cardid=weibo_page&extparam=" + parse_text_ +"&luicode=10000011&lfid=100103" + parse_text__ + "&page={}".format(n)
api_url_dict["【" + search + "】" + "热搜第{}页API".format(n)] = parse_link
api_url_list.append(api_url_dict)
# print(parse_link)
# Collect the id of every Weibo post found on those pages
blog_id_nums = []
for v in range(len(api_url_list)):
    mblog_res = requests.get(list(api_url_list[v].values())[0], headers=headers, proxies=proxy)
    mblog_res_json = mblog_res.json()
    for blog_id in range(len(mblog_res_json["data"]["cards"])):
        try:
            blog_id_num = mblog_res_json["data"]["cards"][blog_id]["mblog"]["id"]
            blog_id_nums.append(blog_id_num)
        except KeyError:
            # Some cards are not posts and carry no "mblog" field
            pass
# print(blog_id_nums)
get_data_list = []
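# get_data() below appends every cleaned comment text to this list, which later feeds the word cloud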
def get_data(url):
    """Fetch one page of comments and append the cleaned text of each to get_data_list"""
    res_json = requests.get(url, headers=headers, proxies=proxy).json()
    # print(len(res_json["data"]["data"]))
    for p in range(len(res_json["data"]["data"])):
        try:
            comment_text = res_json["data"]["data"][p]["text"]
            # Strip HTML tags and line breaks from the raw comment
            cleaned_text = re.sub('<[^<]+?>', '', comment_text).replace('\n', '').strip()
            get_data_list.append(cleaned_text)
            print(get_data_list)
        except KeyError:
            pass
for j in range(len(blog_id_nums)):
    blog_id = blog_id_nums[j]
    # First page of hot comments for this post
    base_comment_url = "https://m.weibo.cn/comments/hotflow?id={}&mid={}&max_id_type=0".format(blog_id, blog_id)
    try:
        get_data(base_comment_url)
        base_max_id_json = requests.get(base_comment_url, headers=headers, proxies=proxy).json()
        # The response carries a max_id cursor that points at the next page of comments
        base_max_id = base_max_id_json["data"]["max_id"]
        comment_url = "https://m.weibo.cn/comments/hotflow?id={}&mid={}&max_id={}&max_id_type=0".format(
            blog_id, blog_id, base_max_id)
        get_data(comment_url)
    except Exception:
        pass
def split_words(text):
    """Tokenise the collected comments with jieba and render them as a word cloud"""
    text_ = ''.join(text)
    cut_text = jieba.cut(text_)
    string = ','.join(cut_text)
    # stop_words = ['str']
    word_cloud = wordcloud.WordCloud(
        font_path=r'.\simhei.ttf',  # a Chinese font file is needed, otherwise the characters render as boxes
        background_color='white',
        width=500,
        height=350,
        max_font_size=100,
        min_font_size=10,
        # stopwords=stop_words,
        scale=15,
    )
    word_cloud.generate(string)
    word_cloud.to_file(r'{}.png'.format(search))

split_words(text=get_data_list)
Conclusion
The Chinese nation has weathered hardship after hardship since ancient times; our ancestors came through every kind of difficulty, and this epidemic is no more than one grey stroke in our several-thousand-year history. In this fight of the Chinese people, and indeed of people all over the world, we will surely win in the end. Victory will be ours!
Stay strong, Wuhan! Stay strong, China! Stay strong, world!