# __author: han-zhang
# date: 2019/8/19 16:13
# page_queue, data_queue, multithreading
import json
import re
import threading
import time
from queue import Queue, Empty

import requests
from bs4 import BeautifulSoup
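
# Pipeline overview: the main thread fills page_queue with page numbers;
# crawl threads (producers) fetch each page's HTML and push it onto
# data_queue; parse threads (consumers) pull the HTML, extract the posts,
# and append them to a shared output file guarded by a lock.
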
class CrawlThread(threading.Thread):
    def __init__(self, crawl_name, page_queue, data_queue):
        super().__init__()
        self.crawl_name = crawl_name
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.url = 'http://www.fanjian.net/jianwen-{}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
        }

    def run(self):
        '''
        1. Take a page number from the page queue.
        2. Join it with the base URL to form a complete URL.
        3. Send the request and get the response.
        4. Put the response body into the data queue.
        '''
        while True:
            # Non-blocking get: once every page number has been claimed,
            # queue.Empty is raised and the thread exits its loop (a
            # blocking get() would hang forever and join() would never return)
            try:
                page = self.page_queue.get(block=False)
            except Empty:
                break
            url = self.url.format(page)
            r = requests.get(url=url, headers=self.headers)
            # Push the raw HTML onto the data queue
            self.data_queue.put(r.text)
        print('%s finished' % self.crawl_name)


class ParseThread(threading.Thread):
    def __init__(self, parse_name, data_queue, fp, lock):
        super().__init__()
        self.parse_name = parse_name
        self.data_queue = data_queue
        self.fp = fp
        self.lock = lock

    def parse_data_queue(self, content):
        soup = BeautifulSoup(content, 'lxml')
        # Each .cont-item block is one post
        user_list = soup.select('.box .cont-list > .cont-item')
        for user in user_list:
            # Author name
            user_name = user.select('.cont-list-head .cont-list-editor')[0].string
            # Post title
            user_title = user.select('.cont-list-title')[0].string
            print('Start scraping %s' % user_title)
            # Post body
            user_content = user.select('.cont-list-main p')[1].text
            # Meta line holding the view count
            info = user.select('.cont-list-info')[0].text
            # View count: the first number in the meta line
            read = re.findall(r'\d+', info)[0]
            # Tags: the [1:-1] slice drops the first and last .fc-gray
            # entries, which are not tag text; the list is rebuilt per post
            tags = user.select('.cont-list-info .cont-list-tags .fc-gray')
            tag_list = [t.text for t in tags][1:-1]
            # Like count, scoped to the current post
            like_text = user.select('.cont-list-sub .cont-list-tools .like')[-1].text
            zan = re.findall(r'\d+', like_text)[0]
            item = {
                'user_name': user_name,
                'title': user_title,
                'views': read,
                'likes': zan,
                'content': user_content,
                'tags': tag_list,
            }
            string = json.dumps(item, ensure_ascii=False)
            # Small delay to be polite to the server
            time.sleep(1)
            # The lock serializes file writes across parse threads
            with self.lock:
                self.fp.write(string + '\n')
                print('Finished scraping %s' % user_title)
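
    # Each record is written as one JSON object per line, e.g. (field
    # values illustrative only, not real site data):
    # {"user_name": "...", "title": "...", "views": "123", "likes": "45",
    #  "content": "...", "tags": ["tag1", "tag2"]}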

    def run(self):
        '''
        1. Take one page of HTML from the data queue.
        2. Parse it and save the records.
        '''
        while True:
            content = self.data_queue.get()
            # A None sentinel, sent by main() after crawling finishes,
            # stops the thread
            if content is None:
                break
            # Parsing is delegated to a helper method
            self.parse_data_queue(content)
        print('%s finished' % self.parse_name)


# Build the page queue and the data queue
def create_queue():
    page_queue = Queue()
    # Pages 1 through 9 (range(1, 10) excludes 10)
    for page in range(1, 10):
        page_queue.put(page)
    data_queue = Queue()
    return page_queue, data_queue


def main():
    '''
    What does the main thread do?
    - Create the queues: the page queue and the data queue.
    - Create the crawl threads.
    - Create the parse threads.
    - Start all threads.
    - Wait for them to finish.
    - Also: open the output file and create the write lock.
    '''
    fp = open('**.txt', 'w', encoding='utf8')
    lock = threading.Lock()
    page_queue, data_queue = create_queue()
    # Keep all threads in lists so they can be joined later
    t_crawl_list = []
    t_parse_list = []
    # Create and start the crawl threads
    crawl_name_list = ['crawl-thread-1', 'crawl-thread-2', 'crawl-thread-3']
    for crawl_name in crawl_name_list:
        t_crawl = CrawlThread(crawl_name, page_queue, data_queue)
        t_crawl_list.append(t_crawl)
        t_crawl.start()
    # Create and start the parse threads
    parse_name_list = ['parse-thread-1', 'parse-thread-2', 'parse-thread-3']
    for parse_name in parse_name_list:
        t_parse = ParseThread(parse_name, data_queue, fp, lock)
        t_parse_list.append(t_parse)
        t_parse.start()
    # Wait until the crawl threads have fetched every page
    for t_crawl in t_crawl_list:
        t_crawl.join()
    # All HTML is queued; send one None sentinel per parse thread so each
    # one wakes up from data_queue.get() and exits its loop
    for _ in t_parse_list:
        data_queue.put(None)
    for t_parse in t_parse_list:
        t_parse.join()
    fp.close()


if __name__ == '__main__':
    main()
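
# Running this script fetches pages 1-9 of http://www.fanjian.net/jianwen-{page}
# with three crawl threads, parses each page with three parse threads, and
# writes one JSON line per post to the output file.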