此文仅作为记录使用.
环境:Windows 10 + Python 3.6 + Anaconda3
基本配置:
项目结构:
代码:
import os
import requests
import threading
from urllib import request
from bs4 import BeautifulSoup
# Listing-page URL prefix; the page number is appended to it.
BASE_PAGE_URL = "http://www.doutula.com/photo/list/?page="

# Number of listing pages to crawl (869 is an arbitrary choice).
PAGE_COUNT = 869

# Work list of listing-page URLs, pages 1..PAGE_COUNT inclusive.
PAGE_URL_List = [BASE_PAGE_URL + str(page) for page in range(1, PAGE_COUNT + 1)]

# Work list of image URLs discovered on the listing pages.
IMG_URL = []

# Send a browser-like User-Agent so requests do not identify as a script.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36",
}

# Lock guarding access to the shared work lists above.
lock = threading.Lock()
# Producer
def producer():
    """Fetch listing pages and collect the image URLs found on them.

    Repeatedly takes one URL from the shared ``PAGE_URL_List``, downloads
    that page, and adds the ``data-original`` attribute of every matching
    ``<img>`` tag to the shared ``IMG_URL`` list.  Returns once
    ``PAGE_URL_List`` is empty.

    A page that cannot be fetched is reported on stdout and skipped, so one
    network error does not end the whole thread.
    """
    while True:
        # Only the check-and-pop needs the lock; the slow network request
        # below runs without it so other threads are not held up.
        with lock:
            if not PAGE_URL_List:
                break
            pageurl = PAGE_URL_List.pop()

        try:
            # The timeout keeps a stalled connection from hanging the thread.
            req = requests.get(pageurl, headers=headers, timeout=10)
        except requests.RequestException as exc:
            print("failed to fetch {}: {}".format(pageurl, exc))
            continue

        soup = BeautifulSoup(req.content, 'lxml')
        imglist = soup.find_all("img", attrs={"class": "img-responsive lazy image_dta"})
        # Tags without the attribute are skipped rather than raising KeyError.
        found = [img["data-original"] for img in imglist if img.has_attr("data-original")]

        # One short locked extend per page instead of a lock per image, so
        # consumers are blocked only momentarily.
        with lock:
            IMG_URL.extend(found)
# Set by async_main() once every producer thread has finished, so that a
# consumer can tell "IMG_URL is empty for good" from "nothing queued yet".
_producers_done = threading.Event()


def customer():
    """Download queued images into the ``templates`` directory.

    Repeatedly takes one URL from the shared ``IMG_URL`` list and saves it
    under ``templates/<last path segment of the URL>``.  While the list is
    empty the thread waits briefly instead of spinning.  It returns once the
    list is empty and ``async_main`` has signalled that all producers are
    finished.

    A download that fails is reported on stdout and skipped.
    """
    # urlretrieve does not create missing directories.
    os.makedirs("templates", exist_ok=True)

    while True:
        # Read the flag BEFORE inspecting the list: if it was already set,
        # every producer append has happened, so "empty" really means done.
        finished = _producers_done.is_set()
        with lock:
            img_url = IMG_URL.pop() if IMG_URL else None

        if img_url is None:
            if finished:
                break
            # Sleep up to 0.1 s (or until the flag is set) rather than
            # busy-looping on the lock.
            _producers_done.wait(0.1)
            continue

        filename = img_url.split("/")[-1]
        # Save the image under the download directory,
        # e.g. templates\9150e4e5ly1fv8qhvstpug20500661kz.gif
        path = os.path.join("templates", filename)
        try:
            request.urlretrieve(img_url, filename=path)
        except (OSError, ValueError) as exc:
            # urllib's URLError/HTTPError derive from OSError; ValueError is
            # raised for a URL without a usable scheme.
            print("failed to download {}: {}".format(img_url, exc))


def async_main():
    """Run the crawl with 3 producer threads and 5 consumer threads.

    Starts the producers (page scraping) and the consumers (image
    download), waits for the producers to finish, signals the consumers
    that no more URLs will arrive, then waits for the consumers to drain
    the remaining URLs.  Returns when all threads have exited.
    """
    # Reset in case async_main() is called more than once per process.
    _producers_done.clear()

    # 3 producer threads scrape image URLs from the listing pages.
    producers = [threading.Thread(target=producer) for _ in range(3)]
    # 5 consumer threads download the images.
    consumers = [threading.Thread(target=customer) for _ in range(5)]

    for th in producers:
        th.start()
    for th in consumers:
        th.start()

    for th in producers:
        th.join()
    # No producer is running any more: consumers may stop once IMG_URL is empty.
    _producers_done.set()
    for th in consumers:
        th.join()
# Start the crawl only when the file is executed as a script, not on import.
if __name__ == '__main__':
    async_main()
# def downloader_img(url):
#
# split_list = url.split("/")
# filename = split_list.pop()
# path = os.path.join("templates",filename)
# # print(path) #templates\9150e4e5ly1fv8qhvstpug20500661kz.gif
# request.urlretrieve(url,filename=path)
# def getPageUrl(pageurl):
#
# req = requests.get(pageurl,headers=headers)
#
# soup = BeautifulSoup(req.content,'lxml')
#
# imglist = soup.find_all("img",attrs={"class":"img-responsive lazy image_dta"})
#
# for img in imglist:
# downloader_img(img["data-original"])
# print(img["data-original"],"====",img["alt"])
# print("*"*100)