Python3 网络爬虫(一) 斗图网

此文仅作为记录使用.

环境:win10+Python3.6+Anaconda3
基本配置:
项目结构:
代码:
import  os
import  requests
import  threading
from  urllib import request
from  bs4 import  BeautifulSoup



# Listing pages are addressed as this prefix followed by a 1-based page number.
BASE_PAGE_URL = "http://www.doutula.com/photo/list/?page="

# Work queue of listing-page URLs: pages 1 through 869 (the count is arbitrary).
PAGE_URL_List = [BASE_PAGE_URL + str(page_no) for page_no in range(1, 870)]

# Image URLs discovered so far; filled by producer(), drained by customer().
IMG_URL = []

# Browser-like User-Agent sent with page requests to disguise the crawler.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/69.0.3497.81 Safari/537.36"
    ),
}

# Lock serialising the check-then-pop operations on the two shared lists.
lock = threading.Lock()

# Producer
def producer():
    """Scrape listing pages and queue the image URLs found on them.

    Repeatedly pops one URL from ``PAGE_URL_List`` (under ``lock``), fetches
    the page, and appends the ``data-original`` attribute of every matching
    ``<img>`` tag to ``IMG_URL``. Returns once ``PAGE_URL_List`` is empty.

    A page that cannot be fetched is reported and skipped, so one network
    error does not kill the producer thread.
    """
    while True:
        # Check-then-pop must be atomic, otherwise two producers could both
        # see one remaining URL and the second pop() would raise IndexError.
        with lock:
            if not PAGE_URL_List:
                break
            pageurl = PAGE_URL_List.pop()

        # The network request runs outside the lock so other threads are
        # not blocked while this one waits for the server.
        try:
            # Without a timeout a stalled connection would hang forever.
            req = requests.get(pageurl, headers=headers, timeout=30)
            req.raise_for_status()
        except requests.RequestException as exc:
            print("failed to fetch page %s: %s" % (pageurl, exc))
            continue

        soup = BeautifulSoup(req.content, 'lxml')
        imglist = soup.find_all("img", attrs={"class": "img-responsive lazy image_dta"})

        # Deliberately appended without holding the lock: taking it here
        # slows the consumers down, and list.append is atomic in CPython.
        for img in imglist:
            img_url = img.get("data-original")
            # Skip tags lacking the attribute instead of raising KeyError.
            if img_url:
                IMG_URL.append(img_url)

# Set once every producer thread started by async_main() has finished, so a
# consumer can tell a temporarily empty IMG_URL from a finally empty one.
_producers_done = threading.Event()


def customer():
    """Consumer: download queued images into the ``templates`` directory.

    Repeatedly pops one URL from ``IMG_URL`` (under ``lock``) and saves it
    as ``templates/<last path segment of the URL>``. While the queue is
    empty the thread waits briefly instead of spinning on the lock. It
    returns once the queue is empty and ``_producers_done`` is set; if
    that event is never set (e.g. the function is run without
    ``async_main``), it keeps polling indefinitely.

    A failed download is reported and skipped so that one bad URL does not
    kill the consumer thread.
    """
    # The target directory may not exist on a fresh checkout.
    os.makedirs("templates", exist_ok=True)

    while True:
        # Read the flag BEFORE looking at the queue: if it was already set,
        # no producer can add anything afterwards, so "empty" is final.
        finished = _producers_done.is_set()

        with lock:
            img_url = IMG_URL.pop() if IMG_URL else None

        if img_url is None:
            if finished:
                break
            # Sleep up to 0.1 s (or until the producers finish) rather
            # than busy-looping on the lock.
            _producers_done.wait(0.1)
            continue

        filename = img_url.split("/")[-1]
        # Save into the download directory,
        # e.g. templates\9150e4e5ly1fv8qhvstpug20500661kz.gif
        path = os.path.join("templates", filename)
        try:
            request.urlretrieve(img_url, filename=path)
        except (OSError, ValueError) as exc:
            # URLError/HTTPError are OSError subclasses; ValueError covers
            # malformed URLs such as an unknown URL type.
            print("failed to download %s: %s" % (img_url, exc))


def async_main():
    """Start the crawl: 3 producer threads and 5 consumer threads.

    Returns immediately after starting the threads. A small helper thread
    waits for the producers and then sets ``_producers_done`` so that the
    consumers can exit once the image queue has been drained.
    """
    # Reset in case a previous run already set the flag.
    _producers_done.clear()

    # Three producer threads scrape the listing pages for image URLs.
    producers = []
    for _ in range(3):
        th = threading.Thread(target=producer)
        th.start()
        producers.append(th)

    # Five consumer threads download the images.
    for _ in range(5):
        threading.Thread(target=customer).start()

    def _signal_when_producers_finish():
        # Joining here rather than in async_main keeps async_main non-blocking.
        for th in producers:
            th.join()
        _producers_done.set()

    threading.Thread(target=_signal_when_producers_finish).start()


# Start the crawl only when this file is run as a script, not when imported.
if __name__ == '__main__':
    async_main()




# def downloader_img(url):
#
#     split_list = url.split("/")
#     filename = split_list.pop()
#     path = os.path.join("templates",filename)
#     # print(path) #templates\9150e4e5ly1fv8qhvstpug20500661kz.gif
#     request.urlretrieve(url,filename=path)



# def getPageUrl(pageurl):
#
#     req = requests.get(pageurl,headers=headers)
#
#     soup = BeautifulSoup(req.content,'lxml')
#
#     imglist = soup.find_all("img",attrs={"class":"img-responsive lazy image_dta"})
#
#     for img in imglist:
#          downloader_img(img["data-original"])
#     print(img["data-original"],"====",img["alt"])
#     print("*"*100)