from lxml import etree
import chardet
import requests
import pymysql
from requests.exceptions import RequestException
# 1. Request the content of a single page
def get_one_page(url, timeout=10):
    """Download one page and return its decoded text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as text, decoded with the encoding that chardet
        detects from the raw bytes, or ``None`` when the request fails
        (the error is printed).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Override the header-declared charset with the one sniffed from the
        # actual bytes before decoding.
        response.encoding = chardet.detect(response.content)['encoding']
        return response.text
    except RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        print(e)
        return None
# 2. Parse
def parse_one_page(html):
    """Extract book records from one page of HTML.

    Args:
        html: Page markup to parse.

    Yields:
        One ``(title, author, publish_date, publisher, price)`` tuple for
        every title matched by the title XPath. The five field lists are
        queried independently and paired up by position.
    """
    tree = etree.HTML(html)

    # Book titles.
    titles = tree.xpath('//div[@class="name"]/a/text()')

    # Author field: keep only the text before the first '著', which drops
    # the illustrator / translator credits that follow it on some entries.
    authors = [
        node.xpath('string(.)').split('著')[0]
        for node in tree.xpath('//li/div[last()-2]')
    ]

    # Publication dates.
    dates = tree.xpath('//li/div[last()-1]/span/text()')

    # Publisher names (full text content of each link).
    publishers = [
        node.xpath('string(.)')
        for node in tree.xpath('//li/div[last()-1]/a')
    ]

    # Prices with the first character removed (the ¥ sign, per the
    # original note).
    prices = [
        raw[1:]
        for raw in tree.xpath('//li/div[last()]/p[1]/span[1]/text()')
    ]

    for idx, title in enumerate(titles):
        yield title, authors[idx].strip(), dates[idx], publishers[idx], prices[idx]

# 3. Store

def write_to_mysql(data):
    """Insert book records into the ``dangdangbook`` table.

    Args:
        data: Iterable of 5-item rows, one value per ``%s`` placeholder in
            the INSERT statement. May be a lazy generator.

    The connection and cursor are always closed, even when the insert or
    the commit raises. Nothing is sent to the database when ``data`` is
    empty.
    """
    # Materialise the rows before connecting: if ``data`` is a generator,
    # any error raised while producing rows surfaces here instead of
    # leaving an open connection behind.
    rows = list(data)
    if not rows:
        return

    sql = 'insert into dangdangbook values (null,%s,%s,%s,%s,%s)'

    # Keyword arguments are required by PyMySQL 1.0+; positional
    # host/user/password/database are no longer accepted.
    # NOTE(review): credentials are hard-coded in source -- consider moving
    # them to configuration or environment variables.
    # NOTE(review): the database name looks like "schema.table"; confirm
    # whether it should be just the schema name.
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='jxk8410638',
        database='myschool.dangdangbook',
    )
    try:
        with conn.cursor() as cursor:
            cursor.executemany(sql, rows)
        conn.commit()
    finally:
        conn.close()
# 4. Pagination
def main(index):
    """Fetch, parse and store one page.

    Args:
        index: Page number; its string form is appended to the base URL.

    A page whose download fails is skipped, so one bad request does not
    abort the whole run.
    """
    # Rebuild the URL for this page.
    # NOTE(review): the index is appended directly after "sys_id=1", so
    # index 2 produces "sys_id=12" -- confirm this is the intended paging
    # parameter.
    url = 'http://book.dangdang.com/?_utm_brand_id=11106&_ddclickunion=460-5-biaoti|ad_type=0|sys_id=1' + str(index)
    html = get_one_page(url)
    if html is None:
        # get_one_page already printed the request error; there is nothing
        # to parse or store for this page.
        return
    data = parse_one_page(html)
    write_to_mysql(data)
# Process pages 1 through 25 in order, reporting progress after each one.
for page_no in range(1, 26):
    main(page_no)
    print('当当网图书畅销榜图书商品信息已抓取%d页' % page_no)


# The data stored in MySQL is shown below: [image description]