在家闲来无事,初入Python世界,尝试做一个爬虫项目,爬取糗事百科网站相关内容
下面直接干货

import requests
from lxml import etree
import json

class QiubaiSpider:
    """Crawl the paginated "baike" listing pages of cnxox.com.

    Workflow (see run()): build the 13 page URLs, download each page,
    extract per-article metadata with XPath, and append each record to
    ``qiubai.txt`` as one JSON object per line.
    """

    def __init__(self):
        # URL template; "{}" is filled with the page number (1-13).
        # (Renamed from the original typo "url_trmp".)
        self.url_temp = "http://www.cnxox.com/baike/p{}/"

    def get_url_list(self):
        """Return the list of all 13 listing-page URLs, in page order."""
        return [self.url_temp.format(i) for i in range(1, 14)]

    def parse_url(self, url):
        """GET *url* and return the decoded response body as text.

        A timeout keeps one unresponsive server from hanging the whole
        crawl forever (the original request had none).
        """
        response = requests.get(url, timeout=10)
        return response.content.decode()

    @staticmethod
    def _first_or_none(node, xpath):
        """Return the first XPath match relative to *node*, or None.

        Replaces the original repeated ``xpath(...)[0] if len(xpath(...))``
        pattern, which evaluated every XPath expression twice.
        """
        results = node.xpath(xpath)
        return results[0] if results else None

    def get_content_list(self, html_str):
        """Extract one metadata dict per <article> element on the page.

        Each dict holds: "title", "note" (list of text fragments),
        "read" count, "zan" (like) count and cover "img" URL. Missing
        scalar fields are None; a missing note is an empty list.
        """
        html = etree.HTML(html_str)
        content_list = []
        # One <article> per entry on the listing page.
        for article in html.xpath('//div[@class="content"]/article'):
            item = {}
            # BUG FIX: the original guard used a document-wide XPath
            # ('//h2/a/@title') while the indexed value used the
            # relative './/h2/a/@title', so a title elsewhere on the
            # page could trigger an IndexError for a title-less article.
            item["title"] = self._first_or_none(article, './/h2/a/@title')
            item["note"] = article.xpath('.//p[@class="note"]/text()')
            item["read"] = self._first_or_none(
                article, './/p[@class="text-muted views"]/span/text()')
            item["zan"] = self._first_or_none(
                article, './/p[@class="text-muted views"]/a/span/text()')
            item["img"] = self._first_or_none(
                article, './/p[@class="focus"]/a/span/span/img/@src')
            content_list.append(item)
        return content_list

    def save_content_list(self, content_list):
        """Append each record to qiubai.txt, one JSON object per line."""
        with open("qiubai.txt", 'a', encoding="utf-8") as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False))
                f.write('\n')
        print('保存成功')

    def run(self):
        """Main loop: crawl every listing page and persist the results."""
        for url in self.get_url_list():
            html_str = self.parse_url(url)
            content_list = self.get_content_list(html_str)
            # Progress feedback. (The original additionally re-printed the
            # whole list once per item — redundant output, removed.)
            print(content_list)
            self.save_content_list(content_list)
if __name__ == '__main__':
    # Script entry point: build the spider and crawl all pages.
    # (Fixed the nonstandard 5-space indent and the "quibai" typo.)
    qiubai = QiubaiSpider()
    qiubai.run()

运行结果:


总结:

书写爬虫脚本时,要按四步顺序书写代码:

  1. url
    • 知道url地址的规律和总的页码数:构造url地址的列表
    • start_url
  2. 发送请求获取响应
    • requests
  3. 提取数据
    • 返回json字符串:json模块
    • 返回的是html字符串:lxml模块配合xpath提取数据
  4. 保存数据