数新的同学提问,刚好回忆一下大一小学期学习的爬虫(水一篇博客)。
  与网上众多豆瓣Top250爬虫教程不同的是,此次需要爬取更多的字段(处理起来更麻烦一些)。
  首先豆瓣是有反爬虫措施的(形同没有,带上浏览器的User-Agent请求头即可),之后用requests库和xpath轻轻松松爬到所有的信息,最后就是数据清洗。注意由于内容涉及到中文字符,全文需以utf-8编码处理;同时页面中的不间断空格(&nbsp;)会被解析为\xa0(事儿多,直接去掉),其余就是无聊的字符串处理了。
  很久没有写python了,基本语法记得还可以,水水这些简单的任务还是可行的。

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from lxml import etree
import requests
import json
# Browser-like User-Agent sent with every page request made by getRequests().
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/67.0.3396.99 Safari/537.36"
    ),
}

# One dict per scraped movie; filled by getRequests() and dumped by output().
movie_list = []


def _first(nodes, default=""):
    """Return the first XPath result, or *default* when the list is empty."""
    return nodes[0] if nodes else default


def _join_other_titles(titles):
    """Return every title after the first one, cleaned and space-terminated.

    Non-breaking spaces and ``/`` separators are removed from each title and
    a single space is appended after each, matching the original output
    format (including the trailing space).
    """
    return "".join(
        title.replace("\xa0", "").replace("/", "").strip() + " "
        for title in titles[1:]
    )


def _extract_director(credit_line):
    """Pull the director name(s) out of the first credits text node.

    The parsing expects text shaped like ``导演: <names> 主演: <names>``:
    everything from the first ``主`` onward is discarded, and what follows
    the first ``:`` of the remainder is returned. Unlike a per-character
    colon toggle, a second ``:`` inside the director part does not cause the
    rest of the name to be dropped. Returns ``""`` when there is no ``:``.
    """
    flattened = credit_line.replace("/", " ").replace("\n", "").strip()
    before_cast = flattened.partition("主")[0]
    return before_cast.partition(":")[2].strip()


def _split_meta(meta_line):
    """Split the ``year / country / genre`` line into three stripped strings.

    The split is done from the right so that a year field which itself
    contains ``/`` (several release years) stays in one piece instead of
    shifting country and genre into the wrong slots. Missing trailing
    fields are returned as empty strings rather than raising ``IndexError``.
    """
    cleaned = meta_line.replace("\xa0", " ").replace("\n", "").strip()
    parts = [part.strip() for part in cleaned.rsplit("/", 2)]
    parts += [""] * (3 - len(parts))
    return parts


def _parse_item(info):
    """Convert one ``<div class="item">`` element into a movie dict."""
    titles = info.xpath("div[2]/div[1]/a/span/text()")  # movie titles
    directors = info.xpath("div[2]/div[2]/p[1]/text()[1]")  # credits line
    meta = info.xpath("div[2]/div[2]/p[1]/text()[2]")  # year/country/genre
    stars = info.xpath("div[2]/div[2]/div/span[2]/text()")  # rating score
    starpeople = info.xpath("div[2]/div[2]/div/span[4]/text()")  # vote count
    details = info.xpath("div[2]/div[2]/p[2]/span/text()")  # one-line quote

    year, country, classify = _split_meta(_first(meta))
    return {
        "title": {
            "chinese": _first(titles),
            "others": _join_other_titles(titles),
        },
        "director": _extract_director(_first(directors)),
        "year": year,
        "country": country,
        "classify": classify,
        "rating": {
            "num": _first(stars),
            "people": _first(starpeople).replace("人评价", ""),
        },
        # Some entries have no quote; fall back to an empty string.
        "quote": _first(details),
    }


def getRequests():
    """Scrape the Douban Top 250 pages into the module-level ``movie_list``.

    Requests the ten list pages (``start`` = 0, 25, ..., 225) with the
    module-level ``headers``, parses every ``div`` with class ``item`` and
    appends one dict per movie to ``movie_list``. Nothing is returned.

    Raises:
        requests.RequestException: on a network error, a timeout, or a
            non-success HTTP status (raised explicitly so that a blocked
            request does not silently produce an empty result).
    """
    for start in range(0, 250, 25):
        url = "https://movie.douban.com/top250?start={}".format(start)
        # A timeout keeps a stalled connection from hanging the script.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        html = etree.HTML(response.text)
        # Every movie on the page lives in its own "item" container.
        for info in html.xpath("//div[@class='item']"):
            movie_list.append(_parse_item(info))


def output():
    """Write the collected movies to ``豆瓣TOP250电影.json`` and print ``end``.

    The module-level ``movie_list`` is wrapped as ``{"data": [...]}`` and
    serialized with ``ensure_ascii=False`` so non-ASCII text is written
    as-is in the UTF-8 encoded file.
    """
    with open("豆瓣TOP250电影.json", "w+", encoding="UTF-8") as out_file:
        payload = {"data": movie_list}
        serialized = json.dumps(payload, ensure_ascii=False)
        out_file.write(serialized)
    print("end")


# Script entry point: scrape every list page first, then write the JSON file.
if __name__ == '__main__':
    getRequests()
    output()