还没有爬具体题目信息,待更新。

# -*- coding:utf-8 -*-

# 爬取HDU题目保存到本地Excel表格
import requests
from bs4 import BeautifulSoup
import xlsxwriter

# 保存题目信息二维数组
AllProblem = []
# 创建Excel文件"HDUproblem.xlsx"
workxlsx = xlsxwriter.Workbook('HDUproblem.xlsx')
# 创建一个表单
worksheet = workxlsx.add_worksheet()
# 设置表格格式
worksheet.set_column(1, 1, 74)
ratio_format = workxlsx.add_format({'num_format':'## %'})
title_format = workxlsx.add_format({'bold':True, 'align': 'center', 'font_size': 20, 'font_color': 'red'})
head_format = workxlsx.add_format({'bold':True})
worksheet.merge_range(0, 0, 0, 2, 'HDU Online Judge Problem', title_format)

# 获取网页文本源代码
def GetHtmlText(url):
    try:
        r = requests.get(url)
        r.raise_for_status()
        # r.encoding = 'utf-8'
        return r.text
    except:
        return ''

# 使用BeautifulSoup根据树状标签寻找需要爬取的信息
def FillProblemList(soup):
    OneProblemList = []
    # 题目信息在这里保存
    table = soup.find('table', {'class': {'table_text'}})
    ProblemList = table.get_text()
    ProblemList = ProblemList[47:]
    TempOnePageProblemList = []
    # 将每道题的信息分开保存
    TempOnePageProblemList = ProblemList.split(';')
    # HDU每页并不是100道题,中间会少几道
    PageProblemNum = len(TempOnePageProblemList)
    for Problem in range(PageProblemNum - 1):
        TempOneProblemList = TempOnePageProblemList[Problem][4:-1]
        # 将题目每个信息分开
        OneProblemList = TempOneProblemList.split(',')
        OneProblemList[2] = OneProblemList[2][1:-1]
        # 循环整理数据(计算通过率),题目名称有可能含有未知个数','而被分开
        flag = 1
        while flag:
            try:
                OneProblemList[3] = float(OneProblemList[3]) / float(OneProblemList[4])
                flag = 0
            except:
                OneProblemList[2] = OneProblemList[2] + OneProblemList[3]
                del OneProblemList[3]
                try:
                    OneProblemList[3] = float(OneProblemList[3]) / float(OneProblemList[4])
                    flag = 0
                except:
                    pass
        OneProblemList.pop()
        AllProblem.append(OneProblemList)

def main(pages):
    # 表头
    SheetHead = ['题号', '', '题目', '正确率']
    headcol = 0
    for head in range(len(SheetHead)):
        if head == 1:
            continue
        worksheet.write(1, headcol, SheetHead[head], head_format)
        headcol += 1
    # 截止到2018年6月14日共53页题目
    for page in range(1, pages + 1):
        url = 'http://acm.hdu.edu.cn/listproblem.php?vol=' + str(page)
        html = GetHtmlText(url)
        soup = BeautifulSoup(html, 'html.parser')
        FillProblemList(soup)
    # 将信息写入表格
    row = 2
    for Problem in AllProblem:
        col = 0;
        for i in range(0, len(Problem)):
            if i == 1:
                continue
            elif i == 3:
                worksheet.write(row, col, Problem[i])
            worksheet.write(row, col, Problem[i], ratio_format)
            col += 1
        row += 1
    workxlsx.close()

if __name__ == '__main__':
    main(53)

运行结果: