Codeforces(cf)的题目中有很多 LaTeX 公式,而且是用 '$$$' 三个美元符号作为标记的,所以复制题目写博客的时候很不方便,于是写了一个爬虫来保存一场比赛中的所有题目信息。

# -*- coding:utf-8 -*-

import os
import requests
from bs4 import BeautifulSoup

# Output Markdown file shared by the whole script: FindInfo writes to it and
# main closes it. The encoding is pinned to UTF-8 because the generated text
# contains non-ASCII characters, which the platform default encoding may not
# be able to represent.
f = open('blog.md', 'w', encoding='utf-8')

# LaTeX delimiter style selected by the user in main():
# 0 keeps '$$$', 1 converts it to '$$', 2 converts it to '$'.
Latextag = 0

def GetHtmlText(url):
    """Download *url* and return the response body decoded as UTF-8.

    Returns an empty string when the request fails (connection problem,
    timeout after 30 seconds, invalid URL, or an HTTP error status reported
    by ``raise_for_status``), so callers can treat "" as "page unavailable".
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: only network/HTTP failures are turned into "".
        # Anything else (KeyboardInterrupt, programming errors) propagates.
        return ""
    # Force UTF-8 instead of trusting the charset guessed from the headers.
    r.encoding = 'utf-8'
    return r.text

def Clear(text, tag=None):
    """Rewrite the '$$$' LaTeX delimiters found in *text*.

    Args:
        text: The text to convert.
        tag: Delimiter style. 0 keeps '$$$', 1 converts every '$$$' to '$$',
            2 converts every '$$$' to '$'. When omitted, the module-level
            ``Latextag`` setting is used.

    Returns:
        The converted text. The text is returned unchanged for tag 0 and for
        any tag value outside 0-2.
    """
    if tag is None:
        tag = Latextag
    delimiter = {1: '$$', 2: '$'}.get(tag)
    if delimiter is None:
        # Nothing to rewrite. Returning here also avoids looping forever on
        # text that contains '$$$' when no conversion is requested.
        return text
    # A single pass replaces each marker exactly once, so two markers that
    # touch each other are converted independently instead of being merged.
    return text.replace('$$$', delimiter)

def FindInfo(soup, url):
    """Append one problem statement from *soup* to the output file as Markdown.

    Writes, in order: the title, the description, the input and output
    specifications (each passed through ``Clear``), every sample
    input/output pair, a link back to *url*, and an empty code fence for the
    solution.

    Args:
        soup: Parsed problem page (a BeautifulSoup document).
        url: Address of the problem page, used for the link in the output.

    If the page has no 'problemindexholder' container, a message is printed
    and nothing is written for this problem.
    """
    # The attribute filter must be a dict; a set literal {'class', 'x'} is
    # not an attribute filter and only matched by accident.
    AllInfo = soup.find('div', {'class': 'problemindexholder'})
    if AllInfo is None:
        print('No problem statement found at %s, skipped.' % url)
        return

    divs = AllInfo.find_all('div')
    # NOTE(review): the positional indexes below depend on the exact nesting
    # of <div> elements in the problem page; verify them if the site layout
    # changes.
    title = '# ' + divs[3].get_text()
    f.write('%s\n' % title)

    problem = '## Description:\n' + divs[12].get_text()
    f.write('%s\n' % Clear(problem))

    # The [5:] / [6:] slices drop the first 5 / 6 characters of the section
    # text, presumably the "Input" / "Output" heading; confirm against a page.
    Input = '## Input:\n' + divs[13].get_text()[5:]
    f.write('%s\n' % Clear(Input))

    Output = '## Output\n' + divs[15].get_text()[6:]
    f.write('%s\n' % Clear(Output))

    Sample = soup.find('div', {'class': 'sample-test'})
    if Sample is not None:
        SampleInputs = Sample.find_all('div', {'class': 'input'})
        SampleOutputs = Sample.find_all('div', {'class': 'output'})
        # zip pairs each sample input with its output and stops at the
        # shorter list instead of raising IndexError on a mismatch.
        for SampleInput, SampleOutput in zip(SampleInputs, SampleOutputs):
            f.write('## Sample Input:\n%s\n' % SampleInput.get_text()[5:])
            f.write('## Sample Output:\n%s\n' % SampleOutput.get_text()[6:])

    f.write('### [题目链接](%s)\n\n' % url)
    f.write('## AC代码:\n```\n```\n')

def main():
    """Ask for the delimiter style, contest URL and problem ids, then crawl.

    For every problem id entered, the problem page is downloaded, parsed and
    handed to ``FindInfo``. Pages that cannot be downloaded are reported and
    skipped. The output file is closed when the loop ends, including when an
    exception interrupts it.
    """
    global Latextag
    print('Welcome to use codeforces contest crawler\n')

    # Keep asking until the answer is one of the three supported styles.
    while True:
        try:
            choice = int(input("Please enter the Latex tag you need(0:'$$$',1:'$$',2:'$'):\n"))
        except ValueError:
            choice = None
        if choice in (0, 1, 2):
            break
        print('Invalid choice, please enter 0, 1 or 2.')
    Latextag = choice

    Url = input("请输入比赛链接(eg:'http://codeforces.com/contest/1003'):\n")
    # Tolerate surrounding whitespace and a trailing slash in the pasted URL.
    Url = Url.strip().rstrip('/') + '/problem/'
    # split() without an argument ignores repeated spaces, so no empty
    # problem id is produced.
    Problem = input('请输入比赛题目编号(eg:A B C D E F):\n').split()

    try:
        for i in Problem:
            url = Url + i
            print(url)
            # Turn line-break markup into real newlines before the tags are
            # stripped by get_text().
            html = GetHtmlText(url).replace('<br />', '\n').replace('</p>', '\n')
            if not html:
                # GetHtmlText reports a failed download as "".
                print('Failed to download %s, skipped.' % url)
                continue
            soup = BeautifulSoup(html, "html.parser")
            FindInfo(soup, url)
    finally:
        f.close()

if __name__ == '__main__':
    main()

运行结果: