Codeforces(cf)的题目里有很多 LaTeX 公式,而且公式是用 '$$$' 三个符号标记的,所以复制题目写博客的时候很不方便。于是写了一个爬虫,用来保存一场比赛中所有题目的信息。
# -*- coding:utf-8 -*-
import os
import requests
from bs4 import BeautifulSoup
# Markdown output file shared by the writer functions below. UTF-8 is forced
# because the generated text contains non-ASCII (Chinese) headings, which a
# locale-dependent default encoding may be unable to represent.
f = open('blog.md', 'w', encoding='utf-8')
# Requested math delimiter style: 0 keeps '$$$', 1 means '$$', 2 means '$'.
# Overwritten by main() from user input.
Latextag = 0
def GetHtmlText(url):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text, or an empty string when the request fails
        (connection problem, timeout, invalid URL or an HTTP error status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: only request-level failures are turned into an
        # empty result; unrelated errors (and Ctrl-C) still propagate.
        return ""
    # Force UTF-8 rather than relying on the encoding guessed from headers.
    r.encoding = 'utf-8'
    return r.text
def Clear(text, tag=None):
    """Rewrite the ``$$$`` math delimiters in *text* to the requested style.

    Args:
        text: Text that may contain ``$$$``-delimited formulas.
        tag: Delimiter style: 0 keeps ``$$$``, 1 produces ``$$``, 2 produces
            ``$``. When omitted, the module-level ``Latextag`` is used.

    Returns:
        The text with every ``$$$`` replaced by the requested delimiter. The
        text is returned unchanged for style 0 or an unrecognised style.
    """
    if tag is None:
        tag = Latextag
    if tag not in (1, 2):
        return text
    # One non-overlapping left-to-right pass: it always terminates, and two
    # delimiters that touch ("$$$$$$") are each shortened independently
    # instead of being collapsed into a single delimiter.
    return text.replace('$$$', '$' * (3 - tag))
def FindInfo(soup, url):
    """Write one problem from a parsed page to the output file ``f`` as Markdown.

    Emits the title, the description, input and output sections (with math
    delimiters rewritten by ``Clear``), every sample input/output pair, a link
    back to ``url`` and an empty code fence. When the statement container is
    missing or has fewer nested divs than expected (for example because the
    download failed and the page is empty), a message is printed and nothing
    is written for this problem.

    Args:
        soup: BeautifulSoup tree of the problem page.
        url: Address of the problem, used for the link line.
    """
    # The attribute filter must be a dict. A set literal such as
    # {'class', 'x'} would be treated as "class is either 'class' or 'x'".
    holder = soup.find('div', {'class': 'problemindexholder'})
    divs = holder.find_all('div') if holder is not None else []
    if len(divs) < 16:
        print('Skipping %s: problem statement not found on the page' % url)
        return

    # NOTE(review): the positional indices below depend on the exact div
    # nesting of the problem page -- confirm against the current layout.
    f.write('# %s\n' % divs[3].get_text())

    # The [5:] / [6:] slices drop a fixed-length prefix of the section text,
    # presumably the "Input" / "Output" caption -- TODO confirm.
    sections = (
        ('## Description:\n', divs[12].get_text()),
        ('## Input:\n', divs[13].get_text()[5:]),
        ('## Output\n', divs[15].get_text()[6:]),
    )
    for heading, body in sections:
        f.write('%s\n' % Clear(heading + body))

    sample = soup.find('div', {'class': 'sample-test'})
    if sample is not None:
        sample_inputs = sample.find_all('div', {'class': 'input'})
        sample_outputs = sample.find_all('div', {'class': 'output'})
        # zip pairs each input with its output and stops at the shorter list
        # instead of raising IndexError on a mismatch.
        for sample_in, sample_out in zip(sample_inputs, sample_outputs):
            f.write('## Sample Input:\n%s\n' % sample_in.get_text()[5:])
            f.write('## Sample Output:\n%s\n' % sample_out.get_text()[6:])

    f.write('### [题目链接](%s)\n\n' % url)
    f.write('## AC代码:\n```\n```\n')
def main():
    """Prompt for the settings, then scrape every requested problem.

    Reads the delimiter style, the contest URL and the whitespace-separated
    problem ids from standard input, stores the style in the module-level
    ``Latextag``, and for each problem downloads the page, parses it and
    passes it to ``FindInfo``. The output file ``f`` is closed on exit, even
    when an error interrupts the run.

    Raises:
        ValueError: If the delimiter style is not an integer in 0..2.
    """
    global Latextag
    print('Welcome to use codeforces contest crawler\n')
    try:
        Latextag = int(input("Please enter the Latex tag you need(0:'$$$',1:'$$',2:'$'):\n"))
        if Latextag not in (0, 1, 2):
            raise ValueError('Latex tag must be 0, 1 or 2, got %d' % Latextag)

        contest_url = input("请输入比赛链接(eg:'http://codeforces.com/contest/1003'):\n")
        # Drop surrounding blanks and a trailing slash so the joined URL
        # never contains "//problem/".
        base_url = contest_url.strip().rstrip('/') + '/problem/'

        # split() without an argument ignores repeated or trailing spaces,
        # which would otherwise produce empty problem ids.
        problem_ids = input('请输入比赛题目编号(eg:A B C D E F):\n').split()

        for problem_id in problem_ids:
            url = base_url + problem_id
            print(url)
            # Turn line-break markup into real newlines before the tags are
            # stripped by get_text().
            html = GetHtmlText(url).replace('<br />', '\n').replace('</p>', '\n')
            soup = BeautifulSoup(html, "html.parser")
            FindInfo(soup, url)
    finally:
        f.close()
# Run the crawler only when this file is executed as a script, not on import.
if "__main__" == __name__:
    main()