爬取中国大学排名实例
import requests
from bs4 import BeautifulSoup
import bs4
import re
def getHTMLText(url):
    """Fetch *url* and return the response body as text.

    The request is sent with a browser-like ``user-agent`` header and a
    30 second timeout.  The response encoding is replaced by the encoding
    detected from the body before the text is decoded.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or ``""`` if the request fails or the server
        answers with an HTTP error status.
    """
    kv = {"user-agent": "Mozilla/5.0"}
    try:
        r = requests.get(url, headers=kv, timeout=30)
        # Turn 4xx/5xx responses into an exception so they take the "" path.
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: report network/HTTP failures as an empty page,
        # but let KeyboardInterrupt and programming errors propagate.
        return ""
def _strip_markup(fragment):
    """Return *fragment* with HTML comments, tags, spaces and newlines removed."""
    # Non-greedy / negated-class patterns so that text sitting between two
    # comments or two tags on the same line is not swallowed.
    text = re.sub(r"<!--.*?-->", "", fragment, flags=re.S)
    text = re.sub(r"<[^>]*>", "", text)
    return text.replace(" ", "").replace("\n", "")


def fillUnivList(ulist, html):
    """Append ``[rank, name, score]`` for every table row found in *html*.

    Rows are the tag children of the first ``<tbody>``.  For each row the
    rank is taken from cell 0, the name from the first ``<a>`` in cell 1 and
    the score from cell 4.  Cells may contain comments or nested markup, so
    they are serialised to a string and cleaned with regular expressions
    rather than read through ``.string``.

    Args:
        ulist: List that receives one ``[rank, name, score]`` list of strings
            per row; it is modified in place.
        html: Page source to parse.  An empty string, or a page without a
            ``<tbody>``, leaves *ulist* untouched.

    Returns:
        None.
    """
    if not html:
        # Nothing was downloaded, so there is nothing to parse.
        return

    soup = BeautifulSoup(html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:
        return

    for tr in tbody.children:
        # Skip the whitespace text nodes that sit between the <tr> tags.
        if not isinstance(tr, bs4.element.Tag):
            continue

        tds = tr.find_all('td')
        if len(tds) < 5:
            # Not a data row (cells 0, 1 and 4 are required).
            continue

        rank = _strip_markup(str(tds[0]))

        anchor = tds[1].find('a')
        raw_name = anchor.string if anchor is not None else None
        if raw_name is None:
            # No link, or the link has several children: fall back to the
            # cleaned text of the whole cell.
            name = _strip_markup(str(tds[1]))
        else:
            name = raw_name.replace(" ", "").replace("\n", "")

        score = _strip_markup(str(tds[4]))

        ulist.append([rank, name, score])
def printUnivList(ulist, num):
    """Print a header line followed by at most *num* rows of *ulist*.

    Each row is printed as three tab-separated columns, centred in a width
    of 10.  The middle column is padded with ``chr(12288)`` (the full-width
    ideographic space) instead of an ASCII space.

    Args:
        ulist: Sequence of rows; items 0, 1 and 2 of each row are printed.
        num: Maximum number of rows to print.  If *ulist* is shorter, only
            the available rows are printed; a non-positive value prints the
            header only.

    Returns:
        None.
    """
    # Field 3 of the format call supplies the fill character of column 1.
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    fill = chr(12288)
    print(tplt.format("排名", "学校名称", "总分", fill))
    # Slice instead of indexing range(num) so a short list cannot raise
    # IndexError; max() keeps a negative num from slicing from the end.
    for u in ulist[:max(num, 0)]:
        print(tplt.format(u[0], u[1], u[2], fill))
def main():
    """Download the 2020 ranking page, parse it and print the first 20 rows."""
    ranking_url = "https://www.shanghairanking.cn/rankings/bcur/2020"
    universities = []
    # Fetch the page, then collect the rows into the list in place.
    page_source = getHTMLText(ranking_url)
    fillUnivList(universities, page_source)
    printUnivList(universities, 20)
# Run the scraper only when this file is executed as a script, so that
# importing the module does not trigger a network request.
if __name__ == "__main__":
    main()
其中与慕课上的示例稍有变动：由于网页在 td 标签中加入了注释等额外内容，因此不能直接用 .string 获得标签中的信息。这里先将标签转化为字符串类型，然后运用 re.sub() 对字符串进行处理，最后得到所需要的结果。

京公网安备 11010502036488号