'''Scrape the Tiantian Fund site (fund.eastmoney.com) for every fund manager and the funds each one manages.
URL: http://fund.eastmoney.com/Data/FundDataPortfolio_Interface.aspx?dt=14&mc=returnjson&ft=all&pn=50&pi=1&sc=abbname&st=asc
Key point: plain GET requests.
Flow: first scrape the manager ids ("namenumber") from the list pages, then for each namenumber scrape that manager's page for the funds they run.
Result file: g:\\classfund.csv
'''
import requests
import json
from bs4 import BeautifulSoup
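
# Note: judging from the keys quoted in get_singlepage_namenum() below ("data",
# "record", "pages", "curpage"), the list endpoint is assumed to answer with a
# JavaScript assignment roughly of the form
#   var returnjson= {data:[[<manager id>,<manager name>,...],...],record:...,pages:...,curpage:...}
# (the exact field order is an assumption); the replace() calls below only turn
# that string into parseable JSON before handing it to json.loads.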

def get_singlepage_namenum(page):
    '''First-level scrape: return (manager id, manager name) pairs from one list page.'''
    namenumber = []
    url_1 = f'http://fund.eastmoney.com/Data/FundDataPortfolio_Interface.aspx?dt=14&mc=returnjson&ft=all&pn=50&pi={page}&sc=abbname&st=asc'  # first-level target URL
    rawhtml = requests.get(url_1)
    # Strip the JavaScript prefix and quote the bare keys so the string parses as JSON.
    listhtmltext = json.loads(rawhtml.text.replace("var returnjson=", "")
                              .replace("data", '"data"')
                              .replace("record", '"record"')
                              .replace("pages", '"pages"')
                              .replace("curpage", '"curpage"'))
    for row in listhtmltext['data']:
        namenumber.append((row[0], row[1]))  # manager id (the link to the second-level scrape) and manager name
    return namenumber
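
# Quick check of the first-level scrape (a sketch, run by hand before the full crawl):
#   >>> get_singlepage_namenum(1)[:3]
# should print the first three (manager id, manager name) pairs of page 1.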

def get_totalinfo(page):
    '''Second-level scrape: for every manager on the given list page, collect [manager name, fund code] rows.'''
    info = []
    for unitnamenum, managername in get_singlepage_namenum(page):
        res = requests.get(f'http://fund.eastmoney.com/manager/{unitnamenum}.html')
        res.encoding = 'utf8'  # fix garbled Chinese text
        # text = res.text.encode('ISO-8859-1').decode('gb18030')  # alternative fix for the garbled text
        soupunit = BeautifulSoup(res.text, "lxml")
        for unitfund in soupunit.find_all("tbody")[1].find_all("tr"):
            fundcode = unitfund.find_all("td")[0].text.replace("\n", "").replace("\t", "")  # fund code
            info.append([managername, fundcode])
    return info

# Open the result file once, then walk all 42 list pages (50 managers per page).
with open('g:\\classfund.csv', 'w', encoding='utf8') as f:
    for page in range(1, 43):
        for unitlistinfo in get_totalinfo(page):
            f.write(','.join(unitlistinfo) + '\n')
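
# Optional sanity check (a minimal sketch; it assumes the CSV layout written above,
# one "manager name,fund code" pair per line): count records and distinct managers.
import csv
with open('g:\\classfund.csv', encoding='utf8') as f:
    rows = list(csv.reader(f))
print(len(rows), 'fund records for', len({row[0] for row in rows}), 'managers')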