'''Scrape the eastmoney fund site (天天基金网) for every fund manager and the funds each one manages.
URL: http://fund.eastmoney.com/Data/FundDataPortfolio_Interface.aspx?dt=14&mc=returnjson&ft=all&pn=50&pi=1&sc=abbname&st=asc
Key point: plain GET requests.
Flow: the first pass collects each manager's ID (namenumber) from the list interface; the second pass visits each manager's page and scrapes the funds under management.
Result file: g:\\fund.csv
'''
import requests
import json
from bs4 import BeautifulSoup

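# The list interface below is assumed to return JavaScript of roughly this shape
# (keys unquoted, hence the string replacements before json.loads):
#   var returnjson={data:[["<manager id>","<manager name>",...], ...],record:...,pages:...,curpage:...}
# Treating row[0] as the manager ID and row[1] as the manager name mirrors the indexing
# in the original code; it is an assumption about the interface, not a documented contract.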
def get_singlepage_namenum(page):
	'''First pass: fetch one page of the manager list and return (manager ID, manager name) pairs.'''
	namenumber = []
	url_1 = f'http://fund.eastmoney.com/Data/FundDataPortfolio_Interface.aspx?dt=14&mc=returnjson&ft=all&pn=50&pi={page}&sc=abbname&st=asc' # target of the first-pass crawl
	rawhtml = requests.get(url_1)
	# Quote the bare keys so the JavaScript payload becomes valid JSON, then parse it into a dict.
	listhtmltext = json.loads(rawhtml.text.replace("var returnjson=","").replace("data",'"data"').replace('record','"record"').replace('pages','"pages"').replace("curpage",'"curpage"'))
	for row in listhtmltext['data']:
		namenumber.append((row[0], row[1])) # (manager ID, manager name): the bridge to the second-pass crawl
	return namenumber

def get_totalinfo(namenumbers):
	'''Second pass: for each (manager ID, manager name) pair, scrape the funds that manager runs.'''
	info = []
	for unitnamenum, managername in namenumbers:
		res = requests.get(f'http://fund.eastmoney.com/manager/{unitnamenum}.html')
		res.encoding = 'utf8' # fix mojibake in the response
		#text = res.text.encode('ISO-8859-1').decode('gb18030') # alternative mojibake fix
		soupunit = BeautifulSoup(res.text, "lxml")
		# The second <tbody> on the manager page is taken to be the "funds under management" table,
		# and the first <td> of each row its fund code, following the original indexing.
		for unitfund in soupunit.find_all("tbody")[1].find_all("tr"):
			fundcode = unitfund.find_all("td")[0].text.replace("\n","").replace("\t","")
			info.append([managername, fundcode]) # one row per (manager name, fund code)
	return info
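# Example usage for a single page, assuming the interface and page layout described above:
#   rows = get_totalinfo(get_singlepage_namenum(1))
#   # rows would look like [['<manager name>', '<fund code>'], ...], one row per managed fund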

with open('g:\\fund.csv','w',encoding='utf8') as f:
	for page in range(1,43): # 42 list pages, 50 managers per page (pn=50)
		for unitlistinfo in get_totalinfo(get_singlepage_namenum(page)):
			f.write(','.join(unitlistinfo)+'\n') # managername,fundcode
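# A minimal alternative sketch for the write step, using the standard-library csv module
# (assuming the same functions above); csv.writer quotes fields, which matters if a
# manager or fund name ever contains a comma:
#
#	import csv
#	with open('g:\\fund.csv', 'w', encoding='utf8', newline='') as f:
#		writer = csv.writer(f)
#		for page in range(1, 43):
#			writer.writerows(get_totalinfo(get_singlepage_namenum(page)))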