文章目录
内容概览
数据下载
链接:https://pan.baidu.com/s/16kWSQ4OfNE3w3nSW4M2lyA
提取码:tjc9
源代码及分析
import pandas as pd
#州名的简写
abb = pd.read_csv('state-abbrevs.csv')
#人口的数据
pop = pd.read_csv('state-population.csv')
#州的面积
are = pd.read_csv('state-areas.csv')
abb.head()
pop.head()
cols = pop.columns.tolist()
cols[0] = 'state_region'
pop.columns = cols
pop.shape
are.head()
将abbrevs和population合并
abb_pop = pd.merge(abb,pop,left_on='abbreviation',right_on='state_region',how='outer')
abb_pop.drop(labels='abbreviation',axis=1,inplace=True)
abb_pop.head()
要检查空数据
abb_pop.isnull().sum()
查找州名为空的数据
#查找州名为空的行号
inds = abb_pop.state.isnull()
#获取州名为空的数据集
filter_data = abb_pop.loc[inds]
#提取州名简写
#unique 获取唯一值
filter_data.state_region.unique()
填补简写为PR的
#先抽取简写为PR的数据行号
pr_inds = abb_pop.query("state_region=='PR'").index
reple = abb_pop.loc[pr_inds]
reple['state'] = 'Puerto Rico'
abb_pop.loc[pr_inds] = reple
填补简写为USA的
usa_inds = abb_pop.query("state_region=='USA'").index
reple_usa = abb_pop.loc[usa_inds]
reple_usa['state'] = 'USA'
abb_pop.loc[usa_inds] = reple_usa
人口数据为空的行号
indexs = abb_pop.population.isnull()
filter_data = abb_pop.loc[indexs]
#PR2000年的数据
under18 = abb_pop.query("state_region=='PR' & ages=='under18' & year==2000").population.iloc[0]
total = abb_pop.query("state_region=='PR' & ages=='total' & year==2000").population.iloc[0]
un18_data = filter_data.query("ages=='under18'")
un18_data.population = under18
abb_pop.loc[un18_data.index] = un18_data
total_data = filter_data.query("ages=='total'")
total_data.population = total
abb_pop.loc[total_data.index] = total_data
合并面积的df
abb_pop_are = pd.merge(abb_pop,are,how='outer')
cols = abb_pop_are.columns.tolist()
cols[-1] = 'area'
abb_pop_are.columns = cols
填充面积
are_inds = abb_pop_are.area.isnull()
state = [x for x in abb_pop_are.state.unique() if x not in ['USA']]
#format python3.7
area = 0
for s in state:
area += abb_pop_are.query(f"state=='{s}'").iloc[0].area
fill_are = abb_pop_are.loc[are_inds]
fill_are.area = area
abb_pop_are.loc[are_inds] = fill_are
数据分析
2012 年各州成年人口和未成年人口比例
#2012 年各州成年人口和未成年人口比例
under18 = abb_pop_are.query("ages=='under18' & year==2012").population
total = abb_pop_are.query("ages=='total' & year==2012").population
state = abb_pop_are.query("ages=='under18' & year==2012").state
#求成年人口数量
gt18 = np.array(total) - np.array(under18)
#求为成年人的人口比例
u = np.divide(np.array(under18),np.array(total))
#求成年人的人口比例
g = np.divide(gt18,np.array(total))
#合并成透视表
#把多个一维数组合并成一个二维数组
result = pd.DataFrame(np.c_[u,g],columns=['lt18','gt18'],index=state)
result
import matplotlib.pyplot as plt
#堆叠图
result.plot(kind='barh',stacked=True,figsize=(10,18))