文章目录

内容概览
数据下载
源代码及分析

内容概览

数据下载

链接：https://pan.baidu.com/s/16kWSQ4OfNE3w3nSW4M2lyA
提取码：tjc9

源代码及分析

import pandas as pd

#州名的简写
abb = pd.read_csv('state-abbrevs.csv')
#人口的数据
pop = pd.read_csv('state-population.csv')
#州的面积
are = pd.read_csv('state-areas.csv')

abb.head()

pop.head()

 cols = pop.columns.tolist()
cols[0] = 'state_region'
pop.columns = cols

pop.shape

are.head()

将abbrevs和population合并

abb_pop = pd.merge(abb,pop,left_on='abbreviation',right_on='state_region',how='outer')

abb_pop.drop(labels='abbreviation',axis=1,inplace=True)

abb_pop.head()

要检查空数据

abb_pop.isnull().sum()

查找州名为空的数据

#查找州名为空的行号
inds = abb_pop.state.isnull()

#获取州名为空的数据集
filter_data = abb_pop.loc[inds]

#提取州名简写
#unique 获取唯一值
filter_data.state_region.unique()

填补简写为PR的

#先抽取简写为PR的数据行号
pr_inds = abb_pop.query("state_region=='PR'").index

reple = abb_pop.loc[pr_inds]
reple['state'] = 'Puerto Rico'

abb_pop.loc[pr_inds] = reple

填补简写为USA的

usa_inds = abb_pop.query("state_region=='USA'").index

reple_usa = abb_pop.loc[usa_inds]
reple_usa['state'] = 'USA'

abb_pop.loc[usa_inds] = reple_usa

人口数据为空的行号

indexs = abb_pop.population.isnull()

filter_data = abb_pop.loc[indexs]

#PR2000年的数据
under18 = abb_pop.query("state_region=='PR' & ages=='under18' & year==2000").population.iloc[0]
total = abb_pop.query("state_region=='PR' & ages=='total' & year==2000").population.iloc[0]

un18_data = filter_data.query("ages=='under18'")
un18_data.population = under18
abb_pop.loc[un18_data.index] = un18_data

total_data = filter_data.query("ages=='total'")
total_data.population = total
abb_pop.loc[total_data.index] = total_data

合并面积的df

abb_pop_are = pd.merge(abb_pop,are,how='outer')

cols = abb_pop_are.columns.tolist()
cols[-1] = 'area'
abb_pop_are.columns = cols

填充面积

are_inds = abb_pop_are.area.isnull()

state = [x for x in abb_pop_are.state.unique() if x not in ['USA']]

#format python3.7
area = 0
for s in state:
    area += abb_pop_are.query(f"state=='{s}'").iloc[0].area

fill_are = abb_pop_are.loc[are_inds]
fill_are.area = area
abb_pop_are.loc[are_inds] = fill_are

数据分析

2012 年各州成年人口和未成年人口比例

#2012 年各州成年人口和未成年人口比例
under18 = abb_pop_are.query("ages=='under18' & year==2012").population
total = abb_pop_are.query("ages=='total' & year==2012").population
state = abb_pop_are.query("ages=='under18' & year==2012").state

#求成年人口数量
gt18 = np.array(total) - np.array(under18)

#求为成年人的人口比例
u = np.divide(np.array(under18),np.array(total))

#求成年人的人口比例
g = np.divide(gt18,np.array(total))

#合并成透视表
#把多个一维数组合并成一个二维数组
result = pd.DataFrame(np.c_[u,g],columns=['lt18','gt18'],index=state)
result

import matplotlib.pyplot as plt

#堆叠图
result.plot(kind='barh',stacked=True,figsize=(10,18))

【数据挖掘重要笔记day16】美国各州人口的数据以及州的面积相关的数据预处理及分析+缺失值处理

文章目录

内容概览

数据下载

源代码及分析

将abbrevs和population合并

要检查空数据

查找州名为空的数据

填补简写为PR的

填补简写为USA的

人口数据为空的行号

合并面积的df

填充面积

数据分析

2012 年各州成年人口和未成年人口比例