1. Data Cleaning and Processing with Pandas
- The following analysis uses the IMDB movie dataset as an example; start by looking at the basic information of the dataset:
import pandas as pd
movies_df = pd.read_csv("E:\\Code\\local_code\\DataSets\\movie_data\\movie_metadata.csv")
print("data-frame shape:", movies_df.shape)
print("columns names:", movies_df.columns.values)
- Handling missing data with DataFrame.isna() and DataFrame.fillna()
print("null values:\n",movies_df.isna()) # 查看哪些数据是缺失值
print(movies_df.isna().sum())
print("total null values:", movies_df.isna().sum().sum())
clean_movies_df = movies_df.dropna(how="all")  # drop only the rows in which every value is missing
print("new dataframe shape:", clean_movies_df.shape)
print("old dataframe shape:", movies_df.shape)
print(movies_df.fillna(value=0))  # fill every missing value with 0; fillna(..., inplace=True) returns None, so keep the result to print it
movies_df[['gross', 'budget']] = movies_df[['gross', 'budget']].fillna(value=0)  # fill selected numeric columns with 0
movies_df['language'] = movies_df['language'].fillna("no info")  # fill missing languages with a placeholder string
print(movies_df.head())
movies_df['language'] = movies_df['language'].ffill()  # alternative strategy: forward-fill from the previous non-missing row
movies_df["budget"] = movies_df["budget"].fillna(movies_df["budget"].mean())  # alternative strategy: fill with the column mean
duplicate_rows_df = movies_df[movies_df.duplicated()]  # rows that are exact duplicates of an earlier row
print("number of duplicate rows:", duplicate_rows_df.shape)
duplicate_rows_imdb_link = movies_df[movies_df.duplicated(["movie_imdb_link"])]  # duplicates judged only by the movie_imdb_link column
print(duplicate_rows_imdb_link.shape)
print(len(movies_df.movie_imdb_link.unique()))  # number of unique IMDB links
print('shape of dataframe after dropping duplicates', movies_df.drop_duplicates().shape)
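- drop_duplicates() also accepts subset and keep parameters, which is handy when the IMDB link alone should decide whether two rows are the same movie; a small sketch:
deduped_df = movies_df.drop_duplicates(subset=['movie_imdb_link'], keep='first')  # keep the first row for each unique IMDB link
print("shape after deduplicating on movie_imdb_link:", deduped_df.shape)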
op_labels = ['poor', 'moderate', 'good']
category = [0., 4., 7., 10.]  # bin edges for imdb_score
movies_df['imdb_labels'] = pd.cut(
    movies_df['imdb_score'], labels=op_labels, bins=category, include_lowest=False)
print(movies_df[['movie_title', 'imdb_score', 'imdb_labels']][209:220])
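- To see how the scores spread across the new bins, value_counts() on the label column is enough; pd.qcut() is a related option that bins by quantile instead of fixed edges (imdb_quartile below is a hypothetical column name added only for illustration):
print(movies_df['imdb_labels'].value_counts())  # number of movies in each label
movies_df['imdb_quartile'] = pd.qcut(movies_df['imdb_score'], q=4,
                                     labels=['q1', 'q2', 'q3', 'q4'])  # quantile-based bins: four roughly equal-sized groups
print(movies_df[['imdb_score', 'imdb_quartile']].head())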