技术交流QQ群:1027579432,欢迎你的加入!

1.使用Pandas进行数据清洗与处理

  • 下面以IMDB电影数据集为例进行分析,看一下数据集的信息:
    import pandas as pd


    # Location of the IMDB movie metadata CSV on the local disk
    csv_path = "E:\\Code\\local_code\\DataSets\\movie_data\\movie_metadata.csv"
    movies_df = pd.read_csv(csv_path)

    # DataFrame.shape is a (row count, column count) tuple
    print("data-frame shape:", movies_df.shape)
  • 查看列的名字
    # Pull the column labels out as an array before printing them
    column_names = movies_df.columns.values
    print("columns names:", column_names)
  • 处理缺失数据:DataFrame.isna()、DataFrame.dropna()和DataFrame.fillna()
    # Boolean mask showing which cells hold missing values
    print("null values:\n", movies_df.isna())
    # Missing-value count per column, followed by the total over the frame
    print(movies_df.isna().sum())
    print("total null values:", movies_df.isna().sum().sum())

    # Drop only the rows in which every value is missing
    clean_movies_df = movies_df.dropna(how="all")
    print("new dataframe shape:", clean_movies_df.shape)
    print("old dataframe shape:", movies_df.shape)

    # fillna(..., inplace=True) returns None, so printing its result only ever
    # showed "None"; it also replaced every NaN in the frame, leaving nothing
    # for the per-column strategies below to act on. Print a filled copy
    # instead and leave movies_df itself untouched.
    print(movies_df.fillna(value=0))

    # Fill only the numeric money columns with 0
    movies_df[['gross', 'budget']] = movies_df[['gross', 'budget']].fillna(value=0)

    # Assign the filled column back rather than calling fillna(inplace=True)
    # on a column selection: that chained in-place form is deprecated and does
    # not update the parent frame when pandas copy-on-write is enabled.
    movies_df['language'] = movies_df['language'].fillna("no info")
    print(movies_df.head())

    # Alternative strategies, shown for reference. The columns were already
    # filled above, so at this point these two statements change nothing.
    # Forward fill: propagate the last valid value downwards
    # (fillna(method="ffill") is deprecated in favour of .ffill()).
    movies_df['language'] = movies_df['language'].ffill()
    # Replace missing budgets with the column mean
    movies_df["budget"] = movies_df["budget"].fillna(movies_df["budget"].mean())
  • 查找并删除重复数据:DataFrame.duplicated()和DataFrame.drop_duplicates()
    # Rows that repeat an earlier row across all columns
    is_repeated_row = movies_df.duplicated()
    duplicate_rows_df = movies_df[is_repeated_row]
    print("number of duplicate rows:", duplicate_rows_df.shape)

    # Rows whose IMDB link has already appeared in an earlier row
    is_repeated_link = movies_df.duplicated(subset=["movie_imdb_link"])
    duplicate_rows_imdb_link = movies_df[is_repeated_link]
    print(duplicate_rows_imdb_link.shape)

    # Number of distinct IMDB links
    distinct_links = movies_df["movie_imdb_link"].unique()
    print(len(distinct_links))

    # Shape of the frame once fully duplicated rows are removed
    deduplicated_df = movies_df.drop_duplicates()
    print('shape of dataframe after dropping duplicates', deduplicated_df.shape)
  • Binning data(数据分箱):pd.cut()
    # Bin edges and one label per interval: (0, 4], (4, 7], (7, 10]
    category = [0., 4., 7., 10.]
    op_lables = ['shyttte', 'moderate', 'good']

    # Map every IMDB score to the label of the interval it falls into
    movies_df['imdb_labels'] = pd.cut(
        movies_df['imdb_score'],
        bins=category,
        labels=op_lables,
        include_lowest=False,
    )

    # Preview title, score and label for the rows at positions 209-219
    preview_columns = ['movie_title', 'imdb_score', 'imdb_labels']
    print(movies_df.loc[:, preview_columns].iloc[209:220])