from datetime import timedelta

import pandas as pd

# 1. rolling function
# df = pd.read_csv('nowcoder.csv',parse_dates=['date'])
# df['date'] = df.date.dt.date.astype('datetime64[ns]')
# df = df.sort_values(['user_id','date'],ascending=True).drop_duplicates(['user_id','date']).reset_index(drop=True)
# #df.rename({'user_id':'user'},axis=1,inplace=True)
# df.set_index('date',inplace=True)
# #df['question_id'].rolling('3D').count()
# s1 = df.groupby('user_id')['question_id'].rolling("3D").count()
# s1 = s1.reset_index(-1,drop=True).astype(int)
# print(s1[s1>=3])

# 2. SQL-like approach using rank
# import pandas as pd
# df = pd.read_csv('nowcoder.csv',parse_dates=['date'])
# df['date'] = df['date'].dt.date.astype('datetime64[D]')
# df = df.query(' "2021-12-01"<= date <= "2021-12-30" ')
# df['rk'] = df.groupby('user_id')["date"].rank("dense")
# df["diff"] = df['date'] - pd.to_timedelta(df['rk'],'D')
# res = df.groupby(['user_id','diff'],as_index=False).nunique().groupby('user_id').rk.max()
# print(res[res>=3])

# 3. Loop over the per-user groups

CSV_PATH = 'nowcoder.csv'
# Inclusive bounds of the reporting window.
# NOTE(review): the window ends on the 30th, not the 31st -- confirm intent.
WINDOW_START = '2021-12-01'
WINDOW_END = '2021-12-30'


def _longest_streak(dates):
    """Return the longest run of consecutive calendar days in *dates*.

    *dates* is a datetime64 Series. Time-of-day is ignored, and duplicate
    or missing days are discarded before counting. Returns 0 when no valid
    date remains.
    """
    days = dates.dropna().dt.normalize().drop_duplicates()
    if days.empty:
        return 0
    # Subtracting the dense rank (in days) maps every member of a run of
    # consecutive days onto the same anchor date, so the size of the
    # largest anchor group is the longest streak.
    anchors = days - pd.to_timedelta(days.rank(method='dense'), unit='D')
    return int(anchors.value_counts().max())


def load_answers(path=CSV_PATH, start=WINDOW_START, end=WINDOW_END):
    """Read the CSV at *path* and keep rows dated within [start, end].

    The ``date`` column is truncated to midnight before filtering, so any
    timestamp on the *end* day is included.
    """
    df = pd.read_csv(path, parse_dates=['date'])
    # ``astype('datetime64[D]')`` is rejected by pandas >= 2.0; normalize()
    # performs the same truncation on every version.
    df['date'] = df['date'].dt.normalize()
    in_window = df['date'].between(pd.Timestamp(start), pd.Timestamp(end))
    return df[in_window]


def consecutive_day_streaks(df, min_days=3):
    """Return each user's longest streak, for streaks of *min_days* or more.

    *df* needs a ``user_id`` column and a datetime64 ``date`` column.
    The result is an int64 Series named ``question_id``, indexed by
    ``user_id``. Users below the threshold are omitted.
    """
    users = []
    streaks = []
    # Iterating the grouped column yields copies, so nothing here mutates
    # the caller's frame (the original used inplace drop_duplicates).
    for user, dates in df.groupby('user_id')['date']:
        longest = _longest_streak(dates)
        if longest >= min_days:
            users.append(user)
            streaks.append(longest)
    result = pd.Series(data=streaks, index=users, name='question_id',
                       dtype='int64')
    result.index.name = 'user_id'
    return result


def main():
    """Print the qualifying users and their longest streaks."""
    print(consecutive_day_streaks(load_answers()))


if __name__ == '__main__':
    main()