"""Report each user's longest streak of consecutive practice days.

Reads ``nowcoder.csv`` (columns used: ``user_id`` and ``date``) and prints a
Series indexed by ``user_id`` whose values are the length of that user's
longest run of consecutive calendar days (0 when no two days are adjacent).
"""
from datetime import timedelta

import pandas as pd

# Method 1 (disabled, kept for reference): rank each user's dates and subtract
# the rank from the date.
# nowcoder = pd.read_csv('nowcoder.csv')
# nowcoder['date'] = pd.to_datetime(nowcoder['date']).dt.date
# nowcoder['date'] = nowcoder['date'].astype('datetime64')
# nowcoder = nowcoder[pd.to_datetime(nowcoder['date']).dt.strftime('%Y-%m') == '2021-12']
# nowcoder.drop_duplicates(subset= ['user_id', 'date'], inplace = True)
# nowcoder['rank'] = nowcoder.groupby(['user_id']).date.rank()
# nowcoder['diff'] = nowcoder['date'] - pd.to_timedelta(nowcoder['rank'], unit = 'd')
# result = nowcoder.groupby(['user_id'])['diff'].count()
# result = result[result>=3]
# print(result)

# Method 2: scan each user's distinct days for the longest consecutive run.

ONE_DAY = timedelta(days=1)


def longest_consecutive_run(days) -> int:
    """Return the length of the longest run of consecutive days in *days*.

    Args:
        days: Iterable of day-granularity values (``datetime.date`` or
            midnight ``pandas.Timestamp``) in any order; duplicates are
            ignored.

    Returns:
        The number of days in the longest run of adjacent days, or 0 when no
        two days are adjacent (including empty and single-day input).
    """
    # Sorting makes the scan independent of the row order in the input file.
    ordered = sorted(set(days))
    best = 0
    current = 1
    for earlier, later in zip(ordered, ordered[1:]):
        if later - earlier == ONE_DAY:
            current += 1
            best = max(best, current)
        else:
            current = 1
    return best


def max_streak_per_user(records: pd.DataFrame) -> pd.Series:
    """Compute the longest consecutive-day streak for every user.

    Args:
        records: Frame with a ``user_id`` column and a ``date`` column that
            ``pandas.to_datetime`` can parse.

    Returns:
        An int64 Series named ``question_id``, indexed by ``user_id``.
    """
    activity = pd.DataFrame({
        'user_id': records['user_id'].to_numpy(),
        # Compare full calendar dates rather than the day-of-month so dates
        # from different months cannot collide and a streak that crosses a
        # month boundary is still recognised.
        'day': pd.to_datetime(records['date']).dt.normalize().to_numpy(),
    })
    # Rows without a usable date cannot take part in a streak.
    activity = activity.dropna(subset=['day'])

    # Group by the bare column name (not a one-element list) so the group keys
    # are scalar user ids on every pandas version.
    streaks = {
        user: longest_consecutive_run(user_days)
        for user, user_days in activity.groupby('user_id')['day']
    }

    result = pd.Series(streaks, name='question_id', dtype='int64')
    result.index.name = 'user_id'
    return result


def main() -> None:
    """Load ``nowcoder.csv`` and print the per-user streak Series."""
    nowcoder = pd.read_csv('nowcoder.csv')
    print(max_streak_per_user(nowcoder))


if __name__ == '__main__':
    main()