1.对字典的值进行排序
import operator
importance = model.get_fscore() # a dict-like mapping, not a list: `.items()` is called on it below
importance = sorted(importance.items(),key = operator.itemgetter(1),reverse = True)
# The result is a list of (key, value) tuples ordered by value, largest first, e.g.:
# [('iBlade1TempBattBox_1sec', 164), ('iCableTwistTotal', 163), ('iKWhOverall_h', 159), ('iPitchAngle3', 15), ('iUL2_690V', 14), ('iIL1_690V', 14), ('iTempCntr_1sec', 12), ('iIL2_690V', 7), ('iVibrationY', 3), ('iReactivePower', 1), ('iActivePoweiSetPointValue', 1), ('iIL3_690V', 1)]
2.pickle存储和读取
import pickle

# Save the object to a file (binary mode is required for pickle)
with open('abc.pickle', 'wb') as f:
    pickle.dump(abc, f)
# Load the object back from the file
# NOTE: only unpickle files you trust; pickle.load can execute arbitrary code.
with open('abc.pickle', 'rb') as f:
    abc = pickle.load(f)
3.查看属性的值分别是多少以及出现了多少次
# List the distinct values of the column
df['Label'].unique()
# Count how many times each value occurs
df['Label'].value_counts()
4.pandas读取大文件
reader = pd.read_csv('ANN/20003-2017-03/20003001#2017-03.csv',iterator=True)
chunks = []
# Pull fixed-size chunks until the reader is exhausted.
while True:
    try:
        # Keep only the call that can raise inside the try body.
        chunk = reader.get_chunk(100000)
    except StopIteration:
        print('Iteration is stoped')
        break
    chunks.append(chunk)
df = pd.concat(chunks,ignore_index=True) # merge all chunks into one DataFrame
5.去除值全部相同的属性
# Keep only the columns in which at least one row differs from the first row.
# (`.ix` was removed from pandas; `.loc` selects by label/mask, `.iloc` by position.)
df = df.loc[:, (df != df.iloc[0]).any()]
6.Pandas使用Dataframe的时候减少内存的方法
参考:
https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65
def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` in place to save memory.

    Only columns with a plain NumPy bool, integer or float dtype are
    processed; every other column (object/string, datetime, categorical,
    extension dtypes) is left untouched.

    For each processed column:

    * NaN values are replaced by ``column minimum - 1`` and the column label
      is recorded in the returned list.
    * If every value is a whole number, the column is cast to the smallest
      unsigned (min >= 0) or signed integer dtype whose range strictly
      contains the column's min and max. A negative column that fits no
      signed dtype keeps its current dtype.
    * Otherwise the column is cast to ``float32``.

    Progress and the memory footprint before/after are printed.

    Args:
        props: pandas DataFrame to shrink. It is modified in place.

    Returns:
        tuple: ``(props, NAlist)`` - the same DataFrame object and the list
        of column labels that contained non-finite values and had their
        NaN entries filled.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = []  # Keeps track of columns that have missing values filled in.
    for col in props.columns:
        dtype = props[col].dtype
        # Only plain NumPy bool/int/uint/float columns can be downcast safely.
        if not (isinstance(dtype, np.dtype) and dtype.kind in "biuf"):
            continue

        # Print current column type
        print("******************************")
        print("Column: ",col)
        print("dtype before: ",dtype)

        # Integer dtypes cannot represent NaN, so fill missing values with a
        # sentinel just below the column minimum.
        if not np.isfinite(props[col]).all():
            NAlist.append(col)
            # Assign back instead of a chained inplace fill, which does not
            # update the frame under pandas copy-on-write.
            props[col] = props[col].fillna(props[col].min() - 1)

        values = props[col]
        # Take min/max AFTER filling so the sentinel is part of the range;
        # otherwise a column with min 0 plus NaN would be cast to an
        # unsigned dtype while holding -1.
        mn = values.min()
        mx = values.max()

        # Decide whether the column holds whole numbers only.
        if dtype.kind in "biu":
            is_int = True
        elif np.isfinite(values).all():
            # Compare element-wise: summing the differences would let
            # positive and negative fractions cancel each other out.
            is_int = bool((values == values.astype(np.int64)).all())
        else:
            # Remaining inf (or an all-NaN column) cannot become integers.
            is_int = False

        if is_int:
            target = None
            if mn >= 0:
                # Unsigned: fall back to uint64 when nothing smaller fits.
                target = np.uint64
                for candidate in (np.uint8, np.uint16, np.uint32):
                    if mx < np.iinfo(candidate).max:
                        target = candidate
                        break
            else:
                for candidate in (np.int8, np.int16, np.int32, np.int64):
                    bounds = np.iinfo(candidate)
                    if mn > bounds.min and mx < bounds.max:
                        target = candidate
                        break
            if target is not None:
                props[col] = values.astype(target)
        else:
            # Make float datatypes 32 bit
            props[col] = values.astype(np.float32)

        # Print new column type
        print("dtype after: ",props[col].dtype)
        print("******************************")

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist
#-------------------------------------------------------------------------
train = pd.read_csv('../input/train_V2.csv')
# reduce_mem_usage returns a (dataframe, filled_columns) tuple, so unpack it;
# assigning the result to a single name would turn `train` into a tuple.
train, train_na_cols = reduce_mem_usage(train)
test = pd.read_csv('../input/test_V2.csv')
test, test_na_cols = reduce_mem_usage(test)
7.Pandas的nunique()
# 表示A,B属性中不相同的元素的数量
>>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]})
>>> df.nunique()
A 3
B 1
# 表示每行中不相同的元素的数量
>>> df.nunique(axis=1)
0 1
1 2
2 2
8.可以直接调用Series的画图函数plot
fig,ax = plt.subplots(1,2,figsize = (12,4))
# value_counts后得到的是Series,直接使用plot即可画图
train.groupby('matchId')['matchType'].first().value_counts().plot(kind = 'bar',ax = ax[0])
9.Pandas中的query的用法
df[df['total_bill'] > 20]
# 等价于
df.query('total_bill > 20')
10.Pandas的层次化索引
https://blog.csdn.net/Darkman_EX/article/details/80723802
https://cloud.tencent.com/developer/ask/48201
https://pandas.pydata.org/pandas-docs/stable/advanced.html
cols = ['numGroups','maxPlace']
desc1 = train.groupby('matchType')[cols].describe()
print(desc1)
print(desc1.loc[:,(slice(None),['min','mean','max'])])
numGroups \
count mean std min 25% 50% 75% max
matchType
duo 1322628.0 45.812482 3.164604 1.0 45.0 46.0 48.0 52.0
solo 720713.0 91.115157 11.437120 1.0 91.0 93.0 95.0 100.0
squad 2403625.0 27.039389 2.348066 2.0 26.0 27.0 28.0 37.0
maxPlace
count mean std min 25% 50% 75% max
matchType
duo 1322628.0 47.608919 2.911739 3.0 47.0 48.0 49.0 52.0
solo 720713.0 93.908771 10.135402 1.0 94.0 96.0 97.0 100.0
squad 2403625.0 27.982982 2.205999 2.0 27.0 28.0 29.0 37.0
numGroups maxPlace
mean min max mean min max
matchType
duo 45.812482 1.0 52.0 47.608919 3.0 52.0
solo 91.115157 1.0 100.0 93.908771 1.0 100.0
squad 27.039389 2.0 37.0 27.982982 2.0 37.0
11.Pandas的isin函数
>>> df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
>>> df.isin([1, 3, 12, 'a'])
A B
0 True True
1 False False
2 True False
12.Pandas中的Groupby
https://www.cnblogs.com/lemonbit/p/6810972.html
13.Pandas中的count()和size()的不同
# 都是用来计数的,但返回的类型和含义不同:对DataFrame分组后,count()返回DataFrame(按列统计每组中非NaN值的个数),size()返回Series(每组的行数,包含NaN)
train.groupby(['matchType','matchId']).size()
matchType matchId
duo 0003b92987589e 100
0006eb8c17708d 93
00086c74bb4efc 98
001125344b660c 96
001360264d4b5f 91
0014d9d1b0aff6 94
0016fe3ee17ce7 97
00177a6ce4dfb5 92
0019bc34b3c58e 97
0019d729577e9c 94
0027a504cd3b0c 99
002aeef57764f8 99
002b3b8e0e3870 95
003228f5dc4934 97
0043d584520de4 97
00489048e21690 93
004cddec9da619 93
005166f6ee4ab6 97
0052116de68e46 99
005712df62d721 96
005908fb69efc4 93
005e236012c55f 97
005e3f527c4a6a 97
0063217a97a803 93
00656cf8a4dbe6 98
006755987f2217 93
006d25449eb444 96
007124bcf16c97 92
007510af0aaa9e 92
007edf38197cec 84
...
squad ffc364ae0cdae2 92
ffc66d8b3e21f7 84
ffc6fd9da2b020 98
ffc89e1f6347df 84
ffc9f30599cc27 97
ffca00bb37147a 95
ffca3124b378ad 93
ffcb58c4fe9192 93
ffcbc1b5255d3a 98
ffcdc0aa9e4b09 97
ffd07bb11c36c3 97
ffd19b3585e2ba 93
ffd334bce1b8ca 98
ffd8796ff5f1d6 94
ffd8c5746d9be8 97
ffdf8e9dc18596 98
ffead5be0b50e8 98
ffeaf5f7073d41 89
ffeb2131284e45 94
ffed6b75ebe4ce 91
ffed79cf43d7d7 97
ffef306edf6b36 98
fff0a2994a6f3a 87
fff2c218352941 98
fff497dff0186d 90
fff4d3cbb6c317 100
fffa170cf8ed83 93
fffd74b5150cb4 97
fffe562611d981 94
fffe92232706aa 93
Length: 47965, dtype: int64
train.groupby(['matchType','matchId']).count()
Id groupId assists boosts damageDealt DBNOs \
matchType matchId
duo 0003b92987589e 100 100 100 100 100 100
0006eb8c17708d 93 93 93 93 93 93
00086c74bb4efc 98 98 98 98 98 98
001125344b660c 96 96 96 96 96 96
001360264d4b5f 91 91 91 91 91 91
0014d9d1b0aff6 94 94 94 94 94 94
0016fe3ee17ce7 97 97 97 97 97 97
00177a6ce4dfb5 92 92 92 92 92 92
0019bc34b3c58e 97 97 97 97 97 97
0019d729577e9c 94 94 94 94 94 94
0027a504cd3b0c 99 99 99 99 99 99
002aeef57764f8 99 99 99 99 99 99
002b3b8e0e3870 95 95 95 95 95 95
003228f5dc4934 97 97 97 97 97 97
0043d584520de4 97 97 97 97 97 97
00489048e21690 93 93 93 93 93 93
004cddec9da619 93 93 93 93 93 93
005166f6ee4ab6 97 97 97 97 97 97
0052116de68e46 99 99 99 99 99 99
005712df62d721 96 96 96 96 96 96
005908fb69efc4 93 93 93 93 93 93
005e236012c55f 97 97 97 97 97 97
005e3f527c4a6a 97 97 97 97 97 97
0063217a97a803 93 93 93 93 93 93
00656cf8a4dbe6 98 98 98 98 98 98
006755987f2217 93 93 93 93 93 93
006d25449eb444 96 96 96 96 96 96
007124bcf16c97 92 92 92 92 92 92
007510af0aaa9e 92 92 92 92 92 92
007edf38197cec 84 84 84 84 84 84
... ... ... ... ... ... ...
squad ffc364ae0cdae2 92 92 92 92 92 92
ffc66d8b3e21f7 84 84 84 84 84 84
ffc6fd9da2b020 98 98 98 98 98 98
ffc89e1f6347df 84 84 84 84 84 84
ffc9f30599cc27 97 97 97 97 97 97
ffca00bb37147a 95 95 95 95 95 95
ffca3124b378ad 93 93 93 93 93 93
ffcb58c4fe9192 93 93 93 93 93 93
ffcbc1b5255d3a 98 98 98 98 98 98
ffcdc0aa9e4b09 97 97 97 97 97 97
ffd07bb11c36c3 97 97 97 97 97 97
ffd19b3585e2ba 93 93 93 93 93 93
ffd334bce1b8ca 98 98 98 98 98 98
ffd8796ff5f1d6 94 94 94 94 94 94
ffd8c5746d9be8 97 97 97 97 97 97
ffdf8e9dc18596 98 98 98 98 98 98
ffead5be0b50e8 98 98 98 98 98 98
ffeaf5f7073d41 89 89 89 89 89 89
ffeb2131284e45 94 94 94 94 94 94
ffed6b75ebe4ce 91 91 91 91 91 91
ffed79cf43d7d7 97 97 97 97 97 97
ffef306edf6b36 98 98 98 98 98 98
fff0a2994a6f3a 87 87 87 87 87 87
fff2c218352941 98 98 98 98 98 98
fff497dff0186d 90 90 90 90 90 90
fff4d3cbb6c317 100 100 100 100 100 100
fffa170cf8ed83 93 93 93 93 93 93
fffd74b5150cb4 97 97 97 97 97 97
fffe562611d981 94 94 94 94 94 94
fffe92232706aa 93 93 93 93 93 93
headshotKills heals killPlace killPoints \
matchType matchId
duo 0003b92987589e 100 100 100 100
0006eb8c17708d 93 93 93 93
00086c74bb4efc 98 98 98 98
001125344b660c 96 96 96 96
001360264d4b5f 91 91 91 91
0014d9d1b0aff6 94 94 94 94
0016fe3ee17ce7 97 97 97 97
00177a6ce4dfb5 92 92 92 92
0019bc34b3c58e 97 97 97 97
0019d729577e9c 94 94 94 94
0027a504cd3b0c 99 99 99 99
002aeef57764f8 99 99 99 99
002b3b8e0e3870 95 95 95 95
003228f5dc4934 97 97 97 97
0043d584520de4 97 97 97 97
00489048e21690 93 93 93 93
004cddec9da619 93 93 93 93
005166f6ee4ab6 97 97 97 97
0052116de68e46 99 99 99 99
005712df62d721 96 96 96 96
005908fb69efc4 93 93 93 93
005e236012c55f 97 97 97 97
005e3f527c4a6a 97 97 97 97
0063217a97a803 93 93 93 93
00656cf8a4dbe6 98 98 98 98
006755987f2217 93 93 93 93
006d25449eb444 96 96 96 96
007124bcf16c97 92 92 92 92
007510af0aaa9e 92 92 92 92
007edf38197cec 84 84 84 84
... ... ... ... ...
squad ffc364ae0cdae2 92 92 92 92
ffc66d8b3e21f7 84 84 84 84
ffc6fd9da2b020 98 98 98 98
ffc89e1f6347df 84 84 84 84
ffc9f30599cc27 97 97 97 97
ffca00bb37147a 95 95 95 95
ffca3124b378ad 93 93 93 93
ffcb58c4fe9192 93 93 93 93
ffcbc1b5255d3a 98 98 98 98
ffcdc0aa9e4b09 97 97 97 97
ffd07bb11c36c3 97 97 97 97
ffd19b3585e2ba 93 93 93 93
ffd334bce1b8ca 98 98 98 98
ffd8796ff5f1d6 94 94 94 94
ffd8c5746d9be8 97 97 97 97
ffdf8e9dc18596 98 98 98 98
ffead5be0b50e8 98 98 98 98
ffeaf5f7073d41 89 89 89 89
ffeb2131284e45 94 94 94 94
ffed6b75ebe4ce 91 91 91 91
ffed79cf43d7d7 97 97 97 97
ffef306edf6b36 98 98 98 98
fff0a2994a6f3a 87 87 87 87
fff2c218352941 98 98 98 98
fff497dff0186d 90 90 90 90
fff4d3cbb6c317 100 100 100 100
fffa170cf8ed83 93 93 93 93
fffd74b5150cb4 97 97 97 97
fffe562611d981 94 94 94 94
fffe92232706aa 93 93 93 93
... revives rideDistance roadKills \
matchType matchId ...
duo 0003b92987589e ... 100 100 100
0006eb8c17708d ... 93 93 93
00086c74bb4efc ... 98 98 98
001125344b660c ... 96 96 96
001360264d4b5f ... 91 91 91
0014d9d1b0aff6 ... 94 94 94
0016fe3ee17ce7 ... 97 97 97
00177a6ce4dfb5 ... 92 92 92
0019bc34b3c58e ... 97 97 97
0019d729577e9c ... 94 94 94
0027a504cd3b0c ... 99 99 99
002aeef57764f8 ... 99 99 99
002b3b8e0e3870 ... 95 95 95
003228f5dc4934 ... 97 97 97
0043d584520de4 ... 97 97 97
00489048e21690 ... 93 93 93
004cddec9da619 ... 93 93 93
005166f6ee4ab6 ... 97 97 97
0052116de68e46 ... 99 99 99
005712df62d721 ... 96 96 96
005908fb69efc4 ... 93 93 93
005e236012c55f ... 97 97 97
005e3f527c4a6a ... 97 97 97
0063217a97a803 ... 93 93 93
00656cf8a4dbe6 ... 98 98 98
006755987f2217 ... 93 93 93
006d25449eb444 ... 96 96 96
007124bcf16c97 ... 92 92 92
007510af0aaa9e ... 92 92 92
007edf38197cec ... 84 84 84
... ... ... ... ...
squad ffc364ae0cdae2 ... 92 92 92
ffc66d8b3e21f7 ... 84 84 84
ffc6fd9da2b020 ... 98 98 98
ffc89e1f6347df ... 84 84 84
ffc9f30599cc27 ... 97 97 97
ffca00bb37147a ... 95 95 95
ffca3124b378ad ... 93 93 93
ffcb58c4fe9192 ... 93 93 93
ffcbc1b5255d3a ... 98 98 98
ffcdc0aa9e4b09 ... 97 97 97
ffd07bb11c36c3 ... 97 97 97
ffd19b3585e2ba ... 93 93 93
ffd334bce1b8ca ... 98 98 98
ffd8796ff5f1d6 ... 94 94 94
ffd8c5746d9be8 ... 97 97 97
ffdf8e9dc18596 ... 98 98 98
ffead5be0b50e8 ... 98 98 98
ffeaf5f7073d41 ... 89 89 89
ffeb2131284e45 ... 94 94 94
ffed6b75ebe4ce ... 91 91 91
ffed79cf43d7d7 ... 97 97 97
ffef306edf6b36 ... 98 98 98
fff0a2994a6f3a ... 87 87 87
fff2c218352941 ... 98 98 98
fff497dff0186d ... 90 90 90
fff4d3cbb6c317 ... 100 100 100
fffa170cf8ed83 ... 93 93 93
fffd74b5150cb4 ... 97 97 97
fffe562611d981 ... 94 94 94
fffe92232706aa ... 93 93 93
swimDistance teamKills vehicleDestroys \
matchType matchId
duo 0003b92987589e 100 100 100
0006eb8c17708d 93 93 93
00086c74bb4efc 98 98 98
001125344b660c 96 96 96
001360264d4b5f 91 91 91
0014d9d1b0aff6 94 94 94
0016fe3ee17ce7 97 97 97
00177a6ce4dfb5 92 92 92
0019bc34b3c58e 97 97 97
0019d729577e9c 94 94 94
0027a504cd3b0c 99 99 99
002aeef57764f8 99 99 99
002b3b8e0e3870 95 95 95
003228f5dc4934 97 97 97
0043d584520de4 97 97 97
00489048e21690 93 93 93
004cddec9da619 93 93 93
005166f6ee4ab6 97 97 97
0052116de68e46 99 99 99
005712df62d721 96 96 96
005908fb69efc4 93 93 93
005e236012c55f 97 97 97
005e3f527c4a6a 97 97 97
0063217a97a803 93 93 93
00656cf8a4dbe6 98 98 98
006755987f2217 93 93 93
006d25449eb444 96 96 96
007124bcf16c97 92 92 92
007510af0aaa9e 92 92 92
007edf38197cec 84 84 84
... ... ... ...
squad ffc364ae0cdae2 92 92 92
ffc66d8b3e21f7 84 84 84
ffc6fd9da2b020 98 98 98
ffc89e1f6347df 84 84 84
ffc9f30599cc27 97 97 97
ffca00bb37147a 95 95 95
ffca3124b378ad 93 93 93
ffcb58c4fe9192 93 93 93
ffcbc1b5255d3a 98 98 98
ffcdc0aa9e4b09 97 97 97
ffd07bb11c36c3 97 97 97
ffd19b3585e2ba 93 93 93
ffd334bce1b8ca 98 98 98
ffd8796ff5f1d6 94 94 94
ffd8c5746d9be8 97 97 97
ffdf8e9dc18596 98 98 98
ffead5be0b50e8 98 98 98
ffeaf5f7073d41 89 89 89
ffeb2131284e45 94 94 94
ffed6b75ebe4ce 91 91 91
ffed79cf43d7d7 97 97 97
ffef306edf6b36 98 98 98
fff0a2994a6f3a 87 87 87
fff2c218352941 98 98 98
fff497dff0186d 90 90 90
fff4d3cbb6c317 100 100 100
fffa170cf8ed83 93 93 93
fffd74b5150cb4 97 97 97
fffe562611d981 94 94 94
fffe92232706aa 93 93 93
walkDistance weaponsAcquired winPoints \
matchType matchId
duo 0003b92987589e 100 100 100
0006eb8c17708d 93 93 93
00086c74bb4efc 98 98 98
001125344b660c 96 96 96
001360264d4b5f 91 91 91
0014d9d1b0aff6 94 94 94
0016fe3ee17ce7 97 97 97
00177a6ce4dfb5 92 92 92
0019bc34b3c58e 97 97 97
0019d729577e9c 94 94 94
0027a504cd3b0c 99 99 99
002aeef57764f8 99 99 99
002b3b8e0e3870 95 95 95
003228f5dc4934 97 97 97
0043d584520de4 97 97 97
00489048e21690 93 93 93
004cddec9da619 93 93 93
005166f6ee4ab6 97 97 97
0052116de68e46 99 99 99
005712df62d721 96 96 96
005908fb69efc4 93 93 93
005e236012c55f 97 97 97
005e3f527c4a6a 97 97 97
0063217a97a803 93 93 93
00656cf8a4dbe6 98 98 98
006755987f2217 93 93 93
006d25449eb444 96 96 96
007124bcf16c97 92 92 92
007510af0aaa9e 92 92 92
007edf38197cec 84 84 84
... ... ... ...
squad ffc364ae0cdae2 92 92 92
ffc66d8b3e21f7 84 84 84
ffc6fd9da2b020 98 98 98
ffc89e1f6347df 84 84 84
ffc9f30599cc27 97 97 97
ffca00bb37147a 95 95 95
ffca3124b378ad 93 93 93
ffcb58c4fe9192 93 93 93
ffcbc1b5255d3a 98 98 98
ffcdc0aa9e4b09 97 97 97
ffd07bb11c36c3 97 97 97
ffd19b3585e2ba 93 93 93
ffd334bce1b8ca 98 98 98
ffd8796ff5f1d6 94 94 94
ffd8c5746d9be8 97 97 97
ffdf8e9dc18596 98 98 98
ffead5be0b50e8 98 98 98
ffeaf5f7073d41 89 89 89
ffeb2131284e45 94 94 94
ffed6b75ebe4ce 91 91 91
ffed79cf43d7d7 97 97 97
ffef306edf6b36 98 98 98
fff0a2994a6f3a 87 87 87
fff2c218352941 98 98 98
fff497dff0186d 90 90 90
fff4d3cbb6c317 100 100 100
fffa170cf8ed83 93 93 93
fffd74b5150cb4 97 97 97
fffe562611d981 94 94 94
fffe92232706aa 93 93 93
winPlacePerc
matchType matchId
duo 0003b92987589e 100
0006eb8c17708d 93
00086c74bb4efc 98
001125344b660c 96
001360264d4b5f 91
0014d9d1b0aff6 94
0016fe3ee17ce7 97
00177a6ce4dfb5 92
0019bc34b3c58e 97
0019d729577e9c 94
0027a504cd3b0c 99
002aeef57764f8 99
002b3b8e0e3870 95
003228f5dc4934 97
0043d584520de4 97
00489048e21690 93
004cddec9da619 93
005166f6ee4ab6 97
0052116de68e46 99
005712df62d721 96
005908fb69efc4 93
005e236012c55f 97
005e3f527c4a6a 97
0063217a97a803 93
00656cf8a4dbe6 98
006755987f2217 93
006d25449eb444 96
007124bcf16c97 92
007510af0aaa9e 92
007edf38197cec 84
... ...
squad ffc364ae0cdae2 92
ffc66d8b3e21f7 84
ffc6fd9da2b020 98
ffc89e1f6347df 84
ffc9f30599cc27 97
ffca00bb37147a 95
ffca3124b378ad 93
ffcb58c4fe9192 93
ffcbc1b5255d3a 98
ffcdc0aa9e4b09 97
ffd07bb11c36c3 97
ffd19b3585e2ba 93
ffd334bce1b8ca 98
ffd8796ff5f1d6 94
ffd8c5746d9be8 97
ffdf8e9dc18596 98
ffead5be0b50e8 98
ffeaf5f7073d41 89
ffeb2131284e45 94
ffed6b75ebe4ce 91
ffed79cf43d7d7 97
ffef306edf6b36 98
fff0a2994a6f3a 87
fff2c218352941 98
fff497dff0186d 90
fff4d3cbb6c317 100
fffa170cf8ed83 93
fffd74b5150cb4 97
fffe562611d981 94
fffe92232706aa 93
[47965 rows x 27 columns]
14.Pandas中的first
https://codeday.me/bug/20180109/115070.html
# 含义是获取每个group的第一条记录
15.groupby后的计数统计
# 下面这两句话是一个意思,都是统计每个比赛下group的多少
train.groupby(['matchType','matchId','groupId']).count().groupby(['matchType','matchId']).size()
train.groupby(['matchType','matchId'])['groupId'].nunique()
16.Pandas中的to_frame()
# 由于nunique()(size()同理)返回的是一个Series,没有列名;使用to_frame可以把Series转化成为Dataframe,并指定列名
train.groupby(['matchType','matchId'])['groupId'].nunique()
train.groupby(['matchType','matchId'])['groupId'].nunique().to_frame('group in match')
matchType matchId
duo 0003b92987589e 47
0006eb8c17708d 44
00086c74bb4efc 48
001125344b660c 47
001360264d4b5f 44
0014d9d1b0aff6 47
0016fe3ee17ce7 47
00177a6ce4dfb5 47
0019bc34b3c58e 47
0019d729577e9c 47
0027a504cd3b0c 50
002aeef57764f8 49
002b3b8e0e3870 46
003228f5dc4934 47
0043d584520de4 48
00489048e21690 45
004cddec9da619 46
005166f6ee4ab6 47
0052116de68e46 50
005712df62d721 47
005908fb69efc4 47
005e236012c55f 45
005e3f527c4a6a 48
0063217a97a803 47
00656cf8a4dbe6 46
006755987f2217 45
006d25449eb444 45
007124bcf16c97 45
007510af0aaa9e 47
007edf38197cec 41
..
squad ffc364ae0cdae2 26
ffc66d8b3e21f7 25
ffc6fd9da2b020 29
ffc89e1f6347df 27
ffc9f30599cc27 24
ffca00bb37147a 26
ffca3124b378ad 25
ffcb58c4fe9192 26
ffcbc1b5255d3a 29
ffcdc0aa9e4b09 26
ffd07bb11c36c3 29
ffd19b3585e2ba 27
ffd334bce1b8ca 29
ffd8796ff5f1d6 27
ffd8c5746d9be8 29
ffdf8e9dc18596 30
ffead5be0b50e8 28
ffeaf5f7073d41 25
ffeb2131284e45 30
ffed6b75ebe4ce 26
ffed79cf43d7d7 25
ffef306edf6b36 26
fff0a2994a6f3a 26
fff2c218352941 28
fff497dff0186d 29
fff4d3cbb6c317 29
fffa170cf8ed83 28
fffd74b5150cb4 27
fffe562611d981 23
fffe92232706aa 29
Name: groupId, Length: 47965, dtype: int64
group in match
matchType matchId
duo 0003b92987589e 47
0006eb8c17708d 44
00086c74bb4efc 48
001125344b660c 47
001360264d4b5f 44
0014d9d1b0aff6 47
0016fe3ee17ce7 47
00177a6ce4dfb5 47
0019bc34b3c58e 47
0019d729577e9c 47
0027a504cd3b0c 50
002aeef57764f8 49
002b3b8e0e3870 46
003228f5dc4934 47
0043d584520de4 48
00489048e21690 45
004cddec9da619 46
005166f6ee4ab6 47
0052116de68e46 50
005712df62d721 47
005908fb69efc4 47
005e236012c55f 45
005e3f527c4a6a 48
0063217a97a803 47
00656cf8a4dbe6 46
006755987f2217 45
006d25449eb444 45
007124bcf16c97 45
007510af0aaa9e 47
007edf38197cec 41
... ...
squad ffc364ae0cdae2 26
ffc66d8b3e21f7 25
ffc6fd9da2b020 29
ffc89e1f6347df 27
ffc9f30599cc27 24
ffca00bb37147a 26
ffca3124b378ad 25
ffcb58c4fe9192 26
ffcbc1b5255d3a 29
ffcdc0aa9e4b09 26
ffd07bb11c36c3 29
ffd19b3585e2ba 27
ffd334bce1b8ca 29
ffd8796ff5f1d6 27
ffd8c5746d9be8 29
ffdf8e9dc18596 30
ffead5be0b50e8 28
ffeaf5f7073d41 25
ffeb2131284e45 30
ffed6b75ebe4ce 26
ffed79cf43d7d7 25
ffef306edf6b36 26
fff0a2994a6f3a 26
fff2c218352941 28
fff497dff0186d 29
fff4d3cbb6c317 29
fffa170cf8ed83 28
fffd74b5150cb4 27
fffe562611d981 23
fffe92232706aa 29
[47965 rows x 1 columns]
16.Pandas中查看属性最大的几个元素
print(group['players in group'].nlargest(5))
matchType matchId groupId
squad b30f3d87189aa6 14d6b54cdec6bc 74
duo 3e029737889ce9 b8275198faa03b 72
solo 41a634f62f86b7 128b07271aa012 64
duo 7e93ce71ac6f61 7385e5fe214021 49
squad 3c2531adf5b942 e52a2e6ca30474 36
Name: players in group, dtype: int64
17.画图subplots
# Create one figure with two side-by-side axes
fig,ax = plt.subplots(1,2,figsize = (12,4))
# Left: histogram of the whole column
train['matchDuration'].plot(kind = 'hist',ax = ax[0])
# Right: histogram restricted to the 1400..1800 range
train.query('matchDuration >= 1400 & matchDuration <= 1800')['matchDuration'].plot(kind = 'hist',ax = ax[1])
18.画图subplot
# 调整图片尺寸
plt.figure(figsize = (12,4))
# 创建axes
plt.subplot(1,2,1)
plt.hist(x = train['matchDuration'],bins = 50)
plt.subplot(1,2,2)
plt.hist(x =train.query('matchDuration >= 1400 & matchDuration <= 1800')['matchDuration'],bins = 50 )
19.利用any判断
any() 函数用于判断给定的可迭代参数 iterable 中是否存在为 True 的元素:如果全部为 False,则返回 False;只要有一个为 True,则返回 True。
元素除了是 0、空(如 ''、[]、None)、False 外都算 True。
例如:any([0, '', False]) 返回 False,any([0, 1]) 返回 True。
20.Pandas中的cut
https://medium.com/@morris_tai/pandas的cut-qcut函數-93c244e34cfc
https://blog.csdn.net/cc_jjj/article/details/78878878
21.判断属性是否包含以及取反
train['matchType'].str.contains('solo')
0 False
1 False
2 False
3 False
4 True
5 False
6 False
7 True
8 False
9 False
10 False
11 False
12 False
13 True
14 False
15 False
16 False
17 True
18 False
19 False
20 False
21 False
22 False
23 False
24 False
25 False
26 False
27 False
28 False
29 False
...
4446936 False
4446937 False
4446938 False
4446939 False
4446940 False
4446941 False
4446942 False
4446943 False
4446944 False
4446945 False
4446946 True
4446947 False
4446948 False
4446949 False
4446950 True
4446951 False
4446952 False
4446953 False
4446954 True
4446955 False
4446956 False
4446957 False
4446958 False
4446959 True
4446960 False
4446961 False
4446962 True
4446963 False
4446964 False
4446965 True
Name: matchType, Length: 4446966, dtype: bool
~train['matchType'].str.contains('solo')
0 True
1 True
2 True
3 True
4 False
5 True
6 True
7 False
8 True
9 True
10 True
11 True
12 True
13 False
14 True
15 True
16 True
17 False
18 True
19 True
20 True
21 True
22 True
23 True
24 True
25 True
26 True
27 True
28 True
29 True
...
4446936 True
4446937 True
4446938 True
4446939 True
4446940 True
4446941 True
4446942 True
4446943 True
4446944 True
4446945 True
4446946 False
4446947 True
4446948 True
4446949 True
4446950 False
4446951 True
4446952 True
4446953 True
4446954 False
4446955 True
4446956 True
4446957 True
4446958 True
4446959 False
4446960 True
4446961 True
4446962 False
4446963 True
4446964 True
4446965 False
Name: matchType, Length: 4446966, dtype: bool
# 还可以进行选择,选择train中所有包含‘solo’的行
train.loc[train['matchType'].str.contains('solo')]
22.对属性值进行排序
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sort_values.html
>>> df = pd.DataFrame({
... 'col1' : ['A', 'A', 'B', np.nan, 'D', 'C'],
... 'col2' : [2, 1, 9, 8, 7, 4],
... 'col3': [0, 1, 9, 4, 2, 3],
... })
>>> df
col1 col2 col3
0 A 2 0
1 A 1 1
2 B 9 9
3 NaN 8 4
4 D 7 2
5 C 4 3
>>> df.sort_values(by=['col1'])
col1 col2 col3
0 A 2 0
1 A 1 1
2 B 9 9
5 C 4 3
4 D 7 2
3 NaN 8 4
23.更改具体值
# 把返回为True的行中的值进行更改
sub.loc[sub[col] >= 5, col] = '5+'
24.Pandas中的交叉表和透视表
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.pivot_table.html
https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.crosstab.html
https://zhuanlan.zhihu.com/p/31952948
https://blog.csdn.net/hustqb/article/details/78086394
# 简单的说,data用来传入dataframe,index用来选择行,values用来选择查看哪些列,columns用来增加列的层次,aggfunc用来选择聚合方法,默认是np.mean
df = pd.DataFrame({"A": [1, 1, 1, 1, 1,
2, 2, 2, 2],
"B": [1, 1, 1, 1, 2,
1, 1, 2, 2],
"C": [1, 1, 1, 2,
2, 1, 2, 2,
1],
"D": [1, 2, 2, 3, 3, 4, 5, 6, 7]})
A B C D
0 1 1 1 1
1 1 1 1 2
2 1 1 1 2
3 1 1 2 3
4 1 2 2 3
5 2 1 1 4
6 2 1 2 5
7 2 2 2 6
8 2 2 1 7
pd.pivot_table(data = df, index = 'A')
B C D
A
1 1.2 1.4 2.2
2 1.5 1.5 5.5
pd.pivot_table(data = df, index = 'A',values = ['B','C'])
B C
A
1 1.2 1.4
2 1.5 1.5
print(pd.pivot_table(data = df, index = 'A',values = ['B','C'],columns = 'D'))
B C
D 1 2 3 4 5 6 7 1 2 3 4 5 6 7
A
1 1.0 1.0 1.5 NaN NaN NaN NaN 1.0 1.0 2.0 NaN NaN NaN NaN
2 NaN NaN NaN 1.0 1.0 2.0 2.0 NaN NaN NaN 1.0 2.0 2.0 1.0