
import operator 
# `get_fscore()` is consumed through `.items()`, i.e. it is used as a
# feature-name -> score mapping. Rank the (feature, score) pairs from the
# highest score down to the lowest.
importance = sorted(
    model.get_fscore().items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# 结果类似这样
# [('iBlade1TempBattBox_1sec', 164), ('iCableTwistTotal', 163), ('iKWhOverall_h', 159), ('iPitchAngle3', 15), ('iUL2_690V', 14), ('iIL1_690V', 14), ('iTempCntr_1sec', 12), ('iIL2_690V', 7), ('iVibrationY', 3), ('iReactivePower', 1), ('iActivePoweiSetPointValue', 1), ('iIL3_690V', 1)]


# Save an object to disk.
# NOTE(review): the dumped object is assumed to be `abc`, mirroring the name
# used when loading below — confirm against the original notes.
with open('abc.pickle', 'wb') as file:
    pickle.dump(abc, file)

# Load it back.
# NOTE: only unpickle files you trust; `pickle.load` can execute arbitrary code.
with open('abc.pickle', 'rb') as file:
    abc = pickle.load(file)

# 1. 查看某个属性(列)都有哪些取值,以及每个取值出现了多少次
# 查看都有哪些值,例如:df['col'].unique()
# 查看每个值出现了多少次,例如:df['col'].value_counts()


# Read a large CSV piece by piece: `iterator=True` returns a reader from which
# fixed-size chunks are pulled on demand instead of loading the file at once.
reader = pd.read_csv('ANN/20003-2017-03/20003001#2017-03.csv', iterator=True)
chunks = []
while True:
    try:
        chunk = reader.get_chunk(100000)
    except StopIteration:
        # The reader is exhausted: report it and leave the loop.
        print('Iteration is stoped')
        break
    chunks.append(chunk)
df = pd.concat(chunks, ignore_index=True)  # merge all chunks into one DataFrame


# Keep only the columns in which at least one row differs from the first row,
# i.e. drop the columns whose value never changes.
# `.ix` was removed in pandas 1.0: `.loc` does the boolean column selection and
# `.iloc[0]` picks the first row by position.
df = df.loc[:, (df != df.iloc[0]).any()]




def _smallest_int_dtype(mn, mx):
    """Return the narrowest integer dtype strictly containing [mn, mx], or None."""
    if mn >= 0:
        for candidate in (np.uint8, np.uint16, np.uint32):
            if mx < np.iinfo(candidate).max:
                return candidate
        return np.uint64
    for candidate in (np.int8, np.int16, np.int32, np.int64):
        bounds = np.iinfo(candidate)
        if mn > bounds.min and mx < bounds.max:
            return candidate
    return None


def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` in place to save memory.

    Columns holding only whole numbers are cast to the narrowest (unsigned)
    integer dtype that fits their range; every other numeric column is cast
    to ``float32`` (which may lose precision). Columns that are not plain
    NumPy bool/int/uint/float (strings, datetimes, categoricals, pandas
    extension dtypes) are left untouched.

    Missing values are replaced by ``min - 1`` of the column before the
    integer test, because NumPy integer dtypes cannot hold NaN.

    Args:
        props: pandas DataFrame to shrink. It is modified in place.

    Returns:
        A ``(props, NAlist)`` tuple: the same DataFrame and the list of
        column names whose missing values were filled.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :", start_mem_usg, " MB")
    NAlist = []  # Keeps track of columns that have missing values filled in.
    for col in props.columns:
        values = props[col]
        col_dtype = values.dtype
        # Exclude strings and every other non-numeric / extension dtype.
        if not (isinstance(col_dtype, np.dtype) and col_dtype.kind in "biuf"):
            continue

        # Print current column type
        print("Column: ", col)
        print("dtype before: ", col_dtype)

        mx = values.max()
        mn = values.min()

        # Integer does not support NA, therefore, NA needs to be filled.
        # An all-NaN (or -inf-bounded) column has no usable fill value and is
        # left as it is.
        if values.isna().any() and np.isfinite(mn):
            NAlist.append(col)
            # The fill value becomes the new minimum; refreshing `mn` keeps a
            # negative filler out of the unsigned branch.
            mn = mn - 1
            props[col] = values.fillna(mn)
            values = props[col]

        # Test if the column can be converted to an integer. Absolute
        # differences are summed so that fractions of opposite sign (0.5 and
        # -0.5) cannot cancel out; non-finite values never qualify.
        is_int = False
        if np.isfinite(values).all():
            asint = values.astype(np.int64)
            is_int = (values - asint).abs().sum() < 0.01

        if is_int:
            # Make Integer/unsigned Integer datatypes
            target = _smallest_int_dtype(mn, mx)
            if target is not None:
                props[col] = values.astype(target)
        else:
            # Make float datatypes 32 bit
            props[col] = values.astype(np.float32)

        # Print new column type
        print("dtype after: ", props[col].dtype)

    # Print final result
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ", mem_usg, " MB")
    print("This is ", 100 * mem_usg / start_mem_usg, "% of the initial size")
    return props, NAlist

# `reduce_mem_usage` returns a `(DataFrame, NAlist)` pair, so unpack it:
# binding the whole tuple to `train` / `test` would break every later
# DataFrame call made on them.
train = pd.read_csv('../input/train_V2.csv')
train, train_na_cols = reduce_mem_usage(train)
test = pd.read_csv('../input/test_V2.csv')
test, test_na_cols = reduce_mem_usage(test)


# 表示A,B属性中不相同的元素的数量
>>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]})
>>> df.nunique()
A    3
B    1

# 表示每行中不相同的元素的数量
>>> df.nunique(axis=1)
0    1
1    2
2    2


# One figure holding a single row of two axes.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

# Take the first `matchType` of every `matchId` group and count how often each
# type occurs; `value_counts` gives a Series, so it can be plotted directly as
# a bar chart on the left axes.
(
    train.groupby('matchId')['matchType']
    .first()
    .value_counts()
    .plot(kind='bar', ax=ax[0])
)


# Boolean-mask selection: the rows whose `total_bill` is greater than 20.
df[df['total_bill'] > 20]

# Equivalent to the mask above, written as a `query` expression string.
df.query('total_bill > 20')


# Summary statistics (`describe`) of the selected columns, computed
# separately for each `matchType`.
cols = ['numGroups', 'maxPlace']
desc1 = train.groupby(by='matchType')[cols].describe()

# count() 和 size() 都是用来计数的,但返回的类型不一样:groupby 之后 count() 返回 DataFrame(按列统计非空值的个数),size() 返回 Series(每个 group 的行数)

# first() 的含义是获取每个 group 的第一条记录


# 下面这两句话是一个意思,都是统计每个比赛下group的多少


# `nunique()` on a single grouped column yields a Series, which carries no
# column label; `to_frame` converts it into a one-column DataFrame whose
# column is named 'group in match'.
(
    train.groupby(['matchType', 'matchId'])['groupId']
    .nunique()
    .to_frame(name='group in match')
)

# Print the five largest values of the 'players in group' column.
# NOTE(review): `group` is not defined anywhere in the visible file — confirm
# where it is built before running this line.
print(group['players in group'].nlargest(5))

# Left axes: histogram of every match duration.
# Right axes: only the durations inside [1400, 1800].
# The figure handle is named `fig` (it was misspelled `fix`), consistent with
# the other `plt.subplots` call in this file.
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
train['matchDuration'].plot(kind='hist', ax=ax[0])
train.query('matchDuration >= 1400 & matchDuration <= 1800')['matchDuration'].plot(kind='hist', ax=ax[1])


# The same two histograms drawn through the pyplot interface.
# Set the figure size.
plt.figure(figsize=(12, 4))

# Create the left axes and draw all match durations.
# NOTE(review): the axes-creation calls were missing, so both histograms
# landed on one axes; a 1x2 layout is assumed — confirm.
plt.subplot(1, 2, 1)
plt.hist(x=train['matchDuration'], bins=50)

# Create the right axes and draw only the durations inside [1400, 1800].
plt.subplot(1, 2, 2)
plt.hist(x=train.query('matchDuration >= 1400 & matchDuration <= 1800')['matchDuration'], bins=50)

# any() 函数用于判断给定的可迭代参数 iterable 中是否至少有一个元素为 True:如果所有元素都为 False(或 iterable 为空),则返回 False;只要有一个元素为 True,就返回 True。
# 元素除了 0、空(空字符串、空容器)、None、False 之外都算 True。





# 还可以进行选择,例如选择 train 中 matchType 包含 'solo' 的所有行:train[train['matchType'].str.contains('solo')]


>>> df = pd.DataFrame({
...     'col1' : ['A', 'A', 'B', np.nan, 'D', 'C'],
...     'col2' : [2, 1, 9, 8, 7, 4],
...     'col3': [0, 1, 9, 4, 2, 3],
... })
>>> df
    col1 col2 col3
0   A    2    0
1   A    1    1
2   B    9    9
3   NaN  8    4
4   D    7    2
5   C    4    3

>>> df.sort_values(by=['col1'])
    col1 col2 col3
0   A    2    0
1   A    1    1
2   B    9    9
5   C    4    3
4   D    7    2
3   NaN  8    4


# On every row where `col` holds a value of at least 5, overwrite that value
# with the label '5+'.
sub.loc[sub[col].ge(5), col] = '5+'


# 简单的说,data用来传入dataframe,index用来选择行,values用来选择查看哪些列,columns用来增加列的层次,aggfunc用来选择聚合方法,默认是np.mean

# Sample data for the `pivot_table` examples.
# Column "C" had lost its last element and its closing bracket; it is restored
# from the table printed right after this literal (C = 1, 1, 1, 2, 2, 1, 2, 2, 1).
df = pd.DataFrame({
    "A": [1, 1, 1, 1, 1, 2, 2, 2, 2],
    "B": [1, 1, 1, 1, 2, 1, 1, 2, 2],
    "C": [1, 1, 1, 2, 2, 1, 2, 2, 1],
    "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
})

	A	B	C	D
0	1	1	1	1
1	1	1	1	2
2	1	1	1	2
3	1	1	2	3
4	1	2	2	3
5	2	1	1	4
6	2	1	2	5
7	2	2	2	6
8	2	2	1	7
pd.pivot_table(data = df, index = 'A')

     B    C    D
1  1.2  1.4  2.2
2  1.5  1.5  5.5
pd.pivot_table(data = df, index = 'A',values = ['B','C'])

     B    C
1  1.2  1.4
2  1.5  1.5
print(pd.pivot_table(data = df, index = 'A',values = ['B','C'],columns = 'D'))

     B                                  C                              
D    1    2    3    4    5    6    7    1    2    3    4    5    6    7
1  1.0  1.0  1.5  NaN  NaN  NaN  NaN  1.0  1.0  2.0  NaN  NaN  NaN  NaN
2  NaN  NaN  NaN  1.0  1.0  2.0  2.0  NaN  NaN  NaN  1.0  2.0  2.0  1.0