XGBoost算法原理解释(转载)

技术交流QQ群:1027579432，欢迎你的加入！

1.XGBoost算法原理

2.代码实战

# Load the training and test sets.
#
# Column notes carried over from the original author:
#     user_id:     no real meaning, just an identifier
#     shop_id:     the shop location; this is the prediction target
#     longitude:   longitude of the shop
#     latitude:    latitude of the shop
#     wifi_id:     identifier of the router
#     wifi_strong: wifi signal strength
#     con_sta:     wifi connection status
train = pd.read_csv(r"./dataset/train.csv")
train.head()
test = pd.read_csv(r"./dataset/test.csv")
test.head()

# Parse the raw timestamp column of each frame into datetimes.
# Each frame must be parsed from its OWN column: the previous code assigned
# the parsed *training* timestamps to the test frame, so every time feature
# derived for the test set came from unrelated training rows.
train['time_stamp'] = pd.to_datetime(train['time_stamp'])
test['time_stamp'] = pd.to_datetime(test['time_stamp'])
train.head()

# Derive calendar/time-of-day features from the parsed timestamp.
# The same four columns are added to both frames, so do it in one loop and
# use the vectorised `.dt` accessor throughout instead of a per-element
# `apply(lambda ...)` for year and month.
for frame in (train, test):
    stamps = frame['time_stamp']
    frame['Year'] = stamps.dt.year
    frame['Month'] = stamps.dt.month
    frame['weekday'] = stamps.dt.dayofweek  # Monday == 0 ... Sunday == 6
    frame['time'] = stamps.dt.time          # datetime.time objects (object dtype)

# Remove the raw timestamp column from both frames (axis=1 selects columns).
train = train.drop("time_stamp", axis=1)
test = test.drop("time_stamp", axis=1)

# Training rows containing any missing value are discarded (axis=0 selects rows).
train = train.dropna(axis=0)

# Test rows are kept and their gaps are forward-filled from the previous row.
# `ffill()` is the direct replacement for `fillna(method='pad')`, whose
# `method` argument is deprecated in recent pandas releases.
test = test.ffill()
train.head()
test.head()


# Convert every object-typed feature column to integer codes.
#
# The encoder for a column is fitted on the union of the training and test
# values so that a given raw value maps to the SAME integer in both frames.
# Fitting two independent encoders (one per frame) assigns unrelated codes to
# the same value, which makes the test features meaningless to a model
# trained on the training codes.
encoders = {}
for col in train.columns:
    # `shop_id` is the prediction target and is deliberately left untouched.
    if train[col].dtype == 'object' and col != 'shop_id':
        print("train col:\n", col)
        known_values = list(train[col].values)
        if col in test.columns and test[col].dtype == 'object':
            known_values += list(test[col].values)
        encoder = preprocessing.LabelEncoder()
        encoder.fit(known_values)
        train[col] = encoder.transform(list(train[col].values))
        encoders[col] = encoder
for col in test.columns:
    if test[col].dtype == 'object':
        print("test col:\n", col)
        encoder = encoders.get(col)
        if encoder is None:
            # Object column with no encoder from the training pass: encode it
            # on its own, as before.
            encoder = preprocessing.LabelEncoder()
            encoder.fit(list(test[col].values))
        test[col] = encoder.transform(list(test[col].values))

# Names of the model input columns: six time/location features followed by
# the (wifi_id, wifi_strong, con_sta) triple for each of the wifi slots 1..10,
# in slot order.
feature_columns_to_use = [
    'Year', 'Month', 'weekday', 'time', 'longitude', 'latitude',
] + [
    '{}{}'.format(prefix, slot)
    for slot in range(1, 11)
    for prefix in ('wifi_id', 'wifi_strong', 'con_sta')
]



# Restrict both frames to the model input columns.
big_train = train[feature_columns_to_use]
big_test = test[feature_columns_to_use]

# Convert to plain NumPy arrays for the estimator. `.values` is used because
# `DataFrame.as_matrix()` was removed in pandas 1.0.
train_X = big_train.values
test_X = big_test.values

# Prediction target.
train_y = train['shop_id']

# The decision trees built by XGBoost all operate on continuous (numeric)
# feature values.
# `verbosity=0` is the supported way to silence output; it replaces the
# deprecated `silent=1` flag.
gbm = xgb.XGBClassifier(verbosity=0, max_depth=10, n_estimators=1000, learning_rate=0.05)

# Recent XGBoost releases require class labels to be the integers
# 0..n_classes-1, so encode the target before fitting and map the predicted
# codes back to the original shop ids afterwards. `y_hat` therefore still
# holds original `shop_id` values.
target_encoder = preprocessing.LabelEncoder()
gbm.fit(train_X, target_encoder.fit_transform(train_y))
y_hat = target_encoder.inverse_transform(gbm.predict(test_X))

# Assemble the predictions in the layout used for the Kaggle submission:
# one row per test `row_id` with its predicted `shop_id`.
submission_columns = {'row_id': test['row_id'], 'shop_id': y_hat}
submission = pd.DataFrame(submission_columns)

print(submission)
# Write without the DataFrame index so only the two columns are emitted.
submission.to_csv(r"./dataset/submission.csv", index=False)