技术交流QQ群:1027579432,欢迎你的加入!
1.XGBoost算法原理
2.代码实战
# Load the labelled training set and the unlabelled test set from ./dataset.
# NOTE(review): `pd` is presumably pandas imported as `pd`; no import statement
# is visible in this file -- confirm it is in scope before running.
#
# Column legend (as documented by the original author):
#   user_id     - carries no real meaning, just an identifier
#   shop_id     - the shop's location; this is the prediction target
#   longitude   - longitude of the shop
#   latitude    - latitude of the shop
#   wifi_id     - identifier of the router
#   wifi_strong - wifi signal strength
#   con_sta     - wifi connection status
train = pd.read_csv(filepath_or_buffer=r"./dataset/train.csv")
test = pd.read_csv(filepath_or_buffer=r"./dataset/test.csv")

# Preview the first rows of each frame (only displayed in an interactive session).
train.head()
test.head()
def _add_time_features(frame):
    """Expand ``frame['time_stamp']`` into calendar features.

    Parses the frame's own ``time_stamp`` column with ``pd.to_datetime`` and
    adds four columns to ``frame`` in place: ``Year``, ``Month``, ``weekday``
    (from ``.dt.dayofweek``) and ``time`` (from ``.dt.time``).

    Args:
        frame: DataFrame containing a ``time_stamp`` column.

    Returns:
        A new DataFrame equal to ``frame`` without the ``time_stamp`` column.
    """
    stamps = pd.to_datetime(frame['time_stamp'])
    frame['Year'] = stamps.dt.year
    frame['Month'] = stamps.dt.month
    frame['weekday'] = stamps.dt.dayofweek
    frame['time'] = stamps.dt.time
    # axis=1 removes a column; the raw timestamp is no longer needed.
    return frame.drop("time_stamp", axis=1)


# Each frame is expanded from its OWN timestamps. The previous code parsed
# train['time_stamp'] into test['time_stamp'], so every test row received the
# (index-aligned) training timestamp instead of its own.
train = _add_time_features(train)
test = _add_time_features(test)

# axis=0 removes rows: training rows with any missing value are discarded.
train = train.dropna(axis=0)
# Test rows are kept; gaps are forward-filled from the previous row.
# ffill() is the same operation as the deprecated fillna(method='pad').
test = test.ffill()

# Preview the first rows of each frame (only displayed in an interactive session).
train.head()
test.head()
# Integer-encode every object-dtype column so it can be fed to the model.
#
# One encoder per column is fitted on the union of the train and test values.
# Fitting separate encoders on each frame (as was done before) assigns
# unrelated integer codes to the same category in train and in test, so the
# encoded test features would not match what the model was trained on.
encoders = {}
for f in train.columns:
    # 'shop_id' is the prediction target and is left untouched here.
    if train[f].dtype == 'object' and f != 'shop_id':
        print("train col:\n", f)
        known_values = list(train[f].values)
        if f in test.columns and test[f].dtype == 'object':
            known_values += list(test[f].values)
        labels = preprocessing.LabelEncoder()
        labels.fit(known_values)
        encoders[f] = labels
        train[f] = labels.transform(list(train[f].values))
for f in test.columns:
    if test[f].dtype == 'object':
        print("test col:\n", f)
        labels = encoders.get(f)
        if labels is None:
            # Column has no shared encoder (absent from train, or skipped
            # there): encode it on its own values, as before.
            labels = preprocessing.LabelEncoder()
            labels.fit(list(test[f].values))
        test[f] = labels.transform(list(test[f].values))
# Columns fed to the model: the derived time features, the coordinates, and
# the (id, strength, connection status) triple for each of the ten wifi slots.
feature_columns_to_use = ['Year', 'Month', 'weekday',
                          'time', 'longitude', 'latitude',
                          'wifi_id1', 'wifi_strong1', 'con_sta1',
                          'wifi_id2', 'wifi_strong2', 'con_sta2',
                          'wifi_id3', 'wifi_strong3', 'con_sta3',
                          'wifi_id4', 'wifi_strong4', 'con_sta4',
                          'wifi_id5', 'wifi_strong5', 'con_sta5',
                          'wifi_id6', 'wifi_strong6', 'con_sta6',
                          'wifi_id7', 'wifi_strong7', 'con_sta7',
                          'wifi_id8', 'wifi_strong8', 'con_sta8',
                          'wifi_id9', 'wifi_strong9', 'con_sta9',
                          'wifi_id10', 'wifi_strong10', 'con_sta10']
big_train = train[feature_columns_to_use]
big_test = test[feature_columns_to_use]

# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
# NumPy array and is available on both old and new pandas versions.
train_X = big_train.values
test_X = big_test.values

# Prediction target.
train_y = train['shop_id']
# Author's note: the decision trees XGBoost builds treat every feature as
# continuous!
# NOTE(review): `silent` is passed through unchanged; whether the installed
# XGBoost version still accepts it is not visible from this file -- confirm.
model_params = {
    'silent': 1,
    'max_depth': 10,
    'n_estimators': 1000,
    'learning_rate': 0.05,
}
gbm = xgb.XGBClassifier(**model_params)

# Train on the training matrix, then predict a shop_id for every test row.
gbm.fit(train_X, train_y)
y_hat = gbm.predict(test_X)
# Build the table in the format submitted to Kaggle: one predicted shop_id
# per test row_id.
submission_columns = {'row_id': test['row_id'],
                      'shop_id': y_hat}
submission = pd.DataFrame(submission_columns)
print(submission)

# Write the predictions without the DataFrame index column.
submission.to_csv(r"./dataset/submission.csv", index=False)