# Kaggle competition link: https://www.kaggle.com/c/titanic/kernels
#导入pandas用于数据分析
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the Titanic passenger data directly from the internet with pandas.
# NOTE(review): the vanderbilt.edu URL is no longer reliably served; the same
# dataset is mirrored at https://hbiostat.org/data/repo/titanic.txt — verify.
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
# titanic.info()
# Feature selection: sex, age and pclass are very likely the key factors
# deciding survival.  copy() gives an independent frame so the imputation
# below cannot trigger chained-assignment problems on a view of `titanic`.
X = titanic[['pclass', 'age', 'sex']].copy()
y = titanic['survived']
# Impute missing ages with the column mean.  Plain assignment replaces the
# original inplace fillna on a sliced frame, which raised
# SettingWithCopyWarning and is a no-op under pandas copy-on-write.
X['age'] = X['age'].fillna(X['age'].mean())
# X.info()
# Split the data into a training set and a 25% held-out test set.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=33)
# Feature transformation: one-hot encode the categorical columns by feeding
# row dictionaries to DictVectorizer.
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False)
# BUG FIX: the keyword value is 'records' (plural); orient='record' raises
# ValueError in modern pandas.
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
print(vec.feature_names_)
# Apply the identical (already-fitted) transformation to the test data.
X_test = vec.transform(X_test.to_dict(orient='records'))
# Train a decision tree classifier (default configuration) and predict on
# the held-out test features.
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
# Fit the model on the training data.
dtc.fit(X_train, y_train)
# Predict labels for the test features.
y_predict = dtc.predict(X_test)
# Model evaluation.
from sklearn.metrics import classification_report
# Accuracy on the held-out test set.
print(dtc.score(X_test, y_test))
# BUG FIX: classification_report expects (y_true, y_pred) in that order;
# the original passed predictions first, transposing precision and recall.
print(classification_report(y_test, y_predict, target_names=['died', 'survived']))
# Train a random forest ensemble on the same features and evaluate it.
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_y_pred = rfc.predict(X_test)
# Accuracy of the random forest on the held-out test set.
print(rfc.score(X_test, y_test))
# BUG FIX: (y_true, y_pred) argument order, matching the sklearn signature.
print(classification_report(y_test, rfc_y_pred, target_names=['died', 'survived']))
# Train a gradient-boosted decision tree ensemble and evaluate it.
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
gbc_y_pred = gbc.predict(X_test)
# Accuracy of the gradient-boosted trees on the held-out test set.
print(gbc.score(X_test, y_test))
# BUG FIX: (y_true, y_pred) argument order, matching the sklearn signature.
print(classification_report(y_test, gbc_y_pred, target_names=['died', 'survived']))
# Results of the three models, top to bottom (decision tree, random forest,
# gradient boosting):
# 0.781155015198
#              precision    recall  f1-score   support
#        died       0.91      0.78      0.84       236
#    survived       0.58      0.80      0.67        93
# avg / total       0.81      0.78      0.79       329
# 0.784194528875
#              precision    recall  f1-score   support
#           0       0.91      0.78      0.84       235
#           1       0.59      0.80      0.68        94
# avg / total       0.82      0.78      0.79       329
# 0.790273556231
#              precision    recall  f1-score   support
#           0       0.92      0.78      0.84       239
#           1       0.58      0.82      0.68        90
# avg / total       0.83      0.79      0.80       329
# coding=utf-8
import numpy as np
import pandas as pd

# Load the Kaggle Titanic train/test splits from local CSV files.
train = pd.read_csv("/home/just_sort/train.csv")
test = pd.read_csv("/home/just_sort/test.csv")

# Encode the port of embarkation as an integer; the two missing values in
# the training set default to 'S' (the most common port).
# `.loc[mask, col] = value` replaces the original chained assignment
# (train.Embarked.loc[...] = ...), which is unreliable under pandas
# copy-on-write.
Emap = {'S': 1, 'C': 2, 'Q': 3}
train.loc[pd.isnull(train.Embarked), 'Embarked'] = 'S'
datasets = [train, test]
for dataset in datasets:
    dataset["EmbarkedMap"] = dataset.Embarked.map(Emap)

# Cabin features: a binary HasCabin flag, plus the deck letter encoded as
# an integer (missing cabins map to 0).
CabinMap = {'NaN': 0, 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8}
for dataset in datasets:
    dataset['HasCabin'] = 0
    dataset.loc[pd.notnull(dataset.Cabin), 'HasCabin'] = 1
    dataset["CMap"] = dataset.Cabin.str[0].map(CabinMap)
    dataset["CMap"] = dataset["CMap"].fillna(0)

# Fill missing ages with random integers drawn from mean +/- one std.
# NOTE(review): np.random is unseeded here, so the imputation (and every
# downstream score) is not reproducible — consider np.random.seed(...).
trainages = np.random.randint(train.Age.mean() - train.Age.std(),
                              train.Age.mean() + train.Age.std(),
                              size=sum(pd.isnull(train.Age)))
testages = np.random.randint(test.Age.mean() - test.Age.std(),
                             test.Age.mean() + test.Age.std(),
                             size=sum(pd.isnull(test.Age)))
train.loc[pd.isnull(train.Age), 'Age'] = trainages
test.loc[pd.isnull(test.Age), 'Age'] = testages
# Binary gender flag and the honorific title (e.g. " Mr", " Miss")
# extracted from the passenger name.
for dataset in datasets:
    dataset["Male"] = 0
    dataset.loc[dataset.Sex == 'male', 'Male'] = 1
    # Raw string avoids the invalid-escape-sequence warning for "\.".
    dataset["Title"] = dataset.Name.str.extract(r'( [A-Za-z]+)\.', expand=False)

# Map each distinct title seen in the training set to an integer 1..17.
# NOTE(review): this assumes exactly 17 unique titles in the training data;
# titles present only in the test set map to NaN — verify.
d = dict(zip(np.unique(train.Title), np.arange(1, 18)))
for dataset in datasets:
    dataset['TMap'] = dataset.Title.map(d)
    # axis=1 keyword: positional axis arguments were removed in pandas 2.0.
    dataset.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
    # Collapse the rare title codes into a single bucket 0; only codes 9-12
    # survive as distinct categories.  isin() replaces the original eleven
    # separate chained .loc assignments.
    dataset.loc[dataset.TMap.isin([1, 2, 3, 4, 5, 6, 7, 8, 13, 14, 15]), 'TMap'] = 0

# The Kaggle test split has a single missing fare; impute it with the mean.
test.loc[pd.isnull(test.Fare), 'Fare'] = test.Fare.mean()

# Manual one-hot encoding of Pclass, embarkation port and title bucket.
for dataset in datasets:
    dataset['Pclass1'] = 0
    dataset['Pclass2'] = 0
    dataset['Pclass3'] = 0
    dataset.loc[dataset.Pclass == 1, 'Pclass1'] = 1
    dataset.loc[dataset.Pclass == 2, 'Pclass2'] = 1
    dataset.loc[dataset.Pclass == 3, 'Pclass3'] = 1
    dataset['EMap1'] = 0
    dataset['EMap2'] = 0
    dataset['EMap3'] = 0
    dataset.loc[dataset.EmbarkedMap == 1, 'EMap1'] = 1
    dataset.loc[dataset.EmbarkedMap == 2, 'EMap2'] = 1
    dataset.loc[dataset.EmbarkedMap == 3, 'EMap3'] = 1
    dataset['TMap1'] = 0
    dataset['TMap2'] = 0
    dataset['TMap3'] = 0
    dataset['TMap4'] = 0
    dataset['TMap5'] = 0
    dataset.loc[dataset.TMap == 0, 'TMap1'] = 1
    dataset.loc[dataset.TMap == 9, 'TMap2'] = 1
    dataset.loc[dataset.TMap == 10, 'TMap3'] = 1
    dataset.loc[dataset.TMap == 11, 'TMap4'] = 1
    dataset.loc[dataset.TMap == 12, 'TMap5'] = 1
train.info()

# Hold out 20% of the labelled data for validation.
from sklearn.model_selection import train_test_split
train1, train2 = train_test_split(train, test_size=0.20, random_state=33)

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, criterion='entropy')

# Columns that are labels/identifiers/raw intermediates, not features.
feature_drop = ['Survived', 'PassengerId', 'TMap', 'Title']
# BUG FIX: fit on the 80% split (train1), not the full `train` frame.
# The original fitted on every row, so scoring on train2 leaked the
# validation data into training and inflated the reported score.
clf.fit(train1.drop(feature_drop, axis=1), train1.Survived)
clf.score(train2.drop(feature_drop, axis=1), train2.Survived)
train2_pred = clf.predict(train2.drop(feature_drop, axis=1))

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(train2.Survived, train2_pred)

# Predict on the Kaggle test split and write the submission file.
arr = pd.Series(clf.predict(test.drop(['PassengerId', 'TMap', 'Title'], axis=1)),
                name='Survived')
df = pd.concat([test.PassengerId, arr], axis=1)
df.to_csv('pred1.csv', sep=',', index=False)