# Kaggle competition link: https://www.kaggle.com/c/titanic/kernels

# Import pandas for data analysis (plus numpy/matplotlib used later).
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the Titanic passenger data set directly from the web with pandas' read_csv.
# NOTE(review): this vanderbilt.edu URL is no longer reliably served; if the
# download fails, point read_csv at a local copy of titanic.txt instead.
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')

# titanic.info()
# Feature selection: sex, age and pclass are very likely the key predictors
# of survival.
# BUG FIX: take an explicit copy so the imputation below writes into our own
# frame instead of a view of `titanic` (chained assignment triggers
# SettingWithCopyWarning and may silently fail to write back).
X = titanic[['pclass', 'age', 'sex']].copy()
y = titanic['survived']

# Impute missing ages with the column mean; assigning the result back avoids
# the deprecated chained `inplace=True` pattern.
X['age'] = X['age'].fillna(X['age'].mean())
# X.info()
#X.info()

# Split the data into a training set and a test set (75% / 25%, fixed seed).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

# Feature transformation: one-hot encode the categorical columns (sex, pclass)
# while passing numeric columns (age) through unchanged.
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False)
# BUG FIX: DataFrame.to_dict expects orient='records' (plural); the misspelled
# 'record' raises a ValueError on current pandas versions.
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
print(vec.feature_names_)

# Apply the same (already fitted) transformation to the test features.
X_test = vec.transform(X_test.to_dict(orient='records'))

# Train a decision tree classifier and evaluate it on the held-out test set.
from sklearn.tree import DecisionTreeClassifier
# Initialize the classifier with default hyper-parameters.
dtc = DecisionTreeClassifier()
# Fit the model on the training data.
dtc.fit(X_train, y_train)
# Predict labels for the test features.
y_predict = dtc.predict(X_test)

# Model evaluation
from sklearn.metrics import classification_report
# Overall accuracy on the test set.
print(dtc.score(X_test, y_test))
# Detailed per-class metrics. BUG FIX: classification_report expects
# (y_true, y_pred) in that order; the arguments were previously swapped,
# which silently exchanges precision and recall in the report.
print(classification_report(y_test, y_predict, target_names=['died', 'survived']))

# Train a random forest classifier on the same split and evaluate it.
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_y_pred = rfc.predict(X_test)
# Accuracy of the random forest on the test set.
print(rfc.score(X_test, y_test))
# BUG FIX: pass (y_true, y_pred) in that order; swapping them exchanges
# precision and recall in the report.
print(classification_report(y_test, rfc_y_pred, target_names=['died', 'survived']))

# Train a gradient-boosted decision tree ensemble and evaluate it.
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
gbc_y_pred = gbc.predict(X_test)
# Accuracy of the gradient-boosted trees on the test set.
print(gbc.score(X_test, y_test))
# BUG FIX: pass (y_true, y_pred) in that order; swapping them exchanges
# precision and recall in the report.
print(classification_report(y_test, gbc_y_pred, target_names=['died', 'survived']))


# Test-set results of the three models above, from top to bottom
# (decision tree, random forest, gradient boosting):
#
# 0.781155015198
#              precision    recall  f1-score   support
#
#        died       0.91      0.78      0.84       236
#    survived       0.58      0.80      0.67        93
#
# avg / total       0.81      0.78      0.79       329
#
# 0.784194528875
#              precision    recall  f1-score   support
#
#           0       0.91      0.78      0.84       235
#           1       0.59      0.80      0.68        94
#
# avg / total       0.82      0.78      0.79       329
#
# 0.790273556231
#              precision    recall  f1-score   support
#
#           0       0.92      0.78      0.84       239
#           1       0.58      0.82      0.68        90
#
# avg / total       0.83      0.79      0.80       329
# coding=utf-8
import numpy as np
import pandas as pd

train = pd.read_csv("/home/just_sort/train.csv")

test = pd.read_csv("/home/just_sort/test.csv")

# Integer-encode the port of embarkation (S/C/Q -> 1/2/3).
Emap = {'S': 1, 'C': 2, 'Q': 3}
# Fill missing Embarked values with 'S'. BUG FIX: use a single
# df.loc[row_mask, column] assignment instead of chained indexing
# (train.Embarked.loc[...] = ...), which triggers SettingWithCopyWarning
# and may silently fail to write back on current pandas.
train.loc[pd.isnull(train.Embarked), 'Embarked'] = 'S'
datasets = [train, test]
for dataset in datasets:
    dataset["EmbarkedMap"] = dataset.Embarked.map(Emap)

# Encode the cabin deck letter; rows with no cabin end up NaN and are
# filled with 0 below. (The 'NaN' key never matches a real deck letter
# and is kept only for backward compatibility with the original mapping.)
CabinMap = {'NaN': 0, 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8}
for dataset in datasets:
    # Binary flag: 1 when a cabin is recorded for the passenger.
    dataset['HasCabin'] = 0
    dataset.loc[pd.notnull(dataset.Cabin), 'HasCabin'] = 1
    dataset["CMap"] = dataset.Cabin.str[0].map(CabinMap)
    dataset["CMap"] = dataset.CMap.fillna(0)

# Impute missing ages with random integers drawn from
# [mean - std, mean + std). BUG FIX: np.random.randint requires integer
# bounds on recent NumPy versions, so truncate the float bounds explicitly
# (matching the implicit truncation older NumPy performed).
trainages = np.random.randint(int(train.Age.mean() - train.Age.std()),
                              int(train.Age.mean() + train.Age.std()),
                              size=sum(pd.isnull(train.Age)))
testages = np.random.randint(int(test.Age.mean() - test.Age.std()),
                             int(test.Age.mean() + test.Age.std()),
                             size=sum(pd.isnull(test.Age)))
train.loc[pd.isnull(train.Age), 'Age'] = trainages
test.loc[pd.isnull(test.Age), 'Age'] = testages
for dataset in datasets:
    # Binary indicator for sex (1 = male).
    dataset["Male"] = 0
    dataset.loc[dataset.Sex == 'male', 'Male'] = 1
    # Extract the honorific ("Mr", "Mrs", ...) from the passenger name.
    # FIX: raw string avoids the invalid-escape-sequence warning for '\.'.
    dataset["Title"] = dataset.Name.str.extract(r'( [A-Za-z]+)\.', expand=False)
# Map each distinct training-set title to an integer code 1..17
# (arange(1, 18) assumes 17 distinct titles in the training data).
d = dict(zip(np.unique(train.Title), np.arange(1, 18)))
for dataset in datasets:
    dataset['TMap'] = dataset.Title.map(d)
    # Drop the raw text columns that have been encoded above.
    # FIX: use the axis keyword -- positional axis arguments to drop()
    # are removed in pandas 2.x.
    dataset.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
    # Collapse the rare titles (codes 1-8 and 13-15) into a single class 0,
    # keeping only the common titles (codes 9-12) distinct. This replaces
    # eleven copy-pasted chained-indexing assignments.
    dataset.loc[dataset.TMap.isin([1, 2, 3, 4, 5, 6, 7, 8, 13, 14, 15]), 'TMap'] = 0
# The single missing test-set fare gets the mean fare.
test.loc[pd.isnull(test.Fare), 'Fare'] = test.Fare.mean()
# One-hot encode Pclass, EmbarkedMap and the collapsed title code TMap.
# A boolean mask cast to int yields exactly the same 0/1 integer columns as
# the original initialize-to-0-then-set-1 chained-indexing pattern, without
# the SettingWithCopyWarning risk.
for dataset in datasets:
    # Pclass (1/2/3) -> Pclass1/Pclass2/Pclass3 indicator columns.
    for pclass in (1, 2, 3):
        dataset['Pclass%d' % pclass] = (dataset.Pclass == pclass).astype(int)
    # EmbarkedMap (1=S, 2=C, 3=Q) -> EMap1/EMap2/EMap3 indicator columns.
    for port in (1, 2, 3):
        dataset['EMap%d' % port] = (dataset.EmbarkedMap == port).astype(int)
    # TMap -> TMap1..TMap5 indicator columns for the collapsed class 0 and
    # the kept title codes 9-12. Unmapped (NaN) titles compare False and
    # therefore stay 0 in every dummy, matching the original behavior.
    for i, code in enumerate((0, 9, 10, 11, 12), start=1):
        dataset['TMap%d' % i] = (dataset.TMap == code).astype(int)

train.info()

# Hold out 20% of the training data for validation.
from sklearn.model_selection import train_test_split
train1, train2 = train_test_split(train, test_size=0.20, random_state=33)

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, criterion='entropy')

# Columns excluded from the features: the target, the row id, and the raw
# title columns superseded by the TMap1..TMap5 dummies.
drop_cols = ['Survived', 'PassengerId', 'TMap', 'Title']

# BUG FIX: fit on the 80% split (train1), not the full frame -- fitting on
# `train` leaks the validation rows (train2) into training and makes the
# validation score below meaningless.
# FIX: positional axis arguments to drop() are removed in pandas 2.x.
clf.fit(train1.drop(drop_cols, axis=1), train1.Survived)
# FIX: the original computed the validation accuracy but discarded it.
print(clf.score(train2.drop(drop_cols, axis=1), train2.Survived))
train2_pred = clf.predict(train2.drop(drop_cols, axis=1))

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(train2.Survived, train2_pred)
print(cm)

# Predict the Kaggle test set and write the submission file.
arr = pd.Series(clf.predict(test.drop(['PassengerId', 'TMap', 'Title'], axis=1)),
                name='Survived')
df = pd.concat([test.PassengerId, arr], axis=1)
df.to_csv('pred1.csv', sep=',', index=False)