3.5. 逻辑回归算法案例分析
逻辑回归算法案例分析
良/恶性乳腺癌肿瘤预测
原始数据的下载地址为:https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/
数据预处理
import pandas as pd
import numpy as np
# Column names taken from the dataset's official description: a sample id,
# nine cytological features, and the class label (in that order).
# NOTE: this must be a plain list -- a trailing comma after the closing
# bracket would wrap it in a 1-tuple and break both `names=` below and any
# later `column_names[i]` indexing.
column_names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]
# Read the actual data file rather than the directory listing; column names
# are supplied explicitly through `names`.
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    'breast-cancer-wisconsin/breast-cancer-wisconsin.data',
    names=column_names
)
# Replace the '?' placeholder with the standard missing-value marker (NaN).
data = data.replace(to_replace='?', value=np.nan)
# Drop every sample that has a missing value in at least one column.
data = data.dropna(how='any')
# Bare expression: displays (rows, columns) when run interactively.
data.shape
处理缺失值后的样本共有683条,特征包括细胞厚度、细胞大小、形状等九个维度。
准备训练测试数据
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; prefer `sklearn.model_selection` and fall back for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

# Columns 1..9 are the features and column 10 is the class label; column 0
# (the sample id) is deliberately left out. 25% of the samples are held out
# for testing, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data[column_names[1:10]],
    data[column_names[10]],
    test_size=0.25,
    random_state=42
)
# Inspect the number of samples per class in the training and test sets.
y_train.value_counts()
y_test.value_counts()
使用逻辑回归进行良/恶性肿瘤预测任务
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Standardize every feature to zero mean and unit variance so that no single
# feature with a large numeric range dominates the prediction.
ss = StandardScaler()
# Fit the scaling statistics on the training set only, then reuse them for
# the test set.
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Initialize the classifier. The solver is pinned to 'liblinear' because L1
# regularization is not supported by 'lbfgs', the default solver since
# scikit-learn 0.22; 'liblinear' was the default before that, so results on
# older versions are unchanged.
lr = LogisticRegression(C=1.0, penalty='l1', tol=0.01, solver='liblinear')
# Train the model parameters on the standardized training data.
lr.fit(X_train, y_train)
# Predict the class of every test sample.
lr_y_predict = lr.predict(X_test)
性能分析
from sklearn.metrics import classification_report

# `score` on a classifier returns accuracy on the given test data, so the
# printed label says accuracy (准确率) rather than precision (精确率).
# Single-argument print(...) calls behave identically on Python 2 and 3.
print('准确率为: %s' % lr.score(X_test, y_test))
# Per-class precision / recall / F1 report; `target_names` label the classes
# in the report output.
print(classification_report(y_test, lr_y_predict,
                            target_names=['Benign', 'Malignant']))