Below is the relevant dataset. It is anonymized and has already been preprocessed to some degree, so we do not need to analyze or engineer the features ourselves.

If you want to run the experiment yourself, leave your email in the comments and I will send you the data. The code follows; what each step does is explained in detail in the comments.

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score

credit = pd.read_csv('./data/creditcard.csv')
# print(credit.shape)  # 284807 rows, 31 columns
print(credit.head())

# Check for missing values
# print(pd.isnull(credit).values.any())  # False, i.e. no missing values
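
# A finer-grained check (my addition, not in the original post): per-column
# null counts show which feature would be affected, should any nulls appear.
# print(credit.isnull().sum())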

# Look at the class distribution
count_classes = credit['Class'].value_counts(sort=True)
# print(count_classes)  # 0 = legitimate transactions, 284315 rows; 1 = fraud, 492 rows
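
# Optional sketch (my addition): make the imbalance explicit as a ratio, which
# motivates the resampling step further down.
fraud_ratio = count_classes[1] / count_classes.sum()
print('Fraud ratio: {:.4%}'.format(fraud_ratio))  # 492 / 284807, roughly 0.17%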

# Now check whether the data needs any preprocessing.
# Time carries no useful signal here, and Amount needs to be standardized.
credit['normAmount'] = StandardScaler().fit_transform(credit['Amount'].values.reshape(-1, 1))
# Then drop the original Amount and Time columns
credit = credit.drop(['Time', 'Amount'], axis=1)
# print(credit)
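
# Caveat worth noting (my addition): fitting StandardScaler on the full dataset
# leaks test-set statistics into training. A stricter variant fits the scaler
# on the training split only and applies it to the test split, along these lines:
#   scaler = StandardScaler().fit(x_train[['Amount']])
#   x_train['normAmount'] = scaler.transform(x_train[['Amount']])
#   x_test['normAmount'] = scaler.transform(x_test[['Amount']])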

# Shuffle the rows. This is optional; whether we shuffle or not makes little difference here.
columns_index = credit.columns  # save the column names; we need them to rebuild the DataFrame
credit = np.array(credit)
np.random.shuffle(credit)
credit = pd.DataFrame(credit, columns=columns_index)
# print(credit)
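
# Equivalent one-liner (my addition): pandas can shuffle and re-index in one
# call; commented out here to avoid shuffling twice.
# credit = credit.sample(frac=1, random_state=42).reset_index(drop=True)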

# Split features and labels
X = credit.loc[:, credit.columns != 'Class']
y = credit.loc[:, credit.columns == 'Class']
# print(X)
# print(y)

# Class 0 vastly outnumbers class 1, so we resample: undersample the majority
# class until legitimate and fraudulent transactions are equally represented.
bad_num = len(credit[credit['Class'] == 1])  # number of fraud rows
bad_indices = credit[credit['Class'] == 1].index
good_indices = credit[credit['Class'] == 0].index  # row indices of the legitimate transactions
# Randomly pick the same number of legitimate rows; replace=False ensures
# no row is selected twice
random_good_indices = np.random.choice(good_indices, bad_num, replace=False)
resample_indices = np.concatenate([bad_indices, random_good_indices])
# print(resample_indices)
resample_data = credit.loc[resample_indices, :]
# print(len(resample_data))
# print(resample_data)
resample_x = resample_data.loc[:, resample_data.columns != 'Class']
resample_y = resample_data.loc[:, resample_data.columns == 'Class']
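
# Alternative sketch (my addition, assumes the third-party imbalanced-learn
# package is installed): the same random undersampling in two lines.
# from imblearn.under_sampling import RandomUnderSampler
# resample_x, resample_y = RandomUnderSampler(random_state=42).fit_resample(X, y)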


# Split into train and test sets
# Split of the original (imbalanced) dataset
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
# Split of the resampled dataset
re_x_train, re_x_test, re_y_train, re_y_test = train_test_split(resample_x, resample_y, test_size=0.25, random_state=42)
# print(len(x_train))   # 213605
# print(len(re_x_train))   # 738
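
# Note (my addition): for the heavily imbalanced original data, a stratified
# split keeps the 0/1 ratio identical in train and test, e.g.:
# x_train, x_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.25, random_state=42, stratify=y)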

# At this point the data is ready.
# We have two datasets: the original one and the undersampled one.
# Next: model training and evaluation.

def print_kfold(x_train_data, y_train_data):

    fold = KFold(n_splits=5, shuffle=False)  # 5-fold cross-validation
    # Candidate values for the regularization strength C
    c_param_range = [0.01, 0.1, 1, 10, 100]
    result = pd.DataFrame(index=[0, 1, 2, 3, 4], columns=['C_parameter', 'Mean recall score'])
    result['C_parameter'] = c_param_range
    j = 0
    for c_param in c_param_range:
        print("*" * 100)
        print('C parameter:', c_param)

        recall_accs = []
        for train_index, test_index in fold.split(x_train_data):
            # liblinear is the solver that supports the l1 penalty
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
            lr.fit(x_train_data.iloc[train_index, :], y_train_data.iloc[train_index, :].values.ravel())
            y_pred = lr.predict(x_train_data.iloc[test_index, :])
            recall = recall_score(y_train_data.iloc[test_index, :], y_pred)
            recall_accs.append(recall)
            print("Fold recall:", recall)

        result.loc[j, 'Mean recall score'] = np.mean(recall_accs)
        j += 1
        print('Mean recall:', np.mean(recall_accs))

    print(result)
    result['Mean recall score'] = result['Mean recall score'].astype('float64')
    best_c = result.loc[result['Mean recall score'].idxmax()]['C_parameter']
    print("Best C parameter:", best_c)
    return best_c

best_c = print_kfold(re_x_train, re_y_train)
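
# Equivalent sketch using scikit-learn's built-in search (my addition; the
# hand-rolled loop above is more instructive, but this is the idiomatic form):
# from sklearn.model_selection import GridSearchCV
# grid = GridSearchCV(LogisticRegression(penalty='l1', solver='liblinear'),
#                     param_grid={'C': [0.01, 0.1, 1, 10, 100]},
#                     scoring='recall', cv=5)
# grid.fit(re_x_train, re_y_train.values.ravel())
# print(grid.best_params_['C'])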

# Now look at the confusion matrix. We evaluate on the held-out resampled test
# split (re_x_test), not on the data the model was trained on.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(re_x_train, re_y_train.values.ravel())
re_y_pred = lr.predict(re_x_test)
matrix = confusion_matrix(re_y_test, re_y_pred)
print("Confusion matrix:\n", matrix)
print("Precision:", precision_score(re_y_test, re_y_pred))
print("Recall:", recall_score(re_y_test, re_y_pred))
print("F1 score:", f1_score(re_y_test, re_y_pred))

The final experimental results: