Main reference: Li Mu et al., Dive into Deep Learning (Berkeley course textbook)
Abstract: use an MLP with fully connected / Softmax output layers to implement regression (Kaggle house-price prediction) and classification (Fashion-MNIST recognition)
Regression Prediction
Data Preprocessing
- Import dependencies
import d2lzh as d2l
import numpy as np
import pandas as pd
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import data as gdata, loss as gloss, nn
Dataset Size
train_data = pd.read_csv('data/kaggle_house_price_prediction/train.csv')
test_data = pd.read_csv('data/kaggle_house_price_prediction/test.csv')
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))
- The training set contains 1460 samples, 80 features, and 1 label;
- The test set contains 1459 samples and 80 features;
- Feature values include continuous numbers (numerical features), discrete labels (categorical features), and even missing values ("na");
- All features are concatenated so they can be preprocessed in one pass (a quick shape check follows).
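A quick sanity check of these sizes, continuing from the code above:
print(train_data.shape)    # (1460, 81): Id + 79 features + SalePrice
print(test_data.shape)     # (1459, 80): Id + 79 features
print(all_features.shape)  # (2919, 79): merged features, before one-hot encoding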
Numerical Features
- Standardize each numerical feature to zero mean and unit variance:
index = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[index] = all_features[index].apply(lambda x: (x - x.mean()) / (x.std()))
- Missing values: after standardization each feature has zero mean, so missing entries can simply be filled with 0 (equivalent to imputing the mean):
all_features[index] = all_features[index].fillna(0)
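A toy illustration (hypothetical data) of why filling with 0 after standardization amounts to mean imputation:
import pandas as pd

s = pd.Series([1.0, 2.0, None, 4.0])   # toy column with one missing value
z = (s - s.mean()) / s.std()           # mean/std ignore NaN by default
print(z.fillna(0))                     # the NaN becomes 0, i.e. the column mean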
Categorical Features
Use pd.get_dummies for one-hot encoding; dummy_na=True treats the missing value as one more category. For example, if the feature MSZoning contains the categories RL and RM plus missing values NaN, it is expanded into MSZoning_RL, MSZoning_RM, and MSZoning_nan before one-hot encoding. This grows the feature count from 79 to 331.
all_features = pd.get_dummies(all_features, dummy_na=True)
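A minimal demo of dummy_na=True, reusing the MSZoning example above with toy data:
import pandas as pd

df = pd.DataFrame({'MSZoning': ['RL', 'RM', None]})
print(pd.get_dummies(df, dummy_na=True))
# columns MSZoning_RL, MSZoning_RM, MSZoning_nan; exactly one 1 per row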
Re-splitting the Dataset
- Use the DataFrame.values attribute to obtain the data as a NumPy array, then convert it to nd.NDArray for training:
n_train = train_data.shape[0]
train_features = nd.array(all_features[:n_train].values)
test_features = nd.array(all_features[n_train:].values)
train_labels = nd.array(train_data.SalePrice.values).reshape((-1, 1))
Model Construction
- The input size (331) does not need to be specified; it is inferred on the first forward pass (see the check after the code)
- Dropout probabilities close to the input layer should be kept small
def get_net():
    net = nn.Sequential()
    net.add(nn.Dense(360, activation='relu'),
            nn.Dropout(0.2),   # smaller dropout near the input
            nn.Dense(64, activation='relu'),
            nn.Dropout(0.5),
            nn.Dense(1))
    net.initialize(init.Normal(sigma=0.01))
    return net
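A quick check of the deferred initialization mentioned above (a sketch; the fake batch simply has the expected 331 features):
net = get_net()
X = nd.random.normal(shape=(4, 331))  # fake batch of 4 samples
print(net(X).shape)                   # (4, 1): weights are shaped on first use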
Cross-Validating Hyperparameters
- Training function:
loss = gloss.L2Loss()

def log_rmse(net, features, labels):
    # Clip predictions below 1 to 1 so taking the log is numerically stable
    clipped_preds = nd.clip(net(features), 1, float('inf'))
    # L2Loss is (y_hat - y)^2 / 2, hence the factor 2 before the square root
    rmse = nd.sqrt(2 * loss(clipped_preds.log(), labels.log()).mean())
    return rmse.asscalar()
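A small check of that factor of 2 with toy values (the two prints should match):
y_hat = nd.array([[100.0], [200.0]])
y = nd.array([[110.0], [190.0]])
print(nd.sqrt((y_hat.log() - y.log()).square().mean()))  # RMSE on the log scale
print(nd.sqrt(2 * loss(y_hat.log(), y.log()).mean()))    # via L2Loss, same value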
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []
    train_iter = gdata.DataLoader(gdata.ArrayDataset(
        train_features, train_labels), batch_size, shuffle=True)
    # Adam is relatively insensitive to the learning rate
    trainer = gluon.Trainer(net.collect_params(), 'adam',
                            {'learning_rate': learning_rate, 'wd': weight_decay})
    for epoch in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                l = loss(net(X), y)
            l.backward()
            trainer.step(batch_size)  # normalizes the summed gradient by batch_size
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls
- Cross-validation:
def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = X.shape[0] // k
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = nd.concat(X_train, X_part, dim=0)
            y_train = nd.concat(y_train, y_part, dim=0)
    return X_train, y_train, X_valid, y_valid
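A quick shape check of the folds, continuing from the data above (1460 // 5 = 292 samples per fold):
X_tr, y_tr, X_val, y_val = get_k_fold_data(5, 0, train_features, train_labels)
print(X_tr.shape, X_val.shape)  # (1168, 331) and (292, 331)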
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay, batch_size):
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        if i == 0:
            d2l.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse',
                         range(1, num_epochs + 1), valid_ls,
                         ['train', 'valid'])
        print('fold %d, train rmse %f, valid rmse %f'
              % (i, train_ls[-1], valid_ls[-1]))
    return train_l_sum / k, valid_l_sum / k
k=5; num_epochs=100; lr=0.01; weight_decay=20; batch_size=64
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)
print('%d-fold validation: avg train rmse %f, avg valid rmse %f' % (k, train_l, valid_l))
Training the Model
Re-use the train function from the cross-validation section, now on the full training set:
net = get_net()
train_ls, _ = train(net, train_features, train_labels, None, None,
                    num_epochs, lr, weight_decay, batch_size)
d2l.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse')
print('train rmse %f' % train_ls[-1])
Prediction
- The net output has shape = (1459, 1)
- After .asnumpy() its type is numpy.ndarray
preds = net(test_features).asnumpy()
test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)  # two-column submission frame
submission.to_csv('submission.csv', index=False)
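A quick sanity check of the file just written (assuming the path above):
print(pd.read_csv('submission.csv').head())  # columns: Id, SalePrice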
Image Classification
- Import dependencies
import sys,os
import d2lzh as d2l
import mxnet as mx
from matplotlib import pyplot as plt
from mxnet import nd, autograd, gluon, init
from mxnet.gluon import data as gdata
from mxnet.gluon import loss as gloss
from mxnet.gluon import nn
Data Preprocessing
Loading the Dataset
- The most commonly used image-classification dataset is MNIST (handwritten digits), but most models already exceed 95% accuracy on it. To observe differences between algorithms more clearly, we use the visually more complex Fashion-MNIST. It covers 10 classes: t-shirt, trouser, pullover, dress, coat, sandal, shirt, sneaker, bag, and ankle boot.
- In the Gluon API, the data module loads datasets and iterates over them batch by batch: mxnet.gluon.data is the data-loading API that produces Dataset objects, mxnet.gluon.data.vision is dedicated to computer-vision datasets, and mxnet.gluon.data.DataLoader returns batches of a given size (see the post "gluon模块进行数据加载-Dataset和DataLoader" for details; a toy Dataset/DataLoader sketch follows this list).
- The dataset is loaded into the specified directory; if the files already exist, they are not downloaded again.
- Datasets loaded via gluon.data yield 28 x 28 grayscale images, converted from the raw pictures.
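A minimal Dataset -> DataLoader sketch with toy arrays (not the Fashion-MNIST code below):
toy = gdata.ArrayDataset(nd.arange(10).reshape(10, 1), nd.arange(10))
for Xb, yb in gdata.DataLoader(toy, batch_size=4):
    print(Xb.shape, yb.shape)  # batches of 4, 4 and 2 samples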
root=os.path.join('~', '.mxnet', 'datasets', 'fashion-mnist')
root = os.path.expanduser(root)
mnist_train = gdata.vision.FashionMNIST(root=root, train=True)
mnist_test = gdata.vision.FashionMNIST(root=root, train=False)
A Dataset object has four methods (exercised in the sketch below):
- __getitem__(idx): returns the idx-th sample
- __len__(): returns the number of samples in the dataset
- transform(fn, lazy=True): returns a Dataset in which every sample is transformed (augmented) by fn
- transform_first(fn, lazy=True): returns a Dataset in which fn transforms only each sample's features, leaving the label untouched
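Exercising these methods on the dataset loaded above (a short sketch):
print(len(mnist_train))      # __len__: 60000 training samples
img, label = mnist_train[0]  # __getitem__
print(img.shape, label)      # (28, 28, 1) uint8 image and its class index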
Displaying a Sample
- img_0.shape = (28, 28, 1), img_0.dtype = numpy.uint8, pixel range [0, 255]; label_0 = 2
text_labels = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat',
'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
img_0, label_0 = mnist_train[0]
img_0 = img_0.reshape(28,28).asnumpy()
plt.imshow(img_0, cmap="gray"), plt.axis("off"), plt.title(text_labels[label_0])
plt.show()
Image Augmentation
- The Gluon data API provides augmentation layers in mxnet.gluon.data.vision.transforms. Each transform layer is a Block subclass whose input is a sample and whose output is the transformed sample.
- Important transforms:
  - Cast: convert the data type
  - ToTensor: change the image array layout from M×N×C to C×M×N
  - Normalize: standardize each pixel of a C×M×N image by mean and standard deviation
  - RandomResizedCrop: random crop
  - Resize: resize
  - RandomFlipLeftRight: random horizontal flip
  - RandomFlipTopBottom: random vertical flip, less widely applicable than a horizontal flip
  - RandomColorJitter: randomly jitter image brightness, contrast, saturation, and hue
- Chain multiple transforms with transforms.Compose([...]):
transformer = gdata.vision.transforms.Compose([
    gdata.vision.transforms.RandomFlipLeftRight(),
    gdata.vision.transforms.ToTensor(),
    gdata.vision.transforms.Normalize(),  # default mean=0, std=1, effectively a no-op here
])
mnist_train = mnist_train.transform_first(transformer)
mnist_test = mnist_test.transform_first(transformer)
- The transformed image: shape = (1, 28, 28), dtype = 'float32', (min, max) = (0, 1); verified below
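A sketch verifying the stated shape, dtype, and value range on the first transformed sample:
img, label = mnist_train[0]  # transform_first applies `transformer` lazily
print(img.shape, img.dtype)  # (1, 28, 28) float32
print(img.min().asscalar(), img.max().asscalar())  # values within [0, 1]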
Model Construction
def get_net():
    net = nn.Sequential()
    net.add(nn.Dense(256, activation='relu'),
            nn.Dense(10))
    net.initialize(init.Normal(sigma=0.01))
    return net
net = get_net()
Model Training
- Build the mini-batch data iterators
batch_size = 256
num_workers = 0 if sys.platform.startswith('win32') else 4
train_iter = gdata.DataLoader(mnist_train, batch_size, shuffle=True, num_workers=num_workers)
test_iter = gdata.DataLoader(mnist_test, batch_size, shuffle=False, num_workers=num_workers)  # no need to shuffle for evaluation
- Build the loss and trainer
lr = 0.01
loss = gloss.SoftmaxCrossEntropyLoss()
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr})
def evaluate_accuracy(data_iter, net):
    acc_sum, n = 0.0, 0
    for X, y in data_iter:
        y = y.astype('float32')
        acc_sum += (net(X).argmax(axis=1) == y).sum().asscalar()
        n += y.size
    return acc_sum / n
- Train for num_epochs epochs, reporting test accuracy after each one
num_epochs = 5
for epoch in range(num_epochs):
    train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
    for X, y in train_iter:
        with autograd.record():
            y_hat = net(X)
            l = loss(y_hat, y).sum()  # summed loss over the batch
        l.backward()
        trainer.step(batch_size)      # step normalizes by batch_size
        y = y.astype('float32')
        train_l_sum += l.asscalar()
        train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
        n += y.size
    test_acc = evaluate_accuracy(test_iter, net)
    print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
          % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))
Output:
epoch 1, loss 0.5600, train acc 0.793, test acc 0.847
epoch 2, loss 0.4028, train acc 0.854, test acc 0.852
epoch 3, loss 0.3734, train acc 0.863, test acc 0.866
epoch 4, loss 0.3512, train acc 0.871, test acc 0.868
epoch 5, loss 0.3410, train acc 0.875, test acc 0.872
Model Prediction
Prediction works the same as in the evaluation code above; listed separately here:
out = net(X)
y_pred = out.argmax(axis=1)
Output:
dtype = numpy.float32
out.shape = (256, 10), y_pred.shape = (256,)
out[0] = [ -8.695162 -16.209768 ... -5.969324 ], y_pred[0] = [3.]
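Note that out holds raw logits (hence the negative scores): SoftmaxCrossEntropyLoss applies the softmax internally. To get probabilities explicitly (a sketch on a fake batch):
logits = net(nd.random.normal(shape=(2, 1, 28, 28)))  # Dense flattens the input
probs = nd.softmax(logits)
print(probs.sum(axis=1))  # each row sums to 1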
Computing accuracy:
acc_sum = 0.0; n = 0
for X, y in test_iter:
    y = y.astype('float32')
    out = net(X)
    y_pred = out.argmax(axis=1)
    acc_sum += (y_pred == y).sum().asscalar()
    n += y.size
accuracy = acc_sum / n
print(f"accuracy = {accuracy*100:.2f}%")