样本数据如下:
pollution dew temp press wnd_dir wnd_spd snow rain
date
2010-01-02 00:00:00 129.0 -16 -4.0 1020.0 SE 1.79 0 0
2010-01-02 01:00:00 148.0 -15 -4.0 1020.0 SE 2.68 0 0
2010-01-02 02:00:00 159.0 -11 -5.0 1021.0 SE 3.57 0 0
2010-01-02 03:00:00 181.0 -7 -5.0 1022.0 SE 5.36 1 0
2010-01-02 04:00:00 138.0 -7 -5.0 1022.0 SE 6.25 2 0
... ... ... ... ... ... ... ...
2014-12-31 19:00:00 8.0 -23 -2.0 1034.0 NW 231.97 0 0
2014-12-31 20:00:00 10.0 -22 -3.0 1034.0 NW 237.78 0 0
2014-12-31 21:00:00 10.0 -22 -3.0 1034.0 NW 242.70 0 0
2014-12-31 22:00:00 8.0 -22 -4.0 1034.0 NW 246.72 0 0
2014-12-31 23:00:00 12.0 -21 -3.0 1034.0 NW 249.85 0 0
[43800 rows x 8 columns]
任务:根据特征 dew、temp、press、wnd_dir、wnd_spd、snow、rain 预测 pollution。
如何用 Python 将时间序列问题转换成有监督学习问题?
import pandas as pd
def series_to_supervised(data, columns, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.

    Each output row holds the observations of the n_in previous steps
    followed by the observations of the current step and the next
    n_out - 1 steps.

    Arguments:
        data: Sequence of observations as a list (flat or list of rows) or
            a NumPy array; anything pandas.DataFrame accepts.
        columns: Indexable collection of variable names, one per column of
            data. Output columns are named '<name><position>(t-k)',
            '<name><position>(t)' or '<name><position>(t+k)', where
            position is the 1-based column number.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame of series framed for supervised learning.
    """
    df = pd.DataFrame(data)
    # Take the variable count from the frame itself so flat lists, lists of
    # rows and 1-D/2-D arrays are all handled consistently.
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['%s%d(t-%d)' % (columns[j], j + 1, i) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['%s%d(t)' % (columns[j], j + 1) for j in range(n_vars)]
        else:
            names += ['%s%d(t+%d)' % (columns[j], j + 1, i) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # Shifting leaves NaN in the first n_in and last n_out - 1 rows; drop
    # them only on request, and always return a DataFrame either way.
    if dropnan:
        return agg.dropna()
    return agg
import numpy as np
if __name__ == '__main__':
    # Demo input: 4 time steps of 4 variables, framed with two lag steps.
    values = np.array([[25, 17, 20, 18], [13, 17, 26, 11], [22, 26, 31, 19], [18, 19, 35, 46]])
    data = series_to_supervised(values, ['temp', 'lr', 'rw', 'dir'], 2)

# Next part of the write-up: shape the data into the format an LSTM accepts.
import pandas as pd
from util import PROCESS_LEVEL1
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from series_to_supervised_learning import series_to_supervised
pd.options.display.expand_frame_repr = False
def data_helper():
    """
    Load the dataset at PROCESS_LEVEL1 and prepare train/test arrays.

    Steps: read the CSV, label-encode column index 4, cast to float32,
    min-max scale every column to [0, 1], frame the result with one lag
    step via series_to_supervised, split at 365 * 24 rows, and reshape the
    inputs to [samples, timesteps, features] with a single timestep.

    Returns:
        Tuple (scaler, test_x, train_X, train_y, test_X, test_y) where
        scaler is the fitted MinMaxScaler, test_x is the 2-D test input and
        train_X / test_X are the 3-D reshaped inputs.
    """
    dataset = pd.read_csv(PROCESS_LEVEL1, header=0, index_col=0)
    print(dataset)
    feature_names = dataset.columns
    raw = dataset.values
    # Column index 4 (wind direction per the original note) is categorical:
    # map it to integer labels. One-hot encoding would be an alternative.
    raw[:, 4] = LabelEncoder().fit_transform(raw[:, 4])
    raw = raw.astype('float32')
    # The scaler is fitted on every column, so inverse_transform later needs
    # an array with the same number of columns.
    scaler = MinMaxScaler(feature_range=(0, 1))
    normalised = scaler.fit_transform(raw)
    # Turn the sequence into supervised-learning rows: (t-1) -> (t).
    supervised = series_to_supervised(normalised, feature_names, 1, 1)
    # Keep all (t-1) columns plus the first (t) column as the target; the
    # remaining (t) columns are dropped by position.
    # NOTE(review): indices assume 8 variables with the target first -- confirm.
    surplus = supervised.columns[[9, 10, 11, 12, 13, 14, 15]]
    supervised.drop(surplus, axis=1, inplace=True)
    samples = supervised.values
    # The first 365 * 24 rows form the training set, the rest the test set.
    split_at = 365 * 24
    train, test = samples[:split_at, :], samples[split_at:, :]
    # Last column is the target, everything before it is the input.
    train_x, train_y = train[:, :-1], train[:, -1]
    test_x, test_y = test[:, :-1], test[:, -1]
    print(type(train_x))
    # LSTM input must be 3-D: [samples, timesteps, features], one timestep.
    train_X = train_x.reshape((train_x.shape[0], 1, train_x.shape[1]))
    print(train_X.shape)
    test_X = test_x.reshape((test_x.shape[0], 1, test_x.shape[1]))
    return scaler, test_x, train_X, train_y, test_X, test_y

# Next part of the write-up: model prediction code.

京公网安备 11010502036488号