样本数据如下:
                     pollution  dew  temp   press wnd_dir  wnd_spd  snow  rain
date
2010-01-02 00:00:00      129.0  -16  -4.0  1020.0      SE     1.79     0     0
2010-01-02 01:00:00      148.0  -15  -4.0  1020.0      SE     2.68     0     0
2010-01-02 02:00:00      159.0  -11  -5.0  1021.0      SE     3.57     0     0
2010-01-02 03:00:00      181.0   -7  -5.0  1022.0      SE     5.36     1     0
2010-01-02 04:00:00      138.0   -7  -5.0  1022.0      SE     6.25     2     0
...                        ...  ...   ...     ...     ...      ...   ...   ...
2014-12-31 19:00:00        8.0  -23  -2.0  1034.0      NW   231.97     0     0
2014-12-31 20:00:00       10.0  -22  -3.0  1034.0      NW   237.78     0     0
2014-12-31 21:00:00       10.0  -22  -3.0  1034.0      NW   242.70     0     0
2014-12-31 22:00:00        8.0  -22  -4.0  1034.0      NW   246.72     0     0
2014-12-31 23:00:00       12.0  -21  -3.0  1034.0      NW   249.85     0     0

[43800 rows x 8 columns]
根据特征dew、temp、press、wnd_dir、wnd_spd、snow、rain预测pollution
如何在Python中将时间序列问题转换成有监督学习问题?
import numpy as np
import pandas as pd


def series_to_supervised(data, columns, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a supervised learning dataset.

    Each output row contains the ``n_in`` lagged observations, followed by
    the current observation and ``n_out - 1`` future observations.

    Arguments:
        data: Sequence of observations as a list, NumPy array, or any other
            input accepted by ``pd.DataFrame`` (1-D for a single variable,
            2-D with one column per variable).
        columns: Base names of the variables, one per column of ``data``.
            Generated columns are named ``<name><pos>(t-k)``,
            ``<name><pos>(t)`` and ``<name><pos>(t+k)``, where ``pos`` is
            the 1-based position of the variable.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values
            (the rows at the edges that shifting leaves incomplete).

    Returns:
        Pandas DataFrame of series framed for supervised learning. When
        ``dropnan`` is False the incomplete edge rows are kept.
    """
    df = pd.DataFrame(data)
    # Take the variable count from the frame itself so that flat lists,
    # lists of rows, arrays and DataFrames are all labelled correctly.
    n_vars = df.shape[1]
    cols, names = [], []

    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['%s%d(t-%d)' % (columns[j], j + 1, i) for j in range(n_vars)]

    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['%s%d(t)' % (columns[j], j + 1) for j in range(n_vars)]
        else:
            names += ['%s%d(t+%d)' % (columns[j], j + 1, i) for j in range(n_vars)]

    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names

    # Always return a frame: previously dropnan=False left the result unbound.
    if dropnan:
        return agg.dropna()
    return agg


if __name__ == '__main__':
    values = np.array([[25, 17, 20, 18],
                       [13, 17, 26, 11],
                       [22, 26, 31, 19],
                       [18, 19, 35, 46]])
    data = series_to_supervised(values, ['temp', 'lr', 'rw', 'dir'], 2)
将数据处理成LSTM能输入的格式
import pandas as pd
from util import PROCESS_LEVEL1
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from series_to_supervised_learning import series_to_supervised

pd.options.display.expand_frame_repr = False


def data_helper():
    """Load the preprocessed CSV and shape it into LSTM-ready arrays.

    The CSV at ``PROCESS_LEVEL1`` is read with its first column as the
    index, column 4 is label-encoded, every column is min-max scaled to
    [0, 1], and the result is framed as a one-step supervised problem in
    which the lagged (t-1) columns are the inputs and the first (t) column
    is the target. The first ``365 * 24`` rows form the training split and
    the remainder the test split.

    Returns:
        Tuple ``(scaler, test_x, train_X, train_y, test_X, test_y)`` where
        ``scaler`` is the fitted ``MinMaxScaler``, ``test_x`` is the 2-D
        test input, ``train_X``/``test_X`` are the inputs reshaped to
        ``[samples, 1, features]`` and ``train_y``/``test_y`` are the
        targets.
    """
    frame = pd.read_csv(PROCESS_LEVEL1, header=0, index_col=0)
    print(frame)
    feature_names = frame.columns
    raw = frame.values

    # Label-encode column 4 (wind direction, per the original author's note);
    # one-hot encoding would be an alternative.
    direction_encoder = LabelEncoder()
    raw[:, 4] = direction_encoder.fit_transform(raw[:, 4])
    raw = raw.astype('float32')

    # Normalise every column. The scaler is returned to the caller;
    # presumably inverse_transform later needs the same column count
    # (8 according to the original comment) -- verify against the caller.
    scaler = MinMaxScaler(feature_range=(0, 1))
    normalised = scaler.fit_transform(raw)

    # Frame the series as supervised data with one lag step and one output step.
    supervised = series_to_supervised(normalised, feature_names, 1, 1)

    # Keep the (t-1) columns plus the first (t) column only; columns 9..15
    # (the remaining (t) columns) are not predicted.
    surplus = supervised.columns[list(range(9, 16))]
    supervised.drop(surplus, axis=1, inplace=True)

    samples = supervised.values
    split_at = 365 * 24
    train, test = samples[:split_at, :], samples[split_at:, :]

    # The last column is the target, everything before it is the input.
    train_x, train_y = train[:, :-1], train[:, -1]
    test_x, test_y = test[:, :-1], test[:, -1]
    print(type(train_x))

    # LSTM layers expect 3-D input: [samples, timesteps, features].
    train_rows, train_feats = train_x.shape[0], train_x.shape[1]
    train_X = train_x.reshape((train_rows, 1, train_feats))
    print(train_X.shape)
    test_rows, test_feats = test_x.shape[0], test_x.shape[1]
    test_X = test_x.reshape((test_rows, 1, test_feats))

    return scaler, test_x, train_X, train_y, test_X, test_y
模型预测代码