目前我正在处理如下所示的时间序列数据
数据由5家公司、15种产品(每家公司有3-5种产品)和6个不同地区组成
目标:构建一个单一的通用预测模型。
问题 1:只构建单个模型的做法是否正确?
我选择LSTM进行建模,
问题 2 LSTM 是正确的选择吗?如果不是,哪种方法适合此类数据?
为了训练 LSTM,我使用过去 120 天的数据来预测接下来的 30 天。参照这篇文章的做法,输入是过去 120 天中除销售额之外的所有数据,目标是未来 30 天的销售额。
我的方法
数据预处理(无需阅读整个类,只需查看 preprocess 函数即可了解整体流程,并查看 create_sequences 函数以了解输入和输出数据是如何构造的)
class DataPreprocess:
    """Turn a dated sales CSV into PyTorch loaders of (past window, future sales) pairs.

    The CSV is read with its ``Date`` column as a parsed index.  The methods
    reference the columns ``Company``, ``Product``, ``Region`` and ``Sales``;
    every remaining column (including the engineered calendar features) is
    used as a model input.
    """

    def __init__(self, data_path) -> None:
        """Read ``data_path`` into ``self.df`` indexed by the parsed ``Date`` column."""
        self.df = pd.read_csv(data_path, index_col='Date', parse_dates=True)

    def preprocess(self, check_model):
        """Run the whole pipeline and return ``(train_loader, test_loader)``.

        Steps, in order: calendar features, label encoding, per-series
        train/test split, scaling, and window/loader construction.
        ``check_model`` is forwarded to :meth:`get_dataloader`.
        """
        self.feature_extraction()
        self.encode_features()
        self.split_data()
        self.scale_features()
        return self.get_dataloader(check_model)

    def get_sin(self, X, period):
        """Return ``sin(2*pi*X/period)``; works on scalars and arrays."""
        return np.sin(2 * np.pi * (X / period))

    def get_cos(self, X, period):
        """Return ``cos(2*pi*X/period)``; works on scalars and arrays."""
        return np.cos(2 * np.pi * (X / period))

    def feature_extraction(self):
        """Add calendar features derived from the datetime index to ``self.df``.

        Adds ``DayOfWeek``, ``Seconds`` (POSIX timestamp of each index entry)
        and sin/cos pairs for weekly, monthly and yearly cycles.
        """
        index = self.df.index
        day_of_week = index.dayofweek.to_numpy()
        day_of_year = index.dayofyear.to_numpy()

        self.df['DayOfWeek'] = index.dayofweek
        self.df['Seconds'] = index.map(pd.Timestamp.timestamp)
        # The weekly cycle is keyed on the weekday.  Day-of-year modulo 7
        # shifts by one or two days every year, so the same weekday would get
        # a different encoding in different years.
        self.df['Week Sin'] = self.get_sin(day_of_week, 7)
        self.df['Week Cos'] = self.get_cos(day_of_week, 7)
        # NOTE(review): the monthly cycle is a fixed 30-day period over
        # day-of-year, i.e. only an approximation of calendar months.
        self.df['Month Sin'] = self.get_sin(day_of_year, 30)
        self.df['Month Cos'] = self.get_cos(day_of_year, 30)
        self.df['Year Sin'] = self.get_sin(day_of_year, 365)
        self.df['Year Cos'] = self.get_cos(day_of_year, 365)

    def encode_features(self):
        """Label-encode the categorical columns in place and pickle the encoders."""
        categorical_features = ['Company', 'Product', 'Region']
        encoders = {}
        for col in categorical_features:
            le = LabelEncoder()
            self.df[col] = le.fit_transform(self.df[col])
            encoders[col] = le
        self._dump_artifact(encoders, 'artifacts/LSTM/encoders.pkl')

    def split_data(self, test_size=200):
        """Split every (Product, Region) series into train and test parts.

        The last ``test_size`` rows of each group become ``self.test_df``;
        all earlier rows become ``self.train_df``.

        Raises:
            ValueError: if ``test_size`` is not positive (``head(-0)`` would
                silently produce an empty training set).
        """
        if test_size <= 0:
            raise ValueError('test_size must be a positive number of rows')
        grouped = self.df.groupby(['Product', 'Region'])
        # Copy so the in-place scaling later does not write into slices of
        # ``self.df`` (avoids pandas' SettingWithCopy behaviour).
        self.train_df = grouped.head(-test_size).copy()
        self.test_df = grouped.tail(test_size).copy()

    def scale_features(self):
        """Scale inputs and target using statistics from the training split only.

        The listed input columns (including the label-encoded ids) are
        standardised; ``Sales`` is min-max scaled.  Both fitted scalers are
        pickled under ``artifacts/LSTM``.
        """
        standard_scaler_cols = ['Company', 'Product', 'Region', 'DayOfWeek', 'Seconds',
                                'Week Sin', 'Week Cos', 'Month Sin', 'Month Cos', 'Year Sin',
                                'Year Cos']
        standard_scaler = StandardScaler()
        min_max_scaler = MinMaxScaler()

        self.train_df[standard_scaler_cols] = standard_scaler.fit_transform(self.train_df[standard_scaler_cols])
        self.test_df[standard_scaler_cols] = standard_scaler.transform(self.test_df[standard_scaler_cols])
        self.train_df[['Sales']] = min_max_scaler.fit_transform(self.train_df[['Sales']])
        # Only transform here: refitting on the test split would put test
        # targets on a different scale than the training targets and would
        # make the pickled scaler a test-fitted one.
        self.test_df[['Sales']] = min_max_scaler.transform(self.test_df[['Sales']])

        self._dump_artifact(standard_scaler, 'artifacts/LSTM/standard_scaler.pkl')
        self._dump_artifact(min_max_scaler, 'artifacts/LSTM/min_max_scaler.pkl')

    def create_sequences(self, df, past_window_size=120, forecast_window_size=30):
        """Build sliding windows per (Product, Region) series.

        For every start position, ``X`` receives ``past_window_size``
        consecutive rows of all columns except ``Sales`` and ``y`` receives
        the ``Sales`` values of the following ``forecast_window_size`` rows.
        Series shorter than one full window contribute nothing.

        Returns:
            ``(X, y)`` as float NumPy arrays; with at least one window their
            shapes are ``(n, past_window_size, n_features)`` and
            ``(n, forecast_window_size)``.
        """
        logger.info(f'creating sequences with past window : {past_window_size}, forecast_window : {forecast_window_size}')
        X = []
        y = []
        window = past_window_size + forecast_window_size
        # sort=False keeps groups in order of first appearance.
        for _, group in df.groupby(['Product', 'Region'], sort=False):
            # Select the target by name instead of assuming it is column 0.
            features = group.drop(columns=['Sales']).to_numpy(dtype=float)
            target = group['Sales'].to_numpy(dtype=float)
            # +1 so the window that ends exactly on the last row is included.
            for start in range(len(group) - window + 1):
                split = start + past_window_size
                X.append(features[start:split])
                y.append(target[split:start + window])
        return np.array(X, dtype=float), np.array(y, dtype=float)

    def get_dataloader(self, check_model):
        """Wrap the train/test windows in ``DataLoader`` objects.

        When ``check_model`` is truthy only the first 5000 training and 1000
        test windows are used.  Both loaders use batch size 32 and drop the
        last incomplete batch.
        """
        X_train, y_train = self.create_sequences(self.train_df)
        X_test, y_test = self.create_sequences(self.test_df)
        X_train, y_train = torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float()
        X_test, y_test = torch.from_numpy(X_test).float(), torch.from_numpy(y_test).float()

        if check_model:
            train_dataset = TensorDataset(X_train[:5000], y_train[:5000])
            test_dataset = TensorDataset(X_test[:1000], y_test[:1000])
        else:
            train_dataset = TensorDataset(X_train, y_train)
            test_dataset = TensorDataset(X_test, y_test)

        # Windows are generated series by series, so without shuffling each
        # training batch would contain a single product/region only.
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, drop_last=True,
                                  num_workers=4, pin_memory=True)
        test_loader = DataLoader(test_dataset, batch_size=32, drop_last=True, num_workers=4, pin_memory=True)
        return train_loader, test_loader

    @staticmethod
    def _dump_artifact(obj, path):
        """Pickle ``obj`` to ``path``, creating the parent directory if needed."""
        import os

        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'wb') as file:
            pickle.dump(obj, file)
在 create_sequences 函数中,我用过去 120 天的数据生成 X,用接下来 30 天的销售额生成 y
我使用的模型
from torch import nn
import torch
class LSTM_Model(nn.Module):
    """Stacked LSTM whose final hidden state feeds a two-layer dense head.

    Head layout: ``Linear(hidden, 2*hidden) -> BatchNorm1d -> ReLU ->
    Dropout -> Linear(2*hidden, output_size)``.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, dropout_prob=0.3):
        """Create the recurrent encoder and the dense head.

        ``dropout_prob`` is used both between stacked LSTM layers and inside
        the head.
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers

        head_width = hidden_size * 2
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout_prob,
        )
        self.linear1 = nn.Linear(hidden_size, head_width)
        self.batch_norm1 = nn.BatchNorm1d(head_width)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_prob)
        self.linear2 = nn.Linear(head_width, output_size)

    def forward(self, X):
        """Map a batch-first input sequence to ``output_size`` values per sample."""
        state_shape = (self.num_layers, X.size(0), self.hidden_size)
        # Zero initial hidden and cell states, created on the input's device.
        initial_state = (
            torch.zeros(state_shape, device=X.device),
            torch.zeros(state_shape, device=X.device),
        )
        _, (final_hidden, _) = self.lstm(X, initial_state)

        # Only the top layer's final hidden state is passed to the head.
        features = self.linear1(final_hidden[-1])
        features = self.batch_norm1(features)
        features = self.dropout(self.relu(features))
        return self.linear2(features)
模型训练
from tqdm import tqdm
from pathlib import Path
import sys
import torch
from torch import nn
from datetime import datetime
import os
parent_dir= Path(__file__).parent.parent.parent
sys.path.append(str(parent_dir))
from data_preprocess import DataPreprocess
from LSTM_model import LSTM_Model
from src.utils.logger import get_logger
logger= get_logger()
class LSTM:
    """Training and inference wrapper around ``LSTM_Model``."""

    def __init__(self, input_size, hidden_size, output_size, num_layers, drop_out=0.3) -> None:
        """Build the underlying ``LSTM_Model`` with the given sizes and dropout."""
        self.model = LSTM_Model(input_size, hidden_size, output_size, num_layers, drop_out)

    def train(self, epochs, train_loader, test_loader, load_model, save_every=10, model_path=None):
        """Train with MSE loss and Adam, evaluating on ``test_loader`` every epoch.

        Per-epoch losses are printed and appended to ``metrics.txt`` inside a
        timestamped directory under ``artifacts/LSTM``; a checkpoint is
        written there after every ``save_every``-th completed epoch.

        Args:
            epochs: number of passes over ``train_loader``.
            train_loader: iterable of ``(X, y)`` training batches.
            test_loader: iterable of ``(X, y)`` evaluation batches.
            load_model: when truthy, start from the weights in ``model_path``.
            save_every: checkpoint interval in epochs.
            model_path: checkpoint to load; required if ``load_model`` is truthy.

        Raises:
            ValueError: if ``load_model`` is truthy but ``model_path`` is None.
        """
        if load_model:
            if model_path is None:
                raise ValueError('model_path is required when load_model is True')
            self.model.load_state_dict(torch.load(model_path))

        self.loss_fn = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001, weight_decay=1e-5)

        formatted_time = datetime.now().strftime("%d-%m-%y_%H-%M")
        run_dir = f'artifacts/LSTM/{formatted_time}'
        # exist_ok: two runs started within the same minute share a directory
        # instead of the second one crashing.
        os.makedirs(run_dir, exist_ok=True)

        for epoch in range(epochs):
            avg_train_loss = self._run_epoch(train_loader, training=True)
            print(f'Epoch {epoch+1} --- train loss {avg_train_loss}')

            avg_test_loss = self._run_epoch(test_loader, training=False)
            # Leave the module in training mode between epochs and on return.
            self.model.train()
            print(f'Epoch {epoch+1} --- test loss {avg_test_loss}\n')

            with open(f'{run_dir}/metrics.txt', 'a') as file:
                file.write(f"Epoch {epoch+1} : \n\ttrain loss {avg_train_loss} \n\ttest loss {avg_test_loss}\n\n")

            # Count completed epochs, so save_every=10 saves after epochs
            # 10, 20, ... rather than after epochs 1, 11, ...
            if (epoch + 1) % save_every == 0:
                torch.save(self.model.state_dict(), f'{run_dir}/lstm_epoch_{epoch+1}.pth')

    def _run_epoch(self, loader, training):
        """Run one pass over ``loader`` and return the mean batch loss.

        With ``training`` true the weights are updated (gradient norm clipped
        to 1.0); otherwise the pass runs in eval mode without gradients.
        Returns NaN for a loader that yields no batches.
        """
        self.model.train(training)
        total_loss = 0.0
        with torch.set_grad_enabled(training):
            for X, y in tqdm(loader):
                if training:
                    self.optimizer.zero_grad()
                outputs = self.model(X)
                loss = self.loss_fn(outputs, y)
                total_loss += loss.item()
                if training:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                    self.optimizer.step()
        batches = len(loader)
        # drop_last on a small dataset can leave zero batches; avoid dividing by 0.
        return total_loss / batches if batches else float('nan')

    def predict(self, X, model_path):
        """Load weights from ``model_path`` and return flattened predictions.

        Args:
            X: input tensor for the model (the original note describes it as
                the past 120 days of records).
            model_path: checkpoint written by ``torch.save(state_dict)``.

        Returns:
            1-D NumPy array with the model output flattened.
        """
        self.model.load_state_dict(torch.load(model_path))
        self.model.eval()
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            preds = self.model(X)
        # .cpu() keeps this working when the model output is not on the CPU.
        return preds.cpu().numpy().flatten()
if __name__ == '__main__':
    # Script entry point: preprocess the CSV, then train the LSTM for 10
    # epochs, writing a checkpoint after every epoch.
    data_preprocess = DataPreprocess(data_path='data/inputs/Corrected_and_Balanced_Time_Series_Data.csv')
    logger.info('Started Data Preprocessing')
    train_loader, test_loader = data_preprocess.preprocess(check_model=False)
    logger.info('Done Data Preprocessing')

    # Take the layer sizes from one training sample instead of hard-coding
    # them: each input window carries every non-target column, so an
    # input_size of 1 would be rejected by nn.LSTM.
    sample_X, sample_y = train_loader.dataset[0]
    input_size = sample_X.shape[-1]
    output_size = sample_y.shape[-1]

    lstm = LSTM(input_size, 256, output_size, 2, 0.5)
    lstm.train(epochs=10, train_loader=train_loader, test_loader=test_loader, load_model=False, save_every=1)
结果
Epoch 1 :
train loss 0.004617819177448817
test loss 0.006365457516637564
Epoch 2 :
train loss 0.0013970815729702321
test loss 0.010996414257221789
Epoch 3 :
train loss 0.0023961294302583384
test loss 0.012776140605897776
请指点我下一步该朝哪个方向改进,或者建议更好的方法。