我正在尝试使用 Optuna 调整我的 PyTorch 模型的超参数,但每次运行 study.optimize 时都会出现以下错误。
[W 2024-02-05 17:19:26,007] Trial 2 failed with parameters: {'hidden_state': 64, 'droup_out_prec': 0.18615371906093597, 'num_epochs': 14, 'encoder_lr': 0.021112576066074633, 'decoder_lr': 0.0006833950215216012, 'learning_rate': 1.9257784640609453e-05, 'control_factor_ce': 0.03524950489764759, 'control_factor_kl': 0.13410725114961825, 'batch_size': 256} because of the following error: RuntimeError('one of the variables needed for gradient computation has been modified by an inplace operation: [torch.LongTensor []] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!').
Traceback (most recent call last):
File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
value_or_values = func(trial)
File "/root/.ipykernel/38222/command-4071806974828746-3917727534", line 149, in __call__
output = train_model(model = model_to_train,
File "/Workspace/IMDNA/PWSA0000375_IM_US_FORECASTING/Challenger Model/HCP Forecasting/Kisqali/utils/train_model.py", line 386, in train_model
raise e
File "/Workspace/IMDNA/PWSA0000375_IM_US_FORECASTING/Challenger Model/HCP Forecasting/Kisqali/utils/train_model.py", line 132, in train_model
loss.backward()
File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/torch/_tensor.py", line 522, in backward
torch.autograd.backward(
File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/torch/autograd/__init__.py", line 266, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.LongTensor []] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
[W 2024-02-05 17:19:26,018] Trial 2 failed with value None.
backword success: 0
Traceback (most recent call last):
File "/Workspace/IMDNA/PWSA0000375_IM_US_FORECASTING/Challenger Model/HCP Forecasting/Kisqali/utils/train_model.py", line 132, in train_model
loss.backward()
File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/torch/_tensor.py", line 522, in backward
torch.autograd.backward(
File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/torch/autograd/__init__.py", line 266, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.LongTensor []] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
模型架构是
class EncoderLSTM(nn.Module):
    """LSTM encoder: runs the input sequence through an LSTM and projects
    every timestep's output with a linear layer.

    Args:
        feature_num: number of input features per timestep.
        hidden_size_lstm: LSTM hidden size (also the projected output size).
        num_layers_lstm: number of stacked LSTM layers.
        bias: whether the LSTM uses bias weights.
        has_channel: if True, forward() expects a 4-D input with a singleton
            channel dim (B, 1, T, F) and collapses it to (B, T, F).
        bidirectional: run the LSTM bidirectionally.
    """
    def __init__(self, feature_num, hidden_size_lstm, num_layers_lstm, bias,
                 has_channel, bidirectional=False):
        super().__init__()
        self.hidden_size_lstm = hidden_size_lstm
        self.num_layers_lstm = num_layers_lstm
        self.feature_num = feature_num
        self.bias = bias
        self.has_channel = has_channel
        self.bidirectional = bidirectional
        # BUG FIX: the original hard-coded bidirectional=False here, silently
        # ignoring the constructor argument. Default behavior is unchanged.
        self.lstm1 = nn.LSTM(input_size=self.feature_num,
                             hidden_size=self.hidden_size_lstm,
                             num_layers=self.num_layers_lstm,
                             batch_first=True,
                             bidirectional=self.bidirectional,
                             bias=self.bias)
        # A bidirectional LSTM emits 2*hidden features per timestep.
        num_directions = 2 if self.bidirectional else 1
        self.fc_encoder = nn.Linear(self.hidden_size_lstm * num_directions,
                                    self.hidden_size_lstm)
        self.fc_encoder.time_distributed = True

    def forward(self, x):
        """Return (out, (hn, cn)): per-timestep projections (B, T, H) and the
        final LSTM states."""
        if self.has_channel:
            # Collapse the singleton channel dim: (B, 1, T, F) -> (B, T, F).
            x = x.view(x.size(0), x.size(2), x.size(3))
        num_directions = 2 if self.bidirectional else 1
        # Zero-initialized hidden/cell states on the input's device.
        h0 = torch.zeros(self.num_layers_lstm * num_directions, x.size(0),
                         self.hidden_size_lstm, device=x.device)
        c0 = torch.zeros(self.num_layers_lstm * num_directions, x.size(0),
                         self.hidden_size_lstm, device=x.device)
        out, (hn, cn) = self.lstm1(x, (h0, c0))
        out = self.fc_encoder(out)
        return out, (hn, cn)
class DecoderLSTM(nn.Module):
    """LSTM decoder: unrolls for MAX_TIMESTEP steps, emitting one regression
    output per step from the final hidden state of each step.

    Args:
        feature_num: stored but unused by this module (kept for interface
            compatibility).
        hidden_size_lstm: LSTM hidden size; also the expected input feature
            size of `out` in forward().
        num_layers_lstm: number of stacked LSTM layers.
            NOTE(review): the squeeze in forward() assumes this is 1 — with
            more layers hn keeps shape (L, B, H) and the concat on dim=1
            changes meaning. Confirm against callers.
        bias: whether the LSTM uses bias weights.
        output_size: regression output width per decoded step.
        droup_out_prec: dropout probability between the two FC layers
            (name kept, including the typo, for interface compatibility).
    """
    def __init__(self, feature_num, hidden_size_lstm, num_layers_lstm, bias,
                 output_size, droup_out_prec=0.2):
        super().__init__()
        self.hidden_size_lstm = hidden_size_lstm
        self.num_layers_lstm = num_layers_lstm
        self.feature_num = feature_num
        self.bias = bias
        self.output_size = output_size
        self.droup_out_prec = droup_out_prec
        self.decoder_net = nn.LSTM(input_size=self.hidden_size_lstm,
                                   hidden_size=self.hidden_size_lstm,
                                   num_layers=self.num_layers_lstm,
                                   batch_first=True,
                                   bidirectional=False,
                                   bias=self.bias)
        self.fc_decoder_1 = nn.Linear(self.hidden_size_lstm, self.hidden_size_lstm // 2)
        self.fc_decoder_1.time_distributed = True
        self.fc_decoder_2 = nn.Linear(self.hidden_size_lstm // 2, self.output_size)
        self.fc_decoder_2.time_distributed = True
        self.relu_decoder_1 = nn.ReLU(inplace=False)
        self.dropout_decoder_1 = nn.Dropout(self.droup_out_prec)
        # NOTE(review): relu_decoder_2 is defined but never applied in
        # forward(); kept so loaded state dicts / attribute access still work.
        self.relu_decoder_2 = nn.ReLU(inplace=False)

    def forward(self, out, hn, cn, MAX_TIMESTEP=4, target_tensor=None,
                return_state=False):
        """Decode MAX_TIMESTEP steps; returns (B, MAX_TIMESTEP * output_size),
        plus the final (hn, cn) when return_state is True."""
        out_decoder_list = []
        for _step in range(MAX_TIMESTEP):
            out, (hn, cn) = self.decoder_net(out, (hn, cn))
            # clone() so the regression head works on a copy rather than a
            # view into the LSTM's hidden state (avoids autograd in-place
            # version conflicts).
            out_reg = torch.squeeze(hn, 0).clone()
            out_reg = self.fc_decoder_1(out_reg)
            out_reg = self.relu_decoder_1(out_reg)
            out_reg = self.dropout_decoder_1(out_reg)
            out_reg = self.fc_decoder_2(out_reg)
            out_decoder_list.append(out_reg)
            # BUG FIX: the original did `output = target_tensor[time]` here —
            # a dead store (`output` was never read), so teacher forcing was
            # silently a no-op. The dead assignment is removed; target_tensor
            # stays in the signature for interface compatibility.
            # TODO(review): if teacher forcing was intended, the target must
            # be projected to hidden size and fed as the next step's input.
        out_decoder_list = torch.cat(out_decoder_list, dim=1)
        if return_state:
            return out_decoder_list, (hn, cn)
        return out_decoder_list
class seq2seqModel_indipendent(nn.Module):
    """Wrapper chaining an encoder and a decoder trained with independent
    optimizers.

    BUG FIX: the original overrode __call__ directly, which bypasses
    nn.Module.__call__ (forward/backward hooks and the standard dispatch
    path). Defining forward() instead keeps `model(x)` working identically
    while restoring normal nn.Module behavior.
    """
    def __init__(self, encoder=None, decoder=None, training=True):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        # Mirrors nn.Module.training; forward() pushes it to the submodules.
        self.training = training

    def forward(self, x, MAX_TIMESTEP=4, return_state=False):
        """Encode x, then decode MAX_TIMESTEP steps; returns the decoder
        output (and final state when return_state is True)."""
        # Propagate the wrapper's mode to the submodules on every call,
        # matching the original behavior.
        if self.training:
            self.encoder.train()
            self.decoder.train()
        else:
            self.encoder.eval()
            self.decoder.eval()
        out, (hn, cn) = self.encoder(x)
        out = self.decoder(out=out, hn=hn, cn=cn,
                           MAX_TIMESTEP=MAX_TIMESTEP,
                           return_state=return_state)
        return out

    def train(self, mode=True):
        # Standard nn.Module signature (mode defaults to True); callers using
        # the old zero-arg form are unaffected.
        self.training = mode
        return self

    def eval(self):
        return self.train(False)

    def state_dict(self):
        # NOTE(review): returns an (encoder_sd, decoder_sd) tuple, which
        # breaks the nn.Module contract (a single dict). Kept as-is for
        # compatibility with existing checkpoint-saving code.
        return (self.encoder.state_dict(), self.decoder.state_dict())
训练循环如下:
# Epoch training loop: forward/backward per batch, stepping each optimizer in
# the `optimizer` list, then stepping the (per-optimizer) schedulers once per
# epoch. Relies on names defined by the enclosing function (model, device,
# dataloader_train, criterion_train, optimizer, scheduler, grad_clip, verbos,
# loss_list_train, loss_list_train_epoch, dataset_train, batch_size, train).
for epoch in range(num_epochs):
    count = 0
    if verbos == 3 or verbos == 2:
        print('*' * 100)
        print(f'Running epoch: {epoch}')
    if train:
        if not model.training:
            model.train()
        temp_train = []
        for i, (inputs, labels_reg) in enumerate(dataloader_train):
            inputs = inputs.to(device)
            labels_reg = labels_reg.to(device).reshape(-1, 4)
            # Clear stale gradients BEFORE the forward/backward pass (the
            # original zeroed after step(), which is equivalent but harder
            # to reason about when debugging autograd errors).
            for opt in optimizer:
                opt.zero_grad()
            out_reg = model(inputs)
            loss = criterion_train(out_reg, labels_reg)
            loss.backward()
            # Debug trace: fires once i outruns count (first backward stall).
            if (i - count) > 0:
                print('iter: ', i)
                print('epoch:', epoch)
                count += 1
            if grad_clip is not None:
                torch.nn.utils.clip_grad_value_(model.parameters(), grad_clip)
            for opt in optimizer:
                opt.step()
            loss_list_train.append(loss.item())
            temp_train.append(loss.item())
            if verbos == 3:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                      .format(epoch, num_epochs, i,
                              len(dataset_train) // batch_size, loss.item()))
        loss_list_train_epoch.append(sum(temp_train) / len(temp_train))
    # BUG FIX: scheduler.step() was gated on `verbos == 3`, so learning-rate
    # schedules silently never ran unless verbose logging was on. Stepping is
    # now unconditional; only the printout depends on verbosity.
    if scheduler is not None:
        for schedule, opt in zip(scheduler, optimizer):
            before_lr = opt.param_groups[0]["lr"]
            schedule.step()
            after_lr = opt.param_groups[0]["lr"]
            if verbos == 3:
                print("Epoch %d: lr %.6f -> %.6f" % (epoch, before_lr, after_lr))
该错误表明:反向传播所需的某个张量在某处被就地(in-place)修改了。当您调用
train_model()
时,错误出现在 loss.backward()
处。请检查 loss
的计算过程,避免使用就地操作(例如 x += y,或以下划线结尾的张量方法如 add_())——根据错误消息中的提示,这通常可以解决问题。
如果能看到更多代码会更有帮助,因为出错的部分似乎没有包含在您的原始帖子中。