PyTorch model runs fine on its own, but throws a RuntimeError when run with Optuna

Problem description

I am trying to use Optuna to tune the hyperparameters of my PyTorch model, but the following error is raised every time I run the optimization.

[W 2024-02-05 17:19:26,007] Trial 2 failed with parameters: {'hidden_state': 64, 'droup_out_prec': 0.18615371906093597, 'num_epochs': 14, 'encoder_lr': 0.021112576066074633, 'decoder_lr': 0.0006833950215216012, 'learning_rate': 1.9257784640609453e-05, 'control_factor_ce': 0.03524950489764759, 'control_factor_kl': 0.13410725114961825, 'batch_size': 256} because of the following error: RuntimeError('one of the variables needed for gradient computation has been modified by an inplace operation: [torch.LongTensor []] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!').
Traceback (most recent call last):
  File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/root/.ipykernel/38222/command-4071806974828746-3917727534", line 149, in __call__
    output = train_model(model = model_to_train,
  File "/Workspace/IMDNA/PWSA0000375_IM_US_FORECASTING/Challenger Model/HCP Forecasting/Kisqali/utils/train_model.py", line 386, in train_model
    raise e
  File "/Workspace/IMDNA/PWSA0000375_IM_US_FORECASTING/Challenger Model/HCP Forecasting/Kisqali/utils/train_model.py", line 132, in train_model
    loss.backward()
  File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/torch/_tensor.py", line 522, in backward
    torch.autograd.backward(
  File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/torch/autograd/__init__.py", line 266, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.LongTensor []] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
[W 2024-02-05 17:19:26,018] Trial 2 failed with value None.
backword success:  0
Traceback (most recent call last):
  File "/Workspace/IMDNA/PWSA0000375_IM_US_FORECASTING/Challenger Model/HCP Forecasting/Kisqali/utils/train_model.py", line 132, in train_model
    loss.backward()
  File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/torch/_tensor.py", line 522, in backward
    torch.autograd.backward(
  File "/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/torch/autograd/__init__.py", line 266, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.LongTensor []] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

    

The model architecture is:

    import torch
    import torch.nn as nn

    class EncoderLSTM(nn.Module):
        def __init__(self, feature_num, hidden_size_lstm, num_layers_lstm, bias, has_channel, bidirectional=False):
            super(EncoderLSTM, self).__init__()
            self.hidden_size_lstm = hidden_size_lstm
            self.num_layers_lstm = num_layers_lstm
            self.feature_num = feature_num
            self.bias = bias
            self.has_channel = has_channel
            self.bidirectional = bidirectional

            self.lstm1 = nn.LSTM(input_size=self.feature_num, hidden_size=self.hidden_size_lstm,
                                 num_layers=self.num_layers_lstm, batch_first=True,
                                 bidirectional=False, bias=self.bias)

            # Projection applied to every timestep of the LSTM output
            self.fc_encoder = nn.Linear(self.hidden_size_lstm, self.hidden_size_lstm)
            self.fc_encoder.time_distributed = True

        def forward(self, x):
            # Drop the channel dimension if the input arrives as (batch, 1, seq, features)
            if self.has_channel:
                x = x.view(x.size(0), x.size(2), x.size(3))

            h0 = torch.zeros(self.num_layers_lstm, x.size(0), self.hidden_size_lstm).to(device=x.device)  # Hidden state
            c0 = torch.zeros(self.num_layers_lstm, x.size(0), self.hidden_size_lstm).to(device=x.device)  # Cell state

            out, (hn, cn) = self.lstm1(x, (h0, c0))
            out = self.fc_encoder(out)
            return out, (hn, cn)

    class DecoderLSTM(nn.Module):
        def __init__(self, feature_num, hidden_size_lstm, num_layers_lstm, bias, output_size, droup_out_prec=0.2):
            super(DecoderLSTM, self).__init__()

            self.hidden_size_lstm = hidden_size_lstm
            self.num_layers_lstm = num_layers_lstm
            self.feature_num = feature_num
            self.bias = bias
            self.output_size = output_size
            self.droup_out_prec = droup_out_prec

            self.decoder_net = nn.LSTM(input_size=self.hidden_size_lstm, hidden_size=self.hidden_size_lstm,
                                       num_layers=self.num_layers_lstm, batch_first=True,
                                       bidirectional=False, bias=self.bias)

            self.fc_decoder_1 = nn.Linear(self.hidden_size_lstm, int(self.hidden_size_lstm / 2))
            self.fc_decoder_1.time_distributed = True

            self.fc_decoder_2 = nn.Linear(int(self.hidden_size_lstm / 2), self.output_size)
            self.fc_decoder_2.time_distributed = True

            self.relu_decoder_1 = nn.ReLU(inplace=False)
            self.dropout_decoder_1 = nn.Dropout(self.droup_out_prec)
            self.relu_decoder_2 = nn.ReLU(inplace=False)

        def forward(self, out, hn, cn, MAX_TIMESTEP=4, target_tensor=None, return_state=False):
            out_decoder_list = []
            for time in range(MAX_TIMESTEP):
                # Decode one step, feeding the previous output and state back in
                out, (hn, cn) = self.decoder_net(out, (hn, cn))

                # Project the last hidden state to this step's regression output
                out_reg = torch.squeeze(hn, 0).clone()
                out_reg = self.fc_decoder_1(out_reg)
                out_reg = self.relu_decoder_1(out_reg)
                out_reg = self.dropout_decoder_1(out_reg)
                out_reg = self.fc_decoder_2(out_reg)

                out_decoder_list.append(out_reg)

                if target_tensor is not None:
                    # Note: `output` is assigned here but never used; the decoder keeps feeding `out` back in
                    output = target_tensor[time]

            out_decoder_list = torch.cat(out_decoder_list, dim=1)
            if return_state:
                return out_decoder_list, (hn, cn)
            else:
                return out_decoder_list


    class seq2seqModel_indipendent(nn.Module):
        def __init__(self, encoder=None, decoder=None, training=True):
            super(seq2seqModel_indipendent, self).__init__()
            self.encoder = encoder
            self.decoder = decoder
            self.training = training

        # Note: __call__, train, eval and state_dict below override the standard nn.Module versions
        def __call__(self, x, MAX_TIMESTEP=4, return_state=False):
            if self.training:
                self.encoder.train()
                self.decoder.train()
            else:
                self.encoder.eval()
                self.decoder.eval()
            out, (hn, cn) = self.encoder(x)
            out = self.decoder(out=out, hn=hn, cn=cn, MAX_TIMESTEP=MAX_TIMESTEP, return_state=return_state)
            return out

        def train(self):
            self.training = True

        def eval(self):
            self.training = False

        def state_dict(self):
            return (self.encoder.state_dict(), self.decoder.state_dict())

The training loop looks like this:

    for epoch in range(num_epochs):
        count = 0
        # print('epoch:', epoch)
        if verbos == 3 or verbos == 2:
            print('*' * 100)
            print(f'Running epoch: {epoch}')
        if train:
            if not model.training:
                model.train()
            temp_train = []
            for i, (inputs, labels_reg) in enumerate(dataloader_train):
                inputs = inputs.to(device)
                labels_reg = labels_reg.to(device).reshape(-1, 4)

                # optimizer.zero_grad()
                out_reg = model(inputs)

                loss = criterion_train(out_reg, labels_reg)
                # loss.backward(retain_graph=True)
                # print('iter:', i)

                loss.backward()

                if (i - count) > 0:
                    print('iter: ', i)
                    print('epoch:', epoch)
                count += 1

                if grad_clip is not None:
                    torch.nn.utils.clip_grad_value_(model.parameters(), grad_clip)

                # optimizer.step()
                for opt in optimizer:
                    opt.step()

                for opt in optimizer:
                    opt.zero_grad()

                # for opt in optimizer:
                #   opt.zero_grad()
                loss_list_train.append(loss.item())
                temp_train.append(loss.item())

                if verbos == 3:
                    print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                          .format(epoch, num_epochs, i, len(dataset_train) // batch_size, loss.item()))

            loss_list_train_epoch.append(sum(temp_train) / len(temp_train))
            if scheduler is not None and verbos == 3:
                for schedule, opt in zip(scheduler, optimizer):
                    before_lr = opt.param_groups[0]["lr"]
                    schedule.step()
                    after_lr = opt.param_groups[0]["lr"]
                    print("Epoch %d: lr %.6f -> %.6f" % (epoch, before_lr, after_lr))
python deep-learning pytorch optuna
1 Answer

The error indicates that a tensor is being modified in place somewhere it shouldn't be. When you call train_model(), the failure surfaces inside loss.backward(). Check how you compute loss and avoid in-place operations; going by the information in the message, that should resolve the problem.
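
To illustrate what the message means, here is a minimal self-contained sketch (toy tensors, not code from your question) of an in-place update that breaks autograd versus its out-of-place equivalent:

    import torch

    x = torch.randn(4, requires_grad=True)
    y = torch.exp(x)     # exp saves its output y for the backward pass

    # An in-place update here would mutate y's storage, bump its version
    # counter, and make the backward call below fail with the same
    # "modified by an inplace operation" RuntimeError seen in the traceback:
    #   y.add_(1)        # or equivalently: y += 1

    # Out-of-place equivalent: builds a new tensor and leaves the saved y intact.
    z = y + 1

    z.sum().backward()   # fine, because nothing saved for backward was mutated
    print(x.grad)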

It would help to see more code, since the part that actually fails does not seem to be included in your original post.
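
In the meantime, one way to narrow it down on your side is PyTorch's anomaly detection. This is a minimal sketch that reuses the model, criterion_train, inputs and labels_reg names from your training loop; wrap one training step in it while debugging:

    import torch

    # Anomaly detection is slow, so enable it only while hunting the bug: the
    # RuntimeError raised by backward() will then carry a second traceback
    # pointing at the forward-pass operation whose result was modified in place.
    with torch.autograd.set_detect_anomaly(True):
        out_reg = model(inputs)
        loss = criterion_train(out_reg, labels_reg)
        loss.backward()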
