How do I save the model every 20 epochs and load it again to continue training?


I want to train this model on Colab for 1000 epochs, but that takes a very long time.

Here is the code I wrote in Python:

It trains the model for 1000 epochs.

I want to save the model every 20 epochs (for example), then load it again and continue from the last saved epoch.

Example: train from epoch 1 to 20, then save the model... then load the model and continue from 20 to 40, and so on.

import argparse
import numpy as np
import pandas as pd
import sys, os
from random import shuffle
import torch
import torch.nn as nn
from models.gcn import GCNNet
# utils is assumed to provide TestbedDataset, DataLoader and the metric
# helpers (rmse, mse, pearson, spearman, ci, get_rm2) used below
from utils import *

# training function at each epoch
def train(model, device, train_loader, optimizer, epoch,hidden,cell):
    print('Training on {} samples...'.format(len(train_loader.dataset)))
    model.train()
    for batch_idx, data in enumerate(train_loader):
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data,hidden,cell)
        loss = loss_fn(output, data.y.view(-1, 1).float().to(device))
        loss.backward()
        optimizer.step()
        if batch_idx % LOG_INTERVAL == 0:
            print('Train epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch,
                                                                           batch_idx * len(data.x),
                                                                           len(train_loader.dataset),
                                                                           100. * batch_idx / len(train_loader),
                                                                           loss.item()))

def predicting(model, device, loader,hidden,cell):
    model.eval()
    total_preds = torch.Tensor()
    total_labels = torch.Tensor()
    print('Make prediction for {} samples...'.format(len(loader.dataset)))
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            output = model(data,hidden,cell)
            total_preds = torch.cat((total_preds, output.cpu()), 0)
            total_labels = torch.cat((total_labels, data.y.view(-1, 1).cpu()), 0)
    return total_labels.numpy().flatten(),total_preds.numpy().flatten()


loss_fn = nn.MSELoss()
LOG_INTERVAL = 20

def main(args):
  dataset = args.dataset
  modeling = [GCNNet]
  model_st = modeling[0].__name__

  cuda_name = "cuda:0"
  print('cuda_name:', cuda_name)

  TRAIN_BATCH_SIZE = args.batch_size
  TEST_BATCH_SIZE = args.batch_size
  LR = args.lr
  
  NUM_EPOCHS = args.epoch

  print('Learning rate: ', LR)
  print('Epochs: ', NUM_EPOCHS)

  # Main program: iterate over different datasets
  print('\nrunning on ', model_st + '_' + dataset )
  processed_data_file_train = 'data/processed/' + dataset + '_train.pt'
  processed_data_file_test = 'data/processed/' + dataset + '_test.pt'
  if ((not os.path.isfile(processed_data_file_train)) or (not os.path.isfile(processed_data_file_test))):
     print('please run create_data.py to prepare data in pytorch format!')
  else:
    train_data = TestbedDataset(root='data', dataset=dataset+'_train')
    test_data = TestbedDataset(root='data', dataset=dataset+'_test')
        
    # make data PyTorch mini-batch processing ready
    train_loader = DataLoader(train_data, batch_size=TRAIN_BATCH_SIZE, shuffle=True,drop_last=True)
    test_loader = DataLoader(test_data, batch_size=TEST_BATCH_SIZE, shuffle=False,drop_last=True)

    # training the model
    device = torch.device(cuda_name if torch.cuda.is_available() else "cpu")
    model = modeling[0](k1=1,k2=2,k3=3,embed_dim=128,num_layer=1,device=device).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    best_mse = 1000
    best_ci = 0
    best_epoch = -1
    #model_file_name = 'model' + model_st + '_' + dataset +  '.model'
    result_file_name = 'result' + model_st + '_' + dataset +  '.csv'


    ## Train for NUM_EPOCHS epochs
    for epoch in range(NUM_EPOCHS):
      hidden,cell = model.init_hidden(batch_size=TRAIN_BATCH_SIZE)
      train(model, device, train_loader, optimizer, epoch+1,hidden,cell)
      G,P = predicting(model, device, test_loader,hidden,cell)
      ret = [rmse(G,P),mse(G,P),pearson(G,P),spearman(G,P),ci(G,P),get_rm2(G.reshape(G.shape[0],-1),P.reshape(P.shape[0],-1))]
      if ret[1]<best_mse:
        if args.save_file:
          model_file_name = args.save_file + '.model'
          torch.save(model.state_dict(), model_file_name)
        
        
        with open(result_file_name,'w') as f:
          f.write('rmse,mse,pearson,spearman,ci,rm2\n')
          f.write(','.join(map(str,ret)))
        best_epoch = epoch+1
        best_mse = ret[1]
        best_ci = ret[-2]
        print('rmse improved at epoch ', best_epoch, '; best_mse,best_ci:', best_mse,best_ci,model_st,dataset)
      else:
        print(ret[1],'No improvement since epoch ', best_epoch, '; best_mse,best_ci:', best_mse,best_ci,model_st,dataset)

if __name__ == "__main__":
  parser = argparse.ArgumentParser(description="Run DeepGLSTM")

  parser.add_argument("--dataset",type=str,default='davis',
                      help="Dataset Name (davis,kiba,DTC,Metz,ToxCast,Stitch)")

  parser.add_argument("--epoch",
                      type = int,
                      default = 1000,
                      help="Number of training epochs. Default is 1000."
                      ) 
  
  parser.add_argument("--lr",
                      type=float,
                      default = 0.0005,
                      help="learning rate",
                      )
  
  parser.add_argument("--batch_size",type=int,
                      default = 128,
                      help = "Number of drug-tareget per batch. Default is 128 for davis.") # batch 128 for Davis
  
  parser.add_argument("--save_file",type=str,
                      default=None,
                      help="Where to save the trained model. For example davis.model")


  args = parser.parse_args()
  print(args)
  main(args)

What do I need to change in my code to do this?

Tags: python, save, load, torch
1 Answer

Here is how I currently train the model:

def train_model(model, train_loader, criterion, optimizer, start_epoch, end_epoch, save_path):
    model.train()
    for epoch in range(start_epoch, end_epoch):
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        print(f'Epoch [{epoch + 1}/{end_epoch}], Loss: {running_loss / len(train_loader)}')

        # Save the model every 20 epochs
        if (epoch + 1) % 20 == 0:
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': running_loss,
            }, f'{save_path}_epoch_{epoch + 1}.pth')
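
For example, you could call it like this (a minimal sketch with toy stand-ins; the linear model and random tensors below are placeholders, not your GCNNet setup):

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Toy stand-ins so the snippet runs on its own; swap in your real model,
# loss, optimizer and DataLoader
model = nn.Linear(10, 1)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
train_loader = DataLoader(TensorDataset(torch.randn(64, 10), torch.randn(64, 1)),
                          batch_size=16)

# Trains epochs 0..39 and writes model_epoch_20.pth and model_epoch_40.pth
train_model(model, train_loader, criterion, optimizer,
            start_epoch=0, end_epoch=40, save_path='model')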

You can then load the checkpoint and continue training:

import os

# checkpoint_path and num_epochs are placeholders you set yourself
start_epoch = 0

# If a checkpoint exists, load it and resume from the saved epoch
if checkpoint_path and os.path.isfile(checkpoint_path):
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch']
    print(f'Resuming training from epoch {start_epoch}')

# Train the model in chunks of 20 epochs
for i in range(start_epoch, num_epochs, 20):
    train_model(model, train_loader, criterion, optimizer, i, min(i + 20, num_epochs), save_path)
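
Applied to your script, the same idea fits directly into the epoch loop in main(). Below is a rough sketch, not a drop-in patch: the --load_file argument and the checkpoint filename are assumptions, and hidden/cell need no saving because your loop re-initializes them each epoch:

# Sketch: resumable version of the epoch loop in main().
# Assumes a hypothetical --load_file argument that points at a checkpoint.
start_epoch = 0
if args.load_file and os.path.isfile(args.load_file):
    checkpoint = torch.load(args.load_file, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch']
    best_mse = checkpoint.get('best_mse', best_mse)
    best_epoch = checkpoint.get('best_epoch', best_epoch)
    print('Resuming from epoch', start_epoch)

for epoch in range(start_epoch, NUM_EPOCHS):
    hidden, cell = model.init_hidden(batch_size=TRAIN_BATCH_SIZE)
    train(model, device, train_loader, optimizer, epoch + 1, hidden, cell)
    # ... keep your existing evaluation / best-model bookkeeping here ...
    if (epoch + 1) % 20 == 0:  # periodic checkpoint every 20 epochs
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'best_mse': best_mse,
            'best_epoch': best_epoch,
        }, f'checkpoint_epoch_{epoch + 1}.pth')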