PyTorch 多处理在 Ubuntu 上运行不正常

问题描述 投票:0回答:0

下面是在我的机器(Windows 笔记本电脑)上本地正确运行的代码。然而,代码在我的 Ubuntu 终端上停止运行(使用它来测试在 Ubuntu 上运行的另一个更强大的服务器)。它似乎只是在某个时候停止运行。 main 函数初始化 worker 并使用 Pytorch.multiprocessing 包并行运行它们。我还尝试与 1 个演员一起运行下面的代码,但它仍然停止了。我没有附上所有的背景代码,因为有很多并且希望有更多经验的人会了解多处理或遇到类似问题。

import math

import torch
import torch.optim as optim
import os
from pokerenv import poker_env
from actor_critic import *
import torch.multiprocessing as mp
from shared_adam import SharedAdam
from transformer import *

'inspired from https://github.com/MorvanZhou/pytorch-A3C/blob/master/discrete_A3C.py'


def train(global_actor_critic, actor_critic, optimizer, hand_count, N_GAMES):
    while hand_count.value < N_GAMES:
        print('getting loss')
        loss = actor_critic.play_hand()
        print('loss done')
        optimizer.zero_grad() # zero gradient on the master copy
        loss.backward(retain_graph=True)
        for local_param, global_param in zip(
                actor_critic.agent.model.parameters(),
                global_actor_critic.parameters()):
            global_param._grad = local_param.grad
        optimizer.step()
        actor_critic.agent.model.parameters = global_actor_critic.parameters
        actor_critic.clear_memory()
        with hand_count.get_lock():
            hand_count.value += 1
        print(f'hand_count {hand_count} loss {loss}')


if __name__ == '__main__':
    torch.manual_seed(0)
    N_GAMES = 100
    actor_count = 1
    # actor parameters
    max_sequence = 100
    n_players = 6
    gamma = .98
    n_actions = 14
    # model parameters
    model_dim = 64
    mlp_dim = 128
    attn_heads = 4
    sequence_length = 100
    enc_layers = 3
    dec_layers = 4
    action_dim = 14
    learning_rate = .0001
    player_params = [model_dim, mlp_dim, attn_heads, enc_layers, dec_layers, sequence_length, n_players, learning_rate, action_dim]
    model_params = [model_dim, mlp_dim, attn_heads, sequence_length, enc_layers, dec_layers, action_dim]
    # create poker environment
    # initialize global model
    global_model = RLformer(* model_params)
    global_model.share_memory()
    print('Parameter_count: ', sum(p.numel() for p in global_model.parameters() if p.requires_grad))
    optimizer = SharedAdam(global_model.parameters(), lr=learning_rate)
    hand_count = mp.Value('i', 0)
    # initialize players
    actor_critics = []
    for i in range(actor_count):
        actor_critics.append(actor_critic(
            model_dim=player_params[0],
            mlp_dim=player_params[1],
            heads=player_params[2],
            enc_layers=player_params[3],
            dec_layers=player_params[4],
            max_sequence=player_params[5],
            n_players=player_params[6],
            gamma=player_params[7],
            n_actions=player_params[8]
        ))
    players = []
    for i in range(actor_count):
        player = mp.Process(target=train, args=(global_model, actor_critics[i], optimizer, hand_count, N_GAMES))
        player.start()
        players.append(player)
    for player in players:
        player.join()

类似行为的另一个例子如下:

import torch.multiprocessing as mp
import torch
from encoder import encoder_layer
from actor_critic import actor_critic

class Player (mp.Process):
    def __init__(self, name, global_count):
        super(Player, self).__init__()
        self.player_name = name
        self.global_count = global_count
        self.LN = torch.nn.LayerNorm(64)
        self.encoder = encoder_layer(64, 128, 4)

        self.worker = actor_critic(
            model_dim=64,
            mlp_dim=128,
            heads=4,
            enc_layers=2,
            dec_layers=2,
            max_sequence=100,
            n_players=6,
            gamma=0.99,
            n_actions=14
        )

        print('created actor critic')


    def run(self):
        while True:
            print(f'{self.name} prints {self.global_count.value}')
            test = torch.rand([2, 64])
            print('test created')
            y = self.LN(test)
            print('y created')
            x = self.encoder(test)
            print('x created')
            with self.global_count.get_lock():
                self.global_count.value += 1



if __name__ == '__main__':
    print('starting')
    global_ep = mp.Value('i', 0)
    workers = []
    for i in range(10):
        player = Player(i, global_ep)
        workers.append(player)
    [player.start() for player in workers]
    [player.join() for player in workers]

这是上面代码的输出,注意 torch.rand([2, 64]) 甚至没有运行(它从不打印“test created”。)

我们可以看到它打印了“player-1 prints 0”,但从不执行下一行,也从不打印“test created”。经过一些额外的测试,似乎在初始化 ActorCritic 之后,任何 pytorch 功能都停止工作。我在没有多处理的情况下运行了准确的代码,并且只运行了一个演员并且代码有效。重申一下,该代码在我的笔记本电脑上的 Pycharm 中使用和不使用多处理时都可以工作,但在 Ubuntu 上停止使用多处理。任何帮助,将不胜感激!提前致谢。

python ubuntu deep-learning pytorch multiprocessing
最新问题
© www.soinside.com 2019 - 2024. All rights reserved.