import os
import torch
import torch.distributed as dist
def init_distributed(
    master_addr="10.12.27.241",
    master_port="29500",
    world_size=2,
    backend="gloo",
):
    """Initialize the default ``torch.distributed`` process group.

    The rendezvous endpoint is exported through the ``MASTER_ADDR`` and
    ``MASTER_PORT`` environment variables (overwriting any existing values),
    which is what the default ``env://`` init method of
    ``dist.init_process_group`` reads. This process's rank is taken from the
    ``RANK`` environment variable and defaults to 0 when it is unset.

    Args:
        master_addr: Address of the rank-0 process. It must be reachable from
            every participating process.
        master_port: Port (str or int) on which rank 0 accepts the rendezvous.
        world_size: Total number of participating processes.
        backend: Name of the ``torch.distributed`` backend to use.

    Returns:
        A ``(rank, world_size)`` tuple for this process.

    Raises:
        ValueError: If ``RANK`` is not an integer, ``world_size`` is smaller
            than 1, or the rank is outside ``[0, world_size)``.
    """
    node_rank = int(os.environ.get("RANK", 0))  # 1 for worker
    world_size = int(world_size)

    # Fail fast on an impossible configuration: the rendezvous itself blocks
    # while waiting for peers, which makes a bad rank/world size hard to spot.
    if world_size < 1:
        raise ValueError(f"world_size must be at least 1, got {world_size}")
    if not 0 <= node_rank < world_size:
        raise ValueError(
            f"RANK must be in [0, {world_size}), got {node_rank}"
        )

    # Only touch the environment once the configuration is known to be sane.
    os.environ["MASTER_ADDR"] = str(master_addr)
    os.environ["MASTER_PORT"] = str(master_port)

    dist.init_process_group(
        backend=backend,
        rank=node_rank,
        world_size=world_size,
    )
    print(f"Initialized process group: rank {node_rank} of {world_size}")
    return node_rank, world_size
def send_receive_message(rank, world_size):
    """Exchange one small int64 tensor between rank 0 and its peer.

    Rank 0 sends ``[42, 43, 44]`` to rank 1 with a blocking ``dist.send``;
    any other rank blocks in ``dist.recv`` waiting for a three-element tensor
    from rank 0. Each side prints what it sent or received.

    Args:
        rank: Rank of the calling process in the default process group.
        world_size: Total number of processes (not used by this function).
    """
    is_sender = rank == 0

    if not is_sender:
        # Receiving side: pre-allocate a buffer matching the sender's tensor.
        message = torch.zeros(3, dtype=torch.int64)
        dist.recv(message, src=0)
        print(f"Rank {rank} received message: {message}")
        return

    # Sending side: rank 0 ships the payload to rank 1.
    message = torch.tensor([42, 43, 44], dtype=torch.int64)
    dist.send(message, dst=1)
    print(f"Rank {rank} sent message: {message}")
if __name__ == "__main__":
    # Script entry point: join the process group, run the exchange, clean up.
    rank, world_size = init_distributed()
    try:
        send_receive_message(rank, world_size)
        # Barrier to ensure all processes have completed
        dist.barrier()
    finally:
        # Clean up even when the exchange or the barrier raises, so the
        # process group is not left initialized on the failure path.
        dist.destroy_process_group()
使用 docker run 的 --network=host 选项时,我能够成功运行此脚本。但是,由于组织限制,我必须使用 --network=bridge 选项。当我使用 --network=bridge 时,遇到以下错误:
[E110 05:59:45.095859745 ProcessGroupGloo.cpp:143] Gloo connectFullMesh failed with [../third_party/gloo/gloo/transport/tcp/pair.cc:144] no error
Traceback (most recent call last):
File "/data/exp/com.py", line 36, in <module>
rank, world_size = init_distributed()
File "/data/exp/com.py", line 12, in init_distributed
dist.init_process_group(
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 83, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 97, in wrapper
func_return = func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1527, in init_process_group
default_pg, _ = _new_process_group_helper(
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1744, in _new_process_group_helper
backend_class = ProcessGroupGloo(
RuntimeError: Gloo connectFullMesh failed with [../third_party/gloo/gloo/transport/tcp/pair.cc:144] no error
当容器分别运行在不同的实例上时,如何配置 torch.distributed 才能与桥接网络一起使用?在这种设置下,需要哪些额外的步骤或配置才能让 Gloo 后端通信成功?如能给予任何指导或建议,不胜感激!
选项 1:使用 docker compose,并在 compose 文件中定义一个网络,供所有运行分布式工作负载的容器共同使用。
选项 2:创建一个 docker 网络,并在执行 docker run 命令时把同一个网络名称传给每个容器。同时挂载一个共享卷,让每个节点在其中写入一个空文件,文件名为该节点获取到的 IP 地址。
无论哪种情况,关键都是让容器运行在同一个网络中,这样它们才能互相连通、对等通信。如果您想在启动之前就指定 IP 地址,可以在 docker run 时通过 `--ip <Intended IP>` 来声明。请记住,所提供的 IP 必须位于该 docker 网络的地址范围内。要了解子网范围,请使用 `docker network inspect <network name>`。网络名称可以通过 `docker network ls` 获取。