我写了下面这段代码,但它不断引发错误:
def get_device_via_env_variables(deterministic: bool = False, verbose: bool = True) -> torch.device:
    """Pick a torch device that is valid inside the current process.

    ``CUDA_VISIBLE_DEVICES`` lists *physical* GPU ids, but CUDA renumbers the
    devices it exposes to the process starting at 0.  For example, with
    ``CUDA_VISIBLE_DEVICES=3`` the only usable device is ``cuda:0`` (which is
    physical GPU 3); asking for ``cuda:3`` fails.  This function therefore
    never puts the values found in the environment variable into the device
    string; it only uses process-local ordinals.

    Args:
        deterministic: When several GPUs are visible and ``CUDA_VISIBLE_DEVICES``
            is set, select the last visible device instead of a random one.
        verbose: Print the selected device.

    Returns:
        ``cpu`` when CUDA is unavailable; ``cuda:0`` when
        ``CUDA_VISIBLE_DEVICES`` is unset or only one GPU is visible; otherwise
        ``cuda:<k>`` with ``k`` a process-local index in
        ``[0, torch.cuda.device_count())``.
    """
    device: torch.device = torch.device("cpu")
    if torch.cuda.is_available():
        # Number of GPUs this process can see *after* CUDA_VISIBLE_DEVICES
        # masking; valid device ordinals are 0 .. num_visible - 1.
        num_visible: int = torch.cuda.device_count()
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or num_visible <= 1:
            device = torch.device("cuda:0")
        else:
            if deterministic:
                # Last visible device, i.e. the last usable entry listed in
                # CUDA_VISIBLE_DEVICES.
                local_idx: int = num_visible - 1
            else:
                # Local import: the file-level import block is not guaranteed
                # to provide ``random``.
                import random
                local_idx = random.randint(0, num_visible - 1)
            device = torch.device(f"cuda:{local_idx}")
    if verbose:
        print(f'{device=}')
    return device
我怀疑 `gpu_idx` 和 `CUDA_VISIBLE_DEVICES` 实际上并不匹配……我只是想加载到正确的 GPU 上。我该怎么做?
错误:
Traceback (most recent call last):
File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1368, in <module>
main_data_analyis()
File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1163, in main_data_analyis
args: Namespace = load_args()
File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1152, in load_args
args.meta_learner = get_maml_meta_learner(args)
File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/data_analysis/common.py", line 272, in get_maml_meta_learner
base_model = load_model_ckpt(args, path_to_checkpoint=args.path_2_init_maml)
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/mains/common.py", line 265, in load_model_ckpt
base_model, _, _ = load_model_optimizer_scheduler_from_ckpt(args, path_to_checkpoint,
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/mains/common.py", line 81, in load_model_optimizer_scheduler_from_ckpt
ckpt: dict = torch.load(path_to_checkpoint, map_location=torch.device('cuda:3'))
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 607, in load
return _load(opened_zipfile, map_location, pickle_module, **pickle_load_args)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 882, in _load
result = unpickler.load()
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 857, in persistent_load
load_tensor(data_type, size, key, _maybe_decode_ascii(location))
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 846, in load_tensor
loaded_storages[key] = restore_location(storage, location)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 827, in restore_location
return default_restore_location(storage, str(map_location))
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 175, in default_restore_location
result = fn(storage, location)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 151, in _cuda_deserialize
device = validate_cuda_device(location)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 142, in validate_cuda_device
raise RuntimeError('Attempting to deserialize object on CUDA device '
RuntimeError: Attempting to deserialize object on CUDA device 3 but torch.cuda.device_count() is 1. Please use torch.load with map_location to map your storages to an existing device.
我还试图用剩余的 40GB 显存运行带有 256 和 512 个过滤器的 5CNN,但结果出现了显存不足的问题:
Traceback (most recent call last):
File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1368, in <module>
main_data_analyis()
File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1213, in main_data_analyis
stats_analysis_with_emphasis_on_effect_size(args, hist=True)
File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/data_analysis/stats_analysis_with_emphasis_on_effect_size.py", line 74, in stats_analysis_with_emphasis_on_effect_size
results_usl: dict = get_episodic_accs_losses_all_splits_usl(args, args.mdl_sl, loaders)
File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/data_analysis/common.py", line 616, in get_episodic_accs_losses_all_splits_usl
losses, accs = agent.get_lists_accs_losses(data, training)
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/meta_learners/pretrain_convergence.py", line 92, in get_lists_accs_losses
spt_embeddings_t = self.get_embedding(spt_x_t, self.base_model).detach()
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/meta_learners/pretrain_convergence.py", line 166, in get_embedding
return get_embedding(x=x, base_model=base_model)
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/meta_learners/pretrain_convergence.py", line 267, in get_embedding
out = base_model.model.features(x)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/container.py", line 139, in forward
input = module(input)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 443, in forward
return self._conv_forward(input, self.weight, self.bias)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 439, in _conv_forward
return F.conv2d(input, weight, bias, self.stride,
RuntimeError: CUDA out of memory. Tried to allocate 174.00 MiB (GPU 0; 79.20 GiB total capacity; 54.31 GiB already allocated; 22.56 MiB free; 54.61 GiB reserved in total by PyTorch)
我想使用 GPU 3,但最后一个错误是 GPU 0。我做错了什么?
您遇到这个错误,是因为当前进程只能看到一个 GPU:正如 torch.cuda.device_count() 所示,可见的 GPU 只有一个。然而,您却试图把模型加载到并不存在的 cuda:3 上。换句话说,PyTorch 在当前进程中找不到您指定的设备,于是抛出了错误。
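请检查在您要加载模型的设备上,PyTorch 是否确实有可用的 CUDA 资源。请注意,设置 CUDA_VISIBLE_DEVICES 之后,进程内可见的 GPU 会从 0 开始重新编号(例如 CUDA_VISIBLE_DEVICES=3 时应使用 cuda:0);您可以改用进程内的设备索引,或者在 torch.load 中通过 map_location 把模型映射到一个存在的设备上。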