PyTorch Device Selection and Multi-GPU Training
The to() Function
Purpose: convert the data type or device of a tensor or module.
Two forms: 1. tensor.to()  2. module.to()
Example:
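A minimal sketch of both forms, assuming CUDA is available (the tensor x and the Linear layer are only illustrative):

import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

x = torch.ones((3, 3))
x = x.to(torch.float64)   # tensor.to(): change dtype; returns a new tensor, not in-place
x = x.to(device)          # tensor.to(): move to a device; the result must be reassigned

net = nn.Linear(3, 3)     # illustrative module
net.to(device)            # module.to(): moves parameters in place, no reassignment needed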
torch.cuda
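The torch.cuda module provides device-management helpers; a few commonly used calls (a non-exhaustive sketch):

import torch

print(torch.cuda.is_available())          # whether a usable CUDA device is visible
print(torch.cuda.device_count())          # number of visible GPUs
if torch.cuda.is_available():
    print(torch.cuda.current_device())    # index of the current default GPU
    print(torch.cuda.get_device_name(0))  # name of GPU 0
    torch.cuda.set_device(0)              # set the default GPU (CUDA_VISIBLE_DEVICES is usually preferred)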
Multi-GPU Parallel Distribution
The input data is split into equal chunks and dispatched to different GPUs to run in parallel.
torch.nn.DataParallel
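A minimal sketch of the wrapper in use, assuming a toy Linear model; DataParallel replicates the module and scatters each input batch along dimension 0 across the visible GPUs:

import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

net = nn.Linear(3, 3)        # illustrative model
net = nn.DataParallel(net)   # replicate the model; inputs are split across GPUs
net.to(device)

x = torch.ones((16, 3), device=device)   # a batch of 16 is divided evenly among the GPUs
y = net(x)                   # outputs are gathered back on the primary GPU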
import os
import numpy as np
import torch

# ============================ manually select GPUs
gpu_list = [2, 3]
gpu_list_str = ','.join(map(str, gpu_list))
os.environ.setdefault("CUDA_VISIBLE_DEVICES", gpu_list_str)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ============================ automatically pick the primary GPU by free memory
def get_gpu_memory():
    import platform
    if "Windows" != platform.system():
        # query the free memory of every GPU via nvidia-smi (Linux only)
        os.system('nvidia-smi -q -d Memory | grep -A4 GPU | grep Free > tmp.txt')
        memory_gpu = [int(x.split()[2]) for x in open('tmp.txt', 'r').readlines()]
        os.system('rm tmp.txt')
    else:
        memory_gpu = False
        print("Querying GPU memory is not yet supported on Windows")
    return memory_gpu

gpu_memory = get_gpu_memory()
if gpu_memory:   # only reorder devices when the query succeeded
    print("gpu free memory: {}".format(gpu_memory))
    gpu_list = np.argsort(gpu_memory)[::-1]        # GPU indices sorted by free memory, descending
    gpu_list_str = ','.join(map(str, gpu_list))
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", gpu_list_str)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Common Errors
1. A model saved on a GPU cannot be deserialized directly on a CPU-only machine; pass map_location="cpu" to torch.load() so the saved tensors are mapped onto the CPU.
path_state_dict = "./model_in_gpu_0.pkl"
state_dict_load = torch.load(path_state_dict, map_location="cpu")
print("state_dict_load: {}".format(state_dict_load))
2. A model trained with multiple GPUs (DataParallel) stores every layer name with an extra "module." prefix, so the keys must be renamed before loading the state dict into a plain (unwrapped) model.
from collections import OrderedDict

new_state_dict = OrderedDict()
for k, v in state_dict_load.items():
    namekey = k[7:] if k.startswith('module.') else k   # strip the leading "module." added by DataParallel
    new_state_dict[namekey] = v
print("new_state_dict: {}".format(new_state_dict))

net.load_state_dict(new_state_dict)
Then simply load new_state_dict into the network.