polarmix单卡训练后test报错
问题根源
权重文件是用 DistributedDataParallel包装的模型保存的(键名带 module.前缀),但加载时模型没有被 DDP 包装,导致键名不匹配。
解决方案
保持 torchpack的分布式初始化方式:使用 dist.init()而不是 PyTorch 原生的 init_process_group
正确处理权重文件:在加载权重时去掉 module.前缀
关键代码修改
在 model_zoo.py中添加了权重处理逻辑
# Strip the 'module.' prefix that DistributedDataParallel adds to every key.
new_state_dict = {}
for k, v in state_dict.items():
    if k.startswith('module.'):
        new_k = k[len('module.'):]  # drop the leading 'module.'
    else:
        new_k = k
    new_state_dict[new_k] = v
# Load in non-strict mode so that leftover key mismatches do not raise.
model.load_state_dict(new_state_dict, strict=False)
解决方法如下:
第一步:修改 test.py
import argparse
import os
import sys

import torch
import torch.backends.cudnn
import torch.cuda
import torch.nn
import torch.utils.data
from torchpack import distributed as dist
from torchpack.callbacks import Callbacks, SaverRestore
from torchpack.environ import auto_set_run_dir, set_run_dir
from torchpack.utils.config import configs
from torchpack.utils.logging import logger
from tqdm import tqdm

from core import builder
from core.callbacks import MeanIoU
from core.trainers import SemanticKITTITrainer
from model_zoo import minkunet_test, spvcnn_test


def main() -> None:
    """Evaluate a trained checkpoint on the ``test`` split.

    Reads ``<name>/metainfo/configs.yaml`` and
    ``<name>/checkpoints/max-iou-test.pt``, builds the model through
    ``spvcnn_test`` or ``minkunet_test`` depending on ``--name``, wraps it
    in ``DistributedDataParallel`` and feeds per-scene predictions and
    targets to the trainer callbacks (``SaverRestore`` and ``MeanIoU``).

    Raises:
        FileNotFoundError: if the checkpoint file does not exist.
        NotImplementedError: if ``--name`` contains neither ``spvcnn`` nor
            ``mink`` (case-insensitive).
    """
    # Correct distributed initialization: torchpack's dist.init().
    dist.init()

    torch.backends.cudnn.benchmark = True
    torch.cuda.set_device(dist.local_rank())

    parser = argparse.ArgumentParser()
    parser.add_argument('--run-dir', metavar='DIR', help='run directory')
    # --name is concatenated into paths below, so it must be provided.
    parser.add_argument('--name', type=str, required=True, help='model name')
    parser.add_argument('--gpu', default='0', help='gpu index')
    args, opts = parser.parse_known_args()

    args.config = args.name + '/metainfo/configs.yaml'
    configs.load(args.config, recursive=True)
    configs.update(opts)

    if args.run_dir is None:
        args.run_dir = auto_set_run_dir()
    else:
        set_run_dir(args.run_dir)

    # NOTE(review): this is assigned after torch.cuda.set_device() above, so
    # it presumably no longer influences device selection in this process --
    # confirm whether --gpu is still needed.
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    logger.info(' '.join([sys.executable] + sys.argv))
    logger.info(f'Experiment started: "{args.run_dir}".' + '\n' + f'{configs}')

    dataset = builder.make_dataset()
    dataflow = {}
    for split in dataset:
        sampler = torch.utils.data.distributed.DistributedSampler(
            dataset[split],
            num_replicas=dist.size(),
            rank=dist.rank(),
            shuffle=(split == 'train'),
        )
        dataflow[split] = torch.utils.data.DataLoader(
            dataset[split],
            batch_size=1,
            sampler=sampler,
            num_workers=configs.workers_per_gpu,
            pin_memory=True,
            collate_fn=dataset[split].collate_fn,
        )

    weight_path = args.name + '/checkpoints/max-iou-test.pt'
    # Explicit error instead of ``assert``: asserts are stripped under -O.
    if not os.path.exists(weight_path):
        raise FileNotFoundError(f'checkpoint not found: {weight_path}')

    # Load the model (using the fixed loader functions from model_zoo).
    model_name = args.name.lower()
    if 'spvcnn' in model_name:
        model = spvcnn_test(weight_path=weight_path, configs=configs)
    elif 'mink' in model_name:
        model = minkunet_test(weight_path=weight_path, configs=configs)
    else:
        raise NotImplementedError(
            f'unsupported model name: {args.name!r} '
            "(expected it to contain 'spvcnn' or 'mink')"
        )

    # Wrap the model with DDP.
    model = torch.nn.parallel.DistributedDataParallel(
        model.cuda(),
        device_ids=[dist.local_rank()],
        find_unused_parameters=True,
    )
    model.eval()

    criterion = builder.make_criterion()
    optimizer = builder.make_optimizer(model)
    scheduler = builder.make_scheduler(optimizer)

    trainer = SemanticKITTITrainer(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        num_workers=configs.workers_per_gpu,
        seed=configs.train.seed,
    )
    callbacks = Callbacks([
        SaverRestore(),
        MeanIoU(configs.data.num_classes, configs.data.ignore_label),
    ])
    callbacks._set_trainer(trainer)
    trainer.callbacks = callbacks
    trainer.dataflow = dataflow['test']

    trainer.before_train()
    trainer.before_epoch()

    # Re-assert eval mode after the trainer hooks have run.
    model.eval()

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        for feed_dict in tqdm(dataflow['test'], desc='eval'):
            # Move every entry to the GPU except those whose key has 'name'.
            _inputs = {}
            for key, value in feed_dict.items():
                if 'name' not in key:
                    _inputs[key] = value.cuda()

            inputs = _inputs['lidar']
            outputs = model(inputs)

            invs = feed_dict['inverse_map']
            all_labels = feed_dict['targets_mapped']
            _outputs = []
            _targets = []
            # The last column of ``.C`` is used as the per-scene index.
            num_scenes = int(invs.C[:, -1].max()) + 1
            for idx in range(num_scenes):
                cur_scene_pts = (inputs.C[:, -1] == idx).cpu().numpy()
                cur_inv = invs.F[invs.C[:, -1] == idx].cpu().numpy()
                cur_label = (all_labels.C[:, -1] == idx).cpu().numpy()
                outputs_mapped = outputs[cur_scene_pts][cur_inv].argmax(1)
                targets_mapped = all_labels.F[cur_label]
                _outputs.append(outputs_mapped)
                _targets.append(targets_mapped)

            outputs = torch.cat(_outputs, 0)
            targets = torch.cat(_targets, 0)
            output_dict = {'outputs': outputs, 'targets': targets}
            trainer.after_step(output_dict)

    trainer.after_epoch()


if __name__ == '__main__':
    main()

# Step 2: modify model_zoo.py
def spvcnn_test(weight_path, configs):
    """Build an SPVCNN and load evaluation weights from ``weight_path``.

    The checkpoint may have been saved from a ``DistributedDataParallel``
    wrapped model, in which case every key carries a ``module.`` prefix;
    the prefix is removed so the keys match the unwrapped model.

    Args:
        weight_path: path of the checkpoint file passed to ``torch.load``.
        configs: config object providing ``data.num_classes``, ``model.cr``
            and ``dataset.voxel_size``.

    Returns:
        The SPVCNN instance with the checkpoint weights loaded (non-strict).
    """
    import warnings

    model = SPVCNN(
        num_classes=configs.data.num_classes,
        cr=configs.model.cr,
        pres=configs.dataset.voxel_size,
        vres=configs.dataset.voxel_size,
    )

    checkpoint = torch.load(weight_path, map_location='cpu')

    # The weights live under 'model' when present, else the checkpoint
    # itself is used as the state dict.
    state_dict = checkpoint['model'] if 'model' in checkpoint else checkpoint

    ddp_prefix = 'module.'
    training_state_keys = {'optimizer', 'scheduler', 'scaler', 'epoch', 'step'}

    model_state_dict = {}
    for k, v in state_dict.items():
        if k.startswith(ddp_prefix):
            k = k[len(ddp_prefix):]  # strip the 'module.' prefix
        # Drop training state only. Match the top-level component exactly so
        # a real weight whose name merely contains one of these words is not
        # discarded by accident.
        if k.split('.', 1)[0] in training_state_keys:
            continue
        model_state_dict[k] = v

    result = model.load_state_dict(model_state_dict, strict=False)
    # strict=False never raises on key mismatches, so report parameters that
    # did not receive a value instead of leaving them silently uninitialised.
    if result.missing_keys:
        warnings.warn(
            f'{len(result.missing_keys)} model key(s) missing from '
            f'{weight_path}: {result.missing_keys}'
        )
    return model