From 5cb18370444625023f900c196f7812d0d5322514 Mon Sep 17 00:00:00 2001 From: liangyi1019 Date: Sat, 3 Dec 2022 19:17:34 +0800 Subject: [PATCH 1/4] fix 4p --- .../DeeplabV3_for_Pytorch/test/train_performance_4p.sh | 4 ++-- .../test/train_performance_4p_openmmlab.sh | 4 +++- .../test/train_performance_4p_openmmlab.sh | 2 +- .../SE-ResNet-50/test/train_performance_4p_openmmlab.sh | 3 ++- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/train/model/accuracy_test/DeeplabV3_for_Pytorch/test/train_performance_4p.sh b/train/model/accuracy_test/DeeplabV3_for_Pytorch/test/train_performance_4p.sh index af40a99f..acceda8f 100644 --- a/train/model/accuracy_test/DeeplabV3_for_Pytorch/test/train_performance_4p.sh +++ b/train/model/accuracy_test/DeeplabV3_for_Pytorch/test/train_performance_4p.sh @@ -88,7 +88,7 @@ do --seed 1 \ --deterministic \ --device npu \ - --options data.workers_per_gpu=${workers} \ + --options device_num=4 data.workers_per_gpu=${workers} \ --local_rank 0 > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & else python3.7 ${cur_path}/tools/train.py ${cur_path}/configs/deeplabv3/deeplabv3_r50-d8_512x1024_40k_cityscapes.py \ @@ -96,7 +96,7 @@ do --seed 1 \ --deterministic \ --device npu \ - --options data.workers_per_gpu=${workers} \ + --options device_num=4 data.workers_per_gpu=${workers} \ --local_rank 0 > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & fi done diff --git a/train/model/perf_test/InceptionV3_ID1596_for_PyTorch/test/train_performance_4p_openmmlab.sh b/train/model/perf_test/InceptionV3_ID1596_for_PyTorch/test/train_performance_4p_openmmlab.sh index c2485c3f..762d6579 100644 --- a/train/model/perf_test/InceptionV3_ID1596_for_PyTorch/test/train_performance_4p_openmmlab.sh +++ b/train/model/perf_test/InceptionV3_ID1596_for_PyTorch/test/train_performance_4p_openmmlab.sh @@ -66,7 +66,7 @@ if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi -KERNEL_NUM=$(($(nproc)/4)) +KERNEL_NUM=$(($(nproc)/8)) for i in $(seq 0 3) do if [ $(uname -m) = "aarch64" ] @@ -78,6 +78,7 @@ do --amp \ --loss_scale=128 \ --data ${data_path} \ + --device_list=0,1,2,3 \ --addr=$(hostname -I |awk '{print $1}') \ --seed=49 \ --workers=128 \ @@ -101,6 +102,7 @@ do --amp \ --loss_scale=128 \ --data ${data_path} \ + --device_list=0,1,2,3 \ --addr=$(hostname -I |awk '{print $1}') \ --seed=49 \ --workers=128 \ diff --git a/train/model/perf_test/ResNet50_for_PyTorch/test/train_performance_4p_openmmlab.sh b/train/model/perf_test/ResNet50_for_PyTorch/test/train_performance_4p_openmmlab.sh index c46d402a..9bcd1dc2 100644 --- a/train/model/perf_test/ResNet50_for_PyTorch/test/train_performance_4p_openmmlab.sh +++ b/train/model/perf_test/ResNet50_for_PyTorch/test/train_performance_4p_openmmlab.sh @@ -69,7 +69,7 @@ nohup python3.7 ./DistributedResnet50/main_apex_d76_npu.py \ --data ${data_path} \ --addr=$(hostname -I |awk '{print $1}') \ --seed=49 \ - --device-list 0 1 2 3 \ + --device-list 0,1,2,3 \ --workers=${workers} \ --learning-rate=1.6 \ --warmup=8 \ diff --git a/train/model/perf_test/SE-ResNet-50/test/train_performance_4p_openmmlab.sh b/train/model/perf_test/SE-ResNet-50/test/train_performance_4p_openmmlab.sh index 513becb6..acaa604b 100644 --- a/train/model/perf_test/SE-ResNet-50/test/train_performance_4p_openmmlab.sh +++ b/train/model/perf_test/SE-ResNet-50/test/train_performance_4p_openmmlab.sh @@ -64,7 +64,7 @@ etp_flag=`echo ${check_etp_flag#*=}` if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi -KERNEL_NUM=$(($(nproc)/4)) +KERNEL_NUM=$(($(nproc)/8)) for i in $(seq 0 3) do PID_START=$((KERNEL_NUM * i)) @@ -74,6 +74,7 @@ do --addr=$(hostname -I |awk '{print $1}') \ --seed=49 \ --workers=64 \ + --device_list=0,1,2,3 \ --learning-rate=0.6 \ --mom=0.9 \ --weight-decay=1.0e-04 \ -- Gitee From cdd97db34f296f480ab8c31e9abf839d3ae16cc0 Mon Sep 17 00:00:00 2001 From: liangyi1019 Date: Mon, 5 Dec 2022 15:51:01 +0800 Subject: [PATCH 2/4] fix FCOS_scripts --- train/model/perf_test/FCOS/mmdet/apis/train.py | 5 +++-- train/model/perf_test/FCOS/test/train_performance_1p.sh | 2 +- train/model/perf_test/FCOS/test/train_performance_4p.sh | 2 +- train/model/perf_test/FCOS/test/train_performance_8p.sh | 2 +- train/model/perf_test/FCOS/tools/train.py | 2 +- 5 files changed, 7 insertions(+), 6 deletions(-) diff --git a/train/model/perf_test/FCOS/mmdet/apis/train.py b/train/model/perf_test/FCOS/mmdet/apis/train.py index 3c6da670..4e40f06c 100644 --- a/train/model/perf_test/FCOS/mmdet/apis/train.py +++ b/train/model/perf_test/FCOS/mmdet/apis/train.py @@ -31,7 +31,7 @@ # ============================================================================ import random - +import mmcv.runner import get_dist_info import numpy as np import torch from mmcv.parallel import MMDataParallel, MMDistributedDataParallel @@ -118,10 +118,11 @@ def train_detector(model, model = MMDataParallel( model, device_ids=cfg.npu_ids) # mode with apex - + _, world_size = get_dist_info() # build runner runner = EpochBasedRunner( model, + num_of_gpus=world_size, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, diff --git a/train/model/perf_test/FCOS/test/train_performance_1p.sh b/train/model/perf_test/FCOS/test/train_performance_1p.sh index a45b2244..95f7b69b 100644 --- a/train/model/perf_test/FCOS/test/train_performance_1p.sh +++ b/train/model/perf_test/FCOS/test/train_performance_1p.sh @@ -66,7 +66,7 @@ fi #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 PORT=29880 ./tools/dist_train.sh ./configs/fcos/fcos_r50_caffe_fpn_4x4_1x_coco.py 1 \ --npu-ids 0 \ - --cfg-options optimizer.lr=0.00125 data.samples_per_gpu=16 total_epochs=1 data_root=$data_path \ + --cfg-options optimizer.lr=0.00125 data.samples_per_gpu=2 total_epochs=1 data_root=$data_path \ --seed 0 \ --opt-level O1 \ --loss-scale 32.0 > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & diff --git a/train/model/perf_test/FCOS/test/train_performance_4p.sh b/train/model/perf_test/FCOS/test/train_performance_4p.sh index aa08fac9..dbfc152c 100644 --- a/train/model/perf_test/FCOS/test/train_performance_4p.sh +++ b/train/model/perf_test/FCOS/test/train_performance_4p.sh @@ -66,7 +66,7 @@ fi #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 PORT=29888 ./tools/dist_train.sh ./configs/fcos/fcos_r50_caffe_fpn_4x4_1x_coco.py 4 \ --npu-ids 0 \ - --cfg-options optimizer.lr=0.01 total_epochs=1 data_root=$data_path \ + --cfg-options optimizer.lr=0.01 data.samples_per_gpu=2 total_epochs=1 data_root=$data_path \ --seed 0 \ --opt-level O1 \ --loss-scale 32.0 > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & diff --git a/train/model/perf_test/FCOS/test/train_performance_8p.sh b/train/model/perf_test/FCOS/test/train_performance_8p.sh index 33539d92..2ee3e9a8 100644 --- a/train/model/perf_test/FCOS/test/train_performance_8p.sh +++ b/train/model/perf_test/FCOS/test/train_performance_8p.sh @@ -66,7 +66,7 @@ fi #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 PORT=29888 ./tools/dist_train.sh ./configs/fcos/fcos_r50_caffe_fpn_4x4_1x_coco.py 8 \ --npu-ids 0 \ - --cfg-options optimizer.lr=0.01 total_epochs=1 data_root=$data_path \ + --cfg-options optimizer.lr=0.01 data.samples_per_gpu=2 total_epochs=1 data_root=$data_path \ --seed 0 \ --opt-level O1 \ --loss-scale 32.0 > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & diff --git a/train/model/perf_test/FCOS/tools/train.py b/train/model/perf_test/FCOS/tools/train.py index ae24e69b..3666fe67 100644 --- a/train/model/perf_test/FCOS/tools/train.py +++ b/train/model/perf_test/FCOS/tools/train.py @@ -153,7 +153,7 @@ def main(): cfg.resume_from = args.resume_from if args.npu_ids is not None: cfg.npu_ids = args.npu_ids - # torch.npu.set_device(cfg.npu_ids[0]) + torch.npu.set_device(cfg.npu_ids[0]) else: cfg.npu_ids = range(1) if args.npus is None else range(args.npus) -- Gitee From cf119e24e00293c6584c96ec5a794bb2d312657c Mon Sep 17 00:00:00 2001 From: liangyi1019 Date: Mon, 5 Dec 2022 16:46:18 +0800 Subject: [PATCH 3/4] fix --- train/model/perf_test/FCOS/mmdet/apis/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train/model/perf_test/FCOS/mmdet/apis/train.py b/train/model/perf_test/FCOS/mmdet/apis/train.py index 4e40f06c..8531accf 100644 --- a/train/model/perf_test/FCOS/mmdet/apis/train.py +++ b/train/model/perf_test/FCOS/mmdet/apis/train.py @@ -31,7 +31,7 @@ # ============================================================================ import random -import mmcv.runner import get_dist_info +from mmcv.runner import get_dist_info import numpy as np import torch from mmcv.parallel import MMDataParallel, MMDistributedDataParallel -- Gitee From d305f16bf9b4fd79987d75af967331e21495df69 Mon Sep 17 00:00:00 2001 From: liangyi1019 Date: Mon, 5 Dec 2022 17:15:27 +0800 Subject: [PATCH 4/4] fix --- train/model/perf_test/FCOS/mmdet/apis/train.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/train/model/perf_test/FCOS/mmdet/apis/train.py b/train/model/perf_test/FCOS/mmdet/apis/train.py index 525303ef..8531accf 100644 --- a/train/model/perf_test/FCOS/mmdet/apis/train.py +++ b/train/model/perf_test/FCOS/mmdet/apis/train.py @@ -31,11 +31,7 @@ # ============================================================================ import random -<<<<<<< HEAD from mmcv.runner import get_dist_info -======= -import mmcv.runner import get_dist_info ->>>>>>> 365e04d9125158cf477adb0c6a90287887b62496 import numpy as np import torch from mmcv.parallel import MMDataParallel, MMDistributedDataParallel -- Gitee