From c3f55982c709d725bcf870717a1f43f0ca1f415a Mon Sep 17 00:00:00 2001 From: Bin Lu Date: Wed, 19 Jan 2022 14:45:35 +0800 Subject: [PATCH 1/6] Create train_infer_python_fleet.txt --- test_tipc/supplementary/test_tipc/train_infer_python_fleet.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 test_tipc/supplementary/test_tipc/train_infer_python_fleet.txt diff --git a/test_tipc/supplementary/test_tipc/train_infer_python_fleet.txt b/test_tipc/supplementary/test_tipc/train_infer_python_fleet.txt new file mode 100644 index 0000000000..72943a16fb --- /dev/null +++ b/test_tipc/supplementary/test_tipc/train_infer_python_fleet.txt @@ -0,0 +1 @@ +aaa From 8fe6209d5806a19dab47d740fc93ec26a88e859e Mon Sep 17 00:00:00 2001 From: Bin Lu Date: Wed, 19 Jan 2022 14:51:27 +0800 Subject: [PATCH 2/6] Update train_infer_python_fleet.txt --- .../test_tipc/train_infer_python_fleet.txt | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/test_tipc/supplementary/test_tipc/train_infer_python_fleet.txt b/test_tipc/supplementary/test_tipc/train_infer_python_fleet.txt index 72943a16fb..b095f02cf4 100644 --- a/test_tipc/supplementary/test_tipc/train_infer_python_fleet.txt +++ b/test_tipc/supplementary/test_tipc/train_infer_python_fleet.txt @@ -1 +1,18 @@ -aaa +===========================train_params=========================== +model_name:ch_PPOCRv2_det +python:python3.7 +gpu_list:xx.xx.xx.xx,yy.yy.yy.yy;0,1 +use_gpu:True +AMP.use_amp:True|False +epoch:lite_train_lite_infer=2|whole_train_whole_infer=1000 +save_model_dir:./output/ +TRAIN.batch_size:lite_train_lite_infer=1280|whole_train_whole_infer=1280 +pretrained_model:null +checkpoints:null +use_custom_relu:False|True +model_type:cls|cls_distill|cls_distill_multiopt +MODEL.siamese:False|True +norm_train:train.py -c mv3_large_x0_5.yml -o +quant_train:False +prune_train:False + From ac2d2c6543bf9aefee18e77b23bf60c302cc012f Mon Sep 17 00:00:00 2001 From: Bin Lu Date: Wed, 9 Feb 2022 14:57:01 +0800 Subject: [PATCH 3/6] Update readme.md --- test_tipc/supplementary/readme.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test_tipc/supplementary/readme.md b/test_tipc/supplementary/readme.md index b630b0f30b..cc49f00996 100644 --- a/test_tipc/supplementary/readme.md +++ b/test_tipc/supplementary/readme.md @@ -47,6 +47,13 @@ bash test_tipc/test_train_python.sh ./test_tipc/train_infer_python_PACT.txt 'lit bash test_tipc/test_train_python.sh ./test_tipc/train_infer_python_FPGM.txt 'lite_train_lite_infer' ``` +多机多卡的运行配置文件分别为'train_infer_python_fleet.txt', 'train_infer_python_FPGM_fleet.txt', 'train_infer_python_PACT_fleet.txt'. +运行时,需要修改配置文件中的`gpu_list:xx.xx.xx.xx,yy.yy.yy.yy;0,1`. 将`xx.xx.xx.xx` 和 `yy.yy.yy.yy`替换为具体的 `ip` 地址。 另外,和单机训练 +不同,多机多卡训练需要在多机的每个节点上分别运行命令。 以多机多卡量化训练为例, 指令如下: +``` +bash test_tipc/test_train_python.sh ./test_tipc/train_infer_python_PACT_fleet.txt 'lite_train_lite_infer' +``` + 运行相应指令后,在`test_tipc/output`文件夹下自动会保存运行日志。如'lite_train_lite_infer'模式运行后,在test_tipc/extra_output文件夹有以下文件: ``` From dfaccd5b5a7ccdb9b3536f78e5be76ed114b05d8 Mon Sep 17 00:00:00 2001 From: Bin Lu Date: Wed, 9 Feb 2022 14:57:38 +0800 Subject: [PATCH 4/6] Update readme.md --- test_tipc/supplementary/readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_tipc/supplementary/readme.md b/test_tipc/supplementary/readme.md index cc49f00996..1ad8076256 100644 --- a/test_tipc/supplementary/readme.md +++ b/test_tipc/supplementary/readme.md @@ -47,7 +47,7 @@ bash test_tipc/test_train_python.sh ./test_tipc/train_infer_python_PACT.txt 'lit bash test_tipc/test_train_python.sh ./test_tipc/train_infer_python_FPGM.txt 'lite_train_lite_infer' ``` -多机多卡的运行配置文件分别为'train_infer_python_fleet.txt', 'train_infer_python_FPGM_fleet.txt', 'train_infer_python_PACT_fleet.txt'. +多机多卡的运行配置文件分别为`train_infer_python_fleet.txt`, `train_infer_python_FPGM_fleet.txt`, `train_infer_python_PACT_fleet.txt`. 运行时,需要修改配置文件中的`gpu_list:xx.xx.xx.xx,yy.yy.yy.yy;0,1`. 将`xx.xx.xx.xx` 和 `yy.yy.yy.yy`替换为具体的 `ip` 地址。 另外,和单机训练 不同,多机多卡训练需要在多机的每个节点上分别运行命令。 以多机多卡量化训练为例, 指令如下: ``` From 0979bb76c286e73120e88db06d0b3d2beab15a34 Mon Sep 17 00:00:00 2001 From: Bin Lu Date: Wed, 9 Feb 2022 15:01:18 +0800 Subject: [PATCH 5/6] Update readme.md --- test_tipc/supplementary/readme.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test_tipc/supplementary/readme.md b/test_tipc/supplementary/readme.md index 1ad8076256..a378fc5f35 100644 --- a/test_tipc/supplementary/readme.md +++ b/test_tipc/supplementary/readme.md @@ -47,9 +47,9 @@ bash test_tipc/test_train_python.sh ./test_tipc/train_infer_python_PACT.txt 'lit bash test_tipc/test_train_python.sh ./test_tipc/train_infer_python_FPGM.txt 'lite_train_lite_infer' ``` -多机多卡的运行配置文件分别为`train_infer_python_fleet.txt`, `train_infer_python_FPGM_fleet.txt`, `train_infer_python_PACT_fleet.txt`. -运行时,需要修改配置文件中的`gpu_list:xx.xx.xx.xx,yy.yy.yy.yy;0,1`. 将`xx.xx.xx.xx` 和 `yy.yy.yy.yy`替换为具体的 `ip` 地址。 另外,和单机训练 -不同,多机多卡训练需要在多机的每个节点上分别运行命令。 以多机多卡量化训练为例, 指令如下: +多机多卡的运行配置文件分别为 `train_infer_python_fleet.txt`, `train_infer_python_FPGM_fleet.txt` 和 `train_infer_python_PACT_fleet.txt`。 +运行时,需要修改配置文件中的 `gpu_list:xx.xx.xx.xx,yy.yy.yy.yy;0,1`。 将 `xx.xx.xx.xx` 替换为具体的 `ip` 地址,各个`ip`地址之间用`,`分隔。 另外,和单机训练 +不同,启动多机多卡训练需要在多机的每个节点上分别运行命令。以多机多卡量化训练为例,指令如下: ``` bash test_tipc/test_train_python.sh ./test_tipc/train_infer_python_PACT_fleet.txt 'lite_train_lite_infer' ``` From 8a44a90396e13f5ba4605c327a9be05226818b51 Mon Sep 17 00:00:00 2001 From: stephon Date: Wed, 9 Feb 2022 09:24:49 +0000 Subject: [PATCH 6/6] add fleet tests --- .../test_tipc/test_train_python.sh | 3 ++- .../test_tipc/train_infer_python_FPGM.txt | 4 ++-- .../test_tipc/train_infer_python_FPGM_fleet.txt | 17 +++++++++++++++++ .../test_tipc/train_infer_python_PACT.txt | 4 ++-- .../test_tipc/train_infer_python_PACT_fleet.txt | 17 +++++++++++++++++ .../test_tipc/train_infer_python_fleet.txt | 3 +-- 6 files changed, 41 insertions(+), 7 deletions(-) create mode 100644 test_tipc/supplementary/test_tipc/train_infer_python_FPGM_fleet.txt create mode 100644 test_tipc/supplementary/test_tipc/train_infer_python_PACT_fleet.txt diff --git a/test_tipc/supplementary/test_tipc/test_train_python.sh b/test_tipc/supplementary/test_tipc/test_train_python.sh index f922b57bba..ed709c1c4b 100644 --- a/test_tipc/supplementary/test_tipc/test_train_python.sh +++ b/test_tipc/supplementary/test_tipc/test_train_python.sh @@ -35,7 +35,6 @@ use_share_conv_key=$(func_parser_key "${lines[13]}") use_share_conv_list=$(func_parser_value "${lines[13]}") run_train_py=$(func_parser_value "${lines[14]}") - LOG_PATH="./test_tipc/extra_output" mkdir -p ${LOG_PATH} status_log="${LOG_PATH}/results_python.log" @@ -98,6 +97,8 @@ if [ ${MODE} = "lite_train_lite_infer" ] || [ ${MODE} = "whole_train_whole_infer cmd="${python} ${run_train_py} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_checkpoints} ${set_autocast} ${set_batchsize} ${set_use_custom_op} ${set_model_type} ${set_use_share_conv} ${set_amp_config}" elif [ ${#ips} -le 26 ];then # train with multi-gpu cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train_py} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_checkpoints} ${set_autocast} ${set_batchsize} ${set_use_custom_op} ${set_model_type} ${set_use_share_conv} ${set_amp_config}" + else + cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train_py} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_checkpoints} ${set_autocast} ${set_batchsize} ${set_use_custom_op} ${set_model_type} ${set_use_share_conv} ${set_amp_config}" fi # run train diff --git a/test_tipc/supplementary/test_tipc/train_infer_python_FPGM.txt b/test_tipc/supplementary/test_tipc/train_infer_python_FPGM.txt index 4c2e28b91e..ccbd27ffbc 100644 --- a/test_tipc/supplementary/test_tipc/train_infer_python_FPGM.txt +++ b/test_tipc/supplementary/test_tipc/train_infer_python_FPGM.txt @@ -4,9 +4,9 @@ python:python3.7 gpu_list:0|0,1 use_gpu:True|True AMP.use_amp:True|False -epoch:lite_train_lite_infer=20|whole_train_whole_infer=1000 +epoch:lite_train_lite_infer=2|whole_train_whole_infer=1000 save_model_dir:./output/ -TRAIN.batch_size:lite_train_lite_infer=2|whole_train_whole_infer=4 +TRAIN.batch_size:lite_train_lite_infer=1280|whole_train_whole_infer=1280 pretrained_model:null checkpoints:null use_custom_relu:False|True diff --git a/test_tipc/supplementary/test_tipc/train_infer_python_FPGM_fleet.txt b/test_tipc/supplementary/test_tipc/train_infer_python_FPGM_fleet.txt new file mode 100644 index 0000000000..be2b2117d7 --- /dev/null +++ b/test_tipc/supplementary/test_tipc/train_infer_python_FPGM_fleet.txt @@ -0,0 +1,17 @@ +===========================train_params=========================== +model_name:ch_PPOCRv2_det +python:python3.7 +gpu_list:xx.xx.xx.xx,yy.yy.yy.yy;0,1 +use_gpu:True +AMP.use_amp:True|False +epoch:lite_train_lite_infer=2|whole_train_whole_infer=1000 +save_model_dir:./output/ +TRAIN.batch_size:lite_train_lite_infer=1280|whole_train_whole_infer=1280 +pretrained_model:null +checkpoints:null +use_custom_relu:False|True +model_type:cls|cls_distill|cls_distill_multiopt +MODEL.siamese:False|True +norm_train:train.py -c mv3_large_x0_5.yml -o prune_train=True +quant_train:False +prune_train:False diff --git a/test_tipc/supplementary/test_tipc/train_infer_python_PACT.txt b/test_tipc/supplementary/test_tipc/train_infer_python_PACT.txt index 079cddf878..24d291b4b3 100644 --- a/test_tipc/supplementary/test_tipc/train_infer_python_PACT.txt +++ b/test_tipc/supplementary/test_tipc/train_infer_python_PACT.txt @@ -4,9 +4,9 @@ python:python3.7 gpu_list:0|0,1 use_gpu:True|True AMP.use_amp:True|False -epoch:lite_train_lite_infer=20|whole_train_whole_infer=1000 +epoch:lite_train_lite_infer=2|whole_train_whole_infer=1000 save_model_dir:./output/ -TRAIN.batch_size:lite_train_lite_infer=2|whole_train_whole_infer=4 +TRAIN.batch_size:lite_train_lite_infer=1280|whole_train_whole_infer=1280 pretrained_model:null checkpoints:null use_custom_relu:False|True diff --git a/test_tipc/supplementary/test_tipc/train_infer_python_PACT_fleet.txt b/test_tipc/supplementary/test_tipc/train_infer_python_PACT_fleet.txt new file mode 100644 index 0000000000..93f06d7633 --- /dev/null +++ b/test_tipc/supplementary/test_tipc/train_infer_python_PACT_fleet.txt @@ -0,0 +1,17 @@ +===========================train_params=========================== +model_name:ch_PPOCRv2_det +python:python3.7 +gpu_list:xx.xx.xx.xx,yy.yy.yy.yy;0,1 +use_gpu:True +AMP.use_amp:True|False +epoch:lite_train_lite_infer=2|whole_train_whole_infer=1000 +save_model_dir:./output/ +TRAIN.batch_size:lite_train_lite_infer=1280|whole_train_whole_infer=1280 +pretrained_model:null +checkpoints:null +use_custom_relu:False|True +model_type:cls|cls_distill|cls_distill_multiopt +MODEL.siamese:False|True +norm_train:train.py -c mv3_large_x0_5.yml -o quant_train=True +quant_train:False +prune_train:False diff --git a/test_tipc/supplementary/test_tipc/train_infer_python_fleet.txt b/test_tipc/supplementary/test_tipc/train_infer_python_fleet.txt index b095f02cf4..00b9e8234b 100644 --- a/test_tipc/supplementary/test_tipc/train_infer_python_fleet.txt +++ b/test_tipc/supplementary/test_tipc/train_infer_python_fleet.txt @@ -12,7 +12,6 @@ checkpoints:null use_custom_relu:False|True model_type:cls|cls_distill|cls_distill_multiopt MODEL.siamese:False|True -norm_train:train.py -c mv3_large_x0_5.yml -o +norm_train: train.py -c mv3_large_x0_5.yml -o quant_train:False prune_train:False -