add multi_node and amp train in script

commit 987517fd5d
parent 8bb9fb7e3d
@@ -14,8 +14,8 @@ Global:
  use_visualdl: False
  infer_img: doc/imgs_en/img_10.jpg
  save_res_path: ./output/det_db/predicts_db.txt

AMP:
  #amp related
  use_amp: True
  scale_loss: 1024.0
  use_dynamic_loss_scaling: True
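For context on how AMP keys like these are typically consumed, here is a minimal sketch (not the repository's own code) that loads a config such as the one above with PyYAML and builds a paddle.amp.GradScaler from it; the section and key names mirror the diff, the file path and everything else are illustrative.

# Illustrative sketch: turn the AMP keys from the YAML above into a GradScaler.
# Assumes the config is loaded with yaml.safe_load; hypothetical file path.
import yaml
import paddle

with open("det_mv3_db.yml") as f:       # hypothetical path to a config like the one above
    config = yaml.safe_load(f)

amp_cfg = config.get("AMP", {})          # the section shown in this hunk
use_amp = amp_cfg.get("use_amp", False)

scaler = None
if use_amp:
    scaler = paddle.amp.GradScaler(
        init_loss_scaling=amp_cfg.get("scale_loss", 1024.0),
        use_dynamic_loss_scaling=amp_cfg.get("use_dynamic_loss_scaling", True))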
@@ -1,9 +1,9 @@
===========================train_params===========================
model_name:ocr_det
python:python3.7
gpu_list:0|0,1
gpu_list:0|0,1|10.21.226.181,10.21.226.133;0,1
Global.use_gpu:True|True
Global.auto_cast:null
Global.auto_cast:fp32|amp
Global.epoch_num:lite_train_infer=1|whole_train_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_infer=2|whole_train_infer=4
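The extended gpu_list value packs three scenarios separated by "|": a single GPU (0), two local GPUs (0,1), and a two-node run where the host IPs come before ";" and the per-node GPU ids after it. As a rough illustration of how such an entry maps onto paddle.distributed.launch arguments, here is a hedged Python sketch; the helper name and parsing are mine, not taken from test.sh.

# Hypothetical helper: split one gpu_list scenario (e.g. "10.21.226.181,10.21.226.133;0,1")
# into the --ips / --gpus arguments that paddle.distributed.launch expects.
def launch_args(gpu_entry: str):
    if ";" in gpu_entry:                       # multi-machine: "ip1,ip2;gpu_ids"
        ips, gpus = gpu_entry.split(";", 1)
        return ["-m", "paddle.distributed.launch", f"--ips={ips}", f"--gpus={gpus}"]
    if "," in gpu_entry:                       # single machine, multiple GPUs: "0,1"
        return ["-m", "paddle.distributed.launch", f"--gpus={gpu_entry}"]
    return []                                  # single GPU / CPU: no launch module needed

print(launch_args("10.21.226.181,10.21.226.133;0,1"))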
@@ -253,6 +253,11 @@ else
    env=" "
fi
for autocast in ${autocast_list[*]}; do
    if [ ${autocast} = "amp" ]; then
        set_amp_config="Global.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True"
    else
        set_amp_config=" "
    fi
    for trainer in ${trainer_list[*]}; do
        flag_quant=False
        if [ ${trainer} = ${pact_key} ]; then
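set_amp_config is appended to the training command as extra dotted key=value overrides (Global.use_amp=True and friends). As a rough illustration of what such overrides do to a nested config, here is a hedged sketch; the merge helper is hypothetical and simpler than PaddleOCR's actual option parsing.

# Hypothetical sketch: apply "Section.key=value" overrides (like set_amp_config above)
# onto a nested config dict. The real override handling in the repo may differ.
def apply_overrides(config: dict, overrides: str) -> dict:
    for item in overrides.split():
        key, value = item.split("=", 1)
        section, name = key.split(".", 1)
        config.setdefault(section, {})[name] = value
    return config

cfg = {"Global": {"use_gpu": True}}
apply_overrides(cfg, "Global.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True")
print(cfg["Global"]["use_amp"])   # "True" (values stay strings in this toy sketch)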
@@ -279,7 +284,6 @@ else
            if [ ${run_train} = "null" ]; then
                continue
            fi

            set_autocast=$(func_set_params "${autocast_key}" "${autocast}")
            set_epoch=$(func_set_params "${epoch_key}" "${epoch_num}")
            set_pretrain=$(func_set_params "${pretrain_model_key}" "${pretrain_model_value}")
@@ -295,11 +299,11 @@ else

            set_save_model=$(func_set_params "${save_model_key}" "${save_log}")
            if [ ${#gpu} -le 2 ];then  # train with cpu or single gpu
                cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} "
                cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config} "
            elif [ ${#gpu} -le 15 ];then  # train with multi-gpu
                cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1}"
                cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}"
            else  # train with multi-machine
                cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}"
                cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}"
            fi
            # run train
            eval "unset CUDA_VISIBLE_DEVICES"
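The multi-machine branch relies on paddle.distributed.launch starting one worker per GPU on every node listed in --ips; inside the training script each worker then joins the process group. Below is a minimal, hedged sketch of that script-side setup (generic Paddle data-parallel boilerplate, not the exact wiring in this repository's train.py).

# Minimal data-parallel setup that paddle.distributed.launch expects on each worker.
# Generic Paddle boilerplate for illustration only.
import paddle
import paddle.distributed as dist

def setup_model(model):
    if dist.get_world_size() > 1:           # more than one worker was launched
        dist.init_parallel_env()             # join the process group set up by the launcher
        model = paddle.DataParallel(model)   # wrap the model for gradient all-reduce
    return model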
@@ -103,16 +103,16 @@ def main(config, device, logger, vdl_writer):
        logger.info('valid dataloader has {} iters'.format(
            len(valid_dataloader)))

    use_amp = True if "AMP" in config else False
    use_amp = config["Global"].get("use_amp", False)
    if use_amp:
        AMP_RELATED_FLAGS_SETTING = {
            'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
            'FLAGS_max_inplace_grad_add': 8,
        }
        paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)
        scale_loss = config["AMP"].get("scale_loss", 1.0)
        use_dynamic_loss_scaling = config["AMP"].get("use_dynamic_loss_scaling",
                                                     False)
        scale_loss = config["Global"].get("scale_loss", 1.0)
        use_dynamic_loss_scaling = config["Global"].get(
            "use_dynamic_loss_scaling", False)
        scaler = paddle.amp.GradScaler(
            init_loss_scaling=scale_loss,
            use_dynamic_loss_scaling=use_dynamic_loss_scaling)
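The GradScaler built here is what performs the loss scaling during training. For reference, a typical Paddle AMP training step looks roughly like the sketch below; this is generic paddle.amp usage, not the exact loop in this repository, and model, optimizer, images, labels and loss_fn are placeholders.

# Generic Paddle AMP step using a GradScaler like the one constructed above.
import paddle

def amp_train_step(model, optimizer, scaler, images, labels, loss_fn):
    with paddle.amp.auto_cast():          # run the forward pass in mixed precision
        preds = model(images)
        loss = loss_fn(preds, labels)
    scaled = scaler.scale(loss)           # scale the loss to avoid fp16 gradient underflow
    scaled.backward()
    scaler.minimize(optimizer, scaled)    # unscale gradients and apply the optimizer step
    optimizer.clear_grad()
    return loss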