#!/bin/bash
#SBATCH --job-name=pretrain_bart # create a short name for your job
#SBATCH --nodes=1 # node count
#SBATCH --ntasks-per-node=8 # number of tasks to run per node
#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks)
#SBATCH --gres=gpu:8 # number of gpus per node
#SBATCH -o %x-%j.log # output and error log file name (%x = job name, %j = job id)
#SBATCH -x dgx050 # exclude this node from the allocation

# Slurm batch script that launches pretrain_erlangshen.py for the
# erlangshen-bert-base model. It creates the model workspace directory,
# writes a DeepSpeed config JSON into it, assembles the command-line
# arguments and starts the training script.
#
# Expected working directory (all paths below are relative to it):
#   Fengshenbang-LM/fengshen/examples/pretrain_erlangshen
#
# NOTE(review): the header requests 8 tasks / 8 GPUs per node while
# GPUS_PER_NODE below is 1, and the job name says "bart" although this script
# trains erlangshen-bert — confirm these are intentional.

# Stop on the first failing step instead of launching training with a missing
# workspace directory or a half-written DeepSpeed config.
set -euo pipefail

ROOT_DIR=../../workspace
# The directory name spelling ("extendsions") is kept so the path stays the
# same as before.
export TORCH_EXTENSIONS_DIR="${ROOT_DIR}/torch_extendsions"

MODEL_NAME=erlangshen-bert-base
MODEL_ROOT_DIR="${ROOT_DIR}/${MODEL_NAME}"
# -p also creates ${ROOT_DIR} on a fresh checkout and is a no-op when the
# directory already exists.
mkdir -p -- "${MODEL_ROOT_DIR}"

NNODES=1
GPUS_PER_NODE=1
MICRO_BATCH_SIZE=32

# If you do not use DeepSpeed, everything from here to "End" can be removed.
### Begin
CONFIG_JSON="${MODEL_ROOT_DIR}/${MODEL_NAME}.ds_config.json"
ZERO_STAGE=1

# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size()
cat <<EOT > "${CONFIG_JSON}"
{
  "zero_optimization": {
    "stage": ${ZERO_STAGE}
  },
  "fp16": {
    "enabled": true
  },
  "gradient_clipping": 2,
  "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}
}
EOT

# Exported for the training process; presumably read by PyTorch Lightning's
# DeepSpeed strategy — verify against the installed Lightning version.
export PL_DEEPSPEED_CONFIG_PATH="${CONFIG_JSON}"
### End

# Argument groups are arrays so every option reaches the Python script as
# exactly one word, without relying on unquoted word splitting.
DATA_ARGS=(
  --dataloader_workers 2
  --train_batchsize "${MICRO_BATCH_SIZE}"
  --val_batchsize "${MICRO_BATCH_SIZE}"
  --test_batchsize "${MICRO_BATCH_SIZE}"
  --datasets_name IDEA-CCNL/PretrainCorpusDemo
)
# If you have your own data, prepare it in the same format as
# IDEA-CCNL/PretrainCorpusDemo and pass it in through these arguments:
# --train_file train.json
# --val_file val.json
# --test_file test.json

MODEL_ARGS=(
  --model_path "${MODEL_ROOT_DIR}/pretrain"
  --learning_rate 1e-4
  --weight_decay 1e-1
  --warmup_ratio 0.01
)

MODEL_CHECKPOINT_ARGS=(
  --save_last
  --save_ckpt_path "${MODEL_ROOT_DIR}/ckpt"
  --load_ckpt_path "${MODEL_ROOT_DIR}/ckpt/last.ckpt"
)

TRAINER_ARGS=(
  --max_epoch 1
  --gpus "${GPUS_PER_NODE}"
  --num_nodes "${NNODES}"
  --strategy "deepspeed_stage_${ZERO_STAGE}"
  --log_every_n_steps 1
  --precision 16
  --default_root_dir "${MODEL_ROOT_DIR}"
  --replace_sampler_ddp False
)

LAUNCH_ARGS=(
  "${DATA_ARGS[@]}"
  "${MODEL_ARGS[@]}"
  "${MODEL_CHECKPOINT_ARGS[@]}"
  "${TRAINER_ARGS[@]}"
)

# The original script exported the flattened argument string as "options";
# keep exporting it in case anything downstream reads it from the environment.
export options="${LAUNCH_ARGS[*]}"

python3 pretrain_erlangshen.py "${LAUNCH_ARGS[@]}"