plaguss HF staff committed on
Commit
5bd471f
·
verified ·
1 Parent(s): e884913

Create sft.slurm

Browse files
Files changed (1) hide show
  1. sft.slurm +39 -0
sft.slurm ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
#SBATCH --job-name=apigen-fine-tune
#SBATCH --partition=hopper-prod
#SBATCH --qos=normal
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=8
#SBATCH --output=./logs/%x-%j.out
#SBATCH --error=./logs/%x-%j.err
#SBATCH --time=02-00:00:00

# Fine-tune meta-llama/Llama-3.2-1B-Instruct on the plaguss/apigen-synth-trl
# dataset with TRL's SFT example script, launched through accelerate with a
# DeepSpeed ZeRO-3 config across the 8 GPUs of a single node.
#
# NOTE(review): ./logs must exist before submission — SLURM opens the
# --output/--error files itself before the script body runs.

# Strict mode: abort on errors (-e), unset variables (-u) and failed pipeline
# stages (pipefail); -x traces every command into the job log for debugging.
set -euxo pipefail

module load cuda/12.1

# Assumes the job is submitted from the repo root where .venv lives — confirm.
source .venv/bin/activate

srun --nodes=1 --ntasks=1 --export=ALL,ACCELERATE_LOG_LEVEL=info \
    accelerate launch \
    --config_file examples/accelerate_configs/deepspeed_zero3.yaml \
    examples/scripts/sft.py \
    --run_name=Llama-3.2-1B-Instruct-APIGen-FC-v0.1 \
    --model_name_or_path="meta-llama/Llama-3.2-1B-Instruct" \
    --dataset_name="plaguss/apigen-synth-trl" \
    --report_to="wandb" \
    --learning_rate=5.0e-06 \
    --lr_scheduler_type="cosine" \
    --per_device_train_batch_size=6 \
    --per_device_eval_batch_size=6 \
    --do_eval \
    --eval_strategy="steps" \
    --gradient_accumulation_steps=2 \
    --output_dir="data/Llama-3.2-1B-Instruct-APIGen-FC-v0.1" \
    --logging_steps=5 \
    --eval_steps=50 \
    --num_train_epochs=2 \
    --max_steps=-1 \
    --warmup_steps=50 \
    --max_seq_length=2048 \
    --push_to_hub \
    --gradient_checkpointing \
    --bf16