Upload llama-1B/64_GPUS/dp-1_tp-8_pp-8_mbz-16
Browse files
llama-1B/64_GPUS/dp-1_tp-8_pp-8_mbz-16/bench.slurm
CHANGED
@@ -9,8 +9,8 @@
|
|
9 |
#SBATCH --ntasks-per-node=1
|
10 |
#SBATCH --cpus-per-task=96
|
11 |
#SBATCH --exclusive
|
12 |
-
#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
13 |
-
#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
14 |
|
15 |
# Function to update status based on squeue output
|
16 |
update_status() {
|
@@ -53,7 +53,7 @@ huggingface-cli login --token $HUGGINGFACE_TOKEN
|
|
53 |
|
54 |
|
55 |
NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
|
56 |
-
CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
57 |
|
58 |
LAUNCHER="torchrun \
|
59 |
--nproc_per_node 8 \
|
@@ -72,7 +72,7 @@ cd ..
|
|
72 |
job_id=${SLURM_JOB_ID}
|
73 |
|
74 |
# Update status to "pending" or "running" in the background
|
75 |
-
update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
76 |
|
77 |
# Run the main command
|
78 |
srun -u $LAUNCHER $CMD
|
@@ -80,28 +80,28 @@ exit_status=$?
|
|
80 |
|
81 |
# Update status based on the exit status of `srun`
|
82 |
if [ $exit_status -eq 0 ]; then
|
83 |
-
printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
84 |
else
|
85 |
-
if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
86 |
-
printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
87 |
-
elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
88 |
-
printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
89 |
-
elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
90 |
-
printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
91 |
else
|
92 |
-
printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
93 |
fi
|
94 |
fi
|
95 |
|
96 |
# Run the report script if the job completed successfully
|
97 |
if [ $exit_status -eq 0 ]; then
|
98 |
-
python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
99 |
-
python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
100 |
fi
|
101 |
|
102 |
|
103 |
# Push the folder to the Hub using huggingface-cli
|
104 |
-
huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
105 |
|
106 |
# Verify the upload
|
107 |
if [ $? -eq 0 ]; then
|
|
|
9 |
#SBATCH --ntasks-per-node=1
|
10 |
#SBATCH --cpus-per-task=96
|
11 |
#SBATCH --exclusive
|
12 |
+
#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-8_pp-8_mbz-16/log.out
|
13 |
+
#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-8_pp-8_mbz-16/log.out
|
14 |
|
15 |
# Function to update status based on squeue output
|
16 |
update_status() {
|
|
|
53 |
|
54 |
|
55 |
NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
|
56 |
+
CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-8_pp-8_mbz-16/config.yaml"
|
57 |
|
58 |
LAUNCHER="torchrun \
|
59 |
--nproc_per_node 8 \
|
|
|
72 |
job_id=${SLURM_JOB_ID}
|
73 |
|
74 |
# Update status to "pending" or "running" in the background
|
75 |
+
update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-8_pp-8_mbz-16/status.txt &
|
76 |
|
77 |
# Run the main command
|
78 |
srun -u $LAUNCHER $CMD
|
|
|
80 |
|
81 |
# Update status based on the exit status of `srun`
|
82 |
if [ $exit_status -eq 0 ]; then
|
83 |
+
printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-8_pp-8_mbz-16/status.txt
|
84 |
else
|
85 |
+
if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-8_pp-8_mbz-16/log.out; then
|
86 |
+
printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-8_pp-8_mbz-16/status.txt
|
87 |
+
elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-8_pp-8_mbz-16/log.out; then
|
88 |
+
printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-8_pp-8_mbz-16/status.txt
|
89 |
+
elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-8_pp-8_mbz-16/log.out; then
|
90 |
+
printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-8_pp-8_mbz-16/status.txt
|
91 |
else
|
92 |
+
printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-8_pp-8_mbz-16/status.txt
|
93 |
fi
|
94 |
fi
|
95 |
|
96 |
# Run the report script if the job completed successfully
|
97 |
if [ $exit_status -eq 0 ]; then
|
98 |
+
python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-8_pp-8_mbz-16 --is_logs
|
99 |
+
python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-8_pp-8_mbz-16 --is_profiler
|
100 |
fi
|
101 |
|
102 |
|
103 |
# Push the folder to the Hub using huggingface-cli
|
104 |
+
huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-8_pp-8_mbz-16 llama-1B/64_GPUS/dp-1_tp-8_pp-8_mbz-16 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-8_pp-8_mbz-16"
|
105 |
|
106 |
# Verify the upload
|
107 |
if [ $? -eq 0 ]; then
|
llama-1B/64_GPUS/dp-1_tp-8_pp-8_mbz-16/log.out
CHANGED
The diff for this file is too large to render.
See raw diff
|
|