Upload llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16
Browse files
llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/bench.slurm
CHANGED
@@ -1,16 +1,16 @@
|
|
1 |
#!/bin/bash
|
2 |
|
3 |
#SBATCH --job-name=bench_cluster
|
4 |
-
#SBATCH --time=
|
5 |
#SBATCH --partition=hopper-prod
|
6 |
#SBATCH --nodes=8
|
7 |
#SBATCH --gres=gpu:8
|
8 |
-
#SBATCH --qos=
|
9 |
#SBATCH --ntasks-per-node=1
|
10 |
#SBATCH --cpus-per-task=96
|
11 |
#SBATCH --exclusive
|
12 |
-
#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
13 |
-
#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
14 |
|
15 |
# Function to update status based on squeue output
|
16 |
update_status() {
|
@@ -53,7 +53,7 @@ huggingface-cli login --token $HUGGINGFACE_TOKEN
|
|
53 |
|
54 |
|
55 |
NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
|
56 |
-
CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
57 |
|
58 |
LAUNCHER="torchrun \
|
59 |
--nproc_per_node 8 \
|
@@ -72,7 +72,7 @@ cd ..
|
|
72 |
job_id=${SLURM_JOB_ID}
|
73 |
|
74 |
# Update status to "pending" or "running" in the background
|
75 |
-
update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
76 |
|
77 |
# Run the main command
|
78 |
srun -u $LAUNCHER $CMD
|
@@ -80,28 +80,28 @@ exit_status=$?
|
|
80 |
|
81 |
# Update status based on the exit status of `srun`
|
82 |
if [ $exit_status -eq 0 ]; then
|
83 |
-
printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
84 |
else
|
85 |
-
if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
86 |
-
printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
87 |
-
elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
88 |
-
printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
89 |
-
elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
90 |
-
printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
91 |
else
|
92 |
-
printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
93 |
fi
|
94 |
fi
|
95 |
|
96 |
# Run the report script if the job completed successfully
|
97 |
if [ $exit_status -eq 0 ]; then
|
98 |
-
python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
99 |
-
python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
100 |
fi
|
101 |
|
102 |
|
103 |
# Push to hub the folder using huggingface_cli
|
104 |
-
huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
105 |
|
106 |
# Verify the upload
|
107 |
if [ $? -eq 0 ]; then
|
|
|
1 |
#!/bin/bash
|
2 |
|
3 |
#SBATCH --job-name=bench_cluster
|
4 |
+
#SBATCH --time=02:00:00
|
5 |
#SBATCH --partition=hopper-prod
|
6 |
#SBATCH --nodes=8
|
7 |
#SBATCH --gres=gpu:8
|
8 |
+
#SBATCH --qos=prod
|
9 |
#SBATCH --ntasks-per-node=1
|
10 |
#SBATCH --cpus-per-task=96
|
11 |
#SBATCH --exclusive
|
12 |
+
#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/log.out
|
13 |
+
#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/log.out
|
14 |
|
15 |
# Function to update status based on squeue output
|
16 |
update_status() {
|
|
|
53 |
|
54 |
|
55 |
NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
|
56 |
+
CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/config.yaml"
|
57 |
|
58 |
LAUNCHER="torchrun \
|
59 |
--nproc_per_node 8 \
|
|
|
72 |
job_id=${SLURM_JOB_ID}
|
73 |
|
74 |
# Update status to "pending" or "running" in the background
|
75 |
+
update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/status.txt &
|
76 |
|
77 |
# Run the main command
|
78 |
srun -u $LAUNCHER $CMD
|
|
|
80 |
|
81 |
# Update status based on the exit status of `srun`
|
82 |
if [ $exit_status -eq 0 ]; then
|
83 |
+
printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/status.txt
|
84 |
else
|
85 |
+
if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/log.out; then
|
86 |
+
printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/status.txt
|
87 |
+
elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/log.out; then
|
88 |
+
printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/status.txt
|
89 |
+
elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/log.out; then
|
90 |
+
printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/status.txt
|
91 |
else
|
92 |
+
printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/status.txt
|
93 |
fi
|
94 |
fi
|
95 |
|
96 |
# Run the report script if the job completed successfully
|
97 |
if [ $exit_status -eq 0 ]; then
|
98 |
+
python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16 --is_logs
|
99 |
+
python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16 --is_profiler
|
100 |
fi
|
101 |
|
102 |
|
103 |
# Push to hub the folder using huggingface_cli
|
104 |
+
huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16 llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16"
|
105 |
|
106 |
# Verify the upload
|
107 |
if [ $? -eq 0 ]; then
|
llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/config.yaml
CHANGED
@@ -48,7 +48,7 @@ parallelism:
|
|
48 |
dp: 1
|
49 |
expert_parallel_size: 1
|
50 |
pp: 64
|
51 |
-
pp_engine:
|
52 |
tp: 1
|
53 |
tp_linear_async_communication: false
|
54 |
tp_mode: REDUCE_SCATTER
|
|
|
48 |
dp: 1
|
49 |
expert_parallel_size: 1
|
50 |
pp: 64
|
51 |
+
pp_engine: afab
|
52 |
tp: 1
|
53 |
tp_linear_async_communication: false
|
54 |
tp_mode: REDUCE_SCATTER
|
llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/log.out
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/status.txt
CHANGED
@@ -1 +1 @@
|
|
1 |
-
|
|
|
1 |
+
fail
|