nanotron
/

old_bench

Model card Files and versions Community

3outeille HF staff committed on Jul 3, 2024

Commit

f458b72

verified ·

1 Parent(s): 374def9

Upload llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1

Browse files

Files changed (4) hide show

llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/bench.slurm +17 -17
llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log.out +0 -0
llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log_metrics.csv +2 -0
llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt +1 -1

llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/bench.slurm CHANGED Viewed

@@ -1,16 +1,16 @@
 #!/bin/bash
 #SBATCH --job-name=bench_cluster
-#SBATCH --time=00:59:00
 #SBATCH --partition=hopper-prod
 #SBATCH --nodes=2
 #SBATCH --gres=gpu:8
-#SBATCH --qos=high
 #SBATCH --ntasks-per-node=1
 #SBATCH --cpus-per-task=96
 #SBATCH --exclusive
-#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log.out
-#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log.out
 # Function to update status based on squeue output
 update_status() {
@@ -53,7 +53,7 @@ huggingface-cli login --token $HUGGINGFACE_TOKEN
 NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
-CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/config.yaml"
 LAUNCHER="torchrun \
    --nproc_per_node 8 \
@@ -72,7 +72,7 @@ cd ..
 job_id=${SLURM_JOB_ID}
 # Update status to "pending" or "running" in the background
-update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt &
 # Run the main command
 srun -u $LAUNCHER $CMD
@@ -80,28 +80,28 @@ exit_status=$?
 # Update status based on the exit status of `srun`
 if [ $exit_status -eq 0 ]; then
-    printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt
 else
-    if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log.out; then
-        printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt
-    elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log.out; then
-        printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt
-    elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log.out; then
-        printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt
     else
-        printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt
     fi
 fi
 # Run the report script if the job completed successfully
 if [ $exit_status -eq 0 ]; then
-    python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1 --is_logs
-    python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1 --is_profiler
 fi
 # Push to hub the folder using huggingface_cli
-huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1 llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1 --commit-message "Upload llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1"
 # Verify the upload
 if [ $? -eq 0 ]; then

 #!/bin/bash
 #SBATCH --job-name=bench_cluster
+#SBATCH --time=01:30:00
 #SBATCH --partition=hopper-prod
 #SBATCH --nodes=2
 #SBATCH --gres=gpu:8
+#SBATCH --qos=normal
 #SBATCH --ntasks-per-node=1
 #SBATCH --cpus-per-task=96
 #SBATCH --exclusive
+#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log.out
+#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log.out
 # Function to update status based on squeue output
 update_status() {
 NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
+CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/config.yaml"
 LAUNCHER="torchrun \
    --nproc_per_node 8 \
 job_id=${SLURM_JOB_ID}
 # Update status to "pending" or "running" in the background
+update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt &
 # Run the main command
 srun -u $LAUNCHER $CMD
 # Update status based on the exit status of `srun`
 if [ $exit_status -eq 0 ]; then
+    printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt
 else
+    if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log.out; then
+        printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt
+    elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log.out; then
+        printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt
+    elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log.out; then
+        printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt
     else
+        printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt
     fi
 fi
 # Run the report script if the job completed successfully
 if [ $exit_status -eq 0 ]; then
+    python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1 --is_logs
+    python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1 --is_profiler
 fi
 # Push to hub the folder using huggingface_cli
+huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1 llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1 --commit-message "Upload llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1"
 # Verify the upload
 if [ $? -eq 0 ]; then

llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log.out CHANGED Viewed

The diff for this file is too large to render. See raw diff

llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log_metrics.csv ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
2	+ 1,4190000.0000000005,83600.0,50200.0,3140.0,1020.0,11.1,0.0001,28.5,28.5,24.9,3168.14,4459.01,13244.0

llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt CHANGED Viewed

	@@ -1 +1 @@
1	- ~~fail~~


1	+ completed