3outeille HF staff committed on
Commit
f458b72
1 Parent(s): 374def9

Upload llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1

Browse files
llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/bench.slurm CHANGED
@@ -1,16 +1,16 @@
1
  #!/bin/bash
2
 
3
  #SBATCH --job-name=bench_cluster
4
- #SBATCH --time=00:59:00
5
  #SBATCH --partition=hopper-prod
6
  #SBATCH --nodes=2
7
  #SBATCH --gres=gpu:8
8
- #SBATCH --qos=high
9
  #SBATCH --ntasks-per-node=1
10
  #SBATCH --cpus-per-task=96
11
  #SBATCH --exclusive
12
- #SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log.out
13
- #SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log.out
14
 
15
  # Function to update status based on squeue output
16
  update_status() {
@@ -53,7 +53,7 @@ huggingface-cli login --token $HUGGINGFACE_TOKEN
53
 
54
 
55
  NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
56
- CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/config.yaml"
57
 
58
  LAUNCHER="torchrun \
59
  --nproc_per_node 8 \
@@ -72,7 +72,7 @@ cd ..
72
  job_id=${SLURM_JOB_ID}
73
 
74
  # Update status to "pending" or "running" in the background
75
- update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt &
76
 
77
  # Run the main command
78
  srun -u $LAUNCHER $CMD
@@ -80,28 +80,28 @@ exit_status=$?
80
 
81
  # Update status based on the exit status of `srun`
82
  if [ $exit_status -eq 0 ]; then
83
- printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt
84
  else
85
- if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log.out; then
86
- printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt
87
- elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log.out; then
88
- printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt
89
- elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log.out; then
90
- printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt
91
  else
92
- printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt
93
  fi
94
  fi
95
 
96
  # Run the report script if the job completed successfully
97
  if [ $exit_status -eq 0 ]; then
98
- python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1 --is_logs
99
- python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1 --is_profiler
100
  fi
101
 
102
 
103
  # Push to hub the folder using huggingface_cli
104
- huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1 llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1 --commit-message "Upload llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1"
105
 
106
  # Verify the upload
107
  if [ $? -eq 0 ]; then
 
1
  #!/bin/bash
2
 
3
  #SBATCH --job-name=bench_cluster
4
+ #SBATCH --time=01:30:00
5
  #SBATCH --partition=hopper-prod
6
  #SBATCH --nodes=2
7
  #SBATCH --gres=gpu:8
8
+ #SBATCH --qos=normal
9
  #SBATCH --ntasks-per-node=1
10
  #SBATCH --cpus-per-task=96
11
  #SBATCH --exclusive
12
+ #SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log.out
13
+ #SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log.out
14
 
15
  # Function to update status based on squeue output
16
  update_status() {
 
53
 
54
 
55
  NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
56
+ CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/config.yaml"
57
 
58
  LAUNCHER="torchrun \
59
  --nproc_per_node 8 \
 
72
  job_id=${SLURM_JOB_ID}
73
 
74
  # Update status to "pending" or "running" in the background
75
+ update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt &
76
 
77
  # Run the main command
78
  srun -u $LAUNCHER $CMD
 
80
 
81
  # Update status based on the exit status of `srun`
82
  if [ $exit_status -eq 0 ]; then
83
+ printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt
84
  else
85
+ if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log.out; then
86
+ printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt
87
+ elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log.out; then
88
+ printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt
89
+ elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log.out; then
90
+ printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt
91
  else
92
+ printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt
93
  fi
94
  fi
95
 
96
  # Run the report script if the job completed successfully
97
  if [ $exit_status -eq 0 ]; then
98
+ python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1 --is_logs
99
+ python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1 --is_profiler
100
  fi
101
 
102
 
103
  # Push to hub the folder using huggingface_cli
104
+ huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1 llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1 --commit-message "Upload llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1"
105
 
106
  # Verify the upload
107
  if [ $? -eq 0 ]; then
llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log.out CHANGED
The diff for this file is too large to render. See raw diff
 
llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/log_metrics.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
2
+ 1,4190000.0000000005,83600.0,50200.0,3140.0,1020.0,11.1,0.0001,28.5,28.5,24.9,3168.14,4459.01,13244.0
llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-1/status.txt CHANGED
@@ -1 +1 @@
1
- fail
 
1
+ completed