3outeille HF staff committed on
Commit
5e7018e
1 Parent(s): 1d87e55

Upload llama-1B/64_GPUS/dp-16_tp-1_pp-4_mbz-4

Browse files
llama-1B/64_GPUS/dp-16_tp-1_pp-4_mbz-4/bench.slurm CHANGED
@@ -1,11 +1,11 @@
1
  #!/bin/bash
2
 
3
  #SBATCH --job-name=bench_cluster
4
- #SBATCH --time=01:30:00
5
  #SBATCH --partition=hopper-prod
6
  #SBATCH --nodes=8
7
  #SBATCH --gres=gpu:8
8
- #SBATCH --qos=high
9
  #SBATCH --ntasks-per-node=1
10
  #SBATCH --cpus-per-task=96
11
  #SBATCH --exclusive
 
1
  #!/bin/bash
2
 
3
  #SBATCH --job-name=bench_cluster
4
+ #SBATCH --time=02:00:00
5
  #SBATCH --partition=hopper-prod
6
  #SBATCH --nodes=8
7
  #SBATCH --gres=gpu:8
8
+ #SBATCH --qos=prod
9
  #SBATCH --ntasks-per-node=1
10
  #SBATCH --cpus-per-task=96
11
  #SBATCH --exclusive
llama-1B/64_GPUS/dp-16_tp-1_pp-4_mbz-4/config.yaml CHANGED
@@ -48,7 +48,7 @@ parallelism:
48
  dp: 16
49
  expert_parallel_size: 1
50
  pp: 4
51
- pp_engine: 1f1b
52
  tp: 1
53
  tp_linear_async_communication: false
54
  tp_mode: REDUCE_SCATTER
 
48
  dp: 16
49
  expert_parallel_size: 1
50
  pp: 4
51
+ pp_engine: afab
52
  tp: 1
53
  tp_linear_async_communication: false
54
  tp_mode: REDUCE_SCATTER
llama-1B/64_GPUS/dp-16_tp-1_pp-4_mbz-4/log.out CHANGED
The diff for this file is too large to render. See raw diff
 
llama-1B/64_GPUS/dp-16_tp-1_pp-4_mbz-4/status.txt CHANGED
@@ -1 +1 @@
1
- timeout
 
1
+ oom