Upload llama-1B/64_GPUS/dp-16_tp-1_pp-4_mbz-4
llama-1B/64_GPUS/dp-16_tp-1_pp-4_mbz-4/bench.slurm
CHANGED
@@ -1,11 +1,11 @@
 #!/bin/bash
 
 #SBATCH --job-name=bench_cluster
-#SBATCH --time=
+#SBATCH --time=02:00:00
 #SBATCH --partition=hopper-prod
 #SBATCH --nodes=8
 #SBATCH --gres=gpu:8
-#SBATCH --qos=
+#SBATCH --qos=prod
 #SBATCH --ntasks-per-node=1
 #SBATCH --cpus-per-task=96
 #SBATCH --exclusive
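The two filled-in directives complete the header: a two-hour wall-clock limit and the prod QoS. With --nodes=8 and --gres=gpu:8 the job requests the 64 GPUs the directory name advertises. A minimal sketch of how one might confirm the granted allocation from inside the job; these echo lines are illustrative and not part of the uploaded script:

#!/bin/bash
# Hypothetical sanity check (not in the uploaded bench.slurm): print the
# allocation SLURM actually granted. Expect 8 nodes with 8 GPUs each.
echo "nodes allocated: ${SLURM_NNODES}"        # expect 8
echo "tasks launched:  ${SLURM_NTASKS}"        # --ntasks-per-node=1 -> 8
echo "GPUs per node:   ${SLURM_GPUS_ON_NODE}"  # expect 8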
llama-1B/64_GPUS/dp-16_tp-1_pp-4_mbz-4/config.yaml
CHANGED
@@ -48,7 +48,7 @@ parallelism:
   dp: 16
   expert_parallel_size: 1
   pp: 4
-  pp_engine:
+  pp_engine: afab
   tp: 1
   tp_linear_async_communication: false
   tp_mode: REDUCE_SCATTER
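Filling in pp_engine pins the pipeline schedule; afab is nanotron's all-forward-all-backward engine (run every microbatch's forward pass, then every backward pass). The parallelism grid also has to cover exactly the GPUs bench.slurm requests, which a quick shell check confirms; this sketch only restates values from the two diffs above:

# Sketch: verify dp x tp x pp from config.yaml matches the
# 8 nodes x 8 GPUs requested in bench.slurm.
dp=16; tp=1; pp=4
nodes=8; gpus_per_node=8
world_size=$((dp * tp * pp))   # 64 ranks, one per GPU
test "${world_size}" -eq $((nodes * gpus_per_node)) \
  && echo "parallelism grid covers all ${world_size} GPUs"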
llama-1B/64_GPUS/dp-16_tp-1_pp-4_mbz-4/log.out
CHANGED
The diff for this file is too large to render.
llama-1B/64_GPUS/dp-16_tp-1_pp-4_mbz-4/status.txt
CHANGED
@@ -1 +1 @@
|
|
1 |
-
|
|
|
1 |
+
oom
|
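status.txt going from empty to oom records that this configuration (dp=16, pp=4, micro-batch size 4, per the directory name) ran out of GPU memory. One way to locate the failure in the run log, assuming the usual PyTorch out-of-memory error text; the message pattern is an assumption, not taken from the truncated log.out diff:

# Hypothetical follow-up: find the out-of-memory trace in the run log.
grep -n -i "out of memory" llama-1B/64_GPUS/dp-16_tp-1_pp-4_mbz-4/log.out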