#!/bin/bash
#
# Slurm batch script for one nanotron benchmark run. It launches the training
# with torchrun through srun, records the run's status in status.txt,
# generates reports when the run succeeds, and uploads the results directory
# to the Hugging Face Hub.
#
# Environment read by this script: HUGGINGFACE_TOKEN, SLURM_JOB_NODELIST,
# SLURM_JOB_ID, SLURM_PROCID.
#
# NOTE(review): no #SBATCH directives are present in this copy of the file.
# If scheduler options were meant to live here, restore them directly below
# the shebang before submitting.
#######################################
# Poll Slurm until the given job is RUNNING, then record that in a status
# file. squeue is queried every 10 seconds. Polling stops without touching
# the status file as soon as squeue prints nothing for the job.
# Arguments:
#   $1 - Slurm job id to watch
#   $2 - path of the status file; overwritten with "running" (no newline)
# Outputs:
#   Writes "Job status: <state>" to stdout on every poll.
#######################################
update_status() {
  local job_id=$1
  local status_file=$2
  local job_status

  while true; do
    job_status=$(squeue --job "$job_id" --noheader --format=%T)
    echo "Job status: $job_status"

    if [[ -z "$job_status" ]]; then
      # Nothing reported for this job: stop polling.
      break
    elif [[ "$job_status" == "RUNNING" ]]; then
      printf 'running' > "$status_file"
      break
    fi

    sleep 10
  done
}
# ---- Environment setup ------------------------------------------------------
echo "========================"
echo "START TIME: $(date)"
source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh
conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster
echo "python3 version = $(python3 --version)"
echo "========================"

# Expand the allocation's node list once. Assigning separately from `export`
# keeps a scontrol failure visible instead of being masked by export's status.
if ! HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST"); then
  echo "warning: scontrol could not expand SLURM_JOB_NODELIST" >&2
fi
export HOSTNAMES
# The first listed host is used as the rendezvous address.
export MASTER_ADDR=${HOSTNAMES%%$'\n'*}
# RANDOM never exceeds 32767, so the modulo does not wrap and the port
# actually falls in 1024..33791.
export MASTER_PORT=$((1024 + RANDOM % 64511))

# Settings exported to the child processes started later in this script.
export TMPDIR=/scratch
export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache"
export CUBLAS_WORKSPACE_CONFIG=":4096:8"
export CUDA_DEVICE_MAX_CONNECTIONS="1"

# NOTE(review): a token passed as a command-line argument is visible in
# process listings on this node; confirm that is acceptable here.
huggingface-cli login --token "$HUGGINGFACE_TOKEN"
# ---- Build and launch the training command ----------------------------------
NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"

# Arrays keep each argument a single word when expanded on the srun line,
# instead of relying on word-splitting of a long string.
CMD=(
  "$NANOTRON_REPO/run_train.py"
  --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/config.yaml
)

# NOTE(review): SLURM_PROCID is expanded here, in the batch shell, so every
# task started by srun receives the same --node_rank value. Confirm that is
# intended.
LAUNCHER=(
  torchrun
  --nproc_per_node 8
  --nnodes 8
  --rdzv_endpoint "${MASTER_ADDR}:${MASTER_PORT}"
  --rdzv_backend c10d
  --max_restarts 0
  --tee 3
  --node_rank "${SLURM_PROCID}"
)

# Check out the benchmark branch, then continue from the repo's parent
# directory. git only runs if the cd succeeded, so a missing repo can never
# cause a checkout in whatever directory the job started in. The script does
# not abort here, so the failure is still recorded and uploaded further down.
if cd "$NANOTRON_REPO"; then
  git checkout bench_cluster \
    || echo "warning: git checkout bench_cluster failed" >&2
  cd ..
else
  echo "warning: cannot cd to $NANOTRON_REPO; skipping branch checkout" >&2
fi

job_id=${SLURM_JOB_ID}

# Background watcher that marks the run as "running" in status.txt.
update_status "$job_id" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/status.txt &

srun -u "${LAUNCHER[@]}" "${CMD[@]}"
# Captured immediately: the status and report steps below depend on it.
exit_status=$?
# ---- Record the final status of the run -------------------------------------
# Writes exactly one of: completed, oom, timeout, fail (no trailing newline).
run_dir="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16"
log_file="$run_dir/log.out"

# String comparison on purpose: an empty or unset exit_status must not be
# treated as success.
if [[ "$exit_status" == "0" ]]; then
  final_status="completed"
elif grep -q "OutOfMemoryError" "$log_file"; then
  final_status="oom"
elif grep -q " CUDA error: an illegal memory access" "$log_file"; then
  # Illegal memory accesses are recorded as "oom" as well.
  final_status="oom"
elif grep -q "Timeout at NCCL" "$log_file"; then
  final_status="timeout"
else
  # Also reached when the log file is missing, since every grep then fails.
  final_status="fail"
fi

printf '%s' "$final_status" > "$run_dir/status.txt"
# ---- Generate reports (successful runs only) --------------------------------
# One report pass over the logs, then one over the profiler output.
if [[ "$exit_status" == "0" ]]; then
  for report_flag in --is_logs --is_profiler; do
    python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report \
      --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16 \
      "$report_flag"
  done
fi
# ---- Upload the results directory to the Hugging Face Hub -------------------
# The same relative path names the run locally, in the repo, and in the
# commit message.
run_name="llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16"

# Test the upload command directly so no other command can slip in between
# it and the check of its exit status.
if huggingface-cli upload nanotron/bench_cluster \
  "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/$run_name" \
  "$run_name" \
  --commit-message "Upload $run_name"; then
  echo "Uploading to Huggingface Hub successful"
else
  echo "Failed to upload to Huggingface Hub"
fi