nanotron
/

old_bench

Model card Files Files and versions Community

old_bench / llama-1B /64_GPUS /dp-16_tp-1_pp-4_mbz-1 /bench.slurm

3outeille HF staff

Upload llama-1B/64_GPUS/dp-16_tp-1_pp-4_mbz-1

4b7b948 verified 5 months ago

raw

history blame

4.8 kB

	#!/bin/bash

	#SBATCH --job-name=bench_cluster
	#SBATCH --time=02:00:00
	#SBATCH --partition=hopper-prod
	#SBATCH --nodes=8
	#SBATCH --gres=gpu:8
	#SBATCH --qos=prod
	#SBATCH --ntasks-per-node=1
	#SBATCH --cpus-per-task=96
	#SBATCH --exclusive
	#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-16_tp-1_pp-4_mbz-1/log.out
	#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-16_tp-1_pp-4_mbz-1/log.out

	#######################################
	# Record "running" in a status file once Slurm reports the job as RUNNING.
	# Polls `squeue` every 10 seconds and stops as soon as the job is RUNNING
	# or is no longer listed (empty squeue output).
	# Only the RUNNING state is ever written; this function never writes a
	# "pending" value.
	# Arguments:
	#   $1 - Slurm job id to poll
	#   $2 - path of the status file to write
	# Outputs:
	#   Prints "Job status: <state>" to stdout on every poll and writes
	#   "running" (no trailing newline) to the status file.
	#######################################
	update_status() {
	  # Locals so the caller's variables (e.g. a global job_id) are not clobbered.
	  local job_id=$1
	  local status_file=$2
	  local job_status
	  while true; do
	    job_status=$(squeue --job "$job_id" --noheader --format=%T)
	    echo "Job status: $job_status"
	    if [ -z "$job_status" ]; then
	      # Job has finished or is not found
	      break
	    elif [ "$job_status" = "RUNNING" ]; then
	      # Quoted so a path containing spaces is not an "ambiguous redirect".
	      printf "running" > "$status_file"
	      break
	    fi
	    sleep 10
	  done
	}

	# Environment setup: print a start banner, activate the conda environment,
	# and log which python3 is in use.
	printf '%s\n' "========================"
	printf '%s\n' "START TIME: $(date)"
	. /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh
	conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster
	# The substitution is left unquoted, as in the original, so the printed line
	# stays identical even when python3 produces no output.
	echo python3 version = $(python3 --version)
	printf '%s\n' "========================"

	# Slurm stuff
	# Host list of the allocation as expanded by `scontrol show hostnames`.
	# Assignment and export are split so a scontrol failure is not masked by
	# the exit status of `export`.
	HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
	export HOSTNAMES
	# Rendezvous host: the first hostname of the allocation. The pipe must be a
	# real pipe; an escaped `\|` would hand "|", "head", "-n", "1" to scontrol
	# as extra arguments.
	MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
	export MASTER_ADDR
	# Random unprivileged port. Bash's RANDOM is at most 32767, so the result
	# lies in 1024..33791.
	export MASTER_PORT=$((1024 + RANDOM % 64511))

	export TMPDIR=/scratch
	export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache"
	export CUBLAS_WORKSPACE_CONFIG=":4096:8"
	export CUDA_DEVICE_MAX_CONNECTIONS="1"

	# Log in to the Hugging Face Hub; presumably so the upload at the end of the
	# script is authorized - confirm.
	# The token is quoted so it is always passed as exactly one argument.
	# NOTE(review): a token passed on the command line is visible in `ps`
	# output; consider supplying it through the environment instead.
	huggingface-cli login --token "$HUGGINGFACE_TOKEN"


	# Nanotron checkout, plus the training entry point and config for this run.
	NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
	CMD="${NANOTRON_REPO}/run_train.py"
	CMD+=" --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-16_tp-1_pp-4_mbz-1/config.yaml"

	# torchrun command line, assembled one option per statement. The string is
	# expanded unquoted by the srun call, so only the resulting word list
	# matters, not the whitespace between options.
	# NOTE(review): ${SLURM_PROCID} is expanded here, when LAUNCHER is assigned,
	# not inside each srun task - confirm that is intended.
	LAUNCHER="torchrun"
	LAUNCHER+=" --nproc_per_node 8"
	LAUNCHER+=" --nnodes 8"
	LAUNCHER+=" --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT}"
	LAUNCHER+=" --rdzv_backend c10d"
	LAUNCHER+=" --max_restarts 0"
	LAUNCHER+=" --tee 3"
	LAUNCHER+=" --node_rank ${SLURM_PROCID}"

	# Checkout the bench_cluster branch
	# The checkout only runs when the cd succeeded, so a missing or empty
	# NANOTRON_REPO can never cause `git checkout` to run in whatever directory
	# the job happened to start in. The job still continues either way
	# (best effort, as before).
	if [ -n "$NANOTRON_REPO" ] && cd "$NANOTRON_REPO"; then
	  git checkout bench_cluster
	  cd ..
	else
	  echo "WARNING: cannot cd to NANOTRON_REPO='$NANOTRON_REPO'; skipping git checkout" >&2
	fi
	# Get the current job ID
	job_id=${SLURM_JOB_ID}

	# Start the background poller that writes "running" to the status file once
	# squeue reports the job as RUNNING. The job id is quoted so that an empty
	# value cannot shift the status path into the first argument.
	update_status "$job_id" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-16_tp-1_pp-4_mbz-1/status.txt &

	# Run the main command
	# LAUNCHER and CMD are flat strings; they are left unquoted on purpose so
	# the shell splits them into separate arguments for srun.
	# shellcheck disable=SC2086
	srun -u $LAUNCHER $CMD
	# Must be captured immediately: any intervening command would overwrite $?.
	exit_status=$?

	#######################################
	# Map the log of a failed run to a status keyword.
	# Arguments:
	#   $1 - path of the job log to inspect
	# Outputs:
	#   Prints "oom", "timeout" or "fail" (no trailing newline) to stdout.
	#######################################
	classify_failure() {
	  local log_file=$1
	  if grep -q "OutOfMemoryError" "$log_file"; then
	    printf "oom"
	  elif grep -q " CUDA error: an illegal memory access" "$log_file"; then
	    # Illegal memory accesses are reported as "oom" as well, unchanged from
	    # the previous inline logic.
	    printf "oom"
	  elif grep -q "Timeout at NCCL" "$log_file"; then
	    printf "timeout"
	  else
	    printf "fail"
	  fi
	}

	# Update status based on the exit status of `srun`
	bench_run_dir=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-16_tp-1_pp-4_mbz-1
	if [ "$exit_status" -eq 0 ]; then
	  printf "completed" > "$bench_run_dir/status.txt"
	else
	  # Non-zero exit: derive the status from the error text in the job log.
	  classify_failure "$bench_run_dir/log.out" > "$bench_run_dir/status.txt"
	fi

	# On a successful run, generate both reports: one pass with --is_logs and
	# one with --is_profiler, in that order.
	if [ $exit_status -eq 0 ]; then
	  for report_flag in --is_logs --is_profiler; do
	    python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-16_tp-1_pp-4_mbz-1 "$report_flag"
	  done
	fi


	# Upload the run folder to the Hub with huggingface-cli and report whether
	# the upload command succeeded.
	if huggingface-cli upload nanotron/bench_cluster \
	  /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-16_tp-1_pp-4_mbz-1 \
	  llama-1B/64_GPUS/dp-16_tp-1_pp-4_mbz-1 \
	  --commit-message "Upload llama-1B/64_GPUS/dp-16_tp-1_pp-4_mbz-1"; then
	  echo "Uploading to Huggingface Hub successful"
	else
	  echo "Failed to upload to Huggingface Hub"
	fi