#!/bin/bash
#################################################
##            TEMPLATE VERSION 1.01            ##
#################################################
## ALL SBATCH COMMANDS WILL START WITH #SBATCH ##
## DO NOT REMOVE THE # SYMBOL                  ##
#################################################
#SBATCH --nodes=1                   # How many nodes required? Usually 1
#SBATCH --cpus-per-task=10          # Number of CPUs to request for the job
#SBATCH --mem=128GB                 # How much memory does your job require?
#SBATCH --gres=gpu:1                # Do you require GPUs? If not, delete this line
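# If you need more than one GPU, the count can be raised here, e.g. --gres=gpu:2.
# Clusters that define typed GRES also accept a type, e.g. --gres=gpu:a100:1;
# "a100" is only an illustration, use whatever types this cluster actually exposes.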
#SBATCH --time=05-00:00:00          # How long to run the job for? Jobs exceeding this time will be terminated
                                    # Format <DD-HH:MM:SS> e.g. 5 days: 05-00:00:00
                                    # Format <DD-HH:MM:SS> e.g. 24 hours: 1-00:00:00 or 24:00:00
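# Tip: a realistic (shorter) time limit generally lets the backfill scheduler
# start the job sooner. The time remaining on a running job can usually be
# checked from a login node with, e.g.:
#   squeue -j <jobid> -o "%.10i %.20j %.12L"   # %L = time left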
#SBATCH --mail-type=BEGIN,END,FAIL  # When should you receive an email?
#SBATCH --output=%u.%j.out          # Where should the log files go?
                                    # You must provide an absolute path, e.g. /common/home/module/username/
                                    # If no path is provided, the output file will be placed in your current working directory
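# In the --output pattern above, %u expands to your username and %j to the
# numeric job ID, so each submission writes to its own log file.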
#SBATCH --requeue                   # Remove this line if you do not want the workload scheduler to requeue your job after preemption
#SBATCH --constraint=l40            # This tells the workload scheduler to provision L40 nodes for you
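# To see which feature tags (such as l40) and GPUs each node advertises, you
# can usually run the following from a login node:
#   sinfo -o "%N %G %f"   # node names, GRES, and feature tags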
################################################################
## EDIT AFTER THIS LINE IF YOU ARE OKAY WITH DEFAULT SETTINGS ##
################################################################
# ================ Account parameters ================
# Description                 | Value
# ----------------------------------------------------
# Account name                | tanahhweeresearch
# List of Assigned Partitions | researchlong researchshort tanahhweeresearch
# List of Assigned QOS        | research-1-qos tanahhweeresearch-priority
# ----------------------------------------------------
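# If the site's myinfo command (referenced below) is unavailable, the same
# account/partition/QOS assignments can often be listed directly with:
#   sacctmgr show assoc user=$USER format=account,partition,qos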
#SBATCH --partition=researchlong                     # The partition you've been assigned
#SBATCH --account=tanahhweeresearch                  # The account you've been assigned (normally student)
#SBATCH --qos=research-1-qos                         # What is the QOS assigned to you? Check with myinfo command
#SBATCH --mail-user=haotian.hu.2021@scis.smu.edu.sg  # Who should receive the email notifications
#SBATCH --job-name=1GPU_LLM_HT                       # Give the job a name
#################################################
##           END OF SBATCH COMMANDS            ##
#################################################
# Purge the environment, load the modules we require.
# Refer to https://violet.smu.edu.sg/origami/module/ for more information
module purge
module load Anaconda3/2022.05
module load CUDA/12.1.1
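# Optional sanity check: confirm the expected modules were loaded
# (use "module avail" to see which versions this cluster offers).
# module list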
# Do not remove this line even if you have executed conda init
eval "$(conda shell.bash hook)"
# Create a virtual environment (leave this commented out if you already have one)
# conda create -n llm_ht python=3.11
# The command below assumes that you've already created the environment previously
# We're using an absolute path here. You may use a relative path, as long as srun is executed from the same working directory
# conda activate tgi
conda activate llm_ht
# If you require any packages, install them before the srun job submission.
# conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
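# Optional check (assumes PyTorch is installed in the active environment):
# python -c "import torch; print(torch.cuda.is_available())"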
# Submit your job to the cluster
BASEDIR=$HOME/logical-reasoning/scripts
JOB=$1
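# Optional guard (illustrative, not required): stop early with a usage message
# if no job script name was passed to sbatch.
if [ -z "$JOB" ]; then
    echo "Usage: sbatch 1gpu_llm_ht.sh <job-script-under-$BASEDIR>" >&2
    exit 1
fi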
echo "Submitting job: $BASEDIR/$JOB" | |
srun --gres=gpu:1 $BASEDIR/$JOB | |
# Example submission: sbatch logical-reasoning/scripts/1gpu_llm_ht.sh tune-mgtv-qwen2_7b.sh
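# After submitting, the job can be monitored from a login node with standard
# Slurm commands, for example:
#   squeue -u $USER            # list your queued/running jobs
#   scontrol show job <jobid>  # inspect a specific job
#   scancel <jobid>            # cancel a job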