slurm submission log: 2024-05-09 22:15:12.576191
created following sbatch script:

###############################

#!/bin/bash

#SBATCH --account=nlp
#SBATCH --cpus-per-task=16
#SBATCH --dependency=afterok:7593077
#SBATCH --gres=gpu:2
#SBATCH --job-name=tthrush-job-1473981
#SBATCH --mem=400G
#SBATCH --nodelist=sphinx2
#SBATCH --open-mode=append
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_lambada/train_job_output.txt
#SBATCH --partition=sphinx
#SBATCH --time=14-0

# activate your desired anaconda environment
. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection

# cd to working directory
cd .

# launch commands
srun --unbuffered run_as_child_processes 'torchrun --master_port 29504 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_3/lambada --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_lambada --output_hub_id pythia-70m_lambada --model_id EleutherAI/pythia-70m --num_train_epochs 1 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2'

###############################

submission to slurm complete!

###############################
slurm submission output

Submitted batch job 7593078

###############################

slurm submission log: 2024-05-09 23:03:15.492375
created following sbatch script:

###############################

#!/bin/bash

#SBATCH --account=nlp
#SBATCH --cpus-per-task=16
#SBATCH --dependency=afterok:7593155
#SBATCH --gres=gpu:2
#SBATCH --job-name=tthrush-job-2751890
#SBATCH --mem=400G
#SBATCH --nodelist=sphinx2
#SBATCH --open-mode=append
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_lambada/train_job_output.txt
#SBATCH --partition=sphinx
#SBATCH --time=14-0

# activate your desired anaconda environment
. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection

# cd to working directory
cd .

# launch commands
srun --unbuffered run_as_child_processes 'torchrun --master_port 29504 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_3/lambada --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_lambada --output_hub_id pythia-70m_lambada --model_id EleutherAI/pythia-70m --num_train_epochs 1 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2'

###############################

submission to slurm complete!

###############################
slurm submission output

Submitted batch job 7593156

###############################
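
Note: each entry above submits its job with --dependency=afterok:<previous job id>, so the run recorded second starts only after the one recorded first completes successfully. A minimal sketch of how such a chain could be scripted is below; it is not part of the log, and the file names (job_*.sbatch) and the PREV_JOB variable are hypothetical. Only sbatch, its --dependency=afterok flag, and the "Submitted batch job <id>" output line are taken from the entries above; --parsable is a standard sbatch flag that prints just the job id.

#!/bin/bash
# Hypothetical chaining helper (not from the log above): submit each script so it
# starts only after the previous job in the chain finishes with exit code 0,
# mirroring the --dependency=afterok:<jobid> lines recorded in the submissions.
set -euo pipefail

PREV_JOB=""                      # job id the next submission should depend on
for script in job_*.sbatch; do   # hypothetical script names
    if [ -n "$PREV_JOB" ]; then
        # --parsable prints only the job id instead of "Submitted batch job <id>"
        PREV_JOB=$(sbatch --parsable --dependency="afterok:${PREV_JOB}" "$script")
    else
        PREV_JOB=$(sbatch --parsable "$script")
    fi
    echo "submitted $script as job $PREV_JOB"
done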