diff --git "a/train_job_output.txt" "b/train_job_output.txt" --- "a/train_job_output.txt" +++ "b/train_job_output.txt" @@ -1,4 +1,4 @@ -slurm submission log: 2024-05-09 15:03:33.199064 +slurm submission log: 2024-05-10 08:21:53.371659 created following sbatch script: ############################### @@ -8,22 +8,22 @@ created following sbatch script: #SBATCH --account=nlp #SBATCH --cpus-per-task=16 #SBATCH --gres=gpu:2 -#SBATCH --job-name=tthrush-job-3941143 +#SBATCH --job-name=tthrush-job-2404107 #SBATCH --mem=400G -#SBATCH --nodelist=sphinx2 +#SBATCH --nodelist=sphinx1 #SBATCH --open-mode=append -#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_2/pythia-70m_sciq/train_job_output.txt +#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_sciq/train_job_output.txt #SBATCH --partition=sphinx #SBATCH --time=14-0 # activate your desired anaconda environment -. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection +. /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection # cd to working directory cd . # launch commands -srun --unbuffered run_as_child_processes 'torchrun --master_port 29502 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_2/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_2/pythia-70m_sciq --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --num_train_epochs 14 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2' +srun --unbuffered run_as_child_processes 'torchrun --master_port 29502 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_3/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_sciq --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --num_train_epochs 1 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2' ############################### @@ -33,485 +33,105 @@ submission to slurm complete! ############################### slurm submission output -Submitted batch job 7592321 +Submitted batch job 7593609 ############################### -/var/lib/slurm/slurmd/job7592321/slurm_script: line 15: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh: No such file or directory +slurm submission log: 2024-05-10 08:23:19.470596 +created following sbatch script: -CommandNotFoundError: Your shell has not been properly configured to use 'conda activate'. -To initialize your shell, run +############################### - $ conda init +#!/bin/bash -Currently supported shells are: - - bash - - fish - - tcsh - - xonsh - - zsh - - powershell +#SBATCH --account=nlp +#SBATCH --cpus-per-task=16 +#SBATCH --gres=gpu:2 +#SBATCH --job-name=tthrush-job-3619453 +#SBATCH --mem=400G +#SBATCH --nodelist=sphinx2 +#SBATCH --open-mode=append +#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_sciq/train_job_output.txt +#SBATCH --partition=sphinx +#SBATCH --time=14-0 -See 'conda init --help' for more information and options. +# activate your desired anaconda environment +. /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection -IMPORTANT: You may need to close and restart your shell after running 'conda init'. +# cd to working directory +cd . +# launch commands +srun --unbuffered run_as_child_processes 'torchrun --master_port 29502 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_3/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_sciq --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --num_train_epochs 1 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2' + +############################### + +submission to slurm complete! + + +############################### +slurm submission output + +Submitted batch job 7593622 + + + +############################### ############################### -start time: 2024-05-09 16:40:46.819377 +start time: 2024-05-10 15:02:42.471821 machine: sphinx2 conda env: pretraining-coreset-selection ############################### running following processes - torchrun --master_port 29502 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_2/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_2/pythia-70m_sciq --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --num_train_epochs 14 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 + torchrun --master_port 29502 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_3/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_3/pythia-70m_sciq --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --num_train_epochs 1 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 ############################### command outputs: -[2024-05-09 16:40:54,330] torch.distributed.run: [WARNING] -[2024-05-09 16:40:54,330] torch.distributed.run: [WARNING] ***************************************** -[2024-05-09 16:40:54,330] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -[2024-05-09 16:40:54,330] torch.distributed.run: [WARNING] ***************************************** -05/09/2024 16:40:58 - INFO - __main__ - Script parameters ScriptArguments(dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_2/sciq', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_2/pythia-70m_sciq', output_hub_id='pythia-70m_sciq', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) -05/09/2024 16:40:58 - INFO - __main__ - Script parameters ScriptArguments(dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_2/sciq', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_2/pythia-70m_sciq', output_hub_id='pythia-70m_sciq', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) - 0%| | 0/10682 [00:00