slurm submission log: 2024-05-26 22:30:16.581914 created following sbatch script: ############################### #!/bin/bash #SBATCH --account=nlp #SBATCH --cpus-per-task=16 #SBATCH --dependency=afterok:7653570 #SBATCH --gres=gpu:2 #SBATCH --job-name=tthrush-job-3137501 #SBATCH --mem=100G #SBATCH --nodelist=sphinx2 #SBATCH --open-mode=append #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/llms/pythia-70m_sciq_1/train_job_output.txt #SBATCH --partition=sphinx #SBATCH --time=14-0 # activate your desired anaconda environment . /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection # cd to working directory cd . # launch commands srun --unbuffered run_as_child_processes 'torchrun --master_port 29509 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14' ############################### submission to slurm complete! ############################### slurm submission output Submitted batch job 7653571 ############################### slurm submission log: 2024-05-26 22:32:57.495347 created following sbatch script: ############################### #!/bin/bash #SBATCH --account=nlp #SBATCH --cpus-per-task=16 #SBATCH --dependency=afterok:7653600 #SBATCH --gres=gpu:2 #SBATCH --job-name=tthrush-job-3075134 #SBATCH --mem=100G #SBATCH --nodelist=sphinx2 #SBATCH --open-mode=append #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/llms/pythia-70m_sciq_1/train_job_output.txt #SBATCH --partition=sphinx #SBATCH --time=14-0 # activate your desired anaconda environment . /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection # cd to working directory cd . # launch commands srun --unbuffered run_as_child_processes 'torchrun --master_port 29509 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14' ############################### submission to slurm complete! ############################### slurm submission output Submitted batch job 7653601 ############################### slurm submission log: 2024-05-26 22:58:09.787171 created following sbatch script: ############################### #!/bin/bash #SBATCH --account=nlp #SBATCH --cpus-per-task=16 #SBATCH --dependency=afterok:7653655 #SBATCH --gres=gpu:2 #SBATCH --job-name=tthrush-job-3775598 #SBATCH --mem=100G #SBATCH --nodelist=sphinx2 #SBATCH --open-mode=append #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/llms/pythia-70m_sciq_1/train_job_output.txt #SBATCH --partition=sphinx #SBATCH --time=14-0 # activate your desired anaconda environment . /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection # cd to working directory cd . # launch commands srun --unbuffered run_as_child_processes 'torchrun --master_port 29509 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14' ############################### submission to slurm complete! ############################### slurm submission output Submitted batch job 7653656 ############################### slurm submission log: 2024-05-26 23:16:43.398883 created following sbatch script: ############################### #!/bin/bash #SBATCH --account=nlp #SBATCH --cpus-per-task=16 #SBATCH --dependency=afterok:7653712 #SBATCH --gres=gpu:2 #SBATCH --job-name=tthrush-job-3360635 #SBATCH --mem=100G #SBATCH --nodelist=sphinx2 #SBATCH --open-mode=append #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/llms/pythia-70m_sciq_1/train_job_output.txt #SBATCH --partition=sphinx #SBATCH --time=14-0 # activate your desired anaconda environment . /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection # cd to working directory cd . # launch commands srun --unbuffered run_as_child_processes 'torchrun --master_port 29509 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14' ############################### submission to slurm complete! ############################### slurm submission output Submitted batch job 7653713 ############################### ############################### start time: 2024-05-27 10:05:46.837699 machine: sphinx2 conda env: pretraining-coreset-selection ############################### running following processes torchrun --master_port 29509 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14 ############################### command outputs: [2024-05-27 10:05:53,482] torch.distributed.run: [WARNING] [2024-05-27 10:05:53,482] torch.distributed.run: [WARNING] ***************************************** [2024-05-27 10:05:53,482] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. [2024-05-27 10:05:53,482] torch.distributed.run: [WARNING] ***************************************** 05/27/2024 10:06:12 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/data/sciq', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/llms/pythia-70m_sciq_1', output_hub_id='pythia-70m_sciq', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) 05/27/2024 10:06:15 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/data/sciq', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/llms/pythia-70m_sciq_1', output_hub_id='pythia-70m_sciq', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) 0%| | 0/10682 [00:00