{
"version": ".unknown.",
"runner": {
"runner_type": "pdsh",
"hostsfile": null,
"hosts": null,
"master_port": 29500,
"master_addr": null,
"script": "src/scaling/transformer/train.py",
"default_gpu_count": 8,
"docker_config": {
"docker_container": null,
"docker_sudo": false,
"docker_mounts": [
[
"/mnt/",
"/mnt/"
]
]
},
"use_determined": true
},
"logger": {
"log_level": "info",
"log_dir": null,
"metrics_ranks": null,
"use_wandb": true,
"wandb_ranks": null,
"wandb_host": "https://api.wandb.ai",
"wandb_team": "aleph-alpha",
"wandb_project": "ng-semantic-embedding",
"wandb_group": "pharia_7B_embedding_all_source_256-2024-10-01-19-28-52",
"wandb_api_key": null,
"use_tensorboard": false,
"tensorboard_ranks": null,
"determined_metrics_ranks": null
},
"topology": {
"global_rank": 0,
"world_size": 256,
"local_slot": 0,
"model_parallel_size": 1,
"pipe_parallel_size": 1,
"data_parallel_size": 256,
"global_batch_size": 2048,
"micro_batch_size": 8,
"gradient_accumulation_steps": 1,
"pipe_partition_method": "balanced",
"pipe_partition_overwrite": null,
"activation_checkpointing_type": "every_layer",
"sequence_parallel": false
},
"optimizer": {
"method": "adamw",
"beta1": 0.9,
"beta2": 0.95,
"eps": 1e-15,
"gradient_clipping": 0.0,
"allreduce_bucket_size": 500000000,
"loss_scaler": {
"enable": false,
"initial_scale": 4294967296.0,
"window": 1000,
"hysteresis": 2.0,
"consecutive_hysteresis": false,
"min_scale": 1.0,
"factor": 2.0
},
"zero": true,
"zero_save_static": false,
"debug_log": false
},
"learning_rate_scheduler": {
"learning_rate": 2e-05,
"learning_rate_minimum": 0.0,
"learning_rate_decay_style": "linear",
"learning_rate_decay_iters": 938,
"learning_rate_warmup_steps": 50
},
"embedding_learning_rate_scheduler": {
"learning_rate": 0.0,
"learning_rate_minimum": 0.0,
"learning_rate_decay_style": "cosine",
"learning_rate_decay_iters": 0,
"learning_rate_warmup_steps": 0
},
"training": {
"weight_decay": 0.0001,
"finetune": true,
"finetunable_parameters": [
"embed"
],
"parameters_exclude": [],
"use_separate_lr_on_embeddings": false,
"use_deterministic_torch_algorithms": false,
"loss_function_config": {
"loss_type": "contrastive_loss",
"number_of_hard_negatives": 1,
"use_instructions": true,
"query_side_only": false,
"scale": 50,
"log_verbose_metrics": true
}
},
"trainer": {
"save_dir": "checkpoints",
"save_interval": 938,
"load_dir": "/scratch/samuel/Pharia-1-LLM-7B-control",
"train_iterations": 938,
"assert_checkpoint_loaded": true,
"load_optimizer_states": false,
"delete_past_optimizer_states": true,
"load_context": false,
"allowed_missing_keys_in_checkpoint": [
"embed"
],
"allowed_unexpected_keys_in_checkpoint": null,
"ignore_keys_in_checkpoint": null,
"merge_lora_after_loading_checkpoint": false,
"seed": 42,
"dataloader_num_workers": 0,
"dataloader_pin_memory": true,
"dataloader_prefetch_factor": null,
"eval_iterations": 1,
"eval_interval": null,
"separate_file_for_parameters": [
"adapter_embed"
]
},
"profiler": {
"profile_steps": 0,
"profile_start_at_step": 0,
"profiler_output": null
},
"transformer_architecture": {
"vocab_size": 128000,
"vocab_file": "/scratch/samuel/Pharia-1-LLM-7B-control/vocab.json",
"hidden_size": 4608,
"num_layers": 27,
"num_attention_heads": 36,
"num_local_attention_heads": 0,
"local_attention_window_size": null,
"rotary_embedding_base": 1000000,
"rotary_percentage": 1.0,
"sequence_length": 2048,
"norm_type": "layernorm",
"relative_position_embedding_type": "rotary_complex",
"mlp_type": "default",
"mlp_factor": 4.0,
"attention_bias": true,
"attention_qkv_in_one": false,
"attention_num_kv_heads": 4,
"attention_use_matmul": false,
"mlp_bias": true,
"key_query_norm": false,
"weight_tying": false,
"masked_softmax": {
"kernel": "torch",
"softmax_in_fp32": true,
"scale": 1.0,
"deterministic_flash_attn_bwd": false
},
"layernorm": {
"optimization_type": "torch",
"layernorm_epsilon": 1e-05
},
"precision": "bfloat16",
"dropout_embedding": 0.0,
"dropout_attention_probs": 0.0,
"dropout_after_attention": 0.0,
"dropout_after_mlp": 0.0,
"bitfit_bias_config": null,
"finetunable_token_ids": [],
"image_encoder": false,
"dropout_image_encoder": 0.0,
"softprompt_config": null,
"adapter_config": {
"name": "embed",
"attention_downsampling_factor": 0.25,
"mlp_downsampling_factor": 0.25,
"init_std": 1e-05,
"version": ".unknown."
},
"lora_config": null,
"embedding_head_config": {
"name": "pooling_only",
"proj_layers": [
256
],
"pooling": "weighted_mean"
},
"lm_head": false,
"causal": true
},
"data": {
"legacy_dataset": false,
"load_mmap_index_to_memory": false,
"use_mmap": false,
"load_data_item_mmap_index_to_memory": false,
"finetuning_dataset": false,
"finetuning_chat_dataset": false,
"finetuning_dataset_memory_map": false,
"embedding_dataset": true,
"embedding_dataset_memory_map": false,
"data_prefixes": [
"/scratch/samuel/all_source_data_templated/allnli_eng.jsonl",
"/scratch/samuel/all_source_data_templated/hotpot_with_bm25_negatives.jsonl",
"/scratch/samuel/all_source_data_templated/medi_1.jsonl",
"/scratch/samuel/all_source_data_templated/msmarco.jsonl"
],
"validation_data_prefixes": null,
"blended_dataset": {
"weight_by_num_documents": true,
"weighted_sampler_alpha": 1.0,
"weights": null,
"weight_examples_proportional": false,
"ep_maximum": null,
"ep_temperature": 1.0,
"minimum_dataset_size": 0,
"cache_directory": "/scratch/samuel/blended_dataset_cache",
"shuffle_dataset_indices": false,
"load_dataset_indices_to_memory": false
},
"only_full_sequences": false,
"allow_incomplete_sequences_every_n": 0
},
"determined_experiment_id": 2965,
"determined_trial_id": 86040
}