|
{ |
|
"version": ".unknown.", |
|
"runner": { |
|
"runner_type": "pdsh", |
|
"hostsfile": null, |
|
"hosts": null, |
|
"master_port": 29500, |
|
"master_addr": null, |
|
"script": "src/scaling/transformer/train.py", |
|
"default_gpu_count": 8, |
|
"docker_config": { |
|
"docker_container": null, |
|
"docker_sudo": false, |
|
"docker_mounts": [ |
|
[ |
|
"/mnt/", |
|
"/mnt/" |
|
] |
|
] |
|
}, |
|
"use_determined": true |
|
}, |
|
"logger": { |
|
"log_level": "info", |
|
"log_dir": null, |
|
"metrics_ranks": null, |
|
"use_wandb": true, |
|
"wandb_ranks": null, |
|
"wandb_host": "https://api.wandb.ai", |
|
"wandb_team": "aleph-alpha", |
|
"wandb_project": "ng-semantic-embedding", |
|
"wandb_group": "pharia_7B_embedding_all_source_256-2024-10-01-19-28-52", |
|
"wandb_api_key": null, |
|
"use_tensorboard": false, |
|
"tensorboard_ranks": null, |
|
"determined_metrics_ranks": null |
|
}, |
|
"topology": { |
|
"global_rank": 0, |
|
"world_size": 256, |
|
"local_slot": 0, |
|
"model_parallel_size": 1, |
|
"pipe_parallel_size": 1, |
|
"data_parallel_size": 256, |
|
"global_batch_size": 2048, |
|
"micro_batch_size": 8, |
|
"gradient_accumulation_steps": 1, |
|
"pipe_partition_method": "balanced", |
|
"pipe_partition_overwrite": null, |
|
"activation_checkpointing_type": "every_layer", |
|
"sequence_parallel": false |
|
}, |
|
"optimizer": { |
|
"method": "adamw", |
|
"beta1": 0.9, |
|
"beta2": 0.95, |
|
"eps": 1e-15, |
|
"gradient_clipping": 0.0, |
|
"allreduce_bucket_size": 500000000, |
|
"loss_scaler": { |
|
"enable": false, |
|
"initial_scale": 4294967296.0, |
|
"window": 1000, |
|
"hysteresis": 2.0, |
|
"consecutive_hysteresis": false, |
|
"min_scale": 1.0, |
|
"factor": 2.0 |
|
}, |
|
"zero": true, |
|
"zero_save_static": false, |
|
"debug_log": false |
|
}, |
|
"learning_rate_scheduler": { |
|
"learning_rate": 2e-05, |
|
"learning_rate_minimum": 0.0, |
|
"learning_rate_decay_style": "linear", |
|
"learning_rate_decay_iters": 938, |
|
"learning_rate_warmup_steps": 50 |
|
}, |
|
"embedding_learning_rate_scheduler": { |
|
"learning_rate": 0.0, |
|
"learning_rate_minimum": 0.0, |
|
"learning_rate_decay_style": "cosine", |
|
"learning_rate_decay_iters": 0, |
|
"learning_rate_warmup_steps": 0 |
|
}, |
|
"training": { |
|
"weight_decay": 0.0001, |
|
"finetune": true, |
|
"finetunable_parameters": [ |
|
"embed" |
|
], |
|
"parameters_exclude": [], |
|
"use_separate_lr_on_embeddings": false, |
|
"use_deterministic_torch_algorithms": false, |
|
"loss_function_config": { |
|
"loss_type": "contrastive_loss", |
|
"number_of_hard_negatives": 1, |
|
"use_instructions": true, |
|
"query_side_only": false, |
|
"scale": 50, |
|
"log_verbose_metrics": true |
|
} |
|
}, |
|
"trainer": { |
|
"save_dir": "checkpoints", |
|
"save_interval": 938, |
|
"load_dir": "/scratch/samuel/Pharia-1-LLM-7B-control", |
|
"train_iterations": 938, |
|
"assert_checkpoint_loaded": true, |
|
"load_optimizer_states": false, |
|
"delete_past_optimizer_states": true, |
|
"load_context": false, |
|
"allowed_missing_keys_in_checkpoint": [ |
|
"embed" |
|
], |
|
"allowed_unexpected_keys_in_checkpoint": null, |
|
"ignore_keys_in_checkpoint": null, |
|
"merge_lora_after_loading_checkpoint": false, |
|
"seed": 42, |
|
"dataloader_num_workers": 0, |
|
"dataloader_pin_memory": true, |
|
"dataloader_prefetch_factor": null, |
|
"eval_iterations": 1, |
|
"eval_interval": null, |
|
"separate_file_for_parameters": [ |
|
"adapter_embed" |
|
] |
|
}, |
|
"profiler": { |
|
"profile_steps": 0, |
|
"profile_start_at_step": 0, |
|
"profiler_output": null |
|
}, |
|
"transformer_architecture": { |
|
"vocab_size": 128000, |
|
"vocab_file": "/scratch/samuel/Pharia-1-LLM-7B-control/vocab.json", |
|
"hidden_size": 4608, |
|
"num_layers": 27, |
|
"num_attention_heads": 36, |
|
"num_local_attention_heads": 0, |
|
"local_attention_window_size": null, |
|
"rotary_embedding_base": 1000000, |
|
"rotary_percentage": 1.0, |
|
"sequence_length": 2048, |
|
"norm_type": "layernorm", |
|
"relative_position_embedding_type": "rotary_complex", |
|
"mlp_type": "default", |
|
"mlp_factor": 4.0, |
|
"attention_bias": true, |
|
"attention_qkv_in_one": false, |
|
"attention_num_kv_heads": 4, |
|
"attention_use_matmul": false, |
|
"mlp_bias": true, |
|
"key_query_norm": false, |
|
"weight_tying": false, |
|
"masked_softmax": { |
|
"kernel": "torch", |
|
"softmax_in_fp32": true, |
|
"scale": 1.0, |
|
"deterministic_flash_attn_bwd": false |
|
}, |
|
"layernorm": { |
|
"optimization_type": "torch", |
|
"layernorm_epsilon": 1e-05 |
|
}, |
|
"precision": "bfloat16", |
|
"dropout_embedding": 0.0, |
|
"dropout_attention_probs": 0.0, |
|
"dropout_after_attention": 0.0, |
|
"dropout_after_mlp": 0.0, |
|
"bitfit_bias_config": null, |
|
"finetunable_token_ids": [], |
|
"image_encoder": false, |
|
"dropout_image_encoder": 0.0, |
|
"softprompt_config": null, |
|
"adapter_config": { |
|
"name": "embed", |
|
"attention_downsampling_factor": 0.25, |
|
"mlp_downsampling_factor": 0.25, |
|
"init_std": 1e-05, |
|
"version": ".unknown." |
|
}, |
|
"lora_config": null, |
|
"embedding_head_config": { |
|
"name": "pooling_only", |
|
"proj_layers": [ |
|
256 |
|
], |
|
"pooling": "weighted_mean" |
|
}, |
|
"lm_head": false, |
|
"causal": true |
|
}, |
|
"data": { |
|
"legacy_dataset": false, |
|
"load_mmap_index_to_memory": false, |
|
"use_mmap": false, |
|
"load_data_item_mmap_index_to_memory": false, |
|
"finetuning_dataset": false, |
|
"finetuning_chat_dataset": false, |
|
"finetuning_dataset_memory_map": false, |
|
"embedding_dataset": true, |
|
"embedding_dataset_memory_map": false, |
|
"data_prefixes": [ |
|
"/scratch/samuel/all_source_data_templated/allnli_eng.jsonl", |
|
"/scratch/samuel/all_source_data_templated/hotpot_with_bm25_negatives.jsonl", |
|
"/scratch/samuel/all_source_data_templated/medi_1.jsonl", |
|
"/scratch/samuel/all_source_data_templated/msmarco.jsonl" |
|
], |
|
"validation_data_prefixes": null, |
|
"blended_dataset": { |
|
"weight_by_num_documents": true, |
|
"weighted_sampler_alpha": 1.0, |
|
"weights": null, |
|
"weight_examples_proportional": false, |
|
"ep_maximum": null, |
|
"ep_temperature": 1.0, |
|
"minimum_dataset_size": 0, |
|
"cache_directory": "/scratch/samuel/blended_dataset_cache", |
|
"shuffle_dataset_indices": false, |
|
"load_dataset_indices_to_memory": false |
|
}, |
|
"only_full_sequences": false, |
|
"allow_incomplete_sequences_every_n": 0 |
|
}, |
|
"determined_experiment_id": 2965, |
|
"determined_trial_id": 86040 |
|
} |