{
"version": ".unknown.",
"runner": {
"runner_type": "pdsh",
"hostsfile": null,
"hosts": null,
"master_port": 29500,
"master_addr": null,
"script": "src/scaling/transformer/train.py",
"default_gpu_count": 8,
"docker_config": {
"docker_container": null,
"docker_sudo": false,
"docker_mounts": [
[
"/mnt/",
"/mnt/"
]
]
},
"use_determined": true
},
"logger": {
"log_level": "info",
"log_dir": null,
"metrics_ranks": null,
"use_wandb": true,
"wandb_ranks": null,
"wandb_host": "https://api.wandb.ai",
"wandb_team": "aleph-alpha",
"wandb_project": "ng-semantic-embedding",
"wandb_group": "pharia_7B_embedding_all_source_256-2024-10-01-19-28-52",
"wandb_api_key": "d80060c84c496915fcad15283809dbde65087d73",
"use_tensorboard": false,
"tensorboard_ranks": null,
"determined_metrics_ranks": null
},
"topology": {
"global_rank": 0,
"world_size": 256,
"local_slot": 0,
"model_parallel_size": 1,
"pipe_parallel_size": 1,
"data_parallel_size": 256,
"global_batch_size": 2048,
"micro_batch_size": 8,
"gradient_accumulation_steps": 1,
"pipe_partition_method": "balanced",
"pipe_partition_overwrite": null,
"activation_checkpointing_type": "every_layer",
"sequence_parallel": false
},
"optimizer": {
"method": "adamw",
"beta1": 0.9,
"beta2": 0.95,
"eps": 1e-15,
"gradient_clipping": 0.0,
"allreduce_bucket_size": 500000000,
"loss_scaler": {
"enable": false,
"initial_scale": 4294967296.0,
"window": 1000,
"hysteresis": 2.0,
"consecutive_hysteresis": false,
"min_scale": 1.0,
"factor": 2.0
},
"zero": true,
"zero_save_static": false,
"debug_log": false
},
"learning_rate_scheduler": {
"learning_rate": 2e-05,
"learning_rate_minimum": 0.0,
"learning_rate_decay_style": "linear",
"learning_rate_decay_iters": 938,
"learning_rate_warmup_steps": 50
},
"embedding_learning_rate_scheduler": {
"learning_rate": 0.0,
"learning_rate_minimum": 0.0,
"learning_rate_decay_style": "cosine",
"learning_rate_decay_iters": 0,
"learning_rate_warmup_steps": 0
},
"training": {
"weight_decay": 0.0001,
"finetune": true,
"finetunable_parameters": [
"embed"
],
"parameters_exclude": [],
"use_separate_lr_on_embeddings": false,
"use_deterministic_torch_algorithms": false,
"loss_function_config": {
"loss_type": "contrastive_loss",
"number_of_hard_negatives": 1,
"use_instructions": true,
"query_side_only": false,
"scale": 50,
"log_verbose_metrics": true
}
},
"trainer": {
"save_dir": "checkpoints",
"save_interval": 938,
"load_dir": "/scratch/samuel/Pharia-1-LLM-7B-control",
"train_iterations": 938,
"assert_checkpoint_loaded": true,
"load_optimizer_states": false,
"delete_past_optimizer_states": true,
"load_context": false,
"allowed_missing_keys_in_checkpoint": [
"embed"
],
"allowed_unexpected_keys_in_checkpoint": null,
"ignore_keys_in_checkpoint": null,
"merge_lora_after_loading_checkpoint": false,
"seed": 42,
"dataloader_num_workers": 0,
"dataloader_pin_memory": true,
"dataloader_prefetch_factor": null,
"eval_iterations": 1,
"eval_interval": null,
"separate_file_for_parameters": [
"adapter_embed"
]
},
"profiler": {
"profile_steps": 0,
"profile_start_at_step": 0,
"profiler_output": null
},
"transformer_architecture": {
"vocab_size": 128000,
"vocab_file": "/scratch/samuel/Pharia-1-LLM-7B-control/vocab.json",
"hidden_size": 4608,
"num_layers": 27,
"num_attention_heads": 36,
"num_local_attention_heads": 0,
"local_attention_window_size": null,
"rotary_embedding_base": 1000000,
"rotary_percentage": 1.0,
"sequence_length": 2048,
"norm_type": "layernorm",
"relative_position_embedding_type": "rotary_complex",
"mlp_type": "default",
"mlp_factor": 4.0,
"attention_bias": true,
"attention_qkv_in_one": false,
"attention_num_kv_heads": 4,
"attention_use_matmul": false,
"mlp_bias": true,
"key_query_norm": false,
"weight_tying": false,
"masked_softmax": {
"kernel": "torch",
"softmax_in_fp32": true,
"scale": 1.0,
"deterministic_flash_attn_bwd": false
},
"layernorm": {
"optimization_type": "torch",
"layernorm_epsilon": 1e-05
},
"precision": "bfloat16",
"dropout_embedding": 0.0,
"dropout_attention_probs": 0.0,
"dropout_after_attention": 0.0,
"dropout_after_mlp": 0.0,
"bitfit_bias_config": null,
"finetunable_token_ids": [],
"image_encoder": false,
"dropout_image_encoder": 0.0,
"softprompt_config": null,
"adapter_config": {
"name": "embed",
"attention_downsampling_factor": 0.25,
"mlp_downsampling_factor": 0.25,
"init_std": 1e-05,
"version": ".unknown."
},
"lora_config": null,
"embedding_head_config": {
"name": "pooling_only",
"proj_layers": [
256
],
"pooling": "weighted_mean"
},
"lm_head": false,
"causal": true
},
"data": {
"legacy_dataset": false,
"load_mmap_index_to_memory": false,
"use_mmap": false,
"load_data_item_mmap_index_to_memory": false,
"finetuning_dataset": false,
"finetuning_chat_dataset": false,
"finetuning_dataset_memory_map": false,
"embedding_dataset": true,
"embedding_dataset_memory_map": false,
"data_prefixes": [
"/scratch/samuel/all_source_data_templated/allnli_eng.jsonl",
"/scratch/samuel/all_source_data_templated/hotpot_with_bm25_negatives.jsonl",
"/scratch/samuel/all_source_data_templated/medi_1.jsonl",
"/scratch/samuel/all_source_data_templated/msmarco.jsonl"
],
"validation_data_prefixes": null,
"blended_dataset": {
"weight_by_num_documents": true,
"weighted_sampler_alpha": 1.0,
"weights": null,
"weight_examples_proportional": false,
"ep_maximum": null,
"ep_temperature": 1.0,
"minimum_dataset_size": 0,
"cache_directory": "/scratch/samuel/blended_dataset_cache",
"shuffle_dataset_indices": false,
"load_dataset_indices_to_memory": false
},
"only_full_sequences": false,
"allow_incomplete_sequences_every_n": 0
},
"determined_experiment_id": 2965,
"determined_trial_id": 86040
}
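
As a quick sanity check on the "topology" block above, here is a minimal Python sketch (not part of the training codebase; the file name "config.json" and the checks themselves are assumptions for illustration) that loads the config and verifies the parallel-layout and batch-size arithmetic:

# Minimal sketch: load the config above (assumed saved as "config.json")
# and verify that its topology values are self-consistent.
import json

with open("config.json") as f:
    cfg = json.load(f)

topo = cfg["topology"]

# world_size must equal the product of the three parallel degrees:
# 256 = 256 (data) * 1 (model) * 1 (pipe)
assert topo["world_size"] == (
    topo["data_parallel_size"]
    * topo["model_parallel_size"]
    * topo["pipe_parallel_size"]
)

# global_batch_size must equal micro_batch_size * gradient_accumulation_steps
# * data_parallel_size: 2048 = 8 * 1 * 256
assert topo["global_batch_size"] == (
    topo["micro_batch_size"]
    * topo["gradient_accumulation_steps"]
    * topo["data_parallel_size"]
)

print("topology is self-consistent")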