{ "version": ".unknown.", "runner": { "runner_type": "pdsh", "hostsfile": null, "hosts": null, "master_port": 29500, "master_addr": null, "script": "src/scaling/transformer/train.py", "default_gpu_count": 8, "docker_config": { "docker_container": null, "docker_sudo": false, "docker_mounts": [ [ "/mnt/", "/mnt/" ] ] }, "use_determined": true }, "logger": { "log_level": "info", "log_dir": null, "metrics_ranks": null, "use_wandb": true, "wandb_ranks": null, "wandb_host": "https://api.wandb.ai", "wandb_team": "aleph-alpha", "wandb_project": "ng-semantic-embedding", "wandb_group": "pharia_7B_embedding_all_source_256-2024-10-01-19-28-52", "wandb_api_key": "d80060c84c496915fcad15283809dbde65087d73", "use_tensorboard": false, "tensorboard_ranks": null, "determined_metrics_ranks": null }, "topology": { "global_rank": 0, "world_size": 256, "local_slot": 0, "model_parallel_size": 1, "pipe_parallel_size": 1, "data_parallel_size": 256, "global_batch_size": 2048, "micro_batch_size": 8, "gradient_accumulation_steps": 1, "pipe_partition_method": "balanced", "pipe_partition_overwrite": null, "activation_checkpointing_type": "every_layer", "sequence_parallel": false }, "optimizer": { "method": "adamw", "beta1": 0.9, "beta2": 0.95, "eps": 1e-15, "gradient_clipping": 0.0, "allreduce_bucket_size": 500000000, "loss_scaler": { "enable": false, "initial_scale": 4294967296.0, "window": 1000, "hysteresis": 2.0, "consecutive_hysteresis": false, "min_scale": 1.0, "factor": 2.0 }, "zero": true, "zero_save_static": false, "debug_log": false }, "learning_rate_scheduler": { "learning_rate": 2e-05, "learning_rate_minimum": 0.0, "learning_rate_decay_style": "linear", "learning_rate_decay_iters": 938, "learning_rate_warmup_steps": 50 }, "embedding_learning_rate_scheduler": { "learning_rate": 0.0, "learning_rate_minimum": 0.0, "learning_rate_decay_style": "cosine", "learning_rate_decay_iters": 0, "learning_rate_warmup_steps": 0 }, "training": { "weight_decay": 0.0001, "finetune": true, "finetunable_parameters": [ "embed" ], "parameters_exclude": [], "use_separate_lr_on_embeddings": false, "use_deterministic_torch_algorithms": false, "loss_function_config": { "loss_type": "contrastive_loss", "number_of_hard_negatives": 1, "use_instructions": true, "query_side_only": false, "scale": 50, "log_verbose_metrics": true } }, "trainer": { "save_dir": "checkpoints", "save_interval": 938, "load_dir": "/scratch/samuel/Pharia-1-LLM-7B-control", "train_iterations": 938, "assert_checkpoint_loaded": true, "load_optimizer_states": false, "delete_past_optimizer_states": true, "load_context": false, "allowed_missing_keys_in_checkpoint": [ "embed" ], "allowed_unexpected_keys_in_checkpoint": null, "ignore_keys_in_checkpoint": null, "merge_lora_after_loading_checkpoint": false, "seed": 42, "dataloader_num_workers": 0, "dataloader_pin_memory": true, "dataloader_prefetch_factor": null, "eval_iterations": 1, "eval_interval": null, "separate_file_for_parameters": [ "adapter_embed" ] }, "profiler": { "profile_steps": 0, "profile_start_at_step": 0, "profiler_output": null }, "transformer_architecture": { "vocab_size": 128000, "vocab_file": "/scratch/samuel/Pharia-1-LLM-7B-control/vocab.json", "hidden_size": 4608, "num_layers": 27, "num_attention_heads": 36, "num_local_attention_heads": 0, "local_attention_window_size": null, "rotary_embedding_base": 1000000, "rotary_percentage": 1.0, "sequence_length": 2048, "norm_type": "layernorm", "relative_position_embedding_type": "rotary_complex", "mlp_type": "default", "mlp_factor": 4.0, "attention_bias": true, 
"attention_qkv_in_one": false, "attention_num_kv_heads": 4, "attention_use_matmul": false, "mlp_bias": true, "key_query_norm": false, "weight_tying": false, "masked_softmax": { "kernel": "torch", "softmax_in_fp32": true, "scale": 1.0, "deterministic_flash_attn_bwd": false }, "layernorm": { "optimization_type": "torch", "layernorm_epsilon": 1e-05 }, "precision": "bfloat16", "dropout_embedding": 0.0, "dropout_attention_probs": 0.0, "dropout_after_attention": 0.0, "dropout_after_mlp": 0.0, "bitfit_bias_config": null, "finetunable_token_ids": [], "image_encoder": false, "dropout_image_encoder": 0.0, "softprompt_config": null, "adapter_config": { "name": "embed", "attention_downsampling_factor": 0.25, "mlp_downsampling_factor": 0.25, "init_std": 1e-05, "version": ".unknown." }, "lora_config": null, "embedding_head_config": { "name": "pooling_only", "proj_layers": [ 256 ], "pooling": "weighted_mean" }, "lm_head": false, "causal": true }, "data": { "legacy_dataset": false, "load_mmap_index_to_memory": false, "use_mmap": false, "load_data_item_mmap_index_to_memory": false, "finetuning_dataset": false, "finetuning_chat_dataset": false, "finetuning_dataset_memory_map": false, "embedding_dataset": true, "embedding_dataset_memory_map": false, "data_prefixes": [ "/scratch/samuel/all_source_data_templated/allnli_eng.jsonl", "/scratch/samuel/all_source_data_templated/hotpot_with_bm25_negatives.jsonl", "/scratch/samuel/all_source_data_templated/medi_1.jsonl", "/scratch/samuel/all_source_data_templated/msmarco.jsonl" ], "validation_data_prefixes": null, "blended_dataset": { "weight_by_num_documents": true, "weighted_sampler_alpha": 1.0, "weights": null, "weight_examples_proportional": false, "ep_maximum": null, "ep_temperature": 1.0, "minimum_dataset_size": 0, "cache_directory": "/scratch/samuel/blended_dataset_cache", "shuffle_dataset_indices": false, "load_dataset_indices_to_memory": false }, "only_full_sequences": false, "allow_incomplete_sequences_every_n": 0 }, "determined_experiment_id": 2965, "determined_trial_id": 86040 }