File size: 7,562 Bytes
581851e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
{
    "version": ".unknown.",
    "runner": {
        "runner_type": "pdsh",
        "hostsfile": null,
        "hosts": null,
        "master_port": 29500,
        "master_addr": null,
        "script": "src/scaling/transformer/train.py",
        "default_gpu_count": 8,
        "docker_config": {
            "docker_container": null,
            "docker_sudo": false,
            "docker_mounts": [
                [
                    "/mnt/",
                    "/mnt/"
                ]
            ]
        },
        "use_determined": true
    },
    "logger": {
        "log_level": "info",
        "log_dir": null,
        "metrics_ranks": null,
        "use_wandb": true,
        "wandb_ranks": null,
        "wandb_host": "https://api.wandb.ai",
        "wandb_team": "aleph-alpha",
        "wandb_project": "ng-semantic-embedding",
        "wandb_group": "pharia_7B_embedding_all_source_256-2024-10-01-19-28-52",
        "wandb_api_key": "d80060c84c496915fcad15283809dbde65087d73",
        "use_tensorboard": false,
        "tensorboard_ranks": null,
        "determined_metrics_ranks": null
    },
    "topology": {
        "global_rank": 0,
        "world_size": 256,
        "local_slot": 0,
        "model_parallel_size": 1,
        "pipe_parallel_size": 1,
        "data_parallel_size": 256,
        "global_batch_size": 2048,
        "micro_batch_size": 8,
        "gradient_accumulation_steps": 1,
        "pipe_partition_method": "balanced",
        "pipe_partition_overwrite": null,
        "activation_checkpointing_type": "every_layer",
        "sequence_parallel": false
    },
    "optimizer": {
        "method": "adamw",
        "beta1": 0.9,
        "beta2": 0.95,
        "eps": 1e-15,
        "gradient_clipping": 0.0,
        "allreduce_bucket_size": 500000000,
        "loss_scaler": {
            "enable": false,
            "initial_scale": 4294967296.0,
            "window": 1000,
            "hysteresis": 2.0,
            "consecutive_hysteresis": false,
            "min_scale": 1.0,
            "factor": 2.0
        },
        "zero": true,
        "zero_save_static": false,
        "debug_log": false
    },
    "learning_rate_scheduler": {
        "learning_rate": 2e-05,
        "learning_rate_minimum": 0.0,
        "learning_rate_decay_style": "linear",
        "learning_rate_decay_iters": 938,
        "learning_rate_warmup_steps": 50
    },
    "embedding_learning_rate_scheduler": {
        "learning_rate": 0.0,
        "learning_rate_minimum": 0.0,
        "learning_rate_decay_style": "cosine",
        "learning_rate_decay_iters": 0,
        "learning_rate_warmup_steps": 0
    },
    "training": {
        "weight_decay": 0.0001,
        "finetune": true,
        "finetunable_parameters": [
            "embed"
        ],
        "parameters_exclude": [],
        "use_separate_lr_on_embeddings": false,
        "use_deterministic_torch_algorithms": false,
        "loss_function_config": {
            "loss_type": "contrastive_loss",
            "number_of_hard_negatives": 1,
            "use_instructions": true,
            "query_side_only": false,
            "scale": 50,
            "log_verbose_metrics": true
        }
    },
    "trainer": {
        "save_dir": "checkpoints",
        "save_interval": 938,
        "load_dir": "/scratch/samuel/Pharia-1-LLM-7B-control",
        "train_iterations": 938,
        "assert_checkpoint_loaded": true,
        "load_optimizer_states": false,
        "delete_past_optimizer_states": true,
        "load_context": false,
        "allowed_missing_keys_in_checkpoint": [
            "embed"
        ],
        "allowed_unexpected_keys_in_checkpoint": null,
        "ignore_keys_in_checkpoint": null,
        "merge_lora_after_loading_checkpoint": false,
        "seed": 42,
        "dataloader_num_workers": 0,
        "dataloader_pin_memory": true,
        "dataloader_prefetch_factor": null,
        "eval_iterations": 1,
        "eval_interval": null,
        "separate_file_for_parameters": [
            "adapter_embed"
        ]
    },
    "profiler": {
        "profile_steps": 0,
        "profile_start_at_step": 0,
        "profiler_output": null
    },
    "transformer_architecture": {
        "vocab_size": 128000,
        "vocab_file": "/scratch/samuel/Pharia-1-LLM-7B-control/vocab.json",
        "hidden_size": 4608,
        "num_layers": 27,
        "num_attention_heads": 36,
        "num_local_attention_heads": 0,
        "local_attention_window_size": null,
        "rotary_embedding_base": 1000000,
        "rotary_percentage": 1.0,
        "sequence_length": 2048,
        "norm_type": "layernorm",
        "relative_position_embedding_type": "rotary_complex",
        "mlp_type": "default",
        "mlp_factor": 4.0,
        "attention_bias": true,
        "attention_qkv_in_one": false,
        "attention_num_kv_heads": 4,
        "attention_use_matmul": false,
        "mlp_bias": true,
        "key_query_norm": false,
        "weight_tying": false,
        "masked_softmax": {
            "kernel": "torch",
            "softmax_in_fp32": true,
            "scale": 1.0,
            "deterministic_flash_attn_bwd": false
        },
        "layernorm": {
            "optimization_type": "torch",
            "layernorm_epsilon": 1e-05
        },
        "precision": "bfloat16",
        "dropout_embedding": 0.0,
        "dropout_attention_probs": 0.0,
        "dropout_after_attention": 0.0,
        "dropout_after_mlp": 0.0,
        "bitfit_bias_config": null,
        "finetunable_token_ids": [],
        "image_encoder": false,
        "dropout_image_encoder": 0.0,
        "softprompt_config": null,
        "adapter_config": {
            "name": "embed",
            "attention_downsampling_factor": 0.25,
            "mlp_downsampling_factor": 0.25,
            "init_std": 1e-05,
            "version": ".unknown."
        },
        "lora_config": null,
        "embedding_head_config": {
            "name": "pooling_only",
            "proj_layers": [
                256
            ],
            "pooling": "weighted_mean"
        },
        "lm_head": false,
        "causal": true
    },
    "data": {
        "legacy_dataset": false,
        "load_mmap_index_to_memory": false,
        "use_mmap": false,
        "load_data_item_mmap_index_to_memory": false,
        "finetuning_dataset": false,
        "finetuning_chat_dataset": false,
        "finetuning_dataset_memory_map": false,
        "embedding_dataset": true,
        "embedding_dataset_memory_map": false,
        "data_prefixes": [
            "/scratch/samuel/all_source_data_templated/allnli_eng.jsonl",
            "/scratch/samuel/all_source_data_templated/hotpot_with_bm25_negatives.jsonl",
            "/scratch/samuel/all_source_data_templated/medi_1.jsonl",
            "/scratch/samuel/all_source_data_templated/msmarco.jsonl"
        ],
        "validation_data_prefixes": null,
        "blended_dataset": {
            "weight_by_num_documents": true,
            "weighted_sampler_alpha": 1.0,
            "weights": null,
            "weight_examples_proportional": false,
            "ep_maximum": null,
            "ep_temperature": 1.0,
            "minimum_dataset_size": 0,
            "cache_directory": "/scratch/samuel/blended_dataset_cache",
            "shuffle_dataset_indices": false,
            "load_dataset_indices_to_memory": false
        },
        "only_full_sequences": false,
        "allow_incomplete_sequences_every_n": 0
    },
    "determined_experiment_id": 2965,
    "determined_trial_id": 86040
}