tim-lawson's picture
Upload folder using huggingface_hub
366da3e verified
{"project": "skip-gate-diff-loss", "seed": 0, "num_devices": 4, "train_filename_pattern": "data/fineweb10B/fineweb_train_*.bin", "num_train_tokens": 2400190464, "val_filename_pattern": "data/fineweb10B/fineweb_val_*.bin", "num_val_tokens": 1048576, "batch_size": 512, "device_batch_size": 8, "max_seq_len": 1024, "fraction_warmup_steps": 0, "fraction_warmdown_steps": 0.25, "log_every_steps": 1, "val_every_steps": 20, "save_every_steps": -1, "vocab_size": 50304, "dim": 768, "num_layers": 12, "num_heads": 12, "gating_layers": [0, 1, 2, 3, 4, 5], "normalize_gates": false, "gating_func_weight_eps": 1e-05, "gating_func_bias_eps": 1e-05, "gating_mask_weight": 1, "gating_mask_bias": 0, "gate_loss": "gate_diff", "gate_coef_global": 0.01, "gate_coef_factor_inc": 1, "gate_coef_factor_dec": 1, "gate_coef_delta_inc": 0, "gate_coef_delta_dec": 0, "gate_coef_min": null, "gate_coef_max": null, "gate_targets": true, "gate_target_start": 0.3, "gate_target_end": 0.9, "mask_targets": false, "mask_target_start": 1, "mask_target_end": 1, "zero_targets": false, "zero_target_start": 0, "zero_target_end": 0, "wte_lr": 0.3, "wte_beta1": 0.9, "wte_beta2": 0.95, "lm_head_lr": 0.002, "lm_head_beta1": 0.9, "lm_head_beta2": 0.95, "h_lr": 0.02, "h_momentum": 0.95, "gate_lr": 0.0001, "gate_beta1": 0.9, "gate_beta2": 0.95}