accum_freq: 1 attn_activation: None attn_name: auto attn_seq_scalar: None attn_seq_scalar_alpha: None average: None average_coefficients: None beta1: 0.9 beta2: 0.95 checkpoint_path: logs/448/c4_original-open_lm_1b-4.0/checkpoints copy_codebase: False data_key: txt dataset_manifest: None dataset_resampled: False dataset_type: auto ddp_static_graph: False debug: False delete_previous_checkpoint: True device: cuda:0 disable_buffer: False dist_backend: nccl dist_url: env:// distill_model: None distill_pretrained: None distributed: True epochs: 5 epochs_cooldown: None eps: 1e-08 experimental_meta_device: False ffn_type: swiglu force_distributed: False force_min_lr: 0.0 fsdp: False fsdp_amp: False fsdp_backward_prefetch: False fsdp_checkpoint: False fsdp_cpu_offload: False fsdp_hybrid: False fsdp_hybrid_o2: False fsdp_limit_all_gathers: False fsdp_pure_bf16: False fsdp_use_orig_params: False global_batch_size: 128 global_val_batch_size: 128 grad_checkpointing: False grad_clip_norm: 1.0 hf_fsdp_block: None hf_model: None hf_seq_len: None ignore_parse_errors: False load_pretrained_state: False local_rank: 0 log_every_n_steps: 20 log_level: 20 log_local: False log_logit_mean: False log_path: logs/448/c4_original-open_lm_1b-4.0/out.log logs: logs/448 lr: 0.003 lr_cooldown_end: 3e-05 lr_cooldown_power: 1.0 lr_scheduler: cosine model: open_lm_1b model_norm: gain_only_lp_layer_norm moe_capacity_factor: 1.25 moe_expert_model_parallelism: False moe_freq: 0 moe_loss_weight: 0.1 moe_num_experts: None moe_top_k: 2 moe_weight_parallelism: False multiple_data_passes: False name: c4_original-open_lm_1b-4.0 no_set_device_rank: False optimizer: adamw per_gpu_batch_size: 16 per_gpu_val_batch_size: 16 positional_embedding_type: rotary precision: amp_bfloat16 pretrained: None qk_norm: True rank: 0 remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 remote_sync_frequency: 300 remote_sync_protocol: s3 report_to: resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-open_lm_1b-4.0/checkpoints/epoch_6.pt save_frequency: 1 save_most_recent: False seed: 124 seq_len: 2048 skip_scheduler: False squash_mask_left: True target_mask_individual: 50400 target_mask_left: 50300 tensorboard: False tensorboard_path: torchcompile: False torchscript: False trace: False train_data: None train_data_mix_weights: None train_data_upsampling_factors: None train_num_samples: None use_bn_sync: False use_bnb_linear: None val_data: ['training/eval_data/val_tok_mult/de-en/val_de-en_000.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_010.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_020.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_030.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_040.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_050.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_060.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_070.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_080.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_090.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_100.tar'] val_data_key: ['json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz'] val_frequency: 5 val_iter_ci: 10000 val_max_pop_ci: 300000 val_num_samples: None val_seq_ci: True val_tok_ci: True vocab_size: 50432 wandb: False wandb_notes: wandb_project_name: open-lm warmup: 5000 wd: 0.033 workers: 2 world_size: 8 z_loss_coefficient: 0.0001