sagadre commited on
Commit
6ade3a7
1 Parent(s): e207d53

overtraining model release

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt +3 -0
  2. c4_original-d=1024_l=24_h=8-0.25/params.txt +123 -0
  3. c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt +3 -0
  4. c4_original-d=1024_l=24_h=8-0.5/params.txt +123 -0
  5. c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt +3 -0
  6. c4_original-d=1024_l=24_h=8-1.0/params.txt +123 -0
  7. c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt +3 -0
  8. c4_original-d=1024_l=24_h=8-16.0/params.txt +123 -0
  9. c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt +3 -0
  10. c4_original-d=1024_l=24_h=8-2.0/params.txt +123 -0
  11. c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt +3 -0
  12. c4_original-d=1024_l=24_h=8-4.0/params.txt +123 -0
  13. c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt +3 -0
  14. c4_original-d=1024_l=24_h=8-8.0/params.txt +123 -0
  15. c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt +3 -0
  16. c4_original-d=512_l=8_h=4-0.25/params.txt +123 -0
  17. c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt +3 -0
  18. c4_original-d=512_l=8_h=4-0.5/params.txt +123 -0
  19. c4_original-d=512_l=8_h=4-1.0/checkpoints/epoch_6.pt +3 -0
  20. c4_original-d=512_l=8_h=4-1.0/params.txt +123 -0
  21. c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt +3 -0
  22. c4_original-d=512_l=8_h=4-16.0/params.txt +123 -0
  23. c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt +3 -0
  24. c4_original-d=512_l=8_h=4-2.0/params.txt +123 -0
  25. c4_original-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt +3 -0
  26. c4_original-d=512_l=8_h=4-32.0/params.txt +123 -0
  27. c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt +3 -0
  28. c4_original-d=512_l=8_h=4-4.0/params.txt +123 -0
  29. c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt +3 -0
  30. c4_original-d=512_l=8_h=4-8.0/params.txt +123 -0
  31. c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt +3 -0
  32. c4_original-d=576_l=24_h=8-0.25/params.txt +123 -0
  33. c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt +3 -0
  34. c4_original-d=576_l=24_h=8-0.5/params.txt +123 -0
  35. c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt +3 -0
  36. c4_original-d=576_l=24_h=8-1.0/params.txt +123 -0
  37. c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt +3 -0
  38. c4_original-d=576_l=24_h=8-16.0/params.txt +123 -0
  39. c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt +3 -0
  40. c4_original-d=576_l=24_h=8-2.0/params.txt +123 -0
  41. c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt +3 -0
  42. c4_original-d=576_l=24_h=8-32.0/params.txt +123 -0
  43. c4_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt +3 -0
  44. c4_original-d=576_l=24_h=8-4.0/params.txt +123 -0
  45. c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt +3 -0
  46. c4_original-d=576_l=24_h=8-8.0/params.txt +123 -0
  47. c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt +3 -0
  48. c4_original-d=96_l=8_h=4-0.25/params.txt +123 -0
  49. c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt +3 -0
  50. c4_original-d=96_l=8_h=4-0.5/params.txt +123 -0
c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5fed29f7cdafd92543a361433dc4cb9945ec9b57b8eefd1bd90261575ee64f27
3
+ size 1646767740
c4_original-d=1024_l=24_h=8-0.25/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 8
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-0.25/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 128
44
+ global_val_batch_size: 16
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-0.25/out.log
58
+ logs: /admin/home-sy/dcnlp_logs
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=1024_l=24_h=8
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=1024_l=24_h=8-0.25
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 2
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 2000
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 8
123
+ z_loss_coefficient: 0.0001
c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f4fb6936004ce754dc68494dd4cf1676204b974525767026685f71cbf6b1bba
3
+ size 1646767740
c4_original-d=1024_l=24_h=8-0.5/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 8
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-0.5/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 128
44
+ global_val_batch_size: 16
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-0.5/out.log
58
+ logs: /admin/home-sy/dcnlp_logs
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=1024_l=24_h=8
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=1024_l=24_h=8-0.5
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 2
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 2000
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 8
123
+ z_loss_coefficient: 0.0001
c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e22cd01e23e35977c64e6d10b0b1150f143eecd4b677c314899ff636427504b
3
+ size 1646767740
c4_original-d=1024_l=24_h=8-1.0/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 8
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-1.0/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 128
44
+ global_val_batch_size: 16
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-1.0/out.log
58
+ logs: /admin/home-sy/dcnlp_logs
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=1024_l=24_h=8
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=1024_l=24_h=8-1.0
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 2
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 2000
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 8
123
+ z_loss_coefficient: 0.0001
c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0384d69859b449de6fdc8484653b43988d4775673f39f58788190f640ecdf616
3
+ size 1646767036
c4_original-d=1024_l=24_h=8-16.0/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 2
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: logs/26439/c4_original-d=1024_l=24_h=8-16.0/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 32
44
+ global_val_batch_size: 16
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: logs/26439/c4_original-d=1024_l=24_h=8-16.0/out.log
58
+ logs: logs/26439
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=1024_l=24_h=8
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=1024_l=24_h=8-16.0
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 8
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 2000
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 2
123
+ z_loss_coefficient: 0.0001
c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c27cfcf448d1f1db45b6fb4a5902f57879f36603394399768c141262ebd2a56
3
+ size 1646767740
c4_original-d=1024_l=24_h=8-2.0/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 8
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-2.0/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 128
44
+ global_val_batch_size: 16
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-2.0/out.log
58
+ logs: /admin/home-sy/dcnlp_logs
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=1024_l=24_h=8
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=1024_l=24_h=8-2.0
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 2
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 2000
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 8
123
+ z_loss_coefficient: 0.0001
c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:836fd49f4007798516f011caa995547fcbe637c2c28b28b6c518bf3bdc920369
3
+ size 1646766972
c4_original-d=1024_l=24_h=8-4.0/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 2
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: logs/25614/c4_original-d=1024_l=24_h=8-4.0/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 128
44
+ global_val_batch_size: 64
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: logs/25614/c4_original-d=1024_l=24_h=8-4.0/out.log
58
+ logs: logs/25614
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=1024_l=24_h=8
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=1024_l=24_h=8-4.0
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 8
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 2000
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 8
123
+ z_loss_coefficient: 0.0001
c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6380471557fc25d8fd64fd3f833a4285f13244c3b7ee89f3820bf6236a5d6b2
3
+ size 1646767740
c4_original-d=1024_l=24_h=8-8.0/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 8
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-8.0/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 128
44
+ global_val_batch_size: 16
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-8.0/out.log
58
+ logs: /admin/home-sy/dcnlp_logs
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=1024_l=24_h=8
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=1024_l=24_h=8-8.0
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 2
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 2000
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 8
123
+ z_loss_coefficient: 0.0001
c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:294960f1a93aee07fa9972ac71eb2b10254b4f2fa20c32baf74342668fdd2274
3
+ size 315725493
c4_original-d=512_l=8_h=4-0.25/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 8
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-0.25/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 128
44
+ global_val_batch_size: 16
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-0.25/out.log
58
+ logs: /admin/home-sy/dcnlp_logs
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=512_l=8_h=4
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=512_l=8_h=4-0.25
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 2
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 400
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 8
123
+ z_loss_coefficient: 0.0001
c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7566deea2ebb5dfeb2d568f80f9a8829fc500aa14e8a30ffc21f01aa57bd734
3
+ size 315725493
c4_original-d=512_l=8_h=4-0.5/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 8
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-0.5/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 128
44
+ global_val_batch_size: 16
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-0.5/out.log
58
+ logs: /admin/home-sy/dcnlp_logs
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=512_l=8_h=4
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=512_l=8_h=4-0.5
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 2
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 400
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 8
123
+ z_loss_coefficient: 0.0001
c4_original-d=512_l=8_h=4-1.0/checkpoints/epoch_6.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4f740c0d4f4bc9e11d6ca96facab4970997ca2c12f307c59c410f98ff0f66a3
3
+ size 315725493
c4_original-d=512_l=8_h=4-1.0/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 8
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-1.0/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 128
44
+ global_val_batch_size: 16
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-1.0/out.log
58
+ logs: /admin/home-sy/dcnlp_logs
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=512_l=8_h=4
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=512_l=8_h=4-1.0
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 2
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=512_l=8_h=4-1.0/checkpoints/epoch_6.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 400
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 8
123
+ z_loss_coefficient: 0.0001
c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d24828a5e1cca5e3a65fce61fe9ecd72ea7d8e09c6658ca07768a5a3fc6978a5
3
+ size 315725557
c4_original-d=512_l=8_h=4-16.0/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 8
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-16.0/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 128
44
+ global_val_batch_size: 16
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-16.0/out.log
58
+ logs: /admin/home-sy/dcnlp_logs
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=512_l=8_h=4
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=512_l=8_h=4-16.0
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 2
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 400
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 8
123
+ z_loss_coefficient: 0.0001
c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16a65618e30f4d1ae030457c00fe3159becb284c2c7d90fa518eebc4d3dfeb94
3
+ size 315725557
c4_original-d=512_l=8_h=4-2.0/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 8
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-2.0/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 128
44
+ global_val_batch_size: 16
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-2.0/out.log
58
+ logs: /admin/home-sy/dcnlp_logs
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=512_l=8_h=4
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=512_l=8_h=4-2.0
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 2
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 400
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 8
123
+ z_loss_coefficient: 0.0001
c4_original-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10d08659e80b72a44718bfe98c2ecde3964c62c3177ac5d4e15a09738520b602
3
+ size 315725557
c4_original-d=512_l=8_h=4-32.0/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 8
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-32.0/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 128
44
+ global_val_batch_size: 16
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-32.0/out.log
58
+ logs: /admin/home-sy/dcnlp_logs
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=512_l=8_h=4
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=512_l=8_h=4-32.0
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 2
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 400
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 8
123
+ z_loss_coefficient: 0.0001
c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2eb1fdc413c0b694f10375240e796fb6b27ed9207eff96d21967ea5abd2d776
3
+ size 315725557
c4_original-d=512_l=8_h=4-4.0/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 8
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-4.0/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 128
44
+ global_val_batch_size: 16
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-4.0/out.log
58
+ logs: /admin/home-sy/dcnlp_logs
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=512_l=8_h=4
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=512_l=8_h=4-4.0
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 2
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 400
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 8
123
+ z_loss_coefficient: 0.0001
c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:807c93b06719d7f853ff3a645115e177e0e70bd12196371750d03e38c4fc7683
3
+ size 315725557
c4_original-d=512_l=8_h=4-8.0/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 8
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-8.0/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 128
44
+ global_val_batch_size: 16
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-8.0/out.log
58
+ logs: /admin/home-sy/dcnlp_logs
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=512_l=8_h=4
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=512_l=8_h=4-8.0
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 2
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 400
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 8
123
+ z_loss_coefficient: 0.0001
c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae6efb2482178dde290567c7514d882f8b00bf67df82690a6984c912aca5d065
3
+ size 614923196
c4_original-d=576_l=24_h=8-0.25/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 8
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-0.25/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 128
44
+ global_val_batch_size: 16
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-0.25/out.log
58
+ logs: /admin/home-sy/dcnlp_logs
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=576_l=24_h=8
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=576_l=24_h=8-0.25
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 2
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 400
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 8
123
+ z_loss_coefficient: 0.0001
c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78a83ba24d2518ce88b3349fddd4a687ea23e1a65b22bba5556643bc5e70d704
3
+ size 614923196
c4_original-d=576_l=24_h=8-0.5/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 8
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-0.5/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 128
44
+ global_val_batch_size: 16
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-0.5/out.log
58
+ logs: /admin/home-sy/dcnlp_logs
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=576_l=24_h=8
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=576_l=24_h=8-0.5
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 2
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 400
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 8
123
+ z_loss_coefficient: 0.0001
c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efa37e9abd5ba9f09b8d28810350e004ade0efbb426e3f6fa364b6d630300192
3
+ size 614923196
c4_original-d=576_l=24_h=8-1.0/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 8
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-1.0/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 128
44
+ global_val_batch_size: 16
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-1.0/out.log
58
+ logs: /admin/home-sy/dcnlp_logs
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=576_l=24_h=8
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=576_l=24_h=8-1.0
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 2
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 400
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 8
123
+ z_loss_coefficient: 0.0001
c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:796d153dd609535364be9901f88d14361ef01a58c38d1b0cfbffa81f7d1e359a
3
+ size 614922428
c4_original-d=576_l=24_h=8-16.0/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 8
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: logs/186/c4_original-d=576_l=24_h=8-16.0/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 32
44
+ global_val_batch_size: 4
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: logs/186/c4_original-d=576_l=24_h=8-16.0/out.log
58
+ logs: logs/186
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=576_l=24_h=8
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=576_l=24_h=8-16.0
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 2
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 400
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 2
123
+ z_loss_coefficient: 0.0001
c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16c94bfeadfe8bff32399f264e582786f207d34e36a1f12598a6cf55abe0f10b
3
+ size 614923196
c4_original-d=576_l=24_h=8-2.0/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 8
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-2.0/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 128
44
+ global_val_batch_size: 16
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-2.0/out.log
58
+ logs: /admin/home-sy/dcnlp_logs
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=576_l=24_h=8
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=576_l=24_h=8-2.0
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 2
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 400
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 8
123
+ z_loss_coefficient: 0.0001
c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a495a3c15e7f598d2d404622514f8f6a0cd540f85b0d42593b086fa135314c57
3
+ size 614923196
c4_original-d=576_l=24_h=8-32.0/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 8
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-32.0/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 128
44
+ global_val_batch_size: 16
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-32.0/out.log
58
+ logs: /admin/home-sy/dcnlp_logs
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=576_l=24_h=8
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=576_l=24_h=8-32.0
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 2
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 400
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 8
123
+ z_loss_coefficient: 0.0001
c4_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f09b35d1b0c8c188186f63419cb560d473f485cd79940587b562e7a82657c0f1
3
+ size 614923196
c4_original-d=576_l=24_h=8-4.0/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 8
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-4.0/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 128
44
+ global_val_batch_size: 16
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-4.0/out.log
58
+ logs: /admin/home-sy/dcnlp_logs
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=576_l=24_h=8
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=576_l=24_h=8-4.0
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 2
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 400
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 8
123
+ z_loss_coefficient: 0.0001
c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a674ef037b53e720f575e92c3b64d20ec04913276559633f03c1953f39854f3
3
+ size 614922428
c4_original-d=576_l=24_h=8-8.0/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 2
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: logs/787/c4_original-d=576_l=24_h=8-8.0/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 128
44
+ global_val_batch_size: 64
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: logs/787/c4_original-d=576_l=24_h=8-8.0/out.log
58
+ logs: logs/787
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=576_l=24_h=8
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=576_l=24_h=8-8.0
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 8
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 400
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 8
123
+ z_loss_coefficient: 0.0001
c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3b8a4e13665af64f2b60e78412b823b4f1cf712f85f4736ea95aa1c56ec5057
3
+ size 42317749
c4_original-d=96_l=8_h=4-0.25/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 1
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=96_l=8_h=4-0.25/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 128
44
+ global_val_batch_size: 128
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: /admin/home-sy/dcnlp_logs/c4_original-d=96_l=8_h=4-0.25/out.log
58
+ logs: /admin/home-sy/dcnlp_logs
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=96_l=8_h=4
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=96_l=8_h=4-0.25
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 16
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 100
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 8
123
+ z_loss_coefficient: 0.0001
c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be7b97a11887e036413637725f12a57a34f74d8220ab35d860dc9a86ccf71153
3
+ size 42317749
c4_original-d=96_l=8_h=4-0.5/params.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_freq: 1
2
+ attn_activation: None
3
+ attn_name: auto
4
+ attn_seq_scalar: None
5
+ attn_seq_scalar_alpha: None
6
+ average: None
7
+ average_coefficients: None
8
+ beta1: 0.9
9
+ beta2: 0.95
10
+ checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=96_l=8_h=4-0.5/checkpoints
11
+ copy_codebase: False
12
+ data_key: txt
13
+ dataset_manifest: None
14
+ dataset_resampled: False
15
+ dataset_type: auto
16
+ ddp_static_graph: False
17
+ debug: False
18
+ delete_previous_checkpoint: True
19
+ device: cuda:0
20
+ disable_buffer: False
21
+ dist_backend: nccl
22
+ dist_url: env://
23
+ distill_model: None
24
+ distill_pretrained: None
25
+ distributed: True
26
+ epochs: 5
27
+ epochs_cooldown: None
28
+ eps: 1e-08
29
+ experimental_meta_device: False
30
+ ffn_type: swiglu
31
+ force_distributed: False
32
+ force_min_lr: 0.0
33
+ fsdp: False
34
+ fsdp_amp: False
35
+ fsdp_backward_prefetch: False
36
+ fsdp_checkpoint: False
37
+ fsdp_cpu_offload: False
38
+ fsdp_hybrid: False
39
+ fsdp_hybrid_o2: False
40
+ fsdp_limit_all_gathers: False
41
+ fsdp_pure_bf16: False
42
+ fsdp_use_orig_params: False
43
+ global_batch_size: 128
44
+ global_val_batch_size: 128
45
+ grad_checkpointing: False
46
+ grad_clip_norm: 1.0
47
+ hf_fsdp_block: None
48
+ hf_model: None
49
+ hf_seq_len: None
50
+ ignore_parse_errors: False
51
+ load_pretrained_state: False
52
+ local_rank: 0
53
+ log_every_n_steps: 20
54
+ log_level: 20
55
+ log_local: False
56
+ log_logit_mean: False
57
+ log_path: /admin/home-sy/dcnlp_logs/c4_original-d=96_l=8_h=4-0.5/out.log
58
+ logs: /admin/home-sy/dcnlp_logs
59
+ lr: 0.003
60
+ lr_cooldown_end: 3e-05
61
+ lr_cooldown_power: 1.0
62
+ lr_scheduler: cosine
63
+ model: d=96_l=8_h=4
64
+ model_norm: gain_only_lp_layer_norm
65
+ moe_capacity_factor: 1.25
66
+ moe_expert_model_parallelism: False
67
+ moe_freq: 0
68
+ moe_loss_weight: 0.1
69
+ moe_num_experts: None
70
+ moe_top_k: 2
71
+ moe_weight_parallelism: False
72
+ multiple_data_passes: False
73
+ name: c4_original-d=96_l=8_h=4-0.5
74
+ no_set_device_rank: False
75
+ optimizer: adamw
76
+ per_gpu_batch_size: 16
77
+ per_gpu_val_batch_size: 16
78
+ positional_embedding_type: rotary
79
+ precision: amp_bfloat16
80
+ pretrained: None
81
+ qk_norm: True
82
+ rank: 0
83
+ remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
84
+ remote_sync_frequency: 300
85
+ remote_sync_protocol: s3
86
+ report_to:
87
+ resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt
88
+ save_frequency: 1
89
+ save_most_recent: False
90
+ seed: 124
91
+ seq_len: 2048
92
+ skip_scheduler: False
93
+ squash_mask_left: True
94
+ target_mask_individual: 50400
95
+ target_mask_left: 50300
96
+ tensorboard: False
97
+ tensorboard_path:
98
+ torchcompile: False
99
+ torchscript: False
100
+ trace: False
101
+ train_data: None
102
+ train_data_mix_weights: None
103
+ train_data_upsampling_factors: None
104
+ train_num_samples: None
105
+ use_bn_sync: False
106
+ use_bnb_linear: None
107
+ val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
108
+ val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
109
+ val_frequency: 5
110
+ val_iter_ci: 10000
111
+ val_max_pop_ci: 300000
112
+ val_num_samples: None
113
+ val_seq_ci: True
114
+ val_tok_ci: True
115
+ vocab_size: 50432
116
+ wandb: False
117
+ wandb_notes:
118
+ wandb_project_name: open-lm
119
+ warmup: 100
120
+ wd: 0.033
121
+ workers: 2
122
+ world_size: 8
123
+ z_loss_coefficient: 0.0001