sagadre
commited on
Commit
•
6ade3a7
1
Parent(s):
e207d53
overtraining model release
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt +3 -0
- c4_original-d=1024_l=24_h=8-0.25/params.txt +123 -0
- c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt +3 -0
- c4_original-d=1024_l=24_h=8-0.5/params.txt +123 -0
- c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt +3 -0
- c4_original-d=1024_l=24_h=8-1.0/params.txt +123 -0
- c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt +3 -0
- c4_original-d=1024_l=24_h=8-16.0/params.txt +123 -0
- c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt +3 -0
- c4_original-d=1024_l=24_h=8-2.0/params.txt +123 -0
- c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt +3 -0
- c4_original-d=1024_l=24_h=8-4.0/params.txt +123 -0
- c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt +3 -0
- c4_original-d=1024_l=24_h=8-8.0/params.txt +123 -0
- c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt +3 -0
- c4_original-d=512_l=8_h=4-0.25/params.txt +123 -0
- c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt +3 -0
- c4_original-d=512_l=8_h=4-0.5/params.txt +123 -0
- c4_original-d=512_l=8_h=4-1.0/checkpoints/epoch_6.pt +3 -0
- c4_original-d=512_l=8_h=4-1.0/params.txt +123 -0
- c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt +3 -0
- c4_original-d=512_l=8_h=4-16.0/params.txt +123 -0
- c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt +3 -0
- c4_original-d=512_l=8_h=4-2.0/params.txt +123 -0
- c4_original-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt +3 -0
- c4_original-d=512_l=8_h=4-32.0/params.txt +123 -0
- c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt +3 -0
- c4_original-d=512_l=8_h=4-4.0/params.txt +123 -0
- c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt +3 -0
- c4_original-d=512_l=8_h=4-8.0/params.txt +123 -0
- c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt +3 -0
- c4_original-d=576_l=24_h=8-0.25/params.txt +123 -0
- c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt +3 -0
- c4_original-d=576_l=24_h=8-0.5/params.txt +123 -0
- c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt +3 -0
- c4_original-d=576_l=24_h=8-1.0/params.txt +123 -0
- c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt +3 -0
- c4_original-d=576_l=24_h=8-16.0/params.txt +123 -0
- c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt +3 -0
- c4_original-d=576_l=24_h=8-2.0/params.txt +123 -0
- c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt +3 -0
- c4_original-d=576_l=24_h=8-32.0/params.txt +123 -0
- c4_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt +3 -0
- c4_original-d=576_l=24_h=8-4.0/params.txt +123 -0
- c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt +3 -0
- c4_original-d=576_l=24_h=8-8.0/params.txt +123 -0
- c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt +3 -0
- c4_original-d=96_l=8_h=4-0.25/params.txt +123 -0
- c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt +3 -0
- c4_original-d=96_l=8_h=4-0.5/params.txt +123 -0
c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5fed29f7cdafd92543a361433dc4cb9945ec9b57b8eefd1bd90261575ee64f27
|
3 |
+
size 1646767740
|
c4_original-d=1024_l=24_h=8-0.25/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 8
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-0.25/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 128
|
44 |
+
global_val_batch_size: 16
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-0.25/out.log
|
58 |
+
logs: /admin/home-sy/dcnlp_logs
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=1024_l=24_h=8
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=1024_l=24_h=8-0.25
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 2
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=1024_l=24_h=8-0.25/checkpoints/epoch_8.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 2000
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 8
|
123 |
+
z_loss_coefficient: 0.0001
|
c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1f4fb6936004ce754dc68494dd4cf1676204b974525767026685f71cbf6b1bba
|
3 |
+
size 1646767740
|
c4_original-d=1024_l=24_h=8-0.5/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 8
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-0.5/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 128
|
44 |
+
global_val_batch_size: 16
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-0.5/out.log
|
58 |
+
logs: /admin/home-sy/dcnlp_logs
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=1024_l=24_h=8
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=1024_l=24_h=8-0.5
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 2
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=1024_l=24_h=8-0.5/checkpoints/epoch_6.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 2000
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 8
|
123 |
+
z_loss_coefficient: 0.0001
|
c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4e22cd01e23e35977c64e6d10b0b1150f143eecd4b677c314899ff636427504b
|
3 |
+
size 1646767740
|
c4_original-d=1024_l=24_h=8-1.0/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 8
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-1.0/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 128
|
44 |
+
global_val_batch_size: 16
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-1.0/out.log
|
58 |
+
logs: /admin/home-sy/dcnlp_logs
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=1024_l=24_h=8
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=1024_l=24_h=8-1.0
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 2
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=1024_l=24_h=8-1.0/checkpoints/epoch_6.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 2000
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 8
|
123 |
+
z_loss_coefficient: 0.0001
|
c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0384d69859b449de6fdc8484653b43988d4775673f39f58788190f640ecdf616
|
3 |
+
size 1646767036
|
c4_original-d=1024_l=24_h=8-16.0/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 2
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: logs/26439/c4_original-d=1024_l=24_h=8-16.0/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 32
|
44 |
+
global_val_batch_size: 16
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: logs/26439/c4_original-d=1024_l=24_h=8-16.0/out.log
|
58 |
+
logs: logs/26439
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=1024_l=24_h=8
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=1024_l=24_h=8-16.0
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 8
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 2000
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 2
|
123 |
+
z_loss_coefficient: 0.0001
|
c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3c27cfcf448d1f1db45b6fb4a5902f57879f36603394399768c141262ebd2a56
|
3 |
+
size 1646767740
|
c4_original-d=1024_l=24_h=8-2.0/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 8
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-2.0/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 128
|
44 |
+
global_val_batch_size: 16
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-2.0/out.log
|
58 |
+
logs: /admin/home-sy/dcnlp_logs
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=1024_l=24_h=8
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=1024_l=24_h=8-2.0
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 2
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=1024_l=24_h=8-2.0/checkpoints/epoch_6.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 2000
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 8
|
123 |
+
z_loss_coefficient: 0.0001
|
c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:836fd49f4007798516f011caa995547fcbe637c2c28b28b6c518bf3bdc920369
|
3 |
+
size 1646766972
|
c4_original-d=1024_l=24_h=8-4.0/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 2
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: logs/25614/c4_original-d=1024_l=24_h=8-4.0/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 128
|
44 |
+
global_val_batch_size: 64
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: logs/25614/c4_original-d=1024_l=24_h=8-4.0/out.log
|
58 |
+
logs: logs/25614
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=1024_l=24_h=8
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=1024_l=24_h=8-4.0
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 8
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=1024_l=24_h=8-4.0/checkpoints/epoch_6.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 2000
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 8
|
123 |
+
z_loss_coefficient: 0.0001
|
c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e6380471557fc25d8fd64fd3f833a4285f13244c3b7ee89f3820bf6236a5d6b2
|
3 |
+
size 1646767740
|
c4_original-d=1024_l=24_h=8-8.0/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 8
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-8.0/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 128
|
44 |
+
global_val_batch_size: 16
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: /admin/home-sy/dcnlp_logs/c4_original-d=1024_l=24_h=8-8.0/out.log
|
58 |
+
logs: /admin/home-sy/dcnlp_logs
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=1024_l=24_h=8
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=1024_l=24_h=8-8.0
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 2
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=1024_l=24_h=8-8.0/checkpoints/epoch_6.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 2000
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 8
|
123 |
+
z_loss_coefficient: 0.0001
|
c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:294960f1a93aee07fa9972ac71eb2b10254b4f2fa20c32baf74342668fdd2274
|
3 |
+
size 315725493
|
c4_original-d=512_l=8_h=4-0.25/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 8
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-0.25/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 128
|
44 |
+
global_val_batch_size: 16
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-0.25/out.log
|
58 |
+
logs: /admin/home-sy/dcnlp_logs
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=512_l=8_h=4
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=512_l=8_h=4-0.25
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 2
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=512_l=8_h=4-0.25/checkpoints/epoch_2.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 400
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 8
|
123 |
+
z_loss_coefficient: 0.0001
|
c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b7566deea2ebb5dfeb2d568f80f9a8829fc500aa14e8a30ffc21f01aa57bd734
|
3 |
+
size 315725493
|
c4_original-d=512_l=8_h=4-0.5/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 8
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-0.5/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 128
|
44 |
+
global_val_batch_size: 16
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-0.5/out.log
|
58 |
+
logs: /admin/home-sy/dcnlp_logs
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=512_l=8_h=4
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=512_l=8_h=4-0.5
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 2
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=512_l=8_h=4-0.5/checkpoints/epoch_3.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 400
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 8
|
123 |
+
z_loss_coefficient: 0.0001
|
c4_original-d=512_l=8_h=4-1.0/checkpoints/epoch_6.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b4f740c0d4f4bc9e11d6ca96facab4970997ca2c12f307c59c410f98ff0f66a3
|
3 |
+
size 315725493
|
c4_original-d=512_l=8_h=4-1.0/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 8
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-1.0/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 128
|
44 |
+
global_val_batch_size: 16
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-1.0/out.log
|
58 |
+
logs: /admin/home-sy/dcnlp_logs
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=512_l=8_h=4
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=512_l=8_h=4-1.0
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 2
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=512_l=8_h=4-1.0/checkpoints/epoch_6.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 400
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 8
|
123 |
+
z_loss_coefficient: 0.0001
|
c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d24828a5e1cca5e3a65fce61fe9ecd72ea7d8e09c6658ca07768a5a3fc6978a5
|
3 |
+
size 315725557
|
c4_original-d=512_l=8_h=4-16.0/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 8
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-16.0/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 128
|
44 |
+
global_val_batch_size: 16
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-16.0/out.log
|
58 |
+
logs: /admin/home-sy/dcnlp_logs
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=512_l=8_h=4
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=512_l=8_h=4-16.0
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 2
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=512_l=8_h=4-16.0/checkpoints/epoch_6.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 400
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 8
|
123 |
+
z_loss_coefficient: 0.0001
|
c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:16a65618e30f4d1ae030457c00fe3159becb284c2c7d90fa518eebc4d3dfeb94
|
3 |
+
size 315725557
|
c4_original-d=512_l=8_h=4-2.0/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 8
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-2.0/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 128
|
44 |
+
global_val_batch_size: 16
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-2.0/out.log
|
58 |
+
logs: /admin/home-sy/dcnlp_logs
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=512_l=8_h=4
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=512_l=8_h=4-2.0
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 2
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=512_l=8_h=4-2.0/checkpoints/epoch_6.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 400
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 8
|
123 |
+
z_loss_coefficient: 0.0001
|
c4_original-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:10d08659e80b72a44718bfe98c2ecde3964c62c3177ac5d4e15a09738520b602
|
3 |
+
size 315725557
|
c4_original-d=512_l=8_h=4-32.0/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 8
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-32.0/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 128
|
44 |
+
global_val_batch_size: 16
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-32.0/out.log
|
58 |
+
logs: /admin/home-sy/dcnlp_logs
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=512_l=8_h=4
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=512_l=8_h=4-32.0
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 2
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=512_l=8_h=4-32.0/checkpoints/epoch_6.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 400
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 8
|
123 |
+
z_loss_coefficient: 0.0001
|
c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d2eb1fdc413c0b694f10375240e796fb6b27ed9207eff96d21967ea5abd2d776
|
3 |
+
size 315725557
|
c4_original-d=512_l=8_h=4-4.0/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 8
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-4.0/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 128
|
44 |
+
global_val_batch_size: 16
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-4.0/out.log
|
58 |
+
logs: /admin/home-sy/dcnlp_logs
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=512_l=8_h=4
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=512_l=8_h=4-4.0
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 2
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=512_l=8_h=4-4.0/checkpoints/epoch_6.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 400
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 8
|
123 |
+
z_loss_coefficient: 0.0001
|
c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:807c93b06719d7f853ff3a645115e177e0e70bd12196371750d03e38c4fc7683
|
3 |
+
size 315725557
|
c4_original-d=512_l=8_h=4-8.0/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 8
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-8.0/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 128
|
44 |
+
global_val_batch_size: 16
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: /admin/home-sy/dcnlp_logs/c4_original-d=512_l=8_h=4-8.0/out.log
|
58 |
+
logs: /admin/home-sy/dcnlp_logs
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=512_l=8_h=4
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=512_l=8_h=4-8.0
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 2
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=512_l=8_h=4-8.0/checkpoints/epoch_6.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 400
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 8
|
123 |
+
z_loss_coefficient: 0.0001
|
c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ae6efb2482178dde290567c7514d882f8b00bf67df82690a6984c912aca5d065
|
3 |
+
size 614923196
|
c4_original-d=576_l=24_h=8-0.25/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 8
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-0.25/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 128
|
44 |
+
global_val_batch_size: 16
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-0.25/out.log
|
58 |
+
logs: /admin/home-sy/dcnlp_logs
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=576_l=24_h=8
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=576_l=24_h=8-0.25
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 2
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=576_l=24_h=8-0.25/checkpoints/epoch_3.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 400
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 8
|
123 |
+
z_loss_coefficient: 0.0001
|
c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:78a83ba24d2518ce88b3349fddd4a687ea23e1a65b22bba5556643bc5e70d704
|
3 |
+
size 614923196
|
c4_original-d=576_l=24_h=8-0.5/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 8
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-0.5/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 128
|
44 |
+
global_val_batch_size: 16
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-0.5/out.log
|
58 |
+
logs: /admin/home-sy/dcnlp_logs
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=576_l=24_h=8
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=576_l=24_h=8-0.5
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 2
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=576_l=24_h=8-0.5/checkpoints/epoch_6.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 400
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 8
|
123 |
+
z_loss_coefficient: 0.0001
|
c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:efa37e9abd5ba9f09b8d28810350e004ade0efbb426e3f6fa364b6d630300192
|
3 |
+
size 614923196
|
c4_original-d=576_l=24_h=8-1.0/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 8
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-1.0/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 128
|
44 |
+
global_val_batch_size: 16
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-1.0/out.log
|
58 |
+
logs: /admin/home-sy/dcnlp_logs
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=576_l=24_h=8
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=576_l=24_h=8-1.0
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 2
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=576_l=24_h=8-1.0/checkpoints/epoch_6.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 400
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 8
|
123 |
+
z_loss_coefficient: 0.0001
|
c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:796d153dd609535364be9901f88d14361ef01a58c38d1b0cfbffa81f7d1e359a
|
3 |
+
size 614922428
|
c4_original-d=576_l=24_h=8-16.0/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 8
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: logs/186/c4_original-d=576_l=24_h=8-16.0/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 32
|
44 |
+
global_val_batch_size: 4
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: logs/186/c4_original-d=576_l=24_h=8-16.0/out.log
|
58 |
+
logs: logs/186
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=576_l=24_h=8
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=576_l=24_h=8-16.0
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 2
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=576_l=24_h=8-16.0/checkpoints/epoch_6.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 400
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 2
|
123 |
+
z_loss_coefficient: 0.0001
|
c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:16c94bfeadfe8bff32399f264e582786f207d34e36a1f12598a6cf55abe0f10b
|
3 |
+
size 614923196
|
c4_original-d=576_l=24_h=8-2.0/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 8
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-2.0/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 128
|
44 |
+
global_val_batch_size: 16
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-2.0/out.log
|
58 |
+
logs: /admin/home-sy/dcnlp_logs
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=576_l=24_h=8
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=576_l=24_h=8-2.0
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 2
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=576_l=24_h=8-2.0/checkpoints/epoch_6.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 400
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 8
|
123 |
+
z_loss_coefficient: 0.0001
|
c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a495a3c15e7f598d2d404622514f8f6a0cd540f85b0d42593b086fa135314c57
|
3 |
+
size 614923196
|
c4_original-d=576_l=24_h=8-32.0/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 8
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-32.0/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 128
|
44 |
+
global_val_batch_size: 16
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-32.0/out.log
|
58 |
+
logs: /admin/home-sy/dcnlp_logs
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=576_l=24_h=8
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=576_l=24_h=8-32.0
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 2
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=576_l=24_h=8-32.0/checkpoints/epoch_6.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 400
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 8
|
123 |
+
z_loss_coefficient: 0.0001
|
c4_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f09b35d1b0c8c188186f63419cb560d473f485cd79940587b562e7a82657c0f1
|
3 |
+
size 614923196
|
c4_original-d=576_l=24_h=8-4.0/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 8
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-4.0/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 128
|
44 |
+
global_val_batch_size: 16
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: /admin/home-sy/dcnlp_logs/c4_original-d=576_l=24_h=8-4.0/out.log
|
58 |
+
logs: /admin/home-sy/dcnlp_logs
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=576_l=24_h=8
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=576_l=24_h=8-4.0
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 2
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=576_l=24_h=8-4.0/checkpoints/epoch_6.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 400
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 8
|
123 |
+
z_loss_coefficient: 0.0001
|
c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2a674ef037b53e720f575e92c3b64d20ec04913276559633f03c1953f39854f3
|
3 |
+
size 614922428
|
c4_original-d=576_l=24_h=8-8.0/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 2
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: logs/787/c4_original-d=576_l=24_h=8-8.0/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 128
|
44 |
+
global_val_batch_size: 64
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: logs/787/c4_original-d=576_l=24_h=8-8.0/out.log
|
58 |
+
logs: logs/787
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=576_l=24_h=8
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=576_l=24_h=8-8.0
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 8
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=576_l=24_h=8-8.0/checkpoints/epoch_6.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 400
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 8
|
123 |
+
z_loss_coefficient: 0.0001
|
c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f3b8a4e13665af64f2b60e78412b823b4f1cf712f85f4736ea95aa1c56ec5057
|
3 |
+
size 42317749
|
c4_original-d=96_l=8_h=4-0.25/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 1
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=96_l=8_h=4-0.25/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 128
|
44 |
+
global_val_batch_size: 128
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: /admin/home-sy/dcnlp_logs/c4_original-d=96_l=8_h=4-0.25/out.log
|
58 |
+
logs: /admin/home-sy/dcnlp_logs
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=96_l=8_h=4
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=96_l=8_h=4-0.25
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 16
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=96_l=8_h=4-0.25/checkpoints/epoch_1.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 100
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 8
|
123 |
+
z_loss_coefficient: 0.0001
|
c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:be7b97a11887e036413637725f12a57a34f74d8220ab35d860dc9a86ccf71153
|
3 |
+
size 42317749
|
c4_original-d=96_l=8_h=4-0.5/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 1
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-d=96_l=8_h=4-0.5/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: True
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 128
|
44 |
+
global_val_batch_size: 128
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: /admin/home-sy/dcnlp_logs/c4_original-d=96_l=8_h=4-0.5/out.log
|
58 |
+
logs: /admin/home-sy/dcnlp_logs
|
59 |
+
lr: 0.003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: d=96_l=8_h=4
|
64 |
+
model_norm: gain_only_lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-d=96_l=8_h=4-0.5
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 16
|
77 |
+
per_gpu_val_batch_size: 16
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-d=96_l=8_h=4-0.5/checkpoints/epoch_1.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar']
|
108 |
+
val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 100
|
120 |
+
wd: 0.033
|
121 |
+
workers: 2
|
122 |
+
world_size: 8
|
123 |
+
z_loss_coefficient: 0.0001
|