---
# Training-run configuration (appears to be a nanoT5-style T5 pretraining run).
# NOTE(review): field meanings below are inferred from key names and common
# convention — confirm against the training script that consumes this file.
mode: pt  # presumably "pretrain" — TODO confirm the set of allowed modes
device: gpu
precision: bf16  # bfloat16 mixed precision
eval_only: false
predict_only: false
seed: 93789  # RNG seed for reproducibility
tokenizer:
  # Hugging Face Hub id of the tokenizer to load.
  name: BEE-spoke-data/hf_slimpajama-6B-28672-BPE-forT5
# null → presumably the runner creates/derives its own working dir — verify.
working_dir: null
model:
  liger: true  # NOTE(review): likely enables Liger kernel optimizations — confirm
  klass: local_t5  # model class selector used by the runner
  # Hub id — with random_init below, this presumably supplies config only, not weights.
  name: pszemraj/tFINE-850m-24x24-1024ctx
  # Config fields overridden on top of the base model's config.
  overwrite:
    dropout_rate: 0.0  # no dropout during pretraining
    num_decoder_layers: 16
    num_key_value_heads: 4  # GQA: KV heads shared across query heads
    num_layers: 16
    # NOTE(review): wandb tag "e32-d16" below suggests 32 encoder layers, but
    # num_layers here is 16 — confirm which is intended.
    use_gqa: true  # grouped-query attention
  # Extra fields injected into the config (beyond the base config's schema).
  add_config:
    is_bf16: false  # NOTE(review): false despite precision: bf16 above — verify
  checkpoint_path: ''  # empty → no checkpoint to resume from
  random_init: true  # weights randomly initialized (fresh pretraining)
  compile: true  # presumably torch.compile — TODO confirm
data:
  multi_task: true  # mix multiple objectives
  NTP: 0.3  # presumably next-token-prediction share of the task mix — verify
  input_length: 512
  max_seq_len: 512
  mlm_probability: 0.15  # masking/corruption probability
  mean_noise_span_length: 3.0  # T5-style span-corruption mean span length
  num_workers: 0  # dataloader workers; 0 → load in the main process
optim:
  name: adamwscale
  base_lr: 0.001
  batch_size: 128
  total_steps: 65536
  epochs: -1  # -1 → presumably run by total_steps rather than epochs — confirm
  warmup_steps: 5000
  lr_scheduler: cosine
  weight_decay: 0.01
  grad_clip: 1.0  # gradient-norm clipping threshold
  grad_acc: 16  # presumably gradient-accumulation steps — confirm
  final_cosine: 2.0e-05  # final LR floor of the cosine schedule(?)
eval:
  every_steps: 500  # evaluate every N optimizer steps
  steps: 0  # NOTE(review): 0 — full eval set, or eval disabled? verify semantics
checkpoint:
  every_steps: 1500  # save a checkpoint every N steps
logging:
  every_steps: 25
  grad_l2: true  # log gradient L2 norm
  weights_l2: true  # log weights L2 norm
  use_wandb: true
  # Passed to Weights & Biases run initialization.
  wandb_config:
    project: nanoT5
    entity: amazingvince
    tags:
    - gqa
    - large
    - e32-d16
    - 512 ctx
    mode: online