jon-tow committed
Commit 5a4f909
1 Parent(s): 4b2a54d

Update README.md

README.md CHANGED
@@ -12,6 +12,107 @@ This particular model is from a checkpoint captured at step 175,500 for an extra
 
 
 Note: Sequence length warmup was not used when moving up from 2048, but in hindsight it should have been applied.
 
+
+ ## Config
+
+ ```yaml
+ {
+   "eval_batch_size": 2,
+
+   "pipe-parallel-size": 1,
+   "model-parallel-size": 1,
+
+   "num-layers": 32,
+   "hidden-size": 2560,
+   "num-attention-heads": 32,
+   "seq-length": 4096,
+   "max-position-embeddings": 4096,
+
+   "norm": "layernorm",
+   "pos-emb": "rotary",
+   "rotary-pct": 0.25,
+   "no-weight-tying": true,
+   "gpt-j-residual": true,
+   "output-layer-parallelism": "column",
+
+   "init_method": "small_init",
+   "output_layer_init_method": "wang_init",
+
+   "attention-config": [[["flash"], 32]],
+   "scaled-upper-triang-masked-softmax-fusion": true,
+   "bias-gelu-fusion": true,
+
+   "optimizer": {
+     "type": "Adam",
+     "params": {
+       "lr": 1.6e-5,
+       "betas": [0.9, 0.95],
+       "eps": 1.0e-08
+     },
+   },
+   "min_lr": 8.0e-06,
+
+   "zero_optimization": {
+     "stage": 1,
+     "allgather_partitions": true,
+     "allgather_bucket_size": 500000000,
+     "overlap_comm": true,
+     "reduce_scatter": true,
+     "reduce_bucket_size": 500000000,
+     "contiguous_gradients": true,
+     "cpu_offload": false,
+   },
+   "train_micro_batch_size_per_gpu": 4,
+   "gradient-accumulation-steps": 4,
+   "data-impl": "mmap",
+
+   "checkpoint-activations": true,
+   "checkpoint-num-layers": 1,
+   "partition-activations": true,
+   "synchronize-each-layer": true,
+
+   "gradient_clipping": 1.0,
+   "weight-decay": 0.1,
+   "hidden-dropout": 0,
+   "attention-dropout": 0,
+
+   "fp16": {
+     "fp16": true,
+     "enabled": true,
+     "loss_scale": 0,
+     "loss_scale_window": 1000,
+     "initial_scale_power": 12,
+     "hysteresis": 2,
+     "min_loss_scale": 1,
+   },
+
+   "train-iters": 318000,
+   "lr-decay-iters": 318000,
+   "distributed-backend": "nccl",
+   "lr-decay-style": "cosine",
+   "warmup": 0.01,
+   "checkpoint-factor": 500,
+   "eval-interval": 50000,
+   "eval-iters": 10,
+   "extra-save-iters": [0, 512, 152001],
+
+   "train-data-paths": ["pile_0.87_deduped_text_document"],
+   "valid-data-paths": ["pile_0.87_deduped_text_document"],
+   "test-data-paths": ["pile_0.87_deduped_text_document"],
+
+   "tokenizer_type": "HFTokenizer",
+   "vocab-file": "20B_tokenizer.json",
+
+   "log-interval": 10,
+   "steps_per_print": 10,
+   "wall_clock_breakdown": true,
+   "log-grad-norm": true,
+
+   "launcher": "slurm",
+   "deepspeed_slurm": true,
+ }
+ ```
+
 ## Acknowledgements
 
 This work would not have been possible without the support of [Stability AI](https://stability.ai/).
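
Sequence length warmup, the technique the note above says was skipped, ramps the training context length gradually instead of jumping from 2048 to 4096 in one step. A minimal sketch of what such a schedule could look like, assuming a simple linear ramp (the step budget and rounding granularity below are illustrative assumptions, not values from this run):

```python
def warmed_up_seq_len(step: int,
                      start_len: int = 2048,
                      target_len: int = 4096,
                      warmup_steps: int = 1_000,
                      multiple_of: int = 64) -> int:
    """Linearly ramp the training sequence length from start_len to target_len.

    Illustrative sketch only: the step budget and rounding granularity are
    assumptions, not values taken from this training run.
    """
    if step >= warmup_steps:
        return target_len
    raw = start_len + (target_len - start_len) * step / warmup_steps
    # Round down to a multiple so batch shapes stay well-aligned for fused kernels.
    return max(start_len, int(raw) // multiple_of * multiple_of)

# Example: length grows from 2048 toward 4096 over the first 1,000 steps.
for s in (0, 250, 500, 1_000):
    print(s, warmed_up_seq_len(s))
```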
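As a sanity check on the batch settings in the config: the effective global batch size is `train_micro_batch_size_per_gpu` × `gradient-accumulation-steps` × the data-parallel degree, and the data-parallel degree depends on the GPU count, which the config does not record. The sketch below makes that arithmetic explicit; the 256-GPU cluster size is an assumed example, not a documented detail of this run:

```python
# Batch-size arithmetic implied by the config above. The GPU count is not
# part of the config, so it is an explicit (assumed) parameter here.
MICRO_BATCH = 4   # "train_micro_batch_size_per_gpu"
GRAD_ACCUM = 4    # "gradient-accumulation-steps"
SEQ_LEN = 4096    # "seq-length"

def tokens_per_step(num_gpus: int, pipe_parallel: int = 1, model_parallel: int = 1) -> int:
    """Tokens consumed per optimizer step for a given cluster size."""
    data_parallel = num_gpus // (pipe_parallel * model_parallel)
    global_batch = MICRO_BATCH * GRAD_ACCUM * data_parallel
    return global_batch * SEQ_LEN

# Example with an assumed 256 GPUs: 4 * 4 * 256 = 4096 sequences per step,
# i.e. 4096 * 4096 = 16,777,216 tokens per step.
print(tokens_per_step(num_gpus=256))

# Similarly, the LR "warmup" fraction of 0.01 over 318,000 "train-iters"
# corresponds to 0.01 * 318_000 = 3_180 warmup steps.
```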