pretrain model
Browse files
scripts/pretrain-model.yaml
CHANGED
@@ -86,7 +86,7 @@ train:
|
|
86 |
max_steps:
|
87 |
|
88 |
# Limits the length of samples. Off by default (type: Optional[int], default: null)
|
89 |
-
|
90 |
|
91 |
# Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
|
92 |
tie_embeddings: true
|
@@ -121,15 +121,15 @@ optimizer:
|
|
121 |
|
122 |
init_args:
|
123 |
# (type: float, default: 0.001)
|
124 |
-
lr:
|
125 |
|
126 |
# (type: float, default: 0.01)
|
127 |
-
weight_decay:
|
128 |
|
129 |
# (type: tuple, default: (0.9,0.999))
|
130 |
betas:
|
131 |
- 0.9
|
132 |
-
- 0.
|
133 |
|
134 |
# How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
|
135 |
devices: auto
|
|
|
86 |
max_steps:
|
87 |
|
88 |
# Limits the length of samples. Off by default (type: Optional[int], default: null)
|
89 |
+
max_seq_length: 8193
|
90 |
|
91 |
# Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
|
92 |
tie_embeddings: true
|
|
|
121 |
|
122 |
init_args:
|
123 |
# (type: float, default: 0.001)
|
124 |
+
lr: 4.0e-04
|
125 |
|
126 |
# (type: float, default: 0.01)
|
127 |
+
weight_decay: 0.1
|
128 |
|
129 |
# (type: tuple, default: (0.9,0.999))
|
130 |
betas:
|
131 |
- 0.9
|
132 |
+
- 0.95
|
133 |
|
134 |
# How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
|
135 |
devices: auto
|