# This file defines the SGD-related parameters for Marian training runs.
# This is the teacher configuration.
seed: 141414
# cost
cost-type: ce-sum
label-smoothing: 0.1
# optimizer config
optimizer: adam
learn-rate: 0.0005
lr-warmup: 4000
lr-decay-inv-sqrt: 4000
mini-batch-warmup: 4000
mini-batch-round-up: true
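# Schedule sketch (assuming Marian's standard warmup/decay behaviour, with the
# plain numbers above counted in updates): the learning rate ramps up linearly
# over lr-warmup updates and then decays with the inverse square root of the
# update count, roughly
#   lr(step) ~ learn-rate * min(step / 4000, sqrt(4000 / step))
# e.g. ~0.000125 at step 1000, peaking at 0.0005 around step 4000, and ~0.00025
# by step 16000. mini-batch-warmup similarly grows the effective batch size over
# the first 4000 updates, with mini-batch-round-up rounding the intermediate
# sizes up.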
optimizer-params:
- 0.9
- 0.999
- 1e-08
- 0.01
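# Assumed Adam interpretation of optimizer-params: beta1 = 0.9, beta2 = 0.999,
# epsilon = 1e-08; the fourth value (0.01) is an extra parameter whose meaning
# depends on the Marian version (recent versions read it as a decoupled
# weight-decay factor), so verify it against the version in use.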
clip-norm: 0
dynamic-gradient-scaling:
- 2
- log
exponential-smoothing: 1e-3
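# Gradient handling sketch (assumed semantics): clip-norm: 0 turns global
# gradient-norm clipping off; instead dynamic-gradient-scaling rescales any
# gradient whose (log-space) norm deviates from the running average by more
# than 2 sigma. exponential-smoothing keeps an exponentially smoothed copy of
# the parameters (factor 1e-3) that is used for validation and saving.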
# alignment
guided-alignment-weight: 0
# batch-size related parameters
mini-batch-fit: true
mini-batch-fit-step: 5
maxi-batch: 1000
mini-batch: 1000
mini-batch-words: 500000
max-length: 256
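# Batching sketch (assumed semantics): with mini-batch-fit enabled, Marian
# measures how many sentences of each length bucket fit into the workspace
# (mini-batch-fit-step sets the bucket granularity), and mini-batch /
# mini-batch-words then act as upper bounds (here 500k target labels).
# maxi-batch pre-loads 1000 mini-batches for length-based sorting, and training
# sentences longer than max-length (256 tokens) are dropped unless
# max-length-crop is enabled.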
# validation-related parameters
# Note: valid-metrics is specified in code (cf. k_validMetricNames), since it is tied to the model pathname.
# Note: The decoding parameters below apply only to validation decoding; decoding parameters for deployment are configured separately.
early-stopping: 40
valid-mini-batch: 32
beam-size: 4
normalize: 1.0
word-penalty: 0.0
valid-max-length: 1000
n-best: false
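# Validation sketch (assumed semantics): each validation decodes the held-out
# set with beam size 4; normalize: 1.0 divides hypothesis scores by hypothesis
# length (length-normalisation exponent 1.0) and word-penalty: 0 adds no
# per-word bonus. early-stopping: 40 stops training after 40 consecutive
# validations without improvement, and n-best: false keeps only the single best
# translation per sentence.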
# general parameters
logical-epoch: 1Gt
after: 40e
valid-freq: 1Gt
save-freq: 1Gt
disp-freq: 100Mt
disp-label-counts: true
lr-report: true
sync-sgd: true
shuffle: batches
shuffle-in-ram: true
disp-first: 10
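# Scheduling units (assumed Marian conventions): a trailing "e" counts epochs,
# "u" counts updates, and "t" counts target labels, with k/M/G as multipliers.
# logical-epoch: 1Gt therefore redefines one "epoch" as 1e9 target labels,
# after: 40e ends training after 40 such logical epochs, validation and
# checkpointing run every 1e9 target labels, and progress is logged every
# 100e6 target labels (plus the first 10 updates via disp-first).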
# multi-node sharding mode, irrelevant for single-node
sharding: local
sync-freq: 200u
fp16: false
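# Sharding note (assumed semantics): sharding: local shards optimizer state only
# within each node and keeps a replica per node instead of sharding it across
# all nodes, and sync-freq: 200u re-synchronises those local shards every 200
# updates; both are no-ops for single-node training, as the comment above says.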
# https://machinetranslation.visualstudio.com/Marian/_git/autogen?path=/configs/trainingConfigTeacherPoloniumV2Top15.yml&version=GBmain&_a=contents
# for fp16 stability
cost-scaling:
- 256.f
- 10000
- 1.f
- 256.f
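# The cost-scaling block only matters once fp16 training is switched on (see the
# comment above); it is assumed to follow Marian's dynamic loss-scaling scheme,
# with 256.f as the initial scaling factor, 10000 as the check interval in
# updates, and the remaining values as the adjustment multiplier and floor.
# The exact ordering should be checked against the Marian version in use.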
# model structure
type: transformer
# Flo generates separate vocabs, so don't tie between source and target
tied-embeddings: true
tied-embeddings-all: false
tied-embeddings-src: false
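# Tying sketch (assumed semantics): tied-embeddings shares the target embedding
# matrix with the output projection, tied-embeddings-src would additionally
# share source and target embeddings, and tied-embeddings-all would tie all
# three; since Flo builds separate source and target vocabularies, only the
# target/output tying is enabled here.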
# dimensions
dim-emb: 1024
enc-depth: 6
dec-depth: 6
transformer-dim-ffn: 8192
transformer-decoder-dim-ffn: 8192
transformer-depth-scaling: true
lemma-dim-emb: 0
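# Size sketch: these settings describe a large teacher transformer with 1024-dim
# embeddings, 6 encoder and 6 decoder layers, and 8192-dim feed-forward blocks
# on both sides. transformer-depth-scaling is assumed to scale initialisation
# with layer depth for more stable training of deep stacks, and lemma-dim-emb: 0
# disables the extra lemma re-embedding used with factored vocabularies.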
# architecture details
transformer-decoder-autoreg: self-attention
transformer-tied-layers: []
# further transformer details
transformer-ffn-activation: relu
transformer-heads: 8
transformer-postprocess-emb: d
transformer-postprocess: dan
transformer-dropout: 0.1
transformer-dropout-attention: 0
transformer-dropout-ffn: 0.1
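# Layer layout (assumed meaning of the postprocess strings): "d" = dropout,
# "a" = residual add, "n" = layer norm, so transformer-postprocess: dan applies
# dropout, the residual connection and layer normalisation after each sub-layer,
# while transformer-postprocess-emb: d applies only dropout after the embeddings.
# Dropout of 0.1 is used on sub-layer outputs and FFN activations; attention
# weights are not dropped.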
# data munging
all-caps-every: 0
english-title-case-every: 0
log-time-zone: PST8PDT
quiet-translation: true
keep-best: true
overwrite: false
interpolate-env-vars: true
log: train.log
valid-log: valid.log
valid-translation-output: valid.trg.output
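# Housekeeping (assumed semantics): all-caps-every and english-title-case-every
# at 0 disable the casing data augmentation; quiet-translation suppresses
# per-sentence output while decoding the validation set; keep-best also saves
# the checkpoint with the best validation score; interpolate-env-vars allows
# ${VAR} placeholders in paths; and log, valid-log and valid-translation-output
# name the training log, the validation log and the file that receives the
# validation translations.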