# baseline_en-de_64k_ep21 / marian-config.yaml
# This file defines the SGD-related parameters for Marian training runs.
# This is the teacher configuration.
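# Example invocation (illustrative sketch only; the data, vocab, and model file
# names below are placeholders, not part of this upload):
#   marian -c marian-config.yaml -t corpus.en corpus.de \
#          -v vocab.spm vocab.spm -m model.npz --devices 0 1 2 3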
seed: 141414
# cost
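# ce-sum sums the cross-entropy over target tokens instead of averaging it (ce-mean).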
cost-type: ce-sum
label-smoothing: 0.1
# optimizer config
optimizer: adam
learn-rate: 0.0005
lr-warmup: 4000
lr-decay-inv-sqrt: 4000
mini-batch-warmup: 4000
mini-batch-round-up: true
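# The four optimizer-params values below are presumably Adam's beta1, beta2,
# epsilon, and (decoupled) weight decay, in that order.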
optimizer-params:
- 0.9
- 0.999
- 1e-08
- 0.01
clip-norm: 0
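# dynamic-gradient-scaling (assumed reading): rescale or skip updates whose gradient
# norm deviates from the running average by more than 2 sigma, measured on a log scale.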
dynamic-gradient-scaling:
- 2
- log
exponential-smoothing: 1e-3
# alignment
guided-alignment-weight: 0
# batch-size related parameters
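# mini-batch-fit sizes batches automatically to fill the GPU workspace; maxi-batch is
# the read-ahead buffer (in batches) used for length-based sorting. mini-batch-words is
# presumably the target number of target labels per update, reached via the warmed-up
# dynamic batching configured above.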
mini-batch-fit: true
mini-batch-fit-step: 5
maxi-batch: 1000
mini-batch: 1000
mini-batch-words: 500000
max-length: 256
# validation-related parameters
# Note: valid-metrics is specified in code (cf. k_validMetricNames), since it is tied to the model pathname.
# Note: The decoding parameters below are used only for validation decoding; decoding parameters for deployment are configured separately.
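# early-stopping: stop if the validation metric fails to improve for 40 consecutive validation runs.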
early-stopping: 40
valid-mini-batch: 32
beam-size: 4
normalize: 1.0
word-penalty: 0.0
valid-max-length: 1000
n-best: false
# general parameters
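# Frequency/epoch units: e = epochs, u = updates, t = target labels
# (Mt / Gt = 10^6 / 10^9 target labels), so valid-freq: 1Gt validates once per billion target labels.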
logical-epoch: 1Gt
after: 40e
valid-freq: 1Gt
save-freq: 1Gt
disp-freq: 100Mt
disp-label-counts: true
lr-report: true
sync-sgd: true
shuffle: batches
shuffle-in-ram: true
disp-first: 10
# multi-node sharding mode, irrelevant for single-node
sharding: local
sync-freq: 200u
fp16: false
# https://machinetranslation.visualstudio.com/Marian/_git/autogen?path=/configs/trainingConfigTeacherPoloniumV2Top15.yml&version=GBmain&_a=contents
# for fp16 stability
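# Assumed meaning of the four cost-scaling values: initial scaling factor, number of
# updates between scale increases, growth multiplier, and minimum factor; with a
# multiplier of 1 and a floor of 256 the scale effectively stays pinned at 256.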
cost-scaling:
- 256.f
- 10000
- 1.f
- 256.f
# model structure
type: transformer
# Flo generates separate vocabs, so don't tie between source and target
tied-embeddings: true
tied-embeddings-all: true
tied-embeddings-src: false
# dimensions
dim-emb: 1024
enc-depth: 6
dec-depth: 6
transformer-dim-ffn: 8192
transformer-decoder-dim-ffn: 8192
transformer-depth-scaling: true
lemma-dim-emb: 0
# architecture details
transformer-decoder-autoreg: self-attention
transformer-tied-layers: []
# further transformer details
transformer-ffn-activation: relu
transformer-heads: 8
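# Postprocess codes: d = dropout, a = residual add, n = layer normalization,
# so "dan" is the standard post-norm transformer block and embeddings get dropout only.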
transformer-postprocess-emb: d
transformer-postprocess: dan
transformer-dropout: 0.1
transformer-dropout-attention: 0
transformer-dropout-ffn: 0.1
# data munging
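# A value of 0 disables the on-the-fly casing augmentation (upper-casing or
# English title-casing every Nth training sentence).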
all-caps-every: 0
english-title-case-every: 0
log-time-zone: PST8PDT
quiet-translation: true
keep-best: true
overwrite: false
interpolate-env-vars: true
log: train.log
valid-log: valid.log
valid-translation-output: valid.trg.output