# This file defines the SGD-related parameters for Marian training runs.
# This is the teacher configuration.
seed: 141414

# cost function
cost-type: ce-sum
label-smoothing: 0.1

# optimizer config
optimizer: adam
learn-rate: 0.0005
lr-warmup: 4000
lr-decay-inv-sqrt: 4000
mini-batch-warmup: 4000
mini-batch-round-up: true
optimizer-params:
  - 0.9
  - 0.999
  - 1e-08
  - 0.01
clip-norm: 0
dynamic-gradient-scaling:
  - 2
  - log
exponential-smoothing: 1e-3

# alignment
guided-alignment-weight: 0

# batch-size-related parameters
mini-batch-fit: true
mini-batch-fit-step: 5
maxi-batch: 1000
mini-batch: 1000
mini-batch-words: 500000
max-length: 256

# validation-related parameters
# Note: valid-metrics is specified in code (cf. k_validMetricNames), since it is tied to the model pathname.
# Note: The decoding parameters below apply only to validation decoding; decoding parameters for deployment are configured separately.
early-stopping: 40
valid-mini-batch: 32
beam-size: 4
normalize: 1.0
word-penalty: 0.0
valid-max-length: 1000
n-best: false

# general parameters
logical-epoch: 1Gt
after: 40e
valid-freq: 1Gt
save-freq: 1Gt
disp-freq: 100Mt
disp-label-counts: true
lr-report: true
sync-sgd: true
shuffle: batches
shuffle-in-ram: true
disp-first: 10
# multi-node sharding mode, irrelevant for single-node training
sharding: local
sync-freq: 200u
fp16: false

# https://machinetranslation.visualstudio.com/Marian/_git/autogen?path=/configs/trainingConfigTeacherPoloniumV2Top15.yml&version=GBmain&_a=contents
# for fp16 stability
cost-scaling:
  - 256.f
  - 10000
  - 1.f
  - 256.f

# model structure
type: transformer
# Flo generates separate vocabs, so don't tie between source and target
tied-embeddings: true
tied-embeddings-all: true
tied-embeddings-src: false

# dimensions
dim-emb: 1024
enc-depth: 6
dec-depth: 6
transformer-dim-ffn: 8192
transformer-decoder-dim-ffn: 8192
transformer-depth-scaling: true
lemma-dim-emb: 0

# architecture details
transformer-decoder-autoreg: self-attention
transformer-tied-layers: []

# further transformer details
transformer-ffn-activation: relu
transformer-heads: 8
transformer-postprocess-emb: d
transformer-postprocess: dan
transformer-dropout: 0.1
transformer-dropout-attention: 0
transformer-dropout-ffn: 0.1

# data munging
all-caps-every: 0
english-title-case-every: 0
log-time-zone: PST8PDT
quiet-translation: true
keep-best: true
overwrite: false
interpolate-env-vars: true
log: train.log
valid-log: valid.log
valid-translation-output: valid.trg.output
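
# Usage sketch (an assumption, not part of the original pipeline): if this file were
# saved as teacher.train.yml, a Marian training run could be launched roughly as
# shown below. All paths, vocab files, and device IDs are hypothetical placeholders;
# the actual training pipeline (e.g. Flo) supplies data, vocabularies, and the model
# path through its own configuration.
#
#   marian -c teacher.train.yml \
#     --train-sets corpus.src corpus.trg \
#     --vocabs vocab.src.spm vocab.trg.spm \
#     --valid-sets valid.src valid.trg \
#     --model model/model.npz \
#     --devices 0 1 2 3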