# This file defines the SGD-related parameters for Marian training runs.
# This is the teacher configuration.
seed: 141414
# cost
cost-type: ce-sum
label-smoothing: 0.1
# optimizer config
optimizer: adam
learn-rate: 0.0005
lr-warmup: 4000
lr-decay-inv-sqrt: 4000
mini-batch-warmup: 4000
mini-batch-round-up: true
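# Schedule sketch (assuming Marian's standard warmup/decay behaviour, with the
# plain numbers above counted in updates): the learning rate ramps up linearly
# over lr-warmup updates and then decays with the inverse square root of the
# update count, roughly
#   lr(step) ~ learn-rate * min(step / 4000, sqrt(4000 / step))
# e.g. ~0.000125 at step 1000, peaking at 0.0005 around step 4000, and ~0.00025
# by step 16000. mini-batch-warmup similarly grows the effective batch size over
# the first 4000 updates, with mini-batch-round-up rounding the intermediate
# sizes up.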
optimizer-params:
- 0.9
- 0.999
- 1e-08
- 0.01
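# Assumed Adam interpretation of optimizer-params: beta1 = 0.9, beta2 = 0.999,
# epsilon = 1e-08; the fourth value (0.01) is an extra parameter whose meaning
# depends on the Marian version (recent versions read it as a decoupled
# weight-decay factor), so verify it against the version in use.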
clip-norm: 0
dynamic-gradient-scaling:
- 2
- log
exponential-smoothing: 1e-3
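# Gradient handling sketch (assumed semantics): clip-norm: 0 turns global
# gradient-norm clipping off; instead dynamic-gradient-scaling rescales any
# gradient whose (log-space) norm deviates from the running average by more
# than 2 sigma. exponential-smoothing keeps an exponentially smoothed copy of
# the parameters (factor 1e-3) that is used for validation and saving.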
# alignment
guided-alignment-weight: 0
# batch-size related parameters
mini-batch-fit: true
mini-batch-fit-step: 5
maxi-batch: 1000
mini-batch: 1000
mini-batch-words: 500000
max-length: 256
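# Batching sketch (assumed semantics): with mini-batch-fit enabled, Marian
# measures how many sentences of each length bucket fit into the workspace
# (mini-batch-fit-step sets the bucket granularity), and mini-batch /
# mini-batch-words then act as upper bounds (here 500k target labels).
# maxi-batch pre-loads 1000 mini-batches for length-based sorting, and training
# sentences longer than max-length (256 tokens) are dropped unless
# max-length-crop is enabled.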
# validation-related parameters
# Note: valid-metrics is specified in code (cf. k_validMetricNames), since it is tied to the model pathname.
# Note: The decoding parameters below apply only to validation decoding; decoding parameters for deployment are configured separately.
early-stopping: 40
valid-mini-batch: 32
beam-size: 4
normalize: 1.0
word-penalty: 0.0
valid-max-length: 1000
n-best: false
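# Validation sketch (assumed semantics): each validation decodes the held-out
# set with beam size 4; normalize: 1.0 divides hypothesis scores by hypothesis
# length (length-normalisation exponent 1.0) and word-penalty: 0 adds no
# per-word bonus. early-stopping: 40 stops training after 40 consecutive
# validations without improvement, and n-best: false keeps only the single best
# translation per sentence.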
# general parameters
logical-epoch: 1Gt
after: 40e
valid-freq: 1Gt
save-freq: 1Gt
disp-freq: 100Mt
disp-label-counts: true
lr-report: true
sync-sgd: true
shuffle: batches
shuffle-in-ram: true
disp-first: 10
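# Scheduling units (assumed Marian conventions): a trailing "e" counts epochs,
# "u" counts updates, and "t" counts target labels, with k/M/G as multipliers.
# logical-epoch: 1Gt therefore redefines one "epoch" as 1e9 target labels,
# after: 40e ends training after 40 such logical epochs, validation and
# checkpointing run every 1e9 target labels, and progress is logged every
# 100e6 target labels (plus the first 10 updates via disp-first).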
# multi-node sharding mode, irrelevant for single-node
sharding: local
sync-freq: 200u
fp16: false
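# Sharding note (assumed semantics): sharding: local shards optimizer state only
# within each node and keeps a replica per node instead of sharding it across
# all nodes, and sync-freq: 200u re-synchronises those local shards every 200
# updates; both are no-ops for single-node training, as the comment above says.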
# https://machinetranslation.visualstudio.com/Marian/_git/autogen?path=/configs/trainingConfigTeacherPoloniumV2Top15.yml&version=GBmain&_a=contents
# for fp16 stability
cost-scaling:
- 256.f
- 10000
- 1.f
- 256.f
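# The cost-scaling block only matters once fp16 training is switched on (see the
# comment above); it is assumed to follow Marian's dynamic loss-scaling scheme,
# with 256.f as the initial scaling factor, 10000 as the check interval in
# updates, and the remaining values as the adjustment multiplier and floor.
# The exact ordering should be checked against the Marian version in use.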
# model structure
type: transformer
# Flo generates separate vocabs, so don't tie between source and target
tied-embeddings: true
tied-embeddings-all: false
tied-embeddings-src: false
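# Tying sketch (assumed semantics): tied-embeddings shares the target embedding
# matrix with the output projection, tied-embeddings-src would additionally
# share source and target embeddings, and tied-embeddings-all would tie all
# three; since Flo builds separate source and target vocabularies, only the
# target/output tying is enabled here.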
# dimensions
dim-emb: 1024
enc-depth: 6
dec-depth: 6
transformer-dim-ffn: 8192
transformer-decoder-dim-ffn: 8192
transformer-depth-scaling: true
lemma-dim-emb: 0
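# Size sketch: these settings describe a large teacher transformer with 1024-dim
# embeddings, 6 encoder and 6 decoder layers, and 8192-dim feed-forward blocks
# on both sides. transformer-depth-scaling is assumed to scale initialisation
# with layer depth for more stable training of deep stacks, and lemma-dim-emb: 0
# disables the extra lemma re-embedding used with factored vocabularies.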
# architecture details
transformer-decoder-autoreg: self-attention
transformer-tied-layers: []
# further transformer details
transformer-ffn-activation: relu
transformer-heads: 8
transformer-postprocess-emb: d
transformer-postprocess: dan
transformer-dropout: 0.1
transformer-dropout-attention: 0
transformer-dropout-ffn: 0.1
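# Layer layout (assumed meaning of the postprocess strings): "d" = dropout,
# "a" = residual add, "n" = layer norm, so transformer-postprocess: dan applies
# dropout, the residual connection and layer normalisation after each sub-layer,
# while transformer-postprocess-emb: d applies only dropout after the embeddings.
# Dropout of 0.1 is used on sub-layer outputs and FFN activations; attention
# weights are not dropped.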
# data munging
all-caps-every: 0
english-title-case-every: 0
log-time-zone: PST8PDT
quiet-translation: true
keep-best: true
overwrite: false
interpolate-env-vars: true
log: train.log
valid-log: valid.log
valid-translation-output: valid.trg.output
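# Housekeeping (assumed semantics): all-caps-every and english-title-case-every
# at 0 disable the casing data augmentation; quiet-translation suppresses
# per-sentence output while decoding the validation set; keep-best also saves
# the checkpoint with the best validation score; interpolate-env-vars allows
# ${VAR} placeholders in paths; and log, valid-log and valid-translation-output
# name the training log, the validation log and the file that receives the
# validation translations.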