save_data: run
## Where the vocab(s) will be written
src_vocab: run/bpe.vocab.src
tgt_vocab: run/bpe.vocab.tgt
overwrite: True

# Corpus opts:
data:
    europarl:
        path_src: ../DGTcorpora_tokenized/en_gl/europarl/partitions/en_train.txt
        path_tgt: ../DGTcorpora_tokenized/en_gl/europarl/partitions/gl_train.txt
        transforms: [bpe, filtertoolong]
        weight: 120
    opensub:
        path_src: ../DGTcorpora_tokenized/en_gl/opensub/partitions/en_train.txt
        path_tgt: ../DGTcorpora_tokenized/en_gl/opensub/partitions/gl_train.txt
        transforms: [bpe, filtertoolong]
        weight: 152
    opus:
        path_src: ../DGTcorpora_tokenized/en_gl/opus/partitions/en_train.txt
        path_tgt: ../DGTcorpora_tokenized/en_gl/opus/partitions/gl_train.txt
        transforms: [bpe, filtertoolong]
        weight: 160
    ted2020:
        path_src: ../DGTcorpora_tokenized/en_gl/ted2020/partitions/en_train.txt
        path_tgt: ../DGTcorpora_tokenized/en_gl/ted2020/partitions/gl_train.txt
        transforms: [bpe, filtertoolong]
        weight: 10
    corgaback:
        path_src: ../DGTcorpora_tokenized/en_gl/corgaback/partitions/en_train.txt
        path_tgt: ../DGTcorpora_tokenized/en_gl/corgaback/partitions/gl_train.txt
        transforms: [bpe, filtertoolong]
        weight: 15
    ccmatrix:
        path_src: ../DGTcorpora_tokenized/en_gl/ccmatrix/en_tok_dbo.txt
        path_tgt: ../DGTcorpora_tokenized/en_gl/ccmatrix/gl_tok_dbo.txt
        transforms: [bpe, filtertoolong]
        weight: 380 ##75 ## 25000000/13000000 = 2; 760/2 = 380 * 5 = 1900 (380/5=75)
    wikimatrix:
        path_src: ../DGTcorpora_tokenized/en_gl/wikimatrix/en.txt
        path_tgt: ../DGTcorpora_tokenized/en_gl/wikimatrix/gl.txt
        transforms: [bpe, filtertoolong]
        weight: 70 #25000000/450000 = 55 ; 760/55 = 14 ; 14 * 5 = 70
    cluvi:
        path_src: ../DGTcorpora_tokenized/en_gl/cluvi/en.txt
        path_tgt: ../DGTcorpora_tokenized/en_gl/cluvi/gl.txt
        transforms: [bpe, filtertoolong]
        weight: 70 #25000000/295000 = 84 ; 760/84 = 9 ; 9 * 10 = 90
    #wikimedia:
    #    path_src: ../DGTcorpora_tokenized/en_gl/wikimedia/en.txt
    #    path_tgt: ../DGTcorpora_tokenized/en_gl/wikimedia/gl.txt
    #    transforms: [bpe, filtertoolong]
    #    weight: 4
    #xlent:
    #    path_src: ../DGTcorpora_tokenized/en_gl/xlent/en.txt
    #    path_tgt: ../DGTcorpora_tokenized/en_gl/xlent/gl.txt
    #    transforms: [bpe, filtertoolong]
    #    weight: 50 #25000000/1600000=15; 760/15=50
    #linux:
    #    path_src: ../DGTcorpora_tokenized/en_gl/linux/en.txt
    #    path_tgt: ../DGTcorpora_tokenized/en_gl/linux/gl.txt
    #    transforms: [bpe, filtertoolong]
    #    weight: 20 #25000000/150000=166; 760/166=5 * 5 = 20
    valid:
        path_src: ../DGTcorpora_tokenized/en_gl/partitions/all-en_valid.txt
        path_tgt: ../DGTcorpora_tokenized/en_gl/partitions/all-gl_valid.txt
        transforms: [bpe, filtertoolong]

### Transform-related opts:
#### Subword
src_subword_model: ./bpe/en.code
tgt_subword_model: ./bpe/gl.code
src_subword_vocab: ./run/bpe.vocab.src
tgt_subword_vocab: ./run/bpe.vocab.tgt
#src_subword_model: ../sentencepiece/en-gl/en.sp.model
#tgt_subword_model: ../sentencepiece/en-gl/gl.sp.model
src_subword_type: bpe
tgt_subword_type: bpe
src_subword_nbest: 1
src_subword_alpha: 0.0
tgt_subword_nbest: 1
tgt_subword_alpha: 0.0

#### Filter
src_seq_length: 150
tgt_seq_length: 150

# Silently ignore empty lines in the data
skip_empty_level: silent

## Embeddings
src_embeddings: ../embeddings/en.emb.txt
tgt_embeddings: ../embeddings/gl.emb.txt
## supported types: GloVe, word2vec
embeddings_type: "word2vec"
# word_vec_size needs to match the pretrained embedding dimensions
word_vec_size: 300

# General opts
save_model: run/model
keep_checkpoint: 50
save_checkpoint_steps: 10000
average_decay: 0.0005
seed: 1234
report_every: 1000
train_steps: 200000
valid_steps: 10000

# Batching
queue_size: 10000
bucket_size: 32768
world_size: 1
gpu_ranks: [0]
batch_type: "tokens"
batch_size: 8192
#batch_size: 4096
valid_batch_size: 64
batch_size_multiple: 1
max_generator_batches: 2
accum_count: [4]
accum_steps: [0]

# Optimization
model_dtype: "fp16"
optim: "adam"
learning_rate: 2
warmup_steps: 8000
decay_method: "noam"
adam_beta2: 0.998
max_grad_norm: 0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
normalization: "tokens"

# Model
encoder_type: transformer
decoder_type: transformer
position_encoding: true
enc_layers: 6
dec_layers: 6
heads: 8
rnn_size: 512
word_vec_size: 512
transformer_ff: 2048
dropout_steps: [0]
dropout: [0.1]
attention_dropout: [0.1]
share_decoder_embeddings: true
share_embeddings: false
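
# ---------------------------------------------------------------------------
# Usage note (not part of the original options above). The option names match
# the OpenNMT-py 2.x YAML format, so a config like this would typically be
# driven as shown below; "config.yaml" is a placeholder filename, not one
# taken from this file:
#
#   onmt_build_vocab -config config.yaml -n_sample -1
#   onmt_train -config config.yaml
#
# The inline comments next to the corpus weights appear to follow a single
# pattern: with a reference size of 25,000,000 pairs and a reference weight
# of 760, each weight is computed roughly as
#
#   weight ≈ 760 / (25000000 / corpus_pairs) * boost
#
# where "boost" is a manual oversampling factor (e.g. *5 or *10) applied to
# the smaller corpora. This reading is inferred from the comments themselves
# and is not verified against the corpora.
# ---------------------------------------------------------------------------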