## IO
save_data: fr-en/data_spm
overwrite: True
seed: 1234
report_every: 100
valid_metrics: ["BLEU"]
tensorboard: true
tensorboard_log_dir: tensorboard

### Vocab
src_vocab: fr-en/joint.eole.vocab
tgt_vocab: fr-en/joint.eole.vocab
src_vocab_size: 50000
tgt_vocab_size: 50000
vocab_size_multiple: 8
share_vocab: True
n_sample: 0

data:
    corpus_1:
        path_src: hf://quickmt/quickmt-train.fr-en/fr
        path_tgt: hf://quickmt/quickmt-train.fr-en/en
        path_sco: hf://quickmt/quickmt-train.fr-en/sco
    valid:
        path_src: fr-en/dev.src
        path_tgt: fr-en/dev.tgt

transforms: [sentencepiece, filtertoolong]
transforms_configs:
    sentencepiece:
        src_subword_model: "fr-en/joint.spm.model"
        tgt_subword_model: "fr-en/joint.spm.model"
    filtertoolong:
        src_seq_length: 256
        tgt_seq_length: 256

training:
    # Run configuration
    model_path: fr-en/model
    keep_checkpoint: 4
    save_checkpoint_steps: 2000
    train_steps: 100000
    valid_steps: 2000

    # Train on a single GPU
    world_size: 1
    gpu_ranks: [0]

    # Batching
    batch_type: "tokens"
    batch_size: 8192
    valid_batch_size: 8192
    batch_size_multiple: 8
    accum_count: [16]
    accum_steps: [0]

    # Optimizer & Compute
    compute_dtype: "bf16"
    optim: "pagedadamw8bit"
    #optim: "adamw"
    learning_rate: 2.0
    warmup_steps: 10000
    decay_method: "noam"
    adam_beta2: 0.998

    # Data loading
    bucket_size: 128000
    num_workers: 4
    prefetch_factor: 100

    # Hyperparams
    dropout_steps: [0]
    dropout: [0.1]
    attention_dropout: [0.1]
    max_grad_norm: 2
    label_smoothing: 0.1
    average_decay: 0.0001
    param_init_method: xavier_uniform
    normalization: "tokens"

model:
    architecture: "transformer"
    layer_norm: standard
    share_embeddings: true
    share_decoder_embeddings: true
    add_ffnbias: true
    mlp_activation_fn: gelu
    add_estimator: false
    add_qkvbias: false
    norm_eps: 1e-6
    hidden_size: 1024
    encoder:
        layers: 8
    decoder:
        layers: 2
    heads: 8
    transformer_ff: 4096
    embeddings:
        word_vec_size: 1024
        position_encoding_type: "SinusoidalInterleaved"
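
### Usage sketch (not part of the original config; the file name and exact CLI flags are assumptions)
# Assuming this file is saved as fr-en/eole-config.yaml, training would typically be
# launched with the eole command-line tool, roughly as follows (flags may differ
# slightly across eole versions):
#
#   eole train -config fr-en/eole-config.yaml
#
# With the settings above, checkpoints are written under fr-en/model (model_path),
# keeping the 4 most recent, with validation and checkpointing every 2000 steps.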