# @package _group_

# Masked-LM training configuration for a `roberta` model.
# Each top-level key below is one configuration group.

# Settings shared across the run: mixed precision and logging.
common:
  fp16: true
  log_format: json
  log_interval: 200

checkpoint:
  no_epoch_checkpoints: true

task:
  _name: masked_lm
  # NOTE(review): `???` looks like the OmegaConf "mandatory value" marker,
  # i.e. the data path must be supplied by the caller - confirm.
  data: ???
  sample_break_mode: complete
  tokens_per_sample: 512

criterion: masked_lm

dataset:
  batch_size: 16
  ignore_unused_valid_subsets: true

optimizer:
  _name: adam
  weight_decay: 0.01
  # Left as a plain scalar on purpose: to YAML this is the string
  # "(0.9,0.98)", presumably parsed into a tuple by the consumer - verify.
  adam_betas: (0.9,0.98)
  # NOTE(review): exponent form without a decimal point; some YAML 1.1
  # loaders read it as a string rather than a float - confirm the loader.
  adam_eps: 1e-06

lr_scheduler:
  _name: polynomial_decay
  warmup_updates: 10000

optimization:
  clip_norm: 0
  # `lr` and `update_freq` are single-element lists, not bare scalars.
  lr: [0.0005]
  max_update: 125000
  update_freq: [16]

model:
  _name: roberta
  # Same value as task.tokens_per_sample (512).
  max_positions: 512
  dropout: 0.1
  attention_dropout: 0.1