# Model configuration: the encoder checkpoint to load, the optimizer
# hyper-parameters, and the per-forward entity batch size.
#
# NOTE(review): this block structure was reconstructed from a source in which
# all keys sat on a single line. The nesting of `optimizer` and
# `entities_per_forward` under `model` is inferred from the key names —
# confirm against the code that consumes this config.
model:
  # Identifier of the pretrained transformer checkpoint.
  transformer_model: "microsoft/deberta-v3-base"

  optimizer:
    lr: 0.0001
    warmup_steps: 5000
    # Interpolation: resolved from the `training.trainer.max_steps` key, which
    # is defined outside this file.
    total_steps: ${training.trainer.max_steps}
    total_reset: 1
    weight_decay: 0.0
    # presumably a layer-wise learning-rate decay factor — TODO confirm
    lr_decay: 0.8
    # Parameter-name patterns; by the key name these are presumably excluded
    # from weight decay — verify against the optimizer implementation.
    no_decay_params:
      - "bias"
      - "LayerNorm.weight"

  entities_per_forward: 100