wandb_version: 1 _n_gpu: desc: null value: 1 _name_or_path: desc: null value: ./ _wandb: desc: null value: cli_version: 0.12.10 framework: huggingface huggingface_version: 4.17.0.dev0 is_jupyter_run: false is_kaggle_kernel: false m: - 1: train/global_step 6: - 3 - 1: train/loss 5: 1 6: - 1 - 1: train/learning_rate 5: 1 6: - 1 - 1: train/epoch 5: 1 6: - 1 - 1: gradients/decoder\.cls\.predictions\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.cls\.predictions\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.cls\.predictions\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.cls\.predictions\.transform\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.cls\.predictions\.transform\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.cls\.predictions\.transform\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.cls\.predictions\.transform\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.cls\.predictions\.transform\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.cls\.predictions\.transform\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.cls\.predictions\.transform\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.cls\.predictions\.transform\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.cls\.predictions\.transform\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.cls\.predictions\.transform\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.cls\.predictions\.transform\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.cls\.predictions\.transform\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.crossattention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.23\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.crossattention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.22\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.crossattention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.21\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.crossattention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.20\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.crossattention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.19\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.crossattention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.18\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.crossattention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.17\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.crossattention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.16\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.crossattention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.15\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.crossattention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.14\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.crossattention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.13\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.crossattention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.12\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.crossattention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.11\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.crossattention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.10\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.crossattention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.9\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.crossattention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.8\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.crossattention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.7\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.crossattention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.6\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.crossattention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.5\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.crossattention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.4\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.crossattention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.3\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.crossattention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.2\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.crossattention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.1\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.intermediate\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.intermediate\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.intermediate\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.intermediate\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.intermediate\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.intermediate\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.crossattention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.output\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.output\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.output\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.output\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.output\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.output\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.output\.dense\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.output\.dense\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.output\.dense\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.output\.dense\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.output\.dense\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.output\.dense\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.self\.value\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.self\.value\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.self\.value\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.self\.value\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.self\.value\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.self\.value\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.self\.key\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.self\.key\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.self\.key\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.self\.key\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.self\.key\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.self\.key\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.self\.query\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.self\.query\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.self\.query\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.self\.query\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.self\.query\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.encoder\.layer\.0\.attention\.self\.query\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.embeddings\.LayerNorm\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.embeddings\.LayerNorm\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.embeddings\.LayerNorm\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.embeddings\.LayerNorm\.bias._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.embeddings\.LayerNorm\.bias.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.embeddings\.LayerNorm\.bias.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.embeddings\.position_embeddings\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.embeddings\.position_embeddings\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.embeddings\.position_embeddings\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.embeddings\.token_type_embeddings\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.embeddings\.token_type_embeddings\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.embeddings\.token_type_embeddings\.weight.bins 5: 1 6: - 1 - 1: gradients/decoder\.bert\.embeddings\.word_embeddings\.weight._type 5: 1 6: - 1 - 1: gradients/decoder\.bert\.embeddings\.word_embeddings\.weight.values 5: 1 6: - 1 - 1: gradients/decoder\.bert\.embeddings\.word_embeddings\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.2\.conv\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.2\.conv\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.2\.conv\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.2\.conv\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.2\.conv\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.2\.conv\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.1\.conv\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.1\.conv\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.1\.conv\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.1\.conv\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.1\.conv\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.1\.conv\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.0\.conv\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.0\.conv\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.0\.conv\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.0\.conv\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.0\.conv\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.adapter\.layers\.0\.conv\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.23\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.22\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.21\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.20\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.19\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.18\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.17\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.16\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.15\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.14\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.13\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.12\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.11\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.10\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.9\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.8\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.7\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.6\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.5\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.4\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.3\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.2\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.1\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.output_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.output_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.output_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.output_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.output_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.output_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.intermediate_dense\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.intermediate_dense\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.intermediate_dense\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.intermediate_dense\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.intermediate_dense\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.feed_forward\.intermediate_dense\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.final_layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.final_layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.final_layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.final_layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.final_layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.final_layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.out_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.out_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.out_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.out_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.out_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.out_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.v_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.v_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.v_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.v_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.v_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.v_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.k_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.k_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.k_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.k_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.k_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.k_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.q_proj\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.q_proj\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.q_proj\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.q_proj\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.q_proj\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.attention\.q_proj\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.layers\.0\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.weight_v._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.weight_v.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.weight_v.bins 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.weight_g._type 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.weight_g.values 5: 1 6: - 1 - 1: gradients/encoder\.encoder\.pos_conv_embed\.conv\.weight_g.bins 5: 1 6: - 1 - 1: gradients/encoder\.feature_projection\.projection\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.feature_projection\.projection\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.feature_projection\.projection\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.feature_projection\.projection\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.feature_projection\.projection\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.feature_projection\.projection\.bias.bins 5: 1 6: - 1 - 1: gradients/encoder\.feature_projection\.layer_norm\.weight._type 5: 1 6: - 1 - 1: gradients/encoder\.feature_projection\.layer_norm\.weight.values 5: 1 6: - 1 - 1: gradients/encoder\.feature_projection\.layer_norm\.weight.bins 5: 1 6: - 1 - 1: gradients/encoder\.feature_projection\.layer_norm\.bias._type 5: 1 6: - 1 - 1: gradients/encoder\.feature_projection\.layer_norm\.bias.values 5: 1 6: - 1 - 1: gradients/encoder\.feature_projection\.layer_norm\.bias.bins 5: 1 6: - 1 - 1: eval/loss 5: 1 6: - 1 - 1: eval/wer 5: 1 6: - 1 - 1: eval/runtime 5: 1 6: - 1 - 1: eval/samples_per_second 5: 1 6: - 1 - 1: eval/steps_per_second 5: 1 6: - 1 - 1: train/train_runtime 5: 1 6: - 1 - 1: train/train_samples_per_second 5: 1 6: - 1 - 1: train/train_steps_per_second 5: 1 6: - 1 - 1: train/total_flos 5: 1 6: - 1 - 1: train/train_loss 5: 1 6: - 1 python_version: 3.9.5 start_time: 1646227683 t: 1: - 1 - 5 - 11 3: - 13 4: 3.9.5 5: 0.12.10 6: 4.17.0.dev0 8: - 5 adafactor: desc: null value: false adam_beta1: desc: null value: 0.9 adam_beta2: desc: null value: 0.999 adam_epsilon: desc: null value: 1.0e-08 add_cross_attention: desc: null value: false architectures: desc: null value: - SpeechEncoderDecoderModel bad_words_ids: desc: null value: null bf16: desc: null value: false bf16_full_eval: desc: null value: false bos_token_id: desc: null value: null chunk_size_feed_forward: desc: null value: 0 cross_attention_hidden_size: desc: null value: null dataloader_drop_last: desc: null value: false dataloader_num_workers: desc: null value: 0 dataloader_pin_memory: desc: null value: true ddp_bucket_cap_mb: desc: null value: None ddp_find_unused_parameters: desc: null value: None debug: desc: null value: '[]' decoder: desc: null value: _name_or_path: decoder add_cross_attention: true architectures: - BertLMHeadModel attention_probs_dropout_prob: 0.0 bad_words_ids: null bos_token_id: null chunk_size_feed_forward: 0 classifier_dropout: null cross_attention_hidden_size: null decoder_start_token_id: null diversity_penalty: 0.0 do_sample: false early_stopping: false encoder_no_repeat_ngram_size: 0 eos_token_id: null finetuning_task: null forced_bos_token_id: null forced_eos_token_id: null gradient_checkpointing: false hidden_act: gelu hidden_dropout_prob: 0.1 hidden_size: 1024 id2label: '0': LABEL_0 '1': LABEL_1 initializer_range: 0.02 intermediate_size: 4096 is_decoder: true is_encoder_decoder: false label2id: LABEL_0: 0 LABEL_1: 1 layer_norm_eps: 1.0e-12 length_penalty: 1.0 max_length: 20 max_position_embeddings: 512 min_length: 0 model_type: bert no_repeat_ngram_size: 0 num_attention_heads: 16 num_beam_groups: 1 num_beams: 1 num_hidden_layers: 24 num_return_sequences: 1 output_attentions: false output_hidden_states: false output_scores: false pad_token_id: 0 position_embedding_type: absolute prefix: null problem_type: null pruned_heads: {} remove_invalid_values: false repetition_penalty: 1.0 return_dict: true return_dict_in_generate: false sep_token_id: null task_specific_params: null temperature: 1.0 tie_encoder_decoder: false tie_word_embeddings: true tokenizer_class: null top_k: 50 top_p: 1.0 torch_dtype: float32 torchscript: false transformers_version: 4.17.0.dev0 type_vocab_size: 2 use_bfloat16: false use_cache: false vocab_size: 30522 decoder_start_token_id: desc: null value: 101 deepspeed: desc: null value: None disable_tqdm: desc: null value: false diversity_penalty: desc: null value: 0.0 do_eval: desc: null value: true do_predict: desc: null value: false do_sample: desc: null value: false do_train: desc: null value: true early_stopping: desc: null value: false encoder: desc: null value: _name_or_path: facebook/wav2vec2-large-lv60 activation_dropout: 0.0 adapter_kernel_size: 3 adapter_stride: 2 add_adapter: true add_cross_attention: false apply_spec_augment: false architectures: - Wav2Vec2ForPreTraining attention_dropout: 0.0 bad_words_ids: null bos_token_id: 1 chunk_size_feed_forward: 0 classifier_proj_size: 256 codevector_dim: 768 contrastive_logits_temperature: 0.1 conv_bias: true conv_dim: - 512 - 512 - 512 - 512 - 512 - 512 - 512 conv_kernel: - 10 - 3 - 3 - 3 - 3 - 2 - 2 conv_stride: - 5 - 2 - 2 - 2 - 2 - 2 - 2 cross_attention_hidden_size: null ctc_loss_reduction: sum ctc_zero_infinity: false decoder_start_token_id: null diversity_loss_weight: 0.1 diversity_penalty: 0.0 do_sample: false do_stable_layer_norm: true early_stopping: false encoder_no_repeat_ngram_size: 0 eos_token_id: 2 feat_extract_activation: gelu feat_extract_dropout: 0.0 feat_extract_norm: layer feat_proj_dropout: 0.0 feat_quantizer_dropout: 0.0 final_dropout: 0.0 finetuning_task: null forced_bos_token_id: null forced_eos_token_id: null gradient_checkpointing: false hidden_act: gelu hidden_dropout: 0.0 hidden_dropout_prob: 0.0 hidden_size: 1024 id2label: '0': LABEL_0 '1': LABEL_1 initializer_range: 0.02 intermediate_size: 4096 is_decoder: false is_encoder_decoder: false label2id: LABEL_0: 0 LABEL_1: 1 layer_norm_eps: 1.0e-05 layerdrop: 0.0 length_penalty: 1.0 mask_feature_length: 10 mask_feature_min_masks: 0 mask_feature_prob: 0.0 mask_time_length: 10 mask_time_min_masks: 2 mask_time_prob: 0.0 max_length: 20 min_length: 0 model_type: wav2vec2 no_repeat_ngram_size: 0 num_adapter_layers: 3 num_attention_heads: 16 num_beam_groups: 1 num_beams: 1 num_codevector_groups: 2 num_codevectors_per_group: 320 num_conv_pos_embedding_groups: 16 num_conv_pos_embeddings: 128 num_feat_extract_layers: 7 num_hidden_layers: 24 num_negatives: 100 num_return_sequences: 1 output_attentions: false output_hidden_size: 1024 output_hidden_states: false output_scores: false pad_token_id: 0 prefix: null problem_type: null proj_codevector_dim: 768 pruned_heads: {} remove_invalid_values: false repetition_penalty: 1.0 return_dict: true return_dict_in_generate: false sep_token_id: null task_specific_params: null tdnn_dilation: - 1 - 2 - 3 - 1 - 1 tdnn_dim: - 512 - 512 - 512 - 512 - 1500 tdnn_kernel: - 5 - 3 - 3 - 1 - 1 temperature: 1.0 tie_encoder_decoder: false tie_word_embeddings: true tokenizer_class: null top_k: 50 top_p: 1.0 torch_dtype: null torchscript: false transformers_version: 4.17.0.dev0 use_bfloat16: false use_weighted_layer_sum: false vocab_size: 32 xvector_output_dim: 512 encoder_no_repeat_ngram_size: desc: null value: 0 eos_token_id: desc: null value: 102 eval_accumulation_steps: desc: null value: None eval_batch_size: desc: null value: 8 eval_steps: desc: null value: 1500 evaluation_strategy: desc: null value: steps finetuning_task: desc: null value: null forced_bos_token_id: desc: null value: null forced_eos_token_id: desc: null value: null fp16: desc: null value: true fp16_backend: desc: null value: auto fp16_full_eval: desc: null value: false fp16_opt_level: desc: null value: O1 generation_max_length: desc: null value: 40 generation_num_beams: desc: null value: 1 gradient_accumulation_steps: desc: null value: 4 gradient_checkpointing: desc: null value: true greater_is_better: desc: null value: None group_by_length: desc: null value: true half_precision_backend: desc: null value: amp hub_model_id: desc: null value: None hub_strategy: desc: null value: every_save hub_token: desc: null value: id2label: desc: null value: '0': LABEL_0 '1': LABEL_1 ignore_data_skip: desc: null value: false is_decoder: desc: null value: false is_encoder_decoder: desc: null value: true label2id: desc: null value: LABEL_0: 0 LABEL_1: 1 label_names: desc: null value: None label_smoothing_factor: desc: null value: 0.0 learning_rate: desc: null value: 3.0e-05 length_column_name: desc: null value: input_length length_penalty: desc: null value: 1.0 load_best_model_at_end: desc: null value: false local_rank: desc: null value: -1 log_level: desc: null value: -1 log_level_replica: desc: null value: -1 log_on_each_node: desc: null value: true logging_dir: desc: null value: ./runs/Mar02_13-27-31_sanchit--v100 logging_first_step: desc: null value: false logging_nan_inf_filter: desc: null value: true logging_steps: desc: null value: 1 logging_strategy: desc: null value: steps lr_scheduler_type: desc: null value: linear max_grad_norm: desc: null value: 1.0 max_length: desc: null value: 50 max_steps: desc: null value: -1 metric_for_best_model: desc: null value: None min_length: desc: null value: 0 model_type: desc: null value: speech-encoder-decoder mp_parameters: desc: null value: '' no_cuda: desc: null value: false no_repeat_ngram_size: desc: null value: 0 num_beam_groups: desc: null value: 1 num_beams: desc: null value: 1 num_return_sequences: desc: null value: 1 num_train_epochs: desc: null value: 5.0 optim: desc: null value: adamw_hf output_attentions: desc: null value: false output_dir: desc: null value: ./ output_hidden_states: desc: null value: false output_scores: desc: null value: false overwrite_output_dir: desc: null value: true pad_token_id: desc: null value: 0 past_index: desc: null value: -1 per_device_eval_batch_size: desc: null value: 8 per_device_train_batch_size: desc: null value: 8 per_gpu_eval_batch_size: desc: null value: None per_gpu_train_batch_size: desc: null value: None predict_with_generate: desc: null value: true prediction_loss_only: desc: null value: false prefix: desc: null value: null problem_type: desc: null value: null processor_class: desc: null value: Wav2Vec2Processor pruned_heads: desc: null value: {} push_to_hub: desc: null value: true push_to_hub_model_id: desc: null value: None push_to_hub_organization: desc: null value: None push_to_hub_token: desc: null value: remove_invalid_values: desc: null value: false remove_unused_columns: desc: null value: true repetition_penalty: desc: null value: 1.0 report_to: desc: null value: '[''tensorboard'', ''wandb'']' resume_from_checkpoint: desc: null value: None return_dict: desc: null value: true return_dict_in_generate: desc: null value: false run_name: desc: null value: ./ save_on_each_node: desc: null value: false save_steps: desc: null value: 1500 save_strategy: desc: null value: steps save_total_limit: desc: null value: 1 seed: desc: null value: 42 sep_token_id: desc: null value: null sharded_ddp: desc: null value: '[]' skip_memory_metrics: desc: null value: true sortish_sampler: desc: null value: false task_specific_params: desc: null value: null temperature: desc: null value: 1.0 tf32: desc: null value: None tie_encoder_decoder: desc: null value: false tie_word_embeddings: desc: null value: false tokenizer_class: desc: null value: null top_k: desc: null value: 50 top_p: desc: null value: 1.0 torch_dtype: desc: null value: torch.float32 torchscript: desc: null value: false tpu_metrics_debug: desc: null value: false tpu_num_cores: desc: null value: None train_batch_size: desc: null value: 8 transformers_version: desc: null value: null use_bfloat16: desc: null value: false use_cache: desc: null value: false use_legacy_prediction_loop: desc: null value: false warmup_ratio: desc: null value: 0.0 warmup_steps: desc: null value: 1000 weight_decay: desc: null value: 0.0 xpu_backend: desc: null value: None