diff --git "a/config.yaml" "b/config.yaml" new file mode 100644--- /dev/null +++ "b/config.yaml" @@ -0,0 +1,5263 @@ +wandb_version: 1 + +_n_gpu: + desc: null + value: 1 +_name_or_path: + desc: null + value: dmis-lab/biobert-large-cased-v1.1 +_wandb: + desc: null + value: + cli_version: 0.12.9 + framework: huggingface + huggingface_version: 4.12.5 + is_jupyter_run: false + is_kaggle_kernel: false + m: + - 1: train/global_step + 6: + - 3 + - 1: gradients/classifier\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/classifier\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/classifier\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/classifier\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/classifier\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/classifier\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.intermediate\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.intermediate\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.intermediate\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.intermediate\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.intermediate\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.intermediate\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.self\.value\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.self\.value\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.self\.value\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.self\.value\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.self\.value\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.self\.value\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.self\.key\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.self\.key\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.self\.key\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.self\.key\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.self\.key\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.self\.key\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.self\.query\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.self\.query\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.self\.query\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.self\.query\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.self\.query\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.23\.attention\.self\.query\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.intermediate\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.intermediate\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.intermediate\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.intermediate\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.intermediate\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.intermediate\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.self\.value\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.self\.value\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.self\.value\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.self\.value\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.self\.value\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.self\.value\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.self\.key\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.self\.key\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.self\.key\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.self\.key\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.self\.key\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.self\.key\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.self\.query\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.self\.query\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.self\.query\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.self\.query\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.self\.query\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.22\.attention\.self\.query\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.intermediate\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.intermediate\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.intermediate\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.intermediate\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.intermediate\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.intermediate\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.self\.value\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.self\.value\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.self\.value\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.self\.value\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.self\.value\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.self\.value\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.self\.key\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.self\.key\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.self\.key\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.self\.key\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.self\.key\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.self\.key\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.self\.query\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.self\.query\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.self\.query\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.self\.query\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.self\.query\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.21\.attention\.self\.query\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.intermediate\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.intermediate\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.intermediate\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.intermediate\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.intermediate\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.intermediate\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.self\.value\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.self\.value\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.self\.value\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.self\.value\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.self\.value\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.self\.value\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.self\.key\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.self\.key\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.self\.key\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.self\.key\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.self\.key\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.self\.key\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.self\.query\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.self\.query\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.self\.query\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.self\.query\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.self\.query\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.20\.attention\.self\.query\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.intermediate\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.intermediate\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.intermediate\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.intermediate\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.intermediate\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.intermediate\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.self\.value\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.self\.value\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.self\.value\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.self\.value\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.self\.value\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.self\.value\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.self\.key\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.self\.key\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.self\.key\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.self\.key\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.self\.key\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.self\.key\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.self\.query\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.self\.query\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.self\.query\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.self\.query\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.self\.query\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.19\.attention\.self\.query\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.intermediate\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.intermediate\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.intermediate\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.intermediate\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.intermediate\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.intermediate\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.self\.value\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.self\.value\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.self\.value\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.self\.value\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.self\.value\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.self\.value\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.self\.key\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.self\.key\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.self\.key\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.self\.key\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.self\.key\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.self\.key\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.self\.query\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.self\.query\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.self\.query\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.self\.query\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.self\.query\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.18\.attention\.self\.query\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.intermediate\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.intermediate\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.intermediate\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.intermediate\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.intermediate\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.intermediate\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.self\.value\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.self\.value\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.self\.value\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.self\.value\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.self\.value\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.self\.value\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.self\.key\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.self\.key\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.self\.key\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.self\.key\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.self\.key\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.self\.key\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.self\.query\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.self\.query\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.self\.query\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.self\.query\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.self\.query\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.17\.attention\.self\.query\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.intermediate\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.intermediate\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.intermediate\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.intermediate\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.intermediate\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.intermediate\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.self\.value\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.self\.value\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.self\.value\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.self\.value\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.self\.value\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.self\.value\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.self\.key\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.self\.key\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.self\.key\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.self\.key\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.self\.key\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.self\.key\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.self\.query\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.self\.query\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.self\.query\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.self\.query\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.self\.query\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.16\.attention\.self\.query\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.intermediate\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.intermediate\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.intermediate\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.intermediate\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.intermediate\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.intermediate\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.self\.value\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.self\.value\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.self\.value\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.self\.value\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.self\.value\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.self\.value\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.self\.key\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.self\.key\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.self\.key\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.self\.key\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.self\.key\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.self\.key\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.self\.query\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.self\.query\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.self\.query\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.self\.query\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.self\.query\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.15\.attention\.self\.query\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.intermediate\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.intermediate\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.intermediate\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.intermediate\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.intermediate\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.intermediate\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.self\.value\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.self\.value\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.self\.value\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.self\.value\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.self\.value\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.self\.value\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.self\.key\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.self\.key\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.self\.key\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.self\.key\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.self\.key\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.self\.key\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.self\.query\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.self\.query\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.self\.query\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.self\.query\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.self\.query\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.14\.attention\.self\.query\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.intermediate\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.intermediate\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.intermediate\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.intermediate\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.intermediate\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.intermediate\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.self\.value\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.self\.value\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.self\.value\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.self\.value\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.self\.value\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.self\.value\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.self\.key\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.self\.key\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.self\.key\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.self\.key\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.self\.key\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.self\.key\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.self\.query\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.self\.query\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.self\.query\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.self\.query\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.self\.query\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.13\.attention\.self\.query\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.intermediate\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.intermediate\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.intermediate\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.intermediate\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.intermediate\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.intermediate\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.self\.value\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.self\.value\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.self\.value\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.self\.value\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.self\.value\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.self\.value\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.self\.key\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.self\.key\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.self\.key\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.self\.key\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.self\.key\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.self\.key\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.self\.query\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.self\.query\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.self\.query\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.self\.query\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.self\.query\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.12\.attention\.self\.query\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.intermediate\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.intermediate\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.intermediate\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.intermediate\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.intermediate\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.intermediate\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.value\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.value\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.value\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.value\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.value\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.value\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.key\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.key\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.key\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.key\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.key\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.key\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.query\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.query\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.query\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.query\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.query\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.11\.attention\.self\.query\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.intermediate\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.intermediate\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.intermediate\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.intermediate\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.intermediate\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.intermediate\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.value\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.value\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.value\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.value\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.value\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.value\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.key\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.key\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.key\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.key\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.key\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.key\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.query\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.query\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.query\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.query\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.query\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.10\.attention\.self\.query\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.intermediate\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.intermediate\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.intermediate\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.intermediate\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.intermediate\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.intermediate\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.value\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.value\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.value\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.value\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.value\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.value\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.key\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.key\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.key\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.key\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.key\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.key\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.query\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.query\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.query\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.query\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.query\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.9\.attention\.self\.query\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.intermediate\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.intermediate\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.intermediate\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.intermediate\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.intermediate\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.intermediate\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.value\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.value\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.value\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.value\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.value\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.value\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.key\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.key\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.key\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.key\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.key\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.key\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.query\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.query\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.query\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.query\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.query\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.8\.attention\.self\.query\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.intermediate\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.intermediate\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.intermediate\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.intermediate\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.intermediate\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.intermediate\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.value\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.value\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.value\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.value\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.value\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.value\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.key\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.key\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.key\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.key\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.key\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.key\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.query\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.query\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.query\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.query\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.query\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.7\.attention\.self\.query\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.intermediate\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.intermediate\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.intermediate\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.intermediate\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.intermediate\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.intermediate\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.value\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.value\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.value\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.value\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.value\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.value\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.key\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.key\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.key\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.key\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.key\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.key\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.query\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.query\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.query\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.query\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.query\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.6\.attention\.self\.query\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.intermediate\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.intermediate\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.intermediate\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.intermediate\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.intermediate\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.intermediate\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.value\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.value\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.value\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.value\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.value\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.value\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.key\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.key\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.key\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.key\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.key\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.key\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.query\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.query\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.query\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.query\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.query\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.5\.attention\.self\.query\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.intermediate\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.intermediate\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.intermediate\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.intermediate\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.intermediate\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.intermediate\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.value\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.value\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.value\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.value\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.value\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.value\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.key\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.key\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.key\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.key\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.key\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.key\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.query\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.query\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.query\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.query\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.query\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.4\.attention\.self\.query\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.intermediate\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.intermediate\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.intermediate\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.intermediate\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.intermediate\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.intermediate\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.value\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.value\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.value\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.value\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.value\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.value\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.key\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.key\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.key\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.key\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.key\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.key\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.query\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.query\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.query\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.query\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.query\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.3\.attention\.self\.query\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.intermediate\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.intermediate\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.intermediate\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.intermediate\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.intermediate\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.intermediate\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.value\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.value\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.value\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.value\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.value\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.value\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.key\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.key\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.key\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.key\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.key\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.key\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.query\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.query\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.query\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.query\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.query\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.2\.attention\.self\.query\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.intermediate\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.intermediate\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.intermediate\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.intermediate\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.intermediate\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.intermediate\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.value\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.value\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.value\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.value\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.value\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.value\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.key\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.key\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.key\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.key\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.key\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.key\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.query\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.query\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.query\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.query\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.query\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.1\.attention\.self\.query\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.intermediate\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.intermediate\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.intermediate\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.intermediate\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.intermediate\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.intermediate\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.output\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.output\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.output\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.output\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.output\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.output\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.output\.dense\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.output\.dense\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.output\.dense\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.output\.dense\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.output\.dense\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.output\.dense\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.value\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.value\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.value\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.value\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.value\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.value\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.key\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.key\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.key\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.key\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.key\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.key\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.query\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.query\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.query\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.query\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.query\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.encoder\.layer\.0\.attention\.self\.query\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.embeddings\.LayerNorm\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.embeddings\.LayerNorm\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.embeddings\.LayerNorm\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.embeddings\.LayerNorm\.bias._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.embeddings\.LayerNorm\.bias.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.embeddings\.LayerNorm\.bias.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.embeddings\.position_embeddings\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.embeddings\.position_embeddings\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.embeddings\.position_embeddings\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.embeddings\.token_type_embeddings\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.embeddings\.token_type_embeddings\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.embeddings\.token_type_embeddings\.weight.bins + 5: 1 + 6: + - 1 + - 1: gradients/bert\.embeddings\.word_embeddings\.weight._type + 5: 1 + 6: + - 1 + - 1: gradients/bert\.embeddings\.word_embeddings\.weight.values + 5: 1 + 6: + - 1 + - 1: gradients/bert\.embeddings\.word_embeddings\.weight.bins + 5: 1 + 6: + - 1 + - 1: train/loss + 5: 1 + 6: + - 1 + - 1: train/learning_rate + 5: 1 + 6: + - 1 + - 1: train/epoch + 5: 1 + 6: + - 1 + - 1: train/train_runtime + 5: 1 + 6: + - 1 + - 1: train/train_samples_per_second + 5: 1 + 6: + - 1 + - 1: train/train_steps_per_second + 5: 1 + 6: + - 1 + - 1: train/total_flos + 5: 1 + 6: + - 1 + - 1: train/train_loss + 5: 1 + 6: + - 1 + - 1: eval/loss + 5: 1 + 6: + - 1 + - 1: eval/precision + 5: 1 + 6: + - 1 + - 1: eval/recall + 5: 1 + 6: + - 1 + - 1: eval/f1 + 5: 1 + 6: + - 1 + - 1: eval/runtime + 5: 1 + 6: + - 1 + - 1: eval/samples_per_second + 5: 1 + 6: + - 1 + - 1: eval/steps_per_second + 5: 1 + 6: + - 1 + python_version: 3.6.8 + start_time: 1642028216 + t: + 1: + - 1 + - 2 + - 3 + - 5 + - 11 + 2: + - 1 + - 2 + - 3 + - 5 + - 11 + 3: + - 1 + - 7 + - 13 + 4: 3.6.8 + 5: 0.12.9 + 6: 4.12.5 + 8: + - 5 +adafactor: + desc: null + value: false +adam_beta1: + desc: null + value: 0.9 +adam_beta2: + desc: null + value: 0.999 +adam_epsilon: + desc: null + value: 1.0e-06 +add_cross_attention: + desc: null + value: false +architectures: + desc: null + value: null +attention_probs_dropout_prob: + desc: null + value: 0.1 +bad_words_ids: + desc: null + value: null +bos_token_id: + desc: null + value: null +chunk_size_feed_forward: + desc: null + value: 0 +classifier_dropout: + desc: null + value: null +cross_attention_hidden_size: + desc: null + value: null +dataloader_drop_last: + desc: null + value: false +dataloader_num_workers: + desc: null + value: 0 +dataloader_pin_memory: + desc: null + value: true +ddp_find_unused_parameters: + desc: null + value: None +debug: + desc: null + value: '[''underflow_overflow'']' +decoder_start_token_id: + desc: null + value: null +deepspeed: + desc: null + value: None +disable_tqdm: + desc: null + value: false +diversity_penalty: + desc: null + value: 0.0 +do_eval: + desc: null + value: true +do_predict: + desc: null + value: false +do_sample: + desc: null + value: false +do_train: + desc: null + value: true +early_stopping: + desc: null + value: false +encoder_no_repeat_ngram_size: + desc: null + value: 0 +eos_token_id: + desc: null + value: null +eval_accumulation_steps: + desc: null + value: None +eval_batch_size: + desc: null + value: 8 +eval_steps: + desc: null + value: None +evaluation_strategy: + desc: null + value: 'no' +finetuning_task: + desc: null + value: null +forced_bos_token_id: + desc: null + value: null +forced_eos_token_id: + desc: null + value: null +fp16: + desc: null + value: false +fp16_backend: + desc: null + value: auto +fp16_full_eval: + desc: null + value: false +fp16_opt_level: + desc: null + value: O1 +gradient_accumulation_steps: + desc: null + value: 1 +gradient_checkpointing: + desc: null + value: true +greater_is_better: + desc: null + value: None +group_by_length: + desc: null + value: false +hidden_act: + desc: null + value: gelu +hidden_dropout_prob: + desc: null + value: 0.1 +hidden_size: + desc: null + value: 1024 +hub_model_id: + desc: null + value: None +hub_strategy: + desc: null + value: every_save +hub_token: + desc: null + value: +id2label: + desc: null + value: + '0': B-DATE + '1': B-EPI + '10': I-SEX + '11': I-STAT + '12': O + '2': B-ETHN + '3': B-LOC + '4': B-SEX + '5': B-STAT + '6': I-DATE + '7': I-EPI + '8': I-ETHN + '9': I-LOC +ignore_data_skip: + desc: null + value: false +initializer_range: + desc: null + value: 0.02 +intermediate_size: + desc: null + value: 4096 +is_decoder: + desc: null + value: false +is_encoder_decoder: + desc: null + value: false +label2id: + desc: null + value: + B-DATE: 0 + B-EPI: 1 + B-ETHN: 2 + B-LOC: 3 + B-SEX: 4 + B-STAT: 5 + I-DATE: 6 + I-EPI: 7 + I-ETHN: 8 + I-LOC: 9 + I-SEX: 10 + I-STAT: 11 + O: 12 +label_names: + desc: null + value: None +label_smoothing_factor: + desc: null + value: 0.0 +layer_norm_eps: + desc: null + value: 1.0e-12 +learning_rate: + desc: null + value: 3.0e-05 +length_column_name: + desc: null + value: length +length_penalty: + desc: null + value: 1.0 +load_best_model_at_end: + desc: null + value: false +local_rank: + desc: null + value: -1 +log_level: + desc: null + value: 10 +log_level_replica: + desc: null + value: -1 +log_on_each_node: + desc: null + value: true +logging_dir: + desc: null + value: ./o14/runs/Jan12_22-56-44_ip-10-9-1-9.ec2.internal +logging_first_step: + desc: null + value: false +logging_nan_inf_filter: + desc: null + value: true +logging_steps: + desc: null + value: 500 +logging_strategy: + desc: null + value: steps +lr_scheduler_type: + desc: null + value: linear +max_grad_norm: + desc: null + value: 1.0 +max_length: + desc: null + value: 20 +max_position_embeddings: + desc: null + value: 512 +max_steps: + desc: null + value: -1 +metric_for_best_model: + desc: null + value: None +min_length: + desc: null + value: 0 +model_type: + desc: null + value: bert +mp_parameters: + desc: null + value: '' +no_cuda: + desc: null + value: false +no_repeat_ngram_size: + desc: null + value: 0 +num_attention_heads: + desc: null + value: 16 +num_beam_groups: + desc: null + value: 1 +num_beams: + desc: null + value: 1 +num_hidden_layers: + desc: null + value: 24 +num_return_sequences: + desc: null + value: 1 +num_train_epochs: + desc: null + value: 4.0 +output_attentions: + desc: null + value: false +output_dir: + desc: null + value: ./o14 +output_hidden_states: + desc: null + value: false +output_scores: + desc: null + value: false +overwrite_output_dir: + desc: null + value: true +pad_token_id: + desc: null + value: 0 +past_index: + desc: null + value: -1 +per_device_eval_batch_size: + desc: null + value: 8 +per_device_train_batch_size: + desc: null + value: 16 +per_gpu_eval_batch_size: + desc: null + value: None +per_gpu_train_batch_size: + desc: null + value: None +position_embedding_type: + desc: null + value: absolute +prediction_loss_only: + desc: null + value: false +prefix: + desc: null + value: null +problem_type: + desc: null + value: null +pruned_heads: + desc: null + value: {} +push_to_hub: + desc: null + value: false +push_to_hub_model_id: + desc: null + value: None +push_to_hub_organization: + desc: null + value: None +push_to_hub_token: + desc: null + value: +remove_invalid_values: + desc: null + value: false +remove_unused_columns: + desc: null + value: true +repetition_penalty: + desc: null + value: 1.0 +report_to: + desc: null + value: '[''tensorboard'', ''wandb'']' +resume_from_checkpoint: + desc: null + value: None +return_dict: + desc: null + value: true +return_dict_in_generate: + desc: null + value: false +run_name: + desc: null + value: ./o14 +save_on_each_node: + desc: null + value: false +save_steps: + desc: null + value: 1500 +save_strategy: + desc: null + value: steps +save_total_limit: + desc: null + value: None +seed: + desc: null + value: 42 +sep_token_id: + desc: null + value: null +sharded_ddp: + desc: null + value: '[]' +skip_memory_metrics: + desc: null + value: true +task_specific_params: + desc: null + value: null +temperature: + desc: null + value: 1.0 +tie_encoder_decoder: + desc: null + value: false +tie_word_embeddings: + desc: null + value: true +tokenizer_class: + desc: null + value: null +top_k: + desc: null + value: 50 +top_p: + desc: null + value: 1.0 +torch_dtype: + desc: null + value: null +torchscript: + desc: null + value: false +tpu_metrics_debug: + desc: null + value: false +tpu_num_cores: + desc: null + value: None +train_batch_size: + desc: null + value: 16 +transformers_version: + desc: null + value: 4.12.5 +type_vocab_size: + desc: null + value: 2 +use_bfloat16: + desc: null + value: false +use_cache: + desc: null + value: true +use_legacy_prediction_loop: + desc: null + value: false +vocab_size: + desc: null + value: 58996 +warmup_ratio: + desc: null + value: 0.06 +warmup_steps: + desc: null + value: 0 +weight_decay: + desc: null + value: 0.01 +xpu_backend: + desc: null + value: None