{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loading all tensors:\n", "\n", "Key: context_embedder.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0032, -0.0107, 0.0138, -0.0129, 0.0147])\n", "\n", "Key: context_embedder.weight\n", "Shape: torch.Size([3072, 4096])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0669, -0.0099, -0.0311, 0.0228, -0.0073])\n", "\n", "Key: single_transformer_blocks.0.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.2266, 1.2578, 1.2969, 1.2734, 1.2500])\n", "\n", "Key: single_transformer_blocks.0.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.2266, 1.2578, 1.2969, 1.2734, 1.2500])\n", "\n", "Key: single_transformer_blocks.0.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0947, -0.0981, 0.0498, 0.0422, 0.0525])\n", "\n", "Key: single_transformer_blocks.0.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.0.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 97, 92, -47, 85, -49], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.0.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.0.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0024, -0.0019, -0.0004, 0.0083, 0.0059])\n", "\n", "Key: single_transformer_blocks.0.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: 
single_transformer_blocks.0.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-36, 93, 90, -43, 82], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.0.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.0.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0024, -0.0172, -0.0075, -0.0112, -0.0118])\n", "\n", "Key: single_transformer_blocks.0.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.0.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-49, 100, 89, -69, -46], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.0.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.0.norm.linear.bias\n", "Shape: torch.Size([9216])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0020, -0.0046, -0.0050, -0.0025, -0.0012])\n", "\n", "Key: single_transformer_blocks.0.norm.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.0.norm.linear.weight\n", "Shape: torch.Size([9216, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([66, 61, 70, 78, 65], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.0.norm.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.0.proj_mlp.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0160, -0.0134, -0.0262, -0.0269, -0.0311])\n", "\n", "Key: 
single_transformer_blocks.0.proj_mlp.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.0.proj_mlp.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-127, -37, 91, 64, 80], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.0.proj_mlp.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.0.proj_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0014, -0.0381, 0.0048, 0.0192, 0.0194])\n", "\n", "Key: single_transformer_blocks.0.proj_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.0.proj_out.weight\n", "Shape: torch.Size([3072, 15360])\n", "Dtype: torch.int8\n", "First few values: tensor([-40, 88, 88, 72, -56], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.0.proj_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.1.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.2344, 1.1797, 1.2500, 1.2188, 1.2734])\n", "\n", "Key: single_transformer_blocks.1.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.2344, 1.1797, 1.2500, 1.2188, 1.2734])\n", "\n", "Key: single_transformer_blocks.1.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0532, -0.0177, -0.0159, 0.1338, -0.0547])\n", "\n", "Key: single_transformer_blocks.1.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.1.attn.to_k.weight\n", "Shape: 
torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 87, -40, 94, -41, -35], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.1.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.1.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0315, 0.0058, -0.0242, 0.0298, -0.0018])\n", "\n", "Key: single_transformer_blocks.1.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.1.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 87, 84, 80, 92, -43], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.1.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.1.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0016, -0.0098, -0.0118, -0.0137, 0.0037])\n", "\n", "Key: single_transformer_blocks.1.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.1.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 88, -49, -54, 73, 85], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.1.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.1.norm.linear.bias\n", "Shape: torch.Size([9216])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0033, -0.0029, -0.0011, -0.0025, -0.0023])\n", "\n", "Key: single_transformer_blocks.1.norm.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", 
"First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.1.norm.linear.weight\n", "Shape: torch.Size([9216, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-48, -40, 79, -67, 82], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.1.norm.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.1.proj_mlp.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0251, -0.0053, -0.0182, -0.0234, -0.0356])\n", "\n", "Key: single_transformer_blocks.1.proj_mlp.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.1.proj_mlp.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 34, -34, -32, -38, 81], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.1.proj_mlp.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.1.proj_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0223, 0.0510, -0.0244, 0.0096, 0.0057])\n", "\n", "Key: single_transformer_blocks.1.proj_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.1.proj_out.weight\n", "Shape: torch.Size([3072, 15360])\n", "Dtype: torch.int8\n", "First few values: tensor([-34, -37, 78, -43, -46], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.1.proj_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.10.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.2266, 1.1719, 1.4219, 1.4375, 1.3203])\n", "\n", "Key: 
single_transformer_blocks.10.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.2266, 1.1719, 1.4219, 1.4375, 1.3203])\n", "\n", "Key: single_transformer_blocks.10.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.1523, 0.0356, 0.0197, 0.0166, 0.0255])\n", "\n", "Key: single_transformer_blocks.10.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.10.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 81, -38, -40, 85, 84], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.10.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.10.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0076, 0.0325, 0.0184, 0.0023, -0.0014])\n", "\n", "Key: single_transformer_blocks.10.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.10.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-60, -43, -37, -40, 81], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.10.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.10.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0052, 0.0019, 0.0034, -0.0003, -0.0068])\n", "\n", "Key: single_transformer_blocks.10.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.10.attn.to_v.weight\n", "Shape: 
torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-49, -38, 84, -48, 98], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.10.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.10.norm.linear.bias\n", "Shape: torch.Size([9216])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0014, -0.0022, 0.0044, -0.0007, -0.0055])\n", "\n", "Key: single_transformer_blocks.10.norm.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.10.norm.linear.weight\n", "Shape: torch.Size([9216, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 82, 63, -51, -58, -47], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.10.norm.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.10.proj_mlp.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0625, -0.0344, -0.0256, -0.0713, -0.0352])\n", "\n", "Key: single_transformer_blocks.10.proj_mlp.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.10.proj_mlp.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 96, -44, -61, -39, 89], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.10.proj_mlp.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.10.proj_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0574, 0.0104, -0.0312, 0.0190, 0.0588])\n", "\n", "Key: single_transformer_blocks.10.proj_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: 
torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.10.proj_out.weight\n", "Shape: torch.Size([3072, 15360])\n", "Dtype: torch.int8\n", "First few values: tensor([-39, -34, -39, -32, 87], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.10.proj_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.11.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.3125, 1.2656, 1.5078, 1.1406, 1.2656])\n", "\n", "Key: single_transformer_blocks.11.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.3125, 1.2656, 1.5078, 1.1406, 1.2656])\n", "\n", "Key: single_transformer_blocks.11.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.2910, 0.0386, -0.0287, 0.0520, 0.0315])\n", "\n", "Key: single_transformer_blocks.11.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.11.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-43, 86, 89, 72, 91], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.11.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.11.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.1426, 0.0264, -0.0087, -0.0150, 0.0183])\n", "\n", "Key: single_transformer_blocks.11.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.11.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 60, -45, 
89, 88, -41], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.11.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.11.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0013, 0.0067, -0.0051, 0.0129, -0.0072])\n", "\n", "Key: single_transformer_blocks.11.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.11.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-43, 89, -51, 82, -92], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.11.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.11.norm.linear.bias\n", "Shape: torch.Size([9216])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0012, -0.0092, -0.0026, -0.0036, -0.0071])\n", "\n", "Key: single_transformer_blocks.11.norm.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.11.norm.linear.weight\n", "Shape: torch.Size([9216, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-48, -52, 65, -58, 80], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.11.norm.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.11.proj_mlp.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0228, -0.0120, -0.0128, -0.0347, -0.0376])\n", "\n", "Key: single_transformer_blocks.11.proj_mlp.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: 
single_transformer_blocks.11.proj_mlp.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-39, 79, -46, 82, -41], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.11.proj_mlp.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.11.proj_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0347, -0.0142, 0.0231, 0.0193, -0.0659])\n", "\n", "Key: single_transformer_blocks.11.proj_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.11.proj_out.weight\n", "Shape: torch.Size([3072, 15360])\n", "Dtype: torch.int8\n", "First few values: tensor([ 63, -33, -29, 87, 86], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.11.proj_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.12.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.4688, 1.3281, 1.4844, 1.3438, 1.4766])\n", "\n", "Key: single_transformer_blocks.12.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.4688, 1.3281, 1.4844, 1.3438, 1.4766])\n", "\n", "Key: single_transformer_blocks.12.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0815, 0.0349, 0.0088, 0.0981, -0.0476])\n", "\n", "Key: single_transformer_blocks.12.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.12.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 82, -30, -51, 79, -50], dtype=torch.int8)\n", "\n", "Key: 
single_transformer_blocks.12.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.12.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0024, 0.0098, -0.0070, 0.0110, -0.0159])\n", "\n", "Key: single_transformer_blocks.12.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.12.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 87, 96, -37, -39, 69], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.12.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.12.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0014, -0.0074, 0.0029, 0.0008, -0.0011])\n", "\n", "Key: single_transformer_blocks.12.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.12.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 73, -38, 96, 83, -51], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.12.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.12.norm.linear.bias\n", "Shape: torch.Size([9216])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0025, -0.0012, -0.0009, -0.0007, -0.0052])\n", "\n", "Key: single_transformer_blocks.12.norm.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.12.norm.linear.weight\n", "Shape: torch.Size([9216, 
3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 67, 81, 72, 83, -58], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.12.norm.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.12.proj_mlp.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0270, -0.0298, -0.0248, -0.0276, -0.0302])\n", "\n", "Key: single_transformer_blocks.12.proj_mlp.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.12.proj_mlp.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 88, 77, 56, -40, -56], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.12.proj_mlp.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.12.proj_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0146, 0.0030, -0.0417, 0.0049, 0.0933])\n", "\n", "Key: single_transformer_blocks.12.proj_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.12.proj_out.weight\n", "Shape: torch.Size([3072, 15360])\n", "Dtype: torch.int8\n", "First few values: tensor([-54, -41, 92, -38, -39], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.12.proj_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.13.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.2734, 1.4062, 1.4766, 1.3281, 1.3047])\n", "\n", "Key: single_transformer_blocks.13.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few 
values: tensor([1.2734, 1.4062, 1.4766, 1.3281, 1.3047])\n", "\n", "Key: single_transformer_blocks.13.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0596, 0.0051, 0.0012, -0.0566, -0.0698])\n", "\n", "Key: single_transformer_blocks.13.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.13.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-37, -60, -31, 94, -39], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.13.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.13.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0354, 0.0010, 0.0044, -0.0339, -0.0126])\n", "\n", "Key: single_transformer_blocks.13.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.13.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-44, 100, -42, 81, -37], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.13.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.13.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0041, -0.0042, 0.0046, -0.0062, -0.0024])\n", "\n", "Key: single_transformer_blocks.13.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.13.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-31, 90, -36, -46, -54], 
dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.13.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.13.norm.linear.bias\n", "Shape: torch.Size([9216])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0002, -0.0022, -0.0024, -0.0070, 0.0018])\n", "\n", "Key: single_transformer_blocks.13.norm.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.13.norm.linear.weight\n", "Shape: torch.Size([9216, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 74, -47, 45, 75, -66], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.13.norm.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.13.proj_mlp.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0396, -0.0352, -0.0214, -0.0513, -0.0225])\n", "\n", "Key: single_transformer_blocks.13.proj_mlp.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.13.proj_mlp.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-47, -43, -36, 82, -34], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.13.proj_mlp.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.13.proj_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0124, -0.0204, -0.0055, 0.0018, -0.0762])\n", "\n", "Key: single_transformer_blocks.13.proj_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: 
single_transformer_blocks.13.proj_out.weight\n", "Shape: torch.Size([3072, 15360])\n", "Dtype: torch.int8\n", "First few values: tensor([-29, 93, 55, 53, 90], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.13.proj_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.14.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.2500, 1.4297, 1.2891, 1.4297, 1.3516])\n", "\n", "Key: single_transformer_blocks.14.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.2500, 1.4297, 1.2891, 1.4297, 1.3516])\n", "\n", "Key: single_transformer_blocks.14.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.1050, 0.0874, -0.0654, -0.0469, 0.0588])\n", "\n", "Key: single_transformer_blocks.14.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.14.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-47, 83, 89, -44, 94], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.14.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.14.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0251, -0.0050, -0.0003, -0.0160, 0.0061])\n", "\n", "Key: single_transformer_blocks.14.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.14.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 70, 62, 88, -52, 71], dtype=torch.int8)\n", "\n", "Key: 
single_transformer_blocks.14.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.14.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0109, -0.0116, -0.0023, 0.0049, -0.0093])\n", "\n", "Key: single_transformer_blocks.14.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.14.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-41, -34, 82, 85, 89], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.14.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.14.norm.linear.bias\n", "Shape: torch.Size([9216])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0007, 0.0013, -0.0012, -0.0015, 0.0021])\n", "\n", "Key: single_transformer_blocks.14.norm.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.14.norm.linear.weight\n", "Shape: torch.Size([9216, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-51, -46, 88, 84, -52], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.14.norm.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.14.proj_mlp.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0054, -0.0248, -0.0142, -0.0425, -0.0253])\n", "\n", "Key: single_transformer_blocks.14.proj_mlp.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.14.proj_mlp.weight\n", "Shape: torch.Size([12288, 
3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 86, 91, 84, -62, -78], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.14.proj_mlp.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.14.proj_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0111, -0.0160, 0.0099, 0.0249, -0.0564])\n", "\n", "Key: single_transformer_blocks.14.proj_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.14.proj_out.weight\n", "Shape: torch.Size([3072, 15360])\n", "Dtype: torch.int8\n", "First few values: tensor([ 75, 81, 85, -59, 85], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.14.proj_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.15.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.3203, 1.3203, 1.2188, 1.2656, 1.2656])\n", "\n", "Key: single_transformer_blocks.15.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.3281, 1.3203, 1.2188, 1.2656, 1.2656])\n", "\n", "Key: single_transformer_blocks.15.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0337, -0.0190, 0.0435, 0.0635, -0.0635])\n", "\n", "Key: single_transformer_blocks.15.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.15.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 78, 88, -40, -37, 90], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.15.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: 
torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.15.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0059, 0.0178, -0.0005, 0.0107, 0.0007])\n", "\n", "Key: single_transformer_blocks.15.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.15.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-39, 92, 90, 81, -55], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.15.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.15.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0081, 0.0012, -0.0042, -0.0147, 0.0014])\n", "\n", "Key: single_transformer_blocks.15.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.15.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-26, -67, 57, -47, -74], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.15.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.15.norm.linear.bias\n", "Shape: torch.Size([9216])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0019, -0.0011, -0.0033, -0.0025, -0.0013])\n", "\n", "Key: single_transformer_blocks.15.norm.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.15.norm.linear.weight\n", "Shape: torch.Size([9216, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-68, 81, 82, -72, 29], 
dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.15.norm.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.15.proj_mlp.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0143, -0.0199, -0.0152, -0.0131, -0.0195])\n", "\n", "Key: single_transformer_blocks.15.proj_mlp.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.15.proj_mlp.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 85, 80, 93, 86, -43], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.15.proj_mlp.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.15.proj_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0170, 0.0104, 0.0041, 0.0154, 0.0618])\n", "\n", "Key: single_transformer_blocks.15.proj_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.15.proj_out.weight\n", "Shape: torch.Size([3072, 15360])\n", "Dtype: torch.int8\n", "First few values: tensor([87, 98, 85, 97, 84], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.15.proj_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.16.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.2969, 1.2734, 1.3906, 1.4062, 1.4609])\n", "\n", "Key: single_transformer_blocks.16.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.2969, 1.2734, 1.3906, 1.4062, 1.4609])\n", "\n", "Key: 
single_transformer_blocks.16.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.1475, 0.0879, -0.1133, -0.0349, -0.0237])\n", "\n", "Key: single_transformer_blocks.16.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.16.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-31, 56, 52, -56, -30], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.16.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.16.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0064, 0.0137, -0.0074, -0.0128, -0.0015])\n", "\n", "Key: single_transformer_blocks.16.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.16.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-49, -40, -50, -34, -32], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.16.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.16.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0176, -0.0013, 0.0131, -0.0116, -0.0066])\n", "\n", "Key: single_transformer_blocks.16.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.16.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-53, 91, 76, -40, 94], dtype=torch.int8)\n", "\n", "Key: 
single_transformer_blocks.16.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.16.norm.linear.bias\n", "Shape: torch.Size([9216])\n", "Dtype: torch.float32\n", "First few values: tensor([-1.1110e-04, 7.0953e-04, 4.1199e-03, -7.1526e-05, -3.0975e-03])\n", "\n", "Key: single_transformer_blocks.16.norm.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.16.norm.linear.weight\n", "Shape: torch.Size([9216, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 77, -48, -48, -79, 63], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.16.norm.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.16.proj_mlp.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0154, -0.0366, -0.0420, -0.0154, -0.0118])\n", "\n", "Key: single_transformer_blocks.16.proj_mlp.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.16.proj_mlp.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-34, 87, 96, 69, 78], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.16.proj_mlp.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.16.proj_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0119, 0.0071, -0.0184, 0.0166, 0.0520])\n", "\n", "Key: single_transformer_blocks.16.proj_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.16.proj_out.weight\n", "Shape: 
torch.Size([3072, 15360])\n", "Dtype: torch.int8\n", "First few values: tensor([-49, 93, -45, 81, -35], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.16.proj_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.17.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.1406, 1.2656, 1.4297, 1.4062, 1.4219])\n", "\n", "Key: single_transformer_blocks.17.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.1406, 1.2656, 1.4297, 1.3984, 1.4219])\n", "\n", "Key: single_transformer_blocks.17.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([0.1572, 0.0728, 0.0161, 0.1001, 0.0048])\n", "\n", "Key: single_transformer_blocks.17.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.17.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-95, 91, -43, -56, -70], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.17.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.17.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0140, 0.0026, -0.0002, 0.0325, -0.0113])\n", "\n", "Key: single_transformer_blocks.17.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.17.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-38, 85, 104, 69, -32], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.17.attn.to_q.weight_scale\n", "Shape: 
torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.17.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0043, -0.0022, 0.0069, -0.0089, 0.0099])\n", "\n", "Key: single_transformer_blocks.17.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.17.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 84, 94, 88, -49, 79], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.17.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.17.norm.linear.bias\n", "Shape: torch.Size([9216])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0017, 0.0031, -0.0041, -0.0016, 0.0002])\n", "\n", "Key: single_transformer_blocks.17.norm.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.17.norm.linear.weight\n", "Shape: torch.Size([9216, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 67, -60, -55, 64, -66], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.17.norm.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.17.proj_mlp.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0170, -0.0309, -0.0569, -0.0361, -0.0332])\n", "\n", "Key: single_transformer_blocks.17.proj_mlp.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.17.proj_mlp.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 
82, 90, -40, -54, -47], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.17.proj_mlp.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.17.proj_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0056, 0.0184, -0.0168, -0.0306, -0.0259])\n", "\n", "Key: single_transformer_blocks.17.proj_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.17.proj_out.weight\n", "Shape: torch.Size([3072, 15360])\n", "Dtype: torch.int8\n", "First few values: tensor([-31, 92, 85, 75, -37], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.17.proj_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.18.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.2656, 1.4375, 1.2812, 1.2734, 1.2969])\n", "\n", "Key: single_transformer_blocks.18.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.2656, 1.4375, 1.2812, 1.2734, 1.2969])\n", "\n", "Key: single_transformer_blocks.18.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0381, 0.1660, -0.1377, -0.0728, 0.0334])\n", "\n", "Key: single_transformer_blocks.18.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.18.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-47, -50, -63, 72, -57], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.18.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", 
"\n", "Key: single_transformer_blocks.18.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0075, 0.0090, -0.0123, 0.0010, 0.0039])\n", "\n", "Key: single_transformer_blocks.18.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.18.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 58, -43, 80, 77, -46], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.18.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.18.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0013, -0.0022, 0.0055, -0.0060, -0.0037])\n", "\n", "Key: single_transformer_blocks.18.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.18.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([70, 94, 96, 76, 99], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.18.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.18.norm.linear.bias\n", "Shape: torch.Size([9216])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0007, -0.0002, -0.0004, -0.0019, -0.0022])\n", "\n", "Key: single_transformer_blocks.18.norm.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.18.norm.linear.weight\n", "Shape: torch.Size([9216, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-53, 87, 50, 73, 72], dtype=torch.int8)\n", "\n", "Key: 
single_transformer_blocks.18.norm.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.18.proj_mlp.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0023, -0.0054, -0.0457, -0.0244, 0.0018])\n", "\n", "Key: single_transformer_blocks.18.proj_mlp.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.18.proj_mlp.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 91, 96, -56, -37, 84], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.18.proj_mlp.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.18.proj_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0002, -0.0352, -0.0120, 0.0294, 0.0417])\n", "\n", "Key: single_transformer_blocks.18.proj_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.18.proj_out.weight\n", "Shape: torch.Size([3072, 15360])\n", "Dtype: torch.int8\n", "First few values: tensor([ 78, 67, -39, -60, 68], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.18.proj_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.19.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.3984, 1.3672, 1.1406, 1.4141, 1.3516])\n", "\n", "Key: single_transformer_blocks.19.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.3984, 1.3672, 1.1406, 1.4141, 1.3516])\n", "\n", "Key: single_transformer_blocks.19.attn.to_k.bias\n", 
"Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0137, -0.0564, 0.0781, -0.1367, 0.0952])\n", "\n", "Key: single_transformer_blocks.19.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.19.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-66, 90, 67, -48, 79], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.19.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.19.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0227, -0.0256, 0.0342, -0.0356, 0.2930])\n", "\n", "Key: single_transformer_blocks.19.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.19.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-44, -33, 87, 84, -53], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.19.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.19.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0165, -0.0024, -0.0188, -0.0027, 0.0193])\n", "\n", "Key: single_transformer_blocks.19.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.19.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 84, -67, -24, -32, 82], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.19.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: 
torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.19.norm.linear.bias\n", "Shape: torch.Size([9216])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0033, 0.0015, 0.0010, -0.0044, 0.0008])\n", "\n", "Key: single_transformer_blocks.19.norm.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.19.norm.linear.weight\n", "Shape: torch.Size([9216, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-49, -72, -54, -47, 52], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.19.norm.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.19.proj_mlp.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0081, -0.0175, -0.0312, -0.0057, -0.0210])\n", "\n", "Key: single_transformer_blocks.19.proj_mlp.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.19.proj_mlp.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-37, 98, 95, -48, -34], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.19.proj_mlp.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.19.proj_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0277, 0.0137, -0.0061, 0.0265, -0.0500])\n", "\n", "Key: single_transformer_blocks.19.proj_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.19.proj_out.weight\n", "Shape: torch.Size([3072, 15360])\n", "Dtype: torch.int8\n", "First few values: tensor([-36, 72, 69, -51, -40], 
dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.19.proj_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.2.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.2812, 1.2969, 1.3047, 1.2734, 1.3047])\n", "\n", "Key: single_transformer_blocks.2.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.2812, 1.2969, 1.3047, 1.2734, 1.3047])\n", "\n", "Key: single_transformer_blocks.2.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0026, 0.0057, -0.0171, 0.0142, 0.0061])\n", "\n", "Key: single_transformer_blocks.2.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.2.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-32, 66, 68, 74, 70], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.2.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.2.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0046, 0.0018, -0.0049, 0.0025, -0.0013])\n", "\n", "Key: single_transformer_blocks.2.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.2.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 88, -63, -29, 88, 78], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.2.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: 
single_transformer_blocks.2.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0140, -0.0037, -0.0184, 0.0054, -0.0051])\n", "\n", "Key: single_transformer_blocks.2.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.2.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ -50, -41, 89, -95, -100], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.2.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.2.norm.linear.bias\n", "Shape: torch.Size([9216])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0007, -0.0011, 0.0049, -0.0037, -0.0017])\n", "\n", "Key: single_transformer_blocks.2.norm.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.2.norm.linear.weight\n", "Shape: torch.Size([9216, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 78, -69, 76, 74, 74], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.2.norm.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.2.proj_mlp.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0588, -0.0366, -0.0688, -0.0547, -0.0398])\n", "\n", "Key: single_transformer_blocks.2.proj_mlp.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.2.proj_mlp.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-26, -80, -33, 81, -48], dtype=torch.int8)\n", "\n", "Key: 
single_transformer_blocks.2.proj_mlp.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.2.proj_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0227, 0.0311, -0.0247, -0.0234, -0.0204])\n", "\n", "Key: single_transformer_blocks.2.proj_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.2.proj_out.weight\n", "Shape: torch.Size([3072, 15360])\n", "Dtype: torch.int8\n", "First few values: tensor([-32, -31, 87, 99, 78], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.2.proj_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.20.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.2969, 1.3125, 1.3281, 1.2969, 1.2734])\n", "\n", "Key: single_transformer_blocks.20.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.2969, 1.3125, 1.3281, 1.2969, 1.2734])\n", "\n", "Key: single_transformer_blocks.20.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0811, 0.0608, -0.1050, -0.0161, -0.1040])\n", "\n", "Key: single_transformer_blocks.20.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.20.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 89, -53, -66, -36, 61], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.20.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.20.attn.to_q.bias\n", 
"Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0278, 0.0155, 0.0004, 0.0047, -0.0024])\n", "\n", "Key: single_transformer_blocks.20.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.20.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 78, -62, 84, -93, 52], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.20.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.20.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0039, 0.0044, -0.0076, -0.0022, 0.0002])\n", "\n", "Key: single_transformer_blocks.20.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.20.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 86, -51, 81, 79, 90], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.20.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.20.norm.linear.bias\n", "Shape: torch.Size([9216])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0060, -0.0003, 0.0060, 0.0034, -0.0043])\n", "\n", "Key: single_transformer_blocks.20.norm.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.20.norm.linear.weight\n", "Shape: torch.Size([9216, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 78, -48, 37, 62, -43], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.20.norm.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: 
torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.20.proj_mlp.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0486, -0.0217, -0.0171, 0.0069, -0.0713])\n", "\n", "Key: single_transformer_blocks.20.proj_mlp.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.20.proj_mlp.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 58, -84, -70, 81, -54], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.20.proj_mlp.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.20.proj_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0070, -0.0079, 0.0088, 0.0291, -0.0659])\n", "\n", "Key: single_transformer_blocks.20.proj_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.20.proj_out.weight\n", "Shape: torch.Size([3072, 15360])\n", "Dtype: torch.int8\n", "First few values: tensor([81, 84, 74, 76, 84], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.20.proj_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.21.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.2578, 1.3750, 1.4766, 1.4453, 1.4297])\n", "\n", "Key: single_transformer_blocks.21.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.2578, 1.3750, 1.4766, 1.4453, 1.4297])\n", "\n", "Key: single_transformer_blocks.21.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.2275, 
-0.1533, -0.0022, 0.0583, -0.0967])\n", "\n", "Key: single_transformer_blocks.21.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.21.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 97, -42, -37, 83, 72], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.21.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.21.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0298, -0.0713, 0.0125, 0.0078, -0.0121])\n", "\n", "Key: single_transformer_blocks.21.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.21.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-65, -49, 92, 83, 87], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.21.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.21.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0059, 0.0205, 0.0081, 0.0238, 0.0209])\n", "\n", "Key: single_transformer_blocks.21.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.21.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 82, -29, -29, -61, 90], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.21.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: 
single_transformer_blocks.21.norm.linear.bias\n", "Shape: torch.Size([9216])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0049, -0.0014, 0.0062, -0.0015, -0.0014])\n", "\n", "Key: single_transformer_blocks.21.norm.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.21.norm.linear.weight\n", "Shape: torch.Size([9216, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-67, 81, 86, 46, 58], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.21.norm.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.21.proj_mlp.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0101, -0.0216, -0.0145, -0.0243, -0.0457])\n", "\n", "Key: single_transformer_blocks.21.proj_mlp.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.21.proj_mlp.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 89, 50, -41, 79, 63], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.21.proj_mlp.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.21.proj_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0388, -0.0299, -0.0352, -0.0327, -0.0664])\n", "\n", "Key: single_transformer_blocks.21.proj_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.21.proj_out.weight\n", "Shape: torch.Size([3072, 15360])\n", "Dtype: torch.int8\n", "First few values: tensor([-53, -72, 81, 40, -56], dtype=torch.int8)\n", "\n", "Key: 
single_transformer_blocks.21.proj_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.22.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.3203, 1.4141, 1.3906, 1.4219, 1.4219])\n", "\n", "Key: single_transformer_blocks.22.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.3203, 1.4141, 1.3906, 1.4141, 1.4219])\n", "\n", "Key: single_transformer_blocks.22.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0635, -0.1245, -0.0076, -0.0747, 0.1514])\n", "\n", "Key: single_transformer_blocks.22.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.22.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-41, -36, 73, 73, 79], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.22.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.22.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0103, -0.0106, -0.0255, 0.0008, 0.0398])\n", "\n", "Key: single_transformer_blocks.22.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.22.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 77, 74, 87, 88, -51], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.22.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.22.attn.to_v.bias\n", 
"Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0062, 0.0063, -0.0081, 0.0009, 0.0007])\n", "\n", "Key: single_transformer_blocks.22.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.22.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 86, 75, 96, 74, -38], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.22.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.22.norm.linear.bias\n", "Shape: torch.Size([9216])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0059, -0.0019, -0.0013, -0.0017, 0.0017])\n", "\n", "Key: single_transformer_blocks.22.norm.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.22.norm.linear.weight\n", "Shape: torch.Size([9216, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-59, -58, -45, 72, 76], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.22.norm.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.22.proj_mlp.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0151, -0.0513, -0.0503, 0.0107, -0.0256])\n", "\n", "Key: single_transformer_blocks.22.proj_mlp.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.22.proj_mlp.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-28, 70, -72, 82, 83], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.22.proj_mlp.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: 
torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.22.proj_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0178, -0.0245, 0.0125, 0.0294, -0.0420])\n", "\n", "Key: single_transformer_blocks.22.proj_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.22.proj_out.weight\n", "Shape: torch.Size([3072, 15360])\n", "Dtype: torch.int8\n", "First few values: tensor([-55, -60, -43, -31, 84], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.22.proj_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.23.norm.linear.bias\n", "Shape: torch.Size([9216])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0046, 0.0006, -0.0013, 0.0010, 0.0023])\n", "\n", "Key: single_transformer_blocks.23.norm.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.23.norm.linear.weight\n", "Shape: torch.Size([9216, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-70, -59, -73, -47, 76], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.23.norm.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.23.proj_mlp.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0430, -0.0405, -0.0231, 0.0074, -0.0077])\n", "\n", "Key: single_transformer_blocks.23.proj_mlp.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.23.proj_mlp.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 88, 72, -38, 93, -47], 
dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.23.proj_mlp.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.3.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.3516, 1.3594, 1.3438, 1.3906, 1.3438])\n", "\n", "Key: single_transformer_blocks.3.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.3516, 1.3594, 1.3438, 1.3906, 1.3438])\n", "\n", "Key: single_transformer_blocks.3.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0659, -0.0530, 0.0461, 0.0354, 0.0586])\n", "\n", "Key: single_transformer_blocks.3.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.3.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-39, -54, 93, -40, -44], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.3.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.3.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-1.3000e-02, -2.7954e-02, -2.9449e-03, -8.4400e-05, 5.6152e-03])\n", "\n", "Key: single_transformer_blocks.3.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.3.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-73, 65, -49, 47, -35], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.3.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: 
single_transformer_blocks.3.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0007, -0.0089, -0.0011, -0.0099, -0.0027])\n", "\n", "Key: single_transformer_blocks.3.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.3.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 91, -53, 61, 80, -39], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.3.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.3.norm.linear.bias\n", "Shape: torch.Size([9216])\n", "Dtype: torch.float32\n", "First few values: tensor([ 3.3875e-03, 3.3112e-03, 5.4321e-03, -7.9346e-04, -7.7724e-05])\n", "\n", "Key: single_transformer_blocks.3.norm.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.3.norm.linear.weight\n", "Shape: torch.Size([9216, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 81, -84, 82, 76, -58], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.3.norm.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.3.proj_mlp.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0199, -0.0292, -0.0454, -0.0238, -0.0308])\n", "\n", "Key: single_transformer_blocks.3.proj_mlp.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.3.proj_mlp.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-45, 67, 86, -50, -31], dtype=torch.int8)\n", "\n", "Key: 
single_transformer_blocks.3.proj_mlp.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.3.proj_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0282, 0.0253, 0.0115, 0.0035, -0.0227])\n", "\n", "Key: single_transformer_blocks.3.proj_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.3.proj_out.weight\n", "Shape: torch.Size([3072, 15360])\n", "Dtype: torch.int8\n", "First few values: tensor([-30, -38, 62, 89, -43], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.3.proj_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.4.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.3438, 1.3438, 1.1484, 1.3438, 1.3203])\n", "\n", "Key: single_transformer_blocks.4.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.3438, 1.3438, 1.1484, 1.3438, 1.3203])\n", "\n", "Key: single_transformer_blocks.4.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0654, -0.0850, 0.0503, -0.0449, 0.0264])\n", "\n", "Key: single_transformer_blocks.4.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.4.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-52, 60, -62, 69, -59], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.4.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.4.attn.to_q.bias\n", "Shape: 
torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0105, -0.0118, 0.0250, 0.0288, -0.0121])\n", "\n", "Key: single_transformer_blocks.4.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.4.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-45, 91, 93, 74, -49], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.4.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.4.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0134, 0.0231, -0.0108, 0.0111, -0.0034])\n", "\n", "Key: single_transformer_blocks.4.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.4.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-56, 82, -39, -65, 85], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.4.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.4.norm.linear.bias\n", "Shape: torch.Size([9216])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0037, -0.0035, 0.0009, -0.0017, 0.0006])\n", "\n", "Key: single_transformer_blocks.4.norm.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.4.norm.linear.weight\n", "Shape: torch.Size([9216, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 77, 85, -51, 65, -55], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.4.norm.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", 
"First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.4.proj_mlp.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0540, -0.0280, -0.0168, -0.0400, -0.0645])\n", "\n", "Key: single_transformer_blocks.4.proj_mlp.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.4.proj_mlp.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-51, -40, -38, 85, 81], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.4.proj_mlp.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.4.proj_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0209, -0.0332, 0.0154, 0.0105, -0.0317])\n", "\n", "Key: single_transformer_blocks.4.proj_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.4.proj_out.weight\n", "Shape: torch.Size([3072, 15360])\n", "Dtype: torch.int8\n", "First few values: tensor([-67, -39, -41, -38, 71], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.4.proj_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.5.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.3516, 1.3672, 1.2578, 1.1328, 1.3828])\n", "\n", "Key: single_transformer_blocks.5.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.3516, 1.3672, 1.2500, 1.1328, 1.3828])\n", "\n", "Key: single_transformer_blocks.5.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0244, -0.0850, 0.0258, -0.0537, 
-0.0820])\n", "\n", "Key: single_transformer_blocks.5.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.5.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 46, -42, 84, 82, 85], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.5.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.5.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0128, 0.0069, -0.0200, -0.0082, -0.0107])\n", "\n", "Key: single_transformer_blocks.5.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.5.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 53, -35, 79, 92, -34], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.5.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.5.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-5.0783e-05, 9.9487e-03, 4.0588e-03, -5.5313e-04, -7.8735e-03])\n", "\n", "Key: single_transformer_blocks.5.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.5.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-46, 98, -37, -55, 95], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.5.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: 
single_transformer_blocks.5.norm.linear.bias\n", "Shape: torch.Size([9216])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0013, -0.0002, -0.0010, -0.0027, 0.0009])\n", "\n", "Key: single_transformer_blocks.5.norm.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.5.norm.linear.weight\n", "Shape: torch.Size([9216, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-60, -93, -47, -42, -54], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.5.norm.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.5.proj_mlp.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0187, -0.0175, -0.0439, -0.0261, -0.0096])\n", "\n", "Key: single_transformer_blocks.5.proj_mlp.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.5.proj_mlp.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-40, -41, -42, -42, 95], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.5.proj_mlp.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.5.proj_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0327, -0.0189, 0.0170, -0.0187, -0.0253])\n", "\n", "Key: single_transformer_blocks.5.proj_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.5.proj_out.weight\n", "Shape: torch.Size([3072, 15360])\n", "Dtype: torch.int8\n", "First few values: tensor([-58, -38, 80, -51, -46], dtype=torch.int8)\n", "\n", "Key: 
single_transformer_blocks.5.proj_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.6.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.1328, 1.4453, 1.4219, 1.4609, 1.3906])\n", "\n", "Key: single_transformer_blocks.6.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.1406, 1.4453, 1.4219, 1.4609, 1.3906])\n", "\n", "Key: single_transformer_blocks.6.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0713, 0.0903, -0.0620, -0.0405, -0.0471])\n", "\n", "Key: single_transformer_blocks.6.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.6.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 75, -46, 97, -47, -46], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.6.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.6.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0284, 0.0141, -0.0117, -0.0040, -0.0142])\n", "\n", "Key: single_transformer_blocks.6.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.6.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 69, 90, -39, -46, -37], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.6.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.6.attn.to_v.bias\n", "Shape: 
torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0046, 0.0038, 0.0211, 0.0139, -0.0098])\n", "\n", "Key: single_transformer_blocks.6.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.6.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 91, 92, -46, 74, 88], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.6.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.6.norm.linear.bias\n", "Shape: torch.Size([9216])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0035, -0.0007, 0.0035, -0.0303, -0.0012])\n", "\n", "Key: single_transformer_blocks.6.norm.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.6.norm.linear.weight\n", "Shape: torch.Size([9216, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-50, 89, 78, -58, 70], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.6.norm.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.6.proj_mlp.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0256, -0.0190, -0.0417, -0.0270, -0.0297])\n", "\n", "Key: single_transformer_blocks.6.proj_mlp.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.6.proj_mlp.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-71, 81, 60, 89, 85], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.6.proj_mlp.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", 
"First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.6.proj_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0640, 0.0076, -0.0119, -0.0126, 0.0179])\n", "\n", "Key: single_transformer_blocks.6.proj_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.6.proj_out.weight\n", "Shape: torch.Size([3072, 15360])\n", "Dtype: torch.int8\n", "First few values: tensor([-43, 91, 81, -59, -32], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.6.proj_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.7.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.4609, 1.4297, 1.2656, 1.0781, 1.4219])\n", "\n", "Key: single_transformer_blocks.7.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.4609, 1.4297, 1.2656, 1.0781, 1.4219])\n", "\n", "Key: single_transformer_blocks.7.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0415, 0.1318, -0.0569, -0.1475, 0.0444])\n", "\n", "Key: single_transformer_blocks.7.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.7.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 83, -43, -39, -43, 88], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.7.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.7.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0247, 0.0405, -0.0254, -0.0211, 
-0.0084])\n", "\n", "Key: single_transformer_blocks.7.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.7.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 53, 87, -38, 83, 88], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.7.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.7.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0045, 0.0024, 0.0010, 0.0073, 0.0182])\n", "\n", "Key: single_transformer_blocks.7.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.7.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 97, -86, -46, -52, 85], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.7.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.7.norm.linear.bias\n", "Shape: torch.Size([9216])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0056, 0.0003, 0.0012, -0.0019, 0.0027])\n", "\n", "Key: single_transformer_blocks.7.norm.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.7.norm.linear.weight\n", "Shape: torch.Size([9216, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-62, 75, -61, 60, 73], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.7.norm.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.7.proj_mlp.bias\n", "Shape: 
torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0188, -0.0544, -0.0415, 0.0111, -0.0312])\n", "\n", "Key: single_transformer_blocks.7.proj_mlp.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.7.proj_mlp.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 82, -47, 90, -69, -65], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.7.proj_mlp.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.7.proj_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0518, 0.0014, 0.0114, 0.0177, -0.0091])\n", "\n", "Key: single_transformer_blocks.7.proj_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.7.proj_out.weight\n", "Shape: torch.Size([3072, 15360])\n", "Dtype: torch.int8\n", "First few values: tensor([82, 92, 65, 67, 37], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.7.proj_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.8.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.4688, 1.4688, 1.4844, 1.2578, 1.2812])\n", "\n", "Key: single_transformer_blocks.8.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.4688, 1.4688, 1.4844, 1.2578, 1.2812])\n", "\n", "Key: single_transformer_blocks.8.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0013, 0.1006, -0.0300, 0.1328, -0.0591])\n", "\n", "Key: single_transformer_blocks.8.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: 
torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.8.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-29, 98, -36, -35, -48], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.8.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.8.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0056, 0.0152, -0.0231, 0.0767, -0.0306])\n", "\n", "Key: single_transformer_blocks.8.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.8.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-31, 93, 72, -39, 83], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.8.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.8.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0064, 0.0058, -0.0150, 0.0014, -0.0023])\n", "\n", "Key: single_transformer_blocks.8.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.8.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 90, 77, 82, 83, -63], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.8.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.8.norm.linear.bias\n", "Shape: torch.Size([9216])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0022, -0.0007, -0.0019, -0.0021, 
-0.0055])\n", "\n", "Key: single_transformer_blocks.8.norm.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.8.norm.linear.weight\n", "Shape: torch.Size([9216, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-44, 75, -47, -52, -51], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.8.norm.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.8.proj_mlp.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0413, -0.0422, -0.0137, -0.0317, -0.0273])\n", "\n", "Key: single_transformer_blocks.8.proj_mlp.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.8.proj_mlp.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-43, -44, -38, -62, 86], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.8.proj_mlp.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.8.proj_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 5.6641e-02, 7.2956e-05, -2.3193e-02, 2.5024e-02, 4.1016e-02])\n", "\n", "Key: single_transformer_blocks.8.proj_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.8.proj_out.weight\n", "Shape: torch.Size([3072, 15360])\n", "Dtype: torch.int8\n", "First few values: tensor([-34, -40, 58, 78, -38], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.8.proj_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: 
single_transformer_blocks.9.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.4844, 1.4766, 1.1094, 1.2031, 1.4844])\n", "\n", "Key: single_transformer_blocks.9.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.4844, 1.4766, 1.1094, 1.2031, 1.4844])\n", "\n", "Key: single_transformer_blocks.9.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0625, -0.0762, -0.0002, -0.0181, -0.1318])\n", "\n", "Key: single_transformer_blocks.9.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.9.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-37, 82, -32, 82, -40], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.9.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.9.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-5.7983e-03, -8.0109e-05, -1.5747e-02, 1.4954e-02, -1.7090e-02])\n", "\n", "Key: single_transformer_blocks.9.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.9.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 82, -43, 74, -55, -38], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.9.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.9.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0069, -0.0067, 0.0095, -0.0007, 0.0041])\n", "\n", "Key: 
single_transformer_blocks.9.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.9.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-48, -71, -40, 71, 92], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.9.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.9.norm.linear.bias\n", "Shape: torch.Size([9216])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0012, -0.0021, 0.0014, -0.0449, 0.0008])\n", "\n", "Key: single_transformer_blocks.9.norm.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.9.norm.linear.weight\n", "Shape: torch.Size([9216, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 64, -46, -51, 62, -60], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.9.norm.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.9.proj_mlp.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0077, -0.0181, -0.0320, -0.0225, -0.0549])\n", "\n", "Key: single_transformer_blocks.9.proj_mlp.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.9.proj_mlp.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 77, -58, 89, -73, 73], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.9.proj_mlp.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: single_transformer_blocks.9.proj_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: 
torch.float32\n", "First few values: tensor([ 0.0593, -0.0106, -0.0059, -0.0149, 0.0574])\n", "\n", "Key: single_transformer_blocks.9.proj_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: single_transformer_blocks.9.proj_out.weight\n", "Shape: torch.Size([3072, 15360])\n", "Dtype: torch.int8\n", "First few values: tensor([ 95, -42, -81, 91, 64], dtype=torch.int8)\n", "\n", "Key: single_transformer_blocks.9.proj_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: time_text_embed.guidance_embedder.linear_1.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0016, -0.0002, -0.0008, -0.0017, -0.0004])\n", "\n", "Key: time_text_embed.guidance_embedder.linear_1.weight\n", "Shape: torch.Size([3072, 256])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0154, -0.0171, -0.0103, 0.0139, -0.0076])\n", "\n", "Key: time_text_embed.guidance_embedder.linear_2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-7.2861e-04, -4.8828e-04, 9.6798e-05, 5.4932e-04, 8.5831e-04])\n", "\n", "Key: time_text_embed.guidance_embedder.linear_2.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0254, 0.0165, -0.0170, 0.0014, -0.0086])\n", "\n", "Key: time_text_embed.text_embedder.linear_1.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0012, -0.0017, -0.0009, -0.0028, -0.0018])\n", "\n", "Key: time_text_embed.text_embedder.linear_1.weight\n", "Shape: torch.Size([3072, 768])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0010, -0.0051, -0.0001, 0.0013, -0.0008])\n", "\n", "Key: time_text_embed.text_embedder.linear_2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-2.7771e-03, 1.6251e-03, 1.9989e-03, 
1.0223e-03, 9.7156e-06])\n", "\n", "Key: time_text_embed.text_embedder.linear_2.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0040, 0.0014, -0.0024, -0.0014, -0.0024])\n", "\n", "Key: time_text_embed.timestep_embedder.linear_1.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0003, 0.0008, 0.0024, -0.0001, -0.0022])\n", "\n", "Key: time_text_embed.timestep_embedder.linear_1.weight\n", "Shape: torch.Size([3072, 256])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0011, -0.0013, -0.0031, 0.0032, -0.0006])\n", "\n", "Key: time_text_embed.timestep_embedder.linear_2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0028, 0.0016, 0.0022, 0.0013, -0.0003])\n", "\n", "Key: time_text_embed.timestep_embedder.linear_2.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0029, 0.0048, 0.0037, -0.0020, -0.0074])\n", "\n", "Key: transformer_blocks.0.attn.add_k_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0302, 0.0300, 0.0291, 0.0131, 0.0024])\n", "\n", "Key: transformer_blocks.0.attn.add_k_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.0.attn.add_k_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-46, -49, 92, 82, 78], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.0.attn.add_k_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.0.attn.add_q_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0400, 0.0310, -0.0271, -0.0530, 0.0522])\n", "\n", "Key: transformer_blocks.0.attn.add_q_proj.in_scale\n", "Shape: 
torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.0.attn.add_q_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 59, 74, 88, 93, -42], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.0.attn.add_q_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.0.attn.add_v_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([6.2466e-05, 7.8125e-03, 6.1340e-03, 1.3245e-02, 7.2632e-03])\n", "\n", "Key: transformer_blocks.0.attn.add_v_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.0.attn.add_v_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 89, 79, -54, 92, 89], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.0.attn.add_v_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.0.attn.norm_added_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.9688, 0.9375, 0.9453, 1.0156, 0.9336])\n", "\n", "Key: transformer_blocks.0.attn.norm_added_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.9062, 0.8320, 0.8945, 0.9219, 0.8633])\n", "\n", "Key: transformer_blocks.0.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.6133, 0.4492, 0.4492, 0.4141, 0.3789])\n", "\n", "Key: transformer_blocks.0.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0219, 0.6914, 0.3184, 0.0137, 0.4395])\n", "\n", "Key: transformer_blocks.0.attn.to_add_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: 
torch.float32\n", "First few values: tensor([ 0.0161, 0.0156, -0.0050, -0.0085, 0.0179])\n", "\n", "Key: transformer_blocks.0.attn.to_add_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.0.attn.to_add_out.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 95, 90, -26, 81, 76], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.0.attn.to_add_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.0.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0503, -0.0371, 0.0474, 0.0243, 0.0312])\n", "\n", "Key: transformer_blocks.0.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.0.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-76, 89, 81, 59, 89], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.0.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.0.attn.to_out.0.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0010, 0.0004, 0.0015, 0.0118, -0.0002])\n", "\n", "Key: transformer_blocks.0.attn.to_out.0.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.0.attn.to_out.0.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-69, 81, -60, 70, -60], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.0.attn.to_out.0.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: 
transformer_blocks.0.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0332, -0.0093, 0.0074, 0.0093, 0.0117])\n", "\n", "Key: transformer_blocks.0.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.0.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 76, -40, 65, -44, 79], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.0.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.0.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0825, 0.0007, 0.0022, 0.0017, 0.0654])\n", "\n", "Key: transformer_blocks.0.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.0.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 56, 73, 65, -47, -56], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.0.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.0.ff.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0022, -0.0430, 0.0055, -0.0171, -0.0192])\n", "\n", "Key: transformer_blocks.0.ff.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.0.ff.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-85, -80, 69, -51, -82], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.0.ff.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: 
tensor([0.0022])\n", "\n", "Key: transformer_blocks.0.ff.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0081, -0.0028, -0.0063, 0.0388, 0.0053])\n", "\n", "Key: transformer_blocks.0.ff.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.0.ff.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([-49, -46, 89, 71, 82], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.0.ff.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.0.ff_context.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0112, 0.0011, -0.0469, -0.0102, -0.0325])\n", "\n", "Key: transformer_blocks.0.ff_context.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.0.ff_context.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 87, -61, -47, 99, 88], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.0.ff_context.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.0.ff_context.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0190, -0.0165, -0.0081, 0.0396, 0.0018])\n", "\n", "Key: transformer_blocks.0.ff_context.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.0.ff_context.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([-62, -44, -51, -41, -49], dtype=torch.int8)\n", "\n", "Key: 
transformer_blocks.0.ff_context.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.0.norm1.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0010, -0.0038, -0.0017, 0.0007, 0.0023])\n", "\n", "Key: transformer_blocks.0.norm1.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.0.norm1.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 82, 72, 89, 75, -54], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.0.norm1.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.0.norm1_context.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0014, 0.0052, 0.0009, -0.0031, -0.0065])\n", "\n", "Key: transformer_blocks.0.norm1_context.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.0.norm1_context.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 84, 91, 80, 71, -51], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.0.norm1_context.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.1.attn.add_k_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0052, -0.0096, 0.0276, 0.0149, 0.0298])\n", "\n", "Key: transformer_blocks.1.attn.add_k_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.1.attn.add_k_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: 
torch.int8\n", "First few values: tensor([-60, 69, 64, -70, -34], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.1.attn.add_k_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.1.attn.add_q_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0136, 0.0562, -0.0150, 0.0493, -0.0586])\n", "\n", "Key: transformer_blocks.1.attn.add_q_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.1.attn.add_q_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-68, 70, 69, -31, 81], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.1.attn.add_q_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.1.attn.add_v_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0007, -0.0037, 0.0133, -0.0085, -0.0026])\n", "\n", "Key: transformer_blocks.1.attn.add_v_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.1.attn.add_v_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 84, -54, 59, -39, 74], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.1.attn.add_v_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.1.attn.norm_added_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.0234, 0.7734, 0.9141, 0.9023, 0.9180])\n", "\n", "Key: transformer_blocks.1.attn.norm_added_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.0156, 0.7656, 0.9375, 
0.8906, 0.9219])\n", "\n", "Key: transformer_blocks.1.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.6367, 0.5898, 0.5977, 0.5078, 0.4824])\n", "\n", "Key: transformer_blocks.1.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.6094, 0.6055, 0.6875, 0.5859, 0.4316])\n", "\n", "Key: transformer_blocks.1.attn.to_add_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0063, -0.0066, 0.0226, 0.0240, -0.0022])\n", "\n", "Key: transformer_blocks.1.attn.to_add_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.1.attn.to_add_out.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-37, 79, 65, -48, 72], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.1.attn.to_add_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.1.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0388, 0.0052, 0.0192, -0.0087, 0.0132])\n", "\n", "Key: transformer_blocks.1.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.1.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 83, -70, -70, 93, 80], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.1.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.1.attn.to_out.0.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0006, -0.0072, 0.0214, 0.0209, -0.0068])\n", "\n", "Key: 
transformer_blocks.1.attn.to_out.0.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.1.attn.to_out.0.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-37, -44, 49, 72, -58], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.1.attn.to_out.0.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.1.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0425, -0.0074, -0.0017, 0.0583, -0.0172])\n", "\n", "Key: transformer_blocks.1.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.1.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-65, 68, 91, -68, -46], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.1.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.1.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0019, 0.0011, 0.0035, 0.0029, -0.0050])\n", "\n", "Key: transformer_blocks.1.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.1.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-49, -66, 35, 82, 84], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.1.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.1.ff.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0264, -0.0339, -0.0361, 
-0.0334, 0.0160])\n", "\n", "Key: transformer_blocks.1.ff.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.1.ff.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-75, 58, -73, 82, -52], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.1.ff.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.1.ff.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0035, 0.0002, -0.0044, 0.0273, 0.0034])\n", "\n", "Key: transformer_blocks.1.ff.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.1.ff.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([ 80, 75, -49, -79, 81], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.1.ff.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.1.ff_context.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0081, -0.0093, -0.0239, -0.0391, -0.0452])\n", "\n", "Key: transformer_blocks.1.ff_context.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.1.ff_context.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-66, -41, -45, 75, 85], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.1.ff_context.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.1.ff_context.net.2.bias\n", "Shape: 
torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0187, -0.0093, 0.0153, -0.0125, 0.0092])\n", "\n", "Key: transformer_blocks.1.ff_context.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.1.ff_context.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([ 64, 80, -46, 61, -63], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.1.ff_context.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.1.norm1.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0007, 0.0003, 0.0013, 0.0028, 0.0007])\n", "\n", "Key: transformer_blocks.1.norm1.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.1.norm1.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 74, -57, 61, 70, -51], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.1.norm1.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.1.norm1_context.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0046, 0.0014, 0.0023, 0.0081, -0.0007])\n", "\n", "Key: transformer_blocks.1.norm1_context.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.1.norm1_context.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-54, 92, -66, 78, 68], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.1.norm1_context.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First 
few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.10.attn.add_k_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0247, -0.0354, 0.0227, 0.0067, 0.0167])\n", "\n", "Key: transformer_blocks.10.attn.add_k_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.10.attn.add_k_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 89, 68, 99, -37, 78], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.10.attn.add_k_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.10.attn.add_q_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0219, 0.1045, -0.1299, -0.0869, 0.0359])\n", "\n", "Key: transformer_blocks.10.attn.add_q_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.10.attn.add_q_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-42, -53, 91, -64, 79], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.10.attn.add_q_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.10.attn.add_v_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0272, -0.0117, 0.0508, -0.0232, -0.0708])\n", "\n", "Key: transformer_blocks.10.attn.add_v_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.10.attn.add_v_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-43, -46, 83, 91, -62], dtype=torch.int8)\n", "\n", "Key: 
transformer_blocks.10.attn.add_v_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.10.attn.norm_added_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.9336, 1.0391, 0.9648, 1.0469, 0.9922])\n", "\n", "Key: transformer_blocks.10.attn.norm_added_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.8711, 0.8945, 0.7383, 0.9180, 0.9023])\n", "\n", "Key: transformer_blocks.10.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.9570, 1.0234, 1.0156, 0.9648, 1.0547])\n", "\n", "Key: transformer_blocks.10.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.0547, 1.1562, 1.1328, 1.1484, 1.1328])\n", "\n", "Key: transformer_blocks.10.attn.to_add_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0145, 0.0062, 0.0182, 0.0031, -0.0276])\n", "\n", "Key: transformer_blocks.10.attn.to_add_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.10.attn.to_add_out.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-54, 95, 84, -56, 88], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.10.attn.to_add_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.10.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0250, -0.0117, 0.0413, 0.0228, -0.0110])\n", "\n", "Key: transformer_blocks.10.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.10.attn.to_k.weight\n", "Shape: 
torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 64, -65, 83, 91, 75], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.10.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.10.attn.to_out.0.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0103, 0.0075, -0.0050, -0.0152, 0.0067])\n", "\n", "Key: transformer_blocks.10.attn.to_out.0.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.10.attn.to_out.0.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 78, -41, -50, 72, 92], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.10.attn.to_out.0.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.10.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0342, 0.0013, 0.0010, -0.2158, -0.0099])\n", "\n", "Key: transformer_blocks.10.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.10.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 91, -35, -34, -36, 90], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.10.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.10.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0029, 0.0003, -0.0199, 0.0099, 0.0005])\n", "\n", "Key: transformer_blocks.10.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: 
transformer_blocks.10.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-70, 73, -50, 90, 50], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.10.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.10.ff.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0038, -0.0146, -0.0142, -0.0376, -0.0024])\n", "\n", "Key: transformer_blocks.10.ff.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.10.ff.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([88, 93, 84, 84, 75], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.10.ff.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.10.ff.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0229, -0.0229, -0.0194, -0.0113, 0.0229])\n", "\n", "Key: transformer_blocks.10.ff.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.10.ff.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([ 77, 98, -40, -39, -47], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.10.ff.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.10.ff_context.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0400, -0.0248, -0.0217, -0.0369, 0.0159])\n", "\n", "Key: transformer_blocks.10.ff_context.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: 
torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.10.ff_context.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 36, 84, 64, -53, -42], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.10.ff_context.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.10.ff_context.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0200, -0.0156, 0.0115, 0.0047, 0.0121])\n", "\n", "Key: transformer_blocks.10.ff_context.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.10.ff_context.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([ 70, -32, 91, -55, -26], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.10.ff_context.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.10.norm1.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0054, -0.0015, -0.0002, -0.0011, 0.0009])\n", "\n", "Key: transformer_blocks.10.norm1.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.10.norm1.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-48, -44, -42, -65, 77], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.10.norm1.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.10.norm1_context.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0002, 0.0090, -0.0053, 0.0039, 
-0.0033])\n", "\n", "Key: transformer_blocks.10.norm1_context.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.10.norm1_context.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-37, -76, -60, -38, 84], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.10.norm1_context.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.11.attn.add_k_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0203, -0.0649, -0.0147, -0.0084, -0.0074])\n", "\n", "Key: transformer_blocks.11.attn.add_k_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.11.attn.add_k_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-42, 80, 79, 94, 74], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.11.attn.add_k_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.11.attn.add_q_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0449, 0.0547, 0.0045, 0.0146, 0.1240])\n", "\n", "Key: transformer_blocks.11.attn.add_q_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.11.attn.add_q_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([100, 97, 82, -49, -39], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.11.attn.add_q_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.11.attn.add_v_proj.bias\n", 
"Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0432, -0.0291, -0.0053, 0.0089, -0.0121])\n", "\n", "Key: transformer_blocks.11.attn.add_v_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.11.attn.add_v_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-37, -31, -32, -79, -46], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.11.attn.add_v_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.11.attn.norm_added_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.9648, 0.9766, 1.0391, 1.0859, 0.9492])\n", "\n", "Key: transformer_blocks.11.attn.norm_added_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.9297, 0.9492, 0.8906, 0.8984, 0.8828])\n", "\n", "Key: transformer_blocks.11.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.1250, 1.1094, 1.0234, 1.0547, 1.0703])\n", "\n", "Key: transformer_blocks.11.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.1953, 1.1484, 1.1953, 1.2344, 1.1641])\n", "\n", "Key: transformer_blocks.11.attn.to_add_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0148, 0.0356, -0.0004, -0.0059, -0.0238])\n", "\n", "Key: transformer_blocks.11.attn.to_add_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.11.attn.to_add_out.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-39, -35, -31, 89, -42], dtype=torch.int8)\n", "\n", "Key: 
transformer_blocks.11.attn.to_add_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.11.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0079, -0.0432, -0.0325, -0.0035, -0.0283])\n", "\n", "Key: transformer_blocks.11.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.11.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-44, -72, -38, -38, 75], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.11.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.11.attn.to_out.0.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0097, 0.0347, -0.0053, -0.0359, 0.0117])\n", "\n", "Key: transformer_blocks.11.attn.to_out.0.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.11.attn.to_out.0.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-46, -71, 98, -38, -38], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.11.attn.to_out.0.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.11.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0049, -0.0060, 0.0114, -0.0197, -0.0219])\n", "\n", "Key: transformer_blocks.11.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.11.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-47, 
-40, 90, 81, 59], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.11.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.11.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0159, 0.0065, 0.0042, 0.0054, -0.0018])\n", "\n", "Key: transformer_blocks.11.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.11.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-37, -71, 76, -46, -39], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.11.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.11.ff.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0179, -0.0094, 0.0366, -0.0099, -0.0003])\n", "\n", "Key: transformer_blocks.11.ff.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.11.ff.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 94, -37, 60, -39, -55], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.11.ff.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.11.ff.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0204, -0.0300, 0.0079, 0.0083, 0.0052])\n", "\n", "Key: transformer_blocks.11.ff.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.11.ff.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: 
torch.int8\n", "First few values: tensor([-42, -43, 90, 90, 93], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.11.ff.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.11.ff_context.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0005, -0.0025, -0.0204, 0.0045, 0.0201])\n", "\n", "Key: transformer_blocks.11.ff_context.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.11.ff_context.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-34, 56, 89, -36, 90], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.11.ff_context.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.11.ff_context.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0096, -0.0444, -0.0132, -0.0613, -0.0332])\n", "\n", "Key: transformer_blocks.11.ff_context.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.11.ff_context.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([ 73, -54, 56, 91, 61], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.11.ff_context.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.11.norm1.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0056, 0.0101, -0.0012, -0.0020, -0.0023])\n", "\n", "Key: transformer_blocks.11.norm1.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: 
tensor([1.])\n", "\n", "Key: transformer_blocks.11.norm1.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([78, 75, 94, 78, 81], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.11.norm1.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.11.norm1_context.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0051, 0.0031, -0.0090, 0.0025, -0.0059])\n", "\n", "Key: transformer_blocks.11.norm1_context.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.11.norm1_context.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 58, 33, 61, 73, -39], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.11.norm1_context.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.12.attn.add_k_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0510, -0.0297, -0.0684, -0.0204, -0.0109])\n", "\n", "Key: transformer_blocks.12.attn.add_k_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.12.attn.add_k_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 82, 95, -39, 26, -54], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.12.attn.add_k_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.12.attn.add_q_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0029, 0.0291, 0.0408, -0.0427, -0.0107])\n", "\n", "Key: 
transformer_blocks.12.attn.add_q_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.12.attn.add_q_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 98, -41, -34, -51, 97], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.12.attn.add_q_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.12.attn.add_v_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0339, 0.0193, -0.0109, 0.0222, 0.0023])\n", "\n", "Key: transformer_blocks.12.attn.add_v_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.12.attn.add_v_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([89, 94, 87, 78, 84], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.12.attn.add_v_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.12.attn.norm_added_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.1797, 1.2031, 0.9688, 1.0781, 1.1953])\n", "\n", "Key: transformer_blocks.12.attn.norm_added_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.0000, 0.9766, 0.7695, 0.7891, 0.9688])\n", "\n", "Key: transformer_blocks.12.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.1094, 1.0859, 1.1094, 1.0547, 1.0391])\n", "\n", "Key: transformer_blocks.12.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.3047, 1.3203, 1.0703, 1.1562, 1.2266])\n", "\n", "Key: 
transformer_blocks.12.attn.to_add_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0017, -0.0383, 0.0050, 0.0231, -0.0171])\n", "\n", "Key: transformer_blocks.12.attn.to_add_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.12.attn.to_add_out.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-37, -36, -51, 98, 89], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.12.attn.to_add_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.12.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0923, -0.0610, -0.0459, -0.0227, 0.0031])\n", "\n", "Key: transformer_blocks.12.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.12.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-58, -31, 91, -39, 93], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.12.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.12.attn.to_out.0.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0123, -0.0118, -0.0063, -0.0069, 0.0376])\n", "\n", "Key: transformer_blocks.12.attn.to_out.0.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.12.attn.to_out.0.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 95, -35, 98, -48, -30], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.12.attn.to_out.0.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: 
torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.12.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0014, 0.0277, 0.0110, 0.0066, -0.0088])\n", "\n", "Key: transformer_blocks.12.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.12.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 88, -47, 92, -39, 87], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.12.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.12.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0085, 0.0013, -0.0068, 0.0034, -0.0093])\n", "\n", "Key: transformer_blocks.12.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.12.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-38, -55, -36, 77, -66], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.12.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.12.ff.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0182, -0.0106, -0.0337, -0.0303, -0.0198])\n", "\n", "Key: transformer_blocks.12.ff.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.12.ff.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 64, -61, 81, 100, -35], dtype=torch.int8)\n", "\n", "Key: 
transformer_blocks.12.ff.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.12.ff.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0108, 0.0166, -0.0190, 0.0120, -0.0205])\n", "\n", "Key: transformer_blocks.12.ff.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.12.ff.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([-52, -42, -33, -41, -33], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.12.ff.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.12.ff_context.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0070, -0.0698, -0.0491, -0.0583, -0.0601])\n", "\n", "Key: transformer_blocks.12.ff_context.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.12.ff_context.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 89, -54, -63, 68, -42], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.12.ff_context.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.12.ff_context.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0063, 0.0031, -0.0157, -0.0330, 0.0030])\n", "\n", "Key: transformer_blocks.12.ff_context.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.12.ff_context.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", 
"Dtype: torch.int8\n", "First few values: tensor([-67, 70, -52, -58, -69], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.12.ff_context.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.12.norm1.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0045, -0.0038, 0.0023, 0.0004, -0.0031])\n", "\n", "Key: transformer_blocks.12.norm1.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.12.norm1.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-54, -62, -42, -47, 83], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.12.norm1.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.12.norm1_context.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0073, 0.0145, -0.0101, -0.0037, 0.0024])\n", "\n", "Key: transformer_blocks.12.norm1_context.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.12.norm1_context.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-50, 58, -35, 91, -38], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.12.norm1_context.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.13.attn.add_k_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0138, -0.1216, -0.0063, -0.0151, 0.0491])\n", "\n", "Key: transformer_blocks.13.attn.add_k_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: 
tensor([1.])\n", "\n", "Key: transformer_blocks.13.attn.add_k_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-33, 76, -52, -50, 76], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.13.attn.add_k_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.13.attn.add_q_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0542, 0.0203, -0.0525, -0.0515, 0.0167])\n", "\n", "Key: transformer_blocks.13.attn.add_q_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.13.attn.add_q_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 46, -41, -55, -45, 89], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.13.attn.add_q_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.13.attn.add_v_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0020, 0.0125, -0.0334, -0.0047, -0.0228])\n", "\n", "Key: transformer_blocks.13.attn.add_v_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.13.attn.add_v_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-31, 92, -58, 89, -50], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.13.attn.add_v_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.13.attn.norm_added_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.8828, 0.9062, 0.8789, 0.8320, 0.8672])\n", "\n", "Key: 
transformer_blocks.13.attn.norm_added_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.8750, 0.8906, 0.8867, 0.8008, 0.8672])\n", "\n", "Key: transformer_blocks.13.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.2891, 1.2969, 1.3047, 1.2500, 1.2969])\n", "\n", "Key: transformer_blocks.13.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.3281, 1.3828, 1.3359, 1.3125, 1.3359])\n", "\n", "Key: transformer_blocks.13.attn.to_add_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0033, -0.0410, 0.0047, -0.0091, 0.0298])\n", "\n", "Key: transformer_blocks.13.attn.to_add_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.13.attn.to_add_out.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-52, -48, 83, 75, 69], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.13.attn.to_add_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.13.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0199, -0.0530, -0.0713, 0.0206, 0.0518])\n", "\n", "Key: transformer_blocks.13.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.13.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-39, -30, -60, -49, 89], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.13.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.13.attn.to_out.0.bias\n", "Shape: 
torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0110, 0.0282, 0.0146, -0.0337, -0.0347])\n", "\n", "Key: transformer_blocks.13.attn.to_out.0.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.13.attn.to_out.0.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 82, -30, -37, 90, -40], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.13.attn.to_out.0.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.13.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0042, -0.0228, -0.0083, 0.0106, 0.0061])\n", "\n", "Key: transformer_blocks.13.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.13.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 93, -51, 91, -61, 94], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.13.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.13.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0030, 0.0032, 0.0028, -0.0010, 0.0118])\n", "\n", "Key: transformer_blocks.13.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.13.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 85, 75, 77, 77, -39], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.13.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: 
transformer_blocks.13.ff.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0025, 0.0071, -0.0212, 0.0032, -0.0276])\n", "\n", "Key: transformer_blocks.13.ff.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.13.ff.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-49, -39, 77, 72, 85], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.13.ff.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.13.ff.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0208, 0.0291, 0.0074, 0.0205, 0.0077])\n", "\n", "Key: transformer_blocks.13.ff.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.13.ff.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([ 88, 91, -38, 85, -59], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.13.ff.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.13.ff_context.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0012, -0.0288, -0.0210, -0.0476, -0.0483])\n", "\n", "Key: transformer_blocks.13.ff_context.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.13.ff_context.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 97, -36, -43, 84, 82], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.13.ff_context.net.0.proj.weight_scale\n", "Shape: 
torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.13.ff_context.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0125, 0.0135, -0.0047, -0.0153, 0.0254])\n", "\n", "Key: transformer_blocks.13.ff_context.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.13.ff_context.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([-35, -40, 77, 73, -61], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.13.ff_context.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.13.norm1.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0055, 0.0006, -0.0005, -0.0018, 0.0002])\n", "\n", "Key: transformer_blocks.13.norm1.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.13.norm1.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-86, 76, -40, 72, -51], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.13.norm1.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.13.norm1_context.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0009, 0.0067, -0.0026, 0.0028, -0.0044])\n", "\n", "Key: transformer_blocks.13.norm1_context.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.13.norm1_context.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 70, -38, 
-47, 92, -55], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.13.norm1_context.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.14.attn.add_k_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0043, -0.0544, -0.0146, 0.0305, -0.0369])\n", "\n", "Key: transformer_blocks.14.attn.add_k_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.14.attn.add_k_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 49, 95, -33, 77, -55], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.14.attn.add_k_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.14.attn.add_q_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0063, -0.0403, 0.0811, 0.0669, -0.0359])\n", "\n", "Key: transformer_blocks.14.attn.add_q_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.14.attn.add_q_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-30, 81, -44, -58, 57], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.14.attn.add_q_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.14.attn.add_v_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0190, -0.0067, -0.0298, -0.0157, -0.0064])\n", "\n", "Key: transformer_blocks.14.attn.add_v_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: 
transformer_blocks.14.attn.add_v_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 90, -33, -31, 78, 83], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.14.attn.add_v_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.14.attn.norm_added_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.9648, 0.9922, 1.1797, 1.0547, 1.0391])\n", "\n", "Key: transformer_blocks.14.attn.norm_added_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.7930, 0.9844, 1.0078, 0.9844, 0.9727])\n", "\n", "Key: transformer_blocks.14.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.0938, 1.1562, 1.0469, 1.1406, 1.1484])\n", "\n", "Key: transformer_blocks.14.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.0938, 1.2188, 1.2500, 1.2578, 1.2734])\n", "\n", "Key: transformer_blocks.14.attn.to_add_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0064, 0.0620, -0.0038, -0.0432, -0.0284])\n", "\n", "Key: transformer_blocks.14.attn.to_add_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.14.attn.to_add_out.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 90, -47, -41, 92, -32], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.14.attn.to_add_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.14.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0089, -0.0049, -0.0155, -0.0209, 0.0081])\n", "\n", "Key: 
transformer_blocks.14.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.14.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-70, 86, -36, -39, -52], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.14.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.14.attn.to_out.0.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0352, -0.0189, -0.0116, -0.0010, 0.0093])\n", "\n", "Key: transformer_blocks.14.attn.to_out.0.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.14.attn.to_out.0.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 91, 81, -53, -38, 74], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.14.attn.to_out.0.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.14.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0131, -0.0069, 0.0128, 0.0096, -0.0259])\n", "\n", "Key: transformer_blocks.14.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.14.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 92, -37, -38, 70, -40], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.14.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.14.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0012, 
-0.0034, -0.0021, 0.0037, 0.0084])\n", "\n", "Key: transformer_blocks.14.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.14.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-56, 89, 75, -61, -36], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.14.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.14.ff.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0177, -0.0039, -0.0159, -0.0334, -0.0354])\n", "\n", "Key: transformer_blocks.14.ff.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.14.ff.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([101, -41, -44, -38, 76], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.14.ff.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.14.ff.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0182, 0.0223, -0.0168, -0.0160, 0.0064])\n", "\n", "Key: transformer_blocks.14.ff.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.14.ff.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([-55, -46, 90, -33, 93], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.14.ff.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.14.ff_context.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", 
"Dtype: torch.float32\n", "First few values: tensor([-0.0688, -0.0106, -0.0674, -0.0508, -0.0334])\n", "\n", "Key: transformer_blocks.14.ff_context.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.14.ff_context.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 45, -60, -48, -45, -41], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.14.ff_context.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.14.ff_context.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0029, -0.0270, 0.0226, -0.0491, 0.0225])\n", "\n", "Key: transformer_blocks.14.ff_context.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.14.ff_context.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([-49, 71, -64, 74, 71], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.14.ff_context.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.14.norm1.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0001, -0.0007, 0.0053, -0.0061, -0.0013])\n", "\n", "Key: transformer_blocks.14.norm1.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.14.norm1.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-54, -52, 67, -59, 66], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.14.norm1.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few 
values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.14.norm1_context.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0048, -0.0066, 0.0008, 0.0002, -0.0008])\n", "\n", "Key: transformer_blocks.14.norm1_context.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.14.norm1_context.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-46, 77, 83, -71, -56], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.14.norm1_context.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.15.attn.add_k_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0162, 0.0033, -0.0168, 0.0317, 0.0029])\n", "\n", "Key: transformer_blocks.15.attn.add_k_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.15.attn.add_k_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([96, 49, 66, 87, 96], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.15.attn.add_k_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.15.attn.add_q_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0194, 0.0576, 0.0483, -0.0153, 0.0119])\n", "\n", "Key: transformer_blocks.15.attn.add_q_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.15.attn.add_q_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-30, -38, -55, -61, 85], dtype=torch.int8)\n", "\n", "Key: 
transformer_blocks.15.attn.add_q_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.15.attn.add_v_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-6.0120e-03, -9.6436e-03, 1.3306e-02, -2.5272e-05, -1.4954e-02])\n", "\n", "Key: transformer_blocks.15.attn.add_v_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.15.attn.add_v_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 98, 81, 82, -60, 73], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.15.attn.add_v_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.15.attn.norm_added_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.8672, 0.8906, 0.8945, 0.8477, 0.9258])\n", "\n", "Key: transformer_blocks.15.attn.norm_added_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.8281, 1.0312, 0.8789, 0.8398, 0.9844])\n", "\n", "Key: transformer_blocks.15.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.1797, 1.2656, 1.2031, 1.1562, 1.2812])\n", "\n", "Key: transformer_blocks.15.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.0469, 1.2500, 1.2891, 1.1797, 1.2891])\n", "\n", "Key: transformer_blocks.15.attn.to_add_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0060, -0.0864, 0.0171, 0.0320, 0.0376])\n", "\n", "Key: transformer_blocks.15.attn.to_add_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: 
transformer_blocks.15.attn.to_add_out.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-37, -82, 78, 72, -40], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.15.attn.to_add_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.15.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0542, -0.0206, -0.0559, 0.0417, 0.0378])\n", "\n", "Key: transformer_blocks.15.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.15.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-52, -40, -38, 59, 80], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.15.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.15.attn.to_out.0.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0212, -0.0312, -0.0127, 0.0354, 0.0315])\n", "\n", "Key: transformer_blocks.15.attn.to_out.0.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.15.attn.to_out.0.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-48, -66, 88, -62, 99], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.15.attn.to_out.0.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.15.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0344, -0.0159, -0.0082, -0.0004, 0.0162])\n", "\n", "Key: transformer_blocks.15.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: 
torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.15.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 73, 89, 89, -41, -75], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.15.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.15.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0035, -0.0041, -0.0079, -0.0027, 0.0027])\n", "\n", "Key: transformer_blocks.15.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.15.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-61, -37, 89, 81, -39], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.15.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.15.ff.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0156, -0.0136, -0.0102, -0.0140, -0.0221])\n", "\n", "Key: transformer_blocks.15.ff.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.15.ff.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 87, 89, -43, 86, 74], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.15.ff.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.15.ff.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0245, 0.0420, -0.0058, 0.0171, 0.0322])\n", "\n", "Key: 
transformer_blocks.15.ff.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.15.ff.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([ 80, -45, 84, -40, 90], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.15.ff.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.15.ff_context.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0058, -0.0003, 0.0278, 0.0084, 0.0024])\n", "\n", "Key: transformer_blocks.15.ff_context.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.15.ff_context.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 93, 89, 88, -57, -36], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.15.ff_context.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.15.ff_context.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0134, 0.1172, 0.0052, 0.0820, -0.0352])\n", "\n", "Key: transformer_blocks.15.ff_context.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.15.ff_context.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([-43, -47, -39, 92, 94], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.15.ff_context.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.15.norm1.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: 
torch.float32\n", "First few values: tensor([-0.0029, 0.0002, 0.0049, 0.0020, -0.0021])\n", "\n", "Key: transformer_blocks.15.norm1.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.15.norm1.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-42, 84, -49, 87, 86], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.15.norm1.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.15.norm1_context.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0085, -0.0096, 0.0043, 0.0002, -0.0067])\n", "\n", "Key: transformer_blocks.15.norm1_context.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.15.norm1_context.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-45, -43, -45, -44, -47], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.15.norm1_context.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.16.attn.add_k_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0120, -0.0098, 0.0258, 0.0767, 0.0240])\n", "\n", "Key: transformer_blocks.16.attn.add_k_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.16.attn.add_k_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 85, -43, -46, -45, -36], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.16.attn.add_k_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: 
tensor([0.0022])\n", "\n", "Key: transformer_blocks.16.attn.add_q_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0123, 0.0654, -0.0688, -0.0422, -0.0104])\n", "\n", "Key: transformer_blocks.16.attn.add_q_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.16.attn.add_q_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-45, 75, -40, -42, 97], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.16.attn.add_q_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.16.attn.add_v_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0305, 0.0007, 0.0571, -0.0006, 0.0586])\n", "\n", "Key: transformer_blocks.16.attn.add_v_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.16.attn.add_v_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 86, 91, 91, -37, -51], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.16.attn.add_v_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.16.attn.norm_added_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.9570, 0.9375, 0.9609, 0.6797, 0.9648])\n", "\n", "Key: transformer_blocks.16.attn.norm_added_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.0312, 0.9648, 1.0547, 1.0234, 1.1250])\n", "\n", "Key: transformer_blocks.16.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.2344, 1.2500, 1.2891, 1.3359, 1.3203])\n", "\n", "Key: 
transformer_blocks.16.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.2812, 1.2656, 1.2734, 1.1406, 1.2578])\n", "\n", "Key: transformer_blocks.16.attn.to_add_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0026, 0.1138, 0.0095, -0.0596, -0.0413])\n", "\n", "Key: transformer_blocks.16.attn.to_add_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.16.attn.to_add_out.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-46, -34, 91, 77, -42], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.16.attn.to_add_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.16.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0400, -0.1289, -0.0012, 0.0160, 0.0197])\n", "\n", "Key: transformer_blocks.16.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.16.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-42, -62, 75, 60, -58], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.16.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.16.attn.to_out.0.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0259, -0.0330, -0.0069, -0.0120, -0.0542])\n", "\n", "Key: transformer_blocks.16.attn.to_out.0.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.16.attn.to_out.0.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: 
torch.int8\n", "First few values: tensor([-37, 70, 74, 95, 31], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.16.attn.to_out.0.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.16.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0022, 0.0747, 0.0094, -0.0026, 0.0159])\n", "\n", "Key: transformer_blocks.16.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.16.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-40, -60, -67, -40, 85], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.16.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.16.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0012, -0.0027, 0.0053, 0.0007, 0.0035])\n", "\n", "Key: transformer_blocks.16.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.16.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 78, -40, -38, 53, 77], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.16.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.16.ff.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0122, -0.0020, -0.0100, -0.0128, -0.0208])\n", "\n", "Key: transformer_blocks.16.ff.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.16.ff.net.0.proj.weight\n", 
"Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-45, 87, 72, 90, -37], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.16.ff.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.16.ff.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0095, -0.0493, -0.0073, 0.0099, 0.0332])\n", "\n", "Key: transformer_blocks.16.ff.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.16.ff.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([-49, -39, -39, 94, 73], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.16.ff.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.16.ff_context.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022, 0.0058, 0.0062, 0.0033, 0.0264])\n", "\n", "Key: transformer_blocks.16.ff_context.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.16.ff_context.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 99, -49, 75, -42, 74], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.16.ff_context.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.16.ff_context.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0079, -0.0215, -0.0258, -0.0854, 0.0270])\n", "\n", "Key: transformer_blocks.16.ff_context.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", 
"First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.16.ff_context.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([ 61, -38, 76, -34, 82], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.16.ff_context.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.16.norm1.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([ 8.6060e-03, 3.7193e-04, 3.4637e-03, 4.6730e-05, -2.1362e-03])\n", "\n", "Key: transformer_blocks.16.norm1.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.16.norm1.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 72, -69, 81, -57, -62], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.16.norm1.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.16.norm1_context.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0075, -0.0117, 0.0046, -0.0084, -0.0049])\n", "\n", "Key: transformer_blocks.16.norm1_context.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.16.norm1_context.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-45, 84, -38, 74, -44], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.16.norm1_context.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.17.attn.add_k_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0776, -0.0189, -0.0129, 
-0.0140, 0.0051])\n", "\n", "Key: transformer_blocks.17.attn.add_k_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.17.attn.add_k_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-47, 57, 76, -39, 88], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.17.attn.add_k_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.17.attn.add_q_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0864, -0.0103, -0.0203, 0.0674, 0.0015])\n", "\n", "Key: transformer_blocks.17.attn.add_q_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.17.attn.add_q_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 72, 93, 65, 90, -35], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.17.attn.add_q_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.17.attn.add_v_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0054, 0.0097, -0.0250, -0.0150, 0.0220])\n", "\n", "Key: transformer_blocks.17.attn.add_v_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.17.attn.add_v_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 82, 90, -39, 82, 77], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.17.attn.add_v_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.17.attn.norm_added_k.weight\n", "Shape: 
torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.0312, 1.0078, 1.0469, 0.9648, 0.9492])\n", "\n", "Key: transformer_blocks.17.attn.norm_added_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.0703, 1.0625, 0.9531, 1.0859, 0.8711])\n", "\n", "Key: transformer_blocks.17.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.2578, 1.2734, 1.2500, 1.2891, 1.2656])\n", "\n", "Key: transformer_blocks.17.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.3125, 1.3203, 1.2969, 1.3047, 1.2031])\n", "\n", "Key: transformer_blocks.17.attn.to_add_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0175, 0.0417, 0.0067, -0.0786, -0.0164])\n", "\n", "Key: transformer_blocks.17.attn.to_add_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.17.attn.to_add_out.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 88, 63, -67, 71, -42], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.17.attn.to_add_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.17.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0251, 0.0057, 0.0222, 0.0106, 0.0123])\n", "\n", "Key: transformer_blocks.17.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.17.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-55, 77, -41, -44, -53], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.17.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: 
torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.17.attn.to_out.0.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0278, 0.0610, 0.0347, 0.0232, 0.0376])\n", "\n", "Key: transformer_blocks.17.attn.to_out.0.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.17.attn.to_out.0.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-43, -40, -47, 86, 77], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.17.attn.to_out.0.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.17.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 1.7090e-02, -9.5215e-03, 4.3335e-03, 1.2512e-02, -2.2531e-05])\n", "\n", "Key: transformer_blocks.17.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.17.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 66, -60, 72, -59, -49], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.17.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.17.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0081, -0.0004, 0.0015, 0.0014, -0.0016])\n", "\n", "Key: transformer_blocks.17.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.17.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 48, -55, -52, -45, -38], dtype=torch.int8)\n", "\n", "Key: 
transformer_blocks.17.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.17.ff.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0094, -0.0167, -0.0151, -0.0038, -0.0099])\n", "\n", "Key: transformer_blocks.17.ff.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.17.ff.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-52, 90, 73, 95, -46], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.17.ff.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.17.ff.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0239, -0.0630, 0.0222, 0.0153, -0.0038])\n", "\n", "Key: transformer_blocks.17.ff.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.17.ff.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([-64, -36, 92, -40, 79], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.17.ff.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.17.ff_context.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0179, 0.0044, -0.0016, 0.0210, -0.0047])\n", "\n", "Key: transformer_blocks.17.ff_context.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.17.ff_context.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", 
"First few values: tensor([ 97, 92, -45, -55, 85], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.17.ff_context.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.17.ff_context.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0126, -0.0369, 0.0128, -0.0500, -0.0223])\n", "\n", "Key: transformer_blocks.17.ff_context.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.17.ff_context.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([110, -45, -28, 67, 89], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.17.ff_context.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.17.norm1.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0219, -0.0039, 0.0036, -0.0036, -0.0034])\n", "\n", "Key: transformer_blocks.17.norm1.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.17.norm1.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-46, 70, -53, 72, 78], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.17.norm1.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.17.norm1_context.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([ 6.4087e-03, -1.6968e-02, 6.6223e-03, -9.6798e-05, -2.9755e-03])\n", "\n", "Key: transformer_blocks.17.norm1_context.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: 
tensor([1.])\n", "\n", "Key: transformer_blocks.17.norm1_context.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-78, 76, 81, 81, 94], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.17.norm1_context.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.18.attn.add_k_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0547, -0.1001, -0.0217, 0.0327, 0.0067])\n", "\n", "Key: transformer_blocks.18.attn.add_k_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.18.attn.add_k_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 84, -46, 88, 90, -59], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.18.attn.add_k_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.18.attn.add_q_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0913, 0.0183, -0.1348, 0.1050, -0.0449])\n", "\n", "Key: transformer_blocks.18.attn.add_q_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.18.attn.add_q_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 82, -32, -36, -40, 63], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.18.attn.add_q_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.18.attn.add_v_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0110, 0.0005, 0.0118, -0.0304, 0.0117])\n", "\n", "Key: 
transformer_blocks.18.attn.add_v_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.18.attn.add_v_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-48, -56, -54, -56, 81], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.18.attn.add_v_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.18.attn.norm_added_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.0625, 1.1562, 1.1875, 1.1719, 1.1953])\n", "\n", "Key: transformer_blocks.18.attn.norm_added_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.8711, 0.9531, 0.8867, 0.8750, 0.8945])\n", "\n", "Key: transformer_blocks.18.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.2422, 1.2266, 1.2500, 1.2656, 1.2422])\n", "\n", "Key: transformer_blocks.18.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.3359, 1.4844, 1.5000, 1.5234, 1.5391])\n", "\n", "Key: transformer_blocks.18.attn.to_add_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0203, -0.0187, -0.0056, 0.0952, 0.0020])\n", "\n", "Key: transformer_blocks.18.attn.to_add_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.18.attn.to_add_out.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 83, 101, 65, 86, -41], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.18.attn.to_add_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: 
transformer_blocks.18.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0596, -0.0048, -0.0138, -0.0221, -0.0352])\n", "\n", "Key: transformer_blocks.18.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.18.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 86, 93, -80, -33, -73], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.18.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.18.attn.to_out.0.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0049, -0.0869, -0.0178, -0.0203, -0.0310])\n", "\n", "Key: transformer_blocks.18.attn.to_out.0.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.18.attn.to_out.0.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 73, -42, -48, -54, 95], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.18.attn.to_out.0.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.18.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0104, -0.0216, -0.0300, 0.0221, -0.0254])\n", "\n", "Key: transformer_blocks.18.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.18.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 65, 85, -51, -32, -55], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.18.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", 
"First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.18.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0021, 0.0027, 0.0101, 0.0077, 0.0029])\n", "\n", "Key: transformer_blocks.18.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.18.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-52, -47, -51, -40, -56], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.18.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.18.ff.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0237, -0.0026, -0.0242, -0.0289, -0.0327])\n", "\n", "Key: transformer_blocks.18.ff.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.18.ff.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-51, -58, -43, 85, 97], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.18.ff.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.18.ff.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0167, -0.0532, 0.0101, 0.0214, -0.0104])\n", "\n", "Key: transformer_blocks.18.ff.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.18.ff.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([-55, -40, 79, 97, -43], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.18.ff.net.2.weight_scale\n", "Shape: 
torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.18.ff_context.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0085, -0.0593, 0.0097, -0.0388, 0.0073])\n", "\n", "Key: transformer_blocks.18.ff_context.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.18.ff_context.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-46, 59, 51, -28, 85], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.18.ff_context.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.18.ff_context.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0084, -0.0056, -0.0035, 0.0156, -0.0003])\n", "\n", "Key: transformer_blocks.18.ff_context.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.18.ff_context.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([ 77, 25, -46, 51, -37], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.18.ff_context.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.18.norm1.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0065, -0.0093, 0.0018, -0.0007, -0.0073])\n", "\n", "Key: transformer_blocks.18.norm1.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.18.norm1.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: 
tensor([ 32, -63, -46, 72, 75], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.18.norm1.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.18.norm1_context.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0003, 0.0006, 0.0036, -0.0045, -0.0011])\n", "\n", "Key: transformer_blocks.18.norm1_context.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.18.norm1_context.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 84, 85, 85, 66, -39], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.18.norm1_context.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.2.attn.add_k_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0275, 0.0161, -0.0057, -0.0248, -0.0123])\n", "\n", "Key: transformer_blocks.2.attn.add_k_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.2.attn.add_k_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 74, -42, -58, -51, -49], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.2.attn.add_k_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.2.attn.add_q_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0251, -0.0383, -0.0217, 0.0244, 0.0549])\n", "\n", "Key: transformer_blocks.2.attn.add_q_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: 
transformer_blocks.2.attn.add_q_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 87, 82, -60, -61, -55], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.2.attn.add_q_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.2.attn.add_v_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0214, -0.0040, 0.0085, -0.0243, 0.0297])\n", "\n", "Key: transformer_blocks.2.attn.add_v_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.2.attn.add_v_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-42, 81, -60, 71, -44], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.2.attn.add_v_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.2.attn.norm_added_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.7539, 0.6250, 0.6523, 0.7148, 0.6992])\n", "\n", "Key: transformer_blocks.2.attn.norm_added_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.8594, 0.6367, 0.7305, 0.6211, 0.7383])\n", "\n", "Key: transformer_blocks.2.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.7773, 0.6719, 0.5742, 0.7812, 0.7461])\n", "\n", "Key: transformer_blocks.2.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.7031, 0.6992, 0.5352, 0.8945, 0.7461])\n", "\n", "Key: transformer_blocks.2.attn.to_add_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0049, -0.0045, 0.0049, 0.0417, 0.0066])\n", "\n", "Key: 
transformer_blocks.2.attn.to_add_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.2.attn.to_add_out.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-34, -44, 85, -62, -50], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.2.attn.to_add_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.2.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0147, -0.0117, 0.0079, -0.0088, 0.0212])\n", "\n", "Key: transformer_blocks.2.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.2.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-52, 58, -43, 78, -37], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.2.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.2.attn.to_out.0.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0060, -0.0016, -0.0219, -0.0322, -0.0112])\n", "\n", "Key: transformer_blocks.2.attn.to_out.0.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.2.attn.to_out.0.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 84, 72, -48, -50, 73], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.2.attn.to_out.0.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.2.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0058, 
-0.0114, -0.0080, 0.0025, 0.0066])\n", "\n", "Key: transformer_blocks.2.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.2.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 24, 78, 91, 59, -59], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.2.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.2.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-4.9591e-05, -6.3477e-03, -1.3733e-03, 6.6757e-04, -4.5776e-03])\n", "\n", "Key: transformer_blocks.2.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.2.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 79, -51, -62, 72, 30], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.2.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.2.ff.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0256, -0.0129, -0.0128, -0.0017, -0.0203])\n", "\n", "Key: transformer_blocks.2.ff.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.2.ff.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 64, 101, -54, 72, 75], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.2.ff.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.2.ff.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: 
torch.float32\n", "First few values: tensor([-0.0105, 0.0077, -0.0037, 0.0309, -0.0082])\n", "\n", "Key: transformer_blocks.2.ff.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.2.ff.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([ 44, 72, -54, 75, -45], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.2.ff.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.2.ff_context.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0112, 0.0139, -0.0322, -0.0192, -0.0009])\n", "\n", "Key: transformer_blocks.2.ff_context.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.2.ff_context.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-51, -28, -33, -49, 89], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.2.ff_context.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.2.ff_context.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0500, -0.0188, 0.0197, 0.0432, -0.0034])\n", "\n", "Key: transformer_blocks.2.ff_context.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.2.ff_context.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([ 54, 78, 73, -45, -48], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.2.ff_context.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: 
tensor([0.0022])\n", "\n", "Key: transformer_blocks.2.norm1.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0015, 0.0003, 0.0003, 0.0025, 0.0016])\n", "\n", "Key: transformer_blocks.2.norm1.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.2.norm1.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-52, 75, -53, -70, 80], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.2.norm1.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.2.norm1_context.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([ 2.4414e-03, -7.7820e-03, 7.9346e-03, 3.4637e-03, -4.7445e-05])\n", "\n", "Key: transformer_blocks.2.norm1_context.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.2.norm1_context.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 86, -37, -56, 73, -70], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.2.norm1_context.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.3.attn.add_k_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0474, -0.0479, 0.0339, 0.0061, -0.0535])\n", "\n", "Key: transformer_blocks.3.attn.add_k_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.3.attn.add_k_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 87, -60, 81, -48, 88], dtype=torch.int8)\n", "\n", "Key: 
transformer_blocks.3.attn.add_k_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.3.attn.add_q_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0598, -0.0383, 0.0087, 0.0535, 0.0396])\n", "\n", "Key: transformer_blocks.3.attn.add_q_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.3.attn.add_q_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-33, 72, 88, -42, 73], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.3.attn.add_q_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.3.attn.add_v_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0034, 0.0023, 0.0172, 0.0255, -0.0121])\n", "\n", "Key: transformer_blocks.3.attn.add_v_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.3.attn.add_v_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-53, -59, 79, -50, -50], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.3.attn.add_v_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.3.attn.norm_added_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.8594, 0.7969, 0.7305, 0.8477, 0.9844])\n", "\n", "Key: transformer_blocks.3.attn.norm_added_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.7891, 0.8477, 0.6914, 0.8594, 0.8359])\n", "\n", "Key: transformer_blocks.3.attn.norm_k.weight\n", "Shape: 
torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.8711, 0.8828, 0.6992, 0.8516, 0.9688])\n", "\n", "Key: transformer_blocks.3.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.0156, 0.8594, 0.7539, 0.8867, 1.0859])\n", "\n", "Key: transformer_blocks.3.attn.to_add_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0201, -0.0193, -0.0039, 0.0045, -0.0099])\n", "\n", "Key: transformer_blocks.3.attn.to_add_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.3.attn.to_add_out.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 91, 92, 91, -39, 85], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.3.attn.to_add_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.3.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0282, -0.0017, 0.0327, -0.0013, -0.0396])\n", "\n", "Key: transformer_blocks.3.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.3.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 94, -45, 50, -63, 84], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.3.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.3.attn.to_out.0.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0009, -0.0010, 0.0153, 0.0190, 0.0140])\n", "\n", "Key: transformer_blocks.3.attn.to_out.0.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: 
tensor([1.])\n", "\n", "Key: transformer_blocks.3.attn.to_out.0.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-61, 88, 85, -36, 90], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.3.attn.to_out.0.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.3.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0299, 0.0288, 0.0023, -0.0157, -0.0101])\n", "\n", "Key: transformer_blocks.3.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.3.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 39, 97, 65, -49, -42], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.3.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.3.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0129, -0.0022, -0.0038, 0.0024, 0.0059])\n", "\n", "Key: transformer_blocks.3.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.3.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 68, 73, 59, -41, -55], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.3.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.3.ff.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0084, -0.0029, 0.0097, -0.0016, 0.0057])\n", "\n", "Key: transformer_blocks.3.ff.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: 
torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.3.ff.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 66, 72, 57, -78, -48], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.3.ff.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.3.ff.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0106, 0.0085, 0.0020, 0.0152, -0.0003])\n", "\n", "Key: transformer_blocks.3.ff.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.3.ff.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([-68, -39, -50, 47, -54], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.3.ff.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.3.ff_context.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0117, -0.0032, -0.0283, -0.0312, -0.0093])\n", "\n", "Key: transformer_blocks.3.ff_context.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.3.ff_context.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 89, 99, 90, -40, 84], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.3.ff_context.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.3.ff_context.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0225, 0.0359, 0.0017, 0.0376, 0.0007])\n", "\n", "Key: 
transformer_blocks.3.ff_context.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.3.ff_context.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([ 90, -38, 66, -47, -36], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.3.ff_context.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.3.norm1.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0005, -0.0032, -0.0008, -0.0014, 0.0018])\n", "\n", "Key: transformer_blocks.3.norm1.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.3.norm1.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-74, 81, 73, -71, -63], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.3.norm1.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.3.norm1_context.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0069, -0.0014, 0.0035, -0.0122, -0.0037])\n", "\n", "Key: transformer_blocks.3.norm1_context.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.3.norm1_context.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-35, -53, 82, 81, 87], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.3.norm1_context.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.4.attn.add_k_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: 
torch.float32\n", "First few values: tensor([ 0.0034, 0.0574, 0.0104, -0.0737, -0.0055])\n", "\n", "Key: transformer_blocks.4.attn.add_k_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.4.attn.add_k_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-52, 82, -95, 93, 91], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.4.attn.add_k_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.4.attn.add_q_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0091, -0.1235, -0.0206, 0.0576, -0.0260])\n", "\n", "Key: transformer_blocks.4.attn.add_q_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.4.attn.add_q_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-53, 89, -55, -53, 87], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.4.attn.add_q_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.4.attn.add_v_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0045, -0.0417, -0.0098, -0.0232, 0.0152])\n", "\n", "Key: transformer_blocks.4.attn.add_v_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.4.attn.add_v_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 65, -60, -44, -47, -49], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.4.attn.add_v_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", 
"Key: transformer_blocks.4.attn.norm_added_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.8242, 0.8633, 0.9141, 0.7461, 0.9219])\n", "\n", "Key: transformer_blocks.4.attn.norm_added_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.7734, 0.7891, 0.8672, 0.8086, 0.9219])\n", "\n", "Key: transformer_blocks.4.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.8203, 1.0078, 0.9922, 0.9609, 0.9727])\n", "\n", "Key: transformer_blocks.4.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.9531, 1.1406, 1.0469, 0.9180, 1.0000])\n", "\n", "Key: transformer_blocks.4.attn.to_add_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0229, -0.0439, -0.0019, 0.0349, -0.0022])\n", "\n", "Key: transformer_blocks.4.attn.to_add_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.4.attn.to_add_out.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 86, 82, -56, -38, -54], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.4.attn.to_add_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.4.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0114, -0.0659, -0.0069, 0.0664, -0.0238])\n", "\n", "Key: transformer_blocks.4.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.4.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-70, 75, 82, -63, -43], dtype=torch.int8)\n", "\n", "Key: 
transformer_blocks.4.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.4.attn.to_out.0.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0083, -0.0103, 0.0084, 0.0173, 0.0267])\n", "\n", "Key: transformer_blocks.4.attn.to_out.0.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.4.attn.to_out.0.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 85, -59, 91, -47, 92], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.4.attn.to_out.0.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.4.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0374, -0.0581, -0.0240, -0.0047, -0.0291])\n", "\n", "Key: transformer_blocks.4.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.4.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 71, -56, 78, 75, -52], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.4.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.4.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0045, -0.0027, 0.0002, 0.0035, -0.0042])\n", "\n", "Key: transformer_blocks.4.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.4.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 93, 81, -74, 95, -56], 
dtype=torch.int8)\n", "\n", "Key: transformer_blocks.4.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.4.ff.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0141, -0.0050, -0.0094, 0.0063, -0.0261])\n", "\n", "Key: transformer_blocks.4.ff.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.4.ff.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-38, -35, -40, -52, -92], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.4.ff.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.4.ff.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0020, -0.0190, -0.0122, -0.0162, -0.0227])\n", "\n", "Key: transformer_blocks.4.ff.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.4.ff.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([ 91, 68, -55, -46, -46], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.4.ff.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.4.ff_context.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0461, -0.0227, -0.0063, -0.0217, -0.0092])\n", "\n", "Key: transformer_blocks.4.ff_context.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.4.ff_context.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", 
"Dtype: torch.int8\n", "First few values: tensor([-45, -45, 69, 76, -42], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.4.ff_context.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.4.ff_context.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0168, 0.0251, -0.0032, 0.0444, 0.0055])\n", "\n", "Key: transformer_blocks.4.ff_context.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.4.ff_context.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([-49, -47, 103, 53, -39], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.4.ff_context.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.4.norm1.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0013, 0.0003, -0.0006, 0.0022, -0.0011])\n", "\n", "Key: transformer_blocks.4.norm1.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.4.norm1.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 85, -53, 80, 82, -63], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.4.norm1.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.4.norm1_context.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0059, -0.0065, -0.0047, 0.0002, -0.0022])\n", "\n", "Key: transformer_blocks.4.norm1_context.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: 
tensor([1.])\n", "\n", "Key: transformer_blocks.4.norm1_context.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 89, -40, 75, 85, 80], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.4.norm1_context.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.5.attn.add_k_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0649, -0.0334, 0.0067, -0.0149, 0.0845])\n", "\n", "Key: transformer_blocks.5.attn.add_k_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.5.attn.add_k_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-52, -36, 72, 73, -37], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.5.attn.add_k_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.5.attn.add_q_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0859, -0.0703, 0.0151, -0.0234, 0.0229])\n", "\n", "Key: transformer_blocks.5.attn.add_q_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.5.attn.add_q_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 52, -34, -38, 71, -42], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.5.attn.add_q_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.5.attn.add_v_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0068, 0.0286, -0.0060, 0.0011, -0.0183])\n", "\n", "Key: 
transformer_blocks.5.attn.add_v_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.5.attn.add_v_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 84, 72, -58, -47, -53], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.5.attn.add_v_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.5.attn.norm_added_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.8438, 0.8398, 0.9531, 0.8672, 0.8984])\n", "\n", "Key: transformer_blocks.5.attn.norm_added_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.8203, 0.7383, 0.7812, 0.7656, 0.7773])\n", "\n", "Key: transformer_blocks.5.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.0547, 1.0625, 1.0625, 1.0547, 1.0469])\n", "\n", "Key: transformer_blocks.5.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.1250, 1.1875, 1.2266, 1.1484, 1.1562])\n", "\n", "Key: transformer_blocks.5.attn.to_add_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0062, 0.0176, 0.0136, -0.0067, 0.0075])\n", "\n", "Key: transformer_blocks.5.attn.to_add_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.5.attn.to_add_out.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 85, 91, 87, -31, -65], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.5.attn.to_add_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.5.attn.to_k.bias\n", "Shape: 
torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0415, -0.0074, -0.0317, -0.0100, 0.0223])\n", "\n", "Key: transformer_blocks.5.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.5.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-52, -29, 89, 80, 88], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.5.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.5.attn.to_out.0.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0019, 0.0162, 0.0064, 0.0212, -0.0286])\n", "\n", "Key: transformer_blocks.5.attn.to_out.0.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.5.attn.to_out.0.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 88, -38, 88, 93, -57], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.5.attn.to_out.0.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.5.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0337, -0.0087, 0.0197, -0.0232, 0.0437])\n", "\n", "Key: transformer_blocks.5.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.5.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 80, -34, 73, 81, -57], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.5.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: 
transformer_blocks.5.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0176, -0.0084, 0.0054, -0.0001, 0.0133])\n", "\n", "Key: transformer_blocks.5.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.5.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-33, -35, 75, -68, -53], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.5.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.5.ff.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0593, 0.0212, 0.0129, -0.0085, -0.0021])\n", "\n", "Key: transformer_blocks.5.ff.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.5.ff.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 99, 96, 69, 60, -61], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.5.ff.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.5.ff.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0160, -0.0149, 0.0491, 0.0137, -0.0444])\n", "\n", "Key: transformer_blocks.5.ff.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.5.ff.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([-38, -43, 58, -38, -56], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.5.ff.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: 
tensor([0.0022])\n", "\n", "Key: transformer_blocks.5.ff_context.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0004, -0.0087, 0.0006, 0.0092, -0.0272])\n", "\n", "Key: transformer_blocks.5.ff_context.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.5.ff_context.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-56, 80, -72, -41, 99], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.5.ff_context.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.5.ff_context.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0075, 0.0310, 0.0142, 0.0481, 0.0020])\n", "\n", "Key: transformer_blocks.5.ff_context.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.5.ff_context.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([ 84, -56, -46, 84, 40], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.5.ff_context.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.5.norm1.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([ 1.2436e-03, 8.8501e-04, -4.8399e-05, -3.2806e-03, 1.6880e-04])\n", "\n", "Key: transformer_blocks.5.norm1.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.5.norm1.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 69, 85, 87, 64, -43], dtype=torch.int8)\n", "\n", "Key: 
transformer_blocks.5.norm1.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.5.norm1_context.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0035, 0.0025, 0.0022, 0.0026, -0.0002])\n", "\n", "Key: transformer_blocks.5.norm1_context.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.5.norm1_context.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 83, -37, -56, 83, -52], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.5.norm1_context.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.6.attn.add_k_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.1030, -0.0205, 0.0173, -0.0879, 0.0233])\n", "\n", "Key: transformer_blocks.6.attn.add_k_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.6.attn.add_k_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([98, 85, 93, 89, 92], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.6.attn.add_k_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.6.attn.add_q_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0267, 0.0093, -0.0603, 0.0728, 0.0179])\n", "\n", "Key: transformer_blocks.6.attn.add_q_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.6.attn.add_q_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: 
torch.int8\n", "First few values: tensor([-32, -59, 92, 92, 86], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.6.attn.add_q_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.6.attn.add_v_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0405, 0.0053, 0.0444, -0.0007, 0.0376])\n", "\n", "Key: transformer_blocks.6.attn.add_v_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.6.attn.add_v_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 91, -52, -35, 82, 85], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.6.attn.add_v_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.6.attn.norm_added_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.9531, 1.0156, 1.0234, 0.9609, 0.9102])\n", "\n", "Key: transformer_blocks.6.attn.norm_added_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.9609, 0.9883, 0.9648, 0.9062, 0.9375])\n", "\n", "Key: transformer_blocks.6.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.9219, 0.9570, 0.9492, 0.9766, 0.9727])\n", "\n", "Key: transformer_blocks.6.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.9492, 0.9961, 1.0625, 1.0625, 0.9258])\n", "\n", "Key: transformer_blocks.6.attn.to_add_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0168, -0.0320, 0.0043, -0.0108, 0.0182])\n", "\n", "Key: transformer_blocks.6.attn.to_add_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", 
"First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.6.attn.to_add_out.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-32, -36, 69, 87, 66], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.6.attn.to_add_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.6.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0574, -0.0559, 0.0074, -0.0596, -0.0457])\n", "\n", "Key: transformer_blocks.6.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.6.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-43, -63, 92, 84, -58], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.6.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.6.attn.to_out.0.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0187, 0.0125, 0.0201, -0.0023, 0.0229])\n", "\n", "Key: transformer_blocks.6.attn.to_out.0.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.6.attn.to_out.0.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([104, -40, 86, 88, 92], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.6.attn.to_out.0.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.6.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0227, 0.0093, 0.0005, 0.0084, 0.0625])\n", "\n", "Key: transformer_blocks.6.attn.to_q.in_scale\n", "Shape: 
torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.6.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([99, 79, 89, 68, 85], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.6.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.6.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0073, 0.0156, 0.0031, -0.0145, -0.0053])\n", "\n", "Key: transformer_blocks.6.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.6.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 93, -41, -40, 11, 90], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.6.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.6.ff.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0201, -0.0044, -0.0012, -0.0535, 0.0045])\n", "\n", "Key: transformer_blocks.6.ff.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.6.ff.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 78, -61, -48, 36, -44], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.6.ff.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.6.ff.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0153, -0.0220, -0.0272, 0.0102, -0.0043])\n", "\n", "Key: 
transformer_blocks.6.ff.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.6.ff.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([-56, 92, 73, -65, -39], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.6.ff.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.6.ff_context.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0051, 0.0054, 0.0111, 0.0131, -0.0232])\n", "\n", "Key: transformer_blocks.6.ff_context.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.6.ff_context.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 91, 87, -31, -35, 89], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.6.ff_context.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.6.ff_context.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 6.5308e-03, -3.0640e-02, -3.6001e-05, 5.0537e-02, -2.4658e-02])\n", "\n", "Key: transformer_blocks.6.ff_context.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.6.ff_context.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([-48, -46, -37, 74, -65], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.6.ff_context.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.6.norm1.linear.bias\n", "Shape: 
torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0029, 0.0042, 0.0005, -0.0024, -0.0021])\n", "\n", "Key: transformer_blocks.6.norm1.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.6.norm1.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-47, 63, 88, -42, 81], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.6.norm1.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.6.norm1_context.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0011, -0.0063, 0.0061, -0.0061, -0.0077])\n", "\n", "Key: transformer_blocks.6.norm1_context.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.6.norm1_context.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-42, -35, -46, 73, 82], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.6.norm1_context.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.7.attn.add_k_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0466, -0.0566, 0.0294, 0.0112, 0.0571])\n", "\n", "Key: transformer_blocks.7.attn.add_k_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.7.attn.add_k_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 90, 59, 85, 41, -38], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.7.attn.add_k_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First 
few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.7.attn.add_q_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0113, -0.0437, 0.0786, -0.0593, 0.0474])\n", "\n", "Key: transformer_blocks.7.attn.add_q_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.7.attn.add_q_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([95, 74, 92, 34, 94], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.7.attn.add_q_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.7.attn.add_v_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0016, 0.0315, 0.0049, 0.0109, -0.0104])\n", "\n", "Key: transformer_blocks.7.attn.add_v_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.7.attn.add_v_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-37, 80, 91, 89, 82], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.7.attn.add_v_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.7.attn.norm_added_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.8711, 0.8750, 0.8828, 0.9062, 0.7383])\n", "\n", "Key: transformer_blocks.7.attn.norm_added_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.8477, 0.8672, 0.8594, 0.8828, 0.8164])\n", "\n", "Key: transformer_blocks.7.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.0312, 1.0234, 1.0000, 1.0312, 0.9453])\n", "\n", "Key: 
transformer_blocks.7.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.1016, 1.1016, 1.0859, 1.1094, 0.9883])\n", "\n", "Key: transformer_blocks.7.attn.to_add_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0122, 0.0192, -0.0096, 0.0299, 0.0271])\n", "\n", "Key: transformer_blocks.7.attn.to_add_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.7.attn.to_add_out.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 91, 84, -40, 74, 97], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.7.attn.to_add_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.7.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0359, -0.0173, -0.0212, 0.0049, 0.0197])\n", "\n", "Key: transformer_blocks.7.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.7.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-66, -36, 88, 66, -40], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.7.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.7.attn.to_out.0.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0105, 0.0322, 0.0139, -0.0011, -0.0048])\n", "\n", "Key: transformer_blocks.7.attn.to_out.0.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.7.attn.to_out.0.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First 
few values: tensor([-37, -32, 90, 83, -38], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.7.attn.to_out.0.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.7.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0151, -0.0198, 0.0530, -0.0192, -0.0273])\n", "\n", "Key: transformer_blocks.7.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.7.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 92, -40, -29, 85, 85], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.7.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.7.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0018, 0.0039, -0.0092, -0.0138, 0.0001])\n", "\n", "Key: transformer_blocks.7.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.7.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-59, 68, -64, -79, -32], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.7.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.7.ff.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0045, 0.0010, -0.0222, -0.0044, 0.0042])\n", "\n", "Key: transformer_blocks.7.ff.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.7.ff.net.0.proj.weight\n", "Shape: torch.Size([12288, 
3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 63, -36, -38, -33, 94], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.7.ff.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.7.ff.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-4.2915e-04, -2.0020e-02, -4.5598e-06, -1.0315e-02, -5.2734e-02])\n", "\n", "Key: transformer_blocks.7.ff.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.7.ff.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([ 67, -32, 82, -46, -60], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.7.ff.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.7.ff_context.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0081, 0.0004, 0.0153, -0.0036, 0.0101])\n", "\n", "Key: transformer_blocks.7.ff_context.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.7.ff_context.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([94, 94, 81, 90, 90], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.7.ff_context.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.7.ff_context.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0153, -0.0145, -0.0173, -0.0679, 0.0349])\n", "\n", "Key: transformer_blocks.7.ff_context.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few 
values: tensor([1.])\n", "\n", "Key: transformer_blocks.7.ff_context.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([-34, 91, -44, 97, -63], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.7.ff_context.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.7.norm1.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0007, 0.0010, 0.0022, -0.0023, 0.0025])\n", "\n", "Key: transformer_blocks.7.norm1.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.7.norm1.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 32, -46, -52, -53, -54], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.7.norm1.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.7.norm1_context.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0008, 0.0020, -0.0043, -0.0008, -0.0054])\n", "\n", "Key: transformer_blocks.7.norm1_context.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.7.norm1_context.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-52, 65, -47, -72, 91], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.7.norm1_context.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.8.attn.add_k_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0342, 0.0352, -0.0388, 0.0110, 0.0432])\n", "\n", "Key: 
transformer_blocks.8.attn.add_k_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.8.attn.add_k_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-46, -46, -40, 92, -54], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.8.attn.add_k_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.8.attn.add_q_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.1270, 0.0342, 0.0547, 0.0532, -0.0071])\n", "\n", "Key: transformer_blocks.8.attn.add_q_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.8.attn.add_q_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 83, 87, 76, -44, 80], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.8.attn.add_q_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.8.attn.add_v_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0023, 0.0417, -0.0222, 0.0284, 0.0017])\n", "\n", "Key: transformer_blocks.8.attn.add_v_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.8.attn.add_v_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 81, -51, 97, -64, 89], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.8.attn.add_v_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.8.attn.norm_added_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", 
"First few values: tensor([1.0312, 1.0859, 1.0859, 0.9062, 0.8320])\n", "\n", "Key: transformer_blocks.8.attn.norm_added_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.9336, 0.9922, 0.9648, 1.0859, 0.8438])\n", "\n", "Key: transformer_blocks.8.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.0391, 0.9531, 0.9844, 1.0625, 0.9766])\n", "\n", "Key: transformer_blocks.8.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.1250, 1.1094, 1.1172, 0.9102, 0.9531])\n", "\n", "Key: transformer_blocks.8.attn.to_add_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0206, 0.0070, -0.0155, -0.0315, 0.0505])\n", "\n", "Key: transformer_blocks.8.attn.to_add_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.8.attn.to_add_out.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 81, 64, -51, 101, 58], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.8.attn.to_add_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.8.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0344, -0.0327, -0.0156, -0.0099, -0.0317])\n", "\n", "Key: transformer_blocks.8.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.8.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-53, -37, -37, -48, -59], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.8.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", 
"\n", "Key: transformer_blocks.8.attn.to_out.0.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0002, -0.0104, 0.0010, -0.0154, -0.0221])\n", "\n", "Key: transformer_blocks.8.attn.to_out.0.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.8.attn.to_out.0.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 86, -45, 65, -39, -38], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.8.attn.to_out.0.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.8.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0144, 0.0157, 0.0087, -0.0225, 0.0315])\n", "\n", "Key: transformer_blocks.8.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.8.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-33, 97, -37, -34, 62], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.8.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.8.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 1.7624e-03, 3.5095e-03, 3.6621e-03, -2.5630e-05, 1.2512e-03])\n", "\n", "Key: transformer_blocks.8.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.8.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-37, 83, 51, -39, -55], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.8.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: 
torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.8.ff.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0186, -0.0153, -0.0090, 0.0143, -0.0022])\n", "\n", "Key: transformer_blocks.8.ff.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.8.ff.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-46, 75, 98, -39, -47], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.8.ff.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.8.ff.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0074, 0.0115, -0.0027, 0.0011, 0.0170])\n", "\n", "Key: transformer_blocks.8.ff.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.8.ff.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([ 92, -37, -43, -50, -34], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.8.ff.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.8.ff_context.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0194, -0.0031, -0.0309, 0.0010, -0.0162])\n", "\n", "Key: transformer_blocks.8.ff_context.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.8.ff_context.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 75, 85, 74, -59, 59], dtype=torch.int8)\n", "\n", "Key: 
transformer_blocks.8.ff_context.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.8.ff_context.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0132, -0.0231, -0.0084, -0.0674, 0.0479])\n", "\n", "Key: transformer_blocks.8.ff_context.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.8.ff_context.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([-40, 97, 63, -64, -44], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.8.ff_context.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.8.norm1.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0067, 0.0006, 0.0001, -0.0022, -0.0015])\n", "\n", "Key: transformer_blocks.8.norm1.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.8.norm1.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-44, 82, -43, 73, 45], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.8.norm1.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.8.norm1_context.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0009, 0.0031, 0.0082, 0.0017, 0.0034])\n", "\n", "Key: transformer_blocks.8.norm1_context.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.8.norm1_context.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", 
"Dtype: torch.int8\n", "First few values: tensor([-70, -54, -61, 86, 82], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.8.norm1_context.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.9.attn.add_k_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0361, 0.0060, 0.0153, 0.0221, -0.1445])\n", "\n", "Key: transformer_blocks.9.attn.add_k_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.9.attn.add_k_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 91, -39, 77, -52, 79], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.9.attn.add_k_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.9.attn.add_q_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0923, -0.0623, 0.0344, -0.0060, 0.1611])\n", "\n", "Key: transformer_blocks.9.attn.add_q_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.9.attn.add_q_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 87, 76, 89, -62, 83], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.9.attn.add_q_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.9.attn.add_v_proj.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0118, 0.0300, 0.0381, 0.0282, -0.0023])\n", "\n", "Key: transformer_blocks.9.attn.add_v_proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", 
"Key: transformer_blocks.9.attn.add_v_proj.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-28, -37, -62, 81, 53], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.9.attn.add_v_proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.9.attn.norm_added_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.9805, 1.0312, 0.9062, 1.0078, 0.9688])\n", "\n", "Key: transformer_blocks.9.attn.norm_added_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([0.7969, 0.7578, 0.7578, 0.8672, 0.8203])\n", "\n", "Key: transformer_blocks.9.attn.norm_k.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.0625, 1.0703, 1.1016, 1.1094, 1.0625])\n", "\n", "Key: transformer_blocks.9.attn.norm_q.weight\n", "Shape: torch.Size([128])\n", "Dtype: torch.float32\n", "First few values: tensor([1.1953, 1.2812, 1.2578, 1.2500, 1.1875])\n", "\n", "Key: transformer_blocks.9.attn.to_add_out.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0051, 0.0262, 0.0119, -0.0019, 0.0067])\n", "\n", "Key: transformer_blocks.9.attn.to_add_out.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.9.attn.to_add_out.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 98, -62, 80, -40, -43], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.9.attn.to_add_out.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.9.attn.to_k.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0188, 0.0503, 0.0554, -0.0250, 0.0859])\n", "\n", "Key: 
transformer_blocks.9.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.9.attn.to_k.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 92, 91, 82, -60, -41], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.9.attn.to_k.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.9.attn.to_out.0.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0264, 0.0184, -0.0082, -0.0128, 0.0234])\n", "\n", "Key: transformer_blocks.9.attn.to_out.0.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.9.attn.to_out.0.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-42, -47, -46, -39, -41], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.9.attn.to_out.0.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.9.attn.to_q.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0210, -0.1709, 0.0132, 0.0059, -0.0037])\n", "\n", "Key: transformer_blocks.9.attn.to_q.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.9.attn.to_q.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 82, -64, -89, 80, 90], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.9.attn.to_q.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.9.attn.to_v.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0286, 0.0095, 0.0173, 
0.0065, -0.0093])\n", "\n", "Key: transformer_blocks.9.attn.to_v.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.9.attn.to_v.weight\n", "Shape: torch.Size([3072, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-73, 87, -46, 66, 77], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.9.attn.to_v.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.9.ff.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0312, -0.0021, -0.0060, -0.0026, -0.0042])\n", "\n", "Key: transformer_blocks.9.ff.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.9.ff.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-49, 86, 90, 94, -34], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.9.ff.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.9.ff.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([ 0.0064, 0.0131, -0.0038, -0.0161, 0.0217])\n", "\n", "Key: transformer_blocks.9.ff.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.9.ff.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([ 78, -44, -62, -60, -48], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.9.ff.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.9.ff_context.net.0.proj.bias\n", "Shape: torch.Size([12288])\n", "Dtype: torch.float32\n", "First 
few values: tensor([0.0286, 0.0112, 0.0071, 0.0182, 0.0142])\n", "\n", "Key: transformer_blocks.9.ff_context.net.0.proj.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.9.ff_context.net.0.proj.weight\n", "Shape: torch.Size([12288, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([-39, 60, -82, 90, 65], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.9.ff_context.net.0.proj.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.9.ff_context.net.2.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0254, -0.0164, -0.0009, -0.0688, -0.0579])\n", "\n", "Key: transformer_blocks.9.ff_context.net.2.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.9.ff_context.net.2.weight\n", "Shape: torch.Size([3072, 12288])\n", "Dtype: torch.int8\n", "First few values: tensor([ 97, 95, -37, -44, -44], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.9.ff_context.net.2.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: transformer_blocks.9.norm1.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([-7.8583e-04, 4.2343e-04, -6.8665e-05, 1.9684e-03, -7.0572e-04])\n", "\n", "Key: transformer_blocks.9.norm1.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.9.norm1.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ 70, -52, -50, 84, 83], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.9.norm1.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", 
"\n", "Key: transformer_blocks.9.norm1_context.linear.bias\n", "Shape: torch.Size([18432])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0036, -0.0057, -0.0037, -0.0005, 0.0004])\n", "\n", "Key: transformer_blocks.9.norm1_context.linear.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Key: transformer_blocks.9.norm1_context.linear.weight\n", "Shape: torch.Size([18432, 3072])\n", "Dtype: torch.int8\n", "First few values: tensor([ -45, -102, 81, 76, -39], dtype=torch.int8)\n", "\n", "Key: transformer_blocks.9.norm1_context.linear.weight_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0022])\n", "\n", "Key: x_embedder.bias\n", "Shape: torch.Size([3072])\n", "Dtype: torch.float32\n", "First few values: tensor([0.0091, 0.0091, 0.0009, 0.0047, 0.0031])\n", "\n", "Key: x_embedder.weight\n", "Shape: torch.Size([3072, 64])\n", "Dtype: torch.float32\n", "First few values: tensor([-0.0304, 0.0195, -0.0260, 0.0249, -0.0422])\n", "\n", "Loading specific tensor:\n", "\n", "Key: single_transformer_blocks.1.attn.to_k.in_scale\n", "Shape: torch.Size([1])\n", "Dtype: torch.float32\n", "First few values: tensor([1.])\n", "\n", "Alternative method - loading entire file:\n", "Available keys: ['context_embedder.bias', 'context_embedder.weight', 'single_transformer_blocks.0.attn.norm_k.weight', 'single_transformer_blocks.0.attn.norm_q.weight', 'single_transformer_blocks.0.attn.to_k.bias', 'single_transformer_blocks.0.attn.to_k.in_scale', 'single_transformer_blocks.0.attn.to_k.weight', 'single_transformer_blocks.0.attn.to_k.weight_scale', 'single_transformer_blocks.0.attn.to_q.bias', 'single_transformer_blocks.0.attn.to_q.in_scale', 'single_transformer_blocks.0.attn.to_q.weight', 'single_transformer_blocks.0.attn.to_q.weight_scale', 'single_transformer_blocks.0.attn.to_v.bias', 'single_transformer_blocks.0.attn.to_v.in_scale', 
'single_transformer_blocks.0.attn.to_v.weight', 'single_transformer_blocks.0.attn.to_v.weight_scale', 'single_transformer_blocks.0.norm.linear.bias', 'single_transformer_blocks.0.norm.linear.in_scale', 'single_transformer_blocks.0.norm.linear.weight', 'single_transformer_blocks.0.norm.linear.weight_scale', 'single_transformer_blocks.0.proj_mlp.bias', 'single_transformer_blocks.0.proj_mlp.in_scale', 'single_transformer_blocks.0.proj_mlp.weight', 'single_transformer_blocks.0.proj_mlp.weight_scale', 'single_transformer_blocks.0.proj_out.bias', 'single_transformer_blocks.0.proj_out.in_scale', 'single_transformer_blocks.0.proj_out.weight', 'single_transformer_blocks.0.proj_out.weight_scale', 'single_transformer_blocks.1.attn.norm_k.weight', 'single_transformer_blocks.1.attn.norm_q.weight', 'single_transformer_blocks.1.attn.to_k.bias', 'single_transformer_blocks.1.attn.to_k.in_scale', 'single_transformer_blocks.1.attn.to_k.weight', 'single_transformer_blocks.1.attn.to_k.weight_scale', 'single_transformer_blocks.1.attn.to_q.bias', 'single_transformer_blocks.1.attn.to_q.in_scale', 'single_transformer_blocks.1.attn.to_q.weight', 'single_transformer_blocks.1.attn.to_q.weight_scale', 'single_transformer_blocks.1.attn.to_v.bias', 'single_transformer_blocks.1.attn.to_v.in_scale', 'single_transformer_blocks.1.attn.to_v.weight', 'single_transformer_blocks.1.attn.to_v.weight_scale', 'single_transformer_blocks.1.norm.linear.bias', 'single_transformer_blocks.1.norm.linear.in_scale', 'single_transformer_blocks.1.norm.linear.weight', 'single_transformer_blocks.1.norm.linear.weight_scale', 'single_transformer_blocks.1.proj_mlp.bias', 'single_transformer_blocks.1.proj_mlp.in_scale', 'single_transformer_blocks.1.proj_mlp.weight', 'single_transformer_blocks.1.proj_mlp.weight_scale', 'single_transformer_blocks.1.proj_out.bias', 'single_transformer_blocks.1.proj_out.in_scale', 'single_transformer_blocks.1.proj_out.weight', 'single_transformer_blocks.1.proj_out.weight_scale', 
'single_transformer_blocks.10.attn.norm_k.weight', 'single_transformer_blocks.10.attn.norm_q.weight', 'single_transformer_blocks.10.attn.to_k.bias', 'single_transformer_blocks.10.attn.to_k.in_scale', 'single_transformer_blocks.10.attn.to_k.weight', 'single_transformer_blocks.10.attn.to_k.weight_scale', 'single_transformer_blocks.10.attn.to_q.bias', 'single_transformer_blocks.10.attn.to_q.in_scale', 'single_transformer_blocks.10.attn.to_q.weight', 'single_transformer_blocks.10.attn.to_q.weight_scale', 'single_transformer_blocks.10.attn.to_v.bias', 'single_transformer_blocks.10.attn.to_v.in_scale', 'single_transformer_blocks.10.attn.to_v.weight', 'single_transformer_blocks.10.attn.to_v.weight_scale', 'single_transformer_blocks.10.norm.linear.bias', 'single_transformer_blocks.10.norm.linear.in_scale', 'single_transformer_blocks.10.norm.linear.weight', 'single_transformer_blocks.10.norm.linear.weight_scale', 'single_transformer_blocks.10.proj_mlp.bias', 'single_transformer_blocks.10.proj_mlp.in_scale', 'single_transformer_blocks.10.proj_mlp.weight', 'single_transformer_blocks.10.proj_mlp.weight_scale', 'single_transformer_blocks.10.proj_out.bias', 'single_transformer_blocks.10.proj_out.in_scale', 'single_transformer_blocks.10.proj_out.weight', 'single_transformer_blocks.10.proj_out.weight_scale', 'single_transformer_blocks.11.attn.norm_k.weight', 'single_transformer_blocks.11.attn.norm_q.weight', 'single_transformer_blocks.11.attn.to_k.bias', 'single_transformer_blocks.11.attn.to_k.in_scale', 'single_transformer_blocks.11.attn.to_k.weight', 'single_transformer_blocks.11.attn.to_k.weight_scale', 'single_transformer_blocks.11.attn.to_q.bias', 'single_transformer_blocks.11.attn.to_q.in_scale', 'single_transformer_blocks.11.attn.to_q.weight', 'single_transformer_blocks.11.attn.to_q.weight_scale', 'single_transformer_blocks.11.attn.to_v.bias', 'single_transformer_blocks.11.attn.to_v.in_scale', 'single_transformer_blocks.11.attn.to_v.weight', 
'single_transformer_blocks.11.attn.to_v.weight_scale', 'single_transformer_blocks.11.norm.linear.bias', 'single_transformer_blocks.11.norm.linear.in_scale', 'single_transformer_blocks.11.norm.linear.weight', 'single_transformer_blocks.11.norm.linear.weight_scale', 'single_transformer_blocks.11.proj_mlp.bias', 'single_transformer_blocks.11.proj_mlp.in_scale', 'single_transformer_blocks.11.proj_mlp.weight', 'single_transformer_blocks.11.proj_mlp.weight_scale', 'single_transformer_blocks.11.proj_out.bias', 'single_transformer_blocks.11.proj_out.in_scale', 'single_transformer_blocks.11.proj_out.weight', 'single_transformer_blocks.11.proj_out.weight_scale', 'single_transformer_blocks.12.attn.norm_k.weight', 'single_transformer_blocks.12.attn.norm_q.weight', 'single_transformer_blocks.12.attn.to_k.bias', 'single_transformer_blocks.12.attn.to_k.in_scale', 'single_transformer_blocks.12.attn.to_k.weight', 'single_transformer_blocks.12.attn.to_k.weight_scale', 'single_transformer_blocks.12.attn.to_q.bias', 'single_transformer_blocks.12.attn.to_q.in_scale', 'single_transformer_blocks.12.attn.to_q.weight', 'single_transformer_blocks.12.attn.to_q.weight_scale', 'single_transformer_blocks.12.attn.to_v.bias', 'single_transformer_blocks.12.attn.to_v.in_scale', 'single_transformer_blocks.12.attn.to_v.weight', 'single_transformer_blocks.12.attn.to_v.weight_scale', 'single_transformer_blocks.12.norm.linear.bias', 'single_transformer_blocks.12.norm.linear.in_scale', 'single_transformer_blocks.12.norm.linear.weight', 'single_transformer_blocks.12.norm.linear.weight_scale', 'single_transformer_blocks.12.proj_mlp.bias', 'single_transformer_blocks.12.proj_mlp.in_scale', 'single_transformer_blocks.12.proj_mlp.weight', 'single_transformer_blocks.12.proj_mlp.weight_scale', 'single_transformer_blocks.12.proj_out.bias', 'single_transformer_blocks.12.proj_out.in_scale', 'single_transformer_blocks.12.proj_out.weight', 'single_transformer_blocks.12.proj_out.weight_scale', 
'single_transformer_blocks.13.attn.norm_k.weight', 'single_transformer_blocks.13.attn.norm_q.weight', 'single_transformer_blocks.13.attn.to_k.bias', 'single_transformer_blocks.13.attn.to_k.in_scale', 'single_transformer_blocks.13.attn.to_k.weight', 'single_transformer_blocks.13.attn.to_k.weight_scale', 'single_transformer_blocks.13.attn.to_q.bias', 'single_transformer_blocks.13.attn.to_q.in_scale', 'single_transformer_blocks.13.attn.to_q.weight', 'single_transformer_blocks.13.attn.to_q.weight_scale', 'single_transformer_blocks.13.attn.to_v.bias', 'single_transformer_blocks.13.attn.to_v.in_scale', 'single_transformer_blocks.13.attn.to_v.weight', 'single_transformer_blocks.13.attn.to_v.weight_scale', 'single_transformer_blocks.13.norm.linear.bias', 'single_transformer_blocks.13.norm.linear.in_scale', 'single_transformer_blocks.13.norm.linear.weight', 'single_transformer_blocks.13.norm.linear.weight_scale', 'single_transformer_blocks.13.proj_mlp.bias', 'single_transformer_blocks.13.proj_mlp.in_scale', 'single_transformer_blocks.13.proj_mlp.weight', 'single_transformer_blocks.13.proj_mlp.weight_scale', 'single_transformer_blocks.13.proj_out.bias', 'single_transformer_blocks.13.proj_out.in_scale', 'single_transformer_blocks.13.proj_out.weight', 'single_transformer_blocks.13.proj_out.weight_scale', 'single_transformer_blocks.14.attn.norm_k.weight', 'single_transformer_blocks.14.attn.norm_q.weight', 'single_transformer_blocks.14.attn.to_k.bias', 'single_transformer_blocks.14.attn.to_k.in_scale', 'single_transformer_blocks.14.attn.to_k.weight', 'single_transformer_blocks.14.attn.to_k.weight_scale', 'single_transformer_blocks.14.attn.to_q.bias', 'single_transformer_blocks.14.attn.to_q.in_scale', 'single_transformer_blocks.14.attn.to_q.weight', 'single_transformer_blocks.14.attn.to_q.weight_scale', 'single_transformer_blocks.14.attn.to_v.bias', 'single_transformer_blocks.14.attn.to_v.in_scale', 'single_transformer_blocks.14.attn.to_v.weight', 
'single_transformer_blocks.14.attn.to_v.weight_scale', 'single_transformer_blocks.14.norm.linear.bias', 'single_transformer_blocks.14.norm.linear.in_scale', 'single_transformer_blocks.14.norm.linear.weight', 'single_transformer_blocks.14.norm.linear.weight_scale', 'single_transformer_blocks.14.proj_mlp.bias', 'single_transformer_blocks.14.proj_mlp.in_scale', 'single_transformer_blocks.14.proj_mlp.weight', 'single_transformer_blocks.14.proj_mlp.weight_scale', 'single_transformer_blocks.14.proj_out.bias', 'single_transformer_blocks.14.proj_out.in_scale', 'single_transformer_blocks.14.proj_out.weight', 'single_transformer_blocks.14.proj_out.weight_scale', 'single_transformer_blocks.15.attn.norm_k.weight', 'single_transformer_blocks.15.attn.norm_q.weight', 'single_transformer_blocks.15.attn.to_k.bias', 'single_transformer_blocks.15.attn.to_k.in_scale', 'single_transformer_blocks.15.attn.to_k.weight', 'single_transformer_blocks.15.attn.to_k.weight_scale', 'single_transformer_blocks.15.attn.to_q.bias', 'single_transformer_blocks.15.attn.to_q.in_scale', 'single_transformer_blocks.15.attn.to_q.weight', 'single_transformer_blocks.15.attn.to_q.weight_scale', 'single_transformer_blocks.15.attn.to_v.bias', 'single_transformer_blocks.15.attn.to_v.in_scale', 'single_transformer_blocks.15.attn.to_v.weight', 'single_transformer_blocks.15.attn.to_v.weight_scale', 'single_transformer_blocks.15.norm.linear.bias', 'single_transformer_blocks.15.norm.linear.in_scale', 'single_transformer_blocks.15.norm.linear.weight', 'single_transformer_blocks.15.norm.linear.weight_scale', 'single_transformer_blocks.15.proj_mlp.bias', 'single_transformer_blocks.15.proj_mlp.in_scale', 'single_transformer_blocks.15.proj_mlp.weight', 'single_transformer_blocks.15.proj_mlp.weight_scale', 'single_transformer_blocks.15.proj_out.bias', 'single_transformer_blocks.15.proj_out.in_scale', 'single_transformer_blocks.15.proj_out.weight', 'single_transformer_blocks.15.proj_out.weight_scale', 
'single_transformer_blocks.16.attn.norm_k.weight', 'single_transformer_blocks.16.attn.norm_q.weight', 'single_transformer_blocks.16.attn.to_k.bias', 'single_transformer_blocks.16.attn.to_k.in_scale', 'single_transformer_blocks.16.attn.to_k.weight', 'single_transformer_blocks.16.attn.to_k.weight_scale', 'single_transformer_blocks.16.attn.to_q.bias', 'single_transformer_blocks.16.attn.to_q.in_scale', 'single_transformer_blocks.16.attn.to_q.weight', 'single_transformer_blocks.16.attn.to_q.weight_scale', 'single_transformer_blocks.16.attn.to_v.bias', 'single_transformer_blocks.16.attn.to_v.in_scale', 'single_transformer_blocks.16.attn.to_v.weight', 'single_transformer_blocks.16.attn.to_v.weight_scale', 'single_transformer_blocks.16.norm.linear.bias', 'single_transformer_blocks.16.norm.linear.in_scale', 'single_transformer_blocks.16.norm.linear.weight', 'single_transformer_blocks.16.norm.linear.weight_scale', 'single_transformer_blocks.16.proj_mlp.bias', 'single_transformer_blocks.16.proj_mlp.in_scale', 'single_transformer_blocks.16.proj_mlp.weight', 'single_transformer_blocks.16.proj_mlp.weight_scale', 'single_transformer_blocks.16.proj_out.bias', 'single_transformer_blocks.16.proj_out.in_scale', 'single_transformer_blocks.16.proj_out.weight', 'single_transformer_blocks.16.proj_out.weight_scale', 'single_transformer_blocks.17.attn.norm_k.weight', 'single_transformer_blocks.17.attn.norm_q.weight', 'single_transformer_blocks.17.attn.to_k.bias', 'single_transformer_blocks.17.attn.to_k.in_scale', 'single_transformer_blocks.17.attn.to_k.weight', 'single_transformer_blocks.17.attn.to_k.weight_scale', 'single_transformer_blocks.17.attn.to_q.bias', 'single_transformer_blocks.17.attn.to_q.in_scale', 'single_transformer_blocks.17.attn.to_q.weight', 'single_transformer_blocks.17.attn.to_q.weight_scale', 'single_transformer_blocks.17.attn.to_v.bias', 'single_transformer_blocks.17.attn.to_v.in_scale', 'single_transformer_blocks.17.attn.to_v.weight', 
'single_transformer_blocks.17.attn.to_v.weight_scale', 'single_transformer_blocks.17.norm.linear.bias', 'single_transformer_blocks.17.norm.linear.in_scale', 'single_transformer_blocks.17.norm.linear.weight', 'single_transformer_blocks.17.norm.linear.weight_scale', 'single_transformer_blocks.17.proj_mlp.bias', 'single_transformer_blocks.17.proj_mlp.in_scale', 'single_transformer_blocks.17.proj_mlp.weight', 'single_transformer_blocks.17.proj_mlp.weight_scale', 'single_transformer_blocks.17.proj_out.bias', 'single_transformer_blocks.17.proj_out.in_scale', 'single_transformer_blocks.17.proj_out.weight', 'single_transformer_blocks.17.proj_out.weight_scale', 'single_transformer_blocks.18.attn.norm_k.weight', 'single_transformer_blocks.18.attn.norm_q.weight', 'single_transformer_blocks.18.attn.to_k.bias', 'single_transformer_blocks.18.attn.to_k.in_scale', 'single_transformer_blocks.18.attn.to_k.weight', 'single_transformer_blocks.18.attn.to_k.weight_scale', 'single_transformer_blocks.18.attn.to_q.bias', 'single_transformer_blocks.18.attn.to_q.in_scale', 'single_transformer_blocks.18.attn.to_q.weight', 'single_transformer_blocks.18.attn.to_q.weight_scale', 'single_transformer_blocks.18.attn.to_v.bias', 'single_transformer_blocks.18.attn.to_v.in_scale', 'single_transformer_blocks.18.attn.to_v.weight', 'single_transformer_blocks.18.attn.to_v.weight_scale', 'single_transformer_blocks.18.norm.linear.bias', 'single_transformer_blocks.18.norm.linear.in_scale', 'single_transformer_blocks.18.norm.linear.weight', 'single_transformer_blocks.18.norm.linear.weight_scale', 'single_transformer_blocks.18.proj_mlp.bias', 'single_transformer_blocks.18.proj_mlp.in_scale', 'single_transformer_blocks.18.proj_mlp.weight', 'single_transformer_blocks.18.proj_mlp.weight_scale', 'single_transformer_blocks.18.proj_out.bias', 'single_transformer_blocks.18.proj_out.in_scale', 'single_transformer_blocks.18.proj_out.weight', 'single_transformer_blocks.18.proj_out.weight_scale', 
'single_transformer_blocks.19.attn.norm_k.weight', 'single_transformer_blocks.19.attn.norm_q.weight', 'single_transformer_blocks.19.attn.to_k.bias', 'single_transformer_blocks.19.attn.to_k.in_scale', 'single_transformer_blocks.19.attn.to_k.weight', 'single_transformer_blocks.19.attn.to_k.weight_scale', 'single_transformer_blocks.19.attn.to_q.bias', 'single_transformer_blocks.19.attn.to_q.in_scale', 'single_transformer_blocks.19.attn.to_q.weight', 'single_transformer_blocks.19.attn.to_q.weight_scale', 'single_transformer_blocks.19.attn.to_v.bias', 'single_transformer_blocks.19.attn.to_v.in_scale', 'single_transformer_blocks.19.attn.to_v.weight', 'single_transformer_blocks.19.attn.to_v.weight_scale', 'single_transformer_blocks.19.norm.linear.bias', 'single_transformer_blocks.19.norm.linear.in_scale', 'single_transformer_blocks.19.norm.linear.weight', 'single_transformer_blocks.19.norm.linear.weight_scale', 'single_transformer_blocks.19.proj_mlp.bias', 'single_transformer_blocks.19.proj_mlp.in_scale', 'single_transformer_blocks.19.proj_mlp.weight', 'single_transformer_blocks.19.proj_mlp.weight_scale', 'single_transformer_blocks.19.proj_out.bias', 'single_transformer_blocks.19.proj_out.in_scale', 'single_transformer_blocks.19.proj_out.weight', 'single_transformer_blocks.19.proj_out.weight_scale', 'single_transformer_blocks.2.attn.norm_k.weight', 'single_transformer_blocks.2.attn.norm_q.weight', 'single_transformer_blocks.2.attn.to_k.bias', 'single_transformer_blocks.2.attn.to_k.in_scale', 'single_transformer_blocks.2.attn.to_k.weight', 'single_transformer_blocks.2.attn.to_k.weight_scale', 'single_transformer_blocks.2.attn.to_q.bias', 'single_transformer_blocks.2.attn.to_q.in_scale', 'single_transformer_blocks.2.attn.to_q.weight', 'single_transformer_blocks.2.attn.to_q.weight_scale', 'single_transformer_blocks.2.attn.to_v.bias', 'single_transformer_blocks.2.attn.to_v.in_scale', 'single_transformer_blocks.2.attn.to_v.weight', 
'single_transformer_blocks.2.attn.to_v.weight_scale', 'single_transformer_blocks.2.norm.linear.bias', 'single_transformer_blocks.2.norm.linear.in_scale', 'single_transformer_blocks.2.norm.linear.weight', 'single_transformer_blocks.2.norm.linear.weight_scale', 'single_transformer_blocks.2.proj_mlp.bias', 'single_transformer_blocks.2.proj_mlp.in_scale', 'single_transformer_blocks.2.proj_mlp.weight', 'single_transformer_blocks.2.proj_mlp.weight_scale', 'single_transformer_blocks.2.proj_out.bias', 'single_transformer_blocks.2.proj_out.in_scale', 'single_transformer_blocks.2.proj_out.weight', 'single_transformer_blocks.2.proj_out.weight_scale', 'single_transformer_blocks.20.attn.norm_k.weight', 'single_transformer_blocks.20.attn.norm_q.weight', 'single_transformer_blocks.20.attn.to_k.bias', 'single_transformer_blocks.20.attn.to_k.in_scale', 'single_transformer_blocks.20.attn.to_k.weight', 'single_transformer_blocks.20.attn.to_k.weight_scale', 'single_transformer_blocks.20.attn.to_q.bias', 'single_transformer_blocks.20.attn.to_q.in_scale', 'single_transformer_blocks.20.attn.to_q.weight', 'single_transformer_blocks.20.attn.to_q.weight_scale', 'single_transformer_blocks.20.attn.to_v.bias', 'single_transformer_blocks.20.attn.to_v.in_scale', 'single_transformer_blocks.20.attn.to_v.weight', 'single_transformer_blocks.20.attn.to_v.weight_scale', 'single_transformer_blocks.20.norm.linear.bias', 'single_transformer_blocks.20.norm.linear.in_scale', 'single_transformer_blocks.20.norm.linear.weight', 'single_transformer_blocks.20.norm.linear.weight_scale', 'single_transformer_blocks.20.proj_mlp.bias', 'single_transformer_blocks.20.proj_mlp.in_scale', 'single_transformer_blocks.20.proj_mlp.weight', 'single_transformer_blocks.20.proj_mlp.weight_scale', 'single_transformer_blocks.20.proj_out.bias', 'single_transformer_blocks.20.proj_out.in_scale', 'single_transformer_blocks.20.proj_out.weight', 'single_transformer_blocks.20.proj_out.weight_scale', 
'single_transformer_blocks.21.attn.norm_k.weight', 'single_transformer_blocks.21.attn.norm_q.weight', 'single_transformer_blocks.21.attn.to_k.bias', 'single_transformer_blocks.21.attn.to_k.in_scale', 'single_transformer_blocks.21.attn.to_k.weight', 'single_transformer_blocks.21.attn.to_k.weight_scale', 'single_transformer_blocks.21.attn.to_q.bias', 'single_transformer_blocks.21.attn.to_q.in_scale', 'single_transformer_blocks.21.attn.to_q.weight', 'single_transformer_blocks.21.attn.to_q.weight_scale', 'single_transformer_blocks.21.attn.to_v.bias', 'single_transformer_blocks.21.attn.to_v.in_scale', 'single_transformer_blocks.21.attn.to_v.weight', 'single_transformer_blocks.21.attn.to_v.weight_scale', 'single_transformer_blocks.21.norm.linear.bias', 'single_transformer_blocks.21.norm.linear.in_scale', 'single_transformer_blocks.21.norm.linear.weight', 'single_transformer_blocks.21.norm.linear.weight_scale', 'single_transformer_blocks.21.proj_mlp.bias', 'single_transformer_blocks.21.proj_mlp.in_scale', 'single_transformer_blocks.21.proj_mlp.weight', 'single_transformer_blocks.21.proj_mlp.weight_scale', 'single_transformer_blocks.21.proj_out.bias', 'single_transformer_blocks.21.proj_out.in_scale', 'single_transformer_blocks.21.proj_out.weight', 'single_transformer_blocks.21.proj_out.weight_scale', 'single_transformer_blocks.22.attn.norm_k.weight', 'single_transformer_blocks.22.attn.norm_q.weight', 'single_transformer_blocks.22.attn.to_k.bias', 'single_transformer_blocks.22.attn.to_k.in_scale', 'single_transformer_blocks.22.attn.to_k.weight', 'single_transformer_blocks.22.attn.to_k.weight_scale', 'single_transformer_blocks.22.attn.to_q.bias', 'single_transformer_blocks.22.attn.to_q.in_scale', 'single_transformer_blocks.22.attn.to_q.weight', 'single_transformer_blocks.22.attn.to_q.weight_scale', 'single_transformer_blocks.22.attn.to_v.bias', 'single_transformer_blocks.22.attn.to_v.in_scale', 'single_transformer_blocks.22.attn.to_v.weight', 
'single_transformer_blocks.22.attn.to_v.weight_scale', 'single_transformer_blocks.22.norm.linear.bias', 'single_transformer_blocks.22.norm.linear.in_scale', 'single_transformer_blocks.22.norm.linear.weight', 'single_transformer_blocks.22.norm.linear.weight_scale', 'single_transformer_blocks.22.proj_mlp.bias', 'single_transformer_blocks.22.proj_mlp.in_scale', 'single_transformer_blocks.22.proj_mlp.weight', 'single_transformer_blocks.22.proj_mlp.weight_scale', 'single_transformer_blocks.22.proj_out.bias', 'single_transformer_blocks.22.proj_out.in_scale', 'single_transformer_blocks.22.proj_out.weight', 'single_transformer_blocks.22.proj_out.weight_scale', 'single_transformer_blocks.23.norm.linear.bias', 'single_transformer_blocks.23.norm.linear.in_scale', 'single_transformer_blocks.23.norm.linear.weight', 'single_transformer_blocks.23.norm.linear.weight_scale', 'single_transformer_blocks.23.proj_mlp.bias', 'single_transformer_blocks.23.proj_mlp.in_scale', 'single_transformer_blocks.23.proj_mlp.weight', 'single_transformer_blocks.23.proj_mlp.weight_scale', 'single_transformer_blocks.3.attn.norm_k.weight', 'single_transformer_blocks.3.attn.norm_q.weight', 'single_transformer_blocks.3.attn.to_k.bias', 'single_transformer_blocks.3.attn.to_k.in_scale', 'single_transformer_blocks.3.attn.to_k.weight', 'single_transformer_blocks.3.attn.to_k.weight_scale', 'single_transformer_blocks.3.attn.to_q.bias', 'single_transformer_blocks.3.attn.to_q.in_scale', 'single_transformer_blocks.3.attn.to_q.weight', 'single_transformer_blocks.3.attn.to_q.weight_scale', 'single_transformer_blocks.3.attn.to_v.bias', 'single_transformer_blocks.3.attn.to_v.in_scale', 'single_transformer_blocks.3.attn.to_v.weight', 'single_transformer_blocks.3.attn.to_v.weight_scale', 'single_transformer_blocks.3.norm.linear.bias', 'single_transformer_blocks.3.norm.linear.in_scale', 'single_transformer_blocks.3.norm.linear.weight', 'single_transformer_blocks.3.norm.linear.weight_scale', 
'single_transformer_blocks.3.proj_mlp.bias', 'single_transformer_blocks.3.proj_mlp.in_scale', 'single_transformer_blocks.3.proj_mlp.weight', 'single_transformer_blocks.3.proj_mlp.weight_scale', 'single_transformer_blocks.3.proj_out.bias', 'single_transformer_blocks.3.proj_out.in_scale', 'single_transformer_blocks.3.proj_out.weight', 'single_transformer_blocks.3.proj_out.weight_scale', 'single_transformer_blocks.4.attn.norm_k.weight', 'single_transformer_blocks.4.attn.norm_q.weight', 'single_transformer_blocks.4.attn.to_k.bias', 'single_transformer_blocks.4.attn.to_k.in_scale', 'single_transformer_blocks.4.attn.to_k.weight', 'single_transformer_blocks.4.attn.to_k.weight_scale', 'single_transformer_blocks.4.attn.to_q.bias', 'single_transformer_blocks.4.attn.to_q.in_scale', 'single_transformer_blocks.4.attn.to_q.weight', 'single_transformer_blocks.4.attn.to_q.weight_scale', 'single_transformer_blocks.4.attn.to_v.bias', 'single_transformer_blocks.4.attn.to_v.in_scale', 'single_transformer_blocks.4.attn.to_v.weight', 'single_transformer_blocks.4.attn.to_v.weight_scale', 'single_transformer_blocks.4.norm.linear.bias', 'single_transformer_blocks.4.norm.linear.in_scale', 'single_transformer_blocks.4.norm.linear.weight', 'single_transformer_blocks.4.norm.linear.weight_scale', 'single_transformer_blocks.4.proj_mlp.bias', 'single_transformer_blocks.4.proj_mlp.in_scale', 'single_transformer_blocks.4.proj_mlp.weight', 'single_transformer_blocks.4.proj_mlp.weight_scale', 'single_transformer_blocks.4.proj_out.bias', 'single_transformer_blocks.4.proj_out.in_scale', 'single_transformer_blocks.4.proj_out.weight', 'single_transformer_blocks.4.proj_out.weight_scale', 'single_transformer_blocks.5.attn.norm_k.weight', 'single_transformer_blocks.5.attn.norm_q.weight', 'single_transformer_blocks.5.attn.to_k.bias', 'single_transformer_blocks.5.attn.to_k.in_scale', 'single_transformer_blocks.5.attn.to_k.weight', 'single_transformer_blocks.5.attn.to_k.weight_scale', 
'single_transformer_blocks.5.attn.to_q.bias', 'single_transformer_blocks.5.attn.to_q.in_scale', 'single_transformer_blocks.5.attn.to_q.weight', 'single_transformer_blocks.5.attn.to_q.weight_scale', 'single_transformer_blocks.5.attn.to_v.bias', 'single_transformer_blocks.5.attn.to_v.in_scale', 'single_transformer_blocks.5.attn.to_v.weight', 'single_transformer_blocks.5.attn.to_v.weight_scale', 'single_transformer_blocks.5.norm.linear.bias', 'single_transformer_blocks.5.norm.linear.in_scale', 'single_transformer_blocks.5.norm.linear.weight', 'single_transformer_blocks.5.norm.linear.weight_scale', 'single_transformer_blocks.5.proj_mlp.bias', 'single_transformer_blocks.5.proj_mlp.in_scale', 'single_transformer_blocks.5.proj_mlp.weight', 'single_transformer_blocks.5.proj_mlp.weight_scale', 'single_transformer_blocks.5.proj_out.bias', 'single_transformer_blocks.5.proj_out.in_scale', 'single_transformer_blocks.5.proj_out.weight', 'single_transformer_blocks.5.proj_out.weight_scale', 'single_transformer_blocks.6.attn.norm_k.weight', 'single_transformer_blocks.6.attn.norm_q.weight', 'single_transformer_blocks.6.attn.to_k.bias', 'single_transformer_blocks.6.attn.to_k.in_scale', 'single_transformer_blocks.6.attn.to_k.weight', 'single_transformer_blocks.6.attn.to_k.weight_scale', 'single_transformer_blocks.6.attn.to_q.bias', 'single_transformer_blocks.6.attn.to_q.in_scale', 'single_transformer_blocks.6.attn.to_q.weight', 'single_transformer_blocks.6.attn.to_q.weight_scale', 'single_transformer_blocks.6.attn.to_v.bias', 'single_transformer_blocks.6.attn.to_v.in_scale', 'single_transformer_blocks.6.attn.to_v.weight', 'single_transformer_blocks.6.attn.to_v.weight_scale', 'single_transformer_blocks.6.norm.linear.bias', 'single_transformer_blocks.6.norm.linear.in_scale', 'single_transformer_blocks.6.norm.linear.weight', 'single_transformer_blocks.6.norm.linear.weight_scale', 'single_transformer_blocks.6.proj_mlp.bias', 'single_transformer_blocks.6.proj_mlp.in_scale', 
'single_transformer_blocks.6.proj_mlp.weight', 'single_transformer_blocks.6.proj_mlp.weight_scale', 'single_transformer_blocks.6.proj_out.bias', 'single_transformer_blocks.6.proj_out.in_scale', 'single_transformer_blocks.6.proj_out.weight', 'single_transformer_blocks.6.proj_out.weight_scale', 'single_transformer_blocks.7.attn.norm_k.weight', 'single_transformer_blocks.7.attn.norm_q.weight', 'single_transformer_blocks.7.attn.to_k.bias', 'single_transformer_blocks.7.attn.to_k.in_scale', 'single_transformer_blocks.7.attn.to_k.weight', 'single_transformer_blocks.7.attn.to_k.weight_scale', 'single_transformer_blocks.7.attn.to_q.bias', 'single_transformer_blocks.7.attn.to_q.in_scale', 'single_transformer_blocks.7.attn.to_q.weight', 'single_transformer_blocks.7.attn.to_q.weight_scale', 'single_transformer_blocks.7.attn.to_v.bias', 'single_transformer_blocks.7.attn.to_v.in_scale', 'single_transformer_blocks.7.attn.to_v.weight', 'single_transformer_blocks.7.attn.to_v.weight_scale', 'single_transformer_blocks.7.norm.linear.bias', 'single_transformer_blocks.7.norm.linear.in_scale', 'single_transformer_blocks.7.norm.linear.weight', 'single_transformer_blocks.7.norm.linear.weight_scale', 'single_transformer_blocks.7.proj_mlp.bias', 'single_transformer_blocks.7.proj_mlp.in_scale', 'single_transformer_blocks.7.proj_mlp.weight', 'single_transformer_blocks.7.proj_mlp.weight_scale', 'single_transformer_blocks.7.proj_out.bias', 'single_transformer_blocks.7.proj_out.in_scale', 'single_transformer_blocks.7.proj_out.weight', 'single_transformer_blocks.7.proj_out.weight_scale', 'single_transformer_blocks.8.attn.norm_k.weight', 'single_transformer_blocks.8.attn.norm_q.weight', 'single_transformer_blocks.8.attn.to_k.bias', 'single_transformer_blocks.8.attn.to_k.in_scale', 'single_transformer_blocks.8.attn.to_k.weight', 'single_transformer_blocks.8.attn.to_k.weight_scale', 'single_transformer_blocks.8.attn.to_q.bias', 'single_transformer_blocks.8.attn.to_q.in_scale', 
'single_transformer_blocks.8.attn.to_q.weight', 'single_transformer_blocks.8.attn.to_q.weight_scale', 'single_transformer_blocks.8.attn.to_v.bias', 'single_transformer_blocks.8.attn.to_v.in_scale', 'single_transformer_blocks.8.attn.to_v.weight', 'single_transformer_blocks.8.attn.to_v.weight_scale', 'single_transformer_blocks.8.norm.linear.bias', 'single_transformer_blocks.8.norm.linear.in_scale', 'single_transformer_blocks.8.norm.linear.weight', 'single_transformer_blocks.8.norm.linear.weight_scale', 'single_transformer_blocks.8.proj_mlp.bias', 'single_transformer_blocks.8.proj_mlp.in_scale', 'single_transformer_blocks.8.proj_mlp.weight', 'single_transformer_blocks.8.proj_mlp.weight_scale', 'single_transformer_blocks.8.proj_out.bias', 'single_transformer_blocks.8.proj_out.in_scale', 'single_transformer_blocks.8.proj_out.weight', 'single_transformer_blocks.8.proj_out.weight_scale', 'single_transformer_blocks.9.attn.norm_k.weight', 'single_transformer_blocks.9.attn.norm_q.weight', 'single_transformer_blocks.9.attn.to_k.bias', 'single_transformer_blocks.9.attn.to_k.in_scale', 'single_transformer_blocks.9.attn.to_k.weight', 'single_transformer_blocks.9.attn.to_k.weight_scale', 'single_transformer_blocks.9.attn.to_q.bias', 'single_transformer_blocks.9.attn.to_q.in_scale', 'single_transformer_blocks.9.attn.to_q.weight', 'single_transformer_blocks.9.attn.to_q.weight_scale', 'single_transformer_blocks.9.attn.to_v.bias', 'single_transformer_blocks.9.attn.to_v.in_scale', 'single_transformer_blocks.9.attn.to_v.weight', 'single_transformer_blocks.9.attn.to_v.weight_scale', 'single_transformer_blocks.9.norm.linear.bias', 'single_transformer_blocks.9.norm.linear.in_scale', 'single_transformer_blocks.9.norm.linear.weight', 'single_transformer_blocks.9.norm.linear.weight_scale', 'single_transformer_blocks.9.proj_mlp.bias', 'single_transformer_blocks.9.proj_mlp.in_scale', 'single_transformer_blocks.9.proj_mlp.weight', 'single_transformer_blocks.9.proj_mlp.weight_scale', 
'single_transformer_blocks.9.proj_out.bias', 'single_transformer_blocks.9.proj_out.in_scale', 'single_transformer_blocks.9.proj_out.weight', 'single_transformer_blocks.9.proj_out.weight_scale', 'time_text_embed.guidance_embedder.linear_1.bias', 'time_text_embed.guidance_embedder.linear_1.weight', 'time_text_embed.guidance_embedder.linear_2.bias', 'time_text_embed.guidance_embedder.linear_2.weight', 'time_text_embed.text_embedder.linear_1.bias', 'time_text_embed.text_embedder.linear_1.weight', 'time_text_embed.text_embedder.linear_2.bias', 'time_text_embed.text_embedder.linear_2.weight', 'time_text_embed.timestep_embedder.linear_1.bias', 'time_text_embed.timestep_embedder.linear_1.weight', 'time_text_embed.timestep_embedder.linear_2.bias', 'time_text_embed.timestep_embedder.linear_2.weight', 'transformer_blocks.0.attn.add_k_proj.bias', 'transformer_blocks.0.attn.add_k_proj.in_scale', 'transformer_blocks.0.attn.add_k_proj.weight', 'transformer_blocks.0.attn.add_k_proj.weight_scale', 'transformer_blocks.0.attn.add_q_proj.bias', 'transformer_blocks.0.attn.add_q_proj.in_scale', 'transformer_blocks.0.attn.add_q_proj.weight', 'transformer_blocks.0.attn.add_q_proj.weight_scale', 'transformer_blocks.0.attn.add_v_proj.bias', 'transformer_blocks.0.attn.add_v_proj.in_scale', 'transformer_blocks.0.attn.add_v_proj.weight', 'transformer_blocks.0.attn.add_v_proj.weight_scale', 'transformer_blocks.0.attn.norm_added_k.weight', 'transformer_blocks.0.attn.norm_added_q.weight', 'transformer_blocks.0.attn.norm_k.weight', 'transformer_blocks.0.attn.norm_q.weight', 'transformer_blocks.0.attn.to_add_out.bias', 'transformer_blocks.0.attn.to_add_out.in_scale', 'transformer_blocks.0.attn.to_add_out.weight', 'transformer_blocks.0.attn.to_add_out.weight_scale', 'transformer_blocks.0.attn.to_k.bias', 'transformer_blocks.0.attn.to_k.in_scale', 'transformer_blocks.0.attn.to_k.weight', 'transformer_blocks.0.attn.to_k.weight_scale', 'transformer_blocks.0.attn.to_out.0.bias', 
'transformer_blocks.0.attn.to_out.0.in_scale', 'transformer_blocks.0.attn.to_out.0.weight', 'transformer_blocks.0.attn.to_out.0.weight_scale', 'transformer_blocks.0.attn.to_q.bias', 'transformer_blocks.0.attn.to_q.in_scale', 'transformer_blocks.0.attn.to_q.weight', 'transformer_blocks.0.attn.to_q.weight_scale', 'transformer_blocks.0.attn.to_v.bias', 'transformer_blocks.0.attn.to_v.in_scale', 'transformer_blocks.0.attn.to_v.weight', 'transformer_blocks.0.attn.to_v.weight_scale', 'transformer_blocks.0.ff.net.0.proj.bias', 'transformer_blocks.0.ff.net.0.proj.in_scale', 'transformer_blocks.0.ff.net.0.proj.weight', 'transformer_blocks.0.ff.net.0.proj.weight_scale', 'transformer_blocks.0.ff.net.2.bias', 'transformer_blocks.0.ff.net.2.in_scale', 'transformer_blocks.0.ff.net.2.weight', 'transformer_blocks.0.ff.net.2.weight_scale', 'transformer_blocks.0.ff_context.net.0.proj.bias', 'transformer_blocks.0.ff_context.net.0.proj.in_scale', 'transformer_blocks.0.ff_context.net.0.proj.weight', 'transformer_blocks.0.ff_context.net.0.proj.weight_scale', 'transformer_blocks.0.ff_context.net.2.bias', 'transformer_blocks.0.ff_context.net.2.in_scale', 'transformer_blocks.0.ff_context.net.2.weight', 'transformer_blocks.0.ff_context.net.2.weight_scale', 'transformer_blocks.0.norm1.linear.bias', 'transformer_blocks.0.norm1.linear.in_scale', 'transformer_blocks.0.norm1.linear.weight', 'transformer_blocks.0.norm1.linear.weight_scale', 'transformer_blocks.0.norm1_context.linear.bias', 'transformer_blocks.0.norm1_context.linear.in_scale', 'transformer_blocks.0.norm1_context.linear.weight', 'transformer_blocks.0.norm1_context.linear.weight_scale', 'transformer_blocks.1.attn.add_k_proj.bias', 'transformer_blocks.1.attn.add_k_proj.in_scale', 'transformer_blocks.1.attn.add_k_proj.weight', 'transformer_blocks.1.attn.add_k_proj.weight_scale', 'transformer_blocks.1.attn.add_q_proj.bias', 'transformer_blocks.1.attn.add_q_proj.in_scale', 'transformer_blocks.1.attn.add_q_proj.weight', 
'transformer_blocks.1.attn.add_q_proj.weight_scale', 'transformer_blocks.1.attn.add_v_proj.bias', 'transformer_blocks.1.attn.add_v_proj.in_scale', 'transformer_blocks.1.attn.add_v_proj.weight', 'transformer_blocks.1.attn.add_v_proj.weight_scale', 'transformer_blocks.1.attn.norm_added_k.weight', 'transformer_blocks.1.attn.norm_added_q.weight', 'transformer_blocks.1.attn.norm_k.weight', 'transformer_blocks.1.attn.norm_q.weight', 'transformer_blocks.1.attn.to_add_out.bias', 'transformer_blocks.1.attn.to_add_out.in_scale', 'transformer_blocks.1.attn.to_add_out.weight', 'transformer_blocks.1.attn.to_add_out.weight_scale', 'transformer_blocks.1.attn.to_k.bias', 'transformer_blocks.1.attn.to_k.in_scale', 'transformer_blocks.1.attn.to_k.weight', 'transformer_blocks.1.attn.to_k.weight_scale', 'transformer_blocks.1.attn.to_out.0.bias', 'transformer_blocks.1.attn.to_out.0.in_scale', 'transformer_blocks.1.attn.to_out.0.weight', 'transformer_blocks.1.attn.to_out.0.weight_scale', 'transformer_blocks.1.attn.to_q.bias', 'transformer_blocks.1.attn.to_q.in_scale', 'transformer_blocks.1.attn.to_q.weight', 'transformer_blocks.1.attn.to_q.weight_scale', 'transformer_blocks.1.attn.to_v.bias', 'transformer_blocks.1.attn.to_v.in_scale', 'transformer_blocks.1.attn.to_v.weight', 'transformer_blocks.1.attn.to_v.weight_scale', 'transformer_blocks.1.ff.net.0.proj.bias', 'transformer_blocks.1.ff.net.0.proj.in_scale', 'transformer_blocks.1.ff.net.0.proj.weight', 'transformer_blocks.1.ff.net.0.proj.weight_scale', 'transformer_blocks.1.ff.net.2.bias', 'transformer_blocks.1.ff.net.2.in_scale', 'transformer_blocks.1.ff.net.2.weight', 'transformer_blocks.1.ff.net.2.weight_scale', 'transformer_blocks.1.ff_context.net.0.proj.bias', 'transformer_blocks.1.ff_context.net.0.proj.in_scale', 'transformer_blocks.1.ff_context.net.0.proj.weight', 'transformer_blocks.1.ff_context.net.0.proj.weight_scale', 'transformer_blocks.1.ff_context.net.2.bias', 'transformer_blocks.1.ff_context.net.2.in_scale', 
'transformer_blocks.1.ff_context.net.2.weight', 'transformer_blocks.1.ff_context.net.2.weight_scale', 'transformer_blocks.1.norm1.linear.bias', 'transformer_blocks.1.norm1.linear.in_scale', 'transformer_blocks.1.norm1.linear.weight', 'transformer_blocks.1.norm1.linear.weight_scale', 'transformer_blocks.1.norm1_context.linear.bias', 'transformer_blocks.1.norm1_context.linear.in_scale', 'transformer_blocks.1.norm1_context.linear.weight', 'transformer_blocks.1.norm1_context.linear.weight_scale', 'transformer_blocks.10.attn.add_k_proj.bias', 'transformer_blocks.10.attn.add_k_proj.in_scale', 'transformer_blocks.10.attn.add_k_proj.weight', 'transformer_blocks.10.attn.add_k_proj.weight_scale', 'transformer_blocks.10.attn.add_q_proj.bias', 'transformer_blocks.10.attn.add_q_proj.in_scale', 'transformer_blocks.10.attn.add_q_proj.weight', 'transformer_blocks.10.attn.add_q_proj.weight_scale', 'transformer_blocks.10.attn.add_v_proj.bias', 'transformer_blocks.10.attn.add_v_proj.in_scale', 'transformer_blocks.10.attn.add_v_proj.weight', 'transformer_blocks.10.attn.add_v_proj.weight_scale', 'transformer_blocks.10.attn.norm_added_k.weight', 'transformer_blocks.10.attn.norm_added_q.weight', 'transformer_blocks.10.attn.norm_k.weight', 'transformer_blocks.10.attn.norm_q.weight', 'transformer_blocks.10.attn.to_add_out.bias', 'transformer_blocks.10.attn.to_add_out.in_scale', 'transformer_blocks.10.attn.to_add_out.weight', 'transformer_blocks.10.attn.to_add_out.weight_scale', 'transformer_blocks.10.attn.to_k.bias', 'transformer_blocks.10.attn.to_k.in_scale', 'transformer_blocks.10.attn.to_k.weight', 'transformer_blocks.10.attn.to_k.weight_scale', 'transformer_blocks.10.attn.to_out.0.bias', 'transformer_blocks.10.attn.to_out.0.in_scale', 'transformer_blocks.10.attn.to_out.0.weight', 'transformer_blocks.10.attn.to_out.0.weight_scale', 'transformer_blocks.10.attn.to_q.bias', 'transformer_blocks.10.attn.to_q.in_scale', 'transformer_blocks.10.attn.to_q.weight', 
'transformer_blocks.10.attn.to_q.weight_scale', 'transformer_blocks.10.attn.to_v.bias', 'transformer_blocks.10.attn.to_v.in_scale', 'transformer_blocks.10.attn.to_v.weight', 'transformer_blocks.10.attn.to_v.weight_scale', 'transformer_blocks.10.ff.net.0.proj.bias', 'transformer_blocks.10.ff.net.0.proj.in_scale', 'transformer_blocks.10.ff.net.0.proj.weight', 'transformer_blocks.10.ff.net.0.proj.weight_scale', 'transformer_blocks.10.ff.net.2.bias', 'transformer_blocks.10.ff.net.2.in_scale', 'transformer_blocks.10.ff.net.2.weight', 'transformer_blocks.10.ff.net.2.weight_scale', 'transformer_blocks.10.ff_context.net.0.proj.bias', 'transformer_blocks.10.ff_context.net.0.proj.in_scale', 'transformer_blocks.10.ff_context.net.0.proj.weight', 'transformer_blocks.10.ff_context.net.0.proj.weight_scale', 'transformer_blocks.10.ff_context.net.2.bias', 'transformer_blocks.10.ff_context.net.2.in_scale', 'transformer_blocks.10.ff_context.net.2.weight', 'transformer_blocks.10.ff_context.net.2.weight_scale', 'transformer_blocks.10.norm1.linear.bias', 'transformer_blocks.10.norm1.linear.in_scale', 'transformer_blocks.10.norm1.linear.weight', 'transformer_blocks.10.norm1.linear.weight_scale', 'transformer_blocks.10.norm1_context.linear.bias', 'transformer_blocks.10.norm1_context.linear.in_scale', 'transformer_blocks.10.norm1_context.linear.weight', 'transformer_blocks.10.norm1_context.linear.weight_scale', 'transformer_blocks.11.attn.add_k_proj.bias', 'transformer_blocks.11.attn.add_k_proj.in_scale', 'transformer_blocks.11.attn.add_k_proj.weight', 'transformer_blocks.11.attn.add_k_proj.weight_scale', 'transformer_blocks.11.attn.add_q_proj.bias', 'transformer_blocks.11.attn.add_q_proj.in_scale', 'transformer_blocks.11.attn.add_q_proj.weight', 'transformer_blocks.11.attn.add_q_proj.weight_scale', 'transformer_blocks.11.attn.add_v_proj.bias', 'transformer_blocks.11.attn.add_v_proj.in_scale', 'transformer_blocks.11.attn.add_v_proj.weight', 
'transformer_blocks.11.attn.add_v_proj.weight_scale', 'transformer_blocks.11.attn.norm_added_k.weight', 'transformer_blocks.11.attn.norm_added_q.weight', 'transformer_blocks.11.attn.norm_k.weight', 'transformer_blocks.11.attn.norm_q.weight', 'transformer_blocks.11.attn.to_add_out.bias', 'transformer_blocks.11.attn.to_add_out.in_scale', 'transformer_blocks.11.attn.to_add_out.weight', 'transformer_blocks.11.attn.to_add_out.weight_scale', 'transformer_blocks.11.attn.to_k.bias', 'transformer_blocks.11.attn.to_k.in_scale', 'transformer_blocks.11.attn.to_k.weight', 'transformer_blocks.11.attn.to_k.weight_scale', 'transformer_blocks.11.attn.to_out.0.bias', 'transformer_blocks.11.attn.to_out.0.in_scale', 'transformer_blocks.11.attn.to_out.0.weight', 'transformer_blocks.11.attn.to_out.0.weight_scale', 'transformer_blocks.11.attn.to_q.bias', 'transformer_blocks.11.attn.to_q.in_scale', 'transformer_blocks.11.attn.to_q.weight', 'transformer_blocks.11.attn.to_q.weight_scale', 'transformer_blocks.11.attn.to_v.bias', 'transformer_blocks.11.attn.to_v.in_scale', 'transformer_blocks.11.attn.to_v.weight', 'transformer_blocks.11.attn.to_v.weight_scale', 'transformer_blocks.11.ff.net.0.proj.bias', 'transformer_blocks.11.ff.net.0.proj.in_scale', 'transformer_blocks.11.ff.net.0.proj.weight', 'transformer_blocks.11.ff.net.0.proj.weight_scale', 'transformer_blocks.11.ff.net.2.bias', 'transformer_blocks.11.ff.net.2.in_scale', 'transformer_blocks.11.ff.net.2.weight', 'transformer_blocks.11.ff.net.2.weight_scale', 'transformer_blocks.11.ff_context.net.0.proj.bias', 'transformer_blocks.11.ff_context.net.0.proj.in_scale', 'transformer_blocks.11.ff_context.net.0.proj.weight', 'transformer_blocks.11.ff_context.net.0.proj.weight_scale', 'transformer_blocks.11.ff_context.net.2.bias', 'transformer_blocks.11.ff_context.net.2.in_scale', 'transformer_blocks.11.ff_context.net.2.weight', 'transformer_blocks.11.ff_context.net.2.weight_scale', 'transformer_blocks.11.norm1.linear.bias', 
'transformer_blocks.11.norm1.linear.in_scale', 'transformer_blocks.11.norm1.linear.weight', 'transformer_blocks.11.norm1.linear.weight_scale', 'transformer_blocks.11.norm1_context.linear.bias', 'transformer_blocks.11.norm1_context.linear.in_scale', 'transformer_blocks.11.norm1_context.linear.weight', 'transformer_blocks.11.norm1_context.linear.weight_scale', 'transformer_blocks.12.attn.add_k_proj.bias', 'transformer_blocks.12.attn.add_k_proj.in_scale', 'transformer_blocks.12.attn.add_k_proj.weight', 'transformer_blocks.12.attn.add_k_proj.weight_scale', 'transformer_blocks.12.attn.add_q_proj.bias', 'transformer_blocks.12.attn.add_q_proj.in_scale', 'transformer_blocks.12.attn.add_q_proj.weight', 'transformer_blocks.12.attn.add_q_proj.weight_scale', 'transformer_blocks.12.attn.add_v_proj.bias', 'transformer_blocks.12.attn.add_v_proj.in_scale', 'transformer_blocks.12.attn.add_v_proj.weight', 'transformer_blocks.12.attn.add_v_proj.weight_scale', 'transformer_blocks.12.attn.norm_added_k.weight', 'transformer_blocks.12.attn.norm_added_q.weight', 'transformer_blocks.12.attn.norm_k.weight', 'transformer_blocks.12.attn.norm_q.weight', 'transformer_blocks.12.attn.to_add_out.bias', 'transformer_blocks.12.attn.to_add_out.in_scale', 'transformer_blocks.12.attn.to_add_out.weight', 'transformer_blocks.12.attn.to_add_out.weight_scale', 'transformer_blocks.12.attn.to_k.bias', 'transformer_blocks.12.attn.to_k.in_scale', 'transformer_blocks.12.attn.to_k.weight', 'transformer_blocks.12.attn.to_k.weight_scale', 'transformer_blocks.12.attn.to_out.0.bias', 'transformer_blocks.12.attn.to_out.0.in_scale', 'transformer_blocks.12.attn.to_out.0.weight', 'transformer_blocks.12.attn.to_out.0.weight_scale', 'transformer_blocks.12.attn.to_q.bias', 'transformer_blocks.12.attn.to_q.in_scale', 'transformer_blocks.12.attn.to_q.weight', 'transformer_blocks.12.attn.to_q.weight_scale', 'transformer_blocks.12.attn.to_v.bias', 'transformer_blocks.12.attn.to_v.in_scale', 
'transformer_blocks.12.attn.to_v.weight', 'transformer_blocks.12.attn.to_v.weight_scale', 'transformer_blocks.12.ff.net.0.proj.bias', 'transformer_blocks.12.ff.net.0.proj.in_scale', 'transformer_blocks.12.ff.net.0.proj.weight', 'transformer_blocks.12.ff.net.0.proj.weight_scale', 'transformer_blocks.12.ff.net.2.bias', 'transformer_blocks.12.ff.net.2.in_scale', 'transformer_blocks.12.ff.net.2.weight', 'transformer_blocks.12.ff.net.2.weight_scale', 'transformer_blocks.12.ff_context.net.0.proj.bias', 'transformer_blocks.12.ff_context.net.0.proj.in_scale', 'transformer_blocks.12.ff_context.net.0.proj.weight', 'transformer_blocks.12.ff_context.net.0.proj.weight_scale', 'transformer_blocks.12.ff_context.net.2.bias', 'transformer_blocks.12.ff_context.net.2.in_scale', 'transformer_blocks.12.ff_context.net.2.weight', 'transformer_blocks.12.ff_context.net.2.weight_scale', 'transformer_blocks.12.norm1.linear.bias', 'transformer_blocks.12.norm1.linear.in_scale', 'transformer_blocks.12.norm1.linear.weight', 'transformer_blocks.12.norm1.linear.weight_scale', 'transformer_blocks.12.norm1_context.linear.bias', 'transformer_blocks.12.norm1_context.linear.in_scale', 'transformer_blocks.12.norm1_context.linear.weight', 'transformer_blocks.12.norm1_context.linear.weight_scale', 'transformer_blocks.13.attn.add_k_proj.bias', 'transformer_blocks.13.attn.add_k_proj.in_scale', 'transformer_blocks.13.attn.add_k_proj.weight', 'transformer_blocks.13.attn.add_k_proj.weight_scale', 'transformer_blocks.13.attn.add_q_proj.bias', 'transformer_blocks.13.attn.add_q_proj.in_scale', 'transformer_blocks.13.attn.add_q_proj.weight', 'transformer_blocks.13.attn.add_q_proj.weight_scale', 'transformer_blocks.13.attn.add_v_proj.bias', 'transformer_blocks.13.attn.add_v_proj.in_scale', 'transformer_blocks.13.attn.add_v_proj.weight', 'transformer_blocks.13.attn.add_v_proj.weight_scale', 'transformer_blocks.13.attn.norm_added_k.weight', 'transformer_blocks.13.attn.norm_added_q.weight', 
'transformer_blocks.13.attn.norm_k.weight', 'transformer_blocks.13.attn.norm_q.weight', 'transformer_blocks.13.attn.to_add_out.bias', 'transformer_blocks.13.attn.to_add_out.in_scale', 'transformer_blocks.13.attn.to_add_out.weight', 'transformer_blocks.13.attn.to_add_out.weight_scale', 'transformer_blocks.13.attn.to_k.bias', 'transformer_blocks.13.attn.to_k.in_scale', 'transformer_blocks.13.attn.to_k.weight', 'transformer_blocks.13.attn.to_k.weight_scale', 'transformer_blocks.13.attn.to_out.0.bias', 'transformer_blocks.13.attn.to_out.0.in_scale', 'transformer_blocks.13.attn.to_out.0.weight', 'transformer_blocks.13.attn.to_out.0.weight_scale', 'transformer_blocks.13.attn.to_q.bias', 'transformer_blocks.13.attn.to_q.in_scale', 'transformer_blocks.13.attn.to_q.weight', 'transformer_blocks.13.attn.to_q.weight_scale', 'transformer_blocks.13.attn.to_v.bias', 'transformer_blocks.13.attn.to_v.in_scale', 'transformer_blocks.13.attn.to_v.weight', 'transformer_blocks.13.attn.to_v.weight_scale', 'transformer_blocks.13.ff.net.0.proj.bias', 'transformer_blocks.13.ff.net.0.proj.in_scale', 'transformer_blocks.13.ff.net.0.proj.weight', 'transformer_blocks.13.ff.net.0.proj.weight_scale', 'transformer_blocks.13.ff.net.2.bias', 'transformer_blocks.13.ff.net.2.in_scale', 'transformer_blocks.13.ff.net.2.weight', 'transformer_blocks.13.ff.net.2.weight_scale', 'transformer_blocks.13.ff_context.net.0.proj.bias', 'transformer_blocks.13.ff_context.net.0.proj.in_scale', 'transformer_blocks.13.ff_context.net.0.proj.weight', 'transformer_blocks.13.ff_context.net.0.proj.weight_scale', 'transformer_blocks.13.ff_context.net.2.bias', 'transformer_blocks.13.ff_context.net.2.in_scale', 'transformer_blocks.13.ff_context.net.2.weight', 'transformer_blocks.13.ff_context.net.2.weight_scale', 'transformer_blocks.13.norm1.linear.bias', 'transformer_blocks.13.norm1.linear.in_scale', 'transformer_blocks.13.norm1.linear.weight', 'transformer_blocks.13.norm1.linear.weight_scale', 
'transformer_blocks.13.norm1_context.linear.bias', 'transformer_blocks.13.norm1_context.linear.in_scale', 'transformer_blocks.13.norm1_context.linear.weight', 'transformer_blocks.13.norm1_context.linear.weight_scale', 'transformer_blocks.14.attn.add_k_proj.bias', 'transformer_blocks.14.attn.add_k_proj.in_scale', 'transformer_blocks.14.attn.add_k_proj.weight', 'transformer_blocks.14.attn.add_k_proj.weight_scale', 'transformer_blocks.14.attn.add_q_proj.bias', 'transformer_blocks.14.attn.add_q_proj.in_scale', 'transformer_blocks.14.attn.add_q_proj.weight', 'transformer_blocks.14.attn.add_q_proj.weight_scale', 'transformer_blocks.14.attn.add_v_proj.bias', 'transformer_blocks.14.attn.add_v_proj.in_scale', 'transformer_blocks.14.attn.add_v_proj.weight', 'transformer_blocks.14.attn.add_v_proj.weight_scale', 'transformer_blocks.14.attn.norm_added_k.weight', 'transformer_blocks.14.attn.norm_added_q.weight', 'transformer_blocks.14.attn.norm_k.weight', 'transformer_blocks.14.attn.norm_q.weight', 'transformer_blocks.14.attn.to_add_out.bias', 'transformer_blocks.14.attn.to_add_out.in_scale', 'transformer_blocks.14.attn.to_add_out.weight', 'transformer_blocks.14.attn.to_add_out.weight_scale', 'transformer_blocks.14.attn.to_k.bias', 'transformer_blocks.14.attn.to_k.in_scale', 'transformer_blocks.14.attn.to_k.weight', 'transformer_blocks.14.attn.to_k.weight_scale', 'transformer_blocks.14.attn.to_out.0.bias', 'transformer_blocks.14.attn.to_out.0.in_scale', 'transformer_blocks.14.attn.to_out.0.weight', 'transformer_blocks.14.attn.to_out.0.weight_scale', 'transformer_blocks.14.attn.to_q.bias', 'transformer_blocks.14.attn.to_q.in_scale', 'transformer_blocks.14.attn.to_q.weight', 'transformer_blocks.14.attn.to_q.weight_scale', 'transformer_blocks.14.attn.to_v.bias', 'transformer_blocks.14.attn.to_v.in_scale', 'transformer_blocks.14.attn.to_v.weight', 'transformer_blocks.14.attn.to_v.weight_scale', 'transformer_blocks.14.ff.net.0.proj.bias', 
'transformer_blocks.14.ff.net.0.proj.in_scale', 'transformer_blocks.14.ff.net.0.proj.weight', 'transformer_blocks.14.ff.net.0.proj.weight_scale', 'transformer_blocks.14.ff.net.2.bias', 'transformer_blocks.14.ff.net.2.in_scale', 'transformer_blocks.14.ff.net.2.weight', 'transformer_blocks.14.ff.net.2.weight_scale', 'transformer_blocks.14.ff_context.net.0.proj.bias', 'transformer_blocks.14.ff_context.net.0.proj.in_scale', 'transformer_blocks.14.ff_context.net.0.proj.weight', 'transformer_blocks.14.ff_context.net.0.proj.weight_scale', 'transformer_blocks.14.ff_context.net.2.bias', 'transformer_blocks.14.ff_context.net.2.in_scale', 'transformer_blocks.14.ff_context.net.2.weight', 'transformer_blocks.14.ff_context.net.2.weight_scale', 'transformer_blocks.14.norm1.linear.bias', 'transformer_blocks.14.norm1.linear.in_scale', 'transformer_blocks.14.norm1.linear.weight', 'transformer_blocks.14.norm1.linear.weight_scale', 'transformer_blocks.14.norm1_context.linear.bias', 'transformer_blocks.14.norm1_context.linear.in_scale', 'transformer_blocks.14.norm1_context.linear.weight', 'transformer_blocks.14.norm1_context.linear.weight_scale', 'transformer_blocks.15.attn.add_k_proj.bias', 'transformer_blocks.15.attn.add_k_proj.in_scale', 'transformer_blocks.15.attn.add_k_proj.weight', 'transformer_blocks.15.attn.add_k_proj.weight_scale', 'transformer_blocks.15.attn.add_q_proj.bias', 'transformer_blocks.15.attn.add_q_proj.in_scale', 'transformer_blocks.15.attn.add_q_proj.weight', 'transformer_blocks.15.attn.add_q_proj.weight_scale', 'transformer_blocks.15.attn.add_v_proj.bias', 'transformer_blocks.15.attn.add_v_proj.in_scale', 'transformer_blocks.15.attn.add_v_proj.weight', 'transformer_blocks.15.attn.add_v_proj.weight_scale', 'transformer_blocks.15.attn.norm_added_k.weight', 'transformer_blocks.15.attn.norm_added_q.weight', 'transformer_blocks.15.attn.norm_k.weight', 'transformer_blocks.15.attn.norm_q.weight', 'transformer_blocks.15.attn.to_add_out.bias', 
'transformer_blocks.15.attn.to_add_out.in_scale', 'transformer_blocks.15.attn.to_add_out.weight', 'transformer_blocks.15.attn.to_add_out.weight_scale', 'transformer_blocks.15.attn.to_k.bias', 'transformer_blocks.15.attn.to_k.in_scale', 'transformer_blocks.15.attn.to_k.weight', 'transformer_blocks.15.attn.to_k.weight_scale', 'transformer_blocks.15.attn.to_out.0.bias', 'transformer_blocks.15.attn.to_out.0.in_scale', 'transformer_blocks.15.attn.to_out.0.weight', 'transformer_blocks.15.attn.to_out.0.weight_scale', 'transformer_blocks.15.attn.to_q.bias', 'transformer_blocks.15.attn.to_q.in_scale', 'transformer_blocks.15.attn.to_q.weight', 'transformer_blocks.15.attn.to_q.weight_scale', 'transformer_blocks.15.attn.to_v.bias', 'transformer_blocks.15.attn.to_v.in_scale', 'transformer_blocks.15.attn.to_v.weight', 'transformer_blocks.15.attn.to_v.weight_scale', 'transformer_blocks.15.ff.net.0.proj.bias', 'transformer_blocks.15.ff.net.0.proj.in_scale', 'transformer_blocks.15.ff.net.0.proj.weight', 'transformer_blocks.15.ff.net.0.proj.weight_scale', 'transformer_blocks.15.ff.net.2.bias', 'transformer_blocks.15.ff.net.2.in_scale', 'transformer_blocks.15.ff.net.2.weight', 'transformer_blocks.15.ff.net.2.weight_scale', 'transformer_blocks.15.ff_context.net.0.proj.bias', 'transformer_blocks.15.ff_context.net.0.proj.in_scale', 'transformer_blocks.15.ff_context.net.0.proj.weight', 'transformer_blocks.15.ff_context.net.0.proj.weight_scale', 'transformer_blocks.15.ff_context.net.2.bias', 'transformer_blocks.15.ff_context.net.2.in_scale', 'transformer_blocks.15.ff_context.net.2.weight', 'transformer_blocks.15.ff_context.net.2.weight_scale', 'transformer_blocks.15.norm1.linear.bias', 'transformer_blocks.15.norm1.linear.in_scale', 'transformer_blocks.15.norm1.linear.weight', 'transformer_blocks.15.norm1.linear.weight_scale', 'transformer_blocks.15.norm1_context.linear.bias', 'transformer_blocks.15.norm1_context.linear.in_scale', 'transformer_blocks.15.norm1_context.linear.weight', 
'transformer_blocks.15.norm1_context.linear.weight_scale', 'transformer_blocks.16.attn.add_k_proj.bias', 'transformer_blocks.16.attn.add_k_proj.in_scale', 'transformer_blocks.16.attn.add_k_proj.weight', 'transformer_blocks.16.attn.add_k_proj.weight_scale', 'transformer_blocks.16.attn.add_q_proj.bias', 'transformer_blocks.16.attn.add_q_proj.in_scale', 'transformer_blocks.16.attn.add_q_proj.weight', 'transformer_blocks.16.attn.add_q_proj.weight_scale', 'transformer_blocks.16.attn.add_v_proj.bias', 'transformer_blocks.16.attn.add_v_proj.in_scale', 'transformer_blocks.16.attn.add_v_proj.weight', 'transformer_blocks.16.attn.add_v_proj.weight_scale', 'transformer_blocks.16.attn.norm_added_k.weight', 'transformer_blocks.16.attn.norm_added_q.weight', 'transformer_blocks.16.attn.norm_k.weight', 'transformer_blocks.16.attn.norm_q.weight', 'transformer_blocks.16.attn.to_add_out.bias', 'transformer_blocks.16.attn.to_add_out.in_scale', 'transformer_blocks.16.attn.to_add_out.weight', 'transformer_blocks.16.attn.to_add_out.weight_scale', 'transformer_blocks.16.attn.to_k.bias', 'transformer_blocks.16.attn.to_k.in_scale', 'transformer_blocks.16.attn.to_k.weight', 'transformer_blocks.16.attn.to_k.weight_scale', 'transformer_blocks.16.attn.to_out.0.bias', 'transformer_blocks.16.attn.to_out.0.in_scale', 'transformer_blocks.16.attn.to_out.0.weight', 'transformer_blocks.16.attn.to_out.0.weight_scale', 'transformer_blocks.16.attn.to_q.bias', 'transformer_blocks.16.attn.to_q.in_scale', 'transformer_blocks.16.attn.to_q.weight', 'transformer_blocks.16.attn.to_q.weight_scale', 'transformer_blocks.16.attn.to_v.bias', 'transformer_blocks.16.attn.to_v.in_scale', 'transformer_blocks.16.attn.to_v.weight', 'transformer_blocks.16.attn.to_v.weight_scale', 'transformer_blocks.16.ff.net.0.proj.bias', 'transformer_blocks.16.ff.net.0.proj.in_scale', 'transformer_blocks.16.ff.net.0.proj.weight', 'transformer_blocks.16.ff.net.0.proj.weight_scale', 'transformer_blocks.16.ff.net.2.bias', 
'transformer_blocks.16.ff.net.2.in_scale', 'transformer_blocks.16.ff.net.2.weight', 'transformer_blocks.16.ff.net.2.weight_scale', 'transformer_blocks.16.ff_context.net.0.proj.bias', 'transformer_blocks.16.ff_context.net.0.proj.in_scale', 'transformer_blocks.16.ff_context.net.0.proj.weight', 'transformer_blocks.16.ff_context.net.0.proj.weight_scale', 'transformer_blocks.16.ff_context.net.2.bias', 'transformer_blocks.16.ff_context.net.2.in_scale', 'transformer_blocks.16.ff_context.net.2.weight', 'transformer_blocks.16.ff_context.net.2.weight_scale', 'transformer_blocks.16.norm1.linear.bias', 'transformer_blocks.16.norm1.linear.in_scale', 'transformer_blocks.16.norm1.linear.weight', 'transformer_blocks.16.norm1.linear.weight_scale', 'transformer_blocks.16.norm1_context.linear.bias', 'transformer_blocks.16.norm1_context.linear.in_scale', 'transformer_blocks.16.norm1_context.linear.weight', 'transformer_blocks.16.norm1_context.linear.weight_scale', 'transformer_blocks.17.attn.add_k_proj.bias', 'transformer_blocks.17.attn.add_k_proj.in_scale', 'transformer_blocks.17.attn.add_k_proj.weight', 'transformer_blocks.17.attn.add_k_proj.weight_scale', 'transformer_blocks.17.attn.add_q_proj.bias', 'transformer_blocks.17.attn.add_q_proj.in_scale', 'transformer_blocks.17.attn.add_q_proj.weight', 'transformer_blocks.17.attn.add_q_proj.weight_scale', 'transformer_blocks.17.attn.add_v_proj.bias', 'transformer_blocks.17.attn.add_v_proj.in_scale', 'transformer_blocks.17.attn.add_v_proj.weight', 'transformer_blocks.17.attn.add_v_proj.weight_scale', 'transformer_blocks.17.attn.norm_added_k.weight', 'transformer_blocks.17.attn.norm_added_q.weight', 'transformer_blocks.17.attn.norm_k.weight', 'transformer_blocks.17.attn.norm_q.weight', 'transformer_blocks.17.attn.to_add_out.bias', 'transformer_blocks.17.attn.to_add_out.in_scale', 'transformer_blocks.17.attn.to_add_out.weight', 'transformer_blocks.17.attn.to_add_out.weight_scale', 'transformer_blocks.17.attn.to_k.bias', 
'transformer_blocks.17.attn.to_k.in_scale', 'transformer_blocks.17.attn.to_k.weight', 'transformer_blocks.17.attn.to_k.weight_scale', 'transformer_blocks.17.attn.to_out.0.bias', 'transformer_blocks.17.attn.to_out.0.in_scale', 'transformer_blocks.17.attn.to_out.0.weight', 'transformer_blocks.17.attn.to_out.0.weight_scale', 'transformer_blocks.17.attn.to_q.bias', 'transformer_blocks.17.attn.to_q.in_scale', 'transformer_blocks.17.attn.to_q.weight', 'transformer_blocks.17.attn.to_q.weight_scale', 'transformer_blocks.17.attn.to_v.bias', 'transformer_blocks.17.attn.to_v.in_scale', 'transformer_blocks.17.attn.to_v.weight', 'transformer_blocks.17.attn.to_v.weight_scale', 'transformer_blocks.17.ff.net.0.proj.bias', 'transformer_blocks.17.ff.net.0.proj.in_scale', 'transformer_blocks.17.ff.net.0.proj.weight', 'transformer_blocks.17.ff.net.0.proj.weight_scale', 'transformer_blocks.17.ff.net.2.bias', 'transformer_blocks.17.ff.net.2.in_scale', 'transformer_blocks.17.ff.net.2.weight', 'transformer_blocks.17.ff.net.2.weight_scale', 'transformer_blocks.17.ff_context.net.0.proj.bias', 'transformer_blocks.17.ff_context.net.0.proj.in_scale', 'transformer_blocks.17.ff_context.net.0.proj.weight', 'transformer_blocks.17.ff_context.net.0.proj.weight_scale', 'transformer_blocks.17.ff_context.net.2.bias', 'transformer_blocks.17.ff_context.net.2.in_scale', 'transformer_blocks.17.ff_context.net.2.weight', 'transformer_blocks.17.ff_context.net.2.weight_scale', 'transformer_blocks.17.norm1.linear.bias', 'transformer_blocks.17.norm1.linear.in_scale', 'transformer_blocks.17.norm1.linear.weight', 'transformer_blocks.17.norm1.linear.weight_scale', 'transformer_blocks.17.norm1_context.linear.bias', 'transformer_blocks.17.norm1_context.linear.in_scale', 'transformer_blocks.17.norm1_context.linear.weight', 'transformer_blocks.17.norm1_context.linear.weight_scale', 'transformer_blocks.18.attn.add_k_proj.bias', 'transformer_blocks.18.attn.add_k_proj.in_scale', 
'transformer_blocks.18.attn.add_k_proj.weight', 'transformer_blocks.18.attn.add_k_proj.weight_scale', 'transformer_blocks.18.attn.add_q_proj.bias', 'transformer_blocks.18.attn.add_q_proj.in_scale', 'transformer_blocks.18.attn.add_q_proj.weight', 'transformer_blocks.18.attn.add_q_proj.weight_scale', 'transformer_blocks.18.attn.add_v_proj.bias', 'transformer_blocks.18.attn.add_v_proj.in_scale', 'transformer_blocks.18.attn.add_v_proj.weight', 'transformer_blocks.18.attn.add_v_proj.weight_scale', 'transformer_blocks.18.attn.norm_added_k.weight', 'transformer_blocks.18.attn.norm_added_q.weight', 'transformer_blocks.18.attn.norm_k.weight', 'transformer_blocks.18.attn.norm_q.weight', 'transformer_blocks.18.attn.to_add_out.bias', 'transformer_blocks.18.attn.to_add_out.in_scale', 'transformer_blocks.18.attn.to_add_out.weight', 'transformer_blocks.18.attn.to_add_out.weight_scale', 'transformer_blocks.18.attn.to_k.bias', 'transformer_blocks.18.attn.to_k.in_scale', 'transformer_blocks.18.attn.to_k.weight', 'transformer_blocks.18.attn.to_k.weight_scale', 'transformer_blocks.18.attn.to_out.0.bias', 'transformer_blocks.18.attn.to_out.0.in_scale', 'transformer_blocks.18.attn.to_out.0.weight', 'transformer_blocks.18.attn.to_out.0.weight_scale', 'transformer_blocks.18.attn.to_q.bias', 'transformer_blocks.18.attn.to_q.in_scale', 'transformer_blocks.18.attn.to_q.weight', 'transformer_blocks.18.attn.to_q.weight_scale', 'transformer_blocks.18.attn.to_v.bias', 'transformer_blocks.18.attn.to_v.in_scale', 'transformer_blocks.18.attn.to_v.weight', 'transformer_blocks.18.attn.to_v.weight_scale', 'transformer_blocks.18.ff.net.0.proj.bias', 'transformer_blocks.18.ff.net.0.proj.in_scale', 'transformer_blocks.18.ff.net.0.proj.weight', 'transformer_blocks.18.ff.net.0.proj.weight_scale', 'transformer_blocks.18.ff.net.2.bias', 'transformer_blocks.18.ff.net.2.in_scale', 'transformer_blocks.18.ff.net.2.weight', 'transformer_blocks.18.ff.net.2.weight_scale', 
'transformer_blocks.18.ff_context.net.0.proj.bias', 'transformer_blocks.18.ff_context.net.0.proj.in_scale', 'transformer_blocks.18.ff_context.net.0.proj.weight', 'transformer_blocks.18.ff_context.net.0.proj.weight_scale', 'transformer_blocks.18.ff_context.net.2.bias', 'transformer_blocks.18.ff_context.net.2.in_scale', 'transformer_blocks.18.ff_context.net.2.weight', 'transformer_blocks.18.ff_context.net.2.weight_scale', 'transformer_blocks.18.norm1.linear.bias', 'transformer_blocks.18.norm1.linear.in_scale', 'transformer_blocks.18.norm1.linear.weight', 'transformer_blocks.18.norm1.linear.weight_scale', 'transformer_blocks.18.norm1_context.linear.bias', 'transformer_blocks.18.norm1_context.linear.in_scale', 'transformer_blocks.18.norm1_context.linear.weight', 'transformer_blocks.18.norm1_context.linear.weight_scale', 'transformer_blocks.2.attn.add_k_proj.bias', 'transformer_blocks.2.attn.add_k_proj.in_scale', 'transformer_blocks.2.attn.add_k_proj.weight', 'transformer_blocks.2.attn.add_k_proj.weight_scale', 'transformer_blocks.2.attn.add_q_proj.bias', 'transformer_blocks.2.attn.add_q_proj.in_scale', 'transformer_blocks.2.attn.add_q_proj.weight', 'transformer_blocks.2.attn.add_q_proj.weight_scale', 'transformer_blocks.2.attn.add_v_proj.bias', 'transformer_blocks.2.attn.add_v_proj.in_scale', 'transformer_blocks.2.attn.add_v_proj.weight', 'transformer_blocks.2.attn.add_v_proj.weight_scale', 'transformer_blocks.2.attn.norm_added_k.weight', 'transformer_blocks.2.attn.norm_added_q.weight', 'transformer_blocks.2.attn.norm_k.weight', 'transformer_blocks.2.attn.norm_q.weight', 'transformer_blocks.2.attn.to_add_out.bias', 'transformer_blocks.2.attn.to_add_out.in_scale', 'transformer_blocks.2.attn.to_add_out.weight', 'transformer_blocks.2.attn.to_add_out.weight_scale', 'transformer_blocks.2.attn.to_k.bias', 'transformer_blocks.2.attn.to_k.in_scale', 'transformer_blocks.2.attn.to_k.weight', 'transformer_blocks.2.attn.to_k.weight_scale', 
'transformer_blocks.2.attn.to_out.0.bias', 'transformer_blocks.2.attn.to_out.0.in_scale', 'transformer_blocks.2.attn.to_out.0.weight', 'transformer_blocks.2.attn.to_out.0.weight_scale', 'transformer_blocks.2.attn.to_q.bias', 'transformer_blocks.2.attn.to_q.in_scale', 'transformer_blocks.2.attn.to_q.weight', 'transformer_blocks.2.attn.to_q.weight_scale', 'transformer_blocks.2.attn.to_v.bias', 'transformer_blocks.2.attn.to_v.in_scale', 'transformer_blocks.2.attn.to_v.weight', 'transformer_blocks.2.attn.to_v.weight_scale', 'transformer_blocks.2.ff.net.0.proj.bias', 'transformer_blocks.2.ff.net.0.proj.in_scale', 'transformer_blocks.2.ff.net.0.proj.weight', 'transformer_blocks.2.ff.net.0.proj.weight_scale', 'transformer_blocks.2.ff.net.2.bias', 'transformer_blocks.2.ff.net.2.in_scale', 'transformer_blocks.2.ff.net.2.weight', 'transformer_blocks.2.ff.net.2.weight_scale', 'transformer_blocks.2.ff_context.net.0.proj.bias', 'transformer_blocks.2.ff_context.net.0.proj.in_scale', 'transformer_blocks.2.ff_context.net.0.proj.weight', 'transformer_blocks.2.ff_context.net.0.proj.weight_scale', 'transformer_blocks.2.ff_context.net.2.bias', 'transformer_blocks.2.ff_context.net.2.in_scale', 'transformer_blocks.2.ff_context.net.2.weight', 'transformer_blocks.2.ff_context.net.2.weight_scale', 'transformer_blocks.2.norm1.linear.bias', 'transformer_blocks.2.norm1.linear.in_scale', 'transformer_blocks.2.norm1.linear.weight', 'transformer_blocks.2.norm1.linear.weight_scale', 'transformer_blocks.2.norm1_context.linear.bias', 'transformer_blocks.2.norm1_context.linear.in_scale', 'transformer_blocks.2.norm1_context.linear.weight', 'transformer_blocks.2.norm1_context.linear.weight_scale', 'transformer_blocks.3.attn.add_k_proj.bias', 'transformer_blocks.3.attn.add_k_proj.in_scale', 'transformer_blocks.3.attn.add_k_proj.weight', 'transformer_blocks.3.attn.add_k_proj.weight_scale', 'transformer_blocks.3.attn.add_q_proj.bias', 'transformer_blocks.3.attn.add_q_proj.in_scale', 
'transformer_blocks.3.attn.add_q_proj.weight', 'transformer_blocks.3.attn.add_q_proj.weight_scale', 'transformer_blocks.3.attn.add_v_proj.bias', 'transformer_blocks.3.attn.add_v_proj.in_scale', 'transformer_blocks.3.attn.add_v_proj.weight', 'transformer_blocks.3.attn.add_v_proj.weight_scale', 'transformer_blocks.3.attn.norm_added_k.weight', 'transformer_blocks.3.attn.norm_added_q.weight', 'transformer_blocks.3.attn.norm_k.weight', 'transformer_blocks.3.attn.norm_q.weight', 'transformer_blocks.3.attn.to_add_out.bias', 'transformer_blocks.3.attn.to_add_out.in_scale', 'transformer_blocks.3.attn.to_add_out.weight', 'transformer_blocks.3.attn.to_add_out.weight_scale', 'transformer_blocks.3.attn.to_k.bias', 'transformer_blocks.3.attn.to_k.in_scale', 'transformer_blocks.3.attn.to_k.weight', 'transformer_blocks.3.attn.to_k.weight_scale', 'transformer_blocks.3.attn.to_out.0.bias', 'transformer_blocks.3.attn.to_out.0.in_scale', 'transformer_blocks.3.attn.to_out.0.weight', 'transformer_blocks.3.attn.to_out.0.weight_scale', 'transformer_blocks.3.attn.to_q.bias', 'transformer_blocks.3.attn.to_q.in_scale', 'transformer_blocks.3.attn.to_q.weight', 'transformer_blocks.3.attn.to_q.weight_scale', 'transformer_blocks.3.attn.to_v.bias', 'transformer_blocks.3.attn.to_v.in_scale', 'transformer_blocks.3.attn.to_v.weight', 'transformer_blocks.3.attn.to_v.weight_scale', 'transformer_blocks.3.ff.net.0.proj.bias', 'transformer_blocks.3.ff.net.0.proj.in_scale', 'transformer_blocks.3.ff.net.0.proj.weight', 'transformer_blocks.3.ff.net.0.proj.weight_scale', 'transformer_blocks.3.ff.net.2.bias', 'transformer_blocks.3.ff.net.2.in_scale', 'transformer_blocks.3.ff.net.2.weight', 'transformer_blocks.3.ff.net.2.weight_scale', 'transformer_blocks.3.ff_context.net.0.proj.bias', 'transformer_blocks.3.ff_context.net.0.proj.in_scale', 'transformer_blocks.3.ff_context.net.0.proj.weight', 'transformer_blocks.3.ff_context.net.0.proj.weight_scale', 'transformer_blocks.3.ff_context.net.2.bias', 
'transformer_blocks.3.ff_context.net.2.in_scale', 'transformer_blocks.3.ff_context.net.2.weight', 'transformer_blocks.3.ff_context.net.2.weight_scale', 'transformer_blocks.3.norm1.linear.bias', 'transformer_blocks.3.norm1.linear.in_scale', 'transformer_blocks.3.norm1.linear.weight', 'transformer_blocks.3.norm1.linear.weight_scale', 'transformer_blocks.3.norm1_context.linear.bias', 'transformer_blocks.3.norm1_context.linear.in_scale', 'transformer_blocks.3.norm1_context.linear.weight', 'transformer_blocks.3.norm1_context.linear.weight_scale', 'transformer_blocks.4.attn.add_k_proj.bias', 'transformer_blocks.4.attn.add_k_proj.in_scale', 'transformer_blocks.4.attn.add_k_proj.weight', 'transformer_blocks.4.attn.add_k_proj.weight_scale', 'transformer_blocks.4.attn.add_q_proj.bias', 'transformer_blocks.4.attn.add_q_proj.in_scale', 'transformer_blocks.4.attn.add_q_proj.weight', 'transformer_blocks.4.attn.add_q_proj.weight_scale', 'transformer_blocks.4.attn.add_v_proj.bias', 'transformer_blocks.4.attn.add_v_proj.in_scale', 'transformer_blocks.4.attn.add_v_proj.weight', 'transformer_blocks.4.attn.add_v_proj.weight_scale', 'transformer_blocks.4.attn.norm_added_k.weight', 'transformer_blocks.4.attn.norm_added_q.weight', 'transformer_blocks.4.attn.norm_k.weight', 'transformer_blocks.4.attn.norm_q.weight', 'transformer_blocks.4.attn.to_add_out.bias', 'transformer_blocks.4.attn.to_add_out.in_scale', 'transformer_blocks.4.attn.to_add_out.weight', 'transformer_blocks.4.attn.to_add_out.weight_scale', 'transformer_blocks.4.attn.to_k.bias', 'transformer_blocks.4.attn.to_k.in_scale', 'transformer_blocks.4.attn.to_k.weight', 'transformer_blocks.4.attn.to_k.weight_scale', 'transformer_blocks.4.attn.to_out.0.bias', 'transformer_blocks.4.attn.to_out.0.in_scale', 'transformer_blocks.4.attn.to_out.0.weight', 'transformer_blocks.4.attn.to_out.0.weight_scale', 'transformer_blocks.4.attn.to_q.bias', 'transformer_blocks.4.attn.to_q.in_scale', 'transformer_blocks.4.attn.to_q.weight', 
'transformer_blocks.4.attn.to_q.weight_scale', 'transformer_blocks.4.attn.to_v.bias', 'transformer_blocks.4.attn.to_v.in_scale', 'transformer_blocks.4.attn.to_v.weight', 'transformer_blocks.4.attn.to_v.weight_scale', 'transformer_blocks.4.ff.net.0.proj.bias', 'transformer_blocks.4.ff.net.0.proj.in_scale', 'transformer_blocks.4.ff.net.0.proj.weight', 'transformer_blocks.4.ff.net.0.proj.weight_scale', 'transformer_blocks.4.ff.net.2.bias', 'transformer_blocks.4.ff.net.2.in_scale', 'transformer_blocks.4.ff.net.2.weight', 'transformer_blocks.4.ff.net.2.weight_scale', 'transformer_blocks.4.ff_context.net.0.proj.bias', 'transformer_blocks.4.ff_context.net.0.proj.in_scale', 'transformer_blocks.4.ff_context.net.0.proj.weight', 'transformer_blocks.4.ff_context.net.0.proj.weight_scale', 'transformer_blocks.4.ff_context.net.2.bias', 'transformer_blocks.4.ff_context.net.2.in_scale', 'transformer_blocks.4.ff_context.net.2.weight', 'transformer_blocks.4.ff_context.net.2.weight_scale', 'transformer_blocks.4.norm1.linear.bias', 'transformer_blocks.4.norm1.linear.in_scale', 'transformer_blocks.4.norm1.linear.weight', 'transformer_blocks.4.norm1.linear.weight_scale', 'transformer_blocks.4.norm1_context.linear.bias', 'transformer_blocks.4.norm1_context.linear.in_scale', 'transformer_blocks.4.norm1_context.linear.weight', 'transformer_blocks.4.norm1_context.linear.weight_scale', 'transformer_blocks.5.attn.add_k_proj.bias', 'transformer_blocks.5.attn.add_k_proj.in_scale', 'transformer_blocks.5.attn.add_k_proj.weight', 'transformer_blocks.5.attn.add_k_proj.weight_scale', 'transformer_blocks.5.attn.add_q_proj.bias', 'transformer_blocks.5.attn.add_q_proj.in_scale', 'transformer_blocks.5.attn.add_q_proj.weight', 'transformer_blocks.5.attn.add_q_proj.weight_scale', 'transformer_blocks.5.attn.add_v_proj.bias', 'transformer_blocks.5.attn.add_v_proj.in_scale', 'transformer_blocks.5.attn.add_v_proj.weight', 'transformer_blocks.5.attn.add_v_proj.weight_scale', 
'transformer_blocks.5.attn.norm_added_k.weight', 'transformer_blocks.5.attn.norm_added_q.weight', 'transformer_blocks.5.attn.norm_k.weight', 'transformer_blocks.5.attn.norm_q.weight', 'transformer_blocks.5.attn.to_add_out.bias', 'transformer_blocks.5.attn.to_add_out.in_scale', 'transformer_blocks.5.attn.to_add_out.weight', 'transformer_blocks.5.attn.to_add_out.weight_scale', 'transformer_blocks.5.attn.to_k.bias', 'transformer_blocks.5.attn.to_k.in_scale', 'transformer_blocks.5.attn.to_k.weight', 'transformer_blocks.5.attn.to_k.weight_scale', 'transformer_blocks.5.attn.to_out.0.bias', 'transformer_blocks.5.attn.to_out.0.in_scale', 'transformer_blocks.5.attn.to_out.0.weight', 'transformer_blocks.5.attn.to_out.0.weight_scale', 'transformer_blocks.5.attn.to_q.bias', 'transformer_blocks.5.attn.to_q.in_scale', 'transformer_blocks.5.attn.to_q.weight', 'transformer_blocks.5.attn.to_q.weight_scale', 'transformer_blocks.5.attn.to_v.bias', 'transformer_blocks.5.attn.to_v.in_scale', 'transformer_blocks.5.attn.to_v.weight', 'transformer_blocks.5.attn.to_v.weight_scale', 'transformer_blocks.5.ff.net.0.proj.bias', 'transformer_blocks.5.ff.net.0.proj.in_scale', 'transformer_blocks.5.ff.net.0.proj.weight', 'transformer_blocks.5.ff.net.0.proj.weight_scale', 'transformer_blocks.5.ff.net.2.bias', 'transformer_blocks.5.ff.net.2.in_scale', 'transformer_blocks.5.ff.net.2.weight', 'transformer_blocks.5.ff.net.2.weight_scale', 'transformer_blocks.5.ff_context.net.0.proj.bias', 'transformer_blocks.5.ff_context.net.0.proj.in_scale', 'transformer_blocks.5.ff_context.net.0.proj.weight', 'transformer_blocks.5.ff_context.net.0.proj.weight_scale', 'transformer_blocks.5.ff_context.net.2.bias', 'transformer_blocks.5.ff_context.net.2.in_scale', 'transformer_blocks.5.ff_context.net.2.weight', 'transformer_blocks.5.ff_context.net.2.weight_scale', 'transformer_blocks.5.norm1.linear.bias', 'transformer_blocks.5.norm1.linear.in_scale', 'transformer_blocks.5.norm1.linear.weight', 
'transformer_blocks.5.norm1.linear.weight_scale', 'transformer_blocks.5.norm1_context.linear.bias', 'transformer_blocks.5.norm1_context.linear.in_scale', 'transformer_blocks.5.norm1_context.linear.weight', 'transformer_blocks.5.norm1_context.linear.weight_scale', 'transformer_blocks.6.attn.add_k_proj.bias', 'transformer_blocks.6.attn.add_k_proj.in_scale', 'transformer_blocks.6.attn.add_k_proj.weight', 'transformer_blocks.6.attn.add_k_proj.weight_scale', 'transformer_blocks.6.attn.add_q_proj.bias', 'transformer_blocks.6.attn.add_q_proj.in_scale', 'transformer_blocks.6.attn.add_q_proj.weight', 'transformer_blocks.6.attn.add_q_proj.weight_scale', 'transformer_blocks.6.attn.add_v_proj.bias', 'transformer_blocks.6.attn.add_v_proj.in_scale', 'transformer_blocks.6.attn.add_v_proj.weight', 'transformer_blocks.6.attn.add_v_proj.weight_scale', 'transformer_blocks.6.attn.norm_added_k.weight', 'transformer_blocks.6.attn.norm_added_q.weight', 'transformer_blocks.6.attn.norm_k.weight', 'transformer_blocks.6.attn.norm_q.weight', 'transformer_blocks.6.attn.to_add_out.bias', 'transformer_blocks.6.attn.to_add_out.in_scale', 'transformer_blocks.6.attn.to_add_out.weight', 'transformer_blocks.6.attn.to_add_out.weight_scale', 'transformer_blocks.6.attn.to_k.bias', 'transformer_blocks.6.attn.to_k.in_scale', 'transformer_blocks.6.attn.to_k.weight', 'transformer_blocks.6.attn.to_k.weight_scale', 'transformer_blocks.6.attn.to_out.0.bias', 'transformer_blocks.6.attn.to_out.0.in_scale', 'transformer_blocks.6.attn.to_out.0.weight', 'transformer_blocks.6.attn.to_out.0.weight_scale', 'transformer_blocks.6.attn.to_q.bias', 'transformer_blocks.6.attn.to_q.in_scale', 'transformer_blocks.6.attn.to_q.weight', 'transformer_blocks.6.attn.to_q.weight_scale', 'transformer_blocks.6.attn.to_v.bias', 'transformer_blocks.6.attn.to_v.in_scale', 'transformer_blocks.6.attn.to_v.weight', 'transformer_blocks.6.attn.to_v.weight_scale', 'transformer_blocks.6.ff.net.0.proj.bias', 
'transformer_blocks.6.ff.net.0.proj.in_scale', 'transformer_blocks.6.ff.net.0.proj.weight', 'transformer_blocks.6.ff.net.0.proj.weight_scale', 'transformer_blocks.6.ff.net.2.bias', 'transformer_blocks.6.ff.net.2.in_scale', 'transformer_blocks.6.ff.net.2.weight', 'transformer_blocks.6.ff.net.2.weight_scale', 'transformer_blocks.6.ff_context.net.0.proj.bias', 'transformer_blocks.6.ff_context.net.0.proj.in_scale', 'transformer_blocks.6.ff_context.net.0.proj.weight', 'transformer_blocks.6.ff_context.net.0.proj.weight_scale', 'transformer_blocks.6.ff_context.net.2.bias', 'transformer_blocks.6.ff_context.net.2.in_scale', 'transformer_blocks.6.ff_context.net.2.weight', 'transformer_blocks.6.ff_context.net.2.weight_scale', 'transformer_blocks.6.norm1.linear.bias', 'transformer_blocks.6.norm1.linear.in_scale', 'transformer_blocks.6.norm1.linear.weight', 'transformer_blocks.6.norm1.linear.weight_scale', 'transformer_blocks.6.norm1_context.linear.bias', 'transformer_blocks.6.norm1_context.linear.in_scale', 'transformer_blocks.6.norm1_context.linear.weight', 'transformer_blocks.6.norm1_context.linear.weight_scale', 'transformer_blocks.7.attn.add_k_proj.bias', 'transformer_blocks.7.attn.add_k_proj.in_scale', 'transformer_blocks.7.attn.add_k_proj.weight', 'transformer_blocks.7.attn.add_k_proj.weight_scale', 'transformer_blocks.7.attn.add_q_proj.bias', 'transformer_blocks.7.attn.add_q_proj.in_scale', 'transformer_blocks.7.attn.add_q_proj.weight', 'transformer_blocks.7.attn.add_q_proj.weight_scale', 'transformer_blocks.7.attn.add_v_proj.bias', 'transformer_blocks.7.attn.add_v_proj.in_scale', 'transformer_blocks.7.attn.add_v_proj.weight', 'transformer_blocks.7.attn.add_v_proj.weight_scale', 'transformer_blocks.7.attn.norm_added_k.weight', 'transformer_blocks.7.attn.norm_added_q.weight', 'transformer_blocks.7.attn.norm_k.weight', 'transformer_blocks.7.attn.norm_q.weight', 'transformer_blocks.7.attn.to_add_out.bias', 'transformer_blocks.7.attn.to_add_out.in_scale', 
'transformer_blocks.7.attn.to_add_out.weight', 'transformer_blocks.7.attn.to_add_out.weight_scale', 'transformer_blocks.7.attn.to_k.bias', 'transformer_blocks.7.attn.to_k.in_scale', 'transformer_blocks.7.attn.to_k.weight', 'transformer_blocks.7.attn.to_k.weight_scale', 'transformer_blocks.7.attn.to_out.0.bias', 'transformer_blocks.7.attn.to_out.0.in_scale', 'transformer_blocks.7.attn.to_out.0.weight', 'transformer_blocks.7.attn.to_out.0.weight_scale', 'transformer_blocks.7.attn.to_q.bias', 'transformer_blocks.7.attn.to_q.in_scale', 'transformer_blocks.7.attn.to_q.weight', 'transformer_blocks.7.attn.to_q.weight_scale', 'transformer_blocks.7.attn.to_v.bias', 'transformer_blocks.7.attn.to_v.in_scale', 'transformer_blocks.7.attn.to_v.weight', 'transformer_blocks.7.attn.to_v.weight_scale', 'transformer_blocks.7.ff.net.0.proj.bias', 'transformer_blocks.7.ff.net.0.proj.in_scale', 'transformer_blocks.7.ff.net.0.proj.weight', 'transformer_blocks.7.ff.net.0.proj.weight_scale', 'transformer_blocks.7.ff.net.2.bias', 'transformer_blocks.7.ff.net.2.in_scale', 'transformer_blocks.7.ff.net.2.weight', 'transformer_blocks.7.ff.net.2.weight_scale', 'transformer_blocks.7.ff_context.net.0.proj.bias', 'transformer_blocks.7.ff_context.net.0.proj.in_scale', 'transformer_blocks.7.ff_context.net.0.proj.weight', 'transformer_blocks.7.ff_context.net.0.proj.weight_scale', 'transformer_blocks.7.ff_context.net.2.bias', 'transformer_blocks.7.ff_context.net.2.in_scale', 'transformer_blocks.7.ff_context.net.2.weight', 'transformer_blocks.7.ff_context.net.2.weight_scale', 'transformer_blocks.7.norm1.linear.bias', 'transformer_blocks.7.norm1.linear.in_scale', 'transformer_blocks.7.norm1.linear.weight', 'transformer_blocks.7.norm1.linear.weight_scale', 'transformer_blocks.7.norm1_context.linear.bias', 'transformer_blocks.7.norm1_context.linear.in_scale', 'transformer_blocks.7.norm1_context.linear.weight', 'transformer_blocks.7.norm1_context.linear.weight_scale', 
'transformer_blocks.8.attn.add_k_proj.bias', 'transformer_blocks.8.attn.add_k_proj.in_scale', 'transformer_blocks.8.attn.add_k_proj.weight', 'transformer_blocks.8.attn.add_k_proj.weight_scale', 'transformer_blocks.8.attn.add_q_proj.bias', 'transformer_blocks.8.attn.add_q_proj.in_scale', 'transformer_blocks.8.attn.add_q_proj.weight', 'transformer_blocks.8.attn.add_q_proj.weight_scale', 'transformer_blocks.8.attn.add_v_proj.bias', 'transformer_blocks.8.attn.add_v_proj.in_scale', 'transformer_blocks.8.attn.add_v_proj.weight', 'transformer_blocks.8.attn.add_v_proj.weight_scale', 'transformer_blocks.8.attn.norm_added_k.weight', 'transformer_blocks.8.attn.norm_added_q.weight', 'transformer_blocks.8.attn.norm_k.weight', 'transformer_blocks.8.attn.norm_q.weight', 'transformer_blocks.8.attn.to_add_out.bias', 'transformer_blocks.8.attn.to_add_out.in_scale', 'transformer_blocks.8.attn.to_add_out.weight', 'transformer_blocks.8.attn.to_add_out.weight_scale', 'transformer_blocks.8.attn.to_k.bias', 'transformer_blocks.8.attn.to_k.in_scale', 'transformer_blocks.8.attn.to_k.weight', 'transformer_blocks.8.attn.to_k.weight_scale', 'transformer_blocks.8.attn.to_out.0.bias', 'transformer_blocks.8.attn.to_out.0.in_scale', 'transformer_blocks.8.attn.to_out.0.weight', 'transformer_blocks.8.attn.to_out.0.weight_scale', 'transformer_blocks.8.attn.to_q.bias', 'transformer_blocks.8.attn.to_q.in_scale', 'transformer_blocks.8.attn.to_q.weight', 'transformer_blocks.8.attn.to_q.weight_scale', 'transformer_blocks.8.attn.to_v.bias', 'transformer_blocks.8.attn.to_v.in_scale', 'transformer_blocks.8.attn.to_v.weight', 'transformer_blocks.8.attn.to_v.weight_scale', 'transformer_blocks.8.ff.net.0.proj.bias', 'transformer_blocks.8.ff.net.0.proj.in_scale', 'transformer_blocks.8.ff.net.0.proj.weight', 'transformer_blocks.8.ff.net.0.proj.weight_scale', 'transformer_blocks.8.ff.net.2.bias', 'transformer_blocks.8.ff.net.2.in_scale', 'transformer_blocks.8.ff.net.2.weight', 
'transformer_blocks.8.ff.net.2.weight_scale', 'transformer_blocks.8.ff_context.net.0.proj.bias', 'transformer_blocks.8.ff_context.net.0.proj.in_scale', 'transformer_blocks.8.ff_context.net.0.proj.weight', 'transformer_blocks.8.ff_context.net.0.proj.weight_scale', 'transformer_blocks.8.ff_context.net.2.bias', 'transformer_blocks.8.ff_context.net.2.in_scale', 'transformer_blocks.8.ff_context.net.2.weight', 'transformer_blocks.8.ff_context.net.2.weight_scale', 'transformer_blocks.8.norm1.linear.bias', 'transformer_blocks.8.norm1.linear.in_scale', 'transformer_blocks.8.norm1.linear.weight', 'transformer_blocks.8.norm1.linear.weight_scale', 'transformer_blocks.8.norm1_context.linear.bias', 'transformer_blocks.8.norm1_context.linear.in_scale', 'transformer_blocks.8.norm1_context.linear.weight', 'transformer_blocks.8.norm1_context.linear.weight_scale', 'transformer_blocks.9.attn.add_k_proj.bias', 'transformer_blocks.9.attn.add_k_proj.in_scale', 'transformer_blocks.9.attn.add_k_proj.weight', 'transformer_blocks.9.attn.add_k_proj.weight_scale', 'transformer_blocks.9.attn.add_q_proj.bias', 'transformer_blocks.9.attn.add_q_proj.in_scale', 'transformer_blocks.9.attn.add_q_proj.weight', 'transformer_blocks.9.attn.add_q_proj.weight_scale', 'transformer_blocks.9.attn.add_v_proj.bias', 'transformer_blocks.9.attn.add_v_proj.in_scale', 'transformer_blocks.9.attn.add_v_proj.weight', 'transformer_blocks.9.attn.add_v_proj.weight_scale', 'transformer_blocks.9.attn.norm_added_k.weight', 'transformer_blocks.9.attn.norm_added_q.weight', 'transformer_blocks.9.attn.norm_k.weight', 'transformer_blocks.9.attn.norm_q.weight', 'transformer_blocks.9.attn.to_add_out.bias', 'transformer_blocks.9.attn.to_add_out.in_scale', 'transformer_blocks.9.attn.to_add_out.weight', 'transformer_blocks.9.attn.to_add_out.weight_scale', 'transformer_blocks.9.attn.to_k.bias', 'transformer_blocks.9.attn.to_k.in_scale', 'transformer_blocks.9.attn.to_k.weight', 'transformer_blocks.9.attn.to_k.weight_scale', 
from typing import Optional

from safetensors import safe_open
from safetensors.torch import load_file
import torch


def read_safetensor(filepath: str, key: Optional[str] = None):
    """
    Read tensors from a SafeTensor file using the safetensors library.

    Args:
        filepath: Path to the SafeTensor file.
        key: Optional name of a single tensor to extract. If None, every
            tensor in the file is loaded.

    Returns:
        If key is None: dictionary mapping each key in the file to its tensor.
        If key is specified: the single tensor stored under that key.

    Raises:
        KeyError: If ``key`` is given but is not present in the file.
    """
    # safe_open lets us fetch tensors one at a time instead of reading the
    # whole file up front, so the single-key path only loads what it needs.
    with safe_open(filepath, framework="pt") as f:
        keys = f.keys()

        if key is None:
            # No key requested: materialize every tensor into a dictionary.
            return {k: f.get_tensor(k) for k in keys}

        if key not in keys:
            raise KeyError(f"Key '{key}' not found. Available keys: {list(keys)}")
        return f.get_tensor(key)


def print_tensor_info(tensor_dict, num_values: int = 5):
    """
    Print the key, shape, dtype and leading values of each tensor.

    Args:
        tensor_dict: Mapping of tensor name to tensor; each value must expose
            ``shape``, ``dtype`` and ``flatten()``.
        num_values: How many leading elements of the flattened tensor to show.
            Defaults to 5, the preview length previously hard-coded here.
    """
    for key, tensor in tensor_dict.items():
        print(f"\nKey: {key}")
        print(f"Shape: {tensor.shape}")
        print(f"Dtype: {tensor.dtype}")
        # Flatten first so the preview works regardless of tensor rank.
        print(f"First few values: {tensor.flatten()[:num_values]}")


# Example usage
if __name__ == "__main__":
    # NOTE(review): absolute, machine-specific path; point this at your own
    # checkpoint shard before running on another machine.
    filepath = "/data/seungah/flux_test_fp8/transformer/diffusion_pytorch_model-00001-of-00002.safetensors"

    # Example 1: List all tensors and their info
    print("Loading all tensors:")
    tensors = read_safetensor(filepath)
    print_tensor_info(tensors)

    # Example 2: Load specific tensor
    print("\nLoading specific tensor:")
    try:
        key = "single_transformer_blocks.1.attn.to_k.in_scale"  # replace with actual key name
        tensor = read_safetensor(filepath, key)
        # Reuse the shared printer instead of repeating its four print calls.
        print_tensor_info({key: tensor})
    except KeyError as e:
        print(f"Error: {e}")

    # Alternative Method: Load entire file at once
    print("\nAlternative method - loading entire file:")
    # Release the first copy before loading again; otherwise both full sets
    # of tensors are alive at once while load_file runs.
    del tensors
    tensors = load_file(filepath)
    print(f"Available keys: {list(tensors.keys())}")