{ "metadata": { "ParamSize": 666, "ParamBytes": 5274895360.0, "BitsPerParam": 16.0 }, "records": [ { "dataPath": "params_shard_0.bin", "format": "raw-shard", "nbytes": 262144000, "records": [ { "name": "lm_head.linear.weight", "shape": [ 51200, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 262144000, "byteOffset": 0 } ], "md5sum": "9684592d7ec6b67fcc10f35cac7d1904" }, { "dataPath": "params_shard_1.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.31.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "748813ab89cb015c6ca66d7c37365f46" }, { "dataPath": "params_shard_2.bin", "format": "raw-shard", "nbytes": 262144000, "records": [ { "name": "transformer.embd.weight", "shape": [ 51200, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 262144000, "byteOffset": 0 } ], "md5sum": "e37c2be02d78b921e5ba2cb92a757019" }, { "dataPath": "params_shard_3.bin", "format": "raw-shard", "nbytes": 30366720, "records": [ { "name": "lm_head.linear.bias", "shape": [ 51200 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 102400, "byteOffset": 0 }, { "name": "lm_head.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 102400 }, { "name": "lm_head.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 107520 }, { "name": "transformer.h.31.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 112640 }, { "name": "transformer.h.31.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 117760 }, { "name": "transformer.h.31.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 122880 }, { "name": "transformer.h.0.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 128000 }, { "name": "transformer.h.0.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 133120 }, { "name": "transformer.h.0.mlp.fc1.ALinear_no_train.weight", "shape": [ 10240, 1475 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 30208000, "byteOffset": 138240 }, { "name": "transformer.h.0.mlp.fc1.ALinear_train.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 30346240 } ], "md5sum": "d4eb6762c6134e20dcd0d78278ee6100" }, { "dataPath": "params_shard_4.bin", "format": "raw-shard", "nbytes": 26419200, "records": [ { "name": "transformer.h.0.mlp.fc2.BLinear_no_train.weight", "shape": [ 1290, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 26419200, "byteOffset": 0 } ], "md5sum": "9bf563c0f7ffe075a6c583be218923b2" }, { "dataPath": "params_shard_5.bin", "format": "raw-shard", "nbytes": 32486400, "records": [ { "name": "transformer.h.0.mlp.fc1.ALinear_train.weight", "shape": [ 10240, 163 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 3338240, "byteOffset": 0 }, { "name": "transformer.h.0.mlp.fc1.BLinear_no_train.weight", "shape": [ 1475, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 7552000, "byteOffset": 3338240 }, { "name": "transformer.h.0.mlp.fc1.BLinear_train.weight", "shape": [ 163, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 834560, "byteOffset": 10890240 }, { "name": "transformer.h.0.mlp.fc2.ALinear_no_train.weight", "shape": [ 2560, 1290 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 6604800, "byteOffset": 11724800 }, { "name": "transformer.h.0.mlp.fc2.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 18329600 }, { "name": "transformer.h.0.mlp.fc2.ALinear_train.weight", "shape": [ 2560, 143 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 732160, "byteOffset": 18334720 }, { "name": "transformer.h.0.mlp.fc2.BLinear_train.weight", "shape": [ 143, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 2928640, "byteOffset": 19066880 }, { "name": "transformer.h.0.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 922 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 21995520 }, { "name": "transformer.h.0.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 26716160 }, { "name": "transformer.h.0.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 102 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 26721280 }, { "name": "transformer.h.0.mixer.out_proj.BLinear_no_train.weight", "shape": [ 922, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 27243520 }, { "name": "transformer.h.0.mixer.out_proj.BLinear_train.weight", "shape": [ 102, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 31964160 } ], "md5sum": "36260d919cff72b30d6bc92e58ef847e" }, { "dataPath": "params_shard_6.bin", "format": "raw-shard", "nbytes": 30208000, "records": [ { "name": "transformer.h.1.mlp.fc1.ALinear_no_train.weight", "shape": [ 10240, 1475 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 30208000, "byteOffset": 0 } ], "md5sum": "6eac0e6eb4975087190c219e22948795" }, { "dataPath": "params_shard_7.bin", "format": "raw-shard", "nbytes": 32814080, "records": [ { "name": "transformer.h.0.mixer.k_proj.ALinear_no_train.weight", "shape": [ 2560, 922 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 0 }, { "name": "transformer.h.0.mixer.k_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 4720640 }, { "name": "transformer.h.0.mixer.k_proj.ALinear_train.weight", "shape": [ 2560, 102 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 4725760 }, { "name": "transformer.h.0.mixer.k_proj.BLinear_no_train.weight", "shape": [ 922, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 5248000 }, { "name": "transformer.h.0.mixer.k_proj.BLinear_train.weight", "shape": [ 102, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 9968640 }, { "name": "transformer.h.0.mixer.q_proj.ALinear_no_train.weight", "shape": [ 2560, 807 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 10490880 }, { "name": "transformer.h.0.mixer.q_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 14622720 }, { "name": "transformer.h.0.mixer.q_proj.ALinear_train.weight", "shape": [ 2560, 89 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 14627840 }, { "name": "transformer.h.0.mixer.q_proj.BLinear_no_train.weight", "shape": [ 807, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 15083520 }, { "name": "transformer.h.0.mixer.q_proj.BLinear_train.weight", "shape": [ 89, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 19215360 }, { "name": "transformer.h.0.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 19671040 }, { "name": "transformer.h.0.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 19676160 }, { "name": "transformer.h.1.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 32783360 }, { "name": "transformer.h.1.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 32788480 }, { "name": "transformer.h.1.mlp.fc1.ALinear_train.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 32793600 } ], "md5sum": "882b8d4af239f5bc6fa274a86afb09f2" }, { "dataPath": "params_shard_8.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.1.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "141766722b27e68325e208088b7cfe75" }, { "dataPath": "params_shard_9.bin", "format": "raw-shard", "nbytes": 28784640, "records": [ { "name": "transformer.h.1.mlp.fc1.ALinear_train.weight", "shape": [ 10240, 163 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 3338240, "byteOffset": 0 }, { "name": "transformer.h.1.mlp.fc1.BLinear_no_train.weight", "shape": [ 1475, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 7552000, "byteOffset": 3338240 }, { "name": "transformer.h.1.mlp.fc1.BLinear_train.weight", "shape": [ 163, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 834560, "byteOffset": 10890240 }, { "name": "transformer.h.1.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 11724800 }, { "name": "transformer.h.1.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 807 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 11729920 }, { "name": "transformer.h.1.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 15861760 }, { "name": "transformer.h.1.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 89 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 15866880 }, { "name": "transformer.h.1.mixer.out_proj.BLinear_no_train.weight", "shape": [ 807, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 16322560 }, { "name": "transformer.h.1.mixer.out_proj.BLinear_train.weight", "shape": [ 89, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 20454400 }, { "name": "transformer.h.1.mixer.k_proj.ALinear_no_train.weight", "shape": [ 2560, 692 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 3543040, "byteOffset": 20910080 }, { "name": "transformer.h.1.mixer.k_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 24453120 }, { "name": "transformer.h.1.mixer.k_proj.ALinear_train.weight", "shape": [ 2560, 76 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 389120, "byteOffset": 24458240 }, { "name": "transformer.h.1.mixer.k_proj.BLinear_no_train.weight", "shape": [ 692, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 3543040, "byteOffset": 24847360 }, { "name": "transformer.h.1.mixer.k_proj.BLinear_train.weight", "shape": [ 76, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 389120, "byteOffset": 28390400 }, { "name": "transformer.h.1.mixer.q_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 28779520 } ], "md5sum": "bf67187e79c07066743bdc29bb45eeb0" }, { "dataPath": "params_shard_10.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.10.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "d04557c3e6ac6baf219838ac31baa24d" }, { "dataPath": "params_shard_11.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.10.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "c0303861baea6886db6c1c78a3062aca" }, { "dataPath": "params_shard_12.bin", "format": "raw-shard", "nbytes": 32819200, "records": [ { "name": "transformer.h.1.mixer.q_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.1.mixer.v_proj.ALinear_no_train.weight", "shape": [ 2560, 576 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 2949120, "byteOffset": 13107200 }, { "name": "transformer.h.1.mixer.v_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 16056320 }, { "name": "transformer.h.1.mixer.v_proj.ALinear_train.weight", "shape": [ 2560, 64 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 327680, "byteOffset": 16061440 }, { "name": "transformer.h.1.mixer.v_proj.BLinear_no_train.weight", "shape": [ 576, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 2949120, "byteOffset": 16389120 }, { "name": "transformer.h.1.mixer.v_proj.BLinear_train.weight", "shape": [ 64, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 327680, "byteOffset": 19338240 }, { "name": "transformer.h.10.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 19665920 }, { "name": "transformer.h.10.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 19671040 }, { "name": "transformer.h.10.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 19676160 }, { "name": "transformer.h.10.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 19696640 }, { "name": "transformer.h.10.mixer.out_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 19701760 }, { "name": "transformer.h.10.mixer.out_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 19706880 }, { "name": "transformer.h.10.mixer.k_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 32814080 } ], "md5sum": "0e15928230837054c237b3523c08a82d" }, { "dataPath": "params_shard_13.bin", "format": "raw-shard", "nbytes": 20981760, "records": [ { "name": "transformer.h.10.mixer.k_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.10.mixer.q_proj.ALinear_no_train.weight", "shape": [ 2560, 692 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 3543040, "byteOffset": 13107200 }, { "name": "transformer.h.10.mixer.q_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 16650240 }, { "name": "transformer.h.10.mixer.q_proj.ALinear_train.weight", "shape": [ 2560, 76 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 389120, "byteOffset": 16655360 }, { "name": "transformer.h.10.mixer.q_proj.BLinear_no_train.weight", "shape": [ 692, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 3543040, "byteOffset": 17044480 }, { "name": "transformer.h.10.mixer.q_proj.BLinear_train.weight", "shape": [ 76, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 389120, "byteOffset": 20587520 }, { "name": "transformer.h.10.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 20976640 } ], "md5sum": "832476266ba257f137d19a01c7e47586" }, { "dataPath": "params_shard_14.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.11.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "ffc203dfb4997b69b726794e490452c6" }, { "dataPath": "params_shard_15.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.11.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "2a6a3ae1665dab337eb373d4aac10a27" }, { "dataPath": "params_shard_16.bin", "format": "raw-shard", "nbytes": 31508480, "records": [ { "name": "transformer.h.10.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.11.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13107200 }, { "name": "transformer.h.11.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13112320 }, { "name": "transformer.h.11.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 13117440 }, { "name": "transformer.h.11.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13137920 }, { "name": "transformer.h.11.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 461 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 2360320, "byteOffset": 13143040 }, { "name": "transformer.h.11.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 15503360 }, { "name": "transformer.h.11.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 51 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 261120, "byteOffset": 15508480 }, { "name": "transformer.h.11.mixer.out_proj.BLinear_no_train.weight", "shape": [ 461, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 2360320, "byteOffset": 15769600 }, { "name": "transformer.h.11.mixer.out_proj.BLinear_train.weight", "shape": [ 51, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 261120, "byteOffset": 18129920 }, { "name": "transformer.h.11.mixer.k_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 18391040 }, { "name": "transformer.h.11.mixer.k_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 18396160 }, { "name": "transformer.h.11.mixer.q_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 31503360 } ], "md5sum": "b217e37267cd335ac55fc3a3f856c566" }, { "dataPath": "params_shard_17.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.12.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "5b319acb7b3dc8daca93b10ce25193c0" }, { "dataPath": "params_shard_18.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.12.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "47cc3e8d3077442934f4d575164eeb45" }, { "dataPath": "params_shard_19.bin", "format": "raw-shard", "nbytes": 32819200, "records": [ { "name": "transformer.h.11.mixer.q_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.11.mixer.v_proj.ALinear_no_train.weight", "shape": [ 2560, 692 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 3543040, "byteOffset": 13107200 }, { "name": "transformer.h.11.mixer.v_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 16650240 }, { "name": "transformer.h.11.mixer.v_proj.ALinear_train.weight", "shape": [ 2560, 76 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 389120, "byteOffset": 16655360 }, { "name": "transformer.h.11.mixer.v_proj.BLinear_no_train.weight", "shape": [ 692, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 3543040, "byteOffset": 17044480 }, { "name": "transformer.h.11.mixer.v_proj.BLinear_train.weight", "shape": [ 76, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 389120, "byteOffset": 20587520 }, { "name": "transformer.h.12.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 20976640 }, { "name": "transformer.h.12.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 20981760 }, { "name": "transformer.h.12.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 20986880 }, { "name": "transformer.h.12.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 21007360 }, { "name": "transformer.h.12.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 1037 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 21012480 }, { "name": "transformer.h.12.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 26321920 }, { "name": "transformer.h.12.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 115 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 26327040 }, { "name": "transformer.h.12.mixer.out_proj.BLinear_no_train.weight", "shape": [ 1037, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 26915840 }, { "name": "transformer.h.12.mixer.out_proj.BLinear_train.weight", "shape": [ 115, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 32225280 }, { "name": "transformer.h.12.mixer.k_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 32814080 } ], "md5sum": "0ebe014ca8f2006df10985c8a1984743" }, { "dataPath": "params_shard_20.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.13.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "cb98746a88c05ded63e09a48e7558f99" }, { "dataPath": "params_shard_21.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.13.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "3bd5da4b2931260e8819045539375452" }, { "dataPath": "params_shard_22.bin", "format": "raw-shard", "nbytes": 32814080, "records": [ { "name": "transformer.h.12.mixer.k_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.12.mixer.q_proj.ALinear_no_train.weight", "shape": [ 2560, 807 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 13107200 }, { "name": "transformer.h.12.mixer.q_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 17239040 }, { "name": "transformer.h.12.mixer.q_proj.ALinear_train.weight", "shape": [ 2560, 89 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 17244160 }, { "name": "transformer.h.12.mixer.q_proj.BLinear_no_train.weight", "shape": [ 807, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 17699840 }, { "name": "transformer.h.12.mixer.q_proj.BLinear_train.weight", "shape": [ 89, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 21831680 }, { "name": "transformer.h.12.mixer.v_proj.ALinear_no_train.weight", "shape": [ 2560, 922 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 22287360 }, { "name": "transformer.h.12.mixer.v_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 27008000 }, { "name": "transformer.h.12.mixer.v_proj.ALinear_train.weight", "shape": [ 2560, 102 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 27013120 }, { "name": "transformer.h.12.mixer.v_proj.BLinear_no_train.weight", "shape": [ 922, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 27535360 }, { "name": "transformer.h.12.mixer.v_proj.BLinear_train.weight", "shape": [ 102, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 32256000 }, { "name": "transformer.h.13.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 32778240 }, { "name": "transformer.h.13.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 32783360 }, { "name": "transformer.h.13.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 32788480 }, { "name": "transformer.h.13.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 32808960 } ], "md5sum": "6eada39dbdb4fc7cd202aa8515b9ab45" }, { "dataPath": "params_shard_23.bin", "format": "raw-shard", "nbytes": 28856320, "records": [ { "name": "transformer.h.13.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 807 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 0 }, { "name": "transformer.h.13.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 4131840 }, { "name": "transformer.h.13.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 89 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 4136960 }, { "name": "transformer.h.13.mixer.out_proj.BLinear_no_train.weight", "shape": [ 807, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 4592640 }, { "name": "transformer.h.13.mixer.out_proj.BLinear_train.weight", "shape": [ 89, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 8724480 }, { "name": "transformer.h.13.mixer.k_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 9180160 }, { "name": "transformer.h.13.mixer.k_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 9185280 }, { "name": "transformer.h.13.mixer.q_proj.ALinear_no_train.weight", "shape": [ 2560, 576 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 2949120, "byteOffset": 22292480 }, { "name": "transformer.h.13.mixer.q_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 25241600 }, { "name": "transformer.h.13.mixer.q_proj.ALinear_train.weight", "shape": [ 2560, 64 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 327680, "byteOffset": 25246720 }, { "name": "transformer.h.13.mixer.q_proj.BLinear_no_train.weight", "shape": [ 576, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 2949120, "byteOffset": 25574400 }, { "name": "transformer.h.13.mixer.q_proj.BLinear_train.weight", "shape": [ 64, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 327680, "byteOffset": 28523520 }, { "name": "transformer.h.13.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 28851200 } ], "md5sum": "cf1a937e8490bc39d8600e80a1020313" }, { "dataPath": "params_shard_24.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.14.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "68c10db80a7334706af0e185652f90fc" }, { "dataPath": "params_shard_25.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.14.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "c945c4e8f14d65dfccdcbd1c385ed4c2" }, { "dataPath": "params_shard_26.bin", "format": "raw-shard", "nbytes": 24949760, "records": [ { "name": "transformer.h.13.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.14.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13107200 }, { "name": "transformer.h.14.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13112320 }, { "name": "transformer.h.14.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 13117440 }, { "name": "transformer.h.14.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13137920 }, { "name": "transformer.h.14.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 1037 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 13143040 }, { "name": "transformer.h.14.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 18452480 }, { "name": "transformer.h.14.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 115 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 18457600 }, { "name": "transformer.h.14.mixer.out_proj.BLinear_no_train.weight", "shape": [ 1037, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 19046400 }, { "name": "transformer.h.14.mixer.out_proj.BLinear_train.weight", "shape": [ 115, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 24355840 }, { "name": "transformer.h.14.mixer.k_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 24944640 } ], "md5sum": "7ddfdfab9a4cb62d113943016ec3ffc6" }, { "dataPath": "params_shard_27.bin", "format": "raw-shard", "nbytes": 22292480, "records": [ { "name": "transformer.h.14.mixer.k_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.14.mixer.q_proj.ALinear_no_train.weight", "shape": [ 2560, 807 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 13107200 }, { "name": "transformer.h.14.mixer.q_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 17239040 }, { "name": "transformer.h.14.mixer.q_proj.ALinear_train.weight", "shape": [ 2560, 89 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 17244160 }, { "name": "transformer.h.14.mixer.q_proj.BLinear_no_train.weight", "shape": [ 807, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 17699840 }, { "name": "transformer.h.14.mixer.q_proj.BLinear_train.weight", "shape": [ 89, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 21831680 }, { "name": "transformer.h.14.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 22287360 } ], "md5sum": "003cb387ac14f1a39a670512c9429232" }, { "dataPath": "params_shard_28.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.15.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "fe98d47961b263e1f0d1c61907060e25" }, { "dataPath": "params_shard_29.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.15.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "7e870d2369a688e73ffd7ed92da846c5" }, { "dataPath": "params_shard_30.bin", "format": "raw-shard", "nbytes": 24949760, "records": [ { "name": "transformer.h.14.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.15.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13107200 }, { "name": "transformer.h.15.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13112320 }, { "name": "transformer.h.15.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 13117440 }, { "name": "transformer.h.15.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13137920 }, { "name": "transformer.h.15.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 1037 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 13143040 }, { "name": "transformer.h.15.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 18452480 }, { "name": "transformer.h.15.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 115 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 18457600 }, { "name": "transformer.h.15.mixer.out_proj.BLinear_no_train.weight", "shape": [ 1037, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 19046400 }, { "name": "transformer.h.15.mixer.out_proj.BLinear_train.weight", "shape": [ 115, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 24355840 }, { "name": "transformer.h.15.mixer.k_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 24944640 } ], "md5sum": "be1d216583dcf0b1b3b4073710c66609" }, { "dataPath": "params_shard_31.bin", "format": "raw-shard", "nbytes": 22292480, "records": [ { "name": "transformer.h.15.mixer.k_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.15.mixer.q_proj.ALinear_no_train.weight", "shape": [ 2560, 807 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 13107200 }, { "name": "transformer.h.15.mixer.q_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 17239040 }, { "name": "transformer.h.15.mixer.q_proj.ALinear_train.weight", "shape": [ 2560, 89 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 17244160 }, { "name": "transformer.h.15.mixer.q_proj.BLinear_no_train.weight", "shape": [ 807, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 17699840 }, { "name": "transformer.h.15.mixer.q_proj.BLinear_train.weight", "shape": [ 89, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 21831680 }, { "name": "transformer.h.15.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 22287360 } ], "md5sum": "38f89a55538bf9f702a6af884fc87776" }, { "dataPath": "params_shard_32.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.16.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "0433ad57305b86965fd525423cd91ff7" }, { "dataPath": "params_shard_33.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.16.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "e2bbe78b2a7b0430d6cbeca11dc26d00" }, { "dataPath": "params_shard_34.bin", "format": "raw-shard", "nbytes": 28881920, "records": [ { "name": "transformer.h.15.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.16.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13107200 }, { "name": "transformer.h.16.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13112320 }, { "name": "transformer.h.16.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 13117440 }, { "name": "transformer.h.16.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13137920 }, { "name": "transformer.h.16.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 922 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 13143040 }, { "name": "transformer.h.16.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 17863680 }, { "name": "transformer.h.16.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 102 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 17868800 }, { "name": "transformer.h.16.mixer.out_proj.BLinear_no_train.weight", "shape": [ 922, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 18391040 }, { "name": "transformer.h.16.mixer.out_proj.BLinear_train.weight", "shape": [ 102, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 23111680 }, { "name": "transformer.h.16.mixer.k_proj.ALinear_no_train.weight", "shape": [ 2560, 922 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 23633920 }, { "name": "transformer.h.16.mixer.k_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 28354560 }, { "name": "transformer.h.16.mixer.k_proj.ALinear_train.weight", "shape": [ 2560, 102 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 28359680 } ], "md5sum": "d0e5320dea5ba2ceab8aa3f295873989" }, { "dataPath": "params_shard_35.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.17.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "b3ff385e7f3f3b07064a6cfb8d6381cd" }, { "dataPath": "params_shard_36.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.17.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "d88002c1ccff3d742b52c12191448a48" }, { "dataPath": "params_shard_37.bin", "format": "raw-shard", "nbytes": 31503360, "records": [ { "name": "transformer.h.16.mixer.k_proj.BLinear_no_train.weight", "shape": [ 922, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 0 }, { "name": "transformer.h.16.mixer.k_proj.BLinear_train.weight", "shape": [ 102, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 4720640 }, { "name": "transformer.h.16.mixer.q_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 5242880 }, { "name": "transformer.h.16.mixer.q_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 5248000 }, { "name": "transformer.h.16.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 18355200 }, { "name": "transformer.h.16.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 18360320 }, { "name": "transformer.h.17.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 31467520 }, { "name": "transformer.h.17.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 31472640 }, { "name": "transformer.h.17.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 31477760 }, { "name": "transformer.h.17.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 31498240 } ], "md5sum": "c51f7e29dac58837c21514705b9b026e" }, { "dataPath": "params_shard_38.bin", "format": "raw-shard", "nbytes": 29506560, "records": [ { "name": "transformer.h.17.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 1037 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 0 }, { "name": "transformer.h.17.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 5309440 }, { "name": "transformer.h.17.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 115 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 5314560 }, { "name": "transformer.h.17.mixer.out_proj.BLinear_no_train.weight", "shape": [ 1037, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 5903360 }, { "name": "transformer.h.17.mixer.out_proj.BLinear_train.weight", "shape": [ 115, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 11212800 }, { "name": "transformer.h.17.mixer.k_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 11801600 }, { "name": "transformer.h.17.mixer.k_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 11806720 }, { "name": "transformer.h.17.mixer.q_proj.ALinear_no_train.weight", "shape": [ 2560, 807 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 24913920 }, { "name": "transformer.h.17.mixer.q_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 29045760 }, { "name": "transformer.h.17.mixer.q_proj.ALinear_train.weight", "shape": [ 2560, 89 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 29050880 } ], "md5sum": "e484f10adbcf00a6fdc0e37b7dc8752a" }, { "dataPath": "params_shard_39.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.18.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "26995d6016d2b1cc36a406ee7eb0abc3" }, { "dataPath": "params_shard_40.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.18.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "397442f2c9365b05454695b885f1708c" }, { "dataPath": "params_shard_41.bin", "format": "raw-shard", "nbytes": 28231680, "records": [ { "name": "transformer.h.17.mixer.q_proj.BLinear_no_train.weight", "shape": [ 807, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 0 }, { "name": "transformer.h.17.mixer.q_proj.BLinear_train.weight", "shape": [ 89, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 4131840 }, { "name": "transformer.h.17.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 4587520 }, { "name": "transformer.h.17.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 4592640 }, { "name": "transformer.h.18.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 17699840 }, { "name": "transformer.h.18.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 17704960 }, { "name": "transformer.h.18.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 17710080 }, { "name": "transformer.h.18.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 17730560 }, { "name": "transformer.h.18.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 922 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 17735680 }, { "name": "transformer.h.18.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 22456320 }, { "name": "transformer.h.18.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 102 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 22461440 }, { "name": "transformer.h.18.mixer.out_proj.BLinear_no_train.weight", "shape": [ 922, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 22983680 }, { "name": "transformer.h.18.mixer.out_proj.BLinear_train.weight", "shape": [ 102, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 27704320 }, { "name": "transformer.h.18.mixer.k_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 28226560 } ], "md5sum": "48d84b940d6709b5d555259a62051785" }, { "dataPath": "params_shard_42.bin", "format": "raw-shard", "nbytes": 26224640, "records": [ { "name": "transformer.h.18.mixer.k_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.18.mixer.q_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13107200 }, { "name": "transformer.h.18.mixer.q_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 13112320 }, { "name": "transformer.h.18.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 26219520 } ], "md5sum": "5ac3d19a6aacace6399c39348d494265" }, { "dataPath": "params_shard_43.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.19.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "23fd732cb5ad43a7a6e3cbf11bf32e52" }, { "dataPath": "params_shard_44.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.19.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "79c76d75b0c6bcea50768615e46dfa45" }, { "dataPath": "params_shard_45.bin", "format": "raw-shard", "nbytes": 31503360, "records": [ { "name": "transformer.h.18.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.19.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13107200 }, { "name": "transformer.h.19.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13112320 }, { "name": "transformer.h.19.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 13117440 }, { "name": "transformer.h.19.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13137920 }, { "name": "transformer.h.19.mixer.out_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13143040 }, { "name": "transformer.h.19.mixer.out_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 13148160 }, { "name": "transformer.h.19.mixer.k_proj.ALinear_no_train.weight", "shape": [ 2560, 922 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 26255360 }, { "name": "transformer.h.19.mixer.k_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 30976000 }, { "name": "transformer.h.19.mixer.k_proj.ALinear_train.weight", "shape": [ 2560, 102 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 30981120 } ], "md5sum": "95769a635008780c7be9729b0f918565" }, { "dataPath": "params_shard_46.bin", "format": "raw-shard", "nbytes": 30208000, "records": [ { "name": "transformer.h.2.mlp.fc1.ALinear_no_train.weight", "shape": [ 10240, 1475 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 30208000, "byteOffset": 0 } ], "md5sum": "1b85c81756209c05921f4e51ac2a345e" }, { "dataPath": "params_shard_47.bin", "format": "raw-shard", "nbytes": 32215040, "records": [ { "name": "transformer.h.19.mixer.k_proj.BLinear_no_train.weight", "shape": [ 922, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 0 }, { "name": "transformer.h.19.mixer.k_proj.BLinear_train.weight", "shape": [ 102, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 4720640 }, { "name": "transformer.h.19.mixer.q_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 5242880 }, { "name": "transformer.h.19.mixer.q_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 5248000 }, { "name": "transformer.h.19.mixer.v_proj.ALinear_no_train.weight", "shape": [ 2560, 922 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 18355200 }, { "name": "transformer.h.19.mixer.v_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 23075840 }, { "name": "transformer.h.19.mixer.v_proj.ALinear_train.weight", "shape": [ 2560, 102 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 23080960 }, { "name": "transformer.h.19.mixer.v_proj.BLinear_no_train.weight", "shape": [ 922, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 23603200 }, { "name": "transformer.h.19.mixer.v_proj.BLinear_train.weight", "shape": [ 102, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 28323840 }, { "name": "transformer.h.2.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 28846080 }, { "name": "transformer.h.2.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 28851200 }, { "name": "transformer.h.2.mlp.fc1.ALinear_train.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 28856320 }, { "name": "transformer.h.2.mlp.fc1.ALinear_train.weight", "shape": [ 10240, 163 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 3338240, "byteOffset": 28876800 } ], "md5sum": "3a52d8da6a42b9ec460822db957a2add" }, { "dataPath": "params_shard_48.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.2.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "273b6c3aec98e48abf7103ac6b213553" }, { "dataPath": "params_shard_49.bin", "format": "raw-shard", "nbytes": 32000000, "records": [ { "name": "transformer.h.2.mlp.fc1.BLinear_no_train.weight", "shape": [ 1475, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 7552000, "byteOffset": 0 }, { "name": "transformer.h.2.mlp.fc1.BLinear_train.weight", "shape": [ 163, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 834560, "byteOffset": 7552000 }, { "name": "transformer.h.2.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 8386560 }, { "name": "transformer.h.2.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 807 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 8391680 }, { "name": "transformer.h.2.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 12523520 }, { "name": "transformer.h.2.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 89 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 12528640 }, { "name": "transformer.h.2.mixer.out_proj.BLinear_no_train.weight", "shape": [ 807, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 12984320 }, { "name": "transformer.h.2.mixer.out_proj.BLinear_train.weight", "shape": [ 89, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 17116160 }, { "name": "transformer.h.2.mixer.k_proj.ALinear_no_train.weight", "shape": [ 2560, 1037 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 17571840 }, { "name": "transformer.h.2.mixer.k_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 22881280 }, { "name": "transformer.h.2.mixer.k_proj.ALinear_train.weight", "shape": [ 2560, 115 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 22886400 }, { "name": "transformer.h.2.mixer.k_proj.BLinear_no_train.weight", "shape": [ 1037, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 23475200 }, { "name": "transformer.h.2.mixer.k_proj.BLinear_train.weight", "shape": [ 115, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 28784640 }, { "name": "transformer.h.2.mixer.q_proj.ALinear_no_train.weight", "shape": [ 2560, 461 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 2360320, "byteOffset": 29373440 }, { "name": "transformer.h.2.mixer.q_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 31733760 }, { "name": "transformer.h.2.mixer.q_proj.ALinear_train.weight", "shape": [ 2560, 51 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 261120, "byteOffset": 31738880 } ], "md5sum": "e45779ca4d8b8c6741ac356582a4ea19" }, { "dataPath": "params_shard_50.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.20.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "59da877313e04b7b6d41c4624da17060" }, { "dataPath": "params_shard_51.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.20.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "261b40afb227581a473f03c19a660403" }, { "dataPath": "params_shard_52.bin", "format": "raw-shard", "nbytes": 24954880, "records": [ { "name": "transformer.h.2.mixer.q_proj.BLinear_no_train.weight", "shape": [ 461, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 2360320, "byteOffset": 0 }, { "name": "transformer.h.2.mixer.q_proj.BLinear_train.weight", "shape": [ 51, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 261120, "byteOffset": 2360320 }, { "name": "transformer.h.2.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 2621440 }, { "name": "transformer.h.2.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 2626560 }, { "name": "transformer.h.20.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 15733760 }, { "name": "transformer.h.20.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 15738880 }, { "name": "transformer.h.20.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 15744000 }, { "name": "transformer.h.20.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 15764480 }, { "name": "transformer.h.20.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 807 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 15769600 }, { "name": "transformer.h.20.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 19901440 }, { "name": "transformer.h.20.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 89 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 19906560 }, { "name": "transformer.h.20.mixer.out_proj.BLinear_no_train.weight", "shape": [ 807, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 20362240 }, { "name": "transformer.h.20.mixer.out_proj.BLinear_train.weight", "shape": [ 89, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 24494080 }, { "name": "transformer.h.20.mixer.k_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 24949760 } ], "md5sum": "018626b784d94a0b2857be155b2e57f2" }, { "dataPath": "params_shard_53.bin", "format": "raw-shard", "nbytes": 24913920, "records": [ { "name": "transformer.h.20.mixer.k_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.20.mixer.q_proj.ALinear_no_train.weight", "shape": [ 2560, 1037 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 13107200 }, { "name": "transformer.h.20.mixer.q_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 18416640 }, { "name": "transformer.h.20.mixer.q_proj.ALinear_train.weight", "shape": [ 2560, 115 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 18421760 }, { "name": "transformer.h.20.mixer.q_proj.BLinear_no_train.weight", "shape": [ 1037, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 19010560 }, { "name": "transformer.h.20.mixer.q_proj.BLinear_train.weight", "shape": [ 115, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 24320000 }, { "name": "transformer.h.20.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 24908800 } ], "md5sum": "96e955c2519e3b68e1a2600fcce436c3" }, { "dataPath": "params_shard_54.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.21.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "4d536325afe5df811fa1c96a23927e29" }, { "dataPath": "params_shard_55.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.21.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "c20e972caea140ee116d6611a90f4d46" }, { "dataPath": "params_shard_56.bin", "format": "raw-shard", "nbytes": 21017600, "records": [ { "name": "transformer.h.20.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.21.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13107200 }, { "name": "transformer.h.21.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13112320 }, { "name": "transformer.h.21.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 13117440 }, { "name": "transformer.h.21.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13137920 }, { "name": "transformer.h.21.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 692 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 3543040, "byteOffset": 13143040 }, { "name": "transformer.h.21.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 16686080 }, { "name": "transformer.h.21.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 76 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 389120, "byteOffset": 16691200 }, { "name": "transformer.h.21.mixer.out_proj.BLinear_no_train.weight", "shape": [ 692, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 3543040, "byteOffset": 17080320 }, { "name": "transformer.h.21.mixer.out_proj.BLinear_train.weight", "shape": [ 76, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 389120, "byteOffset": 20623360 }, { "name": "transformer.h.21.mixer.k_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 21012480 } ], "md5sum": "0ff148205da47304ec8f8566b4d11132" }, { "dataPath": "params_shard_57.bin", "format": "raw-shard", "nbytes": 24913920, "records": [ { "name": "transformer.h.21.mixer.k_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.21.mixer.q_proj.ALinear_no_train.weight", "shape": [ 2560, 1037 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 13107200 }, { "name": "transformer.h.21.mixer.q_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 18416640 }, { "name": "transformer.h.21.mixer.q_proj.ALinear_train.weight", "shape": [ 2560, 115 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 18421760 }, { "name": "transformer.h.21.mixer.q_proj.BLinear_no_train.weight", "shape": [ 1037, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 19010560 }, { "name": "transformer.h.21.mixer.q_proj.BLinear_train.weight", "shape": [ 115, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 24320000 }, { "name": "transformer.h.21.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 24908800 } ], "md5sum": "f4549957d1cd35cb8a5736273c78a389" }, { "dataPath": "params_shard_58.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.22.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "cbb16228d1a57bb8a422b82718dbe843" }, { "dataPath": "params_shard_59.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.22.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "fc2e0c772103f1b9b5040c5179c14382" }, { "dataPath": "params_shard_60.bin", "format": "raw-shard", "nbytes": 24949760, "records": [ { "name": "transformer.h.21.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.22.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13107200 }, { "name": "transformer.h.22.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13112320 }, { "name": "transformer.h.22.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 13117440 }, { "name": "transformer.h.22.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13137920 }, { "name": "transformer.h.22.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 1037 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 13143040 }, { "name": "transformer.h.22.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 18452480 }, { "name": "transformer.h.22.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 115 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 18457600 }, { "name": "transformer.h.22.mixer.out_proj.BLinear_no_train.weight", "shape": [ 1037, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 19046400 }, { "name": "transformer.h.22.mixer.out_proj.BLinear_train.weight", "shape": [ 115, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 24355840 }, { "name": "transformer.h.22.mixer.k_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 24944640 } ], "md5sum": "6c039d3ea1e2f127350bfceccf98d03d" }, { "dataPath": "params_shard_61.bin", "format": "raw-shard", "nbytes": 22292480, "records": [ { "name": "transformer.h.22.mixer.k_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.22.mixer.q_proj.ALinear_no_train.weight", "shape": [ 2560, 807 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 13107200 }, { "name": "transformer.h.22.mixer.q_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 17239040 }, { "name": "transformer.h.22.mixer.q_proj.ALinear_train.weight", "shape": [ 2560, 89 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 17244160 }, { "name": "transformer.h.22.mixer.q_proj.BLinear_no_train.weight", "shape": [ 807, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 17699840 }, { "name": "transformer.h.22.mixer.q_proj.BLinear_train.weight", "shape": [ 89, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 21831680 }, { "name": "transformer.h.22.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 22287360 } ], "md5sum": "b566179742cdcc020e1a03ffb974729e" }, { "dataPath": "params_shard_62.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.23.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "7488fe386f610cf92f6f8994fc73eaca" }, { "dataPath": "params_shard_63.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.23.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "7c7905eca1afb5434e2b2eafbfec2449" }, { "dataPath": "params_shard_64.bin", "format": "raw-shard", "nbytes": 23639040, "records": [ { "name": "transformer.h.22.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.23.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13107200 }, { "name": "transformer.h.23.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13112320 }, { "name": "transformer.h.23.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 13117440 }, { "name": "transformer.h.23.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13137920 }, { "name": "transformer.h.23.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 922 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 13143040 }, { "name": "transformer.h.23.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 17863680 }, { "name": "transformer.h.23.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 102 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 17868800 }, { "name": "transformer.h.23.mixer.out_proj.BLinear_no_train.weight", "shape": [ 922, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 18391040 }, { "name": "transformer.h.23.mixer.out_proj.BLinear_train.weight", "shape": [ 102, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 23111680 }, { "name": "transformer.h.23.mixer.k_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 23633920 } ], "md5sum": "24e28b59208876e57911beaaf1a03d1b" }, { "dataPath": "params_shard_65.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.24.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "3d1d512fcd9f03ffdf30e50c60a68204" }, { "dataPath": "params_shard_66.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.24.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "19e1545689c6a85010f6ea212eef0b02" }, { "dataPath": "params_shard_67.bin", "format": "raw-shard", "nbytes": 31503360, "records": [ { "name": "transformer.h.23.mixer.k_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.23.mixer.q_proj.ALinear_no_train.weight", "shape": [ 2560, 922 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 13107200 }, { "name": "transformer.h.23.mixer.q_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 17827840 }, { "name": "transformer.h.23.mixer.q_proj.ALinear_train.weight", "shape": [ 2560, 102 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 17832960 }, { "name": "transformer.h.23.mixer.q_proj.BLinear_no_train.weight", "shape": [ 922, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 18355200 }, { "name": "transformer.h.23.mixer.q_proj.BLinear_train.weight", "shape": [ 102, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 23075840 }, { "name": "transformer.h.23.mixer.v_proj.ALinear_no_train.weight", "shape": [ 2560, 692 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 3543040, "byteOffset": 23598080 }, { "name": "transformer.h.23.mixer.v_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 27141120 }, { "name": "transformer.h.23.mixer.v_proj.ALinear_train.weight", "shape": [ 2560, 76 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 389120, "byteOffset": 27146240 }, { "name": "transformer.h.23.mixer.v_proj.BLinear_no_train.weight", "shape": [ 692, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 3543040, "byteOffset": 27535360 }, { "name": "transformer.h.23.mixer.v_proj.BLinear_train.weight", "shape": [ 76, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 389120, "byteOffset": 31078400 }, { "name": "transformer.h.24.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 31467520 }, { "name": "transformer.h.24.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 31472640 }, { "name": "transformer.h.24.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 31477760 }, { "name": "transformer.h.24.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 31498240 } ], "md5sum": "83ea146e73153fb9ecc5b1dc5a730f97" }, { "dataPath": "params_shard_68.bin", "format": "raw-shard", "nbytes": 28856320, "records": [ { "name": "transformer.h.24.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 461 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 2360320, "byteOffset": 0 }, { "name": "transformer.h.24.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 2360320 }, { "name": "transformer.h.24.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 51 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 261120, "byteOffset": 2365440 }, { "name": "transformer.h.24.mixer.out_proj.BLinear_no_train.weight", "shape": [ 461, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 2360320, "byteOffset": 2626560 }, { "name": "transformer.h.24.mixer.out_proj.BLinear_train.weight", "shape": [ 51, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 261120, "byteOffset": 4986880 }, { "name": "transformer.h.24.mixer.k_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 5248000 }, { "name": "transformer.h.24.mixer.k_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 5253120 }, { "name": "transformer.h.24.mixer.q_proj.ALinear_no_train.weight", "shape": [ 2560, 922 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 18360320 }, { "name": "transformer.h.24.mixer.q_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 23080960 }, { "name": "transformer.h.24.mixer.q_proj.ALinear_train.weight", "shape": [ 2560, 102 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 23086080 }, { "name": "transformer.h.24.mixer.q_proj.BLinear_no_train.weight", "shape": [ 922, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 23608320 }, { "name": "transformer.h.24.mixer.q_proj.BLinear_train.weight", "shape": [ 102, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 28328960 }, { "name": "transformer.h.24.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 28851200 } ], "md5sum": "61ac7fd74077d33cdae79b6f543ee392" }, { "dataPath": "params_shard_69.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.25.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "77c14e2d1bcf84195dbce8c55c238c1c" }, { "dataPath": "params_shard_70.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.25.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "cecff15a81aebae6fda21529ff207a87" }, { "dataPath": "params_shard_71.bin", "format": "raw-shard", "nbytes": 32819200, "records": [ { "name": "transformer.h.24.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.25.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13107200 }, { "name": "transformer.h.25.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13112320 }, { "name": "transformer.h.25.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 13117440 }, { "name": "transformer.h.25.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13137920 }, { "name": "transformer.h.25.mixer.out_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13143040 }, { "name": "transformer.h.25.mixer.out_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 13148160 }, { "name": "transformer.h.25.mixer.k_proj.ALinear_no_train.weight", "shape": [ 2560, 576 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 2949120, "byteOffset": 26255360 }, { "name": "transformer.h.25.mixer.k_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 29204480 }, { "name": "transformer.h.25.mixer.k_proj.ALinear_train.weight", "shape": [ 2560, 64 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 327680, "byteOffset": 29209600 }, { "name": "transformer.h.25.mixer.k_proj.BLinear_no_train.weight", "shape": [ 576, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 2949120, "byteOffset": 29537280 }, { "name": "transformer.h.25.mixer.k_proj.BLinear_train.weight", "shape": [ 64, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 327680, "byteOffset": 32486400 }, { "name": "transformer.h.25.mixer.q_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 32814080 } ], "md5sum": "e7a995d08f569c539080849e321a702b" }, { "dataPath": "params_shard_72.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.26.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "e37e822d71da0b3c9647f75f45d5378b" }, { "dataPath": "params_shard_73.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.26.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "31428edbd9be52122af809279c99067b" }, { "dataPath": "params_shard_74.bin", "format": "raw-shard", "nbytes": 26260480, "records": [ { "name": "transformer.h.25.mixer.q_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.25.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13107200 }, { "name": "transformer.h.25.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 13112320 }, { "name": "transformer.h.26.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 26219520 }, { "name": "transformer.h.26.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 26224640 }, { "name": "transformer.h.26.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 26229760 }, { "name": "transformer.h.26.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 26250240 }, { "name": "transformer.h.26.mixer.out_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 26255360 } ], "md5sum": "2acddb9977a7383f9801414fdea976f8" }, { "dataPath": "params_shard_75.bin", "format": "raw-shard", "nbytes": 29501440, "records": [ { "name": "transformer.h.26.mixer.out_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.26.mixer.k_proj.ALinear_no_train.weight", "shape": [ 2560, 922 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 13107200 }, { "name": "transformer.h.26.mixer.k_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 17827840 }, { "name": "transformer.h.26.mixer.k_proj.ALinear_train.weight", "shape": [ 2560, 102 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 17832960 }, { "name": "transformer.h.26.mixer.k_proj.BLinear_no_train.weight", "shape": [ 922, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 18355200 }, { "name": "transformer.h.26.mixer.k_proj.BLinear_train.weight", "shape": [ 102, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 23075840 }, { "name": "transformer.h.26.mixer.q_proj.ALinear_no_train.weight", "shape": [ 2560, 1037 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 23598080 }, { "name": "transformer.h.26.mixer.q_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 28907520 }, { "name": "transformer.h.26.mixer.q_proj.ALinear_train.weight", "shape": [ 2560, 115 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 28912640 } ], "md5sum": "6b8d9ff36734dfbaf098ed04e0a584dd" }, { "dataPath": "params_shard_76.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.27.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "654cec140a09b4d03b4b900683ed79f5" }, { "dataPath": "params_shard_77.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.27.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "92813d9596ae69b4646502a8e8278a63" }, { "dataPath": "params_shard_78.bin", "format": "raw-shard", "nbytes": 31508480, "records": [ { "name": "transformer.h.26.mixer.q_proj.BLinear_no_train.weight", "shape": [ 1037, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 0 }, { "name": "transformer.h.26.mixer.q_proj.BLinear_train.weight", "shape": [ 115, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 5309440 }, { "name": "transformer.h.26.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 5898240 }, { "name": "transformer.h.26.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 5903360 }, { "name": "transformer.h.27.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 19010560 }, { "name": "transformer.h.27.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 19015680 }, { "name": "transformer.h.27.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 19020800 }, { "name": "transformer.h.27.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 19041280 }, { "name": "transformer.h.27.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 692 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 3543040, "byteOffset": 19046400 }, { "name": "transformer.h.27.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 22589440 }, { "name": "transformer.h.27.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 76 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 389120, "byteOffset": 22594560 }, { "name": "transformer.h.27.mixer.out_proj.BLinear_no_train.weight", "shape": [ 692, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 3543040, "byteOffset": 22983680 }, { "name": "transformer.h.27.mixer.out_proj.BLinear_train.weight", "shape": [ 76, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 389120, "byteOffset": 26526720 }, { "name": "transformer.h.27.mixer.k_proj.ALinear_no_train.weight", "shape": [ 2560, 807 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 26915840 }, { "name": "transformer.h.27.mixer.k_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 31047680 }, { "name": "transformer.h.27.mixer.k_proj.ALinear_train.weight", "shape": [ 2560, 89 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 31052800 } ], "md5sum": "21476d133619e626f207b73d5e21f15a" }, { "dataPath": "params_shard_79.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.28.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "e9a791765621207ec101cc3f72313d05" }, { "dataPath": "params_shard_80.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.28.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "3b06206408ab105d2149fa75e5788da6" }, { "dataPath": "params_shard_81.bin", "format": "raw-shard", "nbytes": 30848000, "records": [ { "name": "transformer.h.27.mixer.k_proj.BLinear_no_train.weight", "shape": [ 807, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 0 }, { "name": "transformer.h.27.mixer.k_proj.BLinear_train.weight", "shape": [ 89, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 4131840 }, { "name": "transformer.h.27.mixer.q_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 4587520 }, { "name": "transformer.h.27.mixer.q_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 4592640 }, { "name": "transformer.h.27.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 17699840 }, { "name": "transformer.h.27.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 17704960 }, { "name": "transformer.h.28.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 30812160 }, { "name": "transformer.h.28.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 30817280 }, { "name": "transformer.h.28.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 30822400 }, { "name": "transformer.h.28.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 30842880 } ], "md5sum": "f61e4cd8e1bad99bde00e5d1c366c2f3" }, { "dataPath": "params_shard_82.bin", "format": "raw-shard", "nbytes": 30167040, "records": [ { "name": "transformer.h.28.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 576 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 2949120, "byteOffset": 0 }, { "name": "transformer.h.28.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 2949120 }, { "name": "transformer.h.28.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 64 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 327680, "byteOffset": 2954240 }, { "name": "transformer.h.28.mixer.out_proj.BLinear_no_train.weight", "shape": [ 576, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 2949120, "byteOffset": 3281920 }, { "name": "transformer.h.28.mixer.out_proj.BLinear_train.weight", "shape": [ 64, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 327680, "byteOffset": 6231040 }, { "name": "transformer.h.28.mixer.k_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 6558720 }, { "name": "transformer.h.28.mixer.k_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 6563840 }, { "name": "transformer.h.28.mixer.q_proj.ALinear_no_train.weight", "shape": [ 2560, 922 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 19671040 }, { "name": "transformer.h.28.mixer.q_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 24391680 }, { "name": "transformer.h.28.mixer.q_proj.ALinear_train.weight", "shape": [ 2560, 102 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 24396800 }, { "name": "transformer.h.28.mixer.q_proj.BLinear_no_train.weight", "shape": [ 922, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 24919040 }, { "name": "transformer.h.28.mixer.q_proj.BLinear_train.weight", "shape": [ 102, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 29639680 }, { "name": "transformer.h.28.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 30161920 } ], "md5sum": "de07b27f305ce737f302f49e0b0ca0eb" }, { "dataPath": "params_shard_83.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.29.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "38dac7cd2df307b5d63a3a0ab30fef9d" }, { "dataPath": "params_shard_84.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.29.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "ae0497c5ba02234338e9899265223988" }, { "dataPath": "params_shard_85.bin", "format": "raw-shard", "nbytes": 26260480, "records": [ { "name": "transformer.h.28.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.29.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13107200 }, { "name": "transformer.h.29.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13112320 }, { "name": "transformer.h.29.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 13117440 }, { "name": "transformer.h.29.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13137920 }, { "name": "transformer.h.29.mixer.out_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13143040 }, { "name": "transformer.h.29.mixer.out_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 13148160 }, { "name": "transformer.h.29.mixer.k_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 26255360 } ], "md5sum": "8c18818510fc243e86a6a37e6e1d68f1" }, { "dataPath": "params_shard_86.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.3.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "cac5203ed7b66fb7e79d7618bf161343" }, { "dataPath": "params_shard_87.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.3.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "0b29720a14330b72b89e5ebca57a112e" }, { "dataPath": "params_shard_88.bin", "format": "raw-shard", "nbytes": 32814080, "records": [ { "name": "transformer.h.29.mixer.k_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.29.mixer.q_proj.ALinear_no_train.weight", "shape": [ 2560, 922 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 13107200 }, { "name": "transformer.h.29.mixer.q_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 17827840 }, { "name": "transformer.h.29.mixer.q_proj.ALinear_train.weight", "shape": [ 2560, 102 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 17832960 }, { "name": "transformer.h.29.mixer.q_proj.BLinear_no_train.weight", "shape": [ 922, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 18355200 }, { "name": "transformer.h.29.mixer.q_proj.BLinear_train.weight", "shape": [ 102, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 23075840 }, { "name": "transformer.h.29.mixer.v_proj.ALinear_no_train.weight", "shape": [ 2560, 807 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 23598080 }, { "name": "transformer.h.29.mixer.v_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 27729920 }, { "name": "transformer.h.29.mixer.v_proj.ALinear_train.weight", "shape": [ 2560, 89 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 27735040 }, { "name": "transformer.h.29.mixer.v_proj.BLinear_no_train.weight", "shape": [ 807, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 28190720 }, { "name": "transformer.h.29.mixer.v_proj.BLinear_train.weight", "shape": [ 89, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 32322560 }, { "name": "transformer.h.3.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 32778240 }, { "name": "transformer.h.3.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 32783360 }, { "name": "transformer.h.3.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 32788480 }, { "name": "transformer.h.3.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 32808960 } ], "md5sum": "858f6cdb289476b18109472eeed6fdcd" }, { "dataPath": "params_shard_89.bin", "format": "raw-shard", "nbytes": 22297600, "records": [ { "name": "transformer.h.3.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 922 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 0 }, { "name": "transformer.h.3.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 4720640 }, { "name": "transformer.h.3.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 102 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 4725760 }, { "name": "transformer.h.3.mixer.out_proj.BLinear_no_train.weight", "shape": [ 922, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 5248000 }, { "name": "transformer.h.3.mixer.out_proj.BLinear_train.weight", "shape": [ 102, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 9968640 }, { "name": "transformer.h.3.mixer.k_proj.ALinear_no_train.weight", "shape": [ 2560, 1037 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 10490880 }, { "name": "transformer.h.3.mixer.k_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 15800320 }, { "name": "transformer.h.3.mixer.k_proj.ALinear_train.weight", "shape": [ 2560, 115 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 15805440 }, { "name": "transformer.h.3.mixer.k_proj.BLinear_no_train.weight", "shape": [ 1037, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 16394240 }, { "name": "transformer.h.3.mixer.k_proj.BLinear_train.weight", "shape": [ 115, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 21703680 }, { "name": "transformer.h.3.mixer.q_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 22292480 } ], "md5sum": "6aa435d753ba0eae67cfa748fad95ad5" }, { "dataPath": "params_shard_90.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.30.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "9121523046400b7f749783a40a8d867e" }, { "dataPath": "params_shard_91.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.30.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "bb8374b2be9120a75380d3e17e7578f1" }, { "dataPath": "params_shard_92.bin", "format": "raw-shard", "nbytes": 31503360, "records": [ { "name": "transformer.h.3.mixer.q_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.3.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13107200 }, { "name": "transformer.h.3.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 13112320 }, { "name": "transformer.h.30.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 26219520 }, { "name": "transformer.h.30.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 26224640 }, { "name": "transformer.h.30.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 26229760 }, { "name": "transformer.h.30.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 26250240 }, { "name": "transformer.h.30.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 922 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 26255360 }, { "name": "transformer.h.30.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 30976000 }, { "name": "transformer.h.30.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 102 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 30981120 } ], "md5sum": "73d0c912699142302be99fee569a18d4" }, { "dataPath": "params_shard_93.bin", "format": "raw-shard", "nbytes": 27540480, "records": [ { "name": "transformer.h.30.mixer.out_proj.BLinear_no_train.weight", "shape": [ 922, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 0 }, { "name": "transformer.h.30.mixer.out_proj.BLinear_train.weight", "shape": [ 102, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 4720640 }, { "name": "transformer.h.30.mixer.k_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 5242880 }, { "name": "transformer.h.30.mixer.k_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 5248000 }, { "name": "transformer.h.30.mixer.q_proj.ALinear_no_train.weight", "shape": [ 2560, 807 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 18355200 }, { "name": "transformer.h.30.mixer.q_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 22487040 }, { "name": "transformer.h.30.mixer.q_proj.ALinear_train.weight", "shape": [ 2560, 89 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 22492160 }, { "name": "transformer.h.30.mixer.q_proj.BLinear_no_train.weight", "shape": [ 807, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 22947840 }, { "name": "transformer.h.30.mixer.q_proj.BLinear_train.weight", "shape": [ 89, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 27079680 }, { "name": "transformer.h.30.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 27535360 } ], "md5sum": "395e4f69cd3d216e3cd6a5a8a0559e18" }, { "dataPath": "params_shard_94.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.31.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "de231243524d89036342e83d0347758c" }, { "dataPath": "params_shard_95.bin", "format": "raw-shard", "nbytes": 24934400, "records": [ { "name": "transformer.h.30.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.31.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 13107200 }, { "name": "transformer.h.31.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 1037 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 13127680 }, { "name": "transformer.h.31.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 18437120 }, { "name": "transformer.h.31.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 115 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 18442240 }, { "name": "transformer.h.31.mixer.out_proj.BLinear_no_train.weight", "shape": [ 1037, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 19031040 }, { "name": "transformer.h.31.mixer.out_proj.BLinear_train.weight", "shape": [ 115, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 24340480 }, { "name": "transformer.h.31.mixer.k_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 24929280 } ], "md5sum": "ea6384871506f735dcca09d60f8ae0c0" }, { "dataPath": "params_shard_96.bin", "format": "raw-shard", "nbytes": 24913920, "records": [ { "name": "transformer.h.31.mixer.k_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.31.mixer.q_proj.ALinear_no_train.weight", "shape": [ 2560, 1037 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 13107200 }, { "name": "transformer.h.31.mixer.q_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 18416640 }, { "name": "transformer.h.31.mixer.q_proj.ALinear_train.weight", "shape": [ 2560, 115 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 18421760 }, { "name": "transformer.h.31.mixer.q_proj.BLinear_no_train.weight", "shape": [ 1037, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 19010560 }, { "name": "transformer.h.31.mixer.q_proj.BLinear_train.weight", "shape": [ 115, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 24320000 }, { "name": "transformer.h.31.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 24908800 } ], "md5sum": "5780d81eb7d09b140353e083c5c86325" }, { "dataPath": "params_shard_97.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.4.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "f5cab0880d2097e0b90ed9f53f5aefb9" }, { "dataPath": "params_shard_98.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.4.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "99f670d719b3b0bf8423c4c9d7af412c" }, { "dataPath": "params_shard_99.bin", "format": "raw-shard", "nbytes": 33536000, "records": [ { "name": "transformer.h.31.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.4.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13107200 }, { "name": "transformer.h.4.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13112320 }, { "name": "transformer.h.4.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 13117440 }, { "name": "transformer.h.4.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13137920 }, { "name": "transformer.h.4.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 807 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 13143040 }, { "name": "transformer.h.4.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 17274880 }, { "name": "transformer.h.4.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 89 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 17280000 }, { "name": "transformer.h.4.mixer.out_proj.BLinear_no_train.weight", "shape": [ 807, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 17735680 }, { "name": "transformer.h.4.mixer.out_proj.BLinear_train.weight", "shape": [ 89, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 21867520 }, { "name": "transformer.h.4.mixer.k_proj.ALinear_no_train.weight", "shape": [ 2560, 1037 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 22323200 }, { "name": "transformer.h.4.mixer.k_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 27632640 }, { "name": "transformer.h.4.mixer.k_proj.ALinear_train.weight", "shape": [ 2560, 115 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 27637760 }, { "name": "transformer.h.4.mixer.k_proj.BLinear_no_train.weight", "shape": [ 1037, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 28226560 } ], "md5sum": "256ed9d9925e529e0076fc8f2ff3822f" }, { "dataPath": "params_shard_100.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.5.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "c8cff3224262fb07aa1fbb2fa931a40b" }, { "dataPath": "params_shard_101.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.5.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "2b7c26aebe892da166219b743612dace" }, { "dataPath": "params_shard_102.bin", "format": "raw-shard", "nbytes": 21611520, "records": [ { "name": "transformer.h.4.mixer.k_proj.BLinear_train.weight", "shape": [ 115, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 0 }, { "name": "transformer.h.4.mixer.q_proj.ALinear_no_train.weight", "shape": [ 2560, 692 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 3543040, "byteOffset": 588800 }, { "name": "transformer.h.4.mixer.q_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 4131840 }, { "name": "transformer.h.4.mixer.q_proj.ALinear_train.weight", "shape": [ 2560, 76 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 389120, "byteOffset": 4136960 }, { "name": "transformer.h.4.mixer.q_proj.BLinear_no_train.weight", "shape": [ 692, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 3543040, "byteOffset": 4526080 }, { "name": "transformer.h.4.mixer.q_proj.BLinear_train.weight", "shape": [ 76, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 389120, "byteOffset": 8069120 }, { "name": "transformer.h.4.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 8458240 }, { "name": "transformer.h.4.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 8463360 }, { "name": "transformer.h.5.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 21570560 }, { "name": "transformer.h.5.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 21575680 }, { "name": "transformer.h.5.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 21580800 }, { "name": "transformer.h.5.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 21601280 }, { "name": "transformer.h.5.mixer.out_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 21606400 } ], "md5sum": "e6df25a60f23c043f9a93be37f0efbb6" }, { "dataPath": "params_shard_103.bin", "format": "raw-shard", "nbytes": 29501440, "records": [ { "name": "transformer.h.5.mixer.out_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.5.mixer.k_proj.ALinear_no_train.weight", "shape": [ 2560, 1037 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 13107200 }, { "name": "transformer.h.5.mixer.k_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 18416640 }, { "name": "transformer.h.5.mixer.k_proj.ALinear_train.weight", "shape": [ 2560, 115 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 18421760 }, { "name": "transformer.h.5.mixer.k_proj.BLinear_no_train.weight", "shape": [ 1037, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 19010560 }, { "name": "transformer.h.5.mixer.k_proj.BLinear_train.weight", "shape": [ 115, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 24320000 }, { "name": "transformer.h.5.mixer.q_proj.ALinear_no_train.weight", "shape": [ 2560, 807 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 24908800 }, { "name": "transformer.h.5.mixer.q_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 29040640 }, { "name": "transformer.h.5.mixer.q_proj.ALinear_train.weight", "shape": [ 2560, 89 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 29045760 } ], "md5sum": "bbca7527ec904abf35d72381886e11f5" }, { "dataPath": "params_shard_104.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.6.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "87d663ba9aec39a2d410e4d38c35e80b" }, { "dataPath": "params_shard_105.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.6.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "0982b8b5c06163cc1d33e29eb2c57402" }, { "dataPath": "params_shard_106.bin", "format": "raw-shard", "nbytes": 26920960, "records": [ { "name": "transformer.h.5.mixer.q_proj.BLinear_no_train.weight", "shape": [ 807, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 0 }, { "name": "transformer.h.5.mixer.q_proj.BLinear_train.weight", "shape": [ 89, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 4131840 }, { "name": "transformer.h.5.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 4587520 }, { "name": "transformer.h.5.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 4592640 }, { "name": "transformer.h.6.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 17699840 }, { "name": "transformer.h.6.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 17704960 }, { "name": "transformer.h.6.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 17710080 }, { "name": "transformer.h.6.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 17730560 }, { "name": "transformer.h.6.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 807 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 17735680 }, { "name": "transformer.h.6.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 21867520 }, { "name": "transformer.h.6.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 89 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 21872640 }, { "name": "transformer.h.6.mixer.out_proj.BLinear_no_train.weight", "shape": [ 807, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4131840, "byteOffset": 22328320 }, { "name": "transformer.h.6.mixer.out_proj.BLinear_train.weight", "shape": [ 89, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 455680, "byteOffset": 26460160 }, { "name": "transformer.h.6.mixer.k_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 26915840 } ], "md5sum": "94114fe87e95cccb70e58c948b850e52" }, { "dataPath": "params_shard_107.bin", "format": "raw-shard", "nbytes": 23603200, "records": [ { "name": "transformer.h.6.mixer.k_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.6.mixer.q_proj.ALinear_no_train.weight", "shape": [ 2560, 922 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 13107200 }, { "name": "transformer.h.6.mixer.q_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 17827840 }, { "name": "transformer.h.6.mixer.q_proj.ALinear_train.weight", "shape": [ 2560, 102 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 17832960 }, { "name": "transformer.h.6.mixer.q_proj.BLinear_no_train.weight", "shape": [ 922, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 4720640, "byteOffset": 18355200 }, { "name": "transformer.h.6.mixer.q_proj.BLinear_train.weight", "shape": [ 102, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 522240, "byteOffset": 23075840 }, { "name": "transformer.h.6.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 23598080 } ], "md5sum": "6a3aab23ffc02d0cded6f701bebe75b3" }, { "dataPath": "params_shard_108.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.7.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "750991e45724611bc14c9edb33edd031" }, { "dataPath": "params_shard_109.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.7.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "d8fba4940415edc3eae9610b2ac174f3" }, { "dataPath": "params_shard_110.bin", "format": "raw-shard", "nbytes": 26260480, "records": [ { "name": "transformer.h.6.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.7.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13107200 }, { "name": "transformer.h.7.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13112320 }, { "name": "transformer.h.7.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 13117440 }, { "name": "transformer.h.7.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13137920 }, { "name": "transformer.h.7.mixer.out_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13143040 }, { "name": "transformer.h.7.mixer.out_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 13148160 }, { "name": "transformer.h.7.mixer.k_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 26255360 } ], "md5sum": "3b3f59efedf3174722284d55cfbe6dfa" }, { "dataPath": "params_shard_111.bin", "format": "raw-shard", "nbytes": 20981760, "records": [ { "name": "transformer.h.7.mixer.k_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.7.mixer.q_proj.ALinear_no_train.weight", "shape": [ 2560, 692 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 3543040, "byteOffset": 13107200 }, { "name": "transformer.h.7.mixer.q_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 16650240 }, { "name": "transformer.h.7.mixer.q_proj.ALinear_train.weight", "shape": [ 2560, 76 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 389120, "byteOffset": 16655360 }, { "name": "transformer.h.7.mixer.q_proj.BLinear_no_train.weight", "shape": [ 692, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 3543040, "byteOffset": 17044480 }, { "name": "transformer.h.7.mixer.q_proj.BLinear_train.weight", "shape": [ 76, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 389120, "byteOffset": 20587520 }, { "name": "transformer.h.7.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 20976640 } ], "md5sum": "7c86efd6822ce440620d35fac8373dc9" }, { "dataPath": "params_shard_112.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.8.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "a4b90468889cd5ba3155da3ba5ad2737" }, { "dataPath": "params_shard_113.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.8.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "d85bf14e49d31e007d8869a942f6678e" }, { "dataPath": "params_shard_114.bin", "format": "raw-shard", "nbytes": 32814080, "records": [ { "name": "transformer.h.7.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.8.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13107200 }, { "name": "transformer.h.8.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13112320 }, { "name": "transformer.h.8.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 13117440 }, { "name": "transformer.h.8.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 13137920 }, { "name": "transformer.h.8.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 1037 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 13143040 }, { "name": "transformer.h.8.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 18452480 }, { "name": "transformer.h.8.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 115 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 18457600 }, { "name": "transformer.h.8.mixer.out_proj.BLinear_no_train.weight", "shape": [ 1037, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 19046400 }, { "name": "transformer.h.8.mixer.out_proj.BLinear_train.weight", "shape": [ 115, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 24355840 }, { "name": "transformer.h.8.mixer.k_proj.ALinear_no_train.weight", "shape": [ 2560, 692 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 3543040, "byteOffset": 24944640 }, { "name": "transformer.h.8.mixer.k_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 28487680 }, { "name": "transformer.h.8.mixer.k_proj.ALinear_train.weight", "shape": [ 2560, 76 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 389120, "byteOffset": 28492800 }, { "name": "transformer.h.8.mixer.k_proj.BLinear_no_train.weight", "shape": [ 692, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 3543040, "byteOffset": 28881920 }, { "name": "transformer.h.8.mixer.k_proj.BLinear_train.weight", "shape": [ 76, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 389120, "byteOffset": 32424960 } ], "md5sum": "cd783f7f599074c6197dc8298d607fbb" }, { "dataPath": "params_shard_115.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.9.mlp.fc1.weight", "shape": [ 10240, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "06cd12611401d515dbf03552fbf52fa7" }, { "dataPath": "params_shard_116.bin", "format": "raw-shard", "nbytes": 52428800, "records": [ { "name": "transformer.h.9.mlp.fc2.weight", "shape": [ 2560, 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 52428800, "byteOffset": 0 } ], "md5sum": "c3c618745bb9db6d40c2b1700d9f4f5f" }, { "dataPath": "params_shard_117.bin", "format": "raw-shard", "nbytes": 32824320, "records": [ { "name": "transformer.h.8.mixer.q_proj.ALinear_no_train.weight", "shape": [ 2560, 1037 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 0 }, { "name": "transformer.h.8.mixer.q_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 5309440 }, { "name": "transformer.h.8.mixer.q_proj.ALinear_train.weight", "shape": [ 2560, 115 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 5314560 }, { "name": "transformer.h.8.mixer.q_proj.BLinear_no_train.weight", "shape": [ 1037, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5309440, "byteOffset": 5903360 }, { "name": "transformer.h.8.mixer.q_proj.BLinear_train.weight", "shape": [ 115, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 588800, "byteOffset": 11212800 }, { "name": "transformer.h.8.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 11801600 }, { "name": "transformer.h.8.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 11806720 }, { "name": "transformer.h.9.ln.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 24913920 }, { "name": "transformer.h.9.ln.weight", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 24919040 }, { "name": "transformer.h.9.mlp.fc1.bias", "shape": [ 10240 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 20480, "byteOffset": 24924160 }, { "name": "transformer.h.9.mlp.fc2.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 24944640 }, { "name": "transformer.h.9.mixer.out_proj.ALinear_no_train.weight", "shape": [ 2560, 692 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 3543040, "byteOffset": 24949760 }, { "name": "transformer.h.9.mixer.out_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 28492800 }, { "name": "transformer.h.9.mixer.out_proj.ALinear_train.weight", "shape": [ 2560, 76 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 389120, "byteOffset": 28497920 }, { "name": "transformer.h.9.mixer.out_proj.BLinear_no_train.weight", "shape": [ 692, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 3543040, "byteOffset": 28887040 }, { "name": "transformer.h.9.mixer.out_proj.BLinear_train.weight", "shape": [ 76, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 389120, "byteOffset": 32430080 }, { "name": "transformer.h.9.mixer.k_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 32819200 } ], "md5sum": "1ba45f5b921d686389031928f1fcf2af" }, { "dataPath": "params_shard_118.bin", "format": "raw-shard", "nbytes": 32778240, "records": [ { "name": "transformer.h.9.mixer.k_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 0 }, { "name": "transformer.h.9.mixer.q_proj.ALinear_no_train.weight", "shape": [ 2560, 576 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 2949120, "byteOffset": 13107200 }, { "name": "transformer.h.9.mixer.q_proj.ALinear_train.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 16056320 }, { "name": "transformer.h.9.mixer.q_proj.ALinear_train.weight", "shape": [ 2560, 64 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 327680, "byteOffset": 16061440 }, { "name": "transformer.h.9.mixer.q_proj.BLinear_no_train.weight", "shape": [ 576, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 2949120, "byteOffset": 16389120 }, { "name": "transformer.h.9.mixer.q_proj.BLinear_train.weight", "shape": [ 64, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 327680, "byteOffset": 19338240 }, { "name": "transformer.h.9.mixer.v_proj.bias", "shape": [ 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 5120, "byteOffset": 19665920 }, { "name": "transformer.h.9.mixer.v_proj.weight", "shape": [ 2560, 2560 ], "dtype": "float16", "format": "f32-to-bf16", "nbytes": 13107200, "byteOffset": 19671040 } ], "md5sum": "d86a494a4ae8a7f39630433bbc9cce78" } ] }