jan-ai committed on
Commit
3519414
1 Parent(s): 3134912

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
- library_name: exllama2
3
- license: llama3.1
4
  ---
5
 
6
  # Model Card for Model ID
 
1
  ---
2
+ library_name: transformers
3
+ tags: []
4
  ---
5
 
6
  # Model Card for Model ID
config.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "llama3-s-instruct-v0.3-checkpoint-7000-phase-3/",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 128000,
9
+ "eos_token_id": [
10
+ 128001,
11
+ 128008,
12
+ 128009
13
+ ],
14
+ "hidden_act": "silu",
15
+ "hidden_size": 4096,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 14336,
18
+ "max_position_embeddings": 131072,
19
+ "mlp_bias": false,
20
+ "model_type": "llama",
21
+ "num_attention_heads": 32,
22
+ "num_hidden_layers": 32,
23
+ "num_key_value_heads": 8,
24
+ "pretraining_tp": 1,
25
+ "rms_norm_eps": 1e-05,
26
+ "rope_scaling": {
27
+ "factor": 8.0,
28
+ "high_freq_factor": 4.0,
29
+ "low_freq_factor": 1.0,
30
+ "original_max_position_embeddings": 8192,
31
+ "rope_type": "llama3"
32
+ },
33
+ "rope_theta": 500000.0,
34
+ "tie_word_embeddings": false,
35
+ "torch_dtype": "bfloat16",
36
+ "transformers_version": "4.44.2",
37
+ "use_cache": true,
38
+ "vocab_size": 128771,
39
+ "quantization_config": {
40
+ "quant_method": "exl2",
41
+ "version": "0.2.2",
42
+ "bits": 8.2,
43
+ "head_bits": 6,
44
+ "calibration": {
45
+ "rows": 115,
46
+ "length": 2048,
47
+ "dataset": "(default)"
48
+ }
49
+ }
50
+ }
generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 128000,
4
+ "eos_token_id": [
5
+ 128001,
6
+ 128008,
7
+ 128009
8
+ ],
9
+ "transformers_version": "4.44.2"
10
+ }
loss_log.txt ADDED
@@ -0,0 +1,644 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Step 1 | loss:1.0854802131652832 lr:1.875e-06 tokens_per_second_per_gpu:1168.8574294071448
2
+ Step 2 | loss:1.1429901123046875 lr:3.75e-06 tokens_per_second_per_gpu:3843.7632889854767
3
+ Step 3 | loss:1.2053295373916626 lr:5.625e-06 tokens_per_second_per_gpu:3842.159382826307
4
+ Step 4 | loss:1.079384446144104 lr:7.5e-06 tokens_per_second_per_gpu:3527.3851566722074
5
+ Step 5 | loss:1.2358113527297974 lr:9.375000000000001e-06 tokens_per_second_per_gpu:3297.665429778813
6
+ Step 6 | loss:1.0729756355285645 lr:1.125e-05 tokens_per_second_per_gpu:3513.7372696945217
7
+ Step 7 | loss:0.9907684326171875 lr:1.3125e-05 tokens_per_second_per_gpu:3930.22637976525
8
+ Step 8 | loss:1.143568515777588 lr:1.5e-05 tokens_per_second_per_gpu:2876.219047245818
9
+ Step 9 | loss:1.2526814937591553 lr:1.4999908501094575e-05 tokens_per_second_per_gpu:3772.480582082779
10
+ Step 10 | loss:1.1926498413085938 lr:1.4999634006610844e-05 tokens_per_second_per_gpu:3787.9663127264835
11
+ Step 11 | loss:1.1863847970962524 lr:1.4999176523246392e-05 tokens_per_second_per_gpu:2879.4265754703606
12
+ Step 12 | loss:1.0382694005966187 lr:1.4998536062163683e-05 tokens_per_second_per_gpu:3367.342512712612
13
+ Step 13 | loss:1.0486845970153809 lr:1.4997712638989775e-05 tokens_per_second_per_gpu:3973.1834061223685
14
+ Step 14 | loss:1.0994930267333984 lr:1.499670627381596e-05 tokens_per_second_per_gpu:3882.9148296055278
15
+ Step 15 | loss:1.0168027877807617 lr:1.4995516991197246e-05 tokens_per_second_per_gpu:3710.4998349202683
16
+ Step 16 | loss:1.0419063568115234 lr:1.4994144820151789e-05 tokens_per_second_per_gpu:2869.5305816393893
17
+ Step 17 | loss:1.065361738204956 lr:1.4992589794160159e-05 tokens_per_second_per_gpu:3181.270854761355
18
+ Step 18 | loss:1.1389403343200684 lr:1.4990851951164537e-05 tokens_per_second_per_gpu:2980.579149781292
19
+ Step 19 | loss:1.1638612747192383 lr:1.4988931333567785e-05 tokens_per_second_per_gpu:3509.8801697983517
20
+ Step 20 | loss:1.0619863271713257 lr:1.4986827988232412e-05 tokens_per_second_per_gpu:3703.804663132431
21
+ Step 21 | loss:1.1482210159301758 lr:1.4984541966479427e-05 tokens_per_second_per_gpu:3560.136617406801
22
+ Step 22 | loss:1.0800286531448364 lr:1.4982073324087097e-05 tokens_per_second_per_gpu:3648.6607162752957
23
+ Step 23 | loss:1.0815565586090088 lr:1.4979422121289576e-05 tokens_per_second_per_gpu:3211.609323443709
24
+ Step 24 | loss:1.0016083717346191 lr:1.4976588422775437e-05 tokens_per_second_per_gpu:3403.8935665137487
25
+ Step 25 | loss:1.054980993270874 lr:1.4973572297686097e-05 tokens_per_second_per_gpu:3733.187288468551
26
+ Step 26 | loss:1.0594943761825562 lr:1.4970373819614128e-05 tokens_per_second_per_gpu:3380.227950545309
27
+ Step 27 | loss:1.1039057970046997 lr:1.4966993066601459e-05 tokens_per_second_per_gpu:3416.5838997236365
28
+ Step 28 | loss:1.0442756414413452 lr:1.4963430121137482e-05 tokens_per_second_per_gpu:3232.857948752556
29
+ Step 29 | loss:1.3250808715820312 lr:1.495968507015702e-05 tokens_per_second_per_gpu:2739.8843861816085
30
+ Step 30 | loss:1.0642691850662231 lr:1.495575800503823e-05 tokens_per_second_per_gpu:3270.2234151530183
31
+ Step 31 | loss:1.0556732416152954 lr:1.4951649021600348e-05 tokens_per_second_per_gpu:3280.5378111148802
32
+ Step 32 | loss:1.2651450634002686 lr:1.4947358220101378e-05 tokens_per_second_per_gpu:3675.5567883584245
33
+ Step 33 | loss:0.9953560829162598 lr:1.4942885705235616e-05 tokens_per_second_per_gpu:3957.394946057751
34
+ Step 34 | loss:0.9306876063346863 lr:1.493823158613113e-05 tokens_per_second_per_gpu:2993.4562512796037
35
+ Step 35 | loss:0.9759434461593628 lr:1.4933395976347056e-05 tokens_per_second_per_gpu:3688.3981271180473
36
+ Step 36 | loss:1.042667031288147 lr:1.4928378993870868e-05 tokens_per_second_per_gpu:2739.611697714639
37
+ Step 37 | loss:1.1176235675811768 lr:1.4923180761115471e-05 tokens_per_second_per_gpu:3389.9575685437558
38
+ Step 38 | loss:1.0304946899414062 lr:1.491780140491623e-05 tokens_per_second_per_gpu:2751.426529078761
39
+ Step 39 | loss:1.1554368734359741 lr:1.4912241056527865e-05 tokens_per_second_per_gpu:3678.512086747305
40
+ Step 40 | loss:1.1235543489456177 lr:1.4906499851621251e-05 tokens_per_second_per_gpu:3354.288583814501
41
+ Step 41 | loss:1.1204215288162231 lr:1.4900577930280117e-05 tokens_per_second_per_gpu:2646.7652669210534
42
+ Step 42 | loss:1.159371256828308 lr:1.489447543699761e-05 tokens_per_second_per_gpu:3113.893638504558
43
+ Step 43 | loss:0.9518178701400757 lr:1.488819252067279e-05 tokens_per_second_per_gpu:3624.505087259554
44
+ Step 44 | loss:1.000114917755127 lr:1.4881729334606977e-05 tokens_per_second_per_gpu:3995.207620964973
45
+ Step 45 | loss:1.0402061939239502 lr:1.4875086036500025e-05 tokens_per_second_per_gpu:3694.213698196683
46
+ Step 46 | loss:1.1420111656188965 lr:1.4868262788446472e-05 tokens_per_second_per_gpu:3655.494511670644
47
+ Step 47 | loss:1.0602643489837646 lr:1.4861259756931577e-05 tokens_per_second_per_gpu:3402.054859100221
48
+ Step 48 | loss:0.9862946271896362 lr:1.4854077112827263e-05 tokens_per_second_per_gpu:3261.120378766765
49
+ Step 49 | loss:1.0737448930740356 lr:1.4846715031387952e-05 tokens_per_second_per_gpu:3560.4513581368537
50
+ Step 50 | loss:1.0223171710968018 lr:1.483917369224628e-05 tokens_per_second_per_gpu:4119.596935775298
51
+ Step 51 | loss:0.9688004851341248 lr:1.4831453279408724e-05 tokens_per_second_per_gpu:3727.7426885377727
52
+ Step 52 | loss:1.2084407806396484 lr:1.4823553981251103e-05 tokens_per_second_per_gpu:3475.565097423413
53
+ Step 53 | loss:1.0782098770141602 lr:1.4815475990513983e-05 tokens_per_second_per_gpu:3116.162844911365
54
+ Step 54 | loss:1.076345682144165 lr:1.4807219504297984e-05 tokens_per_second_per_gpu:3636.4074969842172
55
+ Step 55 | loss:0.9842542409896851 lr:1.4798784724058958e-05 tokens_per_second_per_gpu:3890.3173372257284
56
+ Step 56 | loss:1.0145485401153564 lr:1.4790171855603081e-05 tokens_per_second_per_gpu:3849.2883562880224
57
+ Step 57 | loss:1.0625418424606323 lr:1.4781381109081831e-05 tokens_per_second_per_gpu:3571.1217369942487
58
+ Step 58 | loss:1.0252196788787842 lr:1.4772412698986854e-05 tokens_per_second_per_gpu:3456.6789270442778
59
+ Step 59 | loss:0.9392111897468567 lr:1.4763266844144741e-05 tokens_per_second_per_gpu:3657.3789166667807
60
+ Step 60 | loss:1.0814956426620483 lr:1.4753943767711678e-05 tokens_per_second_per_gpu:3315.163159791015
61
+ Step 61 | loss:1.0520405769348145 lr:1.4744443697168013e-05 tokens_per_second_per_gpu:3701.303137175016
62
+ Step 62 | loss:1.0226781368255615 lr:1.473476686431269e-05 tokens_per_second_per_gpu:3537.4955448330916
63
+ Step 63 | loss:1.0486294031143188 lr:1.4724913505257609e-05 tokens_per_second_per_gpu:3095.7445591899086
64
+ Step 64 | loss:1.0411049127578735 lr:1.4714883860421854e-05 tokens_per_second_per_gpu:3149.287379730578
65
+ Step 65 | loss:1.0797317028045654 lr:1.4704678174525831e-05 tokens_per_second_per_gpu:3445.673531691869
66
+ Step 66 | loss:0.9485515356063843 lr:1.4694296696585298e-05 tokens_per_second_per_gpu:3864.0149006705747
67
+ Step 67 | loss:0.9495678544044495 lr:1.4683739679905284e-05 tokens_per_second_per_gpu:3101.8256009234356
68
+ Step 68 | loss:1.0279844999313354 lr:1.4673007382073919e-05 tokens_per_second_per_gpu:3463.4301964845763
69
+ Step 69 | loss:1.0704870223999023 lr:1.4662100064956132e-05 tokens_per_second_per_gpu:3121.3618232019967
70
+ Step 70 | loss:1.0774767398834229 lr:1.4651017994687282e-05 tokens_per_second_per_gpu:2970.1779475300104
71
+ Step 71 | loss:1.040583848953247 lr:1.4639761441666646e-05 tokens_per_second_per_gpu:3368.9417024963573
72
+ Step 72 | loss:1.1608614921569824 lr:1.4628330680550833e-05 tokens_per_second_per_gpu:3591.1238850356212
73
+ Step 73 | loss:1.1760778427124023 lr:1.4616725990247078e-05 tokens_per_second_per_gpu:3191.835895149416
74
+ Step 74 | loss:1.0865557193756104 lr:1.4604947653906435e-05 tokens_per_second_per_gpu:3856.472486717953
75
+ Step 75 | loss:0.997776985168457 lr:1.4592995958916877e-05 tokens_per_second_per_gpu:4288.807801484956
76
+ Step 76 | loss:1.0847547054290771 lr:1.4580871196896266e-05 tokens_per_second_per_gpu:3642.7905562815736
77
+ Step 77 | loss:1.1006416082382202 lr:1.4568573663685267e-05 tokens_per_second_per_gpu:2950.4712628757084
78
+ Step 78 | loss:1.1318891048431396 lr:1.4556103659340091e-05 tokens_per_second_per_gpu:3420.308972950097
79
+ Step 79 | loss:1.073240041732788 lr:1.4543461488125208e-05 tokens_per_second_per_gpu:3292.630999889894
80
+ Step 80 | loss:0.9821822643280029 lr:1.4530647458505908e-05 tokens_per_second_per_gpu:3845.0406649088623
81
+ Step 81 | loss:1.031591534614563 lr:1.4517661883140769e-05 tokens_per_second_per_gpu:3597.2292553821944
82
+ Step 82 | loss:1.0589147806167603 lr:1.4504505078874041e-05 tokens_per_second_per_gpu:3923.875717440058
83
+ Step 83 | loss:1.0883533954620361 lr:1.449117736672791e-05 tokens_per_second_per_gpu:3328.5666750491487
84
+ Step 84 | loss:0.9703829288482666 lr:1.4477679071894659e-05 tokens_per_second_per_gpu:4210.9057239346785
85
+ Step 85 | loss:1.0380264520645142 lr:1.4464010523728745e-05 tokens_per_second_per_gpu:4028.7523308737022
86
+ Step 86 | loss:1.0424227714538574 lr:1.445017205573875e-05 tokens_per_second_per_gpu:3067.892409170815
87
+ Step 87 | loss:0.9402433037757874 lr:1.4436164005579258e-05 tokens_per_second_per_gpu:3727.9697325163597
88
+ Step 88 | loss:0.9366852641105652 lr:1.4421986715042602e-05 tokens_per_second_per_gpu:3359.9819616780637
89
+ Step 89 | loss:0.9120358824729919 lr:1.4407640530050532e-05 tokens_per_second_per_gpu:3560.9878740376766
90
+ Step 90 | loss:1.0565634965896606 lr:1.4393125800645775e-05 tokens_per_second_per_gpu:3406.1127910038686
91
+ Step 91 | loss:1.0152472257614136 lr:1.4378442880983492e-05 tokens_per_second_per_gpu:3213.8003626321415
92
+ Step 92 | loss:1.1250417232513428 lr:1.4363592129322638e-05 tokens_per_second_per_gpu:3550.8080018136066
93
+ Step 93 | loss:1.1519802808761597 lr:1.4348573908017218e-05 tokens_per_second_per_gpu:3889.4264438590662
94
+ Step 94 | loss:1.0634357929229736 lr:1.4333388583507448e-05 tokens_per_second_per_gpu:3799.4362078472236
95
+ Step 95 | loss:0.9236892461776733 lr:1.4318036526310814e-05 tokens_per_second_per_gpu:3107.0781575328906
96
+ Step 96 | loss:1.1531763076782227 lr:1.4302518111013029e-05 tokens_per_second_per_gpu:3633.8470364745854
97
+ Step 97 | loss:1.1617469787597656 lr:1.4286833716258899e-05 tokens_per_second_per_gpu:3394.014913251338
98
+ Step 98 | loss:1.049908995628357 lr:1.4270983724743077e-05 tokens_per_second_per_gpu:3281.7821637586317
99
+ Step 99 | loss:1.168889045715332 lr:1.425496852320073e-05 tokens_per_second_per_gpu:3536.842535550907
100
+ Step 100 | loss:1.029691457748413 lr:1.42387885023981e-05 tokens_per_second_per_gpu:3351.7619303741485
101
+ Step 101 | loss:0.9969059824943542 lr:1.422244405712297e-05 tokens_per_second_per_gpu:3872.7354730770085
102
+ Step 102 | loss:1.0703043937683105 lr:1.420593558617504e-05 tokens_per_second_per_gpu:3133.1573043410367
103
+ Step 103 | loss:0.9668957591056824 lr:1.4189263492356176e-05 tokens_per_second_per_gpu:4133.178776143774
104
+ Step 104 | loss:1.1153146028518677 lr:1.4172428182460605e-05 tokens_per_second_per_gpu:3279.1325303313006
105
+ Step 105 | loss:1.0925307273864746 lr:1.4155430067264974e-05 tokens_per_second_per_gpu:3290.367388662595
106
+ Step 106 | loss:0.9424126148223877 lr:1.4138269561518329e-05 tokens_per_second_per_gpu:3823.0717814425543
107
+ Step 107 | loss:1.008542537689209 lr:1.4120947083932006e-05 tokens_per_second_per_gpu:3747.7812395683623
108
+ Step 108 | loss:1.0429927110671997 lr:1.4103463057169398e-05 tokens_per_second_per_gpu:3590.085848047025
109
+ Step 109 | loss:1.0133060216903687 lr:1.4085817907835657e-05 tokens_per_second_per_gpu:3174.886936324355
110
+ Step 110 | loss:0.9840688109397888 lr:1.4068012066467276e-05 tokens_per_second_per_gpu:3362.005778821448
111
+ Step 111 | loss:0.9953323006629944 lr:1.4050045967521587e-05 tokens_per_second_per_gpu:3757.2369577327404
112
+ Step 112 | loss:1.2065812349319458 lr:1.4031920049366161e-05 tokens_per_second_per_gpu:3325.905289950363
113
+ Step 113 | loss:0.941290020942688 lr:1.4013634754268107e-05 tokens_per_second_per_gpu:3273.1097453960997
114
+ Step 114 | loss:1.006861686706543 lr:1.3995190528383292e-05 tokens_per_second_per_gpu:3408.69735024395
115
+ Step 115 | loss:1.0291639566421509 lr:1.397658782174544e-05 tokens_per_second_per_gpu:3274.3777461502764
116
+ Step 116 | loss:0.9624848961830139 lr:1.3957827088255166e-05 tokens_per_second_per_gpu:3524.4278903848763
117
+ Step 117 | loss:1.0975605249404907 lr:1.3938908785668893e-05 tokens_per_second_per_gpu:3103.05657743153
118
+ Step 118 | loss:1.0946214199066162 lr:1.3919833375587679e-05 tokens_per_second_per_gpu:3842.6176588735884
119
+ Step 119 | loss:1.0759072303771973 lr:1.3900601323445961e-05 tokens_per_second_per_gpu:3808.512164666663
120
+ Step 120 | loss:1.01813805103302 lr:1.3881213098500202e-05 tokens_per_second_per_gpu:3999.2491046466607
121
+ Step 121 | loss:1.0882763862609863 lr:1.3861669173817427e-05 tokens_per_second_per_gpu:3427.5146935052067
122
+ Step 122 | loss:0.9825055599212646 lr:1.3841970026263695e-05 tokens_per_second_per_gpu:3473.233743555214
123
+ Step 123 | loss:1.099984884262085 lr:1.382211613649246e-05 tokens_per_second_per_gpu:3717.821725752323
124
+ Step 124 | loss:1.0696111917495728 lr:1.3802107988932832e-05 tokens_per_second_per_gpu:3082.1936590885393
125
+ Step 125 | loss:1.1309176683425903 lr:1.3781946071777777e-05 tokens_per_second_per_gpu:3077.2164906951607
126
+ Step 126 | loss:1.0581340789794922 lr:1.3761630876972183e-05 tokens_per_second_per_gpu:3387.403226559853
127
+ Step 127 | loss:1.0505608320236206 lr:1.3741162900200874e-05 tokens_per_second_per_gpu:3411.845981322547
128
+ Step 128 | loss:0.9377124309539795 lr:1.3720542640876514e-05 tokens_per_second_per_gpu:3386.120190777436
129
+ Step 129 | loss:1.0050909519195557 lr:1.3699770602127406e-05 tokens_per_second_per_gpu:3239.424588950932
130
+ Step 130 | loss:1.0426461696624756 lr:1.3678847290785237e-05 tokens_per_second_per_gpu:3827.433691106946
131
+ Step 131 | loss:1.0819664001464844 lr:1.3657773217372694e-05 tokens_per_second_per_gpu:3468.9528124343155
132
+ Step 132 | loss:0.9391593933105469 lr:1.3636548896091019e-05 tokens_per_second_per_gpu:4109.727828655714
133
+ Step 133 | loss:1.130295753479004 lr:1.3615174844807451e-05 tokens_per_second_per_gpu:3672.415503816543
134
+ Step 134 | loss:0.9968859553337097 lr:1.359365158504261e-05 tokens_per_second_per_gpu:4390.05247525661
135
+ Step 135 | loss:1.1019611358642578 lr:1.3571979641957745e-05 tokens_per_second_per_gpu:3287.8714995416
136
+ Step 136 | loss:1.047814965248108 lr:1.3550159544341948e-05 tokens_per_second_per_gpu:3286.5090224768937
137
+ Step 137 | loss:1.041077733039856 lr:1.3528191824599228e-05 tokens_per_second_per_gpu:3717.2716801705315
138
+ Step 138 | loss:1.0515552759170532 lr:1.3506077018735533e-05 tokens_per_second_per_gpu:3202.3379318776147
139
+ Step 139 | loss:1.1296720504760742 lr:1.3483815666345674e-05 tokens_per_second_per_gpu:3258.4210318353244
140
+ Step 140 | loss:1.0627872943878174 lr:1.3461408310600151e-05 tokens_per_second_per_gpu:3194.7948425297013
141
+ Step 141 | loss:1.0591386556625366 lr:1.3438855498231901e-05 tokens_per_second_per_gpu:3233.8322219652996
142
+ Step 142 | loss:1.2165062427520752 lr:1.3416157779522969e-05 tokens_per_second_per_gpu:2700.8847148372674
143
+ Step 143 | loss:1.1000216007232666 lr:1.339331570829106e-05 tokens_per_second_per_gpu:3090.8040651670713
144
+ Step 144 | loss:1.0945175886154175 lr:1.3370329841876049e-05 tokens_per_second_per_gpu:3016.1521681626327
145
+ Step 145 | loss:1.0223222970962524 lr:1.3347200741126368e-05 tokens_per_second_per_gpu:3864.9123751265906
146
+ Step 146 | loss:1.0384180545806885 lr:1.3323928970385318e-05 tokens_per_second_per_gpu:3093.045925373374
147
+ Step 147 | loss:1.0387237071990967 lr:1.3300515097477319e-05 tokens_per_second_per_gpu:3583.5866748553367
148
+ Step 148 | loss:1.0544804334640503 lr:1.3276959693694032e-05 tokens_per_second_per_gpu:4013.569528765113
149
+ Step 149 | loss:1.0835425853729248 lr:1.3253263333780434e-05 tokens_per_second_per_gpu:3747.288931376117
150
+ Step 150 | loss:1.0735474824905396 lr:1.3229426595920794e-05 tokens_per_second_per_gpu:3016.964949179244
151
+ Step 151 | loss:1.1732394695281982 lr:1.3205450061724554e-05 tokens_per_second_per_gpu:3552.8302097626038
152
+ Step 152 | loss:1.0120192766189575 lr:1.3181334316212151e-05 tokens_per_second_per_gpu:3993.2864465929683
153
+ Step 153 | loss:1.098035216331482 lr:1.3157079947800736e-05 tokens_per_second_per_gpu:3630.096833881631
154
+ Step 154 | loss:1.0417951345443726 lr:1.313268754828982e-05 tokens_per_second_per_gpu:4128.380187352026
155
+ Step 155 | loss:1.0801373720169067 lr:1.3108157712846833e-05 tokens_per_second_per_gpu:3170.1141756442244
156
+ Step 156 | loss:0.9469473361968994 lr:1.3083491039992596e-05 tokens_per_second_per_gpu:3746.1269785447903
157
+ Step 157 | loss:1.1583002805709839 lr:1.3058688131586727e-05 tokens_per_second_per_gpu:2958.969451708439
158
+ Step 158 | loss:1.0937081575393677 lr:1.3033749592812955e-05 tokens_per_second_per_gpu:3625.0738232335902
159
+ Step 159 | loss:1.041576623916626 lr:1.3008676032164346e-05 tokens_per_second_per_gpu:3174.5789362480195
160
+ Step 160 | loss:1.1431220769882202 lr:1.2983468061428455e-05 tokens_per_second_per_gpu:3186.8973435756893
161
+ Step 161 | loss:1.060421347618103 lr:1.2958126295672419e-05 tokens_per_second_per_gpu:3658.4590960017413
162
+ Step 162 | loss:0.9672384262084961 lr:1.293265135322792e-05 tokens_per_second_per_gpu:3234.76727249106
163
+ Step 163 | loss:1.1202409267425537 lr:1.290704385567612e-05 tokens_per_second_per_gpu:3683.5608176507158
164
+ Step 164 | loss:1.1316494941711426 lr:1.2881304427832483e-05 tokens_per_second_per_gpu:2893.5620231739244
165
+ Step 165 | loss:1.0530860424041748 lr:1.2855433697731538e-05 tokens_per_second_per_gpu:3596.0516529135816
166
+ Step 166 | loss:0.9973039627075195 lr:1.2829432296611547e-05 tokens_per_second_per_gpu:3239.2785653439887
167
+ Step 167 | loss:1.0723451375961304 lr:1.2803300858899106e-05 tokens_per_second_per_gpu:3442.490376433312
168
+ Step 168 | loss:1.0164995193481445 lr:1.2777040022193672e-05 tokens_per_second_per_gpu:3164.9946873077406
169
+ Step 169 | loss:0.9852588176727295 lr:1.275065042725199e-05 tokens_per_second_per_gpu:3323.0232306554753
170
+ Step 170 | loss:1.0778350830078125 lr:1.2724132717972478e-05 tokens_per_second_per_gpu:2830.1534574906404
171
+ Step 171 | loss:1.1002942323684692 lr:1.26974875413795e-05 tokens_per_second_per_gpu:3511.467336305257
172
+ Step 172 | loss:1.0973403453826904 lr:1.267071554760759e-05 tokens_per_second_per_gpu:4210.974619365593
173
+ Step 173 | loss:1.0474810600280762 lr:1.264381738988558e-05 tokens_per_second_per_gpu:3633.5914296190986
174
+ Step 174 | loss:1.107753038406372 lr:1.2616793724520665e-05 tokens_per_second_per_gpu:3379.4868101415323
175
+ Step 175 | loss:1.0912103652954102 lr:1.2589645210882397e-05 tokens_per_second_per_gpu:3268.3413161504523
176
+ Step 176 | loss:0.9168727397918701 lr:1.256237251138658e-05 tokens_per_second_per_gpu:3584.472835944426
177
+ Step 177 | loss:1.0452046394348145 lr:1.2534976291479122e-05 tokens_per_second_per_gpu:3418.838663433972
178
+ Step 178 | loss:1.0404298305511475 lr:1.2507457219619796e-05 tokens_per_second_per_gpu:2913.0762752618957
179
+ Step 179 | loss:0.9873407483100891 lr:1.247981596726592e-05 tokens_per_second_per_gpu:3476.05126598452
180
+ Step 180 | loss:0.9315714836120605 lr:1.245205320885598e-05 tokens_per_second_per_gpu:4233.830633953143
181
+ Step 181 | loss:1.0789451599121094 lr:1.2424169621793182e-05 tokens_per_second_per_gpu:3855.9909233022713
182
+ Step 182 | loss:1.0372527837753296 lr:1.2396165886428913e-05 tokens_per_second_per_gpu:3188.532121002085
183
+ Step 183 | loss:0.9855374097824097 lr:1.2368042686046139e-05 tokens_per_second_per_gpu:3360.102286992968
184
+ Step 184 | loss:0.9893398880958557 lr:1.233980070684274e-05 tokens_per_second_per_gpu:3529.360812306045
185
+ Step 185 | loss:1.0867488384246826 lr:1.2311440637914766e-05 tokens_per_second_per_gpu:3179.6720985050933
186
+ Step 186 | loss:1.136749267578125 lr:1.228296317123962e-05 tokens_per_second_per_gpu:3040.5648519621095
187
+ Step 187 | loss:1.0160714387893677 lr:1.2254369001659178e-05 tokens_per_second_per_gpu:2636.2919865625445
188
+ Step 188 | loss:0.9412505030632019 lr:1.2225658826862835e-05 tokens_per_second_per_gpu:3556.44720893037
189
+ Step 189 | loss:1.0352046489715576 lr:1.219683334737047e-05 tokens_per_second_per_gpu:3189.685627683808
190
+ Step 190 | loss:1.0164215564727783 lr:1.2167893266515368e-05 tokens_per_second_per_gpu:3481.096336571031
191
+ Step 191 | loss:0.9528836607933044 lr:1.2138839290427062e-05 tokens_per_second_per_gpu:3763.6803007713847
192
+ Step 192 | loss:0.874556303024292 lr:1.210967212801408e-05 tokens_per_second_per_gpu:3543.1880047946393
193
+ Step 193 | loss:0.9521118402481079 lr:1.208039249094668e-05 tokens_per_second_per_gpu:3085.3159173416116
194
+ Step 194 | loss:0.9902434349060059 lr:1.2051001093639451e-05 tokens_per_second_per_gpu:3872.794807892901
195
+ Step 195 | loss:1.0282405614852905 lr:1.2021498653233912e-05 tokens_per_second_per_gpu:3357.2798676727166
196
+ Step 196 | loss:1.0458389520645142 lr:1.1991885889581001e-05 tokens_per_second_per_gpu:3481.726431927051
197
+ Step 197 | loss:1.06132972240448 lr:1.1962163525223505e-05 tokens_per_second_per_gpu:3642.1604094854006
198
+ Step 198 | loss:1.0513228178024292 lr:1.1932332285378438e-05 tokens_per_second_per_gpu:3643.609473789504
199
+ Step 199 | loss:1.0699129104614258 lr:1.1902392897919344e-05 tokens_per_second_per_gpu:4025.895506710408
200
+ Step 200 | loss:1.0457760095596313 lr:1.187234609335854e-05 tokens_per_second_per_gpu:3671.934888630731
201
+ Step 201 | loss:1.0659315586090088 lr:1.1842192604829286e-05 tokens_per_second_per_gpu:3569.504587981394
202
+ Step 202 | loss:0.980949878692627 lr:1.1811933168067903e-05 tokens_per_second_per_gpu:3335.3855968235766
203
+ Step 203 | loss:1.0401633977890015 lr:1.1781568521395815e-05 tokens_per_second_per_gpu:3991.0346971342574
204
+ Step 204 | loss:1.008753776550293 lr:1.1751099405701535e-05 tokens_per_second_per_gpu:3817.479518764033
205
+ Step 205 | loss:0.9701691269874573 lr:1.1720526564422593e-05 tokens_per_second_per_gpu:3387.821746371587
206
+ Step 206 | loss:1.122227430343628 lr:1.1689850743527394e-05 tokens_per_second_per_gpu:3950.7440411904113
207
+ Step 207 | loss:1.0229696035385132 lr:1.1659072691497014e-05 tokens_per_second_per_gpu:4086.9911967317607
208
+ Step 208 | loss:1.0376312732696533 lr:1.1628193159306939e-05 tokens_per_second_per_gpu:3460.2927592053616
209
+ Step 209 | loss:1.0910463333129883 lr:1.1597212900408738e-05 tokens_per_second_per_gpu:3441.665664549203
210
+ Step 210 | loss:0.951766848564148 lr:1.1566132670711691e-05 tokens_per_second_per_gpu:3765.2848616356405
211
+ Step 211 | loss:1.1121230125427246 lr:1.1534953228564325e-05 tokens_per_second_per_gpu:3348.8906808140177
212
+ Step 212 | loss:0.9478145241737366 lr:1.1503675334735933e-05 tokens_per_second_per_gpu:3729.1377760854084
213
+ Step 213 | loss:1.1243412494659424 lr:1.1472299752397989e-05 tokens_per_second_per_gpu:3466.9044365152718
214
+ Step 214 | loss:0.9882078170776367 lr:1.1440827247105546e-05 tokens_per_second_per_gpu:3301.1196001394637
215
+ Step 215 | loss:1.0635075569152832 lr:1.140925858677855e-05 tokens_per_second_per_gpu:3329.520308962072
216
+ Step 216 | loss:0.9767946004867554 lr:1.1377594541683095e-05 tokens_per_second_per_gpu:3282.602115501923
217
+ Step 217 | loss:0.9662872552871704 lr:1.134583588441264e-05 tokens_per_second_per_gpu:3304.2082107578244
218
+ Step 218 | loss:0.9389134049415588 lr:1.1313983389869154e-05 tokens_per_second_per_gpu:3737.964426771077
219
+ Step 219 | loss:1.0151091814041138 lr:1.1282037835244205e-05 tokens_per_second_per_gpu:3664.737039346888
220
+ Step 220 | loss:1.0414142608642578 lr:1.125e-05 tokens_per_second_per_gpu:3906.458243931466
221
+ Step 221 | loss:1.0948292016983032 lr:1.121787066585037e-05 tokens_per_second_per_gpu:3126.7367416868815
222
+ Step 222 | loss:1.1479017734527588 lr:1.118565061674169e-05 tokens_per_second_per_gpu:3310.893821872108
223
+ Step 223 | loss:1.1810510158538818 lr:1.1153340638833753e-05 tokens_per_second_per_gpu:3048.5584944239317
224
+ Step 224 | loss:0.9962694048881531 lr:1.1120941520480588e-05 tokens_per_second_per_gpu:3589.2994193950644
225
+ Step 225 | loss:1.0925935506820679 lr:1.1088454052211226e-05 tokens_per_second_per_gpu:3637.8874197495197
226
+ Step 226 | loss:1.0008320808410645 lr:1.1055879026710413e-05 tokens_per_second_per_gpu:3710.3314714150306
227
+ Step 227 | loss:1.0037486553192139 lr:1.102321723879926e-05 tokens_per_second_per_gpu:3530.8579202116766
228
+ Step 228 | loss:1.0278898477554321 lr:1.0990469485415859e-05 tokens_per_second_per_gpu:3603.3149217944747
229
+ Step 229 | loss:0.9470862150192261 lr:1.0957636565595835e-05 tokens_per_second_per_gpu:3314.542239706414
230
+ Step 230 | loss:0.974503755569458 lr:1.0924719280452849e-05 tokens_per_second_per_gpu:3547.2378512191153
231
+ Step 231 | loss:0.9583215713500977 lr:1.0891718433159048e-05 tokens_per_second_per_gpu:3523.465479574953
232
+ Step 232 | loss:1.0647832155227661 lr:1.0858634828925474e-05 tokens_per_second_per_gpu:4079.209517175555
233
+ Step 233 | loss:1.0859055519104004 lr:1.0825469274982416e-05 tokens_per_second_per_gpu:3748.6882115494077
234
+ Step 234 | loss:1.0758495330810547 lr:1.0792222580559706e-05 tokens_per_second_per_gpu:3851.188199977994
235
+ Step 235 | loss:0.9934150576591492 lr:1.0758895556866984e-05 tokens_per_second_per_gpu:3027.8201410044285
236
+ Step 236 | loss:0.9749125838279724 lr:1.0725489017073905e-05 tokens_per_second_per_gpu:3824.3620745614776
237
+ Step 237 | loss:1.1250077486038208 lr:1.0692003776290284e-05 tokens_per_second_per_gpu:3812.241805278313
238
+ Step 238 | loss:0.9836297631263733 lr:1.0658440651546224e-05 tokens_per_second_per_gpu:3754.6913377732058
239
+ Step 239 | loss:1.0390492677688599 lr:1.0624800461772173e-05 tokens_per_second_per_gpu:3801.826857915905
240
+ Step 240 | loss:1.04658842086792 lr:1.059108402777894e-05 tokens_per_second_per_gpu:3247.5606533242094
241
+ Step 241 | loss:1.1221239566802979 lr:1.0557292172237676e-05 tokens_per_second_per_gpu:3813.4732938265543
242
+ Step 242 | loss:1.150991678237915 lr:1.0523425719659793e-05 tokens_per_second_per_gpu:3428.596138959447
243
+ Step 243 | loss:1.139402151107788 lr:1.0489485496376844e-05 tokens_per_second_per_gpu:3350.111202236054
244
+ Step 244 | loss:0.990974485874176 lr:1.0455472330520378e-05 tokens_per_second_per_gpu:3301.0046192724767
245
+ Step 245 | loss:1.023050308227539 lr:1.0421387052001705e-05 tokens_per_second_per_gpu:2901.330223329429
246
+ Step 246 | loss:1.0226176977157593 lr:1.0387230492491678e-05 tokens_per_second_per_gpu:3203.0339991749943
247
+ Step 247 | loss:1.0416032075881958 lr:1.0353003485400378e-05 tokens_per_second_per_gpu:3180.0409616941442
248
+ Step 248 | loss:0.9852002859115601 lr:1.0318706865856785e-05 tokens_per_second_per_gpu:3998.775527074616
249
+ Step 249 | loss:1.1657763719558716 lr:1.028434147068841e-05 tokens_per_second_per_gpu:3891.387438322461
250
+ Step 250 | loss:1.0519105195999146 lr:1.0249908138400862e-05 tokens_per_second_per_gpu:3888.449070526479
251
+ Step 251 | loss:1.0405876636505127 lr:1.0215407709157396e-05 tokens_per_second_per_gpu:3619.335466976545
252
+ Step 252 | loss:1.0283851623535156 lr:1.0180841024758419e-05 tokens_per_second_per_gpu:2752.070363393849
253
+ Step 253 | loss:1.0298212766647339 lr:1.0146208928620938e-05 tokens_per_second_per_gpu:3307.377280772541
254
+ Step 254 | loss:0.964581310749054 lr:1.0111512265757992e-05 tokens_per_second_per_gpu:3752.103655989273
255
+ Step 255 | loss:0.9683664441108704 lr:1.0076751882758025e-05 tokens_per_second_per_gpu:4138.001777589129
256
+ Step 256 | loss:1.0615788698196411 lr:1.0041928627764238e-05 tokens_per_second_per_gpu:3341.61440477967
257
+ Step 257 | loss:1.0685681104660034 lr:1.0007043350453889e-05 tokens_per_second_per_gpu:3688.378050058521
258
+ Step 258 | loss:1.0611388683319092 lr:9.972096902017559e-06 tokens_per_second_per_gpu:3797.8084242404275
259
+ Step 259 | loss:0.961796760559082 lr:9.937090135138392e-06 tokens_per_second_per_gpu:3767.499020527814
260
+ Step 260 | loss:0.9383830428123474 lr:9.902023903971282e-06 tokens_per_second_per_gpu:4237.725928428407
261
+ Step 261 | loss:1.0182300806045532 lr:9.866899064122033e-06 tokens_per_second_per_gpu:3204.7581266878838
262
+ Step 262 | loss:1.0680441856384277 lr:9.831716472626485e-06 tokens_per_second_per_gpu:3623.368447703484
263
+ Step 263 | loss:1.0305362939834595 lr:9.796476987929601e-06 tokens_per_second_per_gpu:3667.2284929147554
264
+ Step 264 | loss:1.0003256797790527 lr:9.761181469864523e-06 tokens_per_second_per_gpu:3335.105931326227
265
+ Step 265 | loss:1.0257227420806885 lr:9.725830779631588e-06 tokens_per_second_per_gpu:4030.3851390190357
266
+ Step 266 | loss:1.0340684652328491 lr:9.69042577977732e-06 tokens_per_second_per_gpu:3411.0997028354655
267
+ Step 267 | loss:1.1450355052947998 lr:9.65496733417338e-06 tokens_per_second_per_gpu:3052.7514833967907
268
+ Step 268 | loss:1.0675625801086426 lr:9.619456307995492e-06 tokens_per_second_per_gpu:3714.1461111976373
269
+ Step 269 | loss:1.0681331157684326 lr:9.583893567702329e-06 tokens_per_second_per_gpu:3103.583450091087
270
+ Step 270 | loss:1.0137324333190918 lr:9.548279981014373e-06 tokens_per_second_per_gpu:3423.384868860896
271
+ Step 271 | loss:1.0491313934326172 lr:9.512616416892749e-06 tokens_per_second_per_gpu:3583.567062020731
272
+ Step 272 | loss:0.9289360046386719 lr:9.476903745518007e-06 tokens_per_second_per_gpu:3940.2272562878234
273
+ Step 273 | loss:0.9665488600730896 lr:9.441142838268906e-06 tokens_per_second_per_gpu:2920.460239292336
274
+ Step 274 | loss:1.0877517461776733 lr:9.405334567701143e-06 tokens_per_second_per_gpu:2961.311684974825
275
+ Step 275 | loss:1.0493131875991821 lr:9.369479807526072e-06 tokens_per_second_per_gpu:3966.098017081578
276
+ Step 276 | loss:1.0874686241149902 lr:9.333579432589371e-06 tokens_per_second_per_gpu:3387.0541722877924
277
+ Step 277 | loss:0.9608097672462463 lr:9.297634318849712e-06 tokens_per_second_per_gpu:4291.32165742434
278
+ Step 278 | loss:1.095219612121582 lr:9.26164534335738e-06 tokens_per_second_per_gpu:3076.5752421480315
279
+ Step 279 | loss:1.0561093091964722 lr:9.225613384232867e-06 tokens_per_second_per_gpu:3212.6037289159076
280
+ Step 280 | loss:1.0817214250564575 lr:9.189539320645461e-06 tokens_per_second_per_gpu:3659.336877166898
281
+ Step 281 | loss:1.0177308320999146 lr:9.15342403279179e-06 tokens_per_second_per_gpu:3738.1776707794156
282
+ Step 282 | loss:0.9710911512374878 lr:9.117268401874329e-06 tokens_per_second_per_gpu:3382.0765709925126
283
+ Step 283 | loss:1.1531765460968018 lr:9.081073310079919e-06 tokens_per_second_per_gpu:3141.8432625191035
284
+ Step 284 | loss:1.0935015678405762 lr:9.044839640558238e-06 tokens_per_second_per_gpu:3429.2429675320373
285
+ Step 285 | loss:0.9385885000228882 lr:9.008568277400246e-06 tokens_per_second_per_gpu:3346.592745218261
286
+ Step 286 | loss:0.9143052101135254 lr:8.972260105616615e-06 tokens_per_second_per_gpu:3597.7170701513446
287
+ Step 287 | loss:1.1004893779754639 lr:8.935916011116141e-06 tokens_per_second_per_gpu:3088.5027102851095
288
+ Step 288 | loss:1.0081392526626587 lr:8.899536880684118e-06 tokens_per_second_per_gpu:3565.6117227664477
289
+ Step 289 | loss:1.0537937879562378 lr:8.863123601960713e-06 tokens_per_second_per_gpu:3896.8268798437653
290
+ Step 290 | loss:1.0108102560043335 lr:8.826677063419297e-06 tokens_per_second_per_gpu:2986.3032089900526
291
+ Step 291 | loss:0.9827962517738342 lr:8.790198154344774e-06 tokens_per_second_per_gpu:3411.9397148980493
292
+ Step 292 | loss:0.9616813659667969 lr:8.753687764811874e-06 tokens_per_second_per_gpu:4128.635119314705
293
+ Step 293 | loss:1.0986084938049316 lr:8.717146785663451e-06 tokens_per_second_per_gpu:3044.1432990396206
294
+ Step 294 | loss:1.010633111000061 lr:8.680576108488722e-06 tokens_per_second_per_gpu:4016.5995836897896
295
+ Step 295 | loss:1.1678359508514404 lr:8.643976625601543e-06 tokens_per_second_per_gpu:3484.066437413288
296
+ Step 296 | loss:0.9672271609306335 lr:8.60734923001861e-06 tokens_per_second_per_gpu:3722.9078388086114
297
+ Step 297 | loss:1.0644941329956055 lr:8.570694815437684e-06 tokens_per_second_per_gpu:3674.078084922676
298
+ Step 298 | loss:1.0218087434768677 lr:8.534014276215784e-06 tokens_per_second_per_gpu:3719.1251320960077
299
+ Step 299 | loss:0.8616893291473389 lr:8.497308507347358e-06 tokens_per_second_per_gpu:3980.0883290351844
300
+ Step 300 | loss:1.200434923171997 lr:8.460578404442452e-06 tokens_per_second_per_gpu:3537.9024408752107
301
+ Step 301 | loss:1.2012302875518799 lr:8.423824863704859e-06 tokens_per_second_per_gpu:3341.0804733107207
302
+ Step 302 | loss:1.0611824989318848 lr:8.387048781910243e-06 tokens_per_second_per_gpu:3399.3160464558623
303
+ Step 303 | loss:1.0133317708969116 lr:8.350251056384267e-06 tokens_per_second_per_gpu:3396.784053005488
304
+ Step 304 | loss:1.0075584650039673 lr:8.313432584980693e-06 tokens_per_second_per_gpu:3170.834006045349
305
+ Step 305 | loss:1.0057969093322754 lr:8.276594266059473e-06 tokens_per_second_per_gpu:3349.7058914501686
306
+ Step 306 | loss:0.9875553846359253 lr:8.239736998464839e-06 tokens_per_second_per_gpu:3283.555251517354
307
+ Step 307 | loss:1.0624175071716309 lr:8.202861681503362e-06 tokens_per_second_per_gpu:3288.3057635972655
308
+ Step 308 | loss:1.0439351797103882 lr:8.165969214922011e-06 tokens_per_second_per_gpu:3257.713880411403
309
+ Step 309 | loss:1.021206021308899 lr:8.129060498886204e-06 tokens_per_second_per_gpu:3443.1099672423125
310
+ Step 310 | loss:1.0511139631271362 lr:8.09213643395784e-06 tokens_per_second_per_gpu:3937.8803984424717
311
+ Step 311 | loss:0.9559040069580078 lr:8.05519792107332e-06 tokens_per_second_per_gpu:3379.1804990828364
312
+ Step 312 | loss:1.048850655555725 lr:8.018245861521585e-06 tokens_per_second_per_gpu:3090.2532971700753
313
+ Step 313 | loss:1.1831482648849487 lr:7.981281156922097e-06 tokens_per_second_per_gpu:3205.4884774853103
314
+ Step 314 | loss:1.065642237663269 lr:7.944304709202857e-06 tokens_per_second_per_gpu:3615.7381651115775
315
+ Step 315 | loss:0.9639590978622437 lr:7.9073174205784e-06 tokens_per_second_per_gpu:3895.6258291409545
316
+ Step 316 | loss:1.0430667400360107 lr:7.870320193527773e-06 tokens_per_second_per_gpu:3228.924479386842
317
+ Step 317 | loss:1.104460597038269 lr:7.833313930772514e-06 tokens_per_second_per_gpu:3807.3283800568493
318
+ Step 318 | loss:1.0125938653945923 lr:7.796299535254633e-06 tokens_per_second_per_gpu:3032.071204987172
319
+ Step 319 | loss:1.064937949180603 lr:7.759277910114582e-06 tokens_per_second_per_gpu:3479.6920455124077
320
+ Step 320 | loss:1.0094972848892212 lr:7.722249958669199e-06 tokens_per_second_per_gpu:3269.3602673037053
321
+ Step 321 | loss:1.0703675746917725 lr:7.685216584389697e-06 tokens_per_second_per_gpu:3704.300080154252
322
+ Step 322 | loss:0.9877763390541077 lr:7.648178690879598e-06 tokens_per_second_per_gpu:3353.1459094211864
323
+ Step 323 | loss:1.12434983253479 lr:7.611137181852695e-06 tokens_per_second_per_gpu:3122.7128974218263
324
+ Step 324 | loss:1.0240429639816284 lr:7.574092961110993e-06 tokens_per_second_per_gpu:3098.34410006354
325
+ Step 325 | loss:0.9952061176300049 lr:7.537046932522668e-06 tokens_per_second_per_gpu:3604.4779903256135
326
+ Step 326 | loss:1.011582612991333 lr:7.5e-06 tokens_per_second_per_gpu:3283.509048393859
327
+ Step 327 | loss:0.9831445217132568 lr:7.462953067477332e-06 tokens_per_second_per_gpu:3775.59490266086
328
+ Step 328 | loss:1.1675689220428467 lr:7.425907038889008e-06 tokens_per_second_per_gpu:3642.4454328719376
329
+ Step 329 | loss:1.0117154121398926 lr:7.388862818147305e-06 tokens_per_second_per_gpu:3161.511403244572
330
+ Step 330 | loss:0.9967679381370544 lr:7.351821309120403e-06 tokens_per_second_per_gpu:4076.6176819879342
331
+ Step 331 | loss:1.0238803625106812 lr:7.314783415610303e-06 tokens_per_second_per_gpu:3682.85016942687
332
+ Step 332 | loss:1.1032280921936035 lr:7.2777500413308015e-06 tokens_per_second_per_gpu:3018.3267646978616
333
+ Step 333 | loss:1.0388715267181396 lr:7.240722089885421e-06 tokens_per_second_per_gpu:3664.4967840336526
334
+ Step 334 | loss:1.0032538175582886 lr:7.203700464745366e-06 tokens_per_second_per_gpu:3878.0272534442006
335
+ Step 335 | loss:0.9861161112785339 lr:7.166686069227486e-06 tokens_per_second_per_gpu:3704.921578499297
336
+ Step 336 | loss:1.0723155736923218 lr:7.129679806472228e-06 tokens_per_second_per_gpu:3379.4966782090632
337
+ Step 337 | loss:1.0907748937606812 lr:7.092682579421598e-06 tokens_per_second_per_gpu:3056.886775193357
338
+ Step 338 | loss:1.0879602432250977 lr:7.055695290797143e-06 tokens_per_second_per_gpu:3510.6596962005956
339
+ Step 339 | loss:1.2004570960998535 lr:7.018718843077904e-06 tokens_per_second_per_gpu:2698.696907414987
340
+ Step 340 | loss:1.1619454622268677 lr:6.981754138478416e-06 tokens_per_second_per_gpu:3246.2572334355586
341
+ Step 341 | loss:1.1515110731124878 lr:6.944802078926679e-06 tokens_per_second_per_gpu:3833.456255283371
342
+ Step 342 | loss:0.9102157950401306 lr:6.907863566042161e-06 tokens_per_second_per_gpu:3861.3976159065483
343
+ Step 343 | loss:1.0276517868041992 lr:6.870939501113796e-06 tokens_per_second_per_gpu:3515.3211579047493
344
+ Step 344 | loss:1.0000083446502686 lr:6.834030785077989e-06 tokens_per_second_per_gpu:3557.346531908244
345
+ Step 345 | loss:1.0519853830337524 lr:6.797138318496637e-06 tokens_per_second_per_gpu:3382.344983091482
346
+ Step 346 | loss:1.0421191453933716 lr:6.7602630015351624e-06 tokens_per_second_per_gpu:3723.6117480414146
347
+ Step 347 | loss:0.9147570133209229 lr:6.723405733940528e-06 tokens_per_second_per_gpu:4252.460083108924
348
+ Step 348 | loss:0.9446225166320801 lr:6.68656741501931e-06 tokens_per_second_per_gpu:3618.6484563348768
349
+ Step 349 | loss:0.9708753228187561 lr:6.649748943615732e-06 tokens_per_second_per_gpu:3283.5603120185374
350
+ Step 350 | loss:0.989623486995697 lr:6.6129512180897574e-06 tokens_per_second_per_gpu:3714.9557633983477
351
+ Step 351 | loss:1.0894560813903809 lr:6.57617513629514e-06 tokens_per_second_per_gpu:3633.0728284201673
352
+ Step 352 | loss:1.115202784538269 lr:6.539421595557549e-06 tokens_per_second_per_gpu:3465.6183875539423
353
+ Step 353 | loss:0.8921554088592529 lr:6.502691492652643e-06 tokens_per_second_per_gpu:3898.581389798211
354
+ Step 354 | loss:1.0538413524627686 lr:6.465985723784218e-06 tokens_per_second_per_gpu:3875.7674647016806
355
+ Step 355 | loss:1.1229345798492432 lr:6.429305184562315e-06 tokens_per_second_per_gpu:3747.439269311531
356
+ Step 356 | loss:1.0564650297164917 lr:6.392650769981392e-06 tokens_per_second_per_gpu:3671.9056611937544
357
+ Step 357 | loss:1.0135570764541626 lr:6.356023374398456e-06 tokens_per_second_per_gpu:2899.364909402783
358
+ Step 358 | loss:1.0615811347961426 lr:6.319423891511278e-06 tokens_per_second_per_gpu:3801.336525876768
359
+ Step 359 | loss:0.9940143823623657 lr:6.28285321433655e-06 tokens_per_second_per_gpu:3759.510249531142
360
+ Step 360 | loss:1.051573395729065 lr:6.246312235188126e-06 tokens_per_second_per_gpu:3262.2914203901505
361
+ Step 361 | loss:1.0494478940963745 lr:6.209801845655227e-06 tokens_per_second_per_gpu:3231.54016416701
362
+ Step 362 | loss:1.134334921836853 lr:6.173322936580705e-06 tokens_per_second_per_gpu:3571.6658588531845
363
+ Step 363 | loss:1.132298231124878 lr:6.136876398039287e-06 tokens_per_second_per_gpu:3057.396332653723
364
+ Step 364 | loss:1.0008654594421387 lr:6.100463119315882e-06 tokens_per_second_per_gpu:3834.101013732286
365
+ Step 365 | loss:1.1123298406600952 lr:6.0640839888838594e-06 tokens_per_second_per_gpu:3247.3705233572437
366
+ Step 366 | loss:1.1211216449737549 lr:6.027739894383387e-06 tokens_per_second_per_gpu:2845.3333047002448
367
+ Step 367 | loss:1.0459622144699097 lr:5.991431722599755e-06 tokens_per_second_per_gpu:3457.97226490014
368
+ Step 368 | loss:1.0909711122512817 lr:5.955160359441763e-06 tokens_per_second_per_gpu:3221.2013455847036
369
+ Step 369 | loss:1.1640664339065552 lr:5.918926689920081e-06 tokens_per_second_per_gpu:3451.3559735588115
370
+ Step 370 | loss:1.2360296249389648 lr:5.882731598125674e-06 tokens_per_second_per_gpu:3289.3973935577333
371
+ Step 371 | loss:1.0348141193389893 lr:5.846575967208211e-06 tokens_per_second_per_gpu:3435.4094495338736
372
+ Step 372 | loss:0.9567263722419739 lr:5.810460679354538e-06 tokens_per_second_per_gpu:3576.81900822866
373
+ Step 373 | loss:0.9700967073440552 lr:5.774386615767134e-06 tokens_per_second_per_gpu:4121.726341513599
374
+ Step 374 | loss:1.0645740032196045 lr:5.738354656642623e-06 tokens_per_second_per_gpu:3347.2283933756694
375
+ Step 375 | loss:1.0295037031173706 lr:5.702365681150289e-06 tokens_per_second_per_gpu:3868.2590545517114
376
+ Step 376 | loss:1.0450406074523926 lr:5.66642056741063e-06 tokens_per_second_per_gpu:3477.8887834676907
377
+ Step 377 | loss:1.0922298431396484 lr:5.630520192473929e-06 tokens_per_second_per_gpu:3405.0405996273425
378
+ Step 378 | loss:0.9819583296775818 lr:5.594665432298858e-06 tokens_per_second_per_gpu:3530.057502597269
379
+ Step 379 | loss:1.0701051950454712 lr:5.558857161731094e-06 tokens_per_second_per_gpu:3565.327286781416
380
+ Step 380 | loss:0.9559437036514282 lr:5.523096254481995e-06 tokens_per_second_per_gpu:3365.6710550463654
381
+ Step 381 | loss:1.0547300577163696 lr:5.48738358310725e-06 tokens_per_second_per_gpu:3346.3530578743994
382
+ Step 382 | loss:0.9696499705314636 lr:5.451720018985626e-06 tokens_per_second_per_gpu:3329.928998826801
383
+ Step 383 | loss:1.0064926147460938 lr:5.416106432297671e-06 tokens_per_second_per_gpu:4094.6995868513072
384
+ Step 384 | loss:1.0114684104919434 lr:5.380543692004509e-06 tokens_per_second_per_gpu:3475.432549582163
385
+ Step 385 | loss:0.9692176580429077 lr:5.345032665826621e-06 tokens_per_second_per_gpu:3244.483451213904
386
+ Step 386 | loss:1.0306236743927002 lr:5.3095742202226825e-06 tokens_per_second_per_gpu:3269.062813082469
387
+ Step 387 | loss:0.9608441591262817 lr:5.274169220368412e-06 tokens_per_second_per_gpu:3293.0322861856876
388
+ Step 388 | loss:1.0451374053955078 lr:5.238818530135479e-06 tokens_per_second_per_gpu:2938.7446792343676
389
+ Step 389 | loss:1.1184594631195068 lr:5.203523012070398e-06 tokens_per_second_per_gpu:4074.2637958572395
390
+ Step 390 | loss:1.0143616199493408 lr:5.168283527373516e-06 tokens_per_second_per_gpu:3757.202601042462
391
+ Step 391 | loss:1.0019251108169556 lr:5.133100935877967e-06 tokens_per_second_per_gpu:3643.925463722737
392
+ Step 392 | loss:0.9685758352279663 lr:5.097976096028719e-06 tokens_per_second_per_gpu:3100.3578523101673
393
+ Step 393 | loss:1.0199055671691895 lr:5.0629098648616075e-06 tokens_per_second_per_gpu:3384.275878283242
394
+ Step 394 | loss:1.178783893585205 lr:5.027903097982441e-06 tokens_per_second_per_gpu:3391.4302131315617
395
+ Step 395 | loss:1.0473383665084839 lr:4.992956649546113e-06 tokens_per_second_per_gpu:3063.8065410727527
396
+ Step 396 | loss:1.0996811389923096 lr:4.958071372235763e-06 tokens_per_second_per_gpu:3498.2229002983877
397
+ Step 397 | loss:1.0462857484817505 lr:4.923248117241975e-06 tokens_per_second_per_gpu:4333.282035445803
398
+ Step 398 | loss:0.9347184300422668 lr:4.8884877342420105e-06 tokens_per_second_per_gpu:3622.574379253927
399
+ Step 399 | loss:0.9804700613021851 lr:4.853791071379062e-06 tokens_per_second_per_gpu:3226.8869428343696
400
+ Step 400 | loss:1.1005946397781372 lr:4.8191589752415834e-06 tokens_per_second_per_gpu:3372.8523923167445
401
+ Step 401 | loss:0.8959193229675293 lr:4.7845922908426035e-06 tokens_per_second_per_gpu:3320.152228865833
402
+ Step 402 | loss:1.021907925605774 lr:4.75009186159914e-06 tokens_per_second_per_gpu:3777.003290844435
403
+ Step 403 | loss:1.0068416595458984 lr:4.71565852931159e-06 tokens_per_second_per_gpu:3986.641799795482
404
+ Step 404 | loss:1.000541090965271 lr:4.681293134143216e-06 tokens_per_second_per_gpu:3747.1434322920663
405
+ Step 405 | loss:1.2205549478530884 lr:4.646996514599623e-06 tokens_per_second_per_gpu:3820.4594798567286
406
+ Step 406 | loss:0.9515297412872314 lr:4.612769507508324e-06 tokens_per_second_per_gpu:3795.377292339278
407
+ Step 407 | loss:1.0238933563232422 lr:4.5786129479982945e-06 tokens_per_second_per_gpu:3229.7823570944547
408
+ Step 408 | loss:1.0509495735168457 lr:4.544527669479625e-06 tokens_per_second_per_gpu:3349.333952642161
409
+ Step 409 | loss:1.0613925457000732 lr:4.510514503623155e-06 tokens_per_second_per_gpu:3156.1717446768876
410
+ Step 410 | loss:1.033313512802124 lr:4.4765742803402094e-06 tokens_per_second_per_gpu:3336.8989972246827
411
+ Step 411 | loss:1.0496494770050049 lr:4.442707827762323e-06 tokens_per_second_per_gpu:3590.429787817092
412
+ Step 412 | loss:1.0385499000549316 lr:4.4089159722210605e-06 tokens_per_second_per_gpu:3170.2776264525723
413
+ Step 413 | loss:0.9418953061103821 lr:4.375199538227827e-06 tokens_per_second_per_gpu:3534.558502200195
414
+ Step 414 | loss:1.047280192375183 lr:4.341559348453779e-06 tokens_per_second_per_gpu:2733.5207695529916
415
+ Step 415 | loss:1.022471308708191 lr:4.307996223709717e-06 tokens_per_second_per_gpu:3890.7734867828663
416
+ Step 416 | loss:1.0086355209350586 lr:4.274510982926099e-06 tokens_per_second_per_gpu:3987.506565566676
417
+ Step 417 | loss:1.0402534008026123 lr:4.2411044431330156e-06 tokens_per_second_per_gpu:3236.307481873333
418
+ Step 418 | loss:0.8711655139923096 lr:4.207777419440298e-06 tokens_per_second_per_gpu:3975.2863975021874
419
+ Step 419 | loss:1.023214340209961 lr:4.174530725017586e-06 tokens_per_second_per_gpu:3554.0476686396737
420
+ Step 420 | loss:0.9571332335472107 lr:4.141365171074529e-06 tokens_per_second_per_gpu:3525.9512956270514
421
+ Step 421 | loss:1.0896693468093872 lr:4.108281566840953e-06 tokens_per_second_per_gpu:3153.098645463316
422
+ Step 422 | loss:0.9747520685195923 lr:4.075280719547152e-06 tokens_per_second_per_gpu:3966.0455004943665
423
+ Step 423 | loss:1.1110624074935913 lr:4.042363434404165e-06 tokens_per_second_per_gpu:3779.0025960246794
424
+ Step 424 | loss:1.0604115724563599 lr:4.009530514584142e-06 tokens_per_second_per_gpu:3830.9188953316843
425
+ Step 425 | loss:1.0563576221466064 lr:3.976782761200741e-06 tokens_per_second_per_gpu:3582.450000860484
426
+ Step 426 | loss:1.0671446323394775 lr:3.944120973289589e-06 tokens_per_second_per_gpu:2908.494054782393
427
+ Step 427 | loss:0.9762901067733765 lr:3.911545947788775e-06 tokens_per_second_per_gpu:3053.4349628908967
428
+ Step 428 | loss:0.9776210188865662 lr:3.879058479519415e-06 tokens_per_second_per_gpu:3544.2355797599025
429
+ Step 429 | loss:1.0141849517822266 lr:3.846659361166249e-06 tokens_per_second_per_gpu:3297.786365076082
430
+ Step 430 | loss:1.1040488481521606 lr:3.8143493832583126e-06 tokens_per_second_per_gpu:3338.843244763157
431
+ Step 431 | loss:1.0626472234725952 lr:3.7821293341496314e-06 tokens_per_second_per_gpu:4418.329163218037
432
+ Step 432 | loss:1.0949605703353882 lr:3.750000000000002e-06 tokens_per_second_per_gpu:3047.106222452544
433
+ Step 433 | loss:1.0159730911254883 lr:3.717962164755795e-06 tokens_per_second_per_gpu:3602.6745614483334
434
+ Step 434 | loss:1.0500991344451904 lr:3.686016610130848e-06 tokens_per_second_per_gpu:3986.7451576661006
435
+ Step 435 | loss:1.0814403295516968 lr:3.654164115587359e-06 tokens_per_second_per_gpu:3571.832029398707
436
+ Step 436 | loss:1.0364539623260498 lr:3.622405458316908e-06 tokens_per_second_per_gpu:3642.0438527830474
437
+ Step 437 | loss:1.1901767253875732 lr:3.5907414132214504e-06 tokens_per_second_per_gpu:3176.521239313863
438
+ Step 438 | loss:0.9588651657104492 lr:3.5591727528944566e-06 tokens_per_second_per_gpu:3918.860748590204
439
+ Step 439 | loss:1.0434871912002563 lr:3.5277002476020124e-06 tokens_per_second_per_gpu:3299.17808549516
440
+ Step 440 | loss:1.0512974262237549 lr:3.496324665264073e-06 tokens_per_second_per_gpu:3635.173332567157
441
+ Step 441 | loss:1.231705665588379 lr:3.465046771435676e-06 tokens_per_second_per_gpu:3074.205941042435
442
+ Step 442 | loss:1.0789237022399902 lr:3.4338673292883108e-06 tokens_per_second_per_gpu:3693.2828587088543
443
+ Step 443 | loss:1.2193183898925781 lr:3.4027870995912626e-06 tokens_per_second_per_gpu:3431.2840987432733
444
+ Step 444 | loss:1.0715184211730957 lr:3.3718068406930634e-06 tokens_per_second_per_gpu:3682.480389381471
445
+ Step 445 | loss:1.0878655910491943 lr:3.3409273085029877e-06 tokens_per_second_per_gpu:3247.4307989923645
446
+ Step 446 | loss:1.1202605962753296 lr:3.3101492564726074e-06 tokens_per_second_per_gpu:2868.354735858521
447
+ Step 447 | loss:0.9288511276245117 lr:3.279473435577409e-06 tokens_per_second_per_gpu:3563.062402935358
448
+ Step 448 | loss:1.0990906953811646 lr:3.2489005942984675e-06 tokens_per_second_per_gpu:2858.893387808962
449
+ Step 449 | loss:0.9979101419448853 lr:3.218431478604187e-06 tokens_per_second_per_gpu:3678.9613236315463
450
+ Step 450 | loss:1.1519663333892822 lr:3.188066831932098e-06 tokens_per_second_per_gpu:3727.64506347197
451
+ Step 451 | loss:1.0830491781234741 lr:3.157807395170714e-06 tokens_per_second_per_gpu:3000.9712833795475
452
+ Step 452 | loss:0.953801691532135 lr:3.127653906641461e-06 tokens_per_second_per_gpu:3259.1973993559814
453
+ Step 453 | loss:1.0369517803192139 lr:3.097607102080657e-06 tokens_per_second_per_gpu:3511.980574010068
454
+ Step 454 | loss:1.092458724975586 lr:3.067667714621564e-06 tokens_per_second_per_gpu:3294.265502138293
455
+ Step 455 | loss:0.936871349811554 lr:3.037836474776495e-06 tokens_per_second_per_gpu:3481.5948657632803
456
+ Step 456 | loss:1.1124027967453003 lr:3.008114110419e-06 tokens_per_second_per_gpu:3812.8193336820773
457
+ Step 457 | loss:1.0468907356262207 lr:2.978501346766086e-06 tokens_per_second_per_gpu:3151.5910084710267
458
+ Step 458 | loss:1.0531938076019287 lr:2.948998906360552e-06 tokens_per_second_per_gpu:4215.472517871868
459
+ Step 459 | loss:1.041715145111084 lr:2.9196075090533224e-06 tokens_per_second_per_gpu:3710.2677325476725
460
+ Step 460 | loss:1.0383410453796387 lr:2.890327871985922e-06 tokens_per_second_per_gpu:3652.23350104369
461
+ Step 461 | loss:1.0480257272720337 lr:2.8611607095729393e-06 tokens_per_second_per_gpu:3064.9383393059015
462
+ Step 462 | loss:0.9538127779960632 lr:2.8321067334846317e-06 tokens_per_second_per_gpu:3769.898140676965
463
+ Step 463 | loss:0.9850819110870361 lr:2.8031666526295325e-06 tokens_per_second_per_gpu:3406.614961871896
464
+ Step 464 | loss:1.0228904485702515 lr:2.7743411731371677e-06 tokens_per_second_per_gpu:3569.9039916901243
465
+ Step 465 | loss:0.9739654064178467 lr:2.7456309983408215e-06 tokens_per_second_per_gpu:3855.6404996182887
466
+ Step 466 | loss:1.0107979774475098 lr:2.7170368287603812e-06 tokens_per_second_per_gpu:2971.2864568059426
467
+ Step 467 | loss:1.092708945274353 lr:2.6885593620852362e-06 tokens_per_second_per_gpu:2782.0899935873463
468
+ Step 468 | loss:1.1168324947357178 lr:2.660199293157263e-06 tokens_per_second_per_gpu:3767.413743825975
469
+ Step 469 | loss:1.0790526866912842 lr:2.6319573139538637e-06 tokens_per_second_per_gpu:3453.5127524794507
470
+ Step 470 | loss:1.0022902488708496 lr:2.603834113571089e-06 tokens_per_second_per_gpu:3047.035744052629
471
+ Step 471 | loss:0.9685212969779968 lr:2.575830378206819e-06 tokens_per_second_per_gpu:3106.6813389813046
472
+ Step 472 | loss:0.9829853773117065 lr:2.547946791144022e-06 tokens_per_second_per_gpu:3517.443534427285
473
+ Step 473 | loss:1.0006290674209595 lr:2.520184032734084e-06 tokens_per_second_per_gpu:3299.938446528448
474
+ Step 474 | loss:1.06125009059906 lr:2.492542780380207e-06 tokens_per_second_per_gpu:3319.6900284288845
475
+ Step 475 | loss:1.1993439197540283 lr:2.4650237085208767e-06 tokens_per_second_per_gpu:3576.5268885175046
476
+ Step 476 | loss:0.9784356951713562 lr:2.4376274886134225e-06 tokens_per_second_per_gpu:3467.613615187959
477
+ Step 477 | loss:0.9801285862922668 lr:2.4103547891176042e-06 tokens_per_second_per_gpu:4156.110968467437
478
+ Step 478 | loss:1.2105839252471924 lr:2.3832062754793375e-06 tokens_per_second_per_gpu:3410.3132050226804
479
+ Step 479 | loss:1.0551785230636597 lr:2.356182610114421e-06 tokens_per_second_per_gpu:3536.043611031053
480
+ Step 480 | loss:1.0802967548370361 lr:2.3292844523924132e-06 tokens_per_second_per_gpu:3609.396572236827
481
+ Step 481 | loss:1.0849721431732178 lr:2.3025124586205e-06 tokens_per_second_per_gpu:3036.075258849583
482
+ Step 482 | loss:1.0422958135604858 lr:2.2758672820275245e-06 tokens_per_second_per_gpu:3702.107907772152
483
+ Step 483 | loss:1.0579990148544312 lr:2.2493495727480106e-06 tokens_per_second_per_gpu:3823.945807707913
484
+ Step 484 | loss:1.0904960632324219 lr:2.22295997780633e-06 tokens_per_second_per_gpu:3312.442269740297
485
+ Step 485 | loss:1.0701911449432373 lr:2.196699141100894e-06 tokens_per_second_per_gpu:3242.470040399053
486
+ Step 486 | loss:0.9568579196929932 lr:2.170567703388454e-06 tokens_per_second_per_gpu:3632.613290114698
487
+ Step 487 | loss:1.0242254734039307 lr:2.1445663022684626e-06 tokens_per_second_per_gpu:3655.3842931955455
488
+ Step 488 | loss:1.0032799243927002 lr:2.1186955721675145e-06 tokens_per_second_per_gpu:3707.926602365104
489
+ Step 489 | loss:1.0453135967254639 lr:2.092956144323881e-06 tokens_per_second_per_gpu:2869.137361469829
490
+ Step 490 | loss:1.2181352376937866 lr:2.067348646772079e-06 tokens_per_second_per_gpu:3293.3080266736984
491
+ Step 491 | loss:1.000616192817688 lr:2.041873704327583e-06 tokens_per_second_per_gpu:3715.517462012418
492
+ Step 492 | loss:1.0542888641357422 lr:2.0165319385715436e-06 tokens_per_second_per_gpu:3830.4390289474027
493
+ Step 493 | loss:1.0258901119232178 lr:1.991323967835658e-06 tokens_per_second_per_gpu:3028.369221973012
494
+ Step 494 | loss:0.9300064444541931 lr:1.966250407187045e-06 tokens_per_second_per_gpu:3930.731519264302
495
+ Step 495 | loss:1.0918774604797363 lr:1.9413118684132744e-06 tokens_per_second_per_gpu:3462.5265384350296
496
+ Step 496 | loss:1.0639344453811646 lr:1.916508960007404e-06 tokens_per_second_per_gpu:3684.844309448488
497
+ Step 497 | loss:1.0016651153564453 lr:1.8918422871531677e-06 tokens_per_second_per_gpu:3102.599497609656
498
+ Step 498 | loss:1.0390342473983765 lr:1.8673124517101783e-06 tokens_per_second_per_gpu:3670.164801395993
499
+ Step 499 | loss:1.210891842842102 lr:1.842920052199263e-06 tokens_per_second_per_gpu:2854.611146514477
500
+ Step 500 | loss:1.1124374866485596 lr:1.818665683787849e-06 tokens_per_second_per_gpu:3270.6689217420526
501
+ Step 501 | loss:0.9718752503395081 lr:1.794549938275447e-06 tokens_per_second_per_gpu:4170.2565142940675
502
+ Step 502 | loss:1.1475387811660767 lr:1.7705734040792066e-06 tokens_per_second_per_gpu:3874.4289722672997
503
+ Step 503 | loss:0.9939885139465332 lr:1.7467366662195653e-06 tokens_per_second_per_gpu:4018.9157193209103
504
+ Step 504 | loss:1.060147762298584 lr:1.7230403063059688e-06 tokens_per_second_per_gpu:3884.5121540563578
505
+ Step 505 | loss:1.153907060623169 lr:1.6994849025226819e-06 tokens_per_second_per_gpu:3476.133514653969
506
+ Step 506 | loss:0.9286587238311768 lr:1.6760710296146827e-06 tokens_per_second_per_gpu:3320.296836386092
507
+ Step 507 | loss:1.1152493953704834 lr:1.6527992588736346e-06 tokens_per_second_per_gpu:2898.2952105804293
508
+ Step 508 | loss:1.1163067817687988 lr:1.6296701581239514e-06 tokens_per_second_per_gpu:3230.075370963527
509
+ Step 509 | loss:1.0790047645568848 lr:1.6066842917089406e-06 tokens_per_second_per_gpu:3259.4500583104896
510
+ Step 510 | loss:0.9519661068916321 lr:1.5838422204770304e-06 tokens_per_second_per_gpu:3670.231481091447
511
+ Step 511 | loss:0.992994487285614 lr:1.5611445017680991e-06 tokens_per_second_per_gpu:3799.1403263844904
512
+ Step 512 | loss:1.111997127532959 lr:1.5385916893998497e-06 tokens_per_second_per_gpu:3600.044046651026
513
+ Step 513 | loss:0.9813446998596191 lr:1.5161843336543285e-06 tokens_per_second_per_gpu:3486.542256282655
514
+ Step 514 | loss:1.058476209640503 lr:1.4939229812644679e-06 tokens_per_second_per_gpu:3923.009117188916
515
+ Step 515 | loss:0.9895375967025757 lr:1.4718081754007753e-06 tokens_per_second_per_gpu:3754.8223465225924
516
+ Step 516 | loss:1.079698085784912 lr:1.4498404556580525e-06 tokens_per_second_per_gpu:3386.562332279495
517
+ Step 517 | loss:1.0072911977767944 lr:1.4280203580422537e-06 tokens_per_second_per_gpu:3733.3749612885863
518
+ Step 518 | loss:1.0473989248275757 lr:1.4063484149573902e-06 tokens_per_second_per_gpu:3459.389528812242
519
+ Step 519 | loss:0.9953117370605469 lr:1.3848251551925475e-06 tokens_per_second_per_gpu:3253.9885230172995
520
+ Step 520 | loss:1.0296379327774048 lr:1.3634511039089819e-06 tokens_per_second_per_gpu:3249.8868818068645
521
+ Step 521 | loss:1.0001991987228394 lr:1.3422267826273052e-06 tokens_per_second_per_gpu:3295.052521167191
522
+ Step 522 | loss:1.0914515256881714 lr:1.321152709214763e-06 tokens_per_second_per_gpu:3418.445922794606
523
+ Step 523 | loss:1.0070934295654297 lr:1.3002293978725935e-06 tokens_per_second_per_gpu:3468.4665497148526
524
+ Step 524 | loss:0.991290807723999 lr:1.2794573591234868e-06 tokens_per_second_per_gpu:3655.7877168779905
525
+ Step 525 | loss:0.9848833680152893 lr:1.2588370997991262e-06 tokens_per_second_per_gpu:3221.000627253079
526
+ Step 526 | loss:1.0280349254608154 lr:1.2383691230278197e-06 tokens_per_second_per_gpu:3589.7060395621884
527
+ Step 527 | loss:1.0546156167984009 lr:1.2180539282222252e-06 tokens_per_second_per_gpu:3325.8819649807083
528
+ Step 528 | loss:0.9586650729179382 lr:1.1978920110671688e-06 tokens_per_second_per_gpu:3187.053627556894
529
+ Step 529 | loss:1.0085161924362183 lr:1.1778838635075415e-06 tokens_per_second_per_gpu:3512.8972208771875
530
+ Step 530 | loss:0.9839969873428345 lr:1.1580299737363037e-06 tokens_per_second_per_gpu:3345.8963367573137
531
+ Step 531 | loss:1.0338804721832275 lr:1.1383308261825748e-06 tokens_per_second_per_gpu:3539.808114559959
532
+ Step 532 | loss:1.1831824779510498 lr:1.1187869014997992e-06 tokens_per_second_per_gpu:3428.417581753952
533
+ Step 533 | loss:1.0245225429534912 lr:1.0993986765540403e-06 tokens_per_second_per_gpu:3259.282719856141
534
+ Step 534 | loss:1.022165060043335 lr:1.080166624412322e-06 tokens_per_second_per_gpu:3233.591194250682
535
+ Step 535 | loss:1.1499828100204468 lr:1.0610912143311096e-06 tokens_per_second_per_gpu:3336.2483024753146
536
+ Step 536 | loss:1.1073501110076904 lr:1.0421729117448334e-06 tokens_per_second_per_gpu:3675.8114899776433
537
+ Step 537 | loss:1.002111554145813 lr:1.0234121782545621e-06 tokens_per_second_per_gpu:3456.2249516746747
538
+ Step 538 | loss:0.988724946975708 lr:1.0048094716167097e-06 tokens_per_second_per_gpu:3571.358149202487
539
+ Step 539 | loss:1.0480667352676392 lr:9.863652457318939e-07 tokens_per_second_per_gpu:3861.236977577604
540
+ Step 540 | loss:1.0037941932678223 lr:9.68079950633841e-07 tokens_per_second_per_gpu:3795.781462671837
541
+ Step 541 | loss:1.046254277229309 lr:9.499540324784137e-07 tokens_per_second_per_gpu:3805.412799974102
542
+ Step 542 | loss:1.0471158027648926 lr:9.319879335327244e-07 tokens_per_second_per_gpu:3803.6177569269944
543
+ Step 543 | loss:1.0406914949417114 lr:9.141820921643429e-07 tokens_per_second_per_gpu:3697.1148852757697
544
+ Step 544 | loss:1.0133854150772095 lr:8.965369428306025e-07 tokens_per_second_per_gpu:2798.1724809223538
545
+ Step 545 | loss:0.9686719179153442 lr:8.790529160679947e-07 tokens_per_second_per_gpu:3461.1321675655404
546
+ Step 546 | loss:0.9938296675682068 lr:8.617304384816716e-07 tokens_per_second_per_gpu:3745.8216708473556
547
+ Step 547 | loss:1.035244345664978 lr:8.445699327350281e-07 tokens_per_second_per_gpu:3470.7464874423918
548
+ Step 548 | loss:1.0502572059631348 lr:8.275718175393959e-07 tokens_per_second_per_gpu:3148.8280613056
549
+ Step 549 | loss:0.9388608336448669 lr:8.10736507643825e-07 tokens_per_second_per_gpu:3426.3978598698536
550
+ Step 550 | loss:1.0639617443084717 lr:7.940644138249606e-07 tokens_per_second_per_gpu:3617.338930211458
551
+ Step 551 | loss:1.0359141826629639 lr:7.77555942877031e-07 tokens_per_second_per_gpu:3569.7245310019453
552
+ Step 552 | loss:1.0284898281097412 lr:7.612114976019013e-07 tokens_per_second_per_gpu:4029.1180368589817
553
+ Step 553 | loss:1.0912672281265259 lr:7.450314767992725e-07 tokens_per_second_per_gpu:3319.4160159719727
554
+ Step 554 | loss:1.0639758110046387 lr:7.290162752569235e-07 tokens_per_second_per_gpu:3216.1711606947674
555
+ Step 555 | loss:0.9887262582778931 lr:7.131662837411021e-07 tokens_per_second_per_gpu:3598.567838296343
556
+ Step 556 | loss:1.119757890701294 lr:6.974818889869706e-07 tokens_per_second_per_gpu:3828.479312234801
557
+ Step 557 | loss:0.9670307636260986 lr:6.819634736891881e-07 tokens_per_second_per_gpu:3205.8416970935805
558
+ Step 558 | loss:0.8852719664573669 lr:6.666114164925519e-07 tokens_per_second_per_gpu:3146.806660385405
559
+ Step 559 | loss:0.9563117623329163 lr:6.51426091982782e-07 tokens_per_second_per_gpu:3621.68107570963
560
+ Step 560 | loss:1.1427892446517944 lr:6.364078706773616e-07 tokens_per_second_per_gpu:2813.4747436797943
561
+ Step 561 | loss:1.0382424592971802 lr:6.215571190165073e-07 tokens_per_second_per_gpu:3323.0282890047843
562
+ Step 562 | loss:1.0987064838409424 lr:6.068741993542251e-07 tokens_per_second_per_gpu:3476.205922040534
563
+ Step 563 | loss:1.0931929349899292 lr:5.923594699494683e-07 tokens_per_second_per_gpu:3470.4852308972336
564
+ Step 564 | loss:0.9598541259765625 lr:5.780132849573988e-07 tokens_per_second_per_gpu:3137.686342427438
565
+ Step 565 | loss:0.9403815865516663 lr:5.638359944207421e-07 tokens_per_second_per_gpu:3236.405721258059
566
+ Step 566 | loss:0.9363535046577454 lr:5.498279442612497e-07 tokens_per_second_per_gpu:3329.627606082326
567
+ Step 567 | loss:1.1313518285751343 lr:5.359894762712558e-07 tokens_per_second_per_gpu:3055.2431705401013
568
+ Step 568 | loss:1.015571117401123 lr:5.223209281053415e-07 tokens_per_second_per_gpu:3085.9648675924795
569
+ Step 569 | loss:1.0932440757751465 lr:5.088226332720916e-07 tokens_per_second_per_gpu:3736.1741995184375
570
+ Step 570 | loss:1.132112979888916 lr:4.954949211259599e-07 tokens_per_second_per_gpu:3456.21209578924
571
+ Step 571 | loss:1.0685855150222778 lr:4.823381168592328e-07 tokens_per_second_per_gpu:3930.7959672235484
572
+ Step 572 | loss:0.9238698482513428 lr:4.693525414940933e-07 tokens_per_second_per_gpu:3142.089445727256
573
+ Step 573 | loss:0.9899613857269287 lr:4.565385118747922e-07 tokens_per_second_per_gpu:3651.4091222282746
574
+ Step 574 | loss:1.131011962890625 lr:4.4389634065990866e-07 tokens_per_second_per_gpu:3602.03410991543
575
+ Step 575 | loss:1.0533106327056885 lr:4.31426336314735e-07 tokens_per_second_per_gpu:3126.586366047976
576
+ Step 576 | loss:0.990755558013916 lr:4.191288031037316e-07 tokens_per_second_per_gpu:2938.069229206343
577
+ Step 577 | loss:1.085005521774292 lr:4.0700404108312493e-07 tokens_per_second_per_gpu:3634.120014496565
578
+ Step 578 | loss:1.0190842151641846 lr:3.9505234609356455e-07 tokens_per_second_per_gpu:3508.693290240972
579
+ Step 579 | loss:1.1383253335952759 lr:3.832740097529236e-07 tokens_per_second_per_gpu:3784.3207549193335
580
+ Step 580 | loss:1.0955651998519897 lr:3.7166931944916713e-07 tokens_per_second_per_gpu:3443.295664825699
581
+ Step 581 | loss:1.020769715309143 lr:3.602385583333537e-07 tokens_per_second_per_gpu:3670.778095752044
582
+ Step 582 | loss:0.9577903747558594 lr:3.4898200531271796e-07 tokens_per_second_per_gpu:3453.753723845044
583
+ Step 583 | loss:1.0107300281524658 lr:3.3789993504386747e-07 tokens_per_second_per_gpu:3322.5018350298633
584
+ Step 584 | loss:1.1570929288864136 lr:3.2699261792608183e-07 tokens_per_second_per_gpu:3546.864382690297
585
+ Step 585 | loss:0.9978330135345459 lr:3.162603200947156e-07 tokens_per_second_per_gpu:3258.2812026952033
586
+ Step 586 | loss:1.0092127323150635 lr:3.057033034147028e-07 tokens_per_second_per_gpu:3262.423307313303
587
+ Step 587 | loss:0.9077081680297852 lr:2.953218254741699e-07 tokens_per_second_per_gpu:3697.3960382773657
588
+ Step 588 | loss:1.018717885017395 lr:2.8511613957814655e-07 tokens_per_second_per_gpu:3288.5442807398276
589
+ Step 589 | loss:1.0892322063446045 lr:2.7508649474239145e-07 tokens_per_second_per_gpu:3393.0489554763767
590
+ Step 590 | loss:1.0539475679397583 lr:2.6523313568731026e-07 tokens_per_second_per_gpu:3643.0417749516805
591
+ Step 591 | loss:1.033966064453125 lr:2.555563028319885e-07 tokens_per_second_per_gpu:3534.4036121090435
592
+ Step 592 | loss:1.0088396072387695 lr:2.46056232288322e-07 tokens_per_second_per_gpu:3681.445022735837
593
+ Step 593 | loss:1.0584028959274292 lr:2.3673315585526072e-07 tokens_per_second_per_gpu:3240.2921371564175
594
+ Step 594 | loss:1.0147311687469482 lr:2.2758730101314684e-07 tokens_per_second_per_gpu:3359.430877297497
595
+ Step 595 | loss:1.1457427740097046 lr:2.1861889091817133e-07 tokens_per_second_per_gpu:3525.4677911202753
596
+ Step 596 | loss:1.1605221033096313 lr:2.0982814439691939e-07 tokens_per_second_per_gpu:3453.302876809475
597
+ Step 597 | loss:1.0375539064407349 lr:2.0121527594104295e-07 tokens_per_second_per_gpu:3577.2309087051176
598
+ Step 598 | loss:1.0658079385757446 lr:1.9278049570201654e-07 tokens_per_second_per_gpu:3381.451046136403
599
+ Step 599 | loss:1.0220245122909546 lr:1.8452400948601816e-07 tokens_per_second_per_gpu:3823.4144363605037
600
+ Step 600 | loss:0.9951660633087158 lr:1.7644601874889894e-07 tokens_per_second_per_gpu:3713.609461806826
601
+ Step 601 | loss:1.119286060333252 lr:1.6854672059127635e-07 tokens_per_second_per_gpu:3864.777481244428
602
+ Step 602 | loss:0.9337589740753174 lr:1.6082630775371976e-07 tokens_per_second_per_gpu:3744.03231292111
603
+ Step 603 | loss:1.072335958480835 lr:1.532849686120491e-07 tokens_per_second_per_gpu:3751.680025273069
604
+ Step 604 | loss:1.048313021659851 lr:1.459228871727386e-07 tokens_per_second_per_gpu:3755.7832090988786
605
+ Step 605 | loss:1.1349972486495972 lr:1.3874024306842453e-07 tokens_per_second_per_gpu:3677.1697050059897
606
+ Step 606 | loss:1.0434086322784424 lr:1.3173721155352868e-07 tokens_per_second_per_gpu:3393.0209085636616
607
+ Step 607 | loss:1.1098034381866455 lr:1.249139634999752e-07 tokens_per_second_per_gpu:3051.599689491925
608
+ Step 608 | loss:1.0198705196380615 lr:1.1827066539302378e-07 tokens_per_second_per_gpu:3692.11290536148
609
+ Step 609 | loss:1.0850074291229248 lr:1.1180747932721142e-07 tokens_per_second_per_gpu:3417.4471054705496
610
+ Step 610 | loss:1.187023639678955 lr:1.055245630023896e-07 tokens_per_second_per_gpu:3640.1749871015163
611
+ Step 611 | loss:1.0894136428833008 lr:9.942206971988416e-08 tokens_per_second_per_gpu:3595.7152958928796
612
+ Step 612 | loss:1.0794929265975952 lr:9.350014837874899e-08 tokens_per_second_per_gpu:3323.2475499204224
613
+ Step 613 | loss:0.9975882768630981 lr:8.775894347213654e-08 tokens_per_second_per_gpu:3418.980031398386
614
+ Step 614 | loss:1.220999836921692 lr:8.219859508376975e-08 tokens_per_second_per_gpu:3446.3651662681623
615
+ Step 615 | loss:1.0748283863067627 lr:7.681923888452902e-08 tokens_per_second_per_gpu:3373.2525132787046
616
+ Step 616 | loss:1.1435097455978394 lr:7.162100612913308e-08 tokens_per_second_per_gpu:3363.609081700875
617
+ Step 617 | loss:1.0882154703140259 lr:6.660402365294499e-08 tokens_per_second_per_gpu:3023.7098014074804
618
+ Step 618 | loss:0.9880639314651489 lr:6.176841386887205e-08 tokens_per_second_per_gpu:3988.401290174963
619
+ Step 619 | loss:1.1591405868530273 lr:5.71142947643824e-08 tokens_per_second_per_gpu:3171.9719503194547
620
+ Step 620 | loss:1.0383414030075073 lr:5.264177989862312e-08 tokens_per_second_per_gpu:3086.0139727006417
621
+ Step 621 | loss:1.1537160873413086 lr:4.8350978399650804e-08 tokens_per_second_per_gpu:3369.979075784975
622
+ Step 622 | loss:1.0627126693725586 lr:4.424199496177117e-08 tokens_per_second_per_gpu:3436.2757606077344
623
+ Step 623 | loss:1.0991474390029907 lr:4.0314929842979466e-08 tokens_per_second_per_gpu:3929.0458952761765
624
+ Step 624 | loss:1.0756220817565918 lr:3.6569878862519055e-08 tokens_per_second_per_gpu:3385.603493456278
625
+ Step 625 | loss:1.0126550197601318 lr:3.300693339854083e-08 tokens_per_second_per_gpu:3796.7319365547914
626
+ Step 626 | loss:1.0999494791030884 lr:2.9626180385873282e-08 tokens_per_second_per_gpu:3491.292618600503
627
+ Step 627 | loss:1.0032886266708374 lr:2.64277023139034e-08 tokens_per_second_per_gpu:3434.2746828936133
628
+ Step 628 | loss:0.9731086492538452 lr:2.3411577224563273e-08 tokens_per_second_per_gpu:3992.9430945768536
629
+ Step 629 | loss:1.029701590538025 lr:2.0577878710424104e-08 tokens_per_second_per_gpu:3954.800918281961
630
+ Step 630 | loss:0.9682028889656067 lr:1.7926675912902644e-08 tokens_per_second_per_gpu:3079.0321478215965
631
+ Step 631 | loss:1.2128889560699463 lr:1.54580335205734e-08 tokens_per_second_per_gpu:2766.3162516442967
632
+ Step 632 | loss:1.0580332279205322 lr:1.3172011767589865e-08 tokens_per_second_per_gpu:3414.5038307314135
633
+ Step 633 | loss:0.9746403098106384 lr:1.1068666432215713e-08 tokens_per_second_per_gpu:3261.521524950618
634
+ Step 634 | loss:0.9512935280799866 lr:9.148048835462552e-09 tokens_per_second_per_gpu:3716.130203693814
635
+ Step 635 | loss:0.9694274663925171 lr:7.410205839840922e-09 tokens_per_second_per_gpu:3638.283430143105
636
+ Step 636 | loss:0.9943816065788269 lr:5.8551798482112116e-09 tokens_per_second_per_gpu:3984.6815410831828
637
+ Step 637 | loss:1.019755244255066 lr:4.483008802754485e-09 tokens_per_second_per_gpu:3426.6431524886534
638
+ Step 638 | loss:0.9445756673812866 lr:3.2937261840423894e-09 tokens_per_second_per_gpu:3651.1875501336294
639
+ Step 639 | loss:1.0361696481704712 lr:2.2873610102253083e-09 tokens_per_second_per_gpu:2899.609547551191
640
+ Step 640 | loss:1.0265421867370605 lr:1.4639378363187629e-09 tokens_per_second_per_gpu:3146.508069944556
641
+ Step 641 | loss:0.9879869222640991 lr:8.234767536080545e-10 tokens_per_second_per_gpu:3718.1335290345437
642
+ Step 642 | loss:1.0564134120941162 lr:3.659933891569933e-10 tokens_per_second_per_gpu:3539.4915099356376
643
+ Step 643 | loss:1.1678091287612915 lr:9.149890542570205e-11 tokens_per_second_per_gpu:2749.888827785128
644
+ Step 644 | loss:0.9817880392074585 lr:0.0 tokens_per_second_per_gpu:3425.3032445871813
model.safetensors.index.json ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 16068960256
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00004-of-00004.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
18
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
19
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
20
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
26
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
27
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
28
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
29
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
30
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
31
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
32
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
33
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
38
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
39
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
41
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
42
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
43
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
44
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
50
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
51
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
53
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
54
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
56
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
62
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
63
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
65
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
66
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
67
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
68
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
74
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
75
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
77
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
78
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
80
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
86
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
87
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
89
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
90
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
91
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
92
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
93
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
98
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
99
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
101
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
102
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
103
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
104
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
106
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
107
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
108
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
109
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
110
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
111
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
113
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
114
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
115
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
116
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
117
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
118
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
119
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
120
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
121
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
122
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
123
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
124
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
125
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
126
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
127
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
128
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
129
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
130
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
131
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
132
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
133
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
134
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
135
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
136
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
137
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
138
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
139
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
140
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
141
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
143
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
144
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
146
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
147
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
148
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
149
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
150
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
151
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
152
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
153
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
154
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
155
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
156
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
157
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
158
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
159
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
160
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
161
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
162
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
163
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
164
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
165
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
170
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
171
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
172
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
173
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
174
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
175
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
176
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
182
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
183
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
184
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
185
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
186
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
187
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
188
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
194
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
195
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
197
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
198
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
199
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
200
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
201
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
206
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
207
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
209
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
210
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
211
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
212
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
213
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
216
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
217
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
218
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
219
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
220
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
221
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
222
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
223
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
224
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
225
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
230
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
231
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
233
+ "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
234
+ "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
235
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
236
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
237
+ "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
238
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
239
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
241
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
242
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
243
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
244
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
245
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
246
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
247
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
248
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
249
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
250
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
251
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
252
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
253
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
254
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
255
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
256
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
257
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
258
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
259
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
260
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
261
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
262
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
263
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
264
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
265
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
266
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
267
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
268
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
269
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
270
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
271
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
272
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
273
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
274
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
275
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
276
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
277
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
278
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
279
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
280
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
281
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
282
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
283
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
284
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
285
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
286
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
287
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
288
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
289
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
290
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
291
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
292
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
293
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
294
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
295
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
296
+ "model.norm.weight": "model-00004-of-00004.safetensors"
297
+ }
298
+ }
output.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd5b29e86ced8a007dbaa7008b11de967a8bf12ce60522973f4fe7e291bde71
3
+ size 8429513848
special_tokens_map.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|begin_of_text|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|eot_id|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ }
16
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
training_config_phase3.yaml ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Config for multi-device full finetuning in full_finetune_distributed.py
2
+ # using a Llama3.1 8B model (torchtune.models.llama3_1.llama3_1_s_8b)
3
+ #
4
+ # This config assumes that you've run the following command before launching
5
+ # this run:
6
+ # tune download meta-llama/Meta-Llama-3-8B-Instruct --output-dir /tmp/Meta-Llama-3-8B-Instruct --hf-token <HF_TOKEN>
7
+ #
8
+ # To launch on 4 devices, run the following command from root:
9
+ # tune run --nproc_per_node 4 full_finetune_distributed --config llama3/8B_full
10
+ #
11
+ # You can add specific overrides through the command line. For example
12
+ # to override the checkpointer directory while launching training
13
+ # you can run:
14
+ # tune run --nproc_per_node 4 full_finetune_distributed --config llama3/8B_full checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
15
+ #
16
+ # This config works best when the model is being fine-tuned on 2+ GPUs.
17
+ # Single device full finetuning requires more memory optimizations. It's
18
+ # best to use 8B_full_single_device.yaml for those cases.
19
+ # Tokenizer
20
+ tokenizer:
21
+ _component_: torchtune.models.llama3.llama3_s_tokenizer
22
+ path: ../model_zoo/tokenizer.model
23
+ max_seq_len: 4096
24
+
25
+ # Dataset
26
+ dataset:
27
+ _component_: torchtune.datasets.chat_dataset
28
+ source: jan-hq/mixed-instruction-speech-multiturn-noise-clean
29
+ conversation_style: openai
30
+ max_seq_len: 4096
31
+ split: train
32
+ train_on_input: True
33
+
34
+ seed: 42
35
+ shuffle: False
36
+ # Model Arguments
37
+ model:
38
+ _component_: torchtune.models.llama3_1.llama3_1_s_8b
39
+ # path: model_zoo/Llama3.1_s_8b_init
40
+ checkpointer:
41
+ _component_: torchtune.utils.FullModelHFCheckpointerSaveSteps
42
+ checkpoint_dir: ../model_zoo/llama3.1-s-cp-7000
43
+ checkpoint_files: [
44
+ model-00001-of-00004.safetensors,
45
+ model-00002-of-00004.safetensors,
46
+ model-00003-of-00004.safetensors,
47
+ model-00004-of-00004.safetensors,
48
+ ]
49
+ recipe_checkpoint: null
50
+ output_dir: ../model_zoo/llama3-s-instruct-lr-3e-5
51
+ model_type: LLAMA3
52
+ resume_from_checkpoint: False
53
+ save_every_n_steps: 200
54
+ max_checkpoints: 3
55
+ # Fine-tuning arguments
56
+ batch_size: 4
57
+ epochs: 1
58
+ max_steps_per_epoch: null
59
+ gradient_accumulation_steps: 8
60
+ compile: False
61
+ # Optimizer and Scheduler
62
+ optimizer:
63
+ _component_: torch.optim.AdamW # to use Adam-mini instead, set this to torchtune.modules.optimizer.Adam_mini
64
+ weight_decay: 0.005
65
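+ lr: 1.5e-5 # NOTE: the output/log directory names in this config end in "lr-3e-5", but the learning rate actually used is 1.5e-5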
66
+ fused: True
67
+ lr_scheduler:
68
+ _component_: torchtune.modules.get_cosine_schedule_with_warmup
69
+ num_warmup_steps: 8
70
+
71
+ loss:
72
+ _component_: torch.nn.CrossEntropyLoss
73
+
74
+ fsdp:
75
+ cpu_offload: False
76
+
77
+ # Training env
78
+ device: cuda
79
+ dtype: bf16
80
+
81
+ # Memory management
82
+ enable_activation_checkpointing: True
83
+ memory_efficient_fsdp_wrap: True
84
+ ac_mode: 'selective'
85
+
86
+
87
+ # Logging
88
+ metric_logger:
89
+ _component_: torchtune.utils.metric_logging.DiskLogger
90
+ log_dir: ${output_dir}
91
+ output_dir: ../model_zoo/Llama3-instruct-log-lr-3e-5/
92
+ log_every_n_steps: 1
93
+ log_peak_memory_stats: False