Upload 11 files

Browse files

Files changed (10) hide show

model.safetensors +1 -1
optimizer.pt +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
special_tokens_map.json +15 -0
spiece.model +3 -0
tokenizer.json +0 -0
tokenizer_config.json +57 -0
trainer_state.json +1459 -0
training_args.bin +3 -0

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:49f4c45b436751ee83d95fc9972e79397b1f6b7b54985723a6d1e342ef4fc161
 size 46743912

 version https://git-lfs.github.com/spec/v1
+oid sha256:5e96e57fb4c2ffe175277ebb65b5cd16a44a7114097c0e2e688b55fef924b13a
 size 46743912

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0a504f4b6efd623f8cfa8d97bf7ceef00d87744ec09b0ee121c51cbf548033cd
+size 93502808

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b5109e08295ec457bd444c73df2b91ea091459c68e440d157572b04e7050331a
+size 13553

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7477571713a4a349a0ecb64fd21f32fe71fd5559a2cbd3c32e4c358dca8bc7ce
+size 627

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "bos_token": "[CLS]",
+  "cls_token": "[CLS]",
+  "eos_token": "[SEP]",
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "sep_token": "[SEP]",
+  "unk_token": "<unk>"
+}

spiece.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fefb02b667a6c5c2fe27602d28e5fb3428f66ab89c7d6f388e7c8d44a02d0336
+size 760289

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "[MASK]",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "[CLS]",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "eos_token": "[SEP]",
+  "keep_accents": false,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "pad_token": "<pad>",
+  "remove_space": true,
+  "sep_token": "[SEP]",
+  "tokenizer_class": "AlbertTokenizer",
+  "unk_token": "<unk>"
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1459 @@

+{
+  "best_metric": 0.29854172468185425,
+  "best_model_checkpoint": "./results/checkpoint-2026",
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 2026,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01,
+      "grad_norm": 15.60984992980957,
+      "learning_rate": 4.990128331688055e-05,
+      "loss": 0.5627,
+      "step": 10
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 1.2870399951934814,
+      "learning_rate": 4.9802566633761114e-05,
+      "loss": 0.2714,
+      "step": 20
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 123.16035461425781,
+      "learning_rate": 4.970384995064166e-05,
+      "loss": 0.5078,
+      "step": 30
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 11.994287490844727,
+      "learning_rate": 4.960513326752221e-05,
+      "loss": 0.4197,
+      "step": 40
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 3.41953182220459,
+      "learning_rate": 4.950641658440277e-05,
+      "loss": 0.441,
+      "step": 50
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 26.95296287536621,
+      "learning_rate": 4.940769990128332e-05,
+      "loss": 0.6958,
+      "step": 60
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 28.05646324157715,
+      "learning_rate": 4.930898321816387e-05,
+      "loss": 0.3109,
+      "step": 70
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 28.755550384521484,
+      "learning_rate": 4.921026653504443e-05,
+      "loss": 0.5321,
+      "step": 80
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 46.09171676635742,
+      "learning_rate": 4.9111549851924976e-05,
+      "loss": 0.4216,
+      "step": 90
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 3.234527587890625,
+      "learning_rate": 4.901283316880553e-05,
+      "loss": 0.3427,
+      "step": 100
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 36.78240966796875,
+      "learning_rate": 4.891411648568609e-05,
+      "loss": 0.5259,
+      "step": 110
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.4952964782714844,
+      "learning_rate": 4.8815399802566636e-05,
+      "loss": 0.525,
+      "step": 120
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 6.107447147369385,
+      "learning_rate": 4.8716683119447184e-05,
+      "loss": 0.5349,
+      "step": 130
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 17.599472045898438,
+      "learning_rate": 4.861796643632775e-05,
+      "loss": 0.3194,
+      "step": 140
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 67.53023529052734,
+      "learning_rate": 4.8519249753208296e-05,
+      "loss": 0.4738,
+      "step": 150
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 61.95085525512695,
+      "learning_rate": 4.8420533070088844e-05,
+      "loss": 0.4151,
+      "step": 160
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 377.9793701171875,
+      "learning_rate": 4.83218163869694e-05,
+      "loss": 0.3219,
+      "step": 170
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 9.71474838256836,
+      "learning_rate": 4.8223099703849955e-05,
+      "loss": 0.2931,
+      "step": 180
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 54.442691802978516,
+      "learning_rate": 4.8124383020730504e-05,
+      "loss": 0.3802,
+      "step": 190
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 111.41837310791016,
+      "learning_rate": 4.802566633761106e-05,
+      "loss": 0.4909,
+      "step": 200
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 28.207542419433594,
+      "learning_rate": 4.792694965449161e-05,
+      "loss": 0.392,
+      "step": 210
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 65.766357421875,
+      "learning_rate": 4.7828232971372164e-05,
+      "loss": 0.3002,
+      "step": 220
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 127.14469909667969,
+      "learning_rate": 4.772951628825272e-05,
+      "loss": 0.3654,
+      "step": 230
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.19254250824451447,
+      "learning_rate": 4.763079960513327e-05,
+      "loss": 0.2953,
+      "step": 240
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 30.106840133666992,
+      "learning_rate": 4.753208292201382e-05,
+      "loss": 0.4019,
+      "step": 250
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 4.884279727935791,
+      "learning_rate": 4.743336623889438e-05,
+      "loss": 0.4152,
+      "step": 260
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 71.0513916015625,
+      "learning_rate": 4.733464955577493e-05,
+      "loss": 0.6281,
+      "step": 270
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.381753921508789,
+      "learning_rate": 4.723593287265548e-05,
+      "loss": 0.3224,
+      "step": 280
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.0361205339431763,
+      "learning_rate": 4.713721618953603e-05,
+      "loss": 0.6375,
+      "step": 290
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.2360197305679321,
+      "learning_rate": 4.703849950641659e-05,
+      "loss": 0.419,
+      "step": 300
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 188.26495361328125,
+      "learning_rate": 4.693978282329714e-05,
+      "loss": 0.5303,
+      "step": 310
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.14256739616394043,
+      "learning_rate": 4.684106614017769e-05,
+      "loss": 0.2615,
+      "step": 320
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 63.93450927734375,
+      "learning_rate": 4.674234945705824e-05,
+      "loss": 0.4678,
+      "step": 330
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 31.07522201538086,
+      "learning_rate": 4.66436327739388e-05,
+      "loss": 0.607,
+      "step": 340
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 12.982345581054688,
+      "learning_rate": 4.654491609081935e-05,
+      "loss": 0.189,
+      "step": 350
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 14.37088394165039,
+      "learning_rate": 4.64461994076999e-05,
+      "loss": 0.4341,
+      "step": 360
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 8.185881614685059,
+      "learning_rate": 4.634748272458046e-05,
+      "loss": 0.5132,
+      "step": 370
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.082980751991272,
+      "learning_rate": 4.624876604146101e-05,
+      "loss": 0.3491,
+      "step": 380
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 12.5576753616333,
+      "learning_rate": 4.615004935834156e-05,
+      "loss": 0.7587,
+      "step": 390
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 5.933102130889893,
+      "learning_rate": 4.605133267522212e-05,
+      "loss": 0.579,
+      "step": 400
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.3454967737197876,
+      "learning_rate": 4.5952615992102666e-05,
+      "loss": 0.1932,
+      "step": 410
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 17.171228408813477,
+      "learning_rate": 4.585389930898322e-05,
+      "loss": 0.3217,
+      "step": 420
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 3.5590412616729736,
+      "learning_rate": 4.575518262586377e-05,
+      "loss": 0.2279,
+      "step": 430
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 17.661069869995117,
+      "learning_rate": 4.5656465942744326e-05,
+      "loss": 0.2676,
+      "step": 440
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 48.93571853637695,
+      "learning_rate": 4.5557749259624875e-05,
+      "loss": 0.5449,
+      "step": 450
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 12.7286376953125,
+      "learning_rate": 4.545903257650543e-05,
+      "loss": 0.5127,
+      "step": 460
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 51.88860321044922,
+      "learning_rate": 4.5360315893385986e-05,
+      "loss": 0.4794,
+      "step": 470
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 18.063552856445312,
+      "learning_rate": 4.5261599210266535e-05,
+      "loss": 0.3728,
+      "step": 480
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 2.861877918243408,
+      "learning_rate": 4.516288252714709e-05,
+      "loss": 0.3038,
+      "step": 490
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 6.476074695587158,
+      "learning_rate": 4.5064165844027646e-05,
+      "loss": 0.2592,
+      "step": 500
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 62.48997497558594,
+      "learning_rate": 4.4965449160908195e-05,
+      "loss": 0.4779,
+      "step": 510
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.5959272384643555,
+      "learning_rate": 4.486673247778875e-05,
+      "loss": 0.3865,
+      "step": 520
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 32.949684143066406,
+      "learning_rate": 4.47680157946693e-05,
+      "loss": 0.5077,
+      "step": 530
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 8.09738826751709,
+      "learning_rate": 4.4669299111549855e-05,
+      "loss": 0.3352,
+      "step": 540
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 23.277297973632812,
+      "learning_rate": 4.457058242843041e-05,
+      "loss": 0.5204,
+      "step": 550
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 89.32869720458984,
+      "learning_rate": 4.447186574531096e-05,
+      "loss": 0.3888,
+      "step": 560
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 2.6795363426208496,
+      "learning_rate": 4.437314906219151e-05,
+      "loss": 0.5252,
+      "step": 570
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 37.583744049072266,
+      "learning_rate": 4.427443237907207e-05,
+      "loss": 0.3881,
+      "step": 580
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.256844162940979,
+      "learning_rate": 4.417571569595262e-05,
+      "loss": 0.1872,
+      "step": 590
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 4.5737786293029785,
+      "learning_rate": 4.407699901283317e-05,
+      "loss": 0.2536,
+      "step": 600
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 45.64347839355469,
+      "learning_rate": 4.3978282329713724e-05,
+      "loss": 0.3777,
+      "step": 610
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.4227633774280548,
+      "learning_rate": 4.387956564659428e-05,
+      "loss": 0.2028,
+      "step": 620
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 4.602664947509766,
+      "learning_rate": 4.378084896347483e-05,
+      "loss": 0.5563,
+      "step": 630
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.7803702354431152,
+      "learning_rate": 4.3682132280355384e-05,
+      "loss": 0.3636,
+      "step": 640
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 70.02734375,
+      "learning_rate": 4.358341559723593e-05,
+      "loss": 0.4558,
+      "step": 650
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 39.45964050292969,
+      "learning_rate": 4.348469891411649e-05,
+      "loss": 0.4592,
+      "step": 660
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 22.5675106048584,
+      "learning_rate": 4.3385982230997044e-05,
+      "loss": 0.3082,
+      "step": 670
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 4.789850234985352,
+      "learning_rate": 4.328726554787759e-05,
+      "loss": 0.2404,
+      "step": 680
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 4.671356678009033,
+      "learning_rate": 4.318854886475814e-05,
+      "loss": 0.2864,
+      "step": 690
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.803113341331482,
+      "learning_rate": 4.3089832181638704e-05,
+      "loss": 0.2627,
+      "step": 700
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.38143932819366455,
+      "learning_rate": 4.299111549851925e-05,
+      "loss": 0.1678,
+      "step": 710
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.396694540977478,
+      "learning_rate": 4.28923988153998e-05,
+      "loss": 0.181,
+      "step": 720
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 26.724634170532227,
+      "learning_rate": 4.279368213228036e-05,
+      "loss": 0.5595,
+      "step": 730
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 179.3428497314453,
+      "learning_rate": 4.269496544916091e-05,
+      "loss": 0.3613,
+      "step": 740
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 4.721936225891113,
+      "learning_rate": 4.259624876604146e-05,
+      "loss": 0.4182,
+      "step": 750
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.8950241804122925,
+      "learning_rate": 4.249753208292202e-05,
+      "loss": 0.3623,
+      "step": 760
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 5.388864994049072,
+      "learning_rate": 4.2398815399802566e-05,
+      "loss": 0.4246,
+      "step": 770
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.41123124957084656,
+      "learning_rate": 4.230009871668312e-05,
+      "loss": 0.2425,
+      "step": 780
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.3556106388568878,
+      "learning_rate": 4.220138203356368e-05,
+      "loss": 0.3751,
+      "step": 790
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.899945080280304,
+      "learning_rate": 4.2102665350444226e-05,
+      "loss": 0.3994,
+      "step": 800
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 4.583869934082031,
+      "learning_rate": 4.2003948667324774e-05,
+      "loss": 0.3681,
+      "step": 810
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.3905455768108368,
+      "learning_rate": 4.190523198420534e-05,
+      "loss": 0.1491,
+      "step": 820
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 36.8359260559082,
+      "learning_rate": 4.1806515301085886e-05,
+      "loss": 0.2609,
+      "step": 830
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 34.53616714477539,
+      "learning_rate": 4.1707798617966434e-05,
+      "loss": 0.5495,
+      "step": 840
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 14.104715347290039,
+      "learning_rate": 4.160908193484699e-05,
+      "loss": 0.33,
+      "step": 850
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 30.295068740844727,
+      "learning_rate": 4.1510365251727546e-05,
+      "loss": 1.0008,
+      "step": 860
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 93.3653793334961,
+      "learning_rate": 4.1411648568608094e-05,
+      "loss": 1.0401,
+      "step": 870
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 114.31365966796875,
+      "learning_rate": 4.131293188548865e-05,
+      "loss": 0.4156,
+      "step": 880
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 134.54774475097656,
+      "learning_rate": 4.12142152023692e-05,
+      "loss": 0.5463,
+      "step": 890
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 3.021076202392578,
+      "learning_rate": 4.1115498519249754e-05,
+      "loss": 0.2947,
+      "step": 900
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 9.884215354919434,
+      "learning_rate": 4.101678183613031e-05,
+      "loss": 0.3674,
+      "step": 910
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 167.9898223876953,
+      "learning_rate": 4.091806515301086e-05,
+      "loss": 0.4516,
+      "step": 920
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 34.41691207885742,
+      "learning_rate": 4.0819348469891414e-05,
+      "loss": 0.504,
+      "step": 930
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 10.135024070739746,
+      "learning_rate": 4.072063178677197e-05,
+      "loss": 0.2834,
+      "step": 940
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.0688509941101074,
+      "learning_rate": 4.062191510365252e-05,
+      "loss": 0.3188,
+      "step": 950
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 5.052711009979248,
+      "learning_rate": 4.052319842053307e-05,
+      "loss": 0.3693,
+      "step": 960
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.37648436427116394,
+      "learning_rate": 4.042448173741363e-05,
+      "loss": 0.1054,
+      "step": 970
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 18.3348445892334,
+      "learning_rate": 4.032576505429418e-05,
+      "loss": 0.3397,
+      "step": 980
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 10.808074951171875,
+      "learning_rate": 4.022704837117473e-05,
+      "loss": 0.3628,
+      "step": 990
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 141.88064575195312,
+      "learning_rate": 4.012833168805528e-05,
+      "loss": 0.9269,
+      "step": 1000
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.555182695388794,
+      "learning_rate": 4.002961500493584e-05,
+      "loss": 0.1197,
+      "step": 1010
+    },
+    {
+      "epoch": 1.0,
+      "eval_balanced accuracy": 0.917760474601409,
+      "eval_f1": 0.9176981176842771,
+      "eval_loss": 0.40740078687667847,
+      "eval_precision": 0.9176448492816227,
+      "eval_recall": 0.917760474601409,
+      "eval_runtime": 5.5647,
+      "eval_samples_per_second": 161.733,
+      "eval_steps_per_second": 10.243,
+      "step": 1013
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 4.549361228942871,
+      "learning_rate": 3.993089832181639e-05,
+      "loss": 0.5231,
+      "step": 1020
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 5.699501991271973,
+      "learning_rate": 3.983218163869694e-05,
+      "loss": 0.4139,
+      "step": 1030
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 2.1153147220611572,
+      "learning_rate": 3.973346495557749e-05,
+      "loss": 0.2718,
+      "step": 1040
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 5.258866310119629,
+      "learning_rate": 3.963474827245805e-05,
+      "loss": 0.3115,
+      "step": 1050
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 13.351494789123535,
+      "learning_rate": 3.95360315893386e-05,
+      "loss": 0.3992,
+      "step": 1060
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 9.7189359664917,
+      "learning_rate": 3.943731490621915e-05,
+      "loss": 0.1346,
+      "step": 1070
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 5.006288051605225,
+      "learning_rate": 3.933859822309971e-05,
+      "loss": 0.3118,
+      "step": 1080
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 7.094489574432373,
+      "learning_rate": 3.923988153998026e-05,
+      "loss": 0.1807,
+      "step": 1090
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 4.784492492675781,
+      "learning_rate": 3.914116485686081e-05,
+      "loss": 0.3839,
+      "step": 1100
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 1.5643423795700073,
+      "learning_rate": 3.904244817374136e-05,
+      "loss": 0.1729,
+      "step": 1110
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 33.595703125,
+      "learning_rate": 3.8943731490621916e-05,
+      "loss": 0.1749,
+      "step": 1120
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.5887395143508911,
+      "learning_rate": 3.884501480750247e-05,
+      "loss": 0.2513,
+      "step": 1130
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 22.53057289123535,
+      "learning_rate": 3.874629812438302e-05,
+      "loss": 0.2858,
+      "step": 1140
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 52.66212463378906,
+      "learning_rate": 3.8647581441263576e-05,
+      "loss": 0.1328,
+      "step": 1150
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 5.8826117515563965,
+      "learning_rate": 3.8548864758144125e-05,
+      "loss": 0.3296,
+      "step": 1160
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 10.208854675292969,
+      "learning_rate": 3.845014807502468e-05,
+      "loss": 0.1743,
+      "step": 1170
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 5.222922325134277,
+      "learning_rate": 3.8351431391905236e-05,
+      "loss": 0.2482,
+      "step": 1180
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.3885471224784851,
+      "learning_rate": 3.8252714708785785e-05,
+      "loss": 0.3651,
+      "step": 1190
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 68.36416625976562,
+      "learning_rate": 3.815399802566634e-05,
+      "loss": 0.5256,
+      "step": 1200
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 103.91950988769531,
+      "learning_rate": 3.8055281342546896e-05,
+      "loss": 0.2199,
+      "step": 1210
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.17333897948265076,
+      "learning_rate": 3.7956564659427445e-05,
+      "loss": 0.126,
+      "step": 1220
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 41.487117767333984,
+      "learning_rate": 3.7857847976308e-05,
+      "loss": 0.2293,
+      "step": 1230
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.1527445763349533,
+      "learning_rate": 3.775913129318855e-05,
+      "loss": 0.2754,
+      "step": 1240
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.3720811605453491,
+      "learning_rate": 3.7660414610069105e-05,
+      "loss": 0.1904,
+      "step": 1250
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.2801426947116852,
+      "learning_rate": 3.756169792694966e-05,
+      "loss": 0.2894,
+      "step": 1260
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.912218451499939,
+      "learning_rate": 3.746298124383021e-05,
+      "loss": 0.4345,
+      "step": 1270
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.25501587986946106,
+      "learning_rate": 3.736426456071076e-05,
+      "loss": 0.2249,
+      "step": 1280
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 19.25888442993164,
+      "learning_rate": 3.7265547877591314e-05,
+      "loss": 0.4532,
+      "step": 1290
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 7.447415351867676,
+      "learning_rate": 3.716683119447187e-05,
+      "loss": 0.419,
+      "step": 1300
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 1.2623952627182007,
+      "learning_rate": 3.706811451135242e-05,
+      "loss": 0.3596,
+      "step": 1310
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 49.27845001220703,
+      "learning_rate": 3.6969397828232974e-05,
+      "loss": 0.1807,
+      "step": 1320
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 8.055280685424805,
+      "learning_rate": 3.687068114511353e-05,
+      "loss": 0.1877,
+      "step": 1330
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.24801558256149292,
+      "learning_rate": 3.677196446199408e-05,
+      "loss": 0.1906,
+      "step": 1340
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 0.37148603796958923,
+      "learning_rate": 3.6673247778874634e-05,
+      "loss": 0.6613,
+      "step": 1350
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 2.0603933334350586,
+      "learning_rate": 3.657453109575518e-05,
+      "loss": 0.1717,
+      "step": 1360
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 1.4730746746063232,
+      "learning_rate": 3.647581441263574e-05,
+      "loss": 0.3606,
+      "step": 1370
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 11.129170417785645,
+      "learning_rate": 3.6377097729516294e-05,
+      "loss": 0.4668,
+      "step": 1380
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 107.76866912841797,
+      "learning_rate": 3.627838104639684e-05,
+      "loss": 0.4248,
+      "step": 1390
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.4574478566646576,
+      "learning_rate": 3.617966436327739e-05,
+      "loss": 0.2463,
+      "step": 1400
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 9.523133277893066,
+      "learning_rate": 3.6080947680157954e-05,
+      "loss": 0.2986,
+      "step": 1410
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 724.2791137695312,
+      "learning_rate": 3.59822309970385e-05,
+      "loss": 0.1994,
+      "step": 1420
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 0.495822012424469,
+      "learning_rate": 3.588351431391905e-05,
+      "loss": 0.405,
+      "step": 1430
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.7077971696853638,
+      "learning_rate": 3.578479763079961e-05,
+      "loss": 0.3258,
+      "step": 1440
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 0.471545934677124,
+      "learning_rate": 3.568608094768016e-05,
+      "loss": 0.3381,
+      "step": 1450
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 160.64279174804688,
+      "learning_rate": 3.558736426456071e-05,
+      "loss": 0.4319,
+      "step": 1460
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 213.93475341796875,
+      "learning_rate": 3.548864758144127e-05,
+      "loss": 0.3506,
+      "step": 1470
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 0.5124903917312622,
+      "learning_rate": 3.5389930898321816e-05,
+      "loss": 0.261,
+      "step": 1480
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 0.2033979296684265,
+      "learning_rate": 3.529121421520237e-05,
+      "loss": 0.3329,
+      "step": 1490
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.14042626321315765,
+      "learning_rate": 3.519249753208293e-05,
+      "loss": 0.198,
+      "step": 1500
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 0.052474942058324814,
+      "learning_rate": 3.5093780848963476e-05,
+      "loss": 0.3291,
+      "step": 1510
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.7498096823692322,
+      "learning_rate": 3.4995064165844024e-05,
+      "loss": 0.5893,
+      "step": 1520
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 56.467071533203125,
+      "learning_rate": 3.489634748272459e-05,
+      "loss": 0.22,
+      "step": 1530
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 5.047154903411865,
+      "learning_rate": 3.4797630799605136e-05,
+      "loss": 0.3128,
+      "step": 1540
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 0.24173791706562042,
+      "learning_rate": 3.4698914116485684e-05,
+      "loss": 0.2632,
+      "step": 1550
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 0.23745213449001312,
+      "learning_rate": 3.460019743336624e-05,
+      "loss": 0.1316,
+      "step": 1560
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 0.3697431683540344,
+      "learning_rate": 3.4501480750246796e-05,
+      "loss": 0.3162,
+      "step": 1570
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 125.36990356445312,
+      "learning_rate": 3.4402764067127344e-05,
+      "loss": 0.7252,
+      "step": 1580
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 30.01531410217285,
+      "learning_rate": 3.43040473840079e-05,
+      "loss": 0.567,
+      "step": 1590
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 44.524818420410156,
+      "learning_rate": 3.420533070088845e-05,
+      "loss": 0.4531,
+      "step": 1600
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 133.4363555908203,
+      "learning_rate": 3.4106614017769004e-05,
+      "loss": 0.4438,
+      "step": 1610
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 1119.47509765625,
+      "learning_rate": 3.400789733464956e-05,
+      "loss": 0.3973,
+      "step": 1620
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 4.369329929351807,
+      "learning_rate": 3.390918065153011e-05,
+      "loss": 0.482,
+      "step": 1630
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 28.413909912109375,
+      "learning_rate": 3.381046396841066e-05,
+      "loss": 0.3454,
+      "step": 1640
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 76.58002471923828,
+      "learning_rate": 3.371174728529122e-05,
+      "loss": 0.2663,
+      "step": 1650
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 597.3102416992188,
+      "learning_rate": 3.361303060217177e-05,
+      "loss": 0.155,
+      "step": 1660
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 24.984447479248047,
+      "learning_rate": 3.351431391905232e-05,
+      "loss": 0.2535,
+      "step": 1670
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 30.53813934326172,
+      "learning_rate": 3.341559723593287e-05,
+      "loss": 0.315,
+      "step": 1680
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 1.5513701438903809,
+      "learning_rate": 3.331688055281343e-05,
+      "loss": 0.3617,
+      "step": 1690
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 3.676360845565796,
+      "learning_rate": 3.321816386969398e-05,
+      "loss": 0.6472,
+      "step": 1700
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 23.96689796447754,
+      "learning_rate": 3.311944718657453e-05,
+      "loss": 0.5382,
+      "step": 1710
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 18.116992950439453,
+      "learning_rate": 3.302073050345508e-05,
+      "loss": 0.345,
+      "step": 1720
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 4.786412239074707,
+      "learning_rate": 3.292201382033564e-05,
+      "loss": 0.3599,
+      "step": 1730
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 2.5227644443511963,
+      "learning_rate": 3.282329713721619e-05,
+      "loss": 0.4313,
+      "step": 1740
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 4.462274074554443,
+      "learning_rate": 3.272458045409674e-05,
+      "loss": 0.5479,
+      "step": 1750
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 49.19129180908203,
+      "learning_rate": 3.26258637709773e-05,
+      "loss": 0.5215,
+      "step": 1760
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 89.65460968017578,
+      "learning_rate": 3.252714708785785e-05,
+      "loss": 0.7555,
+      "step": 1770
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.9293081760406494,
+      "learning_rate": 3.24284304047384e-05,
+      "loss": 0.3071,
+      "step": 1780
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 11.949310302734375,
+      "learning_rate": 3.232971372161895e-05,
+      "loss": 0.2182,
+      "step": 1790
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 15.446320533752441,
+      "learning_rate": 3.2230997038499506e-05,
+      "loss": 0.2696,
+      "step": 1800
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 5.7437567710876465,
+      "learning_rate": 3.213228035538006e-05,
+      "loss": 0.3771,
+      "step": 1810
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 209.53298950195312,
+      "learning_rate": 3.203356367226061e-05,
+      "loss": 0.3023,
+      "step": 1820
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 1.2472151517868042,
+      "learning_rate": 3.1934846989141166e-05,
+      "loss": 0.29,
+      "step": 1830
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 370.38800048828125,
+      "learning_rate": 3.1836130306021715e-05,
+      "loss": 0.3409,
+      "step": 1840
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 145.07717895507812,
+      "learning_rate": 3.173741362290227e-05,
+      "loss": 0.3839,
+      "step": 1850
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 48.441585540771484,
+      "learning_rate": 3.1638696939782826e-05,
+      "loss": 0.2765,
+      "step": 1860
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 8.114079475402832,
+      "learning_rate": 3.1539980256663375e-05,
+      "loss": 0.4797,
+      "step": 1870
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 2.0335161685943604,
+      "learning_rate": 3.144126357354393e-05,
+      "loss": 0.3283,
+      "step": 1880
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 571.5001831054688,
+      "learning_rate": 3.1342546890424486e-05,
+      "loss": 0.3749,
+      "step": 1890
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 26.4891414642334,
+      "learning_rate": 3.1243830207305035e-05,
+      "loss": 0.5855,
+      "step": 1900
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 373.5781555175781,
+      "learning_rate": 3.114511352418559e-05,
+      "loss": 0.5779,
+      "step": 1910
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 12.153056144714355,
+      "learning_rate": 3.1046396841066146e-05,
+      "loss": 0.5083,
+      "step": 1920
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 246.8365936279297,
+      "learning_rate": 3.0947680157946695e-05,
+      "loss": 0.2794,
+      "step": 1930
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 16.38204002380371,
+      "learning_rate": 3.084896347482725e-05,
+      "loss": 0.1689,
+      "step": 1940
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 86.90618896484375,
+      "learning_rate": 3.07502467917078e-05,
+      "loss": 0.124,
+      "step": 1950
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 13.767565727233887,
+      "learning_rate": 3.0651530108588355e-05,
+      "loss": 0.4286,
+      "step": 1960
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 25.554912567138672,
+      "learning_rate": 3.0552813425468904e-05,
+      "loss": 0.342,
+      "step": 1970
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 0.8774542212486267,
+      "learning_rate": 3.045409674234946e-05,
+      "loss": 0.2024,
+      "step": 1980
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 53.33576583862305,
+      "learning_rate": 3.0355380059230008e-05,
+      "loss": 0.3032,
+      "step": 1990
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 6.252757549285889,
+      "learning_rate": 3.0256663376110567e-05,
+      "loss": 0.3633,
+      "step": 2000
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 0.8560687899589539,
+      "learning_rate": 3.015794669299112e-05,
+      "loss": 0.2662,
+      "step": 2010
+    },
+    {
+      "epoch": 1.99,
+      "grad_norm": 5.560061454772949,
+      "learning_rate": 3.0059230009871668e-05,
+      "loss": 0.3057,
+      "step": 2020
+    },
+    {
+      "epoch": 2.0,
+      "eval_balanced accuracy": 0.9113459399332592,
+      "eval_f1": 0.9119394500117044,
+      "eval_loss": 0.29854172468185425,
+      "eval_precision": 0.913535516192521,
+      "eval_recall": 0.9113459399332592,
+      "eval_runtime": 5.6113,
+      "eval_samples_per_second": 160.392,
+      "eval_steps_per_second": 10.158,
+      "step": 2026
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 5065,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 500,
+  "total_flos": 96787312128000.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ce4159defbc8b7143d7a714e34b4537555801075b0d7f6b279e94c452fb5607c
+size 4411