End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +948 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: llama3
 base_model: meta-llama/Meta-Llama-3-8B
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: OH_DCFT_V3_wo_caseus_custom
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # OH_DCFT_V3_wo_caseus_custom
-This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.6464

 base_model: meta-llama/Meta-Llama-3-8B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: OH_DCFT_V3_wo_caseus_custom
 # OH_DCFT_V3_wo_caseus_custom
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the mlfoundations-dev/OH_DCFT_V3_wo_caseus_custom dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.6464

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 2.9982238010657194,
+    "eval_loss": 0.6463779211044312,
+    "eval_runtime": 225.177,
+    "eval_samples_per_second": 50.525,
+    "eval_steps_per_second": 0.395,
+    "total_flos": 2120178393415680.0,
+    "train_loss": 0.6161920559161459,
+    "train_runtime": 37913.0448,
+    "train_samples_per_second": 17.103,
+    "train_steps_per_second": 0.033
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 2.9982238010657194,
+    "eval_loss": 0.6463779211044312,
+    "eval_runtime": 225.177,
+    "eval_samples_per_second": 50.525,
+    "eval_steps_per_second": 0.395
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.9982238010657194,
+    "total_flos": 2120178393415680.0,
+    "train_loss": 0.6161920559161459,
+    "train_runtime": 37913.0448,
+    "train_samples_per_second": 17.103,
+    "train_steps_per_second": 0.033
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,948 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.9982238010657194,
+  "eval_steps": 500,
+  "global_step": 1266,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.023682652457075192,
+      "grad_norm": 13.189470730439847,
+      "learning_rate": 5e-06,
+      "loss": 0.9153,
+      "step": 10
+    },
+    {
+      "epoch": 0.047365304914150384,
+      "grad_norm": 16.705378973618984,
+      "learning_rate": 5e-06,
+      "loss": 0.8006,
+      "step": 20
+    },
+    {
+      "epoch": 0.07104795737122557,
+      "grad_norm": 1.5387333872556015,
+      "learning_rate": 5e-06,
+      "loss": 0.767,
+      "step": 30
+    },
+    {
+      "epoch": 0.09473060982830077,
+      "grad_norm": 1.5480037290759114,
+      "learning_rate": 5e-06,
+      "loss": 0.7381,
+      "step": 40
+    },
+    {
+      "epoch": 0.11841326228537596,
+      "grad_norm": 0.7592297796517318,
+      "learning_rate": 5e-06,
+      "loss": 0.7299,
+      "step": 50
+    },
+    {
+      "epoch": 0.14209591474245115,
+      "grad_norm": 0.7439300906768337,
+      "learning_rate": 5e-06,
+      "loss": 0.7102,
+      "step": 60
+    },
+    {
+      "epoch": 0.16577856719952636,
+      "grad_norm": 0.9644129830031992,
+      "learning_rate": 5e-06,
+      "loss": 0.7226,
+      "step": 70
+    },
+    {
+      "epoch": 0.18946121965660154,
+      "grad_norm": 1.083419801070231,
+      "learning_rate": 5e-06,
+      "loss": 0.7016,
+      "step": 80
+    },
+    {
+      "epoch": 0.21314387211367672,
+      "grad_norm": 0.6505643570843905,
+      "learning_rate": 5e-06,
+      "loss": 0.7007,
+      "step": 90
+    },
+    {
+      "epoch": 0.23682652457075193,
+      "grad_norm": 0.5502857011017064,
+      "learning_rate": 5e-06,
+      "loss": 0.6834,
+      "step": 100
+    },
+    {
+      "epoch": 0.2605091770278271,
+      "grad_norm": 0.5293957463932839,
+      "learning_rate": 5e-06,
+      "loss": 0.6843,
+      "step": 110
+    },
+    {
+      "epoch": 0.2841918294849023,
+      "grad_norm": 0.6629192311799427,
+      "learning_rate": 5e-06,
+      "loss": 0.6876,
+      "step": 120
+    },
+    {
+      "epoch": 0.30787448194197753,
+      "grad_norm": 0.7092941162121122,
+      "learning_rate": 5e-06,
+      "loss": 0.6789,
+      "step": 130
+    },
+    {
+      "epoch": 0.3315571343990527,
+      "grad_norm": 0.6884868718102479,
+      "learning_rate": 5e-06,
+      "loss": 0.6765,
+      "step": 140
+    },
+    {
+      "epoch": 0.3552397868561279,
+      "grad_norm": 0.5675134967361182,
+      "learning_rate": 5e-06,
+      "loss": 0.6781,
+      "step": 150
+    },
+    {
+      "epoch": 0.3789224393132031,
+      "grad_norm": 0.4872689604652464,
+      "learning_rate": 5e-06,
+      "loss": 0.6778,
+      "step": 160
+    },
+    {
+      "epoch": 0.40260509177027826,
+      "grad_norm": 0.5788634488292149,
+      "learning_rate": 5e-06,
+      "loss": 0.6723,
+      "step": 170
+    },
+    {
+      "epoch": 0.42628774422735344,
+      "grad_norm": 0.5446139045326359,
+      "learning_rate": 5e-06,
+      "loss": 0.6689,
+      "step": 180
+    },
+    {
+      "epoch": 0.4499703966844287,
+      "grad_norm": 0.7925779847081673,
+      "learning_rate": 5e-06,
+      "loss": 0.6687,
+      "step": 190
+    },
+    {
+      "epoch": 0.47365304914150386,
+      "grad_norm": 0.7557946920173058,
+      "learning_rate": 5e-06,
+      "loss": 0.6703,
+      "step": 200
+    },
+    {
+      "epoch": 0.49733570159857904,
+      "grad_norm": 0.6224256728374598,
+      "learning_rate": 5e-06,
+      "loss": 0.6674,
+      "step": 210
+    },
+    {
+      "epoch": 0.5210183540556542,
+      "grad_norm": 0.7659997074796134,
+      "learning_rate": 5e-06,
+      "loss": 0.6775,
+      "step": 220
+    },
+    {
+      "epoch": 0.5447010065127295,
+      "grad_norm": 0.6379584879302997,
+      "learning_rate": 5e-06,
+      "loss": 0.6606,
+      "step": 230
+    },
+    {
+      "epoch": 0.5683836589698046,
+      "grad_norm": 0.6095478409444377,
+      "learning_rate": 5e-06,
+      "loss": 0.6688,
+      "step": 240
+    },
+    {
+      "epoch": 0.5920663114268798,
+      "grad_norm": 0.5271893204607883,
+      "learning_rate": 5e-06,
+      "loss": 0.6674,
+      "step": 250
+    },
+    {
+      "epoch": 0.6157489638839551,
+      "grad_norm": 0.614025958346404,
+      "learning_rate": 5e-06,
+      "loss": 0.6582,
+      "step": 260
+    },
+    {
+      "epoch": 0.6394316163410302,
+      "grad_norm": 0.7285219514043463,
+      "learning_rate": 5e-06,
+      "loss": 0.6581,
+      "step": 270
+    },
+    {
+      "epoch": 0.6631142687981054,
+      "grad_norm": 0.7190322452172576,
+      "learning_rate": 5e-06,
+      "loss": 0.659,
+      "step": 280
+    },
+    {
+      "epoch": 0.6867969212551805,
+      "grad_norm": 0.6071224570407465,
+      "learning_rate": 5e-06,
+      "loss": 0.6649,
+      "step": 290
+    },
+    {
+      "epoch": 0.7104795737122558,
+      "grad_norm": 0.49216175819370883,
+      "learning_rate": 5e-06,
+      "loss": 0.6611,
+      "step": 300
+    },
+    {
+      "epoch": 0.7341622261693309,
+      "grad_norm": 0.5178492184765248,
+      "learning_rate": 5e-06,
+      "loss": 0.6608,
+      "step": 310
+    },
+    {
+      "epoch": 0.7578448786264061,
+      "grad_norm": 0.5029896497256847,
+      "learning_rate": 5e-06,
+      "loss": 0.6568,
+      "step": 320
+    },
+    {
+      "epoch": 0.7815275310834814,
+      "grad_norm": 0.7171925939489323,
+      "learning_rate": 5e-06,
+      "loss": 0.6559,
+      "step": 330
+    },
+    {
+      "epoch": 0.8052101835405565,
+      "grad_norm": 0.6508752197201968,
+      "learning_rate": 5e-06,
+      "loss": 0.6549,
+      "step": 340
+    },
+    {
+      "epoch": 0.8288928359976317,
+      "grad_norm": 0.5255741435374958,
+      "learning_rate": 5e-06,
+      "loss": 0.6608,
+      "step": 350
+    },
+    {
+      "epoch": 0.8525754884547069,
+      "grad_norm": 0.547087018654004,
+      "learning_rate": 5e-06,
+      "loss": 0.6557,
+      "step": 360
+    },
+    {
+      "epoch": 0.8762581409117821,
+      "grad_norm": 0.48618214031429924,
+      "learning_rate": 5e-06,
+      "loss": 0.6535,
+      "step": 370
+    },
+    {
+      "epoch": 0.8999407933688574,
+      "grad_norm": 0.5442894464037913,
+      "learning_rate": 5e-06,
+      "loss": 0.6556,
+      "step": 380
+    },
+    {
+      "epoch": 0.9236234458259325,
+      "grad_norm": 0.6652203100668711,
+      "learning_rate": 5e-06,
+      "loss": 0.6585,
+      "step": 390
+    },
+    {
+      "epoch": 0.9473060982830077,
+      "grad_norm": 0.5348960345212145,
+      "learning_rate": 5e-06,
+      "loss": 0.6429,
+      "step": 400
+    },
+    {
+      "epoch": 0.9709887507400828,
+      "grad_norm": 0.5696256474721889,
+      "learning_rate": 5e-06,
+      "loss": 0.6552,
+      "step": 410
+    },
+    {
+      "epoch": 0.9946714031971581,
+      "grad_norm": 0.6600709101064473,
+      "learning_rate": 5e-06,
+      "loss": 0.6424,
+      "step": 420
+    },
+    {
+      "epoch": 0.9994079336885732,
+      "eval_loss": 0.6530644297599792,
+      "eval_runtime": 226.0947,
+      "eval_samples_per_second": 50.32,
+      "eval_steps_per_second": 0.394,
+      "step": 422
+    },
+    {
+      "epoch": 1.0183540556542332,
+      "grad_norm": 0.7026612619443326,
+      "learning_rate": 5e-06,
+      "loss": 0.6135,
+      "step": 430
+    },
+    {
+      "epoch": 1.0420367081113084,
+      "grad_norm": 0.6043811485369807,
+      "learning_rate": 5e-06,
+      "loss": 0.609,
+      "step": 440
+    },
+    {
+      "epoch": 1.0657193605683837,
+      "grad_norm": 0.5930425944334362,
+      "learning_rate": 5e-06,
+      "loss": 0.6002,
+      "step": 450
+    },
+    {
+      "epoch": 1.089402013025459,
+      "grad_norm": 0.5902568338086035,
+      "learning_rate": 5e-06,
+      "loss": 0.6078,
+      "step": 460
+    },
+    {
+      "epoch": 1.1130846654825342,
+      "grad_norm": 0.49068395029218714,
+      "learning_rate": 5e-06,
+      "loss": 0.6075,
+      "step": 470
+    },
+    {
+      "epoch": 1.1367673179396092,
+      "grad_norm": 0.6080540264952039,
+      "learning_rate": 5e-06,
+      "loss": 0.5968,
+      "step": 480
+    },
+    {
+      "epoch": 1.1604499703966844,
+      "grad_norm": 0.631498780679762,
+      "learning_rate": 5e-06,
+      "loss": 0.6033,
+      "step": 490
+    },
+    {
+      "epoch": 1.1841326228537596,
+      "grad_norm": 0.6454741753351722,
+      "learning_rate": 5e-06,
+      "loss": 0.6113,
+      "step": 500
+    },
+    {
+      "epoch": 1.2078152753108349,
+      "grad_norm": 0.4832541377171757,
+      "learning_rate": 5e-06,
+      "loss": 0.608,
+      "step": 510
+    },
+    {
+      "epoch": 1.2314979277679101,
+      "grad_norm": 0.576018646795068,
+      "learning_rate": 5e-06,
+      "loss": 0.6153,
+      "step": 520
+    },
+    {
+      "epoch": 1.2551805802249851,
+      "grad_norm": 0.584703843866035,
+      "learning_rate": 5e-06,
+      "loss": 0.604,
+      "step": 530
+    },
+    {
+      "epoch": 1.2788632326820604,
+      "grad_norm": 0.5216537105860701,
+      "learning_rate": 5e-06,
+      "loss": 0.6091,
+      "step": 540
+    },
+    {
+      "epoch": 1.3025458851391356,
+      "grad_norm": 0.5533381633091111,
+      "learning_rate": 5e-06,
+      "loss": 0.6082,
+      "step": 550
+    },
+    {
+      "epoch": 1.3262285375962108,
+      "grad_norm": 0.6053695231773084,
+      "learning_rate": 5e-06,
+      "loss": 0.6085,
+      "step": 560
+    },
+    {
+      "epoch": 1.349911190053286,
+      "grad_norm": 0.49887104712071284,
+      "learning_rate": 5e-06,
+      "loss": 0.6052,
+      "step": 570
+    },
+    {
+      "epoch": 1.373593842510361,
+      "grad_norm": 0.7914662780125341,
+      "learning_rate": 5e-06,
+      "loss": 0.6051,
+      "step": 580
+    },
+    {
+      "epoch": 1.3972764949674363,
+      "grad_norm": 0.6735579998559209,
+      "learning_rate": 5e-06,
+      "loss": 0.6003,
+      "step": 590
+    },
+    {
+      "epoch": 1.4209591474245116,
+      "grad_norm": 0.6489668939560808,
+      "learning_rate": 5e-06,
+      "loss": 0.6023,
+      "step": 600
+    },
+    {
+      "epoch": 1.4446417998815868,
+      "grad_norm": 0.5114004583431452,
+      "learning_rate": 5e-06,
+      "loss": 0.6055,
+      "step": 610
+    },
+    {
+      "epoch": 1.468324452338662,
+      "grad_norm": 0.5435875766325163,
+      "learning_rate": 5e-06,
+      "loss": 0.6016,
+      "step": 620
+    },
+    {
+      "epoch": 1.492007104795737,
+      "grad_norm": 0.5201706548652517,
+      "learning_rate": 5e-06,
+      "loss": 0.5993,
+      "step": 630
+    },
+    {
+      "epoch": 1.5156897572528123,
+      "grad_norm": 0.5829006586321327,
+      "learning_rate": 5e-06,
+      "loss": 0.6064,
+      "step": 640
+    },
+    {
+      "epoch": 1.5393724097098875,
+      "grad_norm": 0.4715645741265029,
+      "learning_rate": 5e-06,
+      "loss": 0.6015,
+      "step": 650
+    },
+    {
+      "epoch": 1.5630550621669625,
+      "grad_norm": 0.5490786443724993,
+      "learning_rate": 5e-06,
+      "loss": 0.6065,
+      "step": 660
+    },
+    {
+      "epoch": 1.586737714624038,
+      "grad_norm": 0.5628955248722177,
+      "learning_rate": 5e-06,
+      "loss": 0.6028,
+      "step": 670
+    },
+    {
+      "epoch": 1.610420367081113,
+      "grad_norm": 0.5612429021808025,
+      "learning_rate": 5e-06,
+      "loss": 0.6054,
+      "step": 680
+    },
+    {
+      "epoch": 1.6341030195381883,
+      "grad_norm": 0.5381611425419351,
+      "learning_rate": 5e-06,
+      "loss": 0.6081,
+      "step": 690
+    },
+    {
+      "epoch": 1.6577856719952635,
+      "grad_norm": 0.6114484557774539,
+      "learning_rate": 5e-06,
+      "loss": 0.6013,
+      "step": 700
+    },
+    {
+      "epoch": 1.6814683244523385,
+      "grad_norm": 0.7308277575487941,
+      "learning_rate": 5e-06,
+      "loss": 0.6013,
+      "step": 710
+    },
+    {
+      "epoch": 1.705150976909414,
+      "grad_norm": 0.5543487895907269,
+      "learning_rate": 5e-06,
+      "loss": 0.6034,
+      "step": 720
+    },
+    {
+      "epoch": 1.728833629366489,
+      "grad_norm": 0.7004280943438956,
+      "learning_rate": 5e-06,
+      "loss": 0.6041,
+      "step": 730
+    },
+    {
+      "epoch": 1.7525162818235642,
+      "grad_norm": 0.5549471387533644,
+      "learning_rate": 5e-06,
+      "loss": 0.6036,
+      "step": 740
+    },
+    {
+      "epoch": 1.7761989342806395,
+      "grad_norm": 0.4806128760976835,
+      "learning_rate": 5e-06,
+      "loss": 0.6064,
+      "step": 750
+    },
+    {
+      "epoch": 1.7998815867377145,
+      "grad_norm": 0.5198713849166644,
+      "learning_rate": 5e-06,
+      "loss": 0.6013,
+      "step": 760
+    },
+    {
+      "epoch": 1.82356423919479,
+      "grad_norm": 0.4992510602270783,
+      "learning_rate": 5e-06,
+      "loss": 0.6019,
+      "step": 770
+    },
+    {
+      "epoch": 1.847246891651865,
+      "grad_norm": 0.5736035447222264,
+      "learning_rate": 5e-06,
+      "loss": 0.6027,
+      "step": 780
+    },
+    {
+      "epoch": 1.8709295441089402,
+      "grad_norm": 0.4838895289542149,
+      "learning_rate": 5e-06,
+      "loss": 0.6105,
+      "step": 790
+    },
+    {
+      "epoch": 1.8946121965660154,
+      "grad_norm": 0.5587547644562073,
+      "learning_rate": 5e-06,
+      "loss": 0.6027,
+      "step": 800
+    },
+    {
+      "epoch": 1.9182948490230904,
+      "grad_norm": 0.595674379478466,
+      "learning_rate": 5e-06,
+      "loss": 0.6015,
+      "step": 810
+    },
+    {
+      "epoch": 1.941977501480166,
+      "grad_norm": 0.578820036510254,
+      "learning_rate": 5e-06,
+      "loss": 0.6093,
+      "step": 820
+    },
+    {
+      "epoch": 1.965660153937241,
+      "grad_norm": 0.6023612558579958,
+      "learning_rate": 5e-06,
+      "loss": 0.6061,
+      "step": 830
+    },
+    {
+      "epoch": 1.9893428063943162,
+      "grad_norm": 0.5233076861556437,
+      "learning_rate": 5e-06,
+      "loss": 0.6031,
+      "step": 840
+    },
+    {
+      "epoch": 1.9988158673771461,
+      "eval_loss": 0.6433995366096497,
+      "eval_runtime": 226.6273,
+      "eval_samples_per_second": 50.201,
+      "eval_steps_per_second": 0.393,
+      "step": 844
+    },
+    {
+      "epoch": 2.0130254588513914,
+      "grad_norm": 0.6787143504485439,
+      "learning_rate": 5e-06,
+      "loss": 0.5759,
+      "step": 850
+    },
+    {
+      "epoch": 2.0367081113084664,
+      "grad_norm": 0.5958693467971309,
+      "learning_rate": 5e-06,
+      "loss": 0.5524,
+      "step": 860
+    },
+    {
+      "epoch": 2.060390763765542,
+      "grad_norm": 0.6686487959774522,
+      "learning_rate": 5e-06,
+      "loss": 0.5564,
+      "step": 870
+    },
+    {
+      "epoch": 2.084073416222617,
+      "grad_norm": 0.6334086790954905,
+      "learning_rate": 5e-06,
+      "loss": 0.558,
+      "step": 880
+    },
+    {
+      "epoch": 2.1077560686796923,
+      "grad_norm": 0.5726693373620011,
+      "learning_rate": 5e-06,
+      "loss": 0.5549,
+      "step": 890
+    },
+    {
+      "epoch": 2.1314387211367674,
+      "grad_norm": 0.7389222692453125,
+      "learning_rate": 5e-06,
+      "loss": 0.5601,
+      "step": 900
+    },
+    {
+      "epoch": 2.1551213735938424,
+      "grad_norm": 0.5596250429874627,
+      "learning_rate": 5e-06,
+      "loss": 0.5558,
+      "step": 910
+    },
+    {
+      "epoch": 2.178804026050918,
+      "grad_norm": 0.4980926648118017,
+      "learning_rate": 5e-06,
+      "loss": 0.554,
+      "step": 920
+    },
+    {
+      "epoch": 2.202486678507993,
+      "grad_norm": 0.5341661818817609,
+      "learning_rate": 5e-06,
+      "loss": 0.5575,
+      "step": 930
+    },
+    {
+      "epoch": 2.2261693309650683,
+      "grad_norm": 0.5872312499818088,
+      "learning_rate": 5e-06,
+      "loss": 0.5573,
+      "step": 940
+    },
+    {
+      "epoch": 2.2498519834221433,
+      "grad_norm": 0.5524808246263375,
+      "learning_rate": 5e-06,
+      "loss": 0.5597,
+      "step": 950
+    },
+    {
+      "epoch": 2.2735346358792183,
+      "grad_norm": 0.6665756455358031,
+      "learning_rate": 5e-06,
+      "loss": 0.5582,
+      "step": 960
+    },
+    {
+      "epoch": 2.297217288336294,
+      "grad_norm": 0.6380394178638625,
+      "learning_rate": 5e-06,
+      "loss": 0.5575,
+      "step": 970
+    },
+    {
+      "epoch": 2.320899940793369,
+      "grad_norm": 0.6401376648665413,
+      "learning_rate": 5e-06,
+      "loss": 0.5637,
+      "step": 980
+    },
+    {
+      "epoch": 2.3445825932504443,
+      "grad_norm": 0.5798343260055008,
+      "learning_rate": 5e-06,
+      "loss": 0.5679,
+      "step": 990
+    },
+    {
+      "epoch": 2.3682652457075193,
+      "grad_norm": 0.5491726722233458,
+      "learning_rate": 5e-06,
+      "loss": 0.5644,
+      "step": 1000
+    },
+    {
+      "epoch": 2.3919478981645943,
+      "grad_norm": 0.5417761950290281,
+      "learning_rate": 5e-06,
+      "loss": 0.5616,
+      "step": 1010
+    },
+    {
+      "epoch": 2.4156305506216698,
+      "grad_norm": 0.5292463549461751,
+      "learning_rate": 5e-06,
+      "loss": 0.5573,
+      "step": 1020
+    },
+    {
+      "epoch": 2.4393132030787448,
+      "grad_norm": 0.5252209565871296,
+      "learning_rate": 5e-06,
+      "loss": 0.5568,
+      "step": 1030
+    },
+    {
+      "epoch": 2.4629958555358202,
+      "grad_norm": 0.5545040747666857,
+      "learning_rate": 5e-06,
+      "loss": 0.5516,
+      "step": 1040
+    },
+    {
+      "epoch": 2.4866785079928952,
+      "grad_norm": 0.584792726087141,
+      "learning_rate": 5e-06,
+      "loss": 0.5558,
+      "step": 1050
+    },
+    {
+      "epoch": 2.5103611604499703,
+      "grad_norm": 0.5853523679293414,
+      "learning_rate": 5e-06,
+      "loss": 0.5623,
+      "step": 1060
+    },
+    {
+      "epoch": 2.5340438129070457,
+      "grad_norm": 0.6018679654899803,
+      "learning_rate": 5e-06,
+      "loss": 0.5621,
+      "step": 1070
+    },
+    {
+      "epoch": 2.5577264653641207,
+      "grad_norm": 0.5843281213248563,
+      "learning_rate": 5e-06,
+      "loss": 0.5678,
+      "step": 1080
+    },
+    {
+      "epoch": 2.581409117821196,
+      "grad_norm": 0.695068864198405,
+      "learning_rate": 5e-06,
+      "loss": 0.5677,
+      "step": 1090
+    },
+    {
+      "epoch": 2.605091770278271,
+      "grad_norm": 0.5553119611017783,
+      "learning_rate": 5e-06,
+      "loss": 0.5578,
+      "step": 1100
+    },
+    {
+      "epoch": 2.6287744227353462,
+      "grad_norm": 0.5986682054349503,
+      "learning_rate": 5e-06,
+      "loss": 0.5583,
+      "step": 1110
+    },
+    {
+      "epoch": 2.6524570751924217,
+      "grad_norm": 0.5547254396195579,
+      "learning_rate": 5e-06,
+      "loss": 0.56,
+      "step": 1120
+    },
+    {
+      "epoch": 2.6761397276494967,
+      "grad_norm": 0.6846167862474701,
+      "learning_rate": 5e-06,
+      "loss": 0.5706,
+      "step": 1130
+    },
+    {
+      "epoch": 2.699822380106572,
+      "grad_norm": 0.5104743423646483,
+      "learning_rate": 5e-06,
+      "loss": 0.5663,
+      "step": 1140
+    },
+    {
+      "epoch": 2.723505032563647,
+      "grad_norm": 0.5307641334546837,
+      "learning_rate": 5e-06,
+      "loss": 0.564,
+      "step": 1150
+    },
+    {
+      "epoch": 2.747187685020722,
+      "grad_norm": 0.5149548951573419,
+      "learning_rate": 5e-06,
+      "loss": 0.562,
+      "step": 1160
+    },
+    {
+      "epoch": 2.7708703374777977,
+      "grad_norm": 0.6003485422420077,
+      "learning_rate": 5e-06,
+      "loss": 0.5583,
+      "step": 1170
+    },
+    {
+      "epoch": 2.7945529899348727,
+      "grad_norm": 0.6041639812457358,
+      "learning_rate": 5e-06,
+      "loss": 0.5662,
+      "step": 1180
+    },
+    {
+      "epoch": 2.818235642391948,
+      "grad_norm": 0.681355395170715,
+      "learning_rate": 5e-06,
+      "loss": 0.5619,
+      "step": 1190
+    },
+    {
+      "epoch": 2.841918294849023,
+      "grad_norm": 0.5003052671879107,
+      "learning_rate": 5e-06,
+      "loss": 0.5591,
+      "step": 1200
+    },
+    {
+      "epoch": 2.865600947306098,
+      "grad_norm": 0.5184483144480979,
+      "learning_rate": 5e-06,
+      "loss": 0.5595,
+      "step": 1210
+    },
+    {
+      "epoch": 2.8892835997631736,
+      "grad_norm": 0.5244993691156036,
+      "learning_rate": 5e-06,
+      "loss": 0.5583,
+      "step": 1220
+    },
+    {
+      "epoch": 2.9129662522202486,
+      "grad_norm": 0.5251082495659288,
+      "learning_rate": 5e-06,
+      "loss": 0.5606,
+      "step": 1230
+    },
+    {
+      "epoch": 2.936648904677324,
+      "grad_norm": 0.5488470152501833,
+      "learning_rate": 5e-06,
+      "loss": 0.556,
+      "step": 1240
+    },
+    {
+      "epoch": 2.960331557134399,
+      "grad_norm": 0.5568845430294795,
+      "learning_rate": 5e-06,
+      "loss": 0.5596,
+      "step": 1250
+    },
+    {
+      "epoch": 2.984014209591474,
+      "grad_norm": 0.5225267576791486,
+      "learning_rate": 5e-06,
+      "loss": 0.562,
+      "step": 1260
+    },
+    {
+      "epoch": 2.9982238010657194,
+      "eval_loss": 0.6463779211044312,
+      "eval_runtime": 227.372,
+      "eval_samples_per_second": 50.037,
+      "eval_steps_per_second": 0.391,
+      "step": 1266
+    },
+    {
+      "epoch": 2.9982238010657194,
+      "step": 1266,
+      "total_flos": 2120178393415680.0,
+      "train_loss": 0.6161920559161459,
+      "train_runtime": 37913.0448,
+      "train_samples_per_second": 17.103,
+      "train_steps_per_second": 0.033
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1266,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2120178393415680.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed