diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100755--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7279 @@ +{ + "best_metric": 0.7214700193423598, + "best_model_checkpoint": "./experiment/train_0407_3b_db_use_init_10770/checkpoint-3392", + "epoch": 1130.581818181818, + "global_step": 3392, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.29, + "learning_rate": 0.0001, + "loss": 0.0524, + "step": 1 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001, + "loss": 0.0135, + "step": 3 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001, + "loss": 0.0271, + "step": 6 + }, + { + "epoch": 2.87, + "learning_rate": 0.0001, + "loss": 0.0032, + "step": 9 + }, + { + "epoch": 3.87, + "learning_rate": 0.0001, + "loss": 0.0041, + "step": 12 + }, + { + "epoch": 4.87, + "learning_rate": 0.0001, + "loss": 0.0117, + "step": 15 + }, + { + "epoch": 5.87, + "learning_rate": 0.0001, + "loss": 0.0021, + "step": 18 + }, + { + "epoch": 6.87, + "learning_rate": 0.0001, + "loss": 0.0015, + "step": 21 + }, + { + "epoch": 7.87, + "learning_rate": 0.0001, + "loss": 0.0012, + "step": 24 + }, + { + "epoch": 8.87, + "learning_rate": 0.0001, + "loss": 0.0014, + "step": 27 + }, + { + "epoch": 9.87, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 30 + }, + { + "epoch": 10.87, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 33 + }, + { + "epoch": 11.87, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 36 + }, + { + "epoch": 12.87, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 39 + }, + { + "epoch": 13.87, + "learning_rate": 0.0001, + "loss": 0.0027, + "step": 42 + }, + { + "epoch": 14.87, + "learning_rate": 0.0001, + "loss": 0.0085, + "step": 45 + }, + { + "epoch": 15.87, + "learning_rate": 0.0001, + "loss": 0.0017, + "step": 48 + }, + { + "epoch": 16.87, + "learning_rate": 0.0001, + "loss": 0.001, + "step": 51 + }, + { + "epoch": 17.87, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 54 + }, + { + "epoch": 18.87, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 57 + }, + { + "epoch": 19.87, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 60 + }, + { + "epoch": 20.87, + "learning_rate": 0.0001, + "loss": 0.001, + "step": 63 + }, + { + "epoch": 21.29, + "eval_exact_match": 0.6711798839458414, + "eval_exec": 0.7263056092843327, + "eval_loss": 0.31857267022132874, + "eval_runtime": 1456.357, + "eval_samples_per_second": 0.71, + "step": 64 + }, + { + "epoch": 21.87, + "learning_rate": 0.0001, + "loss": 0.0063, + "step": 66 + }, + { + "epoch": 22.87, + "learning_rate": 0.0001, + "loss": 0.0066, + "step": 69 + }, + { + "epoch": 23.87, + "learning_rate": 0.0001, + "loss": 0.0016, + "step": 72 + }, + { + "epoch": 24.87, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 75 + }, + { + "epoch": 25.87, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 78 + }, + { + "epoch": 26.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 81 + }, + { + "epoch": 27.87, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 84 + }, + { + "epoch": 28.87, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 87 + }, + { + "epoch": 29.87, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 90 + }, + { + "epoch": 30.87, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 93 + }, + { + "epoch": 31.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 96 + }, + { + "epoch": 32.87, + "learning_rate": 0.0001, + "loss": 0.001, + "step": 99 + }, + { + "epoch": 33.87, + "learning_rate": 0.0001, + "loss": 0.003, + "step": 102 + }, + { + "epoch": 34.87, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 105 + }, + { + "epoch": 35.87, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 108 + }, + { + "epoch": 36.87, + "learning_rate": 0.0001, + "loss": 0.001, + "step": 111 + }, + { + "epoch": 37.87, + "learning_rate": 0.0001, + "loss": 0.0012, + "step": 114 + }, + { + "epoch": 38.87, + "learning_rate": 0.0001, + "loss": 0.0062, + "step": 117 + }, + { + "epoch": 39.87, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 120 + }, + { + "epoch": 40.87, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 123 + }, + { + "epoch": 41.87, + "learning_rate": 0.0001, + "loss": 0.001, + "step": 126 + }, + { + "epoch": 42.58, + "eval_exact_match": 0.660541586073501, + "eval_exec": 0.7030947775628626, + "eval_loss": 0.3405461311340332, + "eval_runtime": 549.64, + "eval_samples_per_second": 1.881, + "step": 128 + }, + { + "epoch": 42.87, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 129 + }, + { + "epoch": 43.87, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 132 + }, + { + "epoch": 44.87, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 135 + }, + { + "epoch": 45.87, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 138 + }, + { + "epoch": 46.87, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 141 + }, + { + "epoch": 47.87, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 144 + }, + { + "epoch": 48.87, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 147 + }, + { + "epoch": 49.87, + "learning_rate": 0.0001, + "loss": 0.001, + "step": 150 + }, + { + "epoch": 50.87, + "learning_rate": 0.0001, + "loss": 0.0098, + "step": 153 + }, + { + "epoch": 51.87, + "learning_rate": 0.0001, + "loss": 0.001, + "step": 156 + }, + { + "epoch": 52.87, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 159 + }, + { + "epoch": 53.87, + "learning_rate": 0.0001, + "loss": 0.0043, + "step": 162 + }, + { + "epoch": 54.87, + "learning_rate": 0.0001, + "loss": 0.001, + "step": 165 + }, + { + "epoch": 55.87, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 168 + }, + { + "epoch": 56.87, + "learning_rate": 0.0001, + "loss": 0.0012, + "step": 171 + }, + { + "epoch": 57.87, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 174 + }, + { + "epoch": 58.87, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 177 + }, + { + "epoch": 59.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 180 + }, + { + "epoch": 60.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 183 + }, + { + "epoch": 61.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 186 + }, + { + "epoch": 62.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 189 + }, + { + "epoch": 63.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 192 + }, + { + "epoch": 63.87, + "eval_exact_match": 0.683752417794971, + "eval_exec": 0.7137330754352031, + "eval_loss": 0.36411476135253906, + "eval_runtime": 516.7586, + "eval_samples_per_second": 2.001, + "step": 192 + }, + { + "epoch": 64.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 195 + }, + { + "epoch": 65.87, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 198 + }, + { + "epoch": 66.87, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 201 + }, + { + "epoch": 67.87, + "learning_rate": 0.0001, + "loss": 0.0062, + "step": 204 + }, + { + "epoch": 68.87, + "learning_rate": 0.0001, + "loss": 0.0061, + "step": 207 + }, + { + "epoch": 69.87, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 210 + }, + { + "epoch": 70.87, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 213 + }, + { + "epoch": 71.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 216 + }, + { + "epoch": 72.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 219 + }, + { + "epoch": 73.87, + "learning_rate": 0.0001, + "loss": 0.0033, + "step": 222 + }, + { + "epoch": 74.87, + "learning_rate": 0.0001, + "loss": 0.0051, + "step": 225 + }, + { + "epoch": 75.87, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 228 + }, + { + "epoch": 76.87, + "learning_rate": 0.0001, + "loss": 0.0012, + "step": 231 + }, + { + "epoch": 77.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 234 + }, + { + "epoch": 78.87, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 237 + }, + { + "epoch": 79.87, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 240 + }, + { + "epoch": 80.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 243 + }, + { + "epoch": 81.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 246 + }, + { + "epoch": 82.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 249 + }, + { + "epoch": 83.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 252 + }, + { + "epoch": 84.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 255 + }, + { + "epoch": 85.29, + "eval_exact_match": 0.6876208897485493, + "eval_exec": 0.7156673114119922, + "eval_loss": 0.3546766936779022, + "eval_runtime": 519.3296, + "eval_samples_per_second": 1.991, + "step": 256 + }, + { + "epoch": 85.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 258 + }, + { + "epoch": 86.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 261 + }, + { + "epoch": 87.87, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 264 + }, + { + "epoch": 88.87, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 267 + }, + { + "epoch": 89.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 270 + }, + { + "epoch": 90.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 273 + }, + { + "epoch": 91.87, + "learning_rate": 0.0001, + "loss": 0.001, + "step": 276 + }, + { + "epoch": 92.87, + "learning_rate": 0.0001, + "loss": 0.0126, + "step": 279 + }, + { + "epoch": 93.87, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 282 + }, + { + "epoch": 94.87, + "learning_rate": 0.0001, + "loss": 0.0015, + "step": 285 + }, + { + "epoch": 95.87, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 288 + }, + { + "epoch": 96.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 291 + }, + { + "epoch": 97.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 294 + }, + { + "epoch": 98.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 297 + }, + { + "epoch": 99.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 300 + }, + { + "epoch": 100.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 303 + }, + { + "epoch": 101.87, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 306 + }, + { + "epoch": 102.87, + "learning_rate": 0.0001, + "loss": 0.0119, + "step": 309 + }, + { + "epoch": 103.87, + "learning_rate": 0.0001, + "loss": 0.0156, + "step": 312 + }, + { + "epoch": 104.87, + "learning_rate": 0.0001, + "loss": 0.0016, + "step": 315 + }, + { + "epoch": 105.87, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 318 + }, + { + "epoch": 106.58, + "eval_exact_match": 0.6769825918762089, + "eval_exec": 0.718568665377176, + "eval_loss": 0.30316162109375, + "eval_runtime": 480.1138, + "eval_samples_per_second": 2.154, + "step": 320 + }, + { + "epoch": 106.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 321 + }, + { + "epoch": 107.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 324 + }, + { + "epoch": 108.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 327 + }, + { + "epoch": 109.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 330 + }, + { + "epoch": 110.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 333 + }, + { + "epoch": 111.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 336 + }, + { + "epoch": 112.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 339 + }, + { + "epoch": 113.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 342 + }, + { + "epoch": 114.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 345 + }, + { + "epoch": 115.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 348 + }, + { + "epoch": 116.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 351 + }, + { + "epoch": 117.87, + "learning_rate": 0.0001, + "loss": 0.002, + "step": 354 + }, + { + "epoch": 118.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 357 + }, + { + "epoch": 119.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 360 + }, + { + "epoch": 120.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 363 + }, + { + "epoch": 121.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 366 + }, + { + "epoch": 122.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 369 + }, + { + "epoch": 123.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 372 + }, + { + "epoch": 124.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 375 + }, + { + "epoch": 125.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 378 + }, + { + "epoch": 126.87, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 381 + }, + { + "epoch": 127.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 384 + }, + { + "epoch": 127.87, + "eval_exact_match": 0.6866537717601547, + "eval_exec": 0.7272727272727273, + "eval_loss": 0.3592199981212616, + "eval_runtime": 499.5116, + "eval_samples_per_second": 2.07, + "step": 384 + }, + { + "epoch": 128.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 387 + }, + { + "epoch": 129.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 390 + }, + { + "epoch": 130.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 393 + }, + { + "epoch": 131.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 396 + }, + { + "epoch": 132.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 399 + }, + { + "epoch": 133.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 402 + }, + { + "epoch": 134.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 405 + }, + { + "epoch": 135.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 408 + }, + { + "epoch": 136.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 411 + }, + { + "epoch": 137.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 414 + }, + { + "epoch": 138.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 417 + }, + { + "epoch": 139.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 420 + }, + { + "epoch": 140.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 423 + }, + { + "epoch": 141.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 426 + }, + { + "epoch": 142.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 429 + }, + { + "epoch": 143.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 432 + }, + { + "epoch": 144.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 435 + }, + { + "epoch": 145.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 438 + }, + { + "epoch": 146.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 441 + }, + { + "epoch": 147.87, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 444 + }, + { + "epoch": 148.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 447 + }, + { + "epoch": 149.29, + "eval_exact_match": 0.683752417794971, + "eval_exec": 0.7263056092843327, + "eval_loss": 0.37971413135528564, + "eval_runtime": 539.1731, + "eval_samples_per_second": 1.918, + "step": 448 + }, + { + "epoch": 149.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 450 + }, + { + "epoch": 150.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 453 + }, + { + "epoch": 151.87, + "learning_rate": 0.0001, + "loss": 0.0079, + "step": 456 + }, + { + "epoch": 152.87, + "learning_rate": 0.0001, + "loss": 0.0039, + "step": 459 + }, + { + "epoch": 153.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 462 + }, + { + "epoch": 154.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 465 + }, + { + "epoch": 155.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 468 + }, + { + "epoch": 156.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 471 + }, + { + "epoch": 157.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 474 + }, + { + "epoch": 158.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 477 + }, + { + "epoch": 159.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 480 + }, + { + "epoch": 160.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 483 + }, + { + "epoch": 161.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 486 + }, + { + "epoch": 162.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 489 + }, + { + "epoch": 163.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 492 + }, + { + "epoch": 164.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 495 + }, + { + "epoch": 165.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 498 + }, + { + "epoch": 166.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 501 + }, + { + "epoch": 167.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 504 + }, + { + "epoch": 168.87, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 507 + }, + { + "epoch": 169.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 510 + }, + { + "epoch": 170.58, + "eval_exact_match": 0.690522243713733, + "eval_exec": 0.7263056092843327, + "eval_loss": 0.3713047504425049, + "eval_runtime": 487.523, + "eval_samples_per_second": 2.121, + "step": 512 + }, + { + "epoch": 170.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 513 + }, + { + "epoch": 171.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 516 + }, + { + "epoch": 172.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 519 + }, + { + "epoch": 173.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 522 + }, + { + "epoch": 174.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 525 + }, + { + "epoch": 175.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 528 + }, + { + "epoch": 176.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 531 + }, + { + "epoch": 177.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 534 + }, + { + "epoch": 178.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 537 + }, + { + "epoch": 179.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 540 + }, + { + "epoch": 180.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 543 + }, + { + "epoch": 181.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 546 + }, + { + "epoch": 182.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 549 + }, + { + "epoch": 183.87, + "learning_rate": 0.0001, + "loss": 0.0021, + "step": 552 + }, + { + "epoch": 184.87, + "learning_rate": 0.0001, + "loss": 0.0104, + "step": 555 + }, + { + "epoch": 185.87, + "learning_rate": 0.0001, + "loss": 0.0022, + "step": 558 + }, + { + "epoch": 186.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 561 + }, + { + "epoch": 187.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 564 + }, + { + "epoch": 188.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 567 + }, + { + "epoch": 189.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 570 + }, + { + "epoch": 190.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 573 + }, + { + "epoch": 191.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 576 + }, + { + "epoch": 191.87, + "eval_exact_match": 0.6982591876208898, + "eval_exec": 0.7398452611218569, + "eval_loss": 0.35731402039527893, + "eval_runtime": 515.4072, + "eval_samples_per_second": 2.006, + "step": 576 + }, + { + "epoch": 192.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 579 + }, + { + "epoch": 193.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 582 + }, + { + "epoch": 194.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 585 + }, + { + "epoch": 195.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 588 + }, + { + "epoch": 196.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 591 + }, + { + "epoch": 197.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 594 + }, + { + "epoch": 198.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 597 + }, + { + "epoch": 199.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 600 + }, + { + "epoch": 200.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 603 + }, + { + "epoch": 201.87, + "learning_rate": 0.0001, + "loss": 0.0015, + "step": 606 + }, + { + "epoch": 202.87, + "learning_rate": 0.0001, + "loss": 0.0022, + "step": 609 + }, + { + "epoch": 203.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 612 + }, + { + "epoch": 204.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 615 + }, + { + "epoch": 205.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 618 + }, + { + "epoch": 206.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 621 + }, + { + "epoch": 207.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 624 + }, + { + "epoch": 208.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 627 + }, + { + "epoch": 209.87, + "learning_rate": 0.0001, + "loss": 0.0061, + "step": 630 + }, + { + "epoch": 210.87, + "learning_rate": 0.0001, + "loss": 0.0214, + "step": 633 + }, + { + "epoch": 211.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 636 + }, + { + "epoch": 212.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 639 + }, + { + "epoch": 213.29, + "eval_exact_match": 0.7059961315280464, + "eval_exec": 0.7350096711798839, + "eval_loss": 0.3407991826534271, + "eval_runtime": 507.7063, + "eval_samples_per_second": 2.037, + "step": 640 + }, + { + "epoch": 213.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 642 + }, + { + "epoch": 214.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 645 + }, + { + "epoch": 215.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 648 + }, + { + "epoch": 216.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 651 + }, + { + "epoch": 217.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 654 + }, + { + "epoch": 218.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 657 + }, + { + "epoch": 219.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 660 + }, + { + "epoch": 220.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 663 + }, + { + "epoch": 221.87, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 666 + }, + { + "epoch": 222.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 669 + }, + { + "epoch": 223.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 672 + }, + { + "epoch": 224.87, + "learning_rate": 0.0001, + "loss": 0.0018, + "step": 675 + }, + { + "epoch": 225.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 678 + }, + { + "epoch": 226.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 681 + }, + { + "epoch": 227.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 684 + }, + { + "epoch": 228.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 687 + }, + { + "epoch": 229.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 690 + }, + { + "epoch": 230.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 693 + }, + { + "epoch": 231.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 696 + }, + { + "epoch": 232.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 699 + }, + { + "epoch": 233.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 702 + }, + { + "epoch": 234.58, + "eval_exact_match": 0.6992263056092843, + "eval_exec": 0.7272727272727273, + "eval_loss": 0.3767277002334595, + "eval_runtime": 545.434, + "eval_samples_per_second": 1.896, + "step": 704 + }, + { + "epoch": 234.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 705 + }, + { + "epoch": 235.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 708 + }, + { + "epoch": 236.87, + "learning_rate": 0.0001, + "loss": 0.0116, + "step": 711 + }, + { + "epoch": 237.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 714 + }, + { + "epoch": 238.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 717 + }, + { + "epoch": 239.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 720 + }, + { + "epoch": 240.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 723 + }, + { + "epoch": 241.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 726 + }, + { + "epoch": 242.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 729 + }, + { + "epoch": 243.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 732 + }, + { + "epoch": 244.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 735 + }, + { + "epoch": 245.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 738 + }, + { + "epoch": 246.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 741 + }, + { + "epoch": 247.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 744 + }, + { + "epoch": 248.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 747 + }, + { + "epoch": 249.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 750 + }, + { + "epoch": 250.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 753 + }, + { + "epoch": 251.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 756 + }, + { + "epoch": 252.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 759 + }, + { + "epoch": 253.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 762 + }, + { + "epoch": 254.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 765 + }, + { + "epoch": 255.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 768 + }, + { + "epoch": 255.87, + "eval_exact_match": 0.6943907156673114, + "eval_exec": 0.7282398452611218, + "eval_loss": 0.3885338008403778, + "eval_runtime": 546.3768, + "eval_samples_per_second": 1.892, + "step": 768 + }, + { + "epoch": 256.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 771 + }, + { + "epoch": 257.87, + "learning_rate": 0.0001, + "loss": 0.0059, + "step": 774 + }, + { + "epoch": 258.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 777 + }, + { + "epoch": 259.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 780 + }, + { + "epoch": 260.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 783 + }, + { + "epoch": 261.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 786 + }, + { + "epoch": 262.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 789 + }, + { + "epoch": 263.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 792 + }, + { + "epoch": 264.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 795 + }, + { + "epoch": 265.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 798 + }, + { + "epoch": 266.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 801 + }, + { + "epoch": 267.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 804 + }, + { + "epoch": 268.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 807 + }, + { + "epoch": 269.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 810 + }, + { + "epoch": 270.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 813 + }, + { + "epoch": 271.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 816 + }, + { + "epoch": 272.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 819 + }, + { + "epoch": 273.87, + "learning_rate": 0.0001, + "loss": 0.0013, + "step": 822 + }, + { + "epoch": 274.87, + "learning_rate": 0.0001, + "loss": 0.0059, + "step": 825 + }, + { + "epoch": 275.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 828 + }, + { + "epoch": 276.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 831 + }, + { + "epoch": 277.29, + "eval_exact_match": 0.695357833655706, + "eval_exec": 0.7321083172147002, + "eval_loss": 0.35210856795310974, + "eval_runtime": 608.3162, + "eval_samples_per_second": 1.7, + "step": 832 + }, + { + "epoch": 277.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 834 + }, + { + "epoch": 278.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 837 + }, + { + "epoch": 279.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 840 + }, + { + "epoch": 280.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 843 + }, + { + "epoch": 281.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 846 + }, + { + "epoch": 282.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 849 + }, + { + "epoch": 283.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 852 + }, + { + "epoch": 284.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 855 + }, + { + "epoch": 285.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 858 + }, + { + "epoch": 286.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 861 + }, + { + "epoch": 287.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 864 + }, + { + "epoch": 288.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 867 + }, + { + "epoch": 289.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 870 + }, + { + "epoch": 290.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 873 + }, + { + "epoch": 291.87, + "learning_rate": 0.0001, + "loss": 0.0019, + "step": 876 + }, + { + "epoch": 292.87, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 879 + }, + { + "epoch": 293.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 882 + }, + { + "epoch": 294.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 885 + }, + { + "epoch": 295.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 888 + }, + { + "epoch": 296.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 891 + }, + { + "epoch": 297.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 894 + }, + { + "epoch": 298.58, + "eval_exact_match": 0.688588007736944, + "eval_exec": 0.7214700193423598, + "eval_loss": 0.3566940426826477, + "eval_runtime": 584.9721, + "eval_samples_per_second": 1.768, + "step": 896 + }, + { + "epoch": 298.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 897 + }, + { + "epoch": 299.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 900 + }, + { + "epoch": 300.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 903 + }, + { + "epoch": 301.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 906 + }, + { + "epoch": 302.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 909 + }, + { + "epoch": 303.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 912 + }, + { + "epoch": 304.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 915 + }, + { + "epoch": 305.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 918 + }, + { + "epoch": 306.87, + "learning_rate": 0.0001, + "loss": 0.001, + "step": 921 + }, + { + "epoch": 307.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 924 + }, + { + "epoch": 308.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 927 + }, + { + "epoch": 309.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 930 + }, + { + "epoch": 310.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 933 + }, + { + "epoch": 311.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 936 + }, + { + "epoch": 312.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 939 + }, + { + "epoch": 313.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 942 + }, + { + "epoch": 314.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 945 + }, + { + "epoch": 315.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 948 + }, + { + "epoch": 316.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 951 + }, + { + "epoch": 317.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 954 + }, + { + "epoch": 318.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 957 + }, + { + "epoch": 319.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 960 + }, + { + "epoch": 319.87, + "eval_exact_match": 0.6963249516441006, + "eval_exec": 0.730174081237911, + "eval_loss": 0.37420669198036194, + "eval_runtime": 518.4361, + "eval_samples_per_second": 1.994, + "step": 960 + }, + { + "epoch": 320.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 963 + }, + { + "epoch": 321.87, + "learning_rate": 0.0001, + "loss": 0.0024, + "step": 966 + }, + { + "epoch": 322.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 969 + }, + { + "epoch": 323.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 972 + }, + { + "epoch": 324.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 975 + }, + { + "epoch": 325.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 978 + }, + { + "epoch": 326.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 981 + }, + { + "epoch": 327.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 984 + }, + { + "epoch": 328.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 987 + }, + { + "epoch": 329.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 990 + }, + { + "epoch": 330.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 993 + }, + { + "epoch": 331.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 996 + }, + { + "epoch": 332.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 999 + }, + { + "epoch": 333.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1002 + }, + { + "epoch": 334.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1005 + }, + { + "epoch": 335.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1008 + }, + { + "epoch": 336.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1011 + }, + { + "epoch": 337.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1014 + }, + { + "epoch": 338.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1017 + }, + { + "epoch": 339.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1020 + }, + { + "epoch": 340.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1023 + }, + { + "epoch": 341.29, + "eval_exact_match": 0.6924564796905223, + "eval_exec": 0.7359767891682786, + "eval_loss": 0.39109551906585693, + "eval_runtime": 1517.7959, + "eval_samples_per_second": 0.681, + "step": 1024 + }, + { + "epoch": 341.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1026 + }, + { + "epoch": 342.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1029 + }, + { + "epoch": 343.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1032 + }, + { + "epoch": 344.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1035 + }, + { + "epoch": 345.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1038 + }, + { + "epoch": 346.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1041 + }, + { + "epoch": 347.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1044 + }, + { + "epoch": 348.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1047 + }, + { + "epoch": 349.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1050 + }, + { + "epoch": 350.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1053 + }, + { + "epoch": 351.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1056 + }, + { + "epoch": 352.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1059 + }, + { + "epoch": 353.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 1062 + }, + { + "epoch": 354.87, + "learning_rate": 0.0001, + "loss": 0.0034, + "step": 1065 + }, + { + "epoch": 355.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1068 + }, + { + "epoch": 356.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1071 + }, + { + "epoch": 357.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1074 + }, + { + "epoch": 358.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1077 + }, + { + "epoch": 359.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1080 + }, + { + "epoch": 360.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1083 + }, + { + "epoch": 361.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1086 + }, + { + "epoch": 362.58, + "eval_exact_match": 0.6760154738878144, + "eval_exec": 0.7156673114119922, + "eval_loss": 0.37218964099884033, + "eval_runtime": 526.428, + "eval_samples_per_second": 1.964, + "step": 1088 + }, + { + "epoch": 362.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1089 + }, + { + "epoch": 363.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1092 + }, + { + "epoch": 364.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1095 + }, + { + "epoch": 365.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1098 + }, + { + "epoch": 366.87, + "learning_rate": 0.0001, + "loss": 0.0019, + "step": 1101 + }, + { + "epoch": 367.87, + "learning_rate": 0.0001, + "loss": 0.01, + "step": 1104 + }, + { + "epoch": 368.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1107 + }, + { + "epoch": 369.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1110 + }, + { + "epoch": 370.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1113 + }, + { + "epoch": 371.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1116 + }, + { + "epoch": 372.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1119 + }, + { + "epoch": 373.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1122 + }, + { + "epoch": 374.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1125 + }, + { + "epoch": 375.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1128 + }, + { + "epoch": 376.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1131 + }, + { + "epoch": 377.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1134 + }, + { + "epoch": 378.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1137 + }, + { + "epoch": 379.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1140 + }, + { + "epoch": 380.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1143 + }, + { + "epoch": 381.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1146 + }, + { + "epoch": 382.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1149 + }, + { + "epoch": 383.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1152 + }, + { + "epoch": 383.87, + "eval_exact_match": 0.6934235976789168, + "eval_exec": 0.7408123791102514, + "eval_loss": 0.3976551294326782, + "eval_runtime": 484.9481, + "eval_samples_per_second": 2.132, + "step": 1152 + }, + { + "epoch": 384.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1155 + }, + { + "epoch": 385.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1158 + }, + { + "epoch": 386.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1161 + }, + { + "epoch": 387.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1164 + }, + { + "epoch": 388.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1167 + }, + { + "epoch": 389.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1170 + }, + { + "epoch": 390.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 1173 + }, + { + "epoch": 391.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1176 + }, + { + "epoch": 392.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1179 + }, + { + "epoch": 393.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1182 + }, + { + "epoch": 394.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1185 + }, + { + "epoch": 395.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1188 + }, + { + "epoch": 396.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1191 + }, + { + "epoch": 397.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1194 + }, + { + "epoch": 398.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1197 + }, + { + "epoch": 399.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1200 + }, + { + "epoch": 400.87, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 1203 + }, + { + "epoch": 401.87, + "learning_rate": 0.0001, + "loss": 0.0043, + "step": 1206 + }, + { + "epoch": 402.87, + "learning_rate": 0.0001, + "loss": 0.0078, + "step": 1209 + }, + { + "epoch": 403.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1212 + }, + { + "epoch": 404.87, + "learning_rate": 0.0001, + "loss": 0.0034, + "step": 1215 + }, + { + "epoch": 405.29, + "eval_exact_match": 0.6779497098646035, + "eval_exec": 0.7263056092843327, + "eval_loss": 0.32186877727508545, + "eval_runtime": 460.174, + "eval_samples_per_second": 2.247, + "step": 1216 + }, + { + "epoch": 405.87, + "learning_rate": 0.0001, + "loss": 0.001, + "step": 1218 + }, + { + "epoch": 406.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1221 + }, + { + "epoch": 407.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1224 + }, + { + "epoch": 408.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1227 + }, + { + "epoch": 409.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1230 + }, + { + "epoch": 410.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1233 + }, + { + "epoch": 411.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1236 + }, + { + "epoch": 412.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1239 + }, + { + "epoch": 413.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1242 + }, + { + "epoch": 414.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1245 + }, + { + "epoch": 415.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1248 + }, + { + "epoch": 416.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1251 + }, + { + "epoch": 417.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1254 + }, + { + "epoch": 418.87, + "learning_rate": 0.0001, + "loss": 0.0015, + "step": 1257 + }, + { + "epoch": 419.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1260 + }, + { + "epoch": 420.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1263 + }, + { + "epoch": 421.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1266 + }, + { + "epoch": 422.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1269 + }, + { + "epoch": 423.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1272 + }, + { + "epoch": 424.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1275 + }, + { + "epoch": 425.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1278 + }, + { + "epoch": 426.58, + "eval_exact_match": 0.6895551257253385, + "eval_exec": 0.730174081237911, + "eval_loss": 0.37865036725997925, + "eval_runtime": 491.4423, + "eval_samples_per_second": 2.104, + "step": 1280 + }, + { + "epoch": 426.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1281 + }, + { + "epoch": 427.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1284 + }, + { + "epoch": 428.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1287 + }, + { + "epoch": 429.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1290 + }, + { + "epoch": 430.87, + "learning_rate": 0.0001, + "loss": 0.0025, + "step": 1293 + }, + { + "epoch": 431.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1296 + }, + { + "epoch": 432.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1299 + }, + { + "epoch": 433.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1302 + }, + { + "epoch": 434.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1305 + }, + { + "epoch": 435.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1308 + }, + { + "epoch": 436.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1311 + }, + { + "epoch": 437.87, + "learning_rate": 0.0001, + "loss": 0.005, + "step": 1314 + }, + { + "epoch": 438.87, + "learning_rate": 0.0001, + "loss": 0.0035, + "step": 1317 + }, + { + "epoch": 439.87, + "learning_rate": 0.0001, + "loss": 0.0016, + "step": 1320 + }, + { + "epoch": 440.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1323 + }, + { + "epoch": 441.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1326 + }, + { + "epoch": 442.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1329 + }, + { + "epoch": 443.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1332 + }, + { + "epoch": 444.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1335 + }, + { + "epoch": 445.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 1338 + }, + { + "epoch": 446.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1341 + }, + { + "epoch": 447.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1344 + }, + { + "epoch": 447.87, + "eval_exact_match": 0.6934235976789168, + "eval_exec": 0.7282398452611218, + "eval_loss": 0.35363292694091797, + "eval_runtime": 532.2521, + "eval_samples_per_second": 1.943, + "step": 1344 + }, + { + "epoch": 448.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1347 + }, + { + "epoch": 449.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1350 + }, + { + "epoch": 450.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1353 + }, + { + "epoch": 451.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1356 + }, + { + "epoch": 452.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1359 + }, + { + "epoch": 453.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1362 + }, + { + "epoch": 454.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1365 + }, + { + "epoch": 455.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1368 + }, + { + "epoch": 456.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1371 + }, + { + "epoch": 457.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1374 + }, + { + "epoch": 458.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1377 + }, + { + "epoch": 459.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1380 + }, + { + "epoch": 460.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1383 + }, + { + "epoch": 461.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1386 + }, + { + "epoch": 462.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1389 + }, + { + "epoch": 463.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1392 + }, + { + "epoch": 464.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1395 + }, + { + "epoch": 465.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1398 + }, + { + "epoch": 466.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1401 + }, + { + "epoch": 467.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1404 + }, + { + "epoch": 468.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1407 + }, + { + "epoch": 469.29, + "eval_exact_match": 0.6895551257253385, + "eval_exec": 0.7263056092843327, + "eval_loss": 0.3940747082233429, + "eval_runtime": 480.6861, + "eval_samples_per_second": 2.151, + "step": 1408 + }, + { + "epoch": 469.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1410 + }, + { + "epoch": 470.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1413 + }, + { + "epoch": 471.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1416 + }, + { + "epoch": 472.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1419 + }, + { + "epoch": 473.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1422 + }, + { + "epoch": 474.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1425 + }, + { + "epoch": 475.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1428 + }, + { + "epoch": 476.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1431 + }, + { + "epoch": 477.87, + "learning_rate": 0.0001, + "loss": 0.0031, + "step": 1434 + }, + { + "epoch": 478.87, + "learning_rate": 0.0001, + "loss": 0.0014, + "step": 1437 + }, + { + "epoch": 479.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1440 + }, + { + "epoch": 480.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1443 + }, + { + "epoch": 481.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1446 + }, + { + "epoch": 482.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1449 + }, + { + "epoch": 483.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1452 + }, + { + "epoch": 484.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1455 + }, + { + "epoch": 485.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1458 + }, + { + "epoch": 486.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1461 + }, + { + "epoch": 487.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1464 + }, + { + "epoch": 488.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1467 + }, + { + "epoch": 489.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1470 + }, + { + "epoch": 490.58, + "eval_exact_match": 0.6895551257253385, + "eval_exec": 0.723404255319149, + "eval_loss": 0.39211201667785645, + "eval_runtime": 483.703, + "eval_samples_per_second": 2.138, + "step": 1472 + }, + { + "epoch": 490.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1473 + }, + { + "epoch": 491.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1476 + }, + { + "epoch": 492.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1479 + }, + { + "epoch": 493.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1482 + }, + { + "epoch": 494.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1485 + }, + { + "epoch": 495.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1488 + }, + { + "epoch": 496.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1491 + }, + { + "epoch": 497.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1494 + }, + { + "epoch": 498.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1497 + }, + { + "epoch": 499.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1500 + }, + { + "epoch": 500.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1503 + }, + { + "epoch": 501.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1506 + }, + { + "epoch": 502.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1509 + }, + { + "epoch": 503.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1512 + }, + { + "epoch": 504.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1515 + }, + { + "epoch": 505.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1518 + }, + { + "epoch": 506.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1521 + }, + { + "epoch": 507.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1524 + }, + { + "epoch": 508.87, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 1527 + }, + { + "epoch": 509.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1530 + }, + { + "epoch": 510.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1533 + }, + { + "epoch": 511.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1536 + }, + { + "epoch": 511.87, + "eval_exact_match": 0.6972920696324951, + "eval_exec": 0.7330754352030948, + "eval_loss": 0.38117140531539917, + "eval_runtime": 463.195, + "eval_samples_per_second": 2.232, + "step": 1536 + }, + { + "epoch": 512.87, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 1539 + }, + { + "epoch": 513.87, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 1542 + }, + { + "epoch": 514.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1545 + }, + { + "epoch": 515.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1548 + }, + { + "epoch": 516.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1551 + }, + { + "epoch": 517.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1554 + }, + { + "epoch": 518.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1557 + }, + { + "epoch": 519.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1560 + }, + { + "epoch": 520.87, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 1563 + }, + { + "epoch": 521.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1566 + }, + { + "epoch": 522.87, + "learning_rate": 0.0001, + "loss": 0.0023, + "step": 1569 + }, + { + "epoch": 523.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1572 + }, + { + "epoch": 524.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1575 + }, + { + "epoch": 525.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1578 + }, + { + "epoch": 526.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1581 + }, + { + "epoch": 527.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1584 + }, + { + "epoch": 528.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1587 + }, + { + "epoch": 529.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1590 + }, + { + "epoch": 530.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1593 + }, + { + "epoch": 531.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1596 + }, + { + "epoch": 532.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1599 + }, + { + "epoch": 533.29, + "eval_exact_match": 0.7021276595744681, + "eval_exec": 0.7369439071566731, + "eval_loss": 0.3867366313934326, + "eval_runtime": 480.8888, + "eval_samples_per_second": 2.15, + "step": 1600 + }, + { + "epoch": 533.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1602 + }, + { + "epoch": 534.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1605 + }, + { + "epoch": 535.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1608 + }, + { + "epoch": 536.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1611 + }, + { + "epoch": 537.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1614 + }, + { + "epoch": 538.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1617 + }, + { + "epoch": 539.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1620 + }, + { + "epoch": 540.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1623 + }, + { + "epoch": 541.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1626 + }, + { + "epoch": 542.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1629 + }, + { + "epoch": 543.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1632 + }, + { + "epoch": 544.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1635 + }, + { + "epoch": 545.87, + "learning_rate": 0.0001, + "loss": 0.0118, + "step": 1638 + }, + { + "epoch": 546.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1641 + }, + { + "epoch": 547.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1644 + }, + { + "epoch": 548.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 1647 + }, + { + "epoch": 549.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1650 + }, + { + "epoch": 550.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1653 + }, + { + "epoch": 551.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1656 + }, + { + "epoch": 552.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1659 + }, + { + "epoch": 553.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1662 + }, + { + "epoch": 554.58, + "eval_exact_match": 0.695357833655706, + "eval_exec": 0.7388781431334622, + "eval_loss": 0.36511388421058655, + "eval_runtime": 507.0372, + "eval_samples_per_second": 2.039, + "step": 1664 + }, + { + "epoch": 554.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1665 + }, + { + "epoch": 555.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1668 + }, + { + "epoch": 556.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1671 + }, + { + "epoch": 557.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1674 + }, + { + "epoch": 558.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1677 + }, + { + "epoch": 559.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1680 + }, + { + "epoch": 560.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1683 + }, + { + "epoch": 561.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1686 + }, + { + "epoch": 562.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1689 + }, + { + "epoch": 563.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1692 + }, + { + "epoch": 564.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1695 + }, + { + "epoch": 565.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1698 + }, + { + "epoch": 566.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1701 + }, + { + "epoch": 567.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1704 + }, + { + "epoch": 568.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1707 + }, + { + "epoch": 569.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1710 + }, + { + "epoch": 570.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1713 + }, + { + "epoch": 571.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1716 + }, + { + "epoch": 572.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1719 + }, + { + "epoch": 573.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1722 + }, + { + "epoch": 574.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1725 + }, + { + "epoch": 575.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1728 + }, + { + "epoch": 575.87, + "eval_exact_match": 0.6943907156673114, + "eval_exec": 0.7330754352030948, + "eval_loss": 0.38700243830680847, + "eval_runtime": 510.5436, + "eval_samples_per_second": 2.025, + "step": 1728 + }, + { + "epoch": 576.87, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 1731 + }, + { + "epoch": 577.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 1734 + }, + { + "epoch": 578.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1737 + }, + { + "epoch": 579.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1740 + }, + { + "epoch": 580.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1743 + }, + { + "epoch": 581.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1746 + }, + { + "epoch": 582.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1749 + }, + { + "epoch": 583.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1752 + }, + { + "epoch": 584.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1755 + }, + { + "epoch": 585.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1758 + }, + { + "epoch": 586.87, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 1761 + }, + { + "epoch": 587.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1764 + }, + { + "epoch": 588.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1767 + }, + { + "epoch": 589.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1770 + }, + { + "epoch": 590.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1773 + }, + { + "epoch": 591.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1776 + }, + { + "epoch": 592.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 1779 + }, + { + "epoch": 593.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1782 + }, + { + "epoch": 594.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1785 + }, + { + "epoch": 595.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1788 + }, + { + "epoch": 596.87, + "learning_rate": 0.0001, + "loss": 0.0021, + "step": 1791 + }, + { + "epoch": 597.29, + "eval_exact_match": 0.6682785299806576, + "eval_exec": 0.7040618955512572, + "eval_loss": 0.37455007433891296, + "eval_runtime": 480.6892, + "eval_samples_per_second": 2.151, + "step": 1792 + }, + { + "epoch": 597.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1794 + }, + { + "epoch": 598.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1797 + }, + { + "epoch": 599.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1800 + }, + { + "epoch": 600.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1803 + }, + { + "epoch": 601.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1806 + }, + { + "epoch": 602.87, + "learning_rate": 0.0001, + "loss": 0.0026, + "step": 1809 + }, + { + "epoch": 603.87, + "learning_rate": 0.0001, + "loss": 0.0035, + "step": 1812 + }, + { + "epoch": 604.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1815 + }, + { + "epoch": 605.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1818 + }, + { + "epoch": 606.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1821 + }, + { + "epoch": 607.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1824 + }, + { + "epoch": 608.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1827 + }, + { + "epoch": 609.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1830 + }, + { + "epoch": 610.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1833 + }, + { + "epoch": 611.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1836 + }, + { + "epoch": 612.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1839 + }, + { + "epoch": 613.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1842 + }, + { + "epoch": 614.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1845 + }, + { + "epoch": 615.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1848 + }, + { + "epoch": 616.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1851 + }, + { + "epoch": 617.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1854 + }, + { + "epoch": 618.58, + "eval_exact_match": 0.688588007736944, + "eval_exec": 0.7195357833655706, + "eval_loss": 0.37960970401763916, + "eval_runtime": 478.0693, + "eval_samples_per_second": 2.163, + "step": 1856 + }, + { + "epoch": 618.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1857 + }, + { + "epoch": 619.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1860 + }, + { + "epoch": 620.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1863 + }, + { + "epoch": 621.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1866 + }, + { + "epoch": 622.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1869 + }, + { + "epoch": 623.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1872 + }, + { + "epoch": 624.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1875 + }, + { + "epoch": 625.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1878 + }, + { + "epoch": 626.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1881 + }, + { + "epoch": 627.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 1884 + }, + { + "epoch": 628.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1887 + }, + { + "epoch": 629.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1890 + }, + { + "epoch": 630.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1893 + }, + { + "epoch": 631.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1896 + }, + { + "epoch": 632.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1899 + }, + { + "epoch": 633.87, + "learning_rate": 0.0001, + "loss": 0.0012, + "step": 1902 + }, + { + "epoch": 634.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1905 + }, + { + "epoch": 635.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1908 + }, + { + "epoch": 636.87, + "learning_rate": 0.0001, + "loss": 0.0026, + "step": 1911 + }, + { + "epoch": 637.87, + "learning_rate": 0.0001, + "loss": 0.0046, + "step": 1914 + }, + { + "epoch": 638.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 1917 + }, + { + "epoch": 639.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1920 + }, + { + "epoch": 639.87, + "eval_exact_match": 0.6847195357833655, + "eval_exec": 0.723404255319149, + "eval_loss": 0.3337988555431366, + "eval_runtime": 469.6103, + "eval_samples_per_second": 2.202, + "step": 1920 + }, + { + "epoch": 640.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1923 + }, + { + "epoch": 641.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1926 + }, + { + "epoch": 642.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1929 + }, + { + "epoch": 643.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1932 + }, + { + "epoch": 644.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1935 + }, + { + "epoch": 645.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1938 + }, + { + "epoch": 646.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1941 + }, + { + "epoch": 647.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1944 + }, + { + "epoch": 648.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1947 + }, + { + "epoch": 649.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1950 + }, + { + "epoch": 650.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1953 + }, + { + "epoch": 651.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1956 + }, + { + "epoch": 652.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1959 + }, + { + "epoch": 653.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1962 + }, + { + "epoch": 654.87, + "learning_rate": 0.0001, + "loss": 0.0012, + "step": 1965 + }, + { + "epoch": 655.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1968 + }, + { + "epoch": 656.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1971 + }, + { + "epoch": 657.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1974 + }, + { + "epoch": 658.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 1977 + }, + { + "epoch": 659.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1980 + }, + { + "epoch": 660.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1983 + }, + { + "epoch": 661.29, + "eval_exact_match": 0.6731141199226306, + "eval_exec": 0.7127659574468085, + "eval_loss": 0.384302020072937, + "eval_runtime": 462.6516, + "eval_samples_per_second": 2.235, + "step": 1984 + }, + { + "epoch": 661.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1986 + }, + { + "epoch": 662.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1989 + }, + { + "epoch": 663.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 1992 + }, + { + "epoch": 664.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1995 + }, + { + "epoch": 665.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 1998 + }, + { + "epoch": 666.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2001 + }, + { + "epoch": 667.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2004 + }, + { + "epoch": 668.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2007 + }, + { + "epoch": 669.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2010 + }, + { + "epoch": 670.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2013 + }, + { + "epoch": 671.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2016 + }, + { + "epoch": 672.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2019 + }, + { + "epoch": 673.87, + "learning_rate": 0.0001, + "loss": 0.003, + "step": 2022 + }, + { + "epoch": 674.87, + "learning_rate": 0.0001, + "loss": 0.0014, + "step": 2025 + }, + { + "epoch": 675.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2028 + }, + { + "epoch": 676.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2031 + }, + { + "epoch": 677.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2034 + }, + { + "epoch": 678.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2037 + }, + { + "epoch": 679.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2040 + }, + { + "epoch": 680.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2043 + }, + { + "epoch": 681.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2046 + }, + { + "epoch": 682.58, + "eval_exact_match": 0.6924564796905223, + "eval_exec": 0.7311411992263056, + "eval_loss": 0.3901544213294983, + "eval_runtime": 510.2096, + "eval_samples_per_second": 2.027, + "step": 2048 + }, + { + "epoch": 682.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2049 + }, + { + "epoch": 683.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2052 + }, + { + "epoch": 684.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2055 + }, + { + "epoch": 685.87, + "learning_rate": 0.0001, + "loss": 0.005, + "step": 2058 + }, + { + "epoch": 686.87, + "learning_rate": 0.0001, + "loss": 0.0075, + "step": 2061 + }, + { + "epoch": 687.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 2064 + }, + { + "epoch": 688.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2067 + }, + { + "epoch": 689.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2070 + }, + { + "epoch": 690.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2073 + }, + { + "epoch": 691.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2076 + }, + { + "epoch": 692.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2079 + }, + { + "epoch": 693.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 2082 + }, + { + "epoch": 694.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2085 + }, + { + "epoch": 695.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2088 + }, + { + "epoch": 696.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2091 + }, + { + "epoch": 697.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2094 + }, + { + "epoch": 698.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2097 + }, + { + "epoch": 699.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2100 + }, + { + "epoch": 700.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2103 + }, + { + "epoch": 701.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2106 + }, + { + "epoch": 702.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2109 + }, + { + "epoch": 703.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2112 + }, + { + "epoch": 703.87, + "eval_exact_match": 0.690522243713733, + "eval_exec": 0.7272727272727273, + "eval_loss": 0.3928453326225281, + "eval_runtime": 466.0396, + "eval_samples_per_second": 2.219, + "step": 2112 + }, + { + "epoch": 704.87, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 2115 + }, + { + "epoch": 705.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2118 + }, + { + "epoch": 706.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2121 + }, + { + "epoch": 707.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2124 + }, + { + "epoch": 708.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2127 + }, + { + "epoch": 709.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2130 + }, + { + "epoch": 710.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2133 + }, + { + "epoch": 711.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2136 + }, + { + "epoch": 712.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2139 + }, + { + "epoch": 713.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2142 + }, + { + "epoch": 714.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2145 + }, + { + "epoch": 715.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2148 + }, + { + "epoch": 716.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2151 + }, + { + "epoch": 717.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2154 + }, + { + "epoch": 718.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2157 + }, + { + "epoch": 719.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2160 + }, + { + "epoch": 720.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2163 + }, + { + "epoch": 721.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2166 + }, + { + "epoch": 722.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2169 + }, + { + "epoch": 723.87, + "learning_rate": 0.0001, + "loss": 0.0017, + "step": 2172 + }, + { + "epoch": 724.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 2175 + }, + { + "epoch": 725.29, + "eval_exact_match": 0.7011605415860735, + "eval_exec": 0.7408123791102514, + "eval_loss": 0.3595825433731079, + "eval_runtime": 483.3909, + "eval_samples_per_second": 2.139, + "step": 2176 + }, + { + "epoch": 725.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2178 + }, + { + "epoch": 726.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2181 + }, + { + "epoch": 727.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2184 + }, + { + "epoch": 728.87, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 2187 + }, + { + "epoch": 729.87, + "learning_rate": 0.0001, + "loss": 0.0033, + "step": 2190 + }, + { + "epoch": 730.87, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 2193 + }, + { + "epoch": 731.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2196 + }, + { + "epoch": 732.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2199 + }, + { + "epoch": 733.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2202 + }, + { + "epoch": 734.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2205 + }, + { + "epoch": 735.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2208 + }, + { + "epoch": 736.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2211 + }, + { + "epoch": 737.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2214 + }, + { + "epoch": 738.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2217 + }, + { + "epoch": 739.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2220 + }, + { + "epoch": 740.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2223 + }, + { + "epoch": 741.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2226 + }, + { + "epoch": 742.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2229 + }, + { + "epoch": 743.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2232 + }, + { + "epoch": 744.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2235 + }, + { + "epoch": 745.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2238 + }, + { + "epoch": 746.58, + "eval_exact_match": 0.690522243713733, + "eval_exec": 0.7350096711798839, + "eval_loss": 0.3938235640525818, + "eval_runtime": 476.497, + "eval_samples_per_second": 2.17, + "step": 2240 + }, + { + "epoch": 746.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2241 + }, + { + "epoch": 747.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2244 + }, + { + "epoch": 748.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2247 + }, + { + "epoch": 749.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2250 + }, + { + "epoch": 750.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2253 + }, + { + "epoch": 751.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2256 + }, + { + "epoch": 752.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2259 + }, + { + "epoch": 753.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2262 + }, + { + "epoch": 754.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2265 + }, + { + "epoch": 755.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2268 + }, + { + "epoch": 756.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2271 + }, + { + "epoch": 757.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2274 + }, + { + "epoch": 758.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2277 + }, + { + "epoch": 759.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2280 + }, + { + "epoch": 760.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2283 + }, + { + "epoch": 761.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2286 + }, + { + "epoch": 762.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2289 + }, + { + "epoch": 763.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2292 + }, + { + "epoch": 764.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2295 + }, + { + "epoch": 765.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2298 + }, + { + "epoch": 766.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2301 + }, + { + "epoch": 767.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2304 + }, + { + "epoch": 767.87, + "eval_exact_match": 0.7030947775628626, + "eval_exec": 0.7485493230174082, + "eval_loss": 0.41635680198669434, + "eval_runtime": 468.9737, + "eval_samples_per_second": 2.205, + "step": 2304 + }, + { + "epoch": 768.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2307 + }, + { + "epoch": 769.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2310 + }, + { + "epoch": 770.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2313 + }, + { + "epoch": 771.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2316 + }, + { + "epoch": 772.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2319 + }, + { + "epoch": 773.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2322 + }, + { + "epoch": 774.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2325 + }, + { + "epoch": 775.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2328 + }, + { + "epoch": 776.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2331 + }, + { + "epoch": 777.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2334 + }, + { + "epoch": 778.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2337 + }, + { + "epoch": 779.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2340 + }, + { + "epoch": 780.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2343 + }, + { + "epoch": 781.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2346 + }, + { + "epoch": 782.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2349 + }, + { + "epoch": 783.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2352 + }, + { + "epoch": 784.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2355 + }, + { + "epoch": 785.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2358 + }, + { + "epoch": 786.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2361 + }, + { + "epoch": 787.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2364 + }, + { + "epoch": 788.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2367 + }, + { + "epoch": 789.29, + "eval_exact_match": 0.7030947775628626, + "eval_exec": 0.7456479690522244, + "eval_loss": 0.4018816649913788, + "eval_runtime": 554.0201, + "eval_samples_per_second": 1.866, + "step": 2368 + }, + { + "epoch": 789.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2370 + }, + { + "epoch": 790.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2373 + }, + { + "epoch": 791.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2376 + }, + { + "epoch": 792.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2379 + }, + { + "epoch": 793.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2382 + }, + { + "epoch": 794.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2385 + }, + { + "epoch": 795.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2388 + }, + { + "epoch": 796.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2391 + }, + { + "epoch": 797.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2394 + }, + { + "epoch": 798.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2397 + }, + { + "epoch": 799.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2400 + }, + { + "epoch": 800.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2403 + }, + { + "epoch": 801.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2406 + }, + { + "epoch": 802.87, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 2409 + }, + { + "epoch": 803.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2412 + }, + { + "epoch": 804.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2415 + }, + { + "epoch": 805.87, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 2418 + }, + { + "epoch": 806.87, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 2421 + }, + { + "epoch": 807.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2424 + }, + { + "epoch": 808.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2427 + }, + { + "epoch": 809.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2430 + }, + { + "epoch": 810.58, + "eval_exact_match": 0.7040618955512572, + "eval_exec": 0.7495164410058027, + "eval_loss": 0.37046629190444946, + "eval_runtime": 545.4486, + "eval_samples_per_second": 1.896, + "step": 2432 + }, + { + "epoch": 810.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2433 + }, + { + "epoch": 811.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2436 + }, + { + "epoch": 812.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2439 + }, + { + "epoch": 813.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2442 + }, + { + "epoch": 814.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2445 + }, + { + "epoch": 815.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2448 + }, + { + "epoch": 816.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2451 + }, + { + "epoch": 817.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2454 + }, + { + "epoch": 818.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2457 + }, + { + "epoch": 819.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2460 + }, + { + "epoch": 820.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2463 + }, + { + "epoch": 821.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2466 + }, + { + "epoch": 822.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2469 + }, + { + "epoch": 823.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2472 + }, + { + "epoch": 824.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2475 + }, + { + "epoch": 825.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2478 + }, + { + "epoch": 826.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2481 + }, + { + "epoch": 827.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2484 + }, + { + "epoch": 828.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2487 + }, + { + "epoch": 829.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2490 + }, + { + "epoch": 830.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2493 + }, + { + "epoch": 831.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2496 + }, + { + "epoch": 831.87, + "eval_exact_match": 0.7011605415860735, + "eval_exec": 0.7408123791102514, + "eval_loss": 0.4019996225833893, + "eval_runtime": 517.5919, + "eval_samples_per_second": 1.998, + "step": 2496 + }, + { + "epoch": 832.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2499 + }, + { + "epoch": 833.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2502 + }, + { + "epoch": 834.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2505 + }, + { + "epoch": 835.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2508 + }, + { + "epoch": 836.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2511 + }, + { + "epoch": 837.87, + "learning_rate": 0.0001, + "loss": 0.001, + "step": 2514 + }, + { + "epoch": 838.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 2517 + }, + { + "epoch": 839.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2520 + }, + { + "epoch": 840.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2523 + }, + { + "epoch": 841.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2526 + }, + { + "epoch": 842.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2529 + }, + { + "epoch": 843.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2532 + }, + { + "epoch": 844.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2535 + }, + { + "epoch": 845.87, + "learning_rate": 0.0001, + "loss": 0.0013, + "step": 2538 + }, + { + "epoch": 846.87, + "learning_rate": 0.0001, + "loss": 0.001, + "step": 2541 + }, + { + "epoch": 847.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2544 + }, + { + "epoch": 848.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 2547 + }, + { + "epoch": 849.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2550 + }, + { + "epoch": 850.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2553 + }, + { + "epoch": 851.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2556 + }, + { + "epoch": 852.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2559 + }, + { + "epoch": 853.29, + "eval_exact_match": 0.7030947775628626, + "eval_exec": 0.7475822050290135, + "eval_loss": 0.36097148060798645, + "eval_runtime": 463.1991, + "eval_samples_per_second": 2.232, + "step": 2560 + }, + { + "epoch": 853.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2562 + }, + { + "epoch": 854.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2565 + }, + { + "epoch": 855.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2568 + }, + { + "epoch": 856.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2571 + }, + { + "epoch": 857.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2574 + }, + { + "epoch": 858.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2577 + }, + { + "epoch": 859.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2580 + }, + { + "epoch": 860.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2583 + }, + { + "epoch": 861.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2586 + }, + { + "epoch": 862.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2589 + }, + { + "epoch": 863.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2592 + }, + { + "epoch": 864.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2595 + }, + { + "epoch": 865.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2598 + }, + { + "epoch": 866.87, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 2601 + }, + { + "epoch": 867.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2604 + }, + { + "epoch": 868.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2607 + }, + { + "epoch": 869.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2610 + }, + { + "epoch": 870.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2613 + }, + { + "epoch": 871.87, + "learning_rate": 0.0001, + "loss": 0.0027, + "step": 2616 + }, + { + "epoch": 872.87, + "learning_rate": 0.0001, + "loss": 0.0008, + "step": 2619 + }, + { + "epoch": 873.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2622 + }, + { + "epoch": 874.58, + "eval_exact_match": 0.7195357833655706, + "eval_exec": 0.7659574468085106, + "eval_loss": 0.33834004402160645, + "eval_runtime": 494.2046, + "eval_samples_per_second": 2.092, + "step": 2624 + }, + { + "epoch": 874.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2625 + }, + { + "epoch": 875.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2628 + }, + { + "epoch": 876.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2631 + }, + { + "epoch": 877.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2634 + }, + { + "epoch": 878.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2637 + }, + { + "epoch": 879.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2640 + }, + { + "epoch": 880.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2643 + }, + { + "epoch": 881.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2646 + }, + { + "epoch": 882.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2649 + }, + { + "epoch": 883.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2652 + }, + { + "epoch": 884.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 2655 + }, + { + "epoch": 885.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2658 + }, + { + "epoch": 886.87, + "learning_rate": 0.0001, + "loss": 0.001, + "step": 2661 + }, + { + "epoch": 887.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2664 + }, + { + "epoch": 888.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 2667 + }, + { + "epoch": 889.87, + "learning_rate": 0.0001, + "loss": 0.0012, + "step": 2670 + }, + { + "epoch": 890.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 2673 + }, + { + "epoch": 891.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2676 + }, + { + "epoch": 892.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2679 + }, + { + "epoch": 893.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2682 + }, + { + "epoch": 894.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2685 + }, + { + "epoch": 895.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2688 + }, + { + "epoch": 895.87, + "eval_exact_match": 0.7079303675048356, + "eval_exec": 0.753384912959381, + "eval_loss": 0.36499086022377014, + "eval_runtime": 549.4813, + "eval_samples_per_second": 1.882, + "step": 2688 + }, + { + "epoch": 896.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2691 + }, + { + "epoch": 897.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2694 + }, + { + "epoch": 898.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2697 + }, + { + "epoch": 899.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2700 + }, + { + "epoch": 900.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2703 + }, + { + "epoch": 901.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2706 + }, + { + "epoch": 902.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2709 + }, + { + "epoch": 903.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2712 + }, + { + "epoch": 904.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2715 + }, + { + "epoch": 905.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2718 + }, + { + "epoch": 906.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2721 + }, + { + "epoch": 907.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2724 + }, + { + "epoch": 908.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2727 + }, + { + "epoch": 909.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2730 + }, + { + "epoch": 910.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2733 + }, + { + "epoch": 911.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2736 + }, + { + "epoch": 912.87, + "learning_rate": 0.0001, + "loss": 0.0052, + "step": 2739 + }, + { + "epoch": 913.87, + "learning_rate": 0.0001, + "loss": 0.0085, + "step": 2742 + }, + { + "epoch": 914.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 2745 + }, + { + "epoch": 915.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2748 + }, + { + "epoch": 916.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2751 + }, + { + "epoch": 917.29, + "eval_exact_match": 0.7176015473887815, + "eval_exec": 0.758220502901354, + "eval_loss": 0.35287681221961975, + "eval_runtime": 527.4463, + "eval_samples_per_second": 1.96, + "step": 2752 + }, + { + "epoch": 917.87, + "learning_rate": 0.0001, + "loss": 0.001, + "step": 2754 + }, + { + "epoch": 918.87, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 2757 + }, + { + "epoch": 919.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2760 + }, + { + "epoch": 920.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2763 + }, + { + "epoch": 921.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2766 + }, + { + "epoch": 922.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2769 + }, + { + "epoch": 923.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2772 + }, + { + "epoch": 924.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2775 + }, + { + "epoch": 925.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2778 + }, + { + "epoch": 926.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2781 + }, + { + "epoch": 927.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2784 + }, + { + "epoch": 928.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2787 + }, + { + "epoch": 929.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2790 + }, + { + "epoch": 930.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2793 + }, + { + "epoch": 931.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2796 + }, + { + "epoch": 932.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2799 + }, + { + "epoch": 933.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2802 + }, + { + "epoch": 934.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 2805 + }, + { + "epoch": 935.87, + "learning_rate": 0.0001, + "loss": 0.0031, + "step": 2808 + }, + { + "epoch": 936.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 2811 + }, + { + "epoch": 937.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2814 + }, + { + "epoch": 938.58, + "eval_exact_match": 0.7040618955512572, + "eval_exec": 0.7543520309477756, + "eval_loss": 0.3663797676563263, + "eval_runtime": 481.0364, + "eval_samples_per_second": 2.15, + "step": 2816 + }, + { + "epoch": 938.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2817 + }, + { + "epoch": 939.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2820 + }, + { + "epoch": 940.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2823 + }, + { + "epoch": 941.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2826 + }, + { + "epoch": 942.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2829 + }, + { + "epoch": 943.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2832 + }, + { + "epoch": 944.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2835 + }, + { + "epoch": 945.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2838 + }, + { + "epoch": 946.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2841 + }, + { + "epoch": 947.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2844 + }, + { + "epoch": 948.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2847 + }, + { + "epoch": 949.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2850 + }, + { + "epoch": 950.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2853 + }, + { + "epoch": 951.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2856 + }, + { + "epoch": 952.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2859 + }, + { + "epoch": 953.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2862 + }, + { + "epoch": 954.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2865 + }, + { + "epoch": 955.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2868 + }, + { + "epoch": 956.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2871 + }, + { + "epoch": 957.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2874 + }, + { + "epoch": 958.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2877 + }, + { + "epoch": 959.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2880 + }, + { + "epoch": 959.87, + "eval_exact_match": 0.7040618955512572, + "eval_exec": 0.7456479690522244, + "eval_loss": 0.3911956250667572, + "eval_runtime": 473.5193, + "eval_samples_per_second": 2.184, + "step": 2880 + }, + { + "epoch": 960.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2883 + }, + { + "epoch": 961.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2886 + }, + { + "epoch": 962.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2889 + }, + { + "epoch": 963.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2892 + }, + { + "epoch": 964.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2895 + }, + { + "epoch": 965.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2898 + }, + { + "epoch": 966.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2901 + }, + { + "epoch": 967.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2904 + }, + { + "epoch": 968.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2907 + }, + { + "epoch": 969.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2910 + }, + { + "epoch": 970.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2913 + }, + { + "epoch": 971.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2916 + }, + { + "epoch": 972.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2919 + }, + { + "epoch": 973.87, + "learning_rate": 0.0001, + "loss": 0.0037, + "step": 2922 + }, + { + "epoch": 974.87, + "learning_rate": 0.0001, + "loss": 0.0012, + "step": 2925 + }, + { + "epoch": 975.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2928 + }, + { + "epoch": 976.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2931 + }, + { + "epoch": 977.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2934 + }, + { + "epoch": 978.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2937 + }, + { + "epoch": 979.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2940 + }, + { + "epoch": 980.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2943 + }, + { + "epoch": 981.29, + "eval_exact_match": 0.7195357833655706, + "eval_exec": 0.7485493230174082, + "eval_loss": 0.3832685053348541, + "eval_runtime": 501.8416, + "eval_samples_per_second": 2.06, + "step": 2944 + }, + { + "epoch": 981.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2946 + }, + { + "epoch": 982.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2949 + }, + { + "epoch": 983.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2952 + }, + { + "epoch": 984.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2955 + }, + { + "epoch": 985.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2958 + }, + { + "epoch": 986.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2961 + }, + { + "epoch": 987.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2964 + }, + { + "epoch": 988.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2967 + }, + { + "epoch": 989.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2970 + }, + { + "epoch": 990.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2973 + }, + { + "epoch": 991.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2976 + }, + { + "epoch": 992.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 2979 + }, + { + "epoch": 993.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2982 + }, + { + "epoch": 994.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 2985 + }, + { + "epoch": 995.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 2988 + }, + { + "epoch": 996.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2991 + }, + { + "epoch": 997.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 2994 + }, + { + "epoch": 998.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 2997 + }, + { + "epoch": 999.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3000 + }, + { + "epoch": 1000.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3003 + }, + { + "epoch": 1001.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3006 + }, + { + "epoch": 1002.58, + "eval_exact_match": 0.7156673114119922, + "eval_exec": 0.7427466150870407, + "eval_loss": 0.3957798480987549, + "eval_runtime": 491.7076, + "eval_samples_per_second": 2.103, + "step": 3008 + }, + { + "epoch": 1002.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3009 + }, + { + "epoch": 1003.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3012 + }, + { + "epoch": 1004.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3015 + }, + { + "epoch": 1005.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3018 + }, + { + "epoch": 1006.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3021 + }, + { + "epoch": 1007.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 3024 + }, + { + "epoch": 1008.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3027 + }, + { + "epoch": 1009.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3030 + }, + { + "epoch": 1010.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3033 + }, + { + "epoch": 1011.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3036 + }, + { + "epoch": 1012.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3039 + }, + { + "epoch": 1013.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3042 + }, + { + "epoch": 1014.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3045 + }, + { + "epoch": 1015.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 3048 + }, + { + "epoch": 1016.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 3051 + }, + { + "epoch": 1017.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3054 + }, + { + "epoch": 1018.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3057 + }, + { + "epoch": 1019.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3060 + }, + { + "epoch": 1020.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3063 + }, + { + "epoch": 1021.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3066 + }, + { + "epoch": 1022.87, + "learning_rate": 0.0001, + "loss": 0.0006, + "step": 3069 + }, + { + "epoch": 1023.87, + "learning_rate": 0.0001, + "loss": 0.0018, + "step": 3072 + }, + { + "epoch": 1023.87, + "eval_exact_match": 0.690522243713733, + "eval_exec": 0.7253384912959381, + "eval_loss": 0.3460274934768677, + "eval_runtime": 479.4016, + "eval_samples_per_second": 2.157, + "step": 3072 + }, + { + "epoch": 1024.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3075 + }, + { + "epoch": 1025.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3078 + }, + { + "epoch": 1026.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3081 + }, + { + "epoch": 1027.87, + "learning_rate": 0.0001, + "loss": 0.0011, + "step": 3084 + }, + { + "epoch": 1028.87, + "learning_rate": 0.0001, + "loss": 0.0023, + "step": 3087 + }, + { + "epoch": 1029.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 3090 + }, + { + "epoch": 1030.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3093 + }, + { + "epoch": 1031.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3096 + }, + { + "epoch": 1032.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3099 + }, + { + "epoch": 1033.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3102 + }, + { + "epoch": 1034.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3105 + }, + { + "epoch": 1035.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3108 + }, + { + "epoch": 1036.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3111 + }, + { + "epoch": 1037.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3114 + }, + { + "epoch": 1038.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 3117 + }, + { + "epoch": 1039.87, + "learning_rate": 0.0001, + "loss": 0.0049, + "step": 3120 + }, + { + "epoch": 1040.87, + "learning_rate": 0.0001, + "loss": 0.0003, + "step": 3123 + }, + { + "epoch": 1041.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 3126 + }, + { + "epoch": 1042.87, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 3129 + }, + { + "epoch": 1043.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 3132 + }, + { + "epoch": 1044.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3135 + }, + { + "epoch": 1045.29, + "eval_exact_match": 0.7001934235976789, + "eval_exec": 0.7427466150870407, + "eval_loss": 0.355153888463974, + "eval_runtime": 451.6545, + "eval_samples_per_second": 2.289, + "step": 3136 + }, + { + "epoch": 1045.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3138 + }, + { + "epoch": 1046.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3141 + }, + { + "epoch": 1047.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3144 + }, + { + "epoch": 1048.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3147 + }, + { + "epoch": 1049.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3150 + }, + { + "epoch": 1050.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3153 + }, + { + "epoch": 1051.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3156 + }, + { + "epoch": 1052.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3159 + }, + { + "epoch": 1053.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3162 + }, + { + "epoch": 1054.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3165 + }, + { + "epoch": 1055.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 3168 + }, + { + "epoch": 1056.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3171 + }, + { + "epoch": 1057.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3174 + }, + { + "epoch": 1058.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3177 + }, + { + "epoch": 1059.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3180 + }, + { + "epoch": 1060.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3183 + }, + { + "epoch": 1061.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3186 + }, + { + "epoch": 1062.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3189 + }, + { + "epoch": 1063.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3192 + }, + { + "epoch": 1064.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3195 + }, + { + "epoch": 1065.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 3198 + }, + { + "epoch": 1066.58, + "eval_exact_match": 0.7127659574468085, + "eval_exec": 0.7543520309477756, + "eval_loss": 0.4037820100784302, + "eval_runtime": 464.6921, + "eval_samples_per_second": 2.225, + "step": 3200 + }, + { + "epoch": 1066.87, + "learning_rate": 0.0001, + "loss": 0.0004, + "step": 3201 + }, + { + "epoch": 1067.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3204 + }, + { + "epoch": 1068.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3207 + }, + { + "epoch": 1069.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3210 + }, + { + "epoch": 1070.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3213 + }, + { + "epoch": 1071.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3216 + }, + { + "epoch": 1072.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3219 + }, + { + "epoch": 1073.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3222 + }, + { + "epoch": 1074.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3225 + }, + { + "epoch": 1075.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3228 + }, + { + "epoch": 1076.87, + "learning_rate": 0.0001, + "loss": 0.0002, + "step": 3231 + }, + { + "epoch": 1077.87, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 3234 + }, + { + "epoch": 1078.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3237 + }, + { + "epoch": 1079.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3240 + }, + { + "epoch": 1080.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3243 + }, + { + "epoch": 1081.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3246 + }, + { + "epoch": 1082.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3249 + }, + { + "epoch": 1083.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3252 + }, + { + "epoch": 1084.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3255 + }, + { + "epoch": 1085.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3258 + }, + { + "epoch": 1086.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3261 + }, + { + "epoch": 1087.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3264 + }, + { + "epoch": 1087.87, + "eval_exact_match": 0.7117988394584139, + "eval_exec": 0.7485493230174082, + "eval_loss": 0.4036550521850586, + "eval_runtime": 486.4323, + "eval_samples_per_second": 2.126, + "step": 3264 + }, + { + "epoch": 1088.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3267 + }, + { + "epoch": 1089.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3270 + }, + { + "epoch": 1090.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3273 + }, + { + "epoch": 1091.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3276 + }, + { + "epoch": 1092.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3279 + }, + { + "epoch": 1093.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3282 + }, + { + "epoch": 1094.87, + "learning_rate": 0.0001, + "loss": 0.0007, + "step": 3285 + }, + { + "epoch": 1095.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3288 + }, + { + "epoch": 1096.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3291 + }, + { + "epoch": 1097.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3294 + }, + { + "epoch": 1098.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3297 + }, + { + "epoch": 1099.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3300 + }, + { + "epoch": 1100.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3303 + }, + { + "epoch": 1101.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3306 + }, + { + "epoch": 1102.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3309 + }, + { + "epoch": 1103.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3312 + }, + { + "epoch": 1104.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3315 + }, + { + "epoch": 1105.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3318 + }, + { + "epoch": 1106.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3321 + }, + { + "epoch": 1107.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3324 + }, + { + "epoch": 1108.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3327 + }, + { + "epoch": 1109.29, + "eval_exact_match": 0.7021276595744681, + "eval_exec": 0.7408123791102514, + "eval_loss": 0.4005126953125, + "eval_runtime": 524.1199, + "eval_samples_per_second": 1.973, + "step": 3328 + }, + { + "epoch": 1109.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3330 + }, + { + "epoch": 1110.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3333 + }, + { + "epoch": 1111.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3336 + }, + { + "epoch": 1112.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3339 + }, + { + "epoch": 1113.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3342 + }, + { + "epoch": 1114.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3345 + }, + { + "epoch": 1115.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3348 + }, + { + "epoch": 1116.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3351 + }, + { + "epoch": 1117.87, + "learning_rate": 0.0001, + "loss": 0.0001, + "step": 3354 + }, + { + "epoch": 1118.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3357 + }, + { + "epoch": 1119.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3360 + }, + { + "epoch": 1120.87, + "learning_rate": 0.0001, + "loss": 0.0005, + "step": 3363 + }, + { + "epoch": 1121.87, + "learning_rate": 0.0001, + "loss": 0.0009, + "step": 3366 + }, + { + "epoch": 1122.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3369 + }, + { + "epoch": 1123.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3372 + }, + { + "epoch": 1124.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3375 + }, + { + "epoch": 1125.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3378 + }, + { + "epoch": 1126.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3381 + }, + { + "epoch": 1127.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3384 + }, + { + "epoch": 1128.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3387 + }, + { + "epoch": 1129.87, + "learning_rate": 0.0001, + "loss": 0.0, + "step": 3390 + }, + { + "epoch": 1130.58, + "eval_exact_match": 0.7214700193423598, + "eval_exec": 0.7562862669245648, + "eval_loss": 0.41660553216934204, + "eval_runtime": 517.542, + "eval_samples_per_second": 1.998, + "step": 3392 + } + ], + "max_steps": 9216, + "num_train_epochs": 3072, + "total_flos": 2.960191905890671e+19, + "trial_name": null, + "trial_params": null +}