akahana committed on
Commit
9d6ea3c
·
verified ·
1 Parent(s): afeb34c

End of training

Browse files
README.md CHANGED
@@ -1,9 +1,24 @@
1
  ---
2
  tags:
3
  - generated_from_trainer
 
 
 
 
4
  model-index:
5
  - name: tinygpt2-javanese
6
- results: []
 
 
 
 
 
 
 
 
 
 
 
7
  ---
8
 
9
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -11,7 +26,10 @@ should probably proofread and complete it, then remove this comment. -->
11
 
12
  # tinygpt2-javanese
13
 
14
- This model is a fine-tuned version of [](https://huggingface.co/) on an unknown dataset.
 
 
 
15
 
16
  ## Model description
17
 
 
1
  ---
2
  tags:
3
  - generated_from_trainer
4
+ datasets:
5
+ - akahana/GlotCC-V1-jav-Latn
6
+ metrics:
7
+ - accuracy
8
  model-index:
9
  - name: tinygpt2-javanese
10
+ results:
11
+ - task:
12
+ name: Causal Language Modeling
13
+ type: text-generation
14
+ dataset:
15
+ name: akahana/GlotCC-V1-jav-Latn default
16
+ type: akahana/GlotCC-V1-jav-Latn
17
+ args: default
18
+ metrics:
19
+ - name: Accuracy
20
+ type: accuracy
21
+ value: 0.2786154321383402
22
  ---
23
 
24
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
26
 
27
  # tinygpt2-javanese
28
 
29
+ This model is a fine-tuned version of [](https://huggingface.co/) on the akahana/GlotCC-V1-jav-Latn default dataset.
30
+ It achieves the following results on the evaluation set:
31
+ - Loss: 4.7648
32
+ - Accuracy: 0.2786
33
 
34
  ## Model description
35
 
all_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
- "epoch": 20.0,
3
- "eval_accuracy": 0.25270675362470885,
4
- "eval_loss": 5.005028247833252,
5
- "eval_runtime": 10.9985,
6
  "eval_samples": 4053,
7
- "eval_samples_per_second": 368.506,
8
- "eval_steps_per_second": 92.195,
9
- "perplexity": 149.16129658231105,
10
- "total_flos": 488922611712000.0,
11
- "train_loss": 5.48259629872295,
12
- "train_runtime": 4495.3437,
13
  "train_samples": 80219,
14
- "train_samples_per_second": 356.898,
15
- "train_steps_per_second": 22.308
16
  }
 
1
  {
2
+ "epoch": 30.0,
3
+ "eval_accuracy": 0.2786154321383402,
4
+ "eval_loss": 4.764777183532715,
5
+ "eval_runtime": 11.4146,
6
  "eval_samples": 4053,
7
+ "eval_samples_per_second": 355.07,
8
+ "eval_steps_per_second": 88.833,
9
+ "perplexity": 117.30497689511513,
10
+ "total_flos": 733383917568000.0,
11
+ "train_loss": 1.6002090492649228,
12
+ "train_runtime": 2253.0761,
13
  "train_samples": 80219,
14
+ "train_samples_per_second": 1068.126,
15
+ "train_steps_per_second": 66.762
16
  }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "epoch": 20.0,
3
- "eval_accuracy": 0.25270675362470885,
4
- "eval_loss": 5.005028247833252,
5
- "eval_runtime": 10.9985,
6
  "eval_samples": 4053,
7
- "eval_samples_per_second": 368.506,
8
- "eval_steps_per_second": 92.195,
9
- "perplexity": 149.16129658231105
10
  }
 
1
  {
2
+ "epoch": 30.0,
3
+ "eval_accuracy": 0.2786154321383402,
4
+ "eval_loss": 4.764777183532715,
5
+ "eval_runtime": 11.4146,
6
  "eval_samples": 4053,
7
+ "eval_samples_per_second": 355.07,
8
+ "eval_steps_per_second": 88.833,
9
+ "perplexity": 117.30497689511513
10
  }
runs/Jul21_23-32-54_78d944cbbe34/events.out.tfevents.1721607100.78d944cbbe34.21392.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a2c4f7fe7d0d902e53fe05a8feb3b0176198da56613547c2c261aa31ff89215
3
+ size 417
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 20.0,
3
- "total_flos": 488922611712000.0,
4
- "train_loss": 5.48259629872295,
5
- "train_runtime": 4495.3437,
6
  "train_samples": 80219,
7
- "train_samples_per_second": 356.898,
8
- "train_steps_per_second": 22.308
9
  }
 
1
  {
2
+ "epoch": 30.0,
3
+ "total_flos": 733383917568000.0,
4
+ "train_loss": 1.6002090492649228,
5
+ "train_runtime": 2253.0761,
6
  "train_samples": 80219,
7
+ "train_samples_per_second": 1068.126,
8
+ "train_steps_per_second": 66.762
9
  }
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 20.0,
5
  "eval_steps": 500,
6
- "global_step": 100280,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1416,12 +1416,721 @@
1416
  "train_runtime": 4495.3437,
1417
  "train_samples_per_second": 356.898,
1418
  "train_steps_per_second": 22.308
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1419
  }
1420
  ],
1421
  "logging_steps": 500,
1422
- "max_steps": 100280,
1423
  "num_input_tokens_seen": 0,
1424
- "num_train_epochs": 20,
1425
  "save_steps": 500,
1426
  "stateful_callbacks": {
1427
  "TrainerControl": {
@@ -1435,7 +2144,7 @@
1435
  "attributes": {}
1436
  }
1437
  },
1438
- "total_flos": 488922611712000.0,
1439
  "train_batch_size": 16,
1440
  "trial_name": null,
1441
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 30.0,
5
  "eval_steps": 500,
6
+ "global_step": 150420,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1416
  "train_runtime": 4495.3437,
1417
  "train_samples_per_second": 356.898,
1418
  "train_steps_per_second": 22.308
1419
+ },
1420
+ {
1421
+ "epoch": 20.04387714399681,
1422
+ "grad_norm": 2.977564573287964,
1423
+ "learning_rate": 4.992687142667199e-05,
1424
+ "loss": 5.0172,
1425
+ "step": 100500
1426
+ },
1427
+ {
1428
+ "epoch": 20.14359792580774,
1429
+ "grad_norm": 2.9825730323791504,
1430
+ "learning_rate": 4.976067012365378e-05,
1431
+ "loss": 5.0479,
1432
+ "step": 101000
1433
+ },
1434
+ {
1435
+ "epoch": 20.243318707618666,
1436
+ "grad_norm": 3.019160747528076,
1437
+ "learning_rate": 4.959446882063555e-05,
1438
+ "loss": 5.0469,
1439
+ "step": 101500
1440
+ },
1441
+ {
1442
+ "epoch": 20.343039489429597,
1443
+ "grad_norm": 3.1838183403015137,
1444
+ "learning_rate": 4.942826751761734e-05,
1445
+ "loss": 5.0334,
1446
+ "step": 102000
1447
+ },
1448
+ {
1449
+ "epoch": 20.442760271240527,
1450
+ "grad_norm": 3.0762367248535156,
1451
+ "learning_rate": 4.926206621459912e-05,
1452
+ "loss": 5.0445,
1453
+ "step": 102500
1454
+ },
1455
+ {
1456
+ "epoch": 20.542481053051457,
1457
+ "grad_norm": 3.038895606994629,
1458
+ "learning_rate": 4.909586491158091e-05,
1459
+ "loss": 4.9971,
1460
+ "step": 103000
1461
+ },
1462
+ {
1463
+ "epoch": 20.642201834862384,
1464
+ "grad_norm": 3.1863033771514893,
1465
+ "learning_rate": 4.892966360856269e-05,
1466
+ "loss": 5.0347,
1467
+ "step": 103500
1468
+ },
1469
+ {
1470
+ "epoch": 20.741922616673314,
1471
+ "grad_norm": 2.8989017009735107,
1472
+ "learning_rate": 4.876346230554448e-05,
1473
+ "loss": 5.0353,
1474
+ "step": 104000
1475
+ },
1476
+ {
1477
+ "epoch": 20.841643398484244,
1478
+ "grad_norm": 2.9284589290618896,
1479
+ "learning_rate": 4.85975934051323e-05,
1480
+ "loss": 5.025,
1481
+ "step": 104500
1482
+ },
1483
+ {
1484
+ "epoch": 20.941364180295174,
1485
+ "grad_norm": 2.8213396072387695,
1486
+ "learning_rate": 4.843139210211408e-05,
1487
+ "loss": 5.0041,
1488
+ "step": 105000
1489
+ },
1490
+ {
1491
+ "epoch": 21.041084962106105,
1492
+ "grad_norm": 3.0717594623565674,
1493
+ "learning_rate": 4.826519079909587e-05,
1494
+ "loss": 4.9886,
1495
+ "step": 105500
1496
+ },
1497
+ {
1498
+ "epoch": 21.14080574391703,
1499
+ "grad_norm": 2.702904224395752,
1500
+ "learning_rate": 4.809898949607765e-05,
1501
+ "loss": 4.956,
1502
+ "step": 106000
1503
+ },
1504
+ {
1505
+ "epoch": 21.24052652572796,
1506
+ "grad_norm": 2.5885391235351562,
1507
+ "learning_rate": 4.793312059566547e-05,
1508
+ "loss": 4.9718,
1509
+ "step": 106500
1510
+ },
1511
+ {
1512
+ "epoch": 21.340247307538892,
1513
+ "grad_norm": 2.9367082118988037,
1514
+ "learning_rate": 4.776691929264726e-05,
1515
+ "loss": 4.9589,
1516
+ "step": 107000
1517
+ },
1518
+ {
1519
+ "epoch": 21.439968089349822,
1520
+ "grad_norm": 2.8302552700042725,
1521
+ "learning_rate": 4.760071798962904e-05,
1522
+ "loss": 4.9731,
1523
+ "step": 107500
1524
+ },
1525
+ {
1526
+ "epoch": 21.53968887116075,
1527
+ "grad_norm": 3.23287296295166,
1528
+ "learning_rate": 4.743451668661083e-05,
1529
+ "loss": 4.9309,
1530
+ "step": 108000
1531
+ },
1532
+ {
1533
+ "epoch": 21.63940965297168,
1534
+ "grad_norm": 2.868462562561035,
1535
+ "learning_rate": 4.7268647786198647e-05,
1536
+ "loss": 4.9436,
1537
+ "step": 108500
1538
+ },
1539
+ {
1540
+ "epoch": 21.73913043478261,
1541
+ "grad_norm": 2.8602261543273926,
1542
+ "learning_rate": 4.710244648318043e-05,
1543
+ "loss": 4.9688,
1544
+ "step": 109000
1545
+ },
1546
+ {
1547
+ "epoch": 21.83885121659354,
1548
+ "grad_norm": 3.121124505996704,
1549
+ "learning_rate": 4.6936245180162217e-05,
1550
+ "loss": 4.9444,
1551
+ "step": 109500
1552
+ },
1553
+ {
1554
+ "epoch": 21.938571998404466,
1555
+ "grad_norm": 2.777409553527832,
1556
+ "learning_rate": 4.6770043877144e-05,
1557
+ "loss": 4.9165,
1558
+ "step": 110000
1559
+ },
1560
+ {
1561
+ "epoch": 22.038292780215397,
1562
+ "grad_norm": 2.9804909229278564,
1563
+ "learning_rate": 4.660417497673182e-05,
1564
+ "loss": 4.9534,
1565
+ "step": 110500
1566
+ },
1567
+ {
1568
+ "epoch": 22.138013562026327,
1569
+ "grad_norm": 3.034639596939087,
1570
+ "learning_rate": 4.6437973673713605e-05,
1571
+ "loss": 4.9069,
1572
+ "step": 111000
1573
+ },
1574
+ {
1575
+ "epoch": 22.237734343837257,
1576
+ "grad_norm": 2.7168800830841064,
1577
+ "learning_rate": 4.627177237069539e-05,
1578
+ "loss": 4.8861,
1579
+ "step": 111500
1580
+ },
1581
+ {
1582
+ "epoch": 22.337455125648184,
1583
+ "grad_norm": 2.8833560943603516,
1584
+ "learning_rate": 4.6105571067677175e-05,
1585
+ "loss": 4.884,
1586
+ "step": 112000
1587
+ },
1588
+ {
1589
+ "epoch": 22.437175907459114,
1590
+ "grad_norm": 2.8463797569274902,
1591
+ "learning_rate": 4.5939702167264994e-05,
1592
+ "loss": 4.9167,
1593
+ "step": 112500
1594
+ },
1595
+ {
1596
+ "epoch": 22.536896689270044,
1597
+ "grad_norm": 2.765068531036377,
1598
+ "learning_rate": 4.5773500864246776e-05,
1599
+ "loss": 4.8929,
1600
+ "step": 113000
1601
+ },
1602
+ {
1603
+ "epoch": 22.636617471080974,
1604
+ "grad_norm": 2.7801401615142822,
1605
+ "learning_rate": 4.5607299561228564e-05,
1606
+ "loss": 4.8934,
1607
+ "step": 113500
1608
+ },
1609
+ {
1610
+ "epoch": 22.7363382528919,
1611
+ "grad_norm": 2.883640766143799,
1612
+ "learning_rate": 4.5441098258210346e-05,
1613
+ "loss": 4.8918,
1614
+ "step": 114000
1615
+ },
1616
+ {
1617
+ "epoch": 22.83605903470283,
1618
+ "grad_norm": 3.0070436000823975,
1619
+ "learning_rate": 4.5275229357798165e-05,
1620
+ "loss": 4.894,
1621
+ "step": 114500
1622
+ },
1623
+ {
1624
+ "epoch": 22.93577981651376,
1625
+ "grad_norm": 3.1484322547912598,
1626
+ "learning_rate": 4.510902805477995e-05,
1627
+ "loss": 4.8752,
1628
+ "step": 115000
1629
+ },
1630
+ {
1631
+ "epoch": 23.035500598324692,
1632
+ "grad_norm": 3.016380786895752,
1633
+ "learning_rate": 4.4942826751761735e-05,
1634
+ "loss": 4.8612,
1635
+ "step": 115500
1636
+ },
1637
+ {
1638
+ "epoch": 23.13522138013562,
1639
+ "grad_norm": 3.0375137329101562,
1640
+ "learning_rate": 4.477662544874352e-05,
1641
+ "loss": 4.8517,
1642
+ "step": 116000
1643
+ },
1644
+ {
1645
+ "epoch": 23.23494216194655,
1646
+ "grad_norm": 2.926248073577881,
1647
+ "learning_rate": 4.461075654833134e-05,
1648
+ "loss": 4.834,
1649
+ "step": 116500
1650
+ },
1651
+ {
1652
+ "epoch": 23.33466294375748,
1653
+ "grad_norm": 2.898101806640625,
1654
+ "learning_rate": 4.444455524531312e-05,
1655
+ "loss": 4.8456,
1656
+ "step": 117000
1657
+ },
1658
+ {
1659
+ "epoch": 23.43438372556841,
1660
+ "grad_norm": 2.9906890392303467,
1661
+ "learning_rate": 4.427835394229491e-05,
1662
+ "loss": 4.8431,
1663
+ "step": 117500
1664
+ },
1665
+ {
1666
+ "epoch": 23.53410450737934,
1667
+ "grad_norm": 2.9021828174591064,
1668
+ "learning_rate": 4.4112152639276693e-05,
1669
+ "loss": 4.8362,
1670
+ "step": 118000
1671
+ },
1672
+ {
1673
+ "epoch": 23.633825289190266,
1674
+ "grad_norm": 2.9854063987731934,
1675
+ "learning_rate": 4.394628373886451e-05,
1676
+ "loss": 4.8508,
1677
+ "step": 118500
1678
+ },
1679
+ {
1680
+ "epoch": 23.733546071001197,
1681
+ "grad_norm": 2.959423780441284,
1682
+ "learning_rate": 4.37800824358463e-05,
1683
+ "loss": 4.8395,
1684
+ "step": 119000
1685
+ },
1686
+ {
1687
+ "epoch": 23.833266852812127,
1688
+ "grad_norm": 3.267308235168457,
1689
+ "learning_rate": 4.361388113282808e-05,
1690
+ "loss": 4.8467,
1691
+ "step": 119500
1692
+ },
1693
+ {
1694
+ "epoch": 23.932987634623057,
1695
+ "grad_norm": 2.9600274562835693,
1696
+ "learning_rate": 4.344767982980987e-05,
1697
+ "loss": 4.8405,
1698
+ "step": 120000
1699
+ },
1700
+ {
1701
+ "epoch": 24.032708416433984,
1702
+ "grad_norm": 3.0417428016662598,
1703
+ "learning_rate": 4.328181092939769e-05,
1704
+ "loss": 4.8078,
1705
+ "step": 120500
1706
+ },
1707
+ {
1708
+ "epoch": 24.132429198244914,
1709
+ "grad_norm": 3.029172897338867,
1710
+ "learning_rate": 4.311560962637947e-05,
1711
+ "loss": 4.8216,
1712
+ "step": 121000
1713
+ },
1714
+ {
1715
+ "epoch": 24.232149980055844,
1716
+ "grad_norm": 2.846696376800537,
1717
+ "learning_rate": 4.294940832336126e-05,
1718
+ "loss": 4.814,
1719
+ "step": 121500
1720
+ },
1721
+ {
1722
+ "epoch": 24.331870761866774,
1723
+ "grad_norm": 3.2993550300598145,
1724
+ "learning_rate": 4.278320702034304e-05,
1725
+ "loss": 4.7863,
1726
+ "step": 122000
1727
+ },
1728
+ {
1729
+ "epoch": 24.4315915436777,
1730
+ "grad_norm": 3.039426803588867,
1731
+ "learning_rate": 4.261733811993086e-05,
1732
+ "loss": 4.7851,
1733
+ "step": 122500
1734
+ },
1735
+ {
1736
+ "epoch": 24.53131232548863,
1737
+ "grad_norm": 2.8034543991088867,
1738
+ "learning_rate": 4.245113681691265e-05,
1739
+ "loss": 4.7997,
1740
+ "step": 123000
1741
+ },
1742
+ {
1743
+ "epoch": 24.63103310729956,
1744
+ "grad_norm": 3.0070390701293945,
1745
+ "learning_rate": 4.228493551389443e-05,
1746
+ "loss": 4.8009,
1747
+ "step": 123500
1748
+ },
1749
+ {
1750
+ "epoch": 24.730753889110492,
1751
+ "grad_norm": 2.9534358978271484,
1752
+ "learning_rate": 4.211873421087622e-05,
1753
+ "loss": 4.8079,
1754
+ "step": 124000
1755
+ },
1756
+ {
1757
+ "epoch": 24.83047467092142,
1758
+ "grad_norm": 3.184213638305664,
1759
+ "learning_rate": 4.195286531046404e-05,
1760
+ "loss": 4.7826,
1761
+ "step": 124500
1762
+ },
1763
+ {
1764
+ "epoch": 24.93019545273235,
1765
+ "grad_norm": 2.946760416030884,
1766
+ "learning_rate": 4.178666400744582e-05,
1767
+ "loss": 4.7941,
1768
+ "step": 125000
1769
+ },
1770
+ {
1771
+ "epoch": 25.02991623454328,
1772
+ "grad_norm": 2.929389238357544,
1773
+ "learning_rate": 4.162046270442761e-05,
1774
+ "loss": 4.7783,
1775
+ "step": 125500
1776
+ },
1777
+ {
1778
+ "epoch": 25.12963701635421,
1779
+ "grad_norm": 2.9876906871795654,
1780
+ "learning_rate": 4.145426140140939e-05,
1781
+ "loss": 4.7433,
1782
+ "step": 126000
1783
+ },
1784
+ {
1785
+ "epoch": 25.229357798165136,
1786
+ "grad_norm": 2.9121735095977783,
1787
+ "learning_rate": 4.128839250099721e-05,
1788
+ "loss": 4.7545,
1789
+ "step": 126500
1790
+ },
1791
+ {
1792
+ "epoch": 25.329078579976066,
1793
+ "grad_norm": 2.848165273666382,
1794
+ "learning_rate": 4.1122191197978996e-05,
1795
+ "loss": 4.7756,
1796
+ "step": 127000
1797
+ },
1798
+ {
1799
+ "epoch": 25.428799361786997,
1800
+ "grad_norm": 2.955857515335083,
1801
+ "learning_rate": 4.095598989496078e-05,
1802
+ "loss": 4.7686,
1803
+ "step": 127500
1804
+ },
1805
+ {
1806
+ "epoch": 25.528520143597927,
1807
+ "grad_norm": 3.084696054458618,
1808
+ "learning_rate": 4.0789788591942566e-05,
1809
+ "loss": 4.756,
1810
+ "step": 128000
1811
+ },
1812
+ {
1813
+ "epoch": 25.628240925408853,
1814
+ "grad_norm": 2.993539571762085,
1815
+ "learning_rate": 4.0623919691530384e-05,
1816
+ "loss": 4.7699,
1817
+ "step": 128500
1818
+ },
1819
+ {
1820
+ "epoch": 25.727961707219784,
1821
+ "grad_norm": 3.0663325786590576,
1822
+ "learning_rate": 4.0457718388512166e-05,
1823
+ "loss": 4.7594,
1824
+ "step": 129000
1825
+ },
1826
+ {
1827
+ "epoch": 25.827682489030714,
1828
+ "grad_norm": 3.0915310382843018,
1829
+ "learning_rate": 4.0291517085493954e-05,
1830
+ "loss": 4.768,
1831
+ "step": 129500
1832
+ },
1833
+ {
1834
+ "epoch": 25.927403270841644,
1835
+ "grad_norm": 2.790329933166504,
1836
+ "learning_rate": 4.0125315782475736e-05,
1837
+ "loss": 4.7519,
1838
+ "step": 130000
1839
+ },
1840
+ {
1841
+ "epoch": 26.027124052652574,
1842
+ "grad_norm": 3.1589112281799316,
1843
+ "learning_rate": 3.9959446882063555e-05,
1844
+ "loss": 4.7383,
1845
+ "step": 130500
1846
+ },
1847
+ {
1848
+ "epoch": 26.1268448344635,
1849
+ "grad_norm": 2.9991183280944824,
1850
+ "learning_rate": 3.979324557904534e-05,
1851
+ "loss": 4.7297,
1852
+ "step": 131000
1853
+ },
1854
+ {
1855
+ "epoch": 26.22656561627443,
1856
+ "grad_norm": 2.959322452545166,
1857
+ "learning_rate": 3.9627044276027125e-05,
1858
+ "loss": 4.723,
1859
+ "step": 131500
1860
+ },
1861
+ {
1862
+ "epoch": 26.32628639808536,
1863
+ "grad_norm": 2.9168314933776855,
1864
+ "learning_rate": 3.946084297300891e-05,
1865
+ "loss": 4.7143,
1866
+ "step": 132000
1867
+ },
1868
+ {
1869
+ "epoch": 26.426007179896292,
1870
+ "grad_norm": 2.9729034900665283,
1871
+ "learning_rate": 3.929497407259673e-05,
1872
+ "loss": 4.7176,
1873
+ "step": 132500
1874
+ },
1875
+ {
1876
+ "epoch": 26.52572796170722,
1877
+ "grad_norm": 2.762373685836792,
1878
+ "learning_rate": 3.9128772769578514e-05,
1879
+ "loss": 4.7353,
1880
+ "step": 133000
1881
+ },
1882
+ {
1883
+ "epoch": 26.62544874351815,
1884
+ "grad_norm": 3.2931153774261475,
1885
+ "learning_rate": 3.89625714665603e-05,
1886
+ "loss": 4.7069,
1887
+ "step": 133500
1888
+ },
1889
+ {
1890
+ "epoch": 26.72516952532908,
1891
+ "grad_norm": 3.129920482635498,
1892
+ "learning_rate": 3.8796370163542084e-05,
1893
+ "loss": 4.7253,
1894
+ "step": 134000
1895
+ },
1896
+ {
1897
+ "epoch": 26.82489030714001,
1898
+ "grad_norm": 3.0690855979919434,
1899
+ "learning_rate": 3.86305012631299e-05,
1900
+ "loss": 4.744,
1901
+ "step": 134500
1902
+ },
1903
+ {
1904
+ "epoch": 26.924611088950936,
1905
+ "grad_norm": 2.957228183746338,
1906
+ "learning_rate": 3.846429996011169e-05,
1907
+ "loss": 4.7287,
1908
+ "step": 135000
1909
+ },
1910
+ {
1911
+ "epoch": 27.024331870761866,
1912
+ "grad_norm": 2.922133445739746,
1913
+ "learning_rate": 3.829809865709347e-05,
1914
+ "loss": 4.7231,
1915
+ "step": 135500
1916
+ },
1917
+ {
1918
+ "epoch": 27.124052652572797,
1919
+ "grad_norm": 3.0305354595184326,
1920
+ "learning_rate": 3.813189735407526e-05,
1921
+ "loss": 4.6755,
1922
+ "step": 136000
1923
+ },
1924
+ {
1925
+ "epoch": 27.223773434383727,
1926
+ "grad_norm": 2.9898860454559326,
1927
+ "learning_rate": 3.796602845366308e-05,
1928
+ "loss": 4.6737,
1929
+ "step": 136500
1930
+ },
1931
+ {
1932
+ "epoch": 27.323494216194653,
1933
+ "grad_norm": 3.0518152713775635,
1934
+ "learning_rate": 3.779982715064486e-05,
1935
+ "loss": 4.7161,
1936
+ "step": 137000
1937
+ },
1938
+ {
1939
+ "epoch": 27.423214998005584,
1940
+ "grad_norm": 3.14530086517334,
1941
+ "learning_rate": 3.763362584762665e-05,
1942
+ "loss": 4.6827,
1943
+ "step": 137500
1944
+ },
1945
+ {
1946
+ "epoch": 27.522935779816514,
1947
+ "grad_norm": 2.9844906330108643,
1948
+ "learning_rate": 3.746742454460843e-05,
1949
+ "loss": 4.7015,
1950
+ "step": 138000
1951
+ },
1952
+ {
1953
+ "epoch": 27.622656561627444,
1954
+ "grad_norm": 3.1187822818756104,
1955
+ "learning_rate": 3.730155564419625e-05,
1956
+ "loss": 4.6907,
1957
+ "step": 138500
1958
+ },
1959
+ {
1960
+ "epoch": 27.72237734343837,
1961
+ "grad_norm": 3.1447060108184814,
1962
+ "learning_rate": 3.713535434117804e-05,
1963
+ "loss": 4.6979,
1964
+ "step": 139000
1965
+ },
1966
+ {
1967
+ "epoch": 27.8220981252493,
1968
+ "grad_norm": 3.2830941677093506,
1969
+ "learning_rate": 3.696915303815982e-05,
1970
+ "loss": 4.7023,
1971
+ "step": 139500
1972
+ },
1973
+ {
1974
+ "epoch": 27.92181890706023,
1975
+ "grad_norm": 2.969634532928467,
1976
+ "learning_rate": 3.680295173514161e-05,
1977
+ "loss": 4.7053,
1978
+ "step": 140000
1979
+ },
1980
+ {
1981
+ "epoch": 28.02153968887116,
1982
+ "grad_norm": 3.082902431488037,
1983
+ "learning_rate": 3.663708283472943e-05,
1984
+ "loss": 4.6914,
1985
+ "step": 140500
1986
+ },
1987
+ {
1988
+ "epoch": 28.121260470682092,
1989
+ "grad_norm": 3.165813446044922,
1990
+ "learning_rate": 3.647088153171121e-05,
1991
+ "loss": 4.6726,
1992
+ "step": 141000
1993
+ },
1994
+ {
1995
+ "epoch": 28.22098125249302,
1996
+ "grad_norm": 3.1427435874938965,
1997
+ "learning_rate": 3.6304680228693e-05,
1998
+ "loss": 4.6586,
1999
+ "step": 141500
2000
+ },
2001
+ {
2002
+ "epoch": 28.32070203430395,
2003
+ "grad_norm": 3.179264545440674,
2004
+ "learning_rate": 3.613847892567478e-05,
2005
+ "loss": 4.6576,
2006
+ "step": 142000
2007
+ },
2008
+ {
2009
+ "epoch": 28.42042281611488,
2010
+ "grad_norm": 3.1044764518737793,
2011
+ "learning_rate": 3.59726100252626e-05,
2012
+ "loss": 4.6713,
2013
+ "step": 142500
2014
+ },
2015
+ {
2016
+ "epoch": 28.52014359792581,
2017
+ "grad_norm": 3.049412488937378,
2018
+ "learning_rate": 3.5806408722244386e-05,
2019
+ "loss": 4.6702,
2020
+ "step": 143000
2021
+ },
2022
+ {
2023
+ "epoch": 28.619864379736736,
2024
+ "grad_norm": 3.128653049468994,
2025
+ "learning_rate": 3.564020741922617e-05,
2026
+ "loss": 4.6872,
2027
+ "step": 143500
2028
+ },
2029
+ {
2030
+ "epoch": 28.719585161547666,
2031
+ "grad_norm": 3.13429856300354,
2032
+ "learning_rate": 3.5474006116207956e-05,
2033
+ "loss": 4.6487,
2034
+ "step": 144000
2035
+ },
2036
+ {
2037
+ "epoch": 28.819305943358597,
2038
+ "grad_norm": 3.0185248851776123,
2039
+ "learning_rate": 3.530780481318974e-05,
2040
+ "loss": 4.68,
2041
+ "step": 144500
2042
+ },
2043
+ {
2044
+ "epoch": 28.919026725169527,
2045
+ "grad_norm": 2.990931749343872,
2046
+ "learning_rate": 3.5141935912777556e-05,
2047
+ "loss": 4.669,
2048
+ "step": 145000
2049
+ },
2050
+ {
2051
+ "epoch": 29.018747506980453,
2052
+ "grad_norm": 2.9707412719726562,
2053
+ "learning_rate": 3.4975734609759345e-05,
2054
+ "loss": 4.663,
2055
+ "step": 145500
2056
+ },
2057
+ {
2058
+ "epoch": 29.118468288791384,
2059
+ "grad_norm": 3.247962713241577,
2060
+ "learning_rate": 3.4809533306741126e-05,
2061
+ "loss": 4.6391,
2062
+ "step": 146000
2063
+ },
2064
+ {
2065
+ "epoch": 29.218189070602314,
2066
+ "grad_norm": 3.135483503341675,
2067
+ "learning_rate": 3.4643332003722915e-05,
2068
+ "loss": 4.6368,
2069
+ "step": 146500
2070
+ },
2071
+ {
2072
+ "epoch": 29.317909852413244,
2073
+ "grad_norm": 3.4479868412017822,
2074
+ "learning_rate": 3.4477463103310734e-05,
2075
+ "loss": 4.6437,
2076
+ "step": 147000
2077
+ },
2078
+ {
2079
+ "epoch": 29.41763063422417,
2080
+ "grad_norm": 3.3987677097320557,
2081
+ "learning_rate": 3.4311261800292515e-05,
2082
+ "loss": 4.6635,
2083
+ "step": 147500
2084
+ },
2085
+ {
2086
+ "epoch": 29.5173514160351,
2087
+ "grad_norm": 3.153754234313965,
2088
+ "learning_rate": 3.4145060497274304e-05,
2089
+ "loss": 4.6128,
2090
+ "step": 148000
2091
+ },
2092
+ {
2093
+ "epoch": 29.61707219784603,
2094
+ "grad_norm": 3.29654860496521,
2095
+ "learning_rate": 3.3978859194256085e-05,
2096
+ "loss": 4.664,
2097
+ "step": 148500
2098
+ },
2099
+ {
2100
+ "epoch": 29.71679297965696,
2101
+ "grad_norm": 3.0110297203063965,
2102
+ "learning_rate": 3.3812990293843904e-05,
2103
+ "loss": 4.6438,
2104
+ "step": 149000
2105
+ },
2106
+ {
2107
+ "epoch": 29.81651376146789,
2108
+ "grad_norm": 3.0456008911132812,
2109
+ "learning_rate": 3.364678899082569e-05,
2110
+ "loss": 4.6476,
2111
+ "step": 149500
2112
+ },
2113
+ {
2114
+ "epoch": 29.91623454327882,
2115
+ "grad_norm": 3.3188984394073486,
2116
+ "learning_rate": 3.3480587687807474e-05,
2117
+ "loss": 4.6508,
2118
+ "step": 150000
2119
+ },
2120
+ {
2121
+ "epoch": 30.0,
2122
+ "step": 150420,
2123
+ "total_flos": 733383917568000.0,
2124
+ "train_loss": 1.6002090492649228,
2125
+ "train_runtime": 2253.0761,
2126
+ "train_samples_per_second": 1068.126,
2127
+ "train_steps_per_second": 66.762
2128
  }
2129
  ],
2130
  "logging_steps": 500,
2131
+ "max_steps": 150420,
2132
  "num_input_tokens_seen": 0,
2133
+ "num_train_epochs": 30,
2134
  "save_steps": 500,
2135
  "stateful_callbacks": {
2136
  "TrainerControl": {
 
2144
  "attributes": {}
2145
  }
2146
  },
2147
+ "total_flos": 733383917568000.0,
2148
  "train_batch_size": 16,
2149
  "trial_name": null,
2150
  "trial_params": null