Upload 11 files
Browse files- model.safetensors +1 -1
- optimizer.pt +1 -1
- rng_state.pth +1 -1
- scheduler.pt +1 -1
- trainer_state.json +279 -3
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 133466376
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:65e67105c48220d2074098ec1e37f0d61b97375cc55304f903567d3695295370
|
3 |
size 133466376
|
optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 267054330
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0714840fa9604d2fc8f7ea5d59a366c8a93781aa1d7c495a9de1474c626848b4
|
3 |
size 267054330
|
rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 14244
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:38f537364220577dd4e5269dcefc13c34f9b3778f6daf6768a70fafb5a921478
|
3 |
size 14244
|
scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5da051eb48121dd050f992d01ba9f253f3d6fbc88a7c642a234fae20f61e7eb3
|
3 |
size 1064
|
trainer_state.json
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
-
"epoch":
|
5 |
"eval_steps": 10000,
|
6 |
-
"global_step":
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
@@ -637,6 +637,282 @@
|
|
637 |
"learning_rate": 6.131178557351203e-06,
|
638 |
"loss": 0.0135,
|
639 |
"step": 88000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
640 |
}
|
641 |
],
|
642 |
"logging_steps": 1000,
|
@@ -644,7 +920,7 @@
|
|
644 |
"num_input_tokens_seen": 0,
|
645 |
"num_train_epochs": 50,
|
646 |
"save_steps": 500,
|
647 |
-
"total_flos":
|
648 |
"train_batch_size": 8,
|
649 |
"trial_name": null,
|
650 |
"trial_params": null
|
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
+
"epoch": 50.0,
|
5 |
"eval_steps": 10000,
|
6 |
+
"global_step": 126850,
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
|
|
637 |
"learning_rate": 6.131178557351203e-06,
|
638 |
"loss": 0.0135,
|
639 |
"step": 88000
|
640 |
+
},
|
641 |
+
{
|
642 |
+
"epoch": 35.08,
|
643 |
+
"learning_rate": 5.973512022073315e-06,
|
644 |
+
"loss": 0.016,
|
645 |
+
"step": 89000
|
646 |
+
},
|
647 |
+
{
|
648 |
+
"epoch": 35.47,
|
649 |
+
"learning_rate": 5.815845486795429e-06,
|
650 |
+
"loss": 0.0144,
|
651 |
+
"step": 90000
|
652 |
+
},
|
653 |
+
{
|
654 |
+
"epoch": 35.47,
|
655 |
+
"eval_accuracy": 0.996594778660613,
|
656 |
+
"eval_f1": 0.9872340425531915,
|
657 |
+
"eval_loss": 0.009375466965138912,
|
658 |
+
"eval_precision": 0.9914529914529915,
|
659 |
+
"eval_recall": 0.9830508474576272,
|
660 |
+
"eval_runtime": 6.361,
|
661 |
+
"eval_samples_per_second": 276.999,
|
662 |
+
"eval_steps_per_second": 34.743,
|
663 |
+
"step": 90000
|
664 |
+
},
|
665 |
+
{
|
666 |
+
"epoch": 35.87,
|
667 |
+
"learning_rate": 5.658336618052819e-06,
|
668 |
+
"loss": 0.0155,
|
669 |
+
"step": 91000
|
670 |
+
},
|
671 |
+
{
|
672 |
+
"epoch": 36.26,
|
673 |
+
"learning_rate": 5.500670082774931e-06,
|
674 |
+
"loss": 0.0126,
|
675 |
+
"step": 92000
|
676 |
+
},
|
677 |
+
{
|
678 |
+
"epoch": 36.66,
|
679 |
+
"learning_rate": 5.343161214032322e-06,
|
680 |
+
"loss": 0.0094,
|
681 |
+
"step": 93000
|
682 |
+
},
|
683 |
+
{
|
684 |
+
"epoch": 37.05,
|
685 |
+
"learning_rate": 5.185494678754435e-06,
|
686 |
+
"loss": 0.0106,
|
687 |
+
"step": 94000
|
688 |
+
},
|
689 |
+
{
|
690 |
+
"epoch": 37.45,
|
691 |
+
"learning_rate": 5.027828143476547e-06,
|
692 |
+
"loss": 0.0115,
|
693 |
+
"step": 95000
|
694 |
+
},
|
695 |
+
{
|
696 |
+
"epoch": 37.84,
|
697 |
+
"learning_rate": 4.870161608198661e-06,
|
698 |
+
"loss": 0.0087,
|
699 |
+
"step": 96000
|
700 |
+
},
|
701 |
+
{
|
702 |
+
"epoch": 38.23,
|
703 |
+
"learning_rate": 4.712495072920773e-06,
|
704 |
+
"loss": 0.0126,
|
705 |
+
"step": 97000
|
706 |
+
},
|
707 |
+
{
|
708 |
+
"epoch": 38.63,
|
709 |
+
"learning_rate": 4.554986204178163e-06,
|
710 |
+
"loss": 0.009,
|
711 |
+
"step": 98000
|
712 |
+
},
|
713 |
+
{
|
714 |
+
"epoch": 39.02,
|
715 |
+
"learning_rate": 4.397319668900277e-06,
|
716 |
+
"loss": 0.0129,
|
717 |
+
"step": 99000
|
718 |
+
},
|
719 |
+
{
|
720 |
+
"epoch": 39.42,
|
721 |
+
"learning_rate": 4.239653133622389e-06,
|
722 |
+
"loss": 0.009,
|
723 |
+
"step": 100000
|
724 |
+
},
|
725 |
+
{
|
726 |
+
"epoch": 39.42,
|
727 |
+
"eval_accuracy": 0.996594778660613,
|
728 |
+
"eval_f1": 0.9872881355932204,
|
729 |
+
"eval_loss": 0.008661070838570595,
|
730 |
+
"eval_precision": 0.9872881355932204,
|
731 |
+
"eval_recall": 0.9872881355932204,
|
732 |
+
"eval_runtime": 6.0434,
|
733 |
+
"eval_samples_per_second": 291.559,
|
734 |
+
"eval_steps_per_second": 36.569,
|
735 |
+
"step": 100000
|
736 |
+
},
|
737 |
+
{
|
738 |
+
"epoch": 39.81,
|
739 |
+
"learning_rate": 4.081986598344502e-06,
|
740 |
+
"loss": 0.0066,
|
741 |
+
"step": 101000
|
742 |
+
},
|
743 |
+
{
|
744 |
+
"epoch": 40.2,
|
745 |
+
"learning_rate": 3.924477729601892e-06,
|
746 |
+
"loss": 0.0116,
|
747 |
+
"step": 102000
|
748 |
+
},
|
749 |
+
{
|
750 |
+
"epoch": 40.6,
|
751 |
+
"learning_rate": 3.7668111943240047e-06,
|
752 |
+
"loss": 0.0099,
|
753 |
+
"step": 103000
|
754 |
+
},
|
755 |
+
{
|
756 |
+
"epoch": 40.99,
|
757 |
+
"learning_rate": 3.6091446590461178e-06,
|
758 |
+
"loss": 0.0087,
|
759 |
+
"step": 104000
|
760 |
+
},
|
761 |
+
{
|
762 |
+
"epoch": 41.39,
|
763 |
+
"learning_rate": 3.4516357903035086e-06,
|
764 |
+
"loss": 0.0092,
|
765 |
+
"step": 105000
|
766 |
+
},
|
767 |
+
{
|
768 |
+
"epoch": 41.78,
|
769 |
+
"learning_rate": 3.2939692550256207e-06,
|
770 |
+
"loss": 0.0098,
|
771 |
+
"step": 106000
|
772 |
+
},
|
773 |
+
{
|
774 |
+
"epoch": 42.18,
|
775 |
+
"learning_rate": 3.1363027197477337e-06,
|
776 |
+
"loss": 0.0092,
|
777 |
+
"step": 107000
|
778 |
+
},
|
779 |
+
{
|
780 |
+
"epoch": 42.57,
|
781 |
+
"learning_rate": 2.9787938510051245e-06,
|
782 |
+
"loss": 0.0111,
|
783 |
+
"step": 108000
|
784 |
+
},
|
785 |
+
{
|
786 |
+
"epoch": 42.96,
|
787 |
+
"learning_rate": 2.821127315727237e-06,
|
788 |
+
"loss": 0.0077,
|
789 |
+
"step": 109000
|
790 |
+
},
|
791 |
+
{
|
792 |
+
"epoch": 43.36,
|
793 |
+
"learning_rate": 2.66346078044935e-06,
|
794 |
+
"loss": 0.0084,
|
795 |
+
"step": 110000
|
796 |
+
},
|
797 |
+
{
|
798 |
+
"epoch": 43.36,
|
799 |
+
"eval_accuracy": 0.9954597048808173,
|
800 |
+
"eval_f1": 0.9828326180257511,
|
801 |
+
"eval_loss": 0.018768297508358955,
|
802 |
+
"eval_precision": 0.9956521739130435,
|
803 |
+
"eval_recall": 0.9703389830508474,
|
804 |
+
"eval_runtime": 5.9996,
|
805 |
+
"eval_samples_per_second": 293.685,
|
806 |
+
"eval_steps_per_second": 36.836,
|
807 |
+
"step": 110000
|
808 |
+
},
|
809 |
+
{
|
810 |
+
"epoch": 43.75,
|
811 |
+
"learning_rate": 2.5059519117067405e-06,
|
812 |
+
"loss": 0.0086,
|
813 |
+
"step": 111000
|
814 |
+
},
|
815 |
+
{
|
816 |
+
"epoch": 44.15,
|
817 |
+
"learning_rate": 2.348285376428853e-06,
|
818 |
+
"loss": 0.0092,
|
819 |
+
"step": 112000
|
820 |
+
},
|
821 |
+
{
|
822 |
+
"epoch": 44.54,
|
823 |
+
"learning_rate": 2.190618841150966e-06,
|
824 |
+
"loss": 0.0099,
|
825 |
+
"step": 113000
|
826 |
+
},
|
827 |
+
{
|
828 |
+
"epoch": 44.93,
|
829 |
+
"learning_rate": 2.0331099724083565e-06,
|
830 |
+
"loss": 0.0101,
|
831 |
+
"step": 114000
|
832 |
+
},
|
833 |
+
{
|
834 |
+
"epoch": 45.33,
|
835 |
+
"learning_rate": 1.8754434371304691e-06,
|
836 |
+
"loss": 0.0079,
|
837 |
+
"step": 115000
|
838 |
+
},
|
839 |
+
{
|
840 |
+
"epoch": 45.72,
|
841 |
+
"learning_rate": 1.71793456838786e-06,
|
842 |
+
"loss": 0.0102,
|
843 |
+
"step": 116000
|
844 |
+
},
|
845 |
+
{
|
846 |
+
"epoch": 46.12,
|
847 |
+
"learning_rate": 1.5602680331099725e-06,
|
848 |
+
"loss": 0.0086,
|
849 |
+
"step": 117000
|
850 |
+
},
|
851 |
+
{
|
852 |
+
"epoch": 46.51,
|
853 |
+
"learning_rate": 1.4026014978320853e-06,
|
854 |
+
"loss": 0.0097,
|
855 |
+
"step": 118000
|
856 |
+
},
|
857 |
+
{
|
858 |
+
"epoch": 46.91,
|
859 |
+
"learning_rate": 1.2449349625541981e-06,
|
860 |
+
"loss": 0.0048,
|
861 |
+
"step": 119000
|
862 |
+
},
|
863 |
+
{
|
864 |
+
"epoch": 47.3,
|
865 |
+
"learning_rate": 1.0872684272763107e-06,
|
866 |
+
"loss": 0.0112,
|
867 |
+
"step": 120000
|
868 |
+
},
|
869 |
+
{
|
870 |
+
"epoch": 47.3,
|
871 |
+
"eval_accuracy": 0.996594778660613,
|
872 |
+
"eval_f1": 0.9872881355932204,
|
873 |
+
"eval_loss": 0.008344221860170364,
|
874 |
+
"eval_precision": 0.9872881355932204,
|
875 |
+
"eval_recall": 0.9872881355932204,
|
876 |
+
"eval_runtime": 6.0461,
|
877 |
+
"eval_samples_per_second": 291.429,
|
878 |
+
"eval_steps_per_second": 36.553,
|
879 |
+
"step": 120000
|
880 |
+
},
|
881 |
+
{
|
882 |
+
"epoch": 47.69,
|
883 |
+
"learning_rate": 9.297595585337013e-07,
|
884 |
+
"loss": 0.0069,
|
885 |
+
"step": 121000
|
886 |
+
},
|
887 |
+
{
|
888 |
+
"epoch": 48.09,
|
889 |
+
"learning_rate": 7.72250689791092e-07,
|
890 |
+
"loss": 0.0103,
|
891 |
+
"step": 122000
|
892 |
+
},
|
893 |
+
{
|
894 |
+
"epoch": 48.48,
|
895 |
+
"learning_rate": 6.145841545132046e-07,
|
896 |
+
"loss": 0.0079,
|
897 |
+
"step": 123000
|
898 |
+
},
|
899 |
+
{
|
900 |
+
"epoch": 48.88,
|
901 |
+
"learning_rate": 4.5691761923531733e-07,
|
902 |
+
"loss": 0.0061,
|
903 |
+
"step": 124000
|
904 |
+
},
|
905 |
+
{
|
906 |
+
"epoch": 49.27,
|
907 |
+
"learning_rate": 2.994087504927079e-07,
|
908 |
+
"loss": 0.0081,
|
909 |
+
"step": 125000
|
910 |
+
},
|
911 |
+
{
|
912 |
+
"epoch": 49.66,
|
913 |
+
"learning_rate": 1.4174221521482067e-07,
|
914 |
+
"loss": 0.0101,
|
915 |
+
"step": 126000
|
916 |
}
|
917 |
],
|
918 |
"logging_steps": 1000,
|
|
|
920 |
"num_input_tokens_seen": 0,
|
921 |
"num_train_epochs": 50,
|
922 |
"save_steps": 500,
|
923 |
+
"total_flos": 1.0572380196462e+16,
|
924 |
"train_batch_size": 8,
|
925 |
"trial_name": null,
|
926 |
"trial_params": null
|