ssunggun2 commited on
Commit
2837dd7
1 Parent(s): 0a30b8a

Initial commit of fine-tuned model

Browse files
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:473f0d4a80ac0df61e01088932e49cb6824225d5d4bf3f360908abdfb9a580f3
3
  size 54543184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4906ed3de48fb0de630d14e495577b07cec0878941b9c89aa8081b1d906b340
3
  size 54543184
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bfe44e4195868e44c40aacc562681b1950bdc51cc1f36a646ad67202229b307f
3
  size 109130618
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2e45fb0c3cffb55cff93b8eab278a57f60af5f661a94d98aca9a63030091bee
3
  size 109130618
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc2acf47f68caa7d18b212606808dac263cf20e72e1bf6e3dd9688c46d8616b4
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c74c344198d9960751843e4c11fd2221f86d11dba060b029f4f7201c81ce036e
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d03f8df28661f098b1ac9b3cb4165605c8f0f87b62ef0eff1506af174fbbc387
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe9f6904a7003fa25bc1c72394e6d3f620e7e1016a868c0c50ad1d7ebd7f9390
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 2.096661329269409,
3
- "best_model_checkpoint": "/data/sunggeunan/ICL/src/outputs/Meta-Llama-3-8B-Instruct_qa_ft_QA_mrqa_nq_SQuAD_3shot_1docs/checkpoint-385",
4
- "epoch": 4.0,
5
  "eval_steps": 100,
6
- "global_step": 385,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2734,6 +2734,686 @@
2734
  "eval_samples_per_second": 2.703,
2735
  "eval_steps_per_second": 0.432,
2736
  "step": 385
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2737
  }
2738
  ],
2739
  "logging_steps": 1,
@@ -2753,7 +3433,7 @@
2753
  "attributes": {}
2754
  }
2755
  },
2756
- "total_flos": 4.55214242753151e+18,
2757
  "train_batch_size": 8,
2758
  "trial_name": null,
2759
  "trial_params": null
 
1
  {
2
+ "best_metric": 2.0890417098999023,
3
+ "best_model_checkpoint": "/data/sunggeunan/ICL/src/outputs/Meta-Llama-3-8B-Instruct_qa_ft_QA_mrqa_nq_SQuAD_3shot_1docs/checkpoint-481",
4
+ "epoch": 4.997402597402598,
5
  "eval_steps": 100,
6
+ "global_step": 481,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2734
  "eval_samples_per_second": 2.703,
2735
  "eval_steps_per_second": 0.432,
2736
  "step": 385
2737
+ },
2738
+ {
2739
+ "epoch": 4.01038961038961,
2740
+ "grad_norm": 0.3841288685798645,
2741
+ "learning_rate": 3.9087719298245615e-07,
2742
+ "loss": 2.1132,
2743
+ "step": 386
2744
+ },
2745
+ {
2746
+ "epoch": 4.020779220779221,
2747
+ "grad_norm": 0.3716009855270386,
2748
+ "learning_rate": 3.9052631578947363e-07,
2749
+ "loss": 2.1147,
2750
+ "step": 387
2751
+ },
2752
+ {
2753
+ "epoch": 4.031168831168831,
2754
+ "grad_norm": 0.37986499071121216,
2755
+ "learning_rate": 3.9017543859649123e-07,
2756
+ "loss": 2.085,
2757
+ "step": 388
2758
+ },
2759
+ {
2760
+ "epoch": 4.041558441558442,
2761
+ "grad_norm": 0.37856024503707886,
2762
+ "learning_rate": 3.8982456140350877e-07,
2763
+ "loss": 2.0922,
2764
+ "step": 389
2765
+ },
2766
+ {
2767
+ "epoch": 4.0519480519480515,
2768
+ "grad_norm": 0.3767644464969635,
2769
+ "learning_rate": 3.894736842105263e-07,
2770
+ "loss": 2.0762,
2771
+ "step": 390
2772
+ },
2773
+ {
2774
+ "epoch": 4.062337662337662,
2775
+ "grad_norm": 0.3725319802761078,
2776
+ "learning_rate": 3.8912280701754385e-07,
2777
+ "loss": 2.0878,
2778
+ "step": 391
2779
+ },
2780
+ {
2781
+ "epoch": 4.072727272727272,
2782
+ "grad_norm": 0.35704779624938965,
2783
+ "learning_rate": 3.8877192982456134e-07,
2784
+ "loss": 2.0943,
2785
+ "step": 392
2786
+ },
2787
+ {
2788
+ "epoch": 4.083116883116883,
2789
+ "grad_norm": 0.3813813626766205,
2790
+ "learning_rate": 3.8842105263157893e-07,
2791
+ "loss": 2.0769,
2792
+ "step": 393
2793
+ },
2794
+ {
2795
+ "epoch": 4.093506493506493,
2796
+ "grad_norm": 0.3729550540447235,
2797
+ "learning_rate": 3.8807017543859647e-07,
2798
+ "loss": 2.0747,
2799
+ "step": 394
2800
+ },
2801
+ {
2802
+ "epoch": 4.103896103896104,
2803
+ "grad_norm": 0.37814322113990784,
2804
+ "learning_rate": 3.87719298245614e-07,
2805
+ "loss": 2.0834,
2806
+ "step": 395
2807
+ },
2808
+ {
2809
+ "epoch": 4.114285714285714,
2810
+ "grad_norm": 0.3847734034061432,
2811
+ "learning_rate": 3.8736842105263155e-07,
2812
+ "loss": 2.0945,
2813
+ "step": 396
2814
+ },
2815
+ {
2816
+ "epoch": 4.124675324675325,
2817
+ "grad_norm": 0.3798567056655884,
2818
+ "learning_rate": 3.8701754385964915e-07,
2819
+ "loss": 2.0756,
2820
+ "step": 397
2821
+ },
2822
+ {
2823
+ "epoch": 4.135064935064935,
2824
+ "grad_norm": 0.3700884282588959,
2825
+ "learning_rate": 3.8666666666666664e-07,
2826
+ "loss": 2.0621,
2827
+ "step": 398
2828
+ },
2829
+ {
2830
+ "epoch": 4.1454545454545455,
2831
+ "grad_norm": 0.36993175745010376,
2832
+ "learning_rate": 3.8631578947368423e-07,
2833
+ "loss": 2.1099,
2834
+ "step": 399
2835
+ },
2836
+ {
2837
+ "epoch": 4.1558441558441555,
2838
+ "grad_norm": 0.380310595035553,
2839
+ "learning_rate": 3.859649122807017e-07,
2840
+ "loss": 2.0981,
2841
+ "step": 400
2842
+ },
2843
+ {
2844
+ "epoch": 4.166233766233766,
2845
+ "grad_norm": 0.38853439688682556,
2846
+ "learning_rate": 3.8561403508771926e-07,
2847
+ "loss": 2.1049,
2848
+ "step": 401
2849
+ },
2850
+ {
2851
+ "epoch": 4.176623376623376,
2852
+ "grad_norm": 0.3776048421859741,
2853
+ "learning_rate": 3.8526315789473685e-07,
2854
+ "loss": 2.0446,
2855
+ "step": 402
2856
+ },
2857
+ {
2858
+ "epoch": 4.187012987012987,
2859
+ "grad_norm": 0.38195568323135376,
2860
+ "learning_rate": 3.8491228070175434e-07,
2861
+ "loss": 2.0813,
2862
+ "step": 403
2863
+ },
2864
+ {
2865
+ "epoch": 4.197402597402597,
2866
+ "grad_norm": 0.373125284910202,
2867
+ "learning_rate": 3.8456140350877193e-07,
2868
+ "loss": 2.0903,
2869
+ "step": 404
2870
+ },
2871
+ {
2872
+ "epoch": 4.207792207792208,
2873
+ "grad_norm": 0.3781803548336029,
2874
+ "learning_rate": 3.842105263157894e-07,
2875
+ "loss": 2.093,
2876
+ "step": 405
2877
+ },
2878
+ {
2879
+ "epoch": 4.218181818181818,
2880
+ "grad_norm": 0.38378608226776123,
2881
+ "learning_rate": 3.83859649122807e-07,
2882
+ "loss": 2.0772,
2883
+ "step": 406
2884
+ },
2885
+ {
2886
+ "epoch": 4.228571428571429,
2887
+ "grad_norm": 0.3815755248069763,
2888
+ "learning_rate": 3.8350877192982455e-07,
2889
+ "loss": 2.0876,
2890
+ "step": 407
2891
+ },
2892
+ {
2893
+ "epoch": 4.238961038961039,
2894
+ "grad_norm": 0.3809583783149719,
2895
+ "learning_rate": 3.831578947368421e-07,
2896
+ "loss": 2.0631,
2897
+ "step": 408
2898
+ },
2899
+ {
2900
+ "epoch": 4.249350649350649,
2901
+ "grad_norm": 0.3809110224246979,
2902
+ "learning_rate": 3.8280701754385964e-07,
2903
+ "loss": 2.1069,
2904
+ "step": 409
2905
+ },
2906
+ {
2907
+ "epoch": 4.259740259740259,
2908
+ "grad_norm": 0.37152138352394104,
2909
+ "learning_rate": 3.824561403508772e-07,
2910
+ "loss": 2.0738,
2911
+ "step": 410
2912
+ },
2913
+ {
2914
+ "epoch": 4.27012987012987,
2915
+ "grad_norm": 0.3761196434497833,
2916
+ "learning_rate": 3.821052631578947e-07,
2917
+ "loss": 2.0994,
2918
+ "step": 411
2919
+ },
2920
+ {
2921
+ "epoch": 4.28051948051948,
2922
+ "grad_norm": 0.39031481742858887,
2923
+ "learning_rate": 3.8175438596491226e-07,
2924
+ "loss": 2.0857,
2925
+ "step": 412
2926
+ },
2927
+ {
2928
+ "epoch": 4.290909090909091,
2929
+ "grad_norm": 0.37237513065338135,
2930
+ "learning_rate": 3.814035087719298e-07,
2931
+ "loss": 2.0844,
2932
+ "step": 413
2933
+ },
2934
+ {
2935
+ "epoch": 4.301298701298701,
2936
+ "grad_norm": 0.38423943519592285,
2937
+ "learning_rate": 3.8105263157894734e-07,
2938
+ "loss": 2.126,
2939
+ "step": 414
2940
+ },
2941
+ {
2942
+ "epoch": 4.311688311688312,
2943
+ "grad_norm": 0.36542361974716187,
2944
+ "learning_rate": 3.8070175438596493e-07,
2945
+ "loss": 2.0705,
2946
+ "step": 415
2947
+ },
2948
+ {
2949
+ "epoch": 4.322077922077922,
2950
+ "grad_norm": 0.36861154437065125,
2951
+ "learning_rate": 3.803508771929824e-07,
2952
+ "loss": 2.0681,
2953
+ "step": 416
2954
+ },
2955
+ {
2956
+ "epoch": 4.332467532467533,
2957
+ "grad_norm": 0.3783316910266876,
2958
+ "learning_rate": 3.7999999999999996e-07,
2959
+ "loss": 2.0777,
2960
+ "step": 417
2961
+ },
2962
+ {
2963
+ "epoch": 4.3428571428571425,
2964
+ "grad_norm": 0.38323143124580383,
2965
+ "learning_rate": 3.7964912280701756e-07,
2966
+ "loss": 2.0952,
2967
+ "step": 418
2968
+ },
2969
+ {
2970
+ "epoch": 4.353246753246753,
2971
+ "grad_norm": 0.3862488269805908,
2972
+ "learning_rate": 3.7929824561403504e-07,
2973
+ "loss": 2.0927,
2974
+ "step": 419
2975
+ },
2976
+ {
2977
+ "epoch": 4.363636363636363,
2978
+ "grad_norm": 0.38100945949554443,
2979
+ "learning_rate": 3.7894736842105264e-07,
2980
+ "loss": 2.1043,
2981
+ "step": 420
2982
+ },
2983
+ {
2984
+ "epoch": 4.374025974025974,
2985
+ "grad_norm": 0.38466402888298035,
2986
+ "learning_rate": 3.785964912280701e-07,
2987
+ "loss": 2.0906,
2988
+ "step": 421
2989
+ },
2990
+ {
2991
+ "epoch": 4.384415584415584,
2992
+ "grad_norm": 0.37953078746795654,
2993
+ "learning_rate": 3.782456140350877e-07,
2994
+ "loss": 2.0706,
2995
+ "step": 422
2996
+ },
2997
+ {
2998
+ "epoch": 4.394805194805195,
2999
+ "grad_norm": 0.3823856711387634,
3000
+ "learning_rate": 3.7789473684210526e-07,
3001
+ "loss": 2.0951,
3002
+ "step": 423
3003
+ },
3004
+ {
3005
+ "epoch": 4.405194805194805,
3006
+ "grad_norm": 0.37538549304008484,
3007
+ "learning_rate": 3.775438596491228e-07,
3008
+ "loss": 2.0771,
3009
+ "step": 424
3010
+ },
3011
+ {
3012
+ "epoch": 4.415584415584416,
3013
+ "grad_norm": 0.3802937865257263,
3014
+ "learning_rate": 3.7719298245614034e-07,
3015
+ "loss": 2.078,
3016
+ "step": 425
3017
+ },
3018
+ {
3019
+ "epoch": 4.425974025974026,
3020
+ "grad_norm": 0.3733079433441162,
3021
+ "learning_rate": 3.7684210526315783e-07,
3022
+ "loss": 2.0799,
3023
+ "step": 426
3024
+ },
3025
+ {
3026
+ "epoch": 4.4363636363636365,
3027
+ "grad_norm": 0.37729039788246155,
3028
+ "learning_rate": 3.764912280701754e-07,
3029
+ "loss": 2.1051,
3030
+ "step": 427
3031
+ },
3032
+ {
3033
+ "epoch": 4.4467532467532465,
3034
+ "grad_norm": 0.3915861248970032,
3035
+ "learning_rate": 3.7614035087719296e-07,
3036
+ "loss": 2.0927,
3037
+ "step": 428
3038
+ },
3039
+ {
3040
+ "epoch": 4.457142857142857,
3041
+ "grad_norm": 0.38771378993988037,
3042
+ "learning_rate": 3.757894736842105e-07,
3043
+ "loss": 2.0989,
3044
+ "step": 429
3045
+ },
3046
+ {
3047
+ "epoch": 4.467532467532467,
3048
+ "grad_norm": 0.3854687213897705,
3049
+ "learning_rate": 3.7543859649122804e-07,
3050
+ "loss": 2.0984,
3051
+ "step": 430
3052
+ },
3053
+ {
3054
+ "epoch": 4.477922077922078,
3055
+ "grad_norm": 0.3793568015098572,
3056
+ "learning_rate": 3.7508771929824564e-07,
3057
+ "loss": 2.0804,
3058
+ "step": 431
3059
+ },
3060
+ {
3061
+ "epoch": 4.488311688311688,
3062
+ "grad_norm": 0.39430853724479675,
3063
+ "learning_rate": 3.747368421052631e-07,
3064
+ "loss": 2.0985,
3065
+ "step": 432
3066
+ },
3067
+ {
3068
+ "epoch": 4.498701298701299,
3069
+ "grad_norm": 0.3847366273403168,
3070
+ "learning_rate": 3.743859649122807e-07,
3071
+ "loss": 2.0849,
3072
+ "step": 433
3073
+ },
3074
+ {
3075
+ "epoch": 4.509090909090909,
3076
+ "grad_norm": 0.374398797750473,
3077
+ "learning_rate": 3.740350877192982e-07,
3078
+ "loss": 2.0847,
3079
+ "step": 434
3080
+ },
3081
+ {
3082
+ "epoch": 4.51948051948052,
3083
+ "grad_norm": 0.4258849620819092,
3084
+ "learning_rate": 3.7368421052631575e-07,
3085
+ "loss": 2.082,
3086
+ "step": 435
3087
+ },
3088
+ {
3089
+ "epoch": 4.52987012987013,
3090
+ "grad_norm": 0.3853350579738617,
3091
+ "learning_rate": 3.7333333333333334e-07,
3092
+ "loss": 2.0832,
3093
+ "step": 436
3094
+ },
3095
+ {
3096
+ "epoch": 4.54025974025974,
3097
+ "grad_norm": 0.38020631670951843,
3098
+ "learning_rate": 3.7298245614035083e-07,
3099
+ "loss": 2.0799,
3100
+ "step": 437
3101
+ },
3102
+ {
3103
+ "epoch": 4.55064935064935,
3104
+ "grad_norm": 0.4022679030895233,
3105
+ "learning_rate": 3.726315789473684e-07,
3106
+ "loss": 2.1038,
3107
+ "step": 438
3108
+ },
3109
+ {
3110
+ "epoch": 4.561038961038961,
3111
+ "grad_norm": 0.37137728929519653,
3112
+ "learning_rate": 3.7228070175438596e-07,
3113
+ "loss": 2.0921,
3114
+ "step": 439
3115
+ },
3116
+ {
3117
+ "epoch": 4.571428571428571,
3118
+ "grad_norm": 0.38251206278800964,
3119
+ "learning_rate": 3.719298245614035e-07,
3120
+ "loss": 2.0878,
3121
+ "step": 440
3122
+ },
3123
+ {
3124
+ "epoch": 4.581818181818182,
3125
+ "grad_norm": 0.39200717210769653,
3126
+ "learning_rate": 3.7157894736842104e-07,
3127
+ "loss": 2.1202,
3128
+ "step": 441
3129
+ },
3130
+ {
3131
+ "epoch": 4.592207792207792,
3132
+ "grad_norm": 0.3731335699558258,
3133
+ "learning_rate": 3.7122807017543853e-07,
3134
+ "loss": 2.0737,
3135
+ "step": 442
3136
+ },
3137
+ {
3138
+ "epoch": 4.602597402597403,
3139
+ "grad_norm": 0.38276833295822144,
3140
+ "learning_rate": 3.708771929824561e-07,
3141
+ "loss": 2.0521,
3142
+ "step": 443
3143
+ },
3144
+ {
3145
+ "epoch": 4.612987012987013,
3146
+ "grad_norm": 0.38775137066841125,
3147
+ "learning_rate": 3.7052631578947367e-07,
3148
+ "loss": 2.0772,
3149
+ "step": 444
3150
+ },
3151
+ {
3152
+ "epoch": 4.623376623376624,
3153
+ "grad_norm": 0.3836955428123474,
3154
+ "learning_rate": 3.701754385964912e-07,
3155
+ "loss": 2.0992,
3156
+ "step": 445
3157
+ },
3158
+ {
3159
+ "epoch": 4.6337662337662335,
3160
+ "grad_norm": 0.37715139985084534,
3161
+ "learning_rate": 3.6982456140350875e-07,
3162
+ "loss": 2.0499,
3163
+ "step": 446
3164
+ },
3165
+ {
3166
+ "epoch": 4.644155844155844,
3167
+ "grad_norm": 0.3789008557796478,
3168
+ "learning_rate": 3.6947368421052634e-07,
3169
+ "loss": 2.0531,
3170
+ "step": 447
3171
+ },
3172
+ {
3173
+ "epoch": 4.654545454545454,
3174
+ "grad_norm": 0.3865036964416504,
3175
+ "learning_rate": 3.6912280701754383e-07,
3176
+ "loss": 2.0949,
3177
+ "step": 448
3178
+ },
3179
+ {
3180
+ "epoch": 4.664935064935065,
3181
+ "grad_norm": 0.3880210816860199,
3182
+ "learning_rate": 3.687719298245614e-07,
3183
+ "loss": 2.0871,
3184
+ "step": 449
3185
+ },
3186
+ {
3187
+ "epoch": 4.675324675324675,
3188
+ "grad_norm": 0.3839876353740692,
3189
+ "learning_rate": 3.684210526315789e-07,
3190
+ "loss": 2.0586,
3191
+ "step": 450
3192
+ },
3193
+ {
3194
+ "epoch": 4.685714285714286,
3195
+ "grad_norm": 0.39316463470458984,
3196
+ "learning_rate": 3.6807017543859645e-07,
3197
+ "loss": 2.0736,
3198
+ "step": 451
3199
+ },
3200
+ {
3201
+ "epoch": 4.696103896103896,
3202
+ "grad_norm": 0.37328803539276123,
3203
+ "learning_rate": 3.6771929824561405e-07,
3204
+ "loss": 2.084,
3205
+ "step": 452
3206
+ },
3207
+ {
3208
+ "epoch": 4.706493506493507,
3209
+ "grad_norm": 0.3884430527687073,
3210
+ "learning_rate": 3.6736842105263153e-07,
3211
+ "loss": 2.0788,
3212
+ "step": 453
3213
+ },
3214
+ {
3215
+ "epoch": 4.716883116883117,
3216
+ "grad_norm": 0.385623574256897,
3217
+ "learning_rate": 3.6701754385964913e-07,
3218
+ "loss": 2.0705,
3219
+ "step": 454
3220
+ },
3221
+ {
3222
+ "epoch": 4.7272727272727275,
3223
+ "grad_norm": 0.38950812816619873,
3224
+ "learning_rate": 3.666666666666666e-07,
3225
+ "loss": 2.0785,
3226
+ "step": 455
3227
+ },
3228
+ {
3229
+ "epoch": 4.7376623376623375,
3230
+ "grad_norm": 0.38535040616989136,
3231
+ "learning_rate": 3.663157894736842e-07,
3232
+ "loss": 2.0909,
3233
+ "step": 456
3234
+ },
3235
+ {
3236
+ "epoch": 4.748051948051948,
3237
+ "grad_norm": 0.3869593143463135,
3238
+ "learning_rate": 3.6596491228070175e-07,
3239
+ "loss": 2.0801,
3240
+ "step": 457
3241
+ },
3242
+ {
3243
+ "epoch": 4.758441558441558,
3244
+ "grad_norm": 0.39084428548812866,
3245
+ "learning_rate": 3.656140350877193e-07,
3246
+ "loss": 2.096,
3247
+ "step": 458
3248
+ },
3249
+ {
3250
+ "epoch": 4.768831168831169,
3251
+ "grad_norm": 0.3794546127319336,
3252
+ "learning_rate": 3.6526315789473683e-07,
3253
+ "loss": 2.0527,
3254
+ "step": 459
3255
+ },
3256
+ {
3257
+ "epoch": 4.779220779220779,
3258
+ "grad_norm": 0.3870809078216553,
3259
+ "learning_rate": 3.6491228070175437e-07,
3260
+ "loss": 2.0853,
3261
+ "step": 460
3262
+ },
3263
+ {
3264
+ "epoch": 4.78961038961039,
3265
+ "grad_norm": 0.38205036520957947,
3266
+ "learning_rate": 3.645614035087719e-07,
3267
+ "loss": 2.0643,
3268
+ "step": 461
3269
+ },
3270
+ {
3271
+ "epoch": 4.8,
3272
+ "grad_norm": 0.3907061815261841,
3273
+ "learning_rate": 3.6421052631578945e-07,
3274
+ "loss": 2.0786,
3275
+ "step": 462
3276
+ },
3277
+ {
3278
+ "epoch": 4.810389610389611,
3279
+ "grad_norm": 0.39493080973625183,
3280
+ "learning_rate": 3.63859649122807e-07,
3281
+ "loss": 2.0944,
3282
+ "step": 463
3283
+ },
3284
+ {
3285
+ "epoch": 4.820779220779221,
3286
+ "grad_norm": 0.3930380046367645,
3287
+ "learning_rate": 3.6350877192982453e-07,
3288
+ "loss": 2.1138,
3289
+ "step": 464
3290
+ },
3291
+ {
3292
+ "epoch": 4.8311688311688314,
3293
+ "grad_norm": 0.3952060639858246,
3294
+ "learning_rate": 3.6315789473684213e-07,
3295
+ "loss": 2.0802,
3296
+ "step": 465
3297
+ },
3298
+ {
3299
+ "epoch": 4.841558441558441,
3300
+ "grad_norm": 0.3815995752811432,
3301
+ "learning_rate": 3.628070175438596e-07,
3302
+ "loss": 2.0838,
3303
+ "step": 466
3304
+ },
3305
+ {
3306
+ "epoch": 4.851948051948052,
3307
+ "grad_norm": 0.38858020305633545,
3308
+ "learning_rate": 3.6245614035087716e-07,
3309
+ "loss": 2.0804,
3310
+ "step": 467
3311
+ },
3312
+ {
3313
+ "epoch": 4.862337662337662,
3314
+ "grad_norm": 0.385565847158432,
3315
+ "learning_rate": 3.6210526315789475e-07,
3316
+ "loss": 2.0974,
3317
+ "step": 468
3318
+ },
3319
+ {
3320
+ "epoch": 4.872727272727273,
3321
+ "grad_norm": 0.3909178078174591,
3322
+ "learning_rate": 3.6175438596491224e-07,
3323
+ "loss": 2.0887,
3324
+ "step": 469
3325
+ },
3326
+ {
3327
+ "epoch": 4.883116883116883,
3328
+ "grad_norm": 0.3982325792312622,
3329
+ "learning_rate": 3.6140350877192983e-07,
3330
+ "loss": 2.1054,
3331
+ "step": 470
3332
+ },
3333
+ {
3334
+ "epoch": 4.893506493506494,
3335
+ "grad_norm": 0.3876339793205261,
3336
+ "learning_rate": 3.610526315789473e-07,
3337
+ "loss": 2.1054,
3338
+ "step": 471
3339
+ },
3340
+ {
3341
+ "epoch": 4.903896103896104,
3342
+ "grad_norm": 0.3819069266319275,
3343
+ "learning_rate": 3.607017543859649e-07,
3344
+ "loss": 2.0821,
3345
+ "step": 472
3346
+ },
3347
+ {
3348
+ "epoch": 4.914285714285715,
3349
+ "grad_norm": 0.3924694359302521,
3350
+ "learning_rate": 3.6035087719298245e-07,
3351
+ "loss": 2.0712,
3352
+ "step": 473
3353
+ },
3354
+ {
3355
+ "epoch": 4.9246753246753245,
3356
+ "grad_norm": 0.3937675654888153,
3357
+ "learning_rate": 3.6e-07,
3358
+ "loss": 2.1057,
3359
+ "step": 474
3360
+ },
3361
+ {
3362
+ "epoch": 4.935064935064935,
3363
+ "grad_norm": 0.38620275259017944,
3364
+ "learning_rate": 3.5964912280701754e-07,
3365
+ "loss": 2.0845,
3366
+ "step": 475
3367
+ },
3368
+ {
3369
+ "epoch": 4.945454545454545,
3370
+ "grad_norm": 0.40442150831222534,
3371
+ "learning_rate": 3.59298245614035e-07,
3372
+ "loss": 2.0936,
3373
+ "step": 476
3374
+ },
3375
+ {
3376
+ "epoch": 4.955844155844156,
3377
+ "grad_norm": 0.3815317153930664,
3378
+ "learning_rate": 3.589473684210526e-07,
3379
+ "loss": 2.0845,
3380
+ "step": 477
3381
+ },
3382
+ {
3383
+ "epoch": 4.966233766233766,
3384
+ "grad_norm": 0.38584476709365845,
3385
+ "learning_rate": 3.5859649122807016e-07,
3386
+ "loss": 2.071,
3387
+ "step": 478
3388
+ },
3389
+ {
3390
+ "epoch": 4.976623376623377,
3391
+ "grad_norm": 0.3887505829334259,
3392
+ "learning_rate": 3.582456140350877e-07,
3393
+ "loss": 2.0924,
3394
+ "step": 479
3395
+ },
3396
+ {
3397
+ "epoch": 4.987012987012987,
3398
+ "grad_norm": 0.3836219012737274,
3399
+ "learning_rate": 3.5789473684210524e-07,
3400
+ "loss": 2.0986,
3401
+ "step": 480
3402
+ },
3403
+ {
3404
+ "epoch": 4.997402597402598,
3405
+ "grad_norm": 0.38430362939834595,
3406
+ "learning_rate": 3.5754385964912283e-07,
3407
+ "loss": 2.1006,
3408
+ "step": 481
3409
+ },
3410
+ {
3411
+ "epoch": 4.997402597402598,
3412
+ "eval_loss": 2.0890417098999023,
3413
+ "eval_runtime": 9.2277,
3414
+ "eval_samples_per_second": 2.709,
3415
+ "eval_steps_per_second": 0.433,
3416
+ "step": 481
3417
  }
3418
  ],
3419
  "logging_steps": 1,
 
3433
  "attributes": {}
3434
  }
3435
  },
3436
+ "total_flos": 5.690178034414387e+18,
3437
  "train_batch_size": 8,
3438
  "trial_name": null,
3439
  "trial_params": null