ccore commited on
Commit
1e37efb
·
verified ·
1 Parent(s): 695236f

Training in progress, epoch 10, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c3c365c66cf2a3d941ce1ea5ad003e4a05b3c41bd56f4e0493053ac8b6dd9350
3
  size 1324830880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:062975bff2d0f7f7c4d6f5b956cc23d7a72551cdc1c569801b7bfec117ef85c4
3
  size 1324830880
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:17e462259220bd7bfe4d650b3ccd1980f9d0cf22b5c8f4def5e9b2a8c5e8719a
3
  size 2649896094
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:449d21a27ff64bfb59a33d659105ad18c8f4a4106d6637584febc647d8988ef6
3
  size 2649896094
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b8622f03e7cb24e2f286863cd91515fad339da4d289ad4c2fbea14ca1b50ee88
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80b5e2320c011d2ef85af0e21d5865f84ed70109b55a905c3969adb000834408
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c1d655edf5c0d59719b2e09e5b118a19566dd1c026c019c1e64e07966eca110e
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52cccb9a6c56995c65476c1bb9d353486d68d2924d3217599f44452b5f6974ca
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 0.30803602933883667,
3
  "best_model_checkpoint": "./opt_trained/checkpoint-258",
4
- "epoch": 9.0,
5
  "eval_steps": 500,
6
- "global_step": 387,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2788,6 +2788,315 @@
2788
  "eval_samples_per_second": 62.191,
2789
  "eval_steps_per_second": 15.548,
2790
  "step": 387
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2791
  }
2792
  ],
2793
  "logging_steps": 1,
@@ -2807,7 +3116,7 @@
2807
  "attributes": {}
2808
  }
2809
  },
2810
- "total_flos": 1.9361945056444416e+16,
2811
  "train_batch_size": 12,
2812
  "trial_name": null,
2813
  "trial_params": null
 
1
  {
2
  "best_metric": 0.30803602933883667,
3
  "best_model_checkpoint": "./opt_trained/checkpoint-258",
4
+ "epoch": 10.0,
5
  "eval_steps": 500,
6
+ "global_step": 430,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2788
  "eval_samples_per_second": 62.191,
2789
  "eval_steps_per_second": 15.548,
2790
  "step": 387
2791
+ },
2792
+ {
2793
+ "epoch": 9.023391812865498,
2794
+ "grad_norm": 4.209856033325195,
2795
+ "learning_rate": 8.152380952380953e-05,
2796
+ "loss": 2.2796,
2797
+ "step": 388
2798
+ },
2799
+ {
2800
+ "epoch": 9.046783625730994,
2801
+ "grad_norm": 4.553707122802734,
2802
+ "learning_rate": 8.147619047619049e-05,
2803
+ "loss": 2.1917,
2804
+ "step": 389
2805
+ },
2806
+ {
2807
+ "epoch": 9.070175438596491,
2808
+ "grad_norm": 2.235180377960205,
2809
+ "learning_rate": 8.142857142857143e-05,
2810
+ "loss": 2.2586,
2811
+ "step": 390
2812
+ },
2813
+ {
2814
+ "epoch": 9.093567251461987,
2815
+ "grad_norm": 3.6155264377593994,
2816
+ "learning_rate": 8.138095238095238e-05,
2817
+ "loss": 2.2361,
2818
+ "step": 391
2819
+ },
2820
+ {
2821
+ "epoch": 9.116959064327485,
2822
+ "grad_norm": 3.4730417728424072,
2823
+ "learning_rate": 8.133333333333334e-05,
2824
+ "loss": 2.2257,
2825
+ "step": 392
2826
+ },
2827
+ {
2828
+ "epoch": 9.140350877192983,
2829
+ "grad_norm": 3.765535831451416,
2830
+ "learning_rate": 8.128571428571428e-05,
2831
+ "loss": 2.1642,
2832
+ "step": 393
2833
+ },
2834
+ {
2835
+ "epoch": 9.163742690058479,
2836
+ "grad_norm": 3.1897642612457275,
2837
+ "learning_rate": 8.123809523809525e-05,
2838
+ "loss": 2.2229,
2839
+ "step": 394
2840
+ },
2841
+ {
2842
+ "epoch": 9.187134502923977,
2843
+ "grad_norm": 3.8044841289520264,
2844
+ "learning_rate": 8.11904761904762e-05,
2845
+ "loss": 2.1882,
2846
+ "step": 395
2847
+ },
2848
+ {
2849
+ "epoch": 9.210526315789474,
2850
+ "grad_norm": 3.561450719833374,
2851
+ "learning_rate": 8.114285714285714e-05,
2852
+ "loss": 2.1809,
2853
+ "step": 396
2854
+ },
2855
+ {
2856
+ "epoch": 9.23391812865497,
2857
+ "grad_norm": 3.2488150596618652,
2858
+ "learning_rate": 8.10952380952381e-05,
2859
+ "loss": 2.2111,
2860
+ "step": 397
2861
+ },
2862
+ {
2863
+ "epoch": 9.257309941520468,
2864
+ "grad_norm": 3.5442261695861816,
2865
+ "learning_rate": 8.104761904761905e-05,
2866
+ "loss": 2.1749,
2867
+ "step": 398
2868
+ },
2869
+ {
2870
+ "epoch": 9.280701754385966,
2871
+ "grad_norm": 2.66875958442688,
2872
+ "learning_rate": 8.1e-05,
2873
+ "loss": 2.1776,
2874
+ "step": 399
2875
+ },
2876
+ {
2877
+ "epoch": 9.304093567251462,
2878
+ "grad_norm": 2.672934055328369,
2879
+ "learning_rate": 8.095238095238096e-05,
2880
+ "loss": 2.1938,
2881
+ "step": 400
2882
+ },
2883
+ {
2884
+ "epoch": 9.32748538011696,
2885
+ "grad_norm": 2.8488197326660156,
2886
+ "learning_rate": 8.090476190476191e-05,
2887
+ "loss": 2.1875,
2888
+ "step": 401
2889
+ },
2890
+ {
2891
+ "epoch": 9.350877192982455,
2892
+ "grad_norm": 3.4410758018493652,
2893
+ "learning_rate": 8.085714285714287e-05,
2894
+ "loss": 2.2253,
2895
+ "step": 402
2896
+ },
2897
+ {
2898
+ "epoch": 9.374269005847953,
2899
+ "grad_norm": 3.20196270942688,
2900
+ "learning_rate": 8.080952380952381e-05,
2901
+ "loss": 2.2043,
2902
+ "step": 403
2903
+ },
2904
+ {
2905
+ "epoch": 9.397660818713451,
2906
+ "grad_norm": 3.485410213470459,
2907
+ "learning_rate": 8.076190476190475e-05,
2908
+ "loss": 2.2155,
2909
+ "step": 404
2910
+ },
2911
+ {
2912
+ "epoch": 9.421052631578947,
2913
+ "grad_norm": 3.2198355197906494,
2914
+ "learning_rate": 8.071428571428573e-05,
2915
+ "loss": 2.2339,
2916
+ "step": 405
2917
+ },
2918
+ {
2919
+ "epoch": 9.444444444444445,
2920
+ "grad_norm": 4.449166774749756,
2921
+ "learning_rate": 8.066666666666667e-05,
2922
+ "loss": 2.2157,
2923
+ "step": 406
2924
+ },
2925
+ {
2926
+ "epoch": 9.46783625730994,
2927
+ "grad_norm": 3.3024957180023193,
2928
+ "learning_rate": 8.061904761904763e-05,
2929
+ "loss": 2.2373,
2930
+ "step": 407
2931
+ },
2932
+ {
2933
+ "epoch": 9.491228070175438,
2934
+ "grad_norm": 4.262597560882568,
2935
+ "learning_rate": 8.057142857142857e-05,
2936
+ "loss": 2.2118,
2937
+ "step": 408
2938
+ },
2939
+ {
2940
+ "epoch": 9.514619883040936,
2941
+ "grad_norm": 3.014378070831299,
2942
+ "learning_rate": 8.052380952380952e-05,
2943
+ "loss": 2.1513,
2944
+ "step": 409
2945
+ },
2946
+ {
2947
+ "epoch": 9.538011695906432,
2948
+ "grad_norm": 2.3644843101501465,
2949
+ "learning_rate": 8.047619047619048e-05,
2950
+ "loss": 2.2105,
2951
+ "step": 410
2952
+ },
2953
+ {
2954
+ "epoch": 9.56140350877193,
2955
+ "grad_norm": 3.573030471801758,
2956
+ "learning_rate": 8.042857142857144e-05,
2957
+ "loss": 2.2014,
2958
+ "step": 411
2959
+ },
2960
+ {
2961
+ "epoch": 9.584795321637428,
2962
+ "grad_norm": 3.49285626411438,
2963
+ "learning_rate": 8.03809523809524e-05,
2964
+ "loss": 2.2258,
2965
+ "step": 412
2966
+ },
2967
+ {
2968
+ "epoch": 9.608187134502923,
2969
+ "grad_norm": 2.701261281967163,
2970
+ "learning_rate": 8.033333333333334e-05,
2971
+ "loss": 2.1926,
2972
+ "step": 413
2973
+ },
2974
+ {
2975
+ "epoch": 9.631578947368421,
2976
+ "grad_norm": 3.1829402446746826,
2977
+ "learning_rate": 8.028571428571428e-05,
2978
+ "loss": 2.1833,
2979
+ "step": 414
2980
+ },
2981
+ {
2982
+ "epoch": 9.654970760233919,
2983
+ "grad_norm": 3.5617990493774414,
2984
+ "learning_rate": 8.023809523809524e-05,
2985
+ "loss": 2.2629,
2986
+ "step": 415
2987
+ },
2988
+ {
2989
+ "epoch": 9.678362573099415,
2990
+ "grad_norm": 3.1133735179901123,
2991
+ "learning_rate": 8.01904761904762e-05,
2992
+ "loss": 2.2344,
2993
+ "step": 416
2994
+ },
2995
+ {
2996
+ "epoch": 9.701754385964913,
2997
+ "grad_norm": 3.0228543281555176,
2998
+ "learning_rate": 8.014285714285715e-05,
2999
+ "loss": 2.2076,
3000
+ "step": 417
3001
+ },
3002
+ {
3003
+ "epoch": 9.725146198830409,
3004
+ "grad_norm": 4.193742752075195,
3005
+ "learning_rate": 8.00952380952381e-05,
3006
+ "loss": 2.2325,
3007
+ "step": 418
3008
+ },
3009
+ {
3010
+ "epoch": 9.748538011695906,
3011
+ "grad_norm": 4.473887920379639,
3012
+ "learning_rate": 8.004761904761905e-05,
3013
+ "loss": 2.2241,
3014
+ "step": 419
3015
+ },
3016
+ {
3017
+ "epoch": 9.771929824561404,
3018
+ "grad_norm": 3.292799234390259,
3019
+ "learning_rate": 8e-05,
3020
+ "loss": 2.2584,
3021
+ "step": 420
3022
+ },
3023
+ {
3024
+ "epoch": 9.7953216374269,
3025
+ "grad_norm": 6.8649516105651855,
3026
+ "learning_rate": 7.995238095238095e-05,
3027
+ "loss": 2.2649,
3028
+ "step": 421
3029
+ },
3030
+ {
3031
+ "epoch": 9.818713450292398,
3032
+ "grad_norm": 3.9372193813323975,
3033
+ "learning_rate": 7.990476190476191e-05,
3034
+ "loss": 2.3003,
3035
+ "step": 422
3036
+ },
3037
+ {
3038
+ "epoch": 9.842105263157894,
3039
+ "grad_norm": 3.112377405166626,
3040
+ "learning_rate": 7.985714285714287e-05,
3041
+ "loss": 2.2106,
3042
+ "step": 423
3043
+ },
3044
+ {
3045
+ "epoch": 9.865497076023392,
3046
+ "grad_norm": 2.902355909347534,
3047
+ "learning_rate": 7.980952380952381e-05,
3048
+ "loss": 2.2946,
3049
+ "step": 424
3050
+ },
3051
+ {
3052
+ "epoch": 9.88888888888889,
3053
+ "grad_norm": 2.473977565765381,
3054
+ "learning_rate": 7.976190476190477e-05,
3055
+ "loss": 2.2175,
3056
+ "step": 425
3057
+ },
3058
+ {
3059
+ "epoch": 9.912280701754385,
3060
+ "grad_norm": 4.093216419219971,
3061
+ "learning_rate": 7.971428571428572e-05,
3062
+ "loss": 2.2092,
3063
+ "step": 426
3064
+ },
3065
+ {
3066
+ "epoch": 9.935672514619883,
3067
+ "grad_norm": 2.5776782035827637,
3068
+ "learning_rate": 7.966666666666666e-05,
3069
+ "loss": 2.219,
3070
+ "step": 427
3071
+ },
3072
+ {
3073
+ "epoch": 9.95906432748538,
3074
+ "grad_norm": 3.246060371398926,
3075
+ "learning_rate": 7.961904761904763e-05,
3076
+ "loss": 2.239,
3077
+ "step": 428
3078
+ },
3079
+ {
3080
+ "epoch": 9.982456140350877,
3081
+ "grad_norm": 2.8515846729278564,
3082
+ "learning_rate": 7.957142857142858e-05,
3083
+ "loss": 2.2154,
3084
+ "step": 429
3085
+ },
3086
+ {
3087
+ "epoch": 10.0,
3088
+ "grad_norm": 2.685945510864258,
3089
+ "learning_rate": 7.952380952380952e-05,
3090
+ "loss": 1.6512,
3091
+ "step": 430
3092
+ },
3093
+ {
3094
+ "epoch": 10.0,
3095
+ "eval_loss": 0.3121136426925659,
3096
+ "eval_runtime": 3.5135,
3097
+ "eval_samples_per_second": 61.478,
3098
+ "eval_steps_per_second": 15.369,
3099
+ "step": 430
3100
  }
3101
  ],
3102
  "logging_steps": 1,
 
3116
  "attributes": {}
3117
  }
3118
  },
3119
+ "total_flos": 2.151327228493824e+16,
3120
  "train_batch_size": 12,
3121
  "trial_name": null,
3122
  "trial_params": null