MohamedAhmedAE
commited on
Training in progress, step 610200, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1715561468
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:feb488d07180dda4d36dc8c04e6962a0296f67d8778b893ecac1f7e5d993b765
|
3 |
size 1715561468
|
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 3431474364
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8797c75efc47e59ca589d0274b2c8ecd06e6d51e1b0e7370194d01a342ade252
|
3 |
size 3431474364
|
last-checkpoint/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 14244
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:55bc85299f5f6627f236f7c8b72ae391f14d02a771d86cdf81791100be66164c
|
3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:657663cc54c163af99554667642ae2a96b3249ce9d1e18733019516ba49032ee
|
3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
-
"epoch": 45.
|
5 |
"eval_steps": 1000,
|
6 |
-
"global_step":
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
@@ -25957,6 +25957,293 @@
|
|
25957 |
"learning_rate": 2.8758473550836634e-05,
|
25958 |
"loss": 0.4025,
|
25959 |
"step": 603600
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25960 |
}
|
25961 |
],
|
25962 |
"logging_steps": 200,
|
@@ -25976,7 +26263,7 @@
|
|
25976 |
"attributes": {}
|
25977 |
}
|
25978 |
},
|
25979 |
-
"total_flos": 9.
|
25980 |
"train_batch_size": 10,
|
25981 |
"trial_name": null,
|
25982 |
"trial_params": null
|
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
+
"epoch": 45.690752527143395,
|
5 |
"eval_steps": 1000,
|
6 |
+
"global_step": 610200,
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
|
|
25957 |
"learning_rate": 2.8758473550836634e-05,
|
25958 |
"loss": 0.4025,
|
25959 |
"step": 603600
|
25960 |
+
},
|
25961 |
+
{
|
25962 |
+
"epoch": 45.21153126169974,
|
25963 |
+
"grad_norm": 5.78782320022583,
|
25964 |
+
"learning_rate": 2.8746844911583486e-05,
|
25965 |
+
"loss": 0.3698,
|
25966 |
+
"step": 603800
|
25967 |
+
},
|
25968 |
+
{
|
25969 |
+
"epoch": 45.22650692624485,
|
25970 |
+
"grad_norm": 3.0406911373138428,
|
25971 |
+
"learning_rate": 2.87352154429745e-05,
|
25972 |
+
"loss": 0.3934,
|
25973 |
+
"step": 604000
|
25974 |
+
},
|
25975 |
+
{
|
25976 |
+
"epoch": 45.22650692624485,
|
25977 |
+
"eval_loss": 1.5707191228866577,
|
25978 |
+
"eval_runtime": 1178.78,
|
25979 |
+
"eval_samples_per_second": 8.402,
|
25980 |
+
"eval_steps_per_second": 0.421,
|
25981 |
+
"step": 604000
|
25982 |
+
},
|
25983 |
+
{
|
25984 |
+
"epoch": 45.241482590789964,
|
25985 |
+
"grad_norm": 18.79865264892578,
|
25986 |
+
"learning_rate": 2.872358514758381e-05,
|
25987 |
+
"loss": 0.4355,
|
25988 |
+
"step": 604200
|
25989 |
+
},
|
25990 |
+
{
|
25991 |
+
"epoch": 45.25645825533508,
|
25992 |
+
"grad_norm": 5.7239508628845215,
|
25993 |
+
"learning_rate": 2.8711954027985765e-05,
|
25994 |
+
"loss": 0.3789,
|
25995 |
+
"step": 604400
|
25996 |
+
},
|
25997 |
+
{
|
25998 |
+
"epoch": 45.271433919880195,
|
25999 |
+
"grad_norm": 5.77394437789917,
|
26000 |
+
"learning_rate": 2.8700322086754894e-05,
|
26001 |
+
"loss": 0.4129,
|
26002 |
+
"step": 604600
|
26003 |
+
},
|
26004 |
+
{
|
26005 |
+
"epoch": 45.28640958442531,
|
26006 |
+
"grad_norm": 8.501045227050781,
|
26007 |
+
"learning_rate": 2.868868932646589e-05,
|
26008 |
+
"loss": 0.4196,
|
26009 |
+
"step": 604800
|
26010 |
+
},
|
26011 |
+
{
|
26012 |
+
"epoch": 45.301385248970426,
|
26013 |
+
"grad_norm": 1.9924376010894775,
|
26014 |
+
"learning_rate": 2.867705574969365e-05,
|
26015 |
+
"loss": 0.404,
|
26016 |
+
"step": 605000
|
26017 |
+
},
|
26018 |
+
{
|
26019 |
+
"epoch": 45.301385248970426,
|
26020 |
+
"eval_loss": 1.5671298503875732,
|
26021 |
+
"eval_runtime": 1178.5157,
|
26022 |
+
"eval_samples_per_second": 8.404,
|
26023 |
+
"eval_steps_per_second": 0.421,
|
26024 |
+
"step": 605000
|
26025 |
+
},
|
26026 |
+
{
|
26027 |
+
"epoch": 45.31636091351554,
|
26028 |
+
"grad_norm": 11.104948997497559,
|
26029 |
+
"learning_rate": 2.8665421359013233e-05,
|
26030 |
+
"loss": 0.4253,
|
26031 |
+
"step": 605200
|
26032 |
+
},
|
26033 |
+
{
|
26034 |
+
"epoch": 45.33133657806065,
|
26035 |
+
"grad_norm": 5.054950714111328,
|
26036 |
+
"learning_rate": 2.865378615699989e-05,
|
26037 |
+
"loss": 0.4109,
|
26038 |
+
"step": 605400
|
26039 |
+
},
|
26040 |
+
{
|
26041 |
+
"epoch": 45.34631224260576,
|
26042 |
+
"grad_norm": 5.942670822143555,
|
26043 |
+
"learning_rate": 2.8642150146229042e-05,
|
26044 |
+
"loss": 0.395,
|
26045 |
+
"step": 605600
|
26046 |
+
},
|
26047 |
+
{
|
26048 |
+
"epoch": 45.36128790715088,
|
26049 |
+
"grad_norm": 1.7649027109146118,
|
26050 |
+
"learning_rate": 2.8630513329276298e-05,
|
26051 |
+
"loss": 0.42,
|
26052 |
+
"step": 605800
|
26053 |
+
},
|
26054 |
+
{
|
26055 |
+
"epoch": 45.37626357169599,
|
26056 |
+
"grad_norm": 4.954268932342529,
|
26057 |
+
"learning_rate": 2.861887570871744e-05,
|
26058 |
+
"loss": 0.4292,
|
26059 |
+
"step": 606000
|
26060 |
+
},
|
26061 |
+
{
|
26062 |
+
"epoch": 45.37626357169599,
|
26063 |
+
"eval_loss": 1.5571595430374146,
|
26064 |
+
"eval_runtime": 1177.9473,
|
26065 |
+
"eval_samples_per_second": 8.408,
|
26066 |
+
"eval_steps_per_second": 0.421,
|
26067 |
+
"step": 606000
|
26068 |
+
},
|
26069 |
+
{
|
26070 |
+
"epoch": 45.391239236241105,
|
26071 |
+
"grad_norm": 8.101126670837402,
|
26072 |
+
"learning_rate": 2.8607237287128442e-05,
|
26073 |
+
"loss": 0.3947,
|
26074 |
+
"step": 606200
|
26075 |
+
},
|
26076 |
+
{
|
26077 |
+
"epoch": 45.406214900786225,
|
26078 |
+
"grad_norm": 19.263370513916016,
|
26079 |
+
"learning_rate": 2.8595598067085422e-05,
|
26080 |
+
"loss": 0.42,
|
26081 |
+
"step": 606400
|
26082 |
+
},
|
26083 |
+
{
|
26084 |
+
"epoch": 45.42119056533134,
|
26085 |
+
"grad_norm": 20.436559677124023,
|
26086 |
+
"learning_rate": 2.8583958051164705e-05,
|
26087 |
+
"loss": 0.41,
|
26088 |
+
"step": 606600
|
26089 |
+
},
|
26090 |
+
{
|
26091 |
+
"epoch": 45.43616622987645,
|
26092 |
+
"grad_norm": 5.639106273651123,
|
26093 |
+
"learning_rate": 2.8572317241942792e-05,
|
26094 |
+
"loss": 0.4125,
|
26095 |
+
"step": 606800
|
26096 |
+
},
|
26097 |
+
{
|
26098 |
+
"epoch": 45.45114189442157,
|
26099 |
+
"grad_norm": 4.174552917480469,
|
26100 |
+
"learning_rate": 2.8560675641996338e-05,
|
26101 |
+
"loss": 0.4398,
|
26102 |
+
"step": 607000
|
26103 |
+
},
|
26104 |
+
{
|
26105 |
+
"epoch": 45.45114189442157,
|
26106 |
+
"eval_loss": 1.550969123840332,
|
26107 |
+
"eval_runtime": 1178.1228,
|
26108 |
+
"eval_samples_per_second": 8.407,
|
26109 |
+
"eval_steps_per_second": 0.421,
|
26110 |
+
"step": 607000
|
26111 |
+
},
|
26112 |
+
{
|
26113 |
+
"epoch": 45.46611755896668,
|
26114 |
+
"grad_norm": 15.794562339782715,
|
26115 |
+
"learning_rate": 2.854903325390218e-05,
|
26116 |
+
"loss": 0.4158,
|
26117 |
+
"step": 607200
|
26118 |
+
},
|
26119 |
+
{
|
26120 |
+
"epoch": 45.48109322351179,
|
26121 |
+
"grad_norm": 3.670137882232666,
|
26122 |
+
"learning_rate": 2.853739008023736e-05,
|
26123 |
+
"loss": 0.4066,
|
26124 |
+
"step": 607400
|
26125 |
+
},
|
26126 |
+
{
|
26127 |
+
"epoch": 45.49606888805691,
|
26128 |
+
"grad_norm": 4.4699506759643555,
|
26129 |
+
"learning_rate": 2.852574612357904e-05,
|
26130 |
+
"loss": 0.435,
|
26131 |
+
"step": 607600
|
26132 |
+
},
|
26133 |
+
{
|
26134 |
+
"epoch": 45.51104455260202,
|
26135 |
+
"grad_norm": 9.282175064086914,
|
26136 |
+
"learning_rate": 2.8514101386504605e-05,
|
26137 |
+
"loss": 0.4065,
|
26138 |
+
"step": 607800
|
26139 |
+
},
|
26140 |
+
{
|
26141 |
+
"epoch": 45.526020217147135,
|
26142 |
+
"grad_norm": 8.399334907531738,
|
26143 |
+
"learning_rate": 2.8502455871591577e-05,
|
26144 |
+
"loss": 0.4054,
|
26145 |
+
"step": 608000
|
26146 |
+
},
|
26147 |
+
{
|
26148 |
+
"epoch": 45.526020217147135,
|
26149 |
+
"eval_loss": 1.5575517416000366,
|
26150 |
+
"eval_runtime": 1177.6967,
|
26151 |
+
"eval_samples_per_second": 8.41,
|
26152 |
+
"eval_steps_per_second": 0.421,
|
26153 |
+
"step": 608000
|
26154 |
+
},
|
26155 |
+
{
|
26156 |
+
"epoch": 45.54099588169225,
|
26157 |
+
"grad_norm": 5.749093532562256,
|
26158 |
+
"learning_rate": 2.8490809581417675e-05,
|
26159 |
+
"loss": 0.3893,
|
26160 |
+
"step": 608200
|
26161 |
+
},
|
26162 |
+
{
|
26163 |
+
"epoch": 45.555971546237366,
|
26164 |
+
"grad_norm": 3.4878060817718506,
|
26165 |
+
"learning_rate": 2.847916251856078e-05,
|
26166 |
+
"loss": 0.4196,
|
26167 |
+
"step": 608400
|
26168 |
+
},
|
26169 |
+
{
|
26170 |
+
"epoch": 45.57094721078248,
|
26171 |
+
"grad_norm": 5.982976913452148,
|
26172 |
+
"learning_rate": 2.846751468559894e-05,
|
26173 |
+
"loss": 0.4163,
|
26174 |
+
"step": 608600
|
26175 |
+
},
|
26176 |
+
{
|
26177 |
+
"epoch": 45.58592287532759,
|
26178 |
+
"grad_norm": 9.301414489746094,
|
26179 |
+
"learning_rate": 2.845586608511038e-05,
|
26180 |
+
"loss": 0.4154,
|
26181 |
+
"step": 608800
|
26182 |
+
},
|
26183 |
+
{
|
26184 |
+
"epoch": 45.60089853987271,
|
26185 |
+
"grad_norm": 14.666509628295898,
|
26186 |
+
"learning_rate": 2.8444216719673478e-05,
|
26187 |
+
"loss": 0.4265,
|
26188 |
+
"step": 609000
|
26189 |
+
},
|
26190 |
+
{
|
26191 |
+
"epoch": 45.60089853987271,
|
26192 |
+
"eval_loss": 1.547120213508606,
|
26193 |
+
"eval_runtime": 1178.0235,
|
26194 |
+
"eval_samples_per_second": 8.407,
|
26195 |
+
"eval_steps_per_second": 0.421,
|
26196 |
+
"step": 609000
|
26197 |
+
},
|
26198 |
+
{
|
26199 |
+
"epoch": 45.61587420441782,
|
26200 |
+
"grad_norm": 9.147184371948242,
|
26201 |
+
"learning_rate": 2.8432566591866823e-05,
|
26202 |
+
"loss": 0.4117,
|
26203 |
+
"step": 609200
|
26204 |
+
},
|
26205 |
+
{
|
26206 |
+
"epoch": 45.63084986896293,
|
26207 |
+
"grad_norm": 9.692912101745605,
|
26208 |
+
"learning_rate": 2.8420915704269114e-05,
|
26209 |
+
"loss": 0.406,
|
26210 |
+
"step": 609400
|
26211 |
+
},
|
26212 |
+
{
|
26213 |
+
"epoch": 45.64582553350805,
|
26214 |
+
"grad_norm": 8.107662200927734,
|
26215 |
+
"learning_rate": 2.8409264059459274e-05,
|
26216 |
+
"loss": 0.4404,
|
26217 |
+
"step": 609600
|
26218 |
+
},
|
26219 |
+
{
|
26220 |
+
"epoch": 45.660801198053164,
|
26221 |
+
"grad_norm": 3.5461621284484863,
|
26222 |
+
"learning_rate": 2.839761166001635e-05,
|
26223 |
+
"loss": 0.4198,
|
26224 |
+
"step": 609800
|
26225 |
+
},
|
26226 |
+
{
|
26227 |
+
"epoch": 45.675776862598276,
|
26228 |
+
"grad_norm": 10.266241073608398,
|
26229 |
+
"learning_rate": 2.8385958508519588e-05,
|
26230 |
+
"loss": 0.3968,
|
26231 |
+
"step": 610000
|
26232 |
+
},
|
26233 |
+
{
|
26234 |
+
"epoch": 45.675776862598276,
|
26235 |
+
"eval_loss": 1.5469167232513428,
|
26236 |
+
"eval_runtime": 1178.5726,
|
26237 |
+
"eval_samples_per_second": 8.403,
|
26238 |
+
"eval_steps_per_second": 0.421,
|
26239 |
+
"step": 610000
|
26240 |
+
},
|
26241 |
+
{
|
26242 |
+
"epoch": 45.690752527143395,
|
26243 |
+
"grad_norm": 11.695104598999023,
|
26244 |
+
"learning_rate": 2.8374304607548386e-05,
|
26245 |
+
"loss": 0.4112,
|
26246 |
+
"step": 610200
|
26247 |
}
|
26248 |
],
|
26249 |
"logging_steps": 200,
|
|
|
26263 |
"attributes": {}
|
26264 |
}
|
26265 |
},
|
26266 |
+
"total_flos": 9.791518820806656e+18,
|
26267 |
"train_batch_size": 10,
|
26268 |
"trial_name": null,
|
26269 |
"trial_params": null
|