Training in progress, step 10530, checkpoint
last-checkpoint/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:e2591dd59db0ee77643ca5cdcbd94f39c1e3e886da067cfd69c1d268687332f1
 size 649681952
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:4862dfb5e20177153969f59a62e33ca923efbd45926a5ad0fabf55311009c6cb
 size 1299433082
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:64d4c2bc12bfd8f01176caa392d040353c55570591b69ee0d19a960a350865aa
 size 1064
last-checkpoint/trainer_state.json
CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.
+  "epoch": 0.9003077975376197,
   "eval_steps": 500,
-  "global_step": 
+  "global_step": 10530,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -16387,6 +16387,2050 @@
       "learning_rate": 9.986320109439124e-06,
       "loss": 2.0138,
       "step": 9360
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.6166166067123413,
+      "learning_rate": 9.96922024623803e-06,
+      "loss": 1.8514,
+      "step": 9364
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.5393308401107788,
+      "learning_rate": 9.952120383036936e-06,
+      "loss": 1.8538,
+      "step": 9368
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.7060655355453491,
+      "learning_rate": 9.935020519835841e-06,
+      "loss": 1.8204,
+      "step": 9372
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.5741777420043945,
+      "learning_rate": 9.917920656634749e-06,
+      "loss": 1.8723,
+      "step": 9376
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.7236156463623047,
+      "learning_rate": 9.900820793433653e-06,
+      "loss": 2.0826,
+      "step": 9380
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.6071019172668457,
+      "learning_rate": 9.883720930232558e-06,
+      "loss": 1.8699,
+      "step": 9384
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.5609468221664429,
+      "learning_rate": 9.866621067031464e-06,
+      "loss": 1.7908,
+      "step": 9388
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.5828368663787842,
+      "learning_rate": 9.84952120383037e-06,
+      "loss": 1.9035,
+      "step": 9392
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.6804313659667969,
+      "learning_rate": 9.832421340629275e-06,
+      "loss": 1.873,
+      "step": 9396
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.7444369792938232,
+      "learning_rate": 9.81532147742818e-06,
+      "loss": 1.9714,
+      "step": 9400
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.478379487991333,
+      "learning_rate": 9.798221614227086e-06,
+      "loss": 1.825,
+      "step": 9404
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.8659464120864868,
+      "learning_rate": 9.781121751025992e-06,
+      "loss": 2.0924,
+      "step": 9408
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.6995452642440796,
+      "learning_rate": 9.764021887824898e-06,
+      "loss": 1.7909,
+      "step": 9412
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.7288949489593506,
+      "learning_rate": 9.746922024623803e-06,
+      "loss": 1.9683,
+      "step": 9416
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.728184461593628,
+      "learning_rate": 9.729822161422709e-06,
+      "loss": 1.867,
+      "step": 9420
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.6516950130462646,
+      "learning_rate": 9.712722298221615e-06,
+      "loss": 1.847,
+      "step": 9424
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.6084563732147217,
+      "learning_rate": 9.69562243502052e-06,
+      "loss": 1.8446,
+      "step": 9428
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.6345360279083252,
+      "learning_rate": 9.678522571819426e-06,
+      "loss": 1.8856,
+      "step": 9432
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.6293641328811646,
+      "learning_rate": 9.661422708618332e-06,
+      "loss": 1.8721,
+      "step": 9436
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.7901579141616821,
+      "learning_rate": 9.644322845417237e-06,
+      "loss": 1.8333,
+      "step": 9440
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.8444390296936035,
+      "learning_rate": 9.627222982216143e-06,
+      "loss": 1.8012,
+      "step": 9444
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.9100682735443115,
+      "learning_rate": 9.610123119015049e-06,
+      "loss": 1.8989,
+      "step": 9448
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.5829551219940186,
+      "learning_rate": 9.593023255813954e-06,
+      "loss": 1.692,
+      "step": 9452
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.7920702695846558,
+      "learning_rate": 9.57592339261286e-06,
+      "loss": 1.9008,
+      "step": 9456
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.8126853704452515,
+      "learning_rate": 9.558823529411764e-06,
+      "loss": 1.8993,
+      "step": 9460
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.6333680152893066,
+      "learning_rate": 9.541723666210671e-06,
+      "loss": 1.8486,
+      "step": 9464
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.6809686422348022,
+      "learning_rate": 9.524623803009577e-06,
+      "loss": 1.8182,
+      "step": 9468
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.5512781143188477,
+      "learning_rate": 9.507523939808483e-06,
+      "loss": 1.782,
+      "step": 9472
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.4863585233688354,
+      "learning_rate": 9.490424076607388e-06,
+      "loss": 1.9,
+      "step": 9476
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.8519527912139893,
+      "learning_rate": 9.473324213406292e-06,
+      "loss": 1.8792,
+      "step": 9480
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.68942391872406,
+      "learning_rate": 9.4562243502052e-06,
+      "loss": 1.81,
+      "step": 9484
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.6430591344833374,
+      "learning_rate": 9.439124487004105e-06,
+      "loss": 1.7057,
+      "step": 9488
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.7153677940368652,
+      "learning_rate": 9.422024623803009e-06,
+      "loss": 1.7675,
+      "step": 9492
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 2.2399628162384033,
+      "learning_rate": 9.404924760601916e-06,
+      "loss": 1.9985,
+      "step": 9496
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.6189675331115723,
+      "learning_rate": 9.38782489740082e-06,
+      "loss": 1.8922,
+      "step": 9500
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.6087392568588257,
+      "learning_rate": 9.370725034199728e-06,
+      "loss": 1.8978,
+      "step": 9504
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.7213088274002075,
+      "learning_rate": 9.353625170998632e-06,
+      "loss": 1.9047,
+      "step": 9508
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.6473731994628906,
+      "learning_rate": 9.336525307797537e-06,
+      "loss": 1.922,
+      "step": 9512
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.6384577751159668,
+      "learning_rate": 9.319425444596445e-06,
+      "loss": 1.9051,
+      "step": 9516
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.6715625524520874,
+      "learning_rate": 9.302325581395349e-06,
+      "loss": 1.7335,
+      "step": 9520
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.7468408346176147,
+      "learning_rate": 9.285225718194254e-06,
+      "loss": 1.8317,
+      "step": 9524
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.6021628379821777,
+      "learning_rate": 9.26812585499316e-06,
+      "loss": 1.7717,
+      "step": 9528
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.4728972911834717,
+      "learning_rate": 9.251025991792066e-06,
+      "loss": 1.8383,
+      "step": 9532
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.641257643699646,
+      "learning_rate": 9.233926128590973e-06,
+      "loss": 1.9203,
+      "step": 9536
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.5959044694900513,
+      "learning_rate": 9.216826265389877e-06,
+      "loss": 1.8186,
+      "step": 9540
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.8256573677062988,
+      "learning_rate": 9.199726402188783e-06,
+      "loss": 1.9279,
+      "step": 9544
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.6256192922592163,
+      "learning_rate": 9.182626538987688e-06,
+      "loss": 1.8773,
+      "step": 9548
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.7054941654205322,
+      "learning_rate": 9.165526675786594e-06,
+      "loss": 1.8826,
+      "step": 9552
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.7333673238754272,
+      "learning_rate": 9.1484268125855e-06,
+      "loss": 1.8855,
+      "step": 9556
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.9579733610153198,
+      "learning_rate": 9.131326949384405e-06,
+      "loss": 1.7222,
+      "step": 9560
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.6406586170196533,
+      "learning_rate": 9.11422708618331e-06,
+      "loss": 1.8063,
+      "step": 9564
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.8166844844818115,
+      "learning_rate": 9.097127222982216e-06,
+      "loss": 1.8802,
+      "step": 9568
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.815399169921875,
+      "learning_rate": 9.080027359781122e-06,
+      "loss": 1.9574,
+      "step": 9572
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.7717112302780151,
+      "learning_rate": 9.062927496580028e-06,
+      "loss": 1.8687,
+      "step": 9576
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.8283933401107788,
+      "learning_rate": 9.045827633378933e-06,
+      "loss": 1.7794,
+      "step": 9580
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.5563534498214722,
+      "learning_rate": 9.028727770177839e-06,
+      "loss": 1.9697,
+      "step": 9584
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.7888858318328857,
+      "learning_rate": 9.011627906976745e-06,
+      "loss": 1.8549,
+      "step": 9588
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.6961920261383057,
+      "learning_rate": 8.99452804377565e-06,
+      "loss": 1.8044,
+      "step": 9592
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.6775288581848145,
+      "learning_rate": 8.977428180574556e-06,
+      "loss": 1.7884,
+      "step": 9596
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.5376509428024292,
+      "learning_rate": 8.960328317373462e-06,
+      "loss": 1.9472,
+      "step": 9600
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.5770347118377686,
+      "learning_rate": 8.943228454172367e-06,
+      "loss": 1.7587,
+      "step": 9604
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.5645045042037964,
+      "learning_rate": 8.926128590971273e-06,
+      "loss": 1.9036,
+      "step": 9608
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.6209347248077393,
+      "learning_rate": 8.909028727770179e-06,
+      "loss": 1.9395,
+      "step": 9612
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.5157575607299805,
+      "learning_rate": 8.891928864569084e-06,
+      "loss": 1.9096,
+      "step": 9616
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.6890482902526855,
+      "learning_rate": 8.874829001367988e-06,
+      "loss": 1.8541,
+      "step": 9620
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.6792246103286743,
+      "learning_rate": 8.857729138166896e-06,
+      "loss": 1.9734,
+      "step": 9624
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.5776066780090332,
+      "learning_rate": 8.840629274965801e-06,
+      "loss": 1.9166,
+      "step": 9628
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.5487701892852783,
+      "learning_rate": 8.823529411764707e-06,
+      "loss": 1.7777,
+      "step": 9632
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.6531468629837036,
+      "learning_rate": 8.806429548563612e-06,
+      "loss": 1.6488,
+      "step": 9636
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.6902531385421753,
+      "learning_rate": 8.789329685362516e-06,
+      "loss": 1.8287,
+      "step": 9640
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.6954971551895142,
+      "learning_rate": 8.772229822161424e-06,
+      "loss": 1.896,
+      "step": 9644
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.5442079305648804,
+      "learning_rate": 8.75512995896033e-06,
+      "loss": 1.8335,
+      "step": 9648
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.739211916923523,
+      "learning_rate": 8.738030095759233e-06,
+      "loss": 1.8265,
+      "step": 9652
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.6437937021255493,
+      "learning_rate": 8.72093023255814e-06,
+      "loss": 1.7487,
+      "step": 9656
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.568172812461853,
+      "learning_rate": 8.703830369357045e-06,
+      "loss": 1.8618,
+      "step": 9660
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.84135103225708,
+      "learning_rate": 8.686730506155952e-06,
+      "loss": 1.8962,
+      "step": 9664
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.599940538406372,
+      "learning_rate": 8.669630642954858e-06,
+      "loss": 1.7502,
+      "step": 9668
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.9855149984359741,
+      "learning_rate": 8.652530779753762e-06,
+      "loss": 2.0228,
+      "step": 9672
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.641782283782959,
+      "learning_rate": 8.635430916552669e-06,
+      "loss": 2.0184,
+      "step": 9676
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.9988453388214111,
+      "learning_rate": 8.618331053351573e-06,
+      "loss": 2.0865,
+      "step": 9680
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7934141159057617,
+      "learning_rate": 8.601231190150479e-06,
+      "loss": 1.8933,
+      "step": 9684
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.5691959857940674,
+      "learning_rate": 8.584131326949384e-06,
+      "loss": 1.8344,
+      "step": 9688
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7705130577087402,
+      "learning_rate": 8.56703146374829e-06,
+      "loss": 1.646,
+      "step": 9692
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.6504237651824951,
+      "learning_rate": 8.549931600547197e-06,
+      "loss": 1.7268,
+      "step": 9696
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.4613388776779175,
+      "learning_rate": 8.532831737346101e-06,
+      "loss": 1.7871,
+      "step": 9700
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7658060789108276,
+      "learning_rate": 8.515731874145007e-06,
+      "loss": 1.8519,
+      "step": 9704
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.84966242313385,
+      "learning_rate": 8.498632010943912e-06,
+      "loss": 1.8277,
+      "step": 9708
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.674241542816162,
+      "learning_rate": 8.481532147742818e-06,
+      "loss": 1.7632,
+      "step": 9712
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7811789512634277,
+      "learning_rate": 8.464432284541724e-06,
+      "loss": 1.899,
+      "step": 9716
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7027822732925415,
+      "learning_rate": 8.44733242134063e-06,
+      "loss": 1.9129,
+      "step": 9720
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7912241220474243,
+      "learning_rate": 8.430232558139535e-06,
+      "loss": 1.8546,
+      "step": 9724
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.5889939069747925,
+      "learning_rate": 8.41313269493844e-06,
+      "loss": 2.0574,
+      "step": 9728
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.55410635471344,
+      "learning_rate": 8.396032831737346e-06,
+      "loss": 1.8547,
+      "step": 9732
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.6358247995376587,
+      "learning_rate": 8.378932968536252e-06,
+      "loss": 1.7924,
+      "step": 9736
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.698111891746521,
+      "learning_rate": 8.361833105335158e-06,
+      "loss": 1.8366,
+      "step": 9740
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.5512586832046509,
+      "learning_rate": 8.344733242134063e-06,
+      "loss": 1.7097,
+      "step": 9744
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.5253231525421143,
+      "learning_rate": 8.327633378932969e-06,
+      "loss": 1.9076,
+      "step": 9748
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.6231822967529297,
+      "learning_rate": 8.310533515731875e-06,
+      "loss": 1.9026,
+      "step": 9752
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.726797103881836,
+      "learning_rate": 8.29343365253078e-06,
+      "loss": 1.9518,
+      "step": 9756
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7323880195617676,
+      "learning_rate": 8.276333789329686e-06,
+      "loss": 1.9352,
+      "step": 9760
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7740181684494019,
+      "learning_rate": 8.259233926128592e-06,
+      "loss": 1.8697,
+      "step": 9764
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.6174265146255493,
+      "learning_rate": 8.242134062927497e-06,
+      "loss": 1.8365,
+      "step": 9768
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.7219985723495483,
+      "learning_rate": 8.225034199726403e-06,
+      "loss": 1.8343,
+      "step": 9772
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.618703007698059,
+      "learning_rate": 8.207934336525308e-06,
+      "loss": 1.914,
+      "step": 9776
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.5896012783050537,
+      "learning_rate": 8.190834473324212e-06,
+      "loss": 1.8311,
+      "step": 9780
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.614791989326477,
+      "learning_rate": 8.17373461012312e-06,
+      "loss": 1.8189,
+      "step": 9784
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.6827095746994019,
+      "learning_rate": 8.156634746922025e-06,
+      "loss": 1.7639,
+      "step": 9788
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.563234806060791,
+      "learning_rate": 8.139534883720931e-06,
+      "loss": 1.8255,
+      "step": 9792
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.871492624282837,
+      "learning_rate": 8.122435020519837e-06,
+      "loss": 1.7558,
+      "step": 9796
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.827107548713684,
+      "learning_rate": 8.10533515731874e-06,
+      "loss": 1.93,
+      "step": 9800
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.7132670879364014,
+      "learning_rate": 8.088235294117648e-06,
+      "loss": 1.9702,
+      "step": 9804
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.71363365650177,
+      "learning_rate": 8.071135430916554e-06,
+      "loss": 1.9638,
+      "step": 9808
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.6538909673690796,
+      "learning_rate": 8.054035567715458e-06,
+      "loss": 1.9329,
+      "step": 9812
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.4397317171096802,
+      "learning_rate": 8.036935704514365e-06,
+      "loss": 1.7031,
+      "step": 9816
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.7500611543655396,
+      "learning_rate": 8.019835841313269e-06,
+      "loss": 1.7927,
+      "step": 9820
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.7213584184646606,
+      "learning_rate": 8.002735978112176e-06,
+      "loss": 1.8121,
+      "step": 9824
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.8555186986923218,
+      "learning_rate": 7.985636114911082e-06,
+      "loss": 1.9388,
+      "step": 9828
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.5887397527694702,
+      "learning_rate": 7.968536251709986e-06,
+      "loss": 1.8344,
+      "step": 9832
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.5652546882629395,
+      "learning_rate": 7.951436388508893e-06,
+      "loss": 1.9696,
+      "step": 9836
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.5642329454421997,
+      "learning_rate": 7.934336525307797e-06,
+      "loss": 1.6744,
+      "step": 9840
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.631618618965149,
+      "learning_rate": 7.917236662106703e-06,
+      "loss": 1.8742,
+      "step": 9844
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.6112003326416016,
+      "learning_rate": 7.90013679890561e-06,
+      "loss": 1.9281,
+      "step": 9848
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.612426996231079,
+      "learning_rate": 7.883036935704514e-06,
+      "loss": 1.9576,
+      "step": 9852
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.7601630687713623,
+      "learning_rate": 7.865937072503421e-06,
+      "loss": 1.7478,
+      "step": 9856
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.782696008682251,
+      "learning_rate": 7.848837209302325e-06,
+      "loss": 1.8667,
+      "step": 9860
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.716011881828308,
+      "learning_rate": 7.831737346101231e-06,
+      "loss": 1.9523,
+      "step": 9864
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.5538578033447266,
+      "learning_rate": 7.814637482900137e-06,
+      "loss": 1.9909,
+      "step": 9868
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.6781466007232666,
+      "learning_rate": 7.797537619699042e-06,
+      "loss": 1.7684,
+      "step": 9872
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.6765227317810059,
+      "learning_rate": 7.780437756497948e-06,
+      "loss": 1.8663,
+      "step": 9876
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.5819069147109985,
+      "learning_rate": 7.763337893296854e-06,
+      "loss": 1.8592,
+      "step": 9880
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.6805880069732666,
+      "learning_rate": 7.74623803009576e-06,
+      "loss": 1.8612,
+      "step": 9884
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.6755802631378174,
+      "learning_rate": 7.729138166894665e-06,
+      "loss": 1.8602,
+      "step": 9888
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.7085371017456055,
+      "learning_rate": 7.71203830369357e-06,
+      "loss": 1.6679,
+      "step": 9892
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.5899639129638672,
+      "learning_rate": 7.694938440492476e-06,
+      "loss": 1.7626,
+      "step": 9896
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.740966796875,
+      "learning_rate": 7.677838577291382e-06,
+      "loss": 1.7984,
+      "step": 9900
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.8925389051437378,
+      "learning_rate": 7.660738714090288e-06,
+      "loss": 1.8242,
+      "step": 9904
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.7259442806243896,
+      "learning_rate": 7.643638850889193e-06,
+      "loss": 1.7816,
+      "step": 9908
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.5764368772506714,
+      "learning_rate": 7.626538987688099e-06,
+      "loss": 1.7718,
+      "step": 9912
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.7654069662094116,
+      "learning_rate": 7.6094391244870045e-06,
+      "loss": 1.7728,
+      "step": 9916
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.5519089698791504,
+      "learning_rate": 7.592339261285911e-06,
+      "loss": 1.9182,
+      "step": 9920
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.690704584121704,
+      "learning_rate": 7.575239398084816e-06,
+      "loss": 1.9885,
+      "step": 9924
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.5261144638061523,
+      "learning_rate": 7.558139534883721e-06,
+      "loss": 1.7169,
+      "step": 9928
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.5968029499053955,
+      "learning_rate": 7.541039671682627e-06,
+      "loss": 1.7766,
+      "step": 9932
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.5991588830947876,
+      "learning_rate": 7.523939808481533e-06,
+      "loss": 1.8359,
+      "step": 9936
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.591503620147705,
+      "learning_rate": 7.506839945280438e-06,
+      "loss": 1.7582,
+      "step": 9940
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.5666191577911377,
+      "learning_rate": 7.489740082079344e-06,
+      "loss": 1.8947,
+      "step": 9944
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.7336504459381104,
+      "learning_rate": 7.472640218878249e-06,
+      "loss": 1.8194,
+      "step": 9948
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.7414416074752808,
+      "learning_rate": 7.455540355677155e-06,
+      "loss": 1.8543,
+      "step": 9952
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.6664258241653442,
+      "learning_rate": 7.438440492476061e-06,
+      "loss": 1.786,
+      "step": 9956
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.594941258430481,
+      "learning_rate": 7.421340629274966e-06,
+      "loss": 1.773,
+      "step": 9960
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.71213698387146,
+      "learning_rate": 7.404240766073872e-06,
+      "loss": 1.9381,
+      "step": 9964
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.6945880651474,
+      "learning_rate": 7.387140902872777e-06,
+      "loss": 1.7854,
+      "step": 9968
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.71058189868927,
+      "learning_rate": 7.370041039671682e-06,
+      "loss": 1.8552,
+      "step": 9972
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.8152309656143188,
+      "learning_rate": 7.3529411764705884e-06,
+      "loss": 1.8759,
+      "step": 9976
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.6043202877044678,
+      "learning_rate": 7.335841313269494e-06,
+      "loss": 1.7945,
+      "step": 9980
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.7119457721710205,
+      "learning_rate": 7.318741450068401e-06,
+      "loss": 1.9138,
+      "step": 9984
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.9094306230545044,
+      "learning_rate": 7.301641586867305e-06,
+      "loss": 1.8234,
+      "step": 9988
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.6789647340774536,
+      "learning_rate": 7.28454172366621e-06,
+      "loss": 1.8883,
+      "step": 9992
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.6013123989105225,
+      "learning_rate": 7.267441860465117e-06,
+      "loss": 1.6778,
+      "step": 9996
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.7987964153289795,
+      "learning_rate": 7.250341997264022e-06,
+      "loss": 1.6727,
+      "step": 10000
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.6856164932250977,
+      "learning_rate": 7.233242134062927e-06,
+      "loss": 1.8289,
+      "step": 10004
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.7940067052841187,
+      "learning_rate": 7.216142270861834e-06,
+      "loss": 1.917,
+      "step": 10008
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.806894063949585,
+      "learning_rate": 7.1990424076607384e-06,
+      "loss": 1.9523,
+      "step": 10012
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.6045538187026978,
+      "learning_rate": 7.181942544459645e-06,
+      "loss": 1.8155,
+      "step": 10016
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.6840627193450928,
+      "learning_rate": 7.164842681258551e-06,
+      "loss": 1.87,
+      "step": 10020
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.5683917999267578,
+      "learning_rate": 7.147742818057455e-06,
+      "loss": 1.9501,
+      "step": 10024
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.6646571159362793,
+      "learning_rate": 7.130642954856362e-06,
+      "loss": 1.862,
+      "step": 10028
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.6929519176483154,
+      "learning_rate": 7.113543091655267e-06,
+      "loss": 1.9485,
+      "step": 10032
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.600963830947876,
+      "learning_rate": 7.096443228454172e-06,
+      "loss": 1.7745,
+      "step": 10036
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.6967799663543701,
+      "learning_rate": 7.079343365253079e-06,
+      "loss": 1.9167,
+      "step": 10040
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.6387646198272705,
+      "learning_rate": 7.062243502051984e-06,
+      "loss": 1.9051,
+      "step": 10044
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.5275384187698364,
+      "learning_rate": 7.04514363885089e-06,
+      "loss": 1.9997,
+      "step": 10048
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.677001953125,
+      "learning_rate": 7.028043775649795e-06,
+      "loss": 1.9359,
+      "step": 10052
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.5836267471313477,
+      "learning_rate": 7.010943912448701e-06,
+      "loss": 1.7359,
+      "step": 10056
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.7847533226013184,
+      "learning_rate": 6.993844049247607e-06,
+      "loss": 1.9568,
+      "step": 10060
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.6765036582946777,
+      "learning_rate": 6.976744186046512e-06,
+      "loss": 1.8862,
+      "step": 10064
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.8817429542541504,
+      "learning_rate": 6.959644322845417e-06,
+      "loss": 1.8935,
+      "step": 10068
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.765724778175354,
+      "learning_rate": 6.942544459644323e-06,
+      "loss": 1.7135,
+      "step": 10072
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.5911288261413574,
+      "learning_rate": 6.925444596443229e-06,
+      "loss": 1.7035,
+      "step": 10076
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.738471508026123,
+      "learning_rate": 6.908344733242135e-06,
+      "loss": 1.7544,
+      "step": 10080
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.719092845916748,
+      "learning_rate": 6.89124487004104e-06,
+      "loss": 1.8507,
+      "step": 10084
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.786017656326294,
+      "learning_rate": 6.874145006839945e-06,
+      "loss": 1.9009,
+      "step": 10088
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 2.0530200004577637,
+      "learning_rate": 6.8570451436388514e-06,
+      "loss": 1.8839,
+      "step": 10092
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.6444154977798462,
+      "learning_rate": 6.839945280437757e-06,
+      "loss": 1.8756,
+      "step": 10096
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.6261224746704102,
+      "learning_rate": 6.822845417236662e-06,
+      "loss": 1.6744,
+      "step": 10100
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.6390396356582642,
+      "learning_rate": 6.805745554035568e-06,
+      "loss": 1.8752,
+      "step": 10104
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.9215726852416992,
+      "learning_rate": 6.788645690834473e-06,
+      "loss": 1.9022,
+      "step": 10108
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.839656949043274,
+      "learning_rate": 6.77154582763338e-06,
+      "loss": 2.1024,
+      "step": 10112
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.6141258478164673,
+      "learning_rate": 6.754445964432285e-06,
+      "loss": 1.7609,
+      "step": 10116
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.6340337991714478,
+      "learning_rate": 6.73734610123119e-06,
+      "loss": 1.8939,
+      "step": 10120
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.529555082321167,
+      "learning_rate": 6.720246238030097e-06,
+      "loss": 1.7863,
+      "step": 10124
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.670279860496521,
+      "learning_rate": 6.7031463748290014e-06,
+      "loss": 1.8213,
+      "step": 10128
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7106720209121704,
+      "learning_rate": 6.686046511627907e-06,
+      "loss": 1.6828,
+      "step": 10132
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.8096644878387451,
+      "learning_rate": 6.668946648426814e-06,
+      "loss": 1.8081,
+      "step": 10136
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.636301040649414,
+      "learning_rate": 6.651846785225718e-06,
+      "loss": 1.8518,
+      "step": 10140
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.5952695608139038,
+      "learning_rate": 6.634746922024625e-06,
+      "loss": 1.7897,
+      "step": 10144
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.5805269479751587,
+      "learning_rate": 6.61764705882353e-06,
+      "loss": 1.9542,
+      "step": 10148
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.8776960372924805,
+      "learning_rate": 6.6005471956224345e-06,
+      "loss": 1.8719,
+      "step": 10152
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.5122414827346802,
+      "learning_rate": 6.583447332421341e-06,
+      "loss": 1.7259,
+      "step": 10156
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7734562158584595,
+      "learning_rate": 6.566347469220247e-06,
+      "loss": 1.8812,
+      "step": 10160
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.511674165725708,
+      "learning_rate": 6.5492476060191514e-06,
+      "loss": 1.841,
+      "step": 10164
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.6405954360961914,
+      "learning_rate": 6.532147742818058e-06,
+      "loss": 1.8166,
+      "step": 10168
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.822506070137024,
+      "learning_rate": 6.515047879616963e-06,
+      "loss": 1.8154,
+      "step": 10172
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.6633646488189697,
+      "learning_rate": 6.497948016415869e-06,
+      "loss": 1.8579,
+      "step": 10176
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.8290987014770508,
+      "learning_rate": 6.480848153214775e-06,
+      "loss": 1.8602,
+      "step": 10180
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.6676056385040283,
+      "learning_rate": 6.46374829001368e-06,
+      "loss": 1.7908,
+      "step": 10184
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.4981609582901,
+      "learning_rate": 6.446648426812586e-06,
+      "loss": 1.7605,
+      "step": 10188
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7770746946334839,
+      "learning_rate": 6.429548563611491e-06,
+      "loss": 2.005,
+      "step": 10192
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.6636948585510254,
+      "learning_rate": 6.412448700410397e-06,
+      "loss": 1.9328,
+      "step": 10196
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.5431137084960938,
+      "learning_rate": 6.395348837209303e-06,
+      "loss": 1.7199,
+      "step": 10200
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.6235666275024414,
+      "learning_rate": 6.378248974008208e-06,
+      "loss": 1.8701,
+      "step": 10204
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.7597670555114746,
+      "learning_rate": 6.3611491108071144e-06,
+      "loss": 1.7161,
+      "step": 10208
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.8666725158691406,
+      "learning_rate": 6.344049247606019e-06,
+      "loss": 1.8283,
+      "step": 10212
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.8828492164611816,
+      "learning_rate": 6.326949384404925e-06,
+      "loss": 1.9689,
+      "step": 10216
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.539336919784546,
+      "learning_rate": 6.309849521203831e-06,
+      "loss": 1.731,
+      "step": 10220
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.6746304035186768,
+      "learning_rate": 6.292749658002736e-06,
+      "loss": 1.7627,
+      "step": 10224
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.6880711317062378,
+      "learning_rate": 6.275649794801641e-06,
+      "loss": 1.8081,
+      "step": 10228
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.6686969995498657,
+      "learning_rate": 6.2585499316005475e-06,
+      "loss": 1.7955,
+      "step": 10232
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.6514986753463745,
+      "learning_rate": 6.241450068399453e-06,
+      "loss": 1.7788,
+      "step": 10236
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.5582215785980225,
+      "learning_rate": 6.224350205198359e-06,
+      "loss": 1.7514,
+      "step": 10240
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.6330581903457642,
+      "learning_rate": 6.2072503419972644e-06,
+      "loss": 1.9055,
+      "step": 10244
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.6368670463562012,
+      "learning_rate": 6.19015047879617e-06,
+      "loss": 1.732,
+      "step": 10248
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.7110068798065186,
+      "learning_rate": 6.173050615595075e-06,
+      "loss": 1.9471,
+      "step": 10252
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.7540807723999023,
+      "learning_rate": 6.155950752393981e-06,
+      "loss": 1.8072,
+      "step": 10256
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.6526390314102173,
+      "learning_rate": 6.138850889192887e-06,
+      "loss": 1.7913,
+      "step": 10260
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.7240028381347656,
+      "learning_rate": 6.121751025991793e-06,
+      "loss": 1.8526,
+      "step": 10264
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.683828592300415,
+      "learning_rate": 6.1046511627906975e-06,
+      "loss": 1.7572,
+      "step": 10268
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.5554591417312622,
+      "learning_rate": 6.087551299589603e-06,
+      "loss": 1.7571,
+      "step": 10272
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.6821385622024536,
+      "learning_rate": 6.07045143638851e-06,
+      "loss": 1.7848,
+      "step": 10276
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.6544996500015259,
+      "learning_rate": 6.053351573187415e-06,
+      "loss": 1.8252,
+      "step": 10280
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.6138795614242554,
+      "learning_rate": 6.03625170998632e-06,
+      "loss": 1.979,
+      "step": 10284
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 2.004995346069336,
+      "learning_rate": 6.019151846785226e-06,
+      "loss": 1.7996,
+      "step": 10288
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.8927818536758423,
+      "learning_rate": 6.002051983584131e-06,
+      "loss": 1.8892,
+      "step": 10292
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.9253323078155518,
+      "learning_rate": 5.984952120383038e-06,
+      "loss": 1.9798,
+      "step": 10296
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.623855471611023,
+      "learning_rate": 5.967852257181943e-06,
+      "loss": 1.6606,
+      "step": 10300
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.5401018857955933,
+      "learning_rate": 5.950752393980848e-06,
+      "loss": 1.7557,
+      "step": 10304
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.643587350845337,
+      "learning_rate": 5.933652530779754e-06,
+      "loss": 1.9448,
+      "step": 10308
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.7403227090835571,
+      "learning_rate": 5.91655266757866e-06,
+      "loss": 1.9609,
+      "step": 10312
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.7716693878173828,
+      "learning_rate": 5.899452804377565e-06,
+      "loss": 1.8861,
+      "step": 10316
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.7579643726348877,
+      "learning_rate": 5.882352941176471e-06,
+      "loss": 1.8461,
+      "step": 10320
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.512486219406128,
+      "learning_rate": 5.8652530779753766e-06,
+      "loss": 1.8184,
+      "step": 10324
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.8807471990585327,
+      "learning_rate": 5.848153214774282e-06,
+      "loss": 1.7981,
+      "step": 10328
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.8313617706298828,
+      "learning_rate": 5.831053351573187e-06,
+      "loss": 1.7566,
+      "step": 10332
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.7667449712753296,
+      "learning_rate": 5.8139534883720935e-06,
+      "loss": 1.8996,
+      "step": 10336
     }
   ],
   "logging_steps": 4,
@@ -16394,7 +18438,7 @@
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 1170,
-  "total_flos": 1.
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null
|
18098 |
+
},
|
18099 |
+
{
|
18100 |
+
"epoch": 0.88,
|
18101 |
+
"grad_norm": 1.5892364978790283,
|
18102 |
+
"learning_rate": 5.796853625170999e-06,
|
18103 |
+
"loss": 1.7042,
|
18104 |
+
"step": 10340
|
18105 |
+
},
|
18106 |
+
{
|
18107 |
+
"epoch": 0.88,
|
18108 |
+
"grad_norm": 1.9081833362579346,
|
18109 |
+
"learning_rate": 5.779753761969905e-06,
|
18110 |
+
"loss": 2.0525,
|
18111 |
+
"step": 10344
|
18112 |
+
},
|
18113 |
+
{
|
18114 |
+
"epoch": 0.88,
|
18115 |
+
"grad_norm": 1.891441822052002,
|
18116 |
+
"learning_rate": 5.76265389876881e-06,
|
18117 |
+
"loss": 1.7879,
|
18118 |
+
"step": 10348
|
18119 |
+
},
|
18120 |
+
{
|
18121 |
+
"epoch": 0.89,
|
18122 |
+
"grad_norm": 1.9260497093200684,
|
18123 |
+
"learning_rate": 5.745554035567715e-06,
|
18124 |
+
"loss": 1.8796,
|
18125 |
+
"step": 10352
|
18126 |
+
},
|
18127 |
+
{
|
18128 |
+
"epoch": 0.89,
|
18129 |
+
"grad_norm": 1.6760286092758179,
|
18130 |
+
"learning_rate": 5.728454172366622e-06,
|
18131 |
+
"loss": 1.9087,
|
18132 |
+
"step": 10356
|
18133 |
+
},
|
18134 |
+
{
|
18135 |
+
"epoch": 0.89,
|
18136 |
+
"grad_norm": 1.6360900402069092,
|
18137 |
+
"learning_rate": 5.711354309165527e-06,
|
18138 |
+
"loss": 1.7715,
|
18139 |
+
"step": 10360
|
18140 |
+
},
|
18141 |
+
{
|
18142 |
+
"epoch": 0.89,
|
18143 |
+
"grad_norm": 1.7029601335525513,
|
18144 |
+
"learning_rate": 5.694254445964432e-06,
|
18145 |
+
"loss": 1.7859,
|
18146 |
+
"step": 10364
|
18147 |
+
},
|
18148 |
+
{
|
18149 |
+
"epoch": 0.89,
|
18150 |
+
"grad_norm": 1.5900228023529053,
|
18151 |
+
"learning_rate": 5.677154582763338e-06,
|
18152 |
+
"loss": 1.8456,
|
18153 |
+
"step": 10368
|
18154 |
+
},
|
18155 |
+
{
|
18156 |
+
"epoch": 0.89,
|
18157 |
+
"grad_norm": 1.6234673261642456,
|
18158 |
+
"learning_rate": 5.6600547195622435e-06,
|
18159 |
+
"loss": 1.8007,
|
18160 |
+
"step": 10372
|
18161 |
+
},
|
18162 |
+
{
|
18163 |
+
"epoch": 0.89,
|
18164 |
+
"grad_norm": 1.5690399408340454,
|
18165 |
+
"learning_rate": 5.64295485636115e-06,
|
18166 |
+
"loss": 1.8154,
|
18167 |
+
"step": 10376
|
18168 |
+
},
|
18169 |
+
{
|
18170 |
+
"epoch": 0.89,
|
18171 |
+
"grad_norm": 1.622010588645935,
|
18172 |
+
"learning_rate": 5.625854993160055e-06,
|
18173 |
+
"loss": 1.9555,
|
18174 |
+
"step": 10380
|
18175 |
+
},
|
18176 |
+
{
|
18177 |
+
"epoch": 0.89,
|
18178 |
+
"grad_norm": 1.7594480514526367,
|
18179 |
+
"learning_rate": 5.6087551299589605e-06,
|
18180 |
+
"loss": 1.9146,
|
18181 |
+
"step": 10384
|
18182 |
+
},
|
18183 |
+
{
|
18184 |
+
"epoch": 0.89,
|
18185 |
+
"grad_norm": 1.644951581954956,
|
18186 |
+
"learning_rate": 5.591655266757866e-06,
|
18187 |
+
"loss": 1.9562,
|
18188 |
+
"step": 10388
|
18189 |
+
},
|
18190 |
+
{
|
18191 |
+
"epoch": 0.89,
|
18192 |
+
"grad_norm": 1.6730928421020508,
|
18193 |
+
"learning_rate": 5.574555403556772e-06,
|
18194 |
+
"loss": 1.927,
|
18195 |
+
"step": 10392
|
18196 |
+
},
|
18197 |
+
{
|
18198 |
+
"epoch": 0.89,
|
18199 |
+
"grad_norm": 1.749422311782837,
|
18200 |
+
"learning_rate": 5.557455540355677e-06,
|
18201 |
+
"loss": 1.8613,
|
18202 |
+
"step": 10396
|
18203 |
+
},
|
18204 |
+
{
|
18205 |
+
"epoch": 0.89,
|
18206 |
+
"grad_norm": 1.6241000890731812,
|
18207 |
+
"learning_rate": 5.540355677154583e-06,
|
18208 |
+
"loss": 1.6963,
|
18209 |
+
"step": 10400
|
18210 |
+
},
|
18211 |
+
{
|
18212 |
+
"epoch": 0.89,
|
18213 |
+
"grad_norm": 1.6478880643844604,
|
18214 |
+
"learning_rate": 5.523255813953489e-06,
|
18215 |
+
"loss": 1.7823,
|
18216 |
+
"step": 10404
|
18217 |
+
},
|
18218 |
+
{
|
18219 |
+
"epoch": 0.89,
|
18220 |
+
"grad_norm": 1.8106017112731934,
|
18221 |
+
"learning_rate": 5.506155950752394e-06,
|
18222 |
+
"loss": 1.8875,
|
18223 |
+
"step": 10408
|
18224 |
+
},
|
18225 |
+
{
|
18226 |
+
"epoch": 0.89,
|
18227 |
+
"grad_norm": 1.792172908782959,
|
18228 |
+
"learning_rate": 5.4890560875513e-06,
|
18229 |
+
"loss": 1.9241,
|
18230 |
+
"step": 10412
|
18231 |
+
},
|
18232 |
+
{
|
18233 |
+
"epoch": 0.89,
|
18234 |
+
"grad_norm": 1.5553025007247925,
|
18235 |
+
"learning_rate": 5.471956224350206e-06,
|
18236 |
+
"loss": 1.7159,
|
18237 |
+
"step": 10416
|
18238 |
+
},
|
18239 |
+
{
|
18240 |
+
"epoch": 0.89,
|
18241 |
+
"grad_norm": 1.5697897672653198,
|
18242 |
+
"learning_rate": 5.454856361149111e-06,
|
18243 |
+
"loss": 1.7839,
|
18244 |
+
"step": 10420
|
18245 |
+
},
|
18246 |
+
{
|
18247 |
+
"epoch": 0.89,
|
18248 |
+
"grad_norm": 1.7735848426818848,
|
18249 |
+
"learning_rate": 5.437756497948017e-06,
|
18250 |
+
"loss": 1.7892,
|
18251 |
+
"step": 10424
|
18252 |
+
},
|
18253 |
+
{
|
18254 |
+
"epoch": 0.89,
|
18255 |
+
"grad_norm": 1.8028829097747803,
|
18256 |
+
"learning_rate": 5.420656634746922e-06,
|
18257 |
+
"loss": 1.8241,
|
18258 |
+
"step": 10428
|
18259 |
+
},
|
18260 |
+
{
|
18261 |
+
"epoch": 0.89,
|
18262 |
+
"grad_norm": 1.7222471237182617,
|
18263 |
+
"learning_rate": 5.403556771545827e-06,
|
18264 |
+
"loss": 1.7322,
|
18265 |
+
"step": 10432
|
18266 |
+
},
|
18267 |
+
{
|
18268 |
+
"epoch": 0.89,
|
18269 |
+
"grad_norm": 1.5926076173782349,
|
18270 |
+
"learning_rate": 5.386456908344734e-06,
|
18271 |
+
"loss": 1.8576,
|
18272 |
+
"step": 10436
|
18273 |
+
},
|
18274 |
+
{
|
18275 |
+
"epoch": 0.89,
|
18276 |
+
"grad_norm": 1.642992377281189,
|
18277 |
+
"learning_rate": 5.3693570451436396e-06,
|
18278 |
+
"loss": 1.83,
|
18279 |
+
"step": 10440
|
18280 |
+
},
|
18281 |
+
{
|
18282 |
+
"epoch": 0.89,
|
18283 |
+
"grad_norm": 1.577621579170227,
|
18284 |
+
"learning_rate": 5.352257181942544e-06,
|
18285 |
+
"loss": 1.7914,
|
18286 |
+
"step": 10444
|
18287 |
+
},
|
18288 |
+
{
|
18289 |
+
"epoch": 0.89,
|
18290 |
+
"grad_norm": 1.5518428087234497,
|
18291 |
+
"learning_rate": 5.33515731874145e-06,
|
18292 |
+
"loss": 1.9518,
|
18293 |
+
"step": 10448
|
18294 |
+
},
|
18295 |
+
{
|
18296 |
+
"epoch": 0.89,
|
18297 |
+
"grad_norm": 1.6200405359268188,
|
18298 |
+
"learning_rate": 5.318057455540356e-06,
|
18299 |
+
"loss": 1.7868,
|
18300 |
+
"step": 10452
|
18301 |
+
},
|
18302 |
+
{
|
18303 |
+
"epoch": 0.89,
|
18304 |
+
"grad_norm": 1.5001662969589233,
|
18305 |
+
"learning_rate": 5.300957592339262e-06,
|
18306 |
+
"loss": 1.8093,
|
18307 |
+
"step": 10456
|
18308 |
+
},
|
18309 |
+
{
|
18310 |
+
"epoch": 0.89,
|
18311 |
+
"grad_norm": 1.6836313009262085,
|
18312 |
+
"learning_rate": 5.283857729138167e-06,
|
18313 |
+
"loss": 1.8562,
|
18314 |
+
"step": 10460
|
18315 |
+
},
|
18316 |
+
{
|
18317 |
+
"epoch": 0.89,
|
18318 |
+
"grad_norm": 1.7168313264846802,
|
18319 |
+
"learning_rate": 5.266757865937073e-06,
|
18320 |
+
"loss": 1.8735,
|
18321 |
+
"step": 10464
|
18322 |
+
},
|
18323 |
+
{
|
18324 |
+
"epoch": 0.9,
|
18325 |
+
"grad_norm": 1.7337522506713867,
|
18326 |
+
"learning_rate": 5.249658002735978e-06,
|
18327 |
+
"loss": 1.9054,
|
18328 |
+
"step": 10468
|
18329 |
+
},
|
18330 |
+
{
|
18331 |
+
"epoch": 0.9,
|
18332 |
+
"grad_norm": 1.4755985736846924,
|
18333 |
+
"learning_rate": 5.232558139534884e-06,
|
18334 |
+
"loss": 1.8456,
|
18335 |
+
"step": 10472
|
18336 |
+
},
|
18337 |
+
{
|
18338 |
+
"epoch": 0.9,
|
18339 |
+
"grad_norm": 1.7717063426971436,
|
18340 |
+
"learning_rate": 5.2154582763337896e-06,
|
18341 |
+
"loss": 1.8035,
|
18342 |
+
"step": 10476
|
18343 |
+
},
|
18344 |
+
{
|
18345 |
+
"epoch": 0.9,
|
18346 |
+
"grad_norm": 1.8099918365478516,
|
18347 |
+
"learning_rate": 5.198358413132695e-06,
|
18348 |
+
"loss": 1.9099,
|
18349 |
+
"step": 10480
|
18350 |
+
},
|
18351 |
+
{
|
18352 |
+
"epoch": 0.9,
|
18353 |
+
"grad_norm": 1.6142700910568237,
|
18354 |
+
"learning_rate": 5.181258549931601e-06,
|
18355 |
+
"loss": 1.7275,
|
18356 |
+
"step": 10484
|
18357 |
+
},
|
18358 |
+
{
|
18359 |
+
"epoch": 0.9,
|
18360 |
+
"grad_norm": 1.6501797437667847,
|
18361 |
+
"learning_rate": 5.1641586867305065e-06,
|
18362 |
+
"loss": 1.7888,
|
18363 |
+
"step": 10488
|
18364 |
+
},
|
18365 |
+
{
|
18366 |
+
"epoch": 0.9,
|
18367 |
+
"grad_norm": 1.6632269620895386,
|
18368 |
+
"learning_rate": 5.147058823529412e-06,
|
18369 |
+
"loss": 1.77,
|
18370 |
+
"step": 10492
|
18371 |
+
},
|
18372 |
+
{
|
18373 |
+
"epoch": 0.9,
|
18374 |
+
"grad_norm": 1.698384165763855,
|
18375 |
+
"learning_rate": 5.129958960328318e-06,
|
18376 |
+
"loss": 1.7493,
|
18377 |
+
"step": 10496
|
18378 |
+
},
|
18379 |
+
{
|
18380 |
+
"epoch": 0.9,
|
18381 |
+
"grad_norm": 1.6063936948776245,
|
18382 |
+
"learning_rate": 5.1128590971272235e-06,
|
18383 |
+
"loss": 1.7966,
|
18384 |
+
"step": 10500
|
18385 |
+
},
|
18386 |
+
{
|
18387 |
+
"epoch": 0.9,
|
18388 |
+
"grad_norm": 1.9107542037963867,
|
18389 |
+
"learning_rate": 5.095759233926129e-06,
|
18390 |
+
"loss": 1.7634,
|
18391 |
+
"step": 10504
|
18392 |
+
},
|
18393 |
+
{
|
18394 |
+
"epoch": 0.9,
|
18395 |
+
"grad_norm": 1.839381217956543,
|
18396 |
+
"learning_rate": 5.078659370725034e-06,
|
18397 |
+
"loss": 1.7991,
|
18398 |
+
"step": 10508
|
18399 |
+
},
|
18400 |
+
{
|
18401 |
+
"epoch": 0.9,
|
18402 |
+
"grad_norm": 1.7109549045562744,
|
18403 |
+
"learning_rate": 5.0615595075239396e-06,
|
18404 |
+
"loss": 1.7911,
|
18405 |
+
"step": 10512
|
18406 |
+
},
|
18407 |
+
{
|
18408 |
+
"epoch": 0.9,
|
18409 |
+
"grad_norm": 1.7641953229904175,
|
18410 |
+
"learning_rate": 5.044459644322846e-06,
|
18411 |
+
"loss": 1.9,
|
18412 |
+
"step": 10516
|
18413 |
+
},
|
18414 |
+
{
|
18415 |
+
"epoch": 0.9,
|
18416 |
+
"grad_norm": 1.981584072113037,
|
18417 |
+
"learning_rate": 5.027359781121752e-06,
|
18418 |
+
"loss": 1.8533,
|
18419 |
+
"step": 10520
|
18420 |
+
},
|
18421 |
+
{
|
18422 |
+
"epoch": 0.9,
|
18423 |
+
"grad_norm": 1.5387113094329834,
|
18424 |
+
"learning_rate": 5.0102599179206565e-06,
|
18425 |
+
"loss": 1.9877,
|
18426 |
+
"step": 10524
|
18427 |
+
},
|
18428 |
+
{
|
18429 |
+
"epoch": 0.9,
|
18430 |
+
"grad_norm": 1.630399465560913,
|
18431 |
+
"learning_rate": 4.993160054719562e-06,
|
18432 |
+
"loss": 1.9056,
|
18433 |
+
"step": 10528
|
18434 |
}
|
18435 |
],
|
18436 |
"logging_steps": 4,
|
|
|
18438 |
"num_input_tokens_seen": 0,
|
18439 |
"num_train_epochs": 1,
|
18440 |
"save_steps": 1170,
|
18441 |
+
"total_flos": 1.4268530945949696e+17,
|
18442 |
"train_batch_size": 4,
|
18443 |
"trial_name": null,
|
18444 |
"trial_params": null
|
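
Note on the logged values: the learning rates above fall by a constant 4.2749658e-09 per optimizer step, which is consistent with the linear-decay schedule that transformers.Trainer applies by default. Working backwards, that slope implies a peak rate of about 5e-05 decayed to zero over roughly 11,696 total steps; both figures are inferred from the log entries, not recorded in this diff, so treat them as assumptions. A minimal sketch under those assumptions:

    # Sketch only: reproduce the logged learning rates under an assumed
    # linear-decay schedule. PEAK_LR and TOTAL_STEPS are inferred from the
    # log entries above, not read from this repository.
    PEAK_LR = 5e-05
    TOTAL_STEPS = 11696

    def linear_decay_lr(step: int) -> float:
        # Learning rate after `step` optimizer steps, decaying linearly to zero.
        return PEAK_LR * (TOTAL_STEPS - step) / TOTAL_STEPS

    # Spot-check against two entries from the log history above.
    assert abs(linear_decay_lr(10200) - 6.395348837209303e-06) < 1e-12
    assert abs(linear_decay_lr(10528) - 4.993160054719562e-06) < 1e-12

The trailing trainer state fields agree with the log shape: "logging_steps": 4 matches the 4-step spacing of the entries, and "save_steps": 1170 places checkpoints on multiples of 1170 steps.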