abcdabcd987 committed on
Commit ad7ae23
1 Parent(s): 79a88df
README.md CHANGED
@@ -1,9 +1,14 @@
 ---
 license: apache-2.0
+ base_model: meta-llama/Llama-2-7b-hf
 language:
 - en
- library_name: peft
 pipeline_tag: text2text-generation
+ tags:
+ - punica
+ - llama-factory
+ - lora
+ - generated_from_trainer
 ---

 * Base Model: [Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf)
@@ -11,4 +16,22 @@ pipeline_tag: text2text-generation
 * LoRA rank: 16
 * Training epochs: 4

- See <https://github.com/punica-ai/punica/tree/master/examples/finetune>
+ See <https://github.com/punica-ai/punica/tree/master/examples/finetune>
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 5e-05
+ - train_batch_size: 32
+ - eval_batch_size: 8
+ - seed: 42
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: cosine
+ - num_epochs: 4.0
+
+ ### Framework versions
+
+ - Transformers 4.34.1
+ - Pytorch 2.2.0.dev20230911+cu121
+ - Datasets 2.14.4
+ - Tokenizers 0.14.1
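For context, a minimal loading sketch for an adapter like this one. It assumes access to the gated meta-llama/Llama-2-7b-hf weights and the `transformers`, `peft`, and `accelerate` packages; the adapter path is a placeholder for a local clone of this repo or its Hub id.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_ID = "meta-llama/Llama-2-7b-hf"
ADAPTER_PATH = "path/to/this-adapter-repo"  # placeholder: local clone or Hub repo id

# The tokenizer files (tokenizer.model, tokenizer_config.json, ...) ship with the adapter.
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_PATH)
base = AutoModelForCausalLM.from_pretrained(
    BASE_ID, torch_dtype=torch.float16, device_map="auto"
)
model = PeftModel.from_pretrained(base, ADAPTER_PATH)  # applies the rank-16 LoRA weights

prompt = "Write a short product description for a solar-powered lantern."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```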
adapter_config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+   "bias": "none",
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "lora_alpha": 16.0,
+   "lora_dropout": 0.1,
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 16,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "down_proj",
+     "k_proj",
+     "q_proj",
+     "up_proj",
+     "v_proj",
+     "gate_proj",
+     "o_proj"
+   ],
+   "task_type": "CAUSAL_LM"
+ }
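The adapter_config.json above maps directly onto a `peft.LoraConfig`; below is a rough sketch of the equivalent config object, with values copied from the file. The actual training entry point is the punica / LLaMA-Factory example referenced in the README, not this snippet.

```python
from peft import LoraConfig, TaskType

# Sketch of a LoraConfig that would serialize to an adapter_config.json like the one above.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,                  # LoRA rank, matches "r": 16 and the README
    lora_alpha=16,         # matches "lora_alpha": 16.0
    lora_dropout=0.1,
    bias="none",
    target_modules=[       # all linear projections in the Llama decoder blocks
        "down_proj", "k_proj", "q_proj", "up_proj",
        "v_proj", "gate_proj", "o_proj",
    ],
)
print(lora_config)
```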
adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1f9f8258c444218b6c652eb2c84cc29b36ca1c0223bfbd31c34460defeb01d8d
+ size 160069834
all_results.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "epoch": 4.0,
+   "train_loss": 0.40969020433914966,
+   "train_runtime": 4321.1174,
+   "train_samples_per_second": 6.918,
+   "train_steps_per_second": 0.217
+ }
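The run-level metrics above are internally consistent; here is a quick check. The per-epoch dataset size (roughly 7.5k examples) is inferred from these numbers and is not stated anywhere in the repo.

```python
# Values copied from all_results.json, trainer_state.json and the README.
train_runtime = 4321.1174            # seconds
samples_per_second = 6.918
total_steps, batch_size, epochs = 936, 32, 4.0

approx_examples = train_runtime * samples_per_second   # ~29,894 examples seen in total
print(round(approx_examples / epochs))                  # ~7,473 examples per epoch (inferred)
print(total_steps / epochs)                             # 234 optimizer steps per epoch
print(round(total_steps / train_runtime, 3))            # ~0.217 steps/s, matches the report
```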
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "</s>",
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
tokenizer_config.json ADDED
@@ -0,0 +1,40 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "legacy": false,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "</s>",
+   "padding_side": "right",
+   "sp_model_kwargs": {},
+   "split_special_tokens": false,
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": true
+ }
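A small usage sketch for the tokenizer files above. The notable detail is that pad_token is mapped to the EOS token `</s>` and padding_side is `right`; the path is a placeholder for this repo.

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/this-adapter-repo")  # placeholder path
print(type(tok).__name__)                                  # LlamaTokenizer / LlamaTokenizerFast
print(tok.bos_token, tok.eos_token, tok.unk_token, tok.pad_token)  # <s> </s> <unk> </s>
print(tok.padding_side)                                    # right

# Batched encoding pads on the right with the </s> id, as configured above.
batch = tok(["short prompt", "a somewhat longer prompt"], padding=True, return_tensors="pt")
print(batch["input_ids"].shape)
```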
train_results.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "epoch": 4.0,
+   "train_loss": 0.40969020433914966,
+   "train_runtime": 4321.1174,
+   "train_samples_per_second": 6.918,
+   "train_steps_per_second": 0.217
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,94 @@
1
+ {"current_steps": 10, "total_steps": 936, "loss": 1.0499, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9985919525527434e-05, "epoch": 0.04, "percentage": 1.07, "elapsed_time": "0:00:46", "remaining_time": "1:12:01"}
2
+ {"current_steps": 20, "total_steps": 936, "loss": 0.7822, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.994369396289063e-05, "epoch": 0.09, "percentage": 2.14, "elapsed_time": "0:01:33", "remaining_time": "1:11:07"}
3
+ {"current_steps": 30, "total_steps": 936, "loss": 0.575, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.987337087656614e-05, "epoch": 0.13, "percentage": 3.21, "elapsed_time": "0:02:19", "remaining_time": "1:10:17"}
4
+ {"current_steps": 40, "total_steps": 936, "loss": 0.5199, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.977502948114772e-05, "epoch": 0.17, "percentage": 4.27, "elapsed_time": "0:03:04", "remaining_time": "1:08:50"}
5
+ {"current_steps": 50, "total_steps": 936, "loss": 0.5358, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.964878055211597e-05, "epoch": 0.21, "percentage": 5.34, "elapsed_time": "0:03:50", "remaining_time": "1:08:04"}
6
+ {"current_steps": 60, "total_steps": 936, "loss": 0.508, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.949476630105669e-05, "epoch": 0.26, "percentage": 6.41, "elapsed_time": "0:04:37", "remaining_time": "1:07:26"}
7
+ {"current_steps": 70, "total_steps": 936, "loss": 0.4915, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9313160215468334e-05, "epoch": 0.3, "percentage": 7.48, "elapsed_time": "0:05:25", "remaining_time": "1:07:04"}
8
+ {"current_steps": 80, "total_steps": 936, "loss": 0.4856, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.910416686333906e-05, "epoch": 0.34, "percentage": 8.55, "elapsed_time": "0:06:10", "remaining_time": "1:06:05"}
9
+ {"current_steps": 90, "total_steps": 936, "loss": 0.479, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.886802166271364e-05, "epoch": 0.38, "percentage": 9.62, "elapsed_time": "0:06:56", "remaining_time": "1:05:18"}
10
+ {"current_steps": 100, "total_steps": 936, "loss": 0.4707, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.8604990616509616e-05, "epoch": 0.43, "percentage": 10.68, "elapsed_time": "0:07:42", "remaining_time": "1:04:23"}
11
+ {"current_steps": 110, "total_steps": 936, "loss": 0.4687, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.8315370012881514e-05, "epoch": 0.47, "percentage": 11.75, "elapsed_time": "0:08:31", "remaining_time": "1:03:58"}
12
+ {"current_steps": 120, "total_steps": 936, "loss": 0.4834, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.799948609147061e-05, "epoch": 0.51, "percentage": 12.82, "elapsed_time": "0:09:16", "remaining_time": "1:03:06"}
13
+ {"current_steps": 130, "total_steps": 936, "loss": 0.4541, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.765769467591625e-05, "epoch": 0.56, "percentage": 13.89, "elapsed_time": "0:10:04", "remaining_time": "1:02:26"}
14
+ {"current_steps": 140, "total_steps": 936, "loss": 0.4442, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.7290380773042575e-05, "epoch": 0.6, "percentage": 14.96, "elapsed_time": "0:10:46", "remaining_time": "1:01:16"}
15
+ {"current_steps": 150, "total_steps": 936, "loss": 0.4528, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.68979581391722e-05, "epoch": 0.64, "percentage": 16.03, "elapsed_time": "0:11:33", "remaining_time": "1:00:36"}
16
+ {"current_steps": 160, "total_steps": 936, "loss": 0.436, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.6480868814055424e-05, "epoch": 0.68, "percentage": 17.09, "elapsed_time": "0:12:19", "remaining_time": "0:59:47"}
17
+ {"current_steps": 170, "total_steps": 936, "loss": 0.4504, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.6039582622939854e-05, "epoch": 0.73, "percentage": 18.16, "elapsed_time": "0:13:04", "remaining_time": "0:58:56"}
18
+ {"current_steps": 180, "total_steps": 936, "loss": 0.4348, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.557459664734141e-05, "epoch": 0.77, "percentage": 19.23, "elapsed_time": "0:13:50", "remaining_time": "0:58:09"}
19
+ {"current_steps": 190, "total_steps": 936, "loss": 0.4283, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.5086434665112864e-05, "epoch": 0.81, "percentage": 20.3, "elapsed_time": "0:14:35", "remaining_time": "0:57:17"}
20
+ {"current_steps": 200, "total_steps": 936, "loss": 0.4527, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.457564656044056e-05, "epoch": 0.85, "percentage": 21.37, "elapsed_time": "0:15:23", "remaining_time": "0:56:39"}
21
+ {"current_steps": 210, "total_steps": 936, "loss": 0.4525, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.404280770443398e-05, "epoch": 0.9, "percentage": 22.44, "elapsed_time": "0:16:09", "remaining_time": "0:55:52"}
22
+ {"current_steps": 220, "total_steps": 936, "loss": 0.4315, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.348851830700593e-05, "epoch": 0.94, "percentage": 23.5, "elapsed_time": "0:16:55", "remaining_time": "0:55:05"}
23
+ {"current_steps": 230, "total_steps": 936, "loss": 0.4309, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.2913402740773294e-05, "epoch": 0.98, "percentage": 24.57, "elapsed_time": "0:17:42", "remaining_time": "0:54:22"}
24
+ {"current_steps": 240, "total_steps": 936, "loss": 0.4207, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.231810883773999e-05, "epoch": 1.03, "percentage": 25.64, "elapsed_time": "0:18:26", "remaining_time": "0:53:28"}
25
+ {"current_steps": 250, "total_steps": 936, "loss": 0.4189, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.170330715955444e-05, "epoch": 1.07, "percentage": 26.71, "elapsed_time": "0:19:12", "remaining_time": "0:52:43"}
26
+ {"current_steps": 260, "total_steps": 936, "loss": 0.4353, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.1069690242163484e-05, "epoch": 1.11, "percentage": 27.78, "elapsed_time": "0:19:58", "remaining_time": "0:51:56"}
27
+ {"current_steps": 270, "total_steps": 936, "loss": 0.4054, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.0417971815713584e-05, "epoch": 1.15, "percentage": 28.85, "elapsed_time": "0:20:47", "remaining_time": "0:51:16"}
28
+ {"current_steps": 280, "total_steps": 936, "loss": 0.4152, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.974888600057808e-05, "epoch": 1.2, "percentage": 29.91, "elapsed_time": "0:21:33", "remaining_time": "0:50:29"}
29
+ {"current_steps": 290, "total_steps": 936, "loss": 0.3879, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.906318648041617e-05, "epoch": 1.24, "percentage": 30.98, "elapsed_time": "0:22:18", "remaining_time": "0:49:42"}
30
+ {"current_steps": 300, "total_steps": 936, "loss": 0.4164, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.8361645653195026e-05, "epoch": 1.28, "percentage": 32.05, "elapsed_time": "0:23:05", "remaining_time": "0:48:57"}
31
+ {"current_steps": 310, "total_steps": 936, "loss": 0.406, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.764505376113138e-05, "epoch": 1.32, "percentage": 33.12, "elapsed_time": "0:23:51", "remaining_time": "0:48:10"}
32
+ {"current_steps": 320, "total_steps": 936, "loss": 0.4076, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.69142180005327e-05, "epoch": 1.37, "percentage": 34.19, "elapsed_time": "0:24:38", "remaining_time": "0:47:25"}
33
+ {"current_steps": 330, "total_steps": 936, "loss": 0.3953, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.6169961612540645e-05, "epoch": 1.41, "percentage": 35.26, "elapsed_time": "0:25:21", "remaining_time": "0:46:33"}
34
+ {"current_steps": 340, "total_steps": 936, "loss": 0.4105, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.5413122955801005e-05, "epoch": 1.45, "percentage": 36.32, "elapsed_time": "0:26:04", "remaining_time": "0:45:43"}
35
+ {"current_steps": 350, "total_steps": 936, "loss": 0.4014, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.4644554562104634e-05, "epoch": 1.5, "percentage": 37.39, "elapsed_time": "0:26:51", "remaining_time": "0:44:57"}
36
+ {"current_steps": 360, "total_steps": 936, "loss": 0.4234, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.386512217606339e-05, "epoch": 1.54, "percentage": 38.46, "elapsed_time": "0:27:37", "remaining_time": "0:44:11"}
37
+ {"current_steps": 370, "total_steps": 936, "loss": 0.4012, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.307570377990245e-05, "epoch": 1.58, "percentage": 39.53, "elapsed_time": "0:28:23", "remaining_time": "0:43:26"}
38
+ {"current_steps": 380, "total_steps": 936, "loss": 0.3979, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.227718860446782e-05, "epoch": 1.62, "percentage": 40.6, "elapsed_time": "0:29:10", "remaining_time": "0:42:41"}
39
+ {"current_steps": 390, "total_steps": 936, "loss": 0.3877, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.147047612756302e-05, "epoch": 1.67, "percentage": 41.67, "elapsed_time": "0:29:53", "remaining_time": "0:41:50"}
40
+ {"current_steps": 400, "total_steps": 936, "loss": 0.4024, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.065647506074306e-05, "epoch": 1.71, "percentage": 42.74, "elapsed_time": "0:30:46", "remaining_time": "0:41:13"}
41
+ {"current_steps": 410, "total_steps": 936, "loss": 0.3994, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.983610232570728e-05, "epoch": 1.75, "percentage": 43.8, "elapsed_time": "0:31:33", "remaining_time": "0:40:29"}
42
+ {"current_steps": 420, "total_steps": 936, "loss": 0.4052, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.9010282021444008e-05, "epoch": 1.79, "percentage": 44.87, "elapsed_time": "0:32:17", "remaining_time": "0:39:40"}
43
+ {"current_steps": 430, "total_steps": 936, "loss": 0.4038, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8179944383290274e-05, "epoch": 1.84, "percentage": 45.94, "elapsed_time": "0:33:00", "remaining_time": "0:38:50"}
44
+ {"current_steps": 440, "total_steps": 936, "loss": 0.3999, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7346024735079486e-05, "epoch": 1.88, "percentage": 47.01, "elapsed_time": "0:33:48", "remaining_time": "0:38:06"}
45
+ {"current_steps": 450, "total_steps": 936, "loss": 0.3848, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.6509462435557152e-05, "epoch": 1.92, "percentage": 48.08, "elapsed_time": "0:34:33", "remaining_time": "0:37:19"}
46
+ {"current_steps": 460, "total_steps": 936, "loss": 0.4126, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.5671199820251534e-05, "epoch": 1.97, "percentage": 49.15, "elapsed_time": "0:35:19", "remaining_time": "0:36:33"}
47
+ {"current_steps": 470, "total_steps": 936, "loss": 0.3996, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.48321811399911e-05, "epoch": 2.01, "percentage": 50.21, "elapsed_time": "0:36:07", "remaining_time": "0:35:48"}
48
+ {"current_steps": 480, "total_steps": 936, "loss": 0.3672, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.399335149726463e-05, "epoch": 2.05, "percentage": 51.28, "elapsed_time": "0:36:52", "remaining_time": "0:35:02"}
49
+ {"current_steps": 490, "total_steps": 936, "loss": 0.3728, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.3155655781621793e-05, "epoch": 2.09, "percentage": 52.35, "elapsed_time": "0:37:40", "remaining_time": "0:34:17"}
50
+ {"current_steps": 500, "total_steps": 936, "loss": 0.3901, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.2320037605313808e-05, "epoch": 2.14, "percentage": 53.42, "elapsed_time": "0:38:29", "remaining_time": "0:33:33"}
51
+ {"current_steps": 510, "total_steps": 936, "loss": 0.3848, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.148743824037269e-05, "epoch": 2.18, "percentage": 54.49, "elapsed_time": "0:39:15", "remaining_time": "0:32:47"}
52
+ {"current_steps": 520, "total_steps": 936, "loss": 0.3729, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.0658795558326743e-05, "epoch": 2.22, "percentage": 55.56, "elapsed_time": "0:40:03", "remaining_time": "0:32:03"}
53
+ {"current_steps": 530, "total_steps": 936, "loss": 0.3709, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.98350429737465e-05, "epoch": 2.26, "percentage": 56.62, "elapsed_time": "0:40:51", "remaining_time": "0:31:17"}
54
+ {"current_steps": 540, "total_steps": 936, "loss": 0.3811, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.9017108392811065e-05, "epoch": 2.31, "percentage": 57.69, "elapsed_time": "0:41:36", "remaining_time": "0:30:30"}
55
+ {"current_steps": 550, "total_steps": 936, "loss": 0.3639, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.820591316807939e-05, "epoch": 2.35, "percentage": 58.76, "elapsed_time": "0:42:19", "remaining_time": "0:29:42"}
56
+ {"current_steps": 560, "total_steps": 936, "loss": 0.3693, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.740237106064383e-05, "epoch": 2.39, "percentage": 59.83, "elapsed_time": "0:43:04", "remaining_time": "0:28:55"}
57
+ {"current_steps": 570, "total_steps": 936, "loss": 0.3769, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.6607387210834887e-05, "epoch": 2.44, "percentage": 60.9, "elapsed_time": "0:43:51", "remaining_time": "0:28:09"}
58
+ {"current_steps": 580, "total_steps": 936, "loss": 0.3621, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.582185711863681e-05, "epoch": 2.48, "percentage": 61.97, "elapsed_time": "0:44:35", "remaining_time": "0:27:22"}
59
+ {"current_steps": 590, "total_steps": 936, "loss": 0.3585, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.5046665634962476e-05, "epoch": 2.52, "percentage": 63.03, "elapsed_time": "0:45:25", "remaining_time": "0:26:38"}
60
+ {"current_steps": 600, "total_steps": 936, "loss": 0.3705, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.4282685964923642e-05, "epoch": 2.56, "percentage": 64.1, "elapsed_time": "0:46:08", "remaining_time": "0:25:50"}
61
+ {"current_steps": 610, "total_steps": 936, "loss": 0.3662, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.3530778684219648e-05, "epoch": 2.61, "percentage": 65.17, "elapsed_time": "0:46:52", "remaining_time": "0:25:03"}
62
+ {"current_steps": 620, "total_steps": 936, "loss": 0.3831, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.2791790769752232e-05, "epoch": 2.65, "percentage": 66.24, "elapsed_time": "0:47:37", "remaining_time": "0:24:16"}
63
+ {"current_steps": 630, "total_steps": 936, "loss": 0.3874, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.2066554645558578e-05, "epoch": 2.69, "percentage": 67.31, "elapsed_time": "0:48:24", "remaining_time": "0:23:30"}
64
+ {"current_steps": 640, "total_steps": 936, "loss": 0.3563, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1355887245137383e-05, "epoch": 2.74, "percentage": 68.38, "elapsed_time": "0:49:13", "remaining_time": "0:22:45"}
65
+ {"current_steps": 650, "total_steps": 936, "loss": 0.3612, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.0660589091223855e-05, "epoch": 2.78, "percentage": 69.44, "elapsed_time": "0:49:59", "remaining_time": "0:21:59"}
66
+ {"current_steps": 660, "total_steps": 936, "loss": 0.3721, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.981443394050525e-06, "epoch": 2.82, "percentage": 70.51, "elapsed_time": "0:50:44", "remaining_time": "0:21:13"}
67
+ {"current_steps": 670, "total_steps": 936, "loss": 0.3744, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.3192151691096e-06, "epoch": 2.86, "percentage": 71.58, "elapsed_time": "0:51:30", "remaining_time": "0:20:26"}
68
+ {"current_steps": 680, "total_steps": 936, "loss": 0.3553, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.67465037541038e-06, "epoch": 2.91, "percentage": 72.65, "elapsed_time": "0:52:18", "remaining_time": "0:19:41"}
69
+ {"current_steps": 690, "total_steps": 936, "loss": 0.3685, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.048475075202727e-06, "epoch": 2.95, "percentage": 73.72, "elapsed_time": "0:53:02", "remaining_time": "0:18:54"}
70
+ {"current_steps": 700, "total_steps": 936, "loss": 0.3731, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.441394616113062e-06, "epoch": 2.99, "percentage": 74.79, "elapsed_time": "0:53:45", "remaining_time": "0:18:07"}
71
+ {"current_steps": 710, "total_steps": 936, "loss": 0.3529, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.854092836613948e-06, "epoch": 3.03, "percentage": 75.85, "elapsed_time": "0:54:30", "remaining_time": "0:17:20"}
72
+ {"current_steps": 720, "total_steps": 936, "loss": 0.3542, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.28723129572247e-06, "epoch": 3.08, "percentage": 76.92, "elapsed_time": "0:55:18", "remaining_time": "0:16:35"}
73
+ {"current_steps": 730, "total_steps": 936, "loss": 0.3587, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.741448527795137e-06, "epoch": 3.12, "percentage": 77.99, "elapsed_time": "0:56:07", "remaining_time": "0:15:50"}
74
+ {"current_steps": 740, "total_steps": 936, "loss": 0.3583, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.217359323258459e-06, "epoch": 3.16, "percentage": 79.06, "elapsed_time": "0:56:53", "remaining_time": "0:15:04"}
75
+ {"current_steps": 750, "total_steps": 936, "loss": 0.3589, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.715554036085673e-06, "epoch": 3.21, "percentage": 80.13, "elapsed_time": "0:57:41", "remaining_time": "0:14:18"}
76
+ {"current_steps": 760, "total_steps": 936, "loss": 0.3424, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.236597918799709e-06, "epoch": 3.25, "percentage": 81.2, "elapsed_time": "0:58:28", "remaining_time": "0:13:32"}
77
+ {"current_steps": 770, "total_steps": 936, "loss": 0.3455, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.7810304857511914e-06, "epoch": 3.29, "percentage": 82.26, "elapsed_time": "0:59:13", "remaining_time": "0:12:45"}
78
+ {"current_steps": 780, "total_steps": 936, "loss": 0.3667, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.3493649053890326e-06, "epoch": 3.33, "percentage": 83.33, "elapsed_time": "1:00:00", "remaining_time": "0:12:00"}
79
+ {"current_steps": 790, "total_steps": 936, "loss": 0.3579, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.942087422208051e-06, "epoch": 3.38, "percentage": 84.4, "elapsed_time": "1:00:45", "remaining_time": "0:11:13"}
80
+ {"current_steps": 800, "total_steps": 936, "loss": 0.342, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.5596568090246548e-06, "epoch": 3.42, "percentage": 85.47, "elapsed_time": "1:01:32", "remaining_time": "0:10:27"}
81
+ {"current_steps": 810, "total_steps": 936, "loss": 0.3542, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.2025038501977486e-06, "epoch": 3.46, "percentage": 86.54, "elapsed_time": "1:02:19", "remaining_time": "0:09:41"}
82
+ {"current_steps": 820, "total_steps": 936, "loss": 0.3385, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.8710308563769124e-06, "epoch": 3.5, "percentage": 87.61, "elapsed_time": "1:03:03", "remaining_time": "0:08:55"}
83
+ {"current_steps": 830, "total_steps": 936, "loss": 0.3564, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.5656112113243721e-06, "epoch": 3.55, "percentage": 88.68, "elapsed_time": "1:03:46", "remaining_time": "0:08:08"}
84
+ {"current_steps": 840, "total_steps": 936, "loss": 0.357, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.286588951321363e-06, "epoch": 3.59, "percentage": 89.74, "elapsed_time": "1:04:32", "remaining_time": "0:07:22"}
85
+ {"current_steps": 850, "total_steps": 936, "loss": 0.3492, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.034278377632636e-06, "epoch": 3.63, "percentage": 90.81, "elapsed_time": "1:05:20", "remaining_time": "0:06:36"}
86
+ {"current_steps": 860, "total_steps": 936, "loss": 0.3547, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.089637024655483e-07, "epoch": 3.68, "percentage": 91.88, "elapsed_time": "1:06:04", "remaining_time": "0:05:50"}
87
+ {"current_steps": 870, "total_steps": 936, "loss": 0.3401, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.108987288226536e-07, "epoch": 3.72, "percentage": 92.95, "elapsed_time": "1:06:55", "remaining_time": "0:05:04"}
88
+ {"current_steps": 880, "total_steps": 936, "loss": 0.3417, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.403065646083809e-07, "epoch": 3.76, "percentage": 94.02, "elapsed_time": "1:07:41", "remaining_time": "0:04:18"}
89
+ {"current_steps": 890, "total_steps": 936, "loss": 0.3709, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.973793713118039e-07, "epoch": 3.8, "percentage": 95.09, "elapsed_time": "1:08:27", "remaining_time": "0:03:32"}
90
+ {"current_steps": 900, "total_steps": 936, "loss": 0.3702, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.8227814754865068e-07, "epoch": 3.85, "percentage": 96.15, "elapsed_time": "1:09:13", "remaining_time": "0:02:46"}
91
+ {"current_steps": 910, "total_steps": 936, "loss": 0.3401, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.513254770636137e-08, "epoch": 3.89, "percentage": 97.22, "elapsed_time": "1:09:58", "remaining_time": "0:01:59"}
92
+ {"current_steps": 920, "total_steps": 936, "loss": 0.3566, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.604073589645596e-08, "epoch": 3.93, "percentage": 98.29, "elapsed_time": "1:10:44", "remaining_time": "0:01:13"}
93
+ {"current_steps": 930, "total_steps": 936, "loss": 0.3785, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.069275378746796e-09, "epoch": 3.97, "percentage": 99.36, "elapsed_time": "1:11:29", "remaining_time": "0:00:27"}
94
+ {"current_steps": 936, "total_steps": 936, "loss": null, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 4.0, "percentage": 100.0, "elapsed_time": "1:11:57", "remaining_time": "0:00:00"}
trainer_state.json ADDED
@@ -0,0 +1,586 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 4.0,
5
+ "eval_steps": 500,
6
+ "global_step": 936,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.04,
13
+ "learning_rate": 4.9985919525527434e-05,
14
+ "loss": 1.0499,
15
+ "step": 10
16
+ },
17
+ {
18
+ "epoch": 0.09,
19
+ "learning_rate": 4.994369396289063e-05,
20
+ "loss": 0.7822,
21
+ "step": 20
22
+ },
23
+ {
24
+ "epoch": 0.13,
25
+ "learning_rate": 4.987337087656614e-05,
26
+ "loss": 0.575,
27
+ "step": 30
28
+ },
29
+ {
30
+ "epoch": 0.17,
31
+ "learning_rate": 4.977502948114772e-05,
32
+ "loss": 0.5199,
33
+ "step": 40
34
+ },
35
+ {
36
+ "epoch": 0.21,
37
+ "learning_rate": 4.964878055211597e-05,
38
+ "loss": 0.5358,
39
+ "step": 50
40
+ },
41
+ {
42
+ "epoch": 0.26,
43
+ "learning_rate": 4.949476630105669e-05,
44
+ "loss": 0.508,
45
+ "step": 60
46
+ },
47
+ {
48
+ "epoch": 0.3,
49
+ "learning_rate": 4.9313160215468334e-05,
50
+ "loss": 0.4915,
51
+ "step": 70
52
+ },
53
+ {
54
+ "epoch": 0.34,
55
+ "learning_rate": 4.910416686333906e-05,
56
+ "loss": 0.4856,
57
+ "step": 80
58
+ },
59
+ {
60
+ "epoch": 0.38,
61
+ "learning_rate": 4.886802166271364e-05,
62
+ "loss": 0.479,
63
+ "step": 90
64
+ },
65
+ {
66
+ "epoch": 0.43,
67
+ "learning_rate": 4.8604990616509616e-05,
68
+ "loss": 0.4707,
69
+ "step": 100
70
+ },
71
+ {
72
+ "epoch": 0.47,
73
+ "learning_rate": 4.8315370012881514e-05,
74
+ "loss": 0.4687,
75
+ "step": 110
76
+ },
77
+ {
78
+ "epoch": 0.51,
79
+ "learning_rate": 4.799948609147061e-05,
80
+ "loss": 0.4834,
81
+ "step": 120
82
+ },
83
+ {
84
+ "epoch": 0.56,
85
+ "learning_rate": 4.765769467591625e-05,
86
+ "loss": 0.4541,
87
+ "step": 130
88
+ },
89
+ {
90
+ "epoch": 0.6,
91
+ "learning_rate": 4.7290380773042575e-05,
92
+ "loss": 0.4442,
93
+ "step": 140
94
+ },
95
+ {
96
+ "epoch": 0.64,
97
+ "learning_rate": 4.68979581391722e-05,
98
+ "loss": 0.4528,
99
+ "step": 150
100
+ },
101
+ {
102
+ "epoch": 0.68,
103
+ "learning_rate": 4.6480868814055424e-05,
104
+ "loss": 0.436,
105
+ "step": 160
106
+ },
107
+ {
108
+ "epoch": 0.73,
109
+ "learning_rate": 4.6039582622939854e-05,
110
+ "loss": 0.4504,
111
+ "step": 170
112
+ },
113
+ {
114
+ "epoch": 0.77,
115
+ "learning_rate": 4.557459664734141e-05,
116
+ "loss": 0.4348,
117
+ "step": 180
118
+ },
119
+ {
120
+ "epoch": 0.81,
121
+ "learning_rate": 4.5086434665112864e-05,
122
+ "loss": 0.4283,
123
+ "step": 190
124
+ },
125
+ {
126
+ "epoch": 0.85,
127
+ "learning_rate": 4.457564656044056e-05,
128
+ "loss": 0.4527,
129
+ "step": 200
130
+ },
131
+ {
132
+ "epoch": 0.9,
133
+ "learning_rate": 4.404280770443398e-05,
134
+ "loss": 0.4525,
135
+ "step": 210
136
+ },
137
+ {
138
+ "epoch": 0.94,
139
+ "learning_rate": 4.348851830700593e-05,
140
+ "loss": 0.4315,
141
+ "step": 220
142
+ },
143
+ {
144
+ "epoch": 0.98,
145
+ "learning_rate": 4.2913402740773294e-05,
146
+ "loss": 0.4309,
147
+ "step": 230
148
+ },
149
+ {
150
+ "epoch": 1.03,
151
+ "learning_rate": 4.231810883773999e-05,
152
+ "loss": 0.4207,
153
+ "step": 240
154
+ },
155
+ {
156
+ "epoch": 1.07,
157
+ "learning_rate": 4.170330715955444e-05,
158
+ "loss": 0.4189,
159
+ "step": 250
160
+ },
161
+ {
162
+ "epoch": 1.11,
163
+ "learning_rate": 4.1069690242163484e-05,
164
+ "loss": 0.4353,
165
+ "step": 260
166
+ },
167
+ {
168
+ "epoch": 1.15,
169
+ "learning_rate": 4.0417971815713584e-05,
170
+ "loss": 0.4054,
171
+ "step": 270
172
+ },
173
+ {
174
+ "epoch": 1.2,
175
+ "learning_rate": 3.974888600057808e-05,
176
+ "loss": 0.4152,
177
+ "step": 280
178
+ },
179
+ {
180
+ "epoch": 1.24,
181
+ "learning_rate": 3.906318648041617e-05,
182
+ "loss": 0.3879,
183
+ "step": 290
184
+ },
185
+ {
186
+ "epoch": 1.28,
187
+ "learning_rate": 3.8361645653195026e-05,
188
+ "loss": 0.4164,
189
+ "step": 300
190
+ },
191
+ {
192
+ "epoch": 1.32,
193
+ "learning_rate": 3.764505376113138e-05,
194
+ "loss": 0.406,
195
+ "step": 310
196
+ },
197
+ {
198
+ "epoch": 1.37,
199
+ "learning_rate": 3.69142180005327e-05,
200
+ "loss": 0.4076,
201
+ "step": 320
202
+ },
203
+ {
204
+ "epoch": 1.41,
205
+ "learning_rate": 3.6169961612540645e-05,
206
+ "loss": 0.3953,
207
+ "step": 330
208
+ },
209
+ {
210
+ "epoch": 1.45,
211
+ "learning_rate": 3.5413122955801005e-05,
212
+ "loss": 0.4105,
213
+ "step": 340
214
+ },
215
+ {
216
+ "epoch": 1.5,
217
+ "learning_rate": 3.4644554562104634e-05,
218
+ "loss": 0.4014,
219
+ "step": 350
220
+ },
221
+ {
222
+ "epoch": 1.54,
223
+ "learning_rate": 3.386512217606339e-05,
224
+ "loss": 0.4234,
225
+ "step": 360
226
+ },
227
+ {
228
+ "epoch": 1.58,
229
+ "learning_rate": 3.307570377990245e-05,
230
+ "loss": 0.4012,
231
+ "step": 370
232
+ },
233
+ {
234
+ "epoch": 1.62,
235
+ "learning_rate": 3.227718860446782e-05,
236
+ "loss": 0.3979,
237
+ "step": 380
238
+ },
239
+ {
240
+ "epoch": 1.67,
241
+ "learning_rate": 3.147047612756302e-05,
242
+ "loss": 0.3877,
243
+ "step": 390
244
+ },
245
+ {
246
+ "epoch": 1.71,
247
+ "learning_rate": 3.065647506074306e-05,
248
+ "loss": 0.4024,
249
+ "step": 400
250
+ },
251
+ {
252
+ "epoch": 1.75,
253
+ "learning_rate": 2.983610232570728e-05,
254
+ "loss": 0.3994,
255
+ "step": 410
256
+ },
257
+ {
258
+ "epoch": 1.79,
259
+ "learning_rate": 2.9010282021444008e-05,
260
+ "loss": 0.4052,
261
+ "step": 420
262
+ },
263
+ {
264
+ "epoch": 1.84,
265
+ "learning_rate": 2.8179944383290274e-05,
266
+ "loss": 0.4038,
267
+ "step": 430
268
+ },
269
+ {
270
+ "epoch": 1.88,
271
+ "learning_rate": 2.7346024735079486e-05,
272
+ "loss": 0.3999,
273
+ "step": 440
274
+ },
275
+ {
276
+ "epoch": 1.92,
277
+ "learning_rate": 2.6509462435557152e-05,
278
+ "loss": 0.3848,
279
+ "step": 450
280
+ },
281
+ {
282
+ "epoch": 1.97,
283
+ "learning_rate": 2.5671199820251534e-05,
284
+ "loss": 0.4126,
285
+ "step": 460
286
+ },
287
+ {
288
+ "epoch": 2.01,
289
+ "learning_rate": 2.48321811399911e-05,
290
+ "loss": 0.3996,
291
+ "step": 470
292
+ },
293
+ {
294
+ "epoch": 2.05,
295
+ "learning_rate": 2.399335149726463e-05,
296
+ "loss": 0.3672,
297
+ "step": 480
298
+ },
299
+ {
300
+ "epoch": 2.09,
301
+ "learning_rate": 2.3155655781621793e-05,
302
+ "loss": 0.3728,
303
+ "step": 490
304
+ },
305
+ {
306
+ "epoch": 2.14,
307
+ "learning_rate": 2.2320037605313808e-05,
308
+ "loss": 0.3901,
309
+ "step": 500
310
+ },
311
+ {
312
+ "epoch": 2.18,
313
+ "learning_rate": 2.148743824037269e-05,
314
+ "loss": 0.3848,
315
+ "step": 510
316
+ },
317
+ {
318
+ "epoch": 2.22,
319
+ "learning_rate": 2.0658795558326743e-05,
320
+ "loss": 0.3729,
321
+ "step": 520
322
+ },
323
+ {
324
+ "epoch": 2.26,
325
+ "learning_rate": 1.98350429737465e-05,
326
+ "loss": 0.3709,
327
+ "step": 530
328
+ },
329
+ {
330
+ "epoch": 2.31,
331
+ "learning_rate": 1.9017108392811065e-05,
332
+ "loss": 0.3811,
333
+ "step": 540
334
+ },
335
+ {
336
+ "epoch": 2.35,
337
+ "learning_rate": 1.820591316807939e-05,
338
+ "loss": 0.3639,
339
+ "step": 550
340
+ },
341
+ {
342
+ "epoch": 2.39,
343
+ "learning_rate": 1.740237106064383e-05,
344
+ "loss": 0.3693,
345
+ "step": 560
346
+ },
347
+ {
348
+ "epoch": 2.44,
349
+ "learning_rate": 1.6607387210834887e-05,
350
+ "loss": 0.3769,
351
+ "step": 570
352
+ },
353
+ {
354
+ "epoch": 2.48,
355
+ "learning_rate": 1.582185711863681e-05,
356
+ "loss": 0.3621,
357
+ "step": 580
358
+ },
359
+ {
360
+ "epoch": 2.52,
361
+ "learning_rate": 1.5046665634962476e-05,
362
+ "loss": 0.3585,
363
+ "step": 590
364
+ },
365
+ {
366
+ "epoch": 2.56,
367
+ "learning_rate": 1.4282685964923642e-05,
368
+ "loss": 0.3705,
369
+ "step": 600
370
+ },
371
+ {
372
+ "epoch": 2.61,
373
+ "learning_rate": 1.3530778684219648e-05,
374
+ "loss": 0.3662,
375
+ "step": 610
376
+ },
377
+ {
378
+ "epoch": 2.65,
379
+ "learning_rate": 1.2791790769752232e-05,
380
+ "loss": 0.3831,
381
+ "step": 620
382
+ },
383
+ {
384
+ "epoch": 2.69,
385
+ "learning_rate": 1.2066554645558578e-05,
386
+ "loss": 0.3874,
387
+ "step": 630
388
+ },
389
+ {
390
+ "epoch": 2.74,
391
+ "learning_rate": 1.1355887245137383e-05,
392
+ "loss": 0.3563,
393
+ "step": 640
394
+ },
395
+ {
396
+ "epoch": 2.78,
397
+ "learning_rate": 1.0660589091223855e-05,
398
+ "loss": 0.3612,
399
+ "step": 650
400
+ },
401
+ {
402
+ "epoch": 2.82,
403
+ "learning_rate": 9.981443394050525e-06,
404
+ "loss": 0.3721,
405
+ "step": 660
406
+ },
407
+ {
408
+ "epoch": 2.86,
409
+ "learning_rate": 9.3192151691096e-06,
410
+ "loss": 0.3744,
411
+ "step": 670
412
+ },
413
+ {
414
+ "epoch": 2.91,
415
+ "learning_rate": 8.67465037541038e-06,
416
+ "loss": 0.3553,
417
+ "step": 680
418
+ },
419
+ {
420
+ "epoch": 2.95,
421
+ "learning_rate": 8.048475075202727e-06,
422
+ "loss": 0.3685,
423
+ "step": 690
424
+ },
425
+ {
426
+ "epoch": 2.99,
427
+ "learning_rate": 7.441394616113062e-06,
428
+ "loss": 0.3731,
429
+ "step": 700
430
+ },
431
+ {
432
+ "epoch": 3.03,
433
+ "learning_rate": 6.854092836613948e-06,
434
+ "loss": 0.3529,
435
+ "step": 710
436
+ },
437
+ {
438
+ "epoch": 3.08,
439
+ "learning_rate": 6.28723129572247e-06,
440
+ "loss": 0.3542,
441
+ "step": 720
442
+ },
443
+ {
444
+ "epoch": 3.12,
445
+ "learning_rate": 5.741448527795137e-06,
446
+ "loss": 0.3587,
447
+ "step": 730
448
+ },
449
+ {
450
+ "epoch": 3.16,
451
+ "learning_rate": 5.217359323258459e-06,
452
+ "loss": 0.3583,
453
+ "step": 740
454
+ },
455
+ {
456
+ "epoch": 3.21,
457
+ "learning_rate": 4.715554036085673e-06,
458
+ "loss": 0.3589,
459
+ "step": 750
460
+ },
461
+ {
462
+ "epoch": 3.25,
463
+ "learning_rate": 4.236597918799709e-06,
464
+ "loss": 0.3424,
465
+ "step": 760
466
+ },
467
+ {
468
+ "epoch": 3.29,
469
+ "learning_rate": 3.7810304857511914e-06,
470
+ "loss": 0.3455,
471
+ "step": 770
472
+ },
473
+ {
474
+ "epoch": 3.33,
475
+ "learning_rate": 3.3493649053890326e-06,
476
+ "loss": 0.3667,
477
+ "step": 780
478
+ },
479
+ {
480
+ "epoch": 3.38,
481
+ "learning_rate": 2.942087422208051e-06,
482
+ "loss": 0.3579,
483
+ "step": 790
484
+ },
485
+ {
486
+ "epoch": 3.42,
487
+ "learning_rate": 2.5596568090246548e-06,
488
+ "loss": 0.342,
489
+ "step": 800
490
+ },
491
+ {
492
+ "epoch": 3.46,
493
+ "learning_rate": 2.2025038501977486e-06,
494
+ "loss": 0.3542,
495
+ "step": 810
496
+ },
497
+ {
498
+ "epoch": 3.5,
499
+ "learning_rate": 1.8710308563769124e-06,
500
+ "loss": 0.3385,
501
+ "step": 820
502
+ },
503
+ {
504
+ "epoch": 3.55,
505
+ "learning_rate": 1.5656112113243721e-06,
506
+ "loss": 0.3564,
507
+ "step": 830
508
+ },
509
+ {
510
+ "epoch": 3.59,
511
+ "learning_rate": 1.286588951321363e-06,
512
+ "loss": 0.357,
513
+ "step": 840
514
+ },
515
+ {
516
+ "epoch": 3.63,
517
+ "learning_rate": 1.034278377632636e-06,
518
+ "loss": 0.3492,
519
+ "step": 850
520
+ },
521
+ {
522
+ "epoch": 3.68,
523
+ "learning_rate": 8.089637024655483e-07,
524
+ "loss": 0.3547,
525
+ "step": 860
526
+ },
527
+ {
528
+ "epoch": 3.72,
529
+ "learning_rate": 6.108987288226536e-07,
530
+ "loss": 0.3401,
531
+ "step": 870
532
+ },
533
+ {
534
+ "epoch": 3.76,
535
+ "learning_rate": 4.403065646083809e-07,
536
+ "loss": 0.3417,
537
+ "step": 880
538
+ },
539
+ {
540
+ "epoch": 3.8,
541
+ "learning_rate": 2.973793713118039e-07,
542
+ "loss": 0.3709,
543
+ "step": 890
544
+ },
545
+ {
546
+ "epoch": 3.85,
547
+ "learning_rate": 1.8227814754865068e-07,
548
+ "loss": 0.3702,
549
+ "step": 900
550
+ },
551
+ {
552
+ "epoch": 3.89,
553
+ "learning_rate": 9.513254770636137e-08,
554
+ "loss": 0.3401,
555
+ "step": 910
556
+ },
557
+ {
558
+ "epoch": 3.93,
559
+ "learning_rate": 3.604073589645596e-08,
560
+ "loss": 0.3566,
561
+ "step": 920
562
+ },
563
+ {
564
+ "epoch": 3.97,
565
+ "learning_rate": 5.069275378746796e-09,
566
+ "loss": 0.3785,
567
+ "step": 930
568
+ },
569
+ {
570
+ "epoch": 4.0,
571
+ "step": 936,
572
+ "total_flos": 4.98594351337046e+17,
573
+ "train_loss": 0.40969020433914966,
574
+ "train_runtime": 4321.1174,
575
+ "train_samples_per_second": 6.918,
576
+ "train_steps_per_second": 0.217
577
+ }
578
+ ],
579
+ "logging_steps": 10,
580
+ "max_steps": 936,
581
+ "num_train_epochs": 4,
582
+ "save_steps": 200,
583
+ "total_flos": 4.98594351337046e+17,
584
+ "trial_name": null,
585
+ "trial_params": null
586
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dbbed73135e44f6ca90ec72ca7dffc3ba0a828c08a457d2a39a4dcf2d902806e
+ size 4664
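training_args.bin is the pickled `TrainingArguments` object that the Hugging Face Trainer saves alongside its outputs. A sketch for inspecting it; a compatible `transformers` version is assumed, and `weights_only=False` is required because it is a full pickle, so only load files you trust.

```python
import torch

args = torch.load("training_args.bin", weights_only=False)
print(type(args).__name__)   # e.g. TrainingArguments / Seq2SeqTrainingArguments
print(args.learning_rate, args.per_device_train_batch_size, args.num_train_epochs)
print(args.lr_scheduler_type, args.seed)
```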
training_loss.png ADDED