juyongjiang committed on
Commit
1c988ac
1 Parent(s): 1871753

upload model checkpoint

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,76 @@
1
- ---
2
- license: gemma
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ tags:
4
+ - alignment-handbook
5
+ - generated_from_trainer
6
+ datasets:
7
+ - llama-duo/synth_summarize_dataset_dedup
8
+ base_model: google/gemma-7b
9
+ model-index:
10
+ - name: gemma7b-summarize-claude3sonnet-32k
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # gemma7b-summarize-claude3sonnet-32k
18
+
19
+ This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the llama-duo/synth_summarize_dataset_dedup dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 2.5524
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
+ More information needed
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 0.0002
41
+ - train_batch_size: 4
42
+ - eval_batch_size: 2
43
+ - seed: 42
44
+ - distributed_type: multi-GPU
45
+ - num_devices: 8
46
+ - gradient_accumulation_steps: 2
47
+ - total_train_batch_size: 64
48
+ - total_eval_batch_size: 16
49
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
+ - lr_scheduler_type: cosine
51
+ - lr_scheduler_warmup_ratio: 0.1
52
+ - num_epochs: 10
53
+
54
+ ### Training results
55
+
56
+ | Training Loss | Epoch | Step | Validation Loss |
57
+ |:-------------:|:-----:|:----:|:---------------:|
58
+ | 1.6901 | 1.0 | 76 | 2.9966 |
59
+ | 1.1272 | 2.0 | 152 | 2.6070 |
60
+ | 1.0337 | 3.0 | 228 | 2.5657 |
61
+ | 0.9638 | 4.0 | 304 | 2.5379 |
62
+ | 0.9419 | 5.0 | 380 | 2.5376 |
63
+ | 0.9117 | 6.0 | 456 | 2.5333 |
64
+ | 0.8944 | 7.0 | 532 | 2.5417 |
65
+ | 0.8824 | 8.0 | 608 | 2.5474 |
66
+ | 0.8759 | 9.0 | 684 | 2.5541 |
67
+ | 0.8735 | 10.0 | 760 | 2.5524 |
68
+
69
+
70
+ ### Framework versions
71
+
72
+ - PEFT 0.10.0
73
+ - Transformers 4.40.0
74
+ - PyTorch 2.1.2+cu121
75
+ - Datasets 2.18.0
76
+ - Tokenizers 0.19.1
adapter_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "google/gemma-7b",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 32,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 16,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "v_proj",
24
+ "q_proj"
25
+ ],
26
+ "task_type": "CAUSAL_LM",
27
+ "use_dora": false,
28
+ "use_rslora": false
29
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6246c0c074e47dbab1c260de365a1dd3cd5f6dee4b8edc86131bd452b7492d8f
3
+ size 12860096
all_results.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "eval_loss": 2.5523786544799805,
4
+ "eval_runtime": 0.2402,
5
+ "eval_samples": 25,
6
+ "eval_samples_per_second": 41.632,
7
+ "eval_steps_per_second": 4.163,
8
+ "total_flos": 2.318334860390826e+18,
9
+ "train_loss": 2.2621336485210217,
10
+ "train_runtime": 1856.2046,
11
+ "train_samples": 31747,
12
+ "train_samples_per_second": 26.156,
13
+ "train_steps_per_second": 0.409
14
+ }
config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/gemma-7b",
3
+ "architectures": [
4
+ "GemmaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 2,
9
+ "eos_token_id": 1,
10
+ "head_dim": 256,
11
+ "hidden_act": "gelu",
12
+ "hidden_activation": null,
13
+ "hidden_size": 3072,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 24576,
16
+ "max_position_embeddings": 8192,
17
+ "model_type": "gemma",
18
+ "num_attention_heads": 16,
19
+ "num_hidden_layers": 28,
20
+ "num_key_value_heads": 16,
21
+ "pad_token_id": 0,
22
+ "quantization_config": {
23
+ "_load_in_4bit": true,
24
+ "_load_in_8bit": false,
25
+ "bnb_4bit_compute_dtype": "bfloat16",
26
+ "bnb_4bit_quant_storage": "uint8",
27
+ "bnb_4bit_quant_type": "nf4",
28
+ "bnb_4bit_use_double_quant": false,
29
+ "llm_int8_enable_fp32_cpu_offload": false,
30
+ "llm_int8_has_fp16_weight": false,
31
+ "llm_int8_skip_modules": null,
32
+ "llm_int8_threshold": 6.0,
33
+ "load_in_4bit": true,
34
+ "load_in_8bit": false,
35
+ "quant_method": "bitsandbytes"
36
+ },
37
+ "rms_norm_eps": 1e-06,
38
+ "rope_scaling": null,
39
+ "rope_theta": 10000.0,
40
+ "torch_dtype": "bfloat16",
41
+ "transformers_version": "4.40.0",
42
+ "use_cache": true,
43
+ "vocab_size": 256000
44
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "eval_loss": 2.5523786544799805,
4
+ "eval_runtime": 0.2402,
5
+ "eval_samples": 25,
6
+ "eval_samples_per_second": 41.632,
7
+ "eval_steps_per_second": 4.163
8
+ }
runs/Jun13_05-58-32_gpu1-1/events.out.tfevents.1718229608.gpu1-1.440205.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:496d8947cbc337a0c498ea8911e5c7c0774ac3699b879cee1d7140ea5726b3cf
3
+ size 40721
runs/Jun13_05-58-32_gpu1-1/events.out.tfevents.1718231465.gpu1-1.440205.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b23bca1b61bda1a02121cfeb428edc004551a46a8dde1ba53809071c871f1e94
3
+ size 359
special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "bos_token": {
7
+ "content": "<bos>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "eos_token": {
14
+ "content": "<eos>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
+ "pad_token": {
21
+ "content": "<pad>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ "unk_token": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ }
34
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:322a5f52ab5cab196761ab397a022d6fa3a2e1418585e532bb6efb2fedd2ae94
3
+ size 17477501
tokenizer_config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<pad>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<eos>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "<bos>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<unk>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "106": {
38
+ "content": "<|im_start|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "107": {
46
+ "content": "<|im_end|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ }
53
+ },
54
+ "additional_special_tokens": [
55
+ "<|im_start|>",
56
+ "<|im_end|>"
57
+ ],
58
+ "bos_token": "<bos>",
59
+ "chat_template": "{% if messages[0]['role'] == 'user' or messages[0]['role'] == 'system' %}{{ bos_token }}{% endif %}{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% elif messages[-1]['role'] == 'assistant' %}{{ eos_token }}{% endif %}",
60
+ "clean_up_tokenization_spaces": false,
61
+ "eos_token": "<eos>",
62
+ "legacy": null,
63
+ "model_max_length": 2048,
64
+ "pad_token": "<pad>",
65
+ "sp_model_kwargs": {},
66
+ "spaces_between_special_tokens": false,
67
+ "tokenizer_class": "GemmaTokenizer",
68
+ "unk_token": "<unk>",
69
+ "use_default_system_prompt": false
70
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "total_flos": 2.318334860390826e+18,
4
+ "train_loss": 2.2621336485210217,
5
+ "train_runtime": 1856.2046,
6
+ "train_samples": 31747,
7
+ "train_samples_per_second": 26.156,
8
+ "train_steps_per_second": 0.409
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,1181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 760,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.013157894736842105,
13
+ "grad_norm": 251.0,
14
+ "learning_rate": 2.631578947368421e-06,
15
+ "loss": 35.372,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.06578947368421052,
20
+ "grad_norm": 239.0,
21
+ "learning_rate": 1.3157894736842106e-05,
22
+ "loss": 35.3482,
23
+ "step": 5
24
+ },
25
+ {
26
+ "epoch": 0.13157894736842105,
27
+ "grad_norm": 83.5,
28
+ "learning_rate": 2.6315789473684212e-05,
29
+ "loss": 30.5543,
30
+ "step": 10
31
+ },
32
+ {
33
+ "epoch": 0.19736842105263158,
34
+ "grad_norm": 37.5,
35
+ "learning_rate": 3.9473684210526316e-05,
36
+ "loss": 23.0935,
37
+ "step": 15
38
+ },
39
+ {
40
+ "epoch": 0.2631578947368421,
41
+ "grad_norm": 17.875,
42
+ "learning_rate": 5.2631578947368424e-05,
43
+ "loss": 18.9215,
44
+ "step": 20
45
+ },
46
+ {
47
+ "epoch": 0.32894736842105265,
48
+ "grad_norm": 11.125,
49
+ "learning_rate": 6.578947368421054e-05,
50
+ "loss": 16.7379,
51
+ "step": 25
52
+ },
53
+ {
54
+ "epoch": 0.39473684210526316,
55
+ "grad_norm": 3.671875,
56
+ "learning_rate": 7.894736842105263e-05,
57
+ "loss": 14.9012,
58
+ "step": 30
59
+ },
60
+ {
61
+ "epoch": 0.4605263157894737,
62
+ "grad_norm": 3.09375,
63
+ "learning_rate": 9.210526315789474e-05,
64
+ "loss": 14.2412,
65
+ "step": 35
66
+ },
67
+ {
68
+ "epoch": 0.5263157894736842,
69
+ "grad_norm": 4.0,
70
+ "learning_rate": 0.00010526315789473685,
71
+ "loss": 13.5992,
72
+ "step": 40
73
+ },
74
+ {
75
+ "epoch": 0.5921052631578947,
76
+ "grad_norm": 6.15625,
77
+ "learning_rate": 0.00011842105263157894,
78
+ "loss": 12.7389,
79
+ "step": 45
80
+ },
81
+ {
82
+ "epoch": 0.6578947368421053,
83
+ "grad_norm": 11.875,
84
+ "learning_rate": 0.00013157894736842108,
85
+ "loss": 11.4753,
86
+ "step": 50
87
+ },
88
+ {
89
+ "epoch": 0.7236842105263158,
90
+ "grad_norm": 19.25,
91
+ "learning_rate": 0.00014473684210526317,
92
+ "loss": 8.7885,
93
+ "step": 55
94
+ },
95
+ {
96
+ "epoch": 0.7894736842105263,
97
+ "grad_norm": 20.75,
98
+ "learning_rate": 0.00015789473684210527,
99
+ "loss": 4.8341,
100
+ "step": 60
101
+ },
102
+ {
103
+ "epoch": 0.8552631578947368,
104
+ "grad_norm": 5.90625,
105
+ "learning_rate": 0.00017105263157894739,
106
+ "loss": 2.389,
107
+ "step": 65
108
+ },
109
+ {
110
+ "epoch": 0.9210526315789473,
111
+ "grad_norm": 3.953125,
112
+ "learning_rate": 0.00018421052631578948,
113
+ "loss": 1.9682,
114
+ "step": 70
115
+ },
116
+ {
117
+ "epoch": 0.9868421052631579,
118
+ "grad_norm": 1.078125,
119
+ "learning_rate": 0.00019736842105263157,
120
+ "loss": 1.6901,
121
+ "step": 75
122
+ },
123
+ {
124
+ "epoch": 1.0,
125
+ "eval_loss": 2.9966416358947754,
126
+ "eval_runtime": 0.2381,
127
+ "eval_samples_per_second": 41.997,
128
+ "eval_steps_per_second": 4.2,
129
+ "step": 76
130
+ },
131
+ {
132
+ "epoch": 1.0526315789473684,
133
+ "grad_norm": 1.0234375,
134
+ "learning_rate": 0.00019998312416333227,
135
+ "loss": 1.565,
136
+ "step": 80
137
+ },
138
+ {
139
+ "epoch": 1.118421052631579,
140
+ "grad_norm": 0.765625,
141
+ "learning_rate": 0.0001999145758387301,
142
+ "loss": 1.4481,
143
+ "step": 85
144
+ },
145
+ {
146
+ "epoch": 1.1842105263157894,
147
+ "grad_norm": 0.97265625,
148
+ "learning_rate": 0.00019979333640833947,
149
+ "loss": 1.3767,
150
+ "step": 90
151
+ },
152
+ {
153
+ "epoch": 1.25,
154
+ "grad_norm": 1.078125,
155
+ "learning_rate": 0.00019961946980917456,
156
+ "loss": 1.3246,
157
+ "step": 95
158
+ },
159
+ {
160
+ "epoch": 1.3157894736842106,
161
+ "grad_norm": 0.58203125,
162
+ "learning_rate": 0.00019939306773179497,
163
+ "loss": 1.2875,
164
+ "step": 100
165
+ },
166
+ {
167
+ "epoch": 1.381578947368421,
168
+ "grad_norm": 0.46484375,
169
+ "learning_rate": 0.00019911424957195158,
170
+ "loss": 1.244,
171
+ "step": 105
172
+ },
173
+ {
174
+ "epoch": 1.4473684210526316,
175
+ "grad_norm": 0.859375,
176
+ "learning_rate": 0.00019878316236762196,
177
+ "loss": 1.2335,
178
+ "step": 110
179
+ },
180
+ {
181
+ "epoch": 1.513157894736842,
182
+ "grad_norm": 0.78515625,
183
+ "learning_rate": 0.000198399980721468,
184
+ "loss": 1.2052,
185
+ "step": 115
186
+ },
187
+ {
188
+ "epoch": 1.5789473684210527,
189
+ "grad_norm": 0.59765625,
190
+ "learning_rate": 0.0001979649067087574,
191
+ "loss": 1.1933,
192
+ "step": 120
193
+ },
194
+ {
195
+ "epoch": 1.6447368421052633,
196
+ "grad_norm": 1.0703125,
197
+ "learning_rate": 0.00019747816977079671,
198
+ "loss": 1.1718,
199
+ "step": 125
200
+ },
201
+ {
202
+ "epoch": 1.7105263157894737,
203
+ "grad_norm": 1.7734375,
204
+ "learning_rate": 0.00019694002659393305,
205
+ "loss": 1.1603,
206
+ "step": 130
207
+ },
208
+ {
209
+ "epoch": 1.776315789473684,
210
+ "grad_norm": 0.5703125,
211
+ "learning_rate": 0.00019635076097418734,
212
+ "loss": 1.1664,
213
+ "step": 135
214
+ },
215
+ {
216
+ "epoch": 1.8421052631578947,
217
+ "grad_norm": 0.765625,
218
+ "learning_rate": 0.00019571068366759143,
219
+ "loss": 1.1523,
220
+ "step": 140
221
+ },
222
+ {
223
+ "epoch": 1.9078947368421053,
224
+ "grad_norm": 0.51953125,
225
+ "learning_rate": 0.00019502013222630712,
226
+ "loss": 1.1372,
227
+ "step": 145
228
+ },
229
+ {
230
+ "epoch": 1.973684210526316,
231
+ "grad_norm": 0.703125,
232
+ "learning_rate": 0.00019427947082061432,
233
+ "loss": 1.1272,
234
+ "step": 150
235
+ },
236
+ {
237
+ "epoch": 2.0,
238
+ "eval_loss": 2.6070284843444824,
239
+ "eval_runtime": 0.2379,
240
+ "eval_samples_per_second": 42.033,
241
+ "eval_steps_per_second": 4.203,
242
+ "step": 152
243
+ },
244
+ {
245
+ "epoch": 2.039473684210526,
246
+ "grad_norm": 0.6875,
247
+ "learning_rate": 0.00019348909004686152,
248
+ "loss": 1.1013,
249
+ "step": 155
250
+ },
251
+ {
252
+ "epoch": 2.1052631578947367,
253
+ "grad_norm": 1.15625,
254
+ "learning_rate": 0.00019264940672148018,
255
+ "loss": 1.1011,
256
+ "step": 160
257
+ },
258
+ {
259
+ "epoch": 2.1710526315789473,
260
+ "grad_norm": 0.59765625,
261
+ "learning_rate": 0.00019176086366117211,
262
+ "loss": 1.0822,
263
+ "step": 165
264
+ },
265
+ {
266
+ "epoch": 2.236842105263158,
267
+ "grad_norm": 1.21875,
268
+ "learning_rate": 0.00019082392944938466,
269
+ "loss": 1.0853,
270
+ "step": 170
271
+ },
272
+ {
273
+ "epoch": 2.3026315789473686,
274
+ "grad_norm": 0.859375,
275
+ "learning_rate": 0.0001898390981891979,
276
+ "loss": 1.0778,
277
+ "step": 175
278
+ },
279
+ {
280
+ "epoch": 2.3684210526315788,
281
+ "grad_norm": 0.58203125,
282
+ "learning_rate": 0.00018880688924275378,
283
+ "loss": 1.0601,
284
+ "step": 180
285
+ },
286
+ {
287
+ "epoch": 2.4342105263157894,
288
+ "grad_norm": 0.8359375,
289
+ "learning_rate": 0.0001877278469573643,
290
+ "loss": 1.0645,
291
+ "step": 185
292
+ },
293
+ {
294
+ "epoch": 2.5,
295
+ "grad_norm": 1.0078125,
296
+ "learning_rate": 0.00018660254037844388,
297
+ "loss": 1.0663,
298
+ "step": 190
299
+ },
300
+ {
301
+ "epoch": 2.5657894736842106,
302
+ "grad_norm": 2.015625,
303
+ "learning_rate": 0.0001854315629494165,
304
+ "loss": 1.0582,
305
+ "step": 195
306
+ },
307
+ {
308
+ "epoch": 2.6315789473684212,
309
+ "grad_norm": 1.015625,
310
+ "learning_rate": 0.00018421553219875658,
311
+ "loss": 1.05,
312
+ "step": 200
313
+ },
314
+ {
315
+ "epoch": 2.6973684210526314,
316
+ "grad_norm": 2.375,
317
+ "learning_rate": 0.00018295508941432815,
318
+ "loss": 1.0463,
319
+ "step": 205
320
+ },
321
+ {
322
+ "epoch": 2.763157894736842,
323
+ "grad_norm": 0.7890625,
324
+ "learning_rate": 0.0001816508993051943,
325
+ "loss": 1.0336,
326
+ "step": 210
327
+ },
328
+ {
329
+ "epoch": 2.8289473684210527,
330
+ "grad_norm": 0.84375,
331
+ "learning_rate": 0.0001803036496510752,
332
+ "loss": 1.0233,
333
+ "step": 215
334
+ },
335
+ {
336
+ "epoch": 2.8947368421052633,
337
+ "grad_norm": 0.76171875,
338
+ "learning_rate": 0.00017891405093963938,
339
+ "loss": 1.0287,
340
+ "step": 220
341
+ },
342
+ {
343
+ "epoch": 2.9605263157894735,
344
+ "grad_norm": 0.75390625,
345
+ "learning_rate": 0.00017748283599182014,
346
+ "loss": 1.0337,
347
+ "step": 225
348
+ },
349
+ {
350
+ "epoch": 3.0,
351
+ "eval_loss": 2.565699338912964,
352
+ "eval_runtime": 0.2367,
353
+ "eval_samples_per_second": 42.254,
354
+ "eval_steps_per_second": 4.225,
355
+ "step": 228
356
+ },
357
+ {
358
+ "epoch": 3.026315789473684,
359
+ "grad_norm": 0.9375,
360
+ "learning_rate": 0.00017601075957535364,
361
+ "loss": 1.0152,
362
+ "step": 230
363
+ },
364
+ {
365
+ "epoch": 3.0921052631578947,
366
+ "grad_norm": 1.109375,
367
+ "learning_rate": 0.00017449859800674371,
368
+ "loss": 0.9987,
369
+ "step": 235
370
+ },
371
+ {
372
+ "epoch": 3.1578947368421053,
373
+ "grad_norm": 2.046875,
374
+ "learning_rate": 0.0001729471487418621,
375
+ "loss": 0.9947,
376
+ "step": 240
377
+ },
378
+ {
379
+ "epoch": 3.223684210526316,
380
+ "grad_norm": 0.94921875,
381
+ "learning_rate": 0.00017135722995540107,
382
+ "loss": 0.9919,
383
+ "step": 245
384
+ },
385
+ {
386
+ "epoch": 3.2894736842105265,
387
+ "grad_norm": 0.99609375,
388
+ "learning_rate": 0.00016972968010939954,
389
+ "loss": 0.9945,
390
+ "step": 250
391
+ },
392
+ {
393
+ "epoch": 3.3552631578947367,
394
+ "grad_norm": 0.890625,
395
+ "learning_rate": 0.00016806535751107037,
396
+ "loss": 0.9904,
397
+ "step": 255
398
+ },
399
+ {
400
+ "epoch": 3.4210526315789473,
401
+ "grad_norm": 1.1171875,
402
+ "learning_rate": 0.00016636513986016213,
403
+ "loss": 0.9886,
404
+ "step": 260
405
+ },
406
+ {
407
+ "epoch": 3.486842105263158,
408
+ "grad_norm": 1.1171875,
409
+ "learning_rate": 0.00016462992378609407,
410
+ "loss": 0.99,
411
+ "step": 265
412
+ },
413
+ {
414
+ "epoch": 3.5526315789473686,
415
+ "grad_norm": 0.78515625,
416
+ "learning_rate": 0.0001628606243751082,
417
+ "loss": 0.9947,
418
+ "step": 270
419
+ },
420
+ {
421
+ "epoch": 3.6184210526315788,
422
+ "grad_norm": 0.76953125,
423
+ "learning_rate": 0.00016105817468768798,
424
+ "loss": 0.991,
425
+ "step": 275
426
+ },
427
+ {
428
+ "epoch": 3.6842105263157894,
429
+ "grad_norm": 1.03125,
430
+ "learning_rate": 0.00015922352526649803,
431
+ "loss": 0.982,
432
+ "step": 280
433
+ },
434
+ {
435
+ "epoch": 3.75,
436
+ "grad_norm": 1.1171875,
437
+ "learning_rate": 0.0001573576436351046,
438
+ "loss": 0.9797,
439
+ "step": 285
440
+ },
441
+ {
442
+ "epoch": 3.8157894736842106,
443
+ "grad_norm": 1.703125,
444
+ "learning_rate": 0.00015546151378774086,
445
+ "loss": 0.9724,
446
+ "step": 290
447
+ },
448
+ {
449
+ "epoch": 3.8815789473684212,
450
+ "grad_norm": 0.6796875,
451
+ "learning_rate": 0.00015353613567038607,
452
+ "loss": 0.9695,
453
+ "step": 295
454
+ },
455
+ {
456
+ "epoch": 3.9473684210526314,
457
+ "grad_norm": 0.87109375,
458
+ "learning_rate": 0.00015158252465343242,
459
+ "loss": 0.9638,
460
+ "step": 300
461
+ },
462
+ {
463
+ "epoch": 4.0,
464
+ "eval_loss": 2.537924289703369,
465
+ "eval_runtime": 0.2365,
466
+ "eval_samples_per_second": 42.288,
467
+ "eval_steps_per_second": 4.229,
468
+ "step": 304
469
+ },
470
+ {
471
+ "epoch": 4.0131578947368425,
472
+ "grad_norm": 0.7734375,
473
+ "learning_rate": 0.00014960171099621795,
474
+ "loss": 0.9669,
475
+ "step": 305
476
+ },
477
+ {
478
+ "epoch": 4.078947368421052,
479
+ "grad_norm": 0.7109375,
480
+ "learning_rate": 0.00014759473930370736,
481
+ "loss": 0.9414,
482
+ "step": 310
483
+ },
484
+ {
485
+ "epoch": 4.144736842105263,
486
+ "grad_norm": 0.921875,
487
+ "learning_rate": 0.00014556266797560732,
488
+ "loss": 0.9589,
489
+ "step": 315
490
+ },
491
+ {
492
+ "epoch": 4.2105263157894735,
493
+ "grad_norm": 0.66796875,
494
+ "learning_rate": 0.00014350656864820733,
495
+ "loss": 0.9489,
496
+ "step": 320
497
+ },
498
+ {
499
+ "epoch": 4.276315789473684,
500
+ "grad_norm": 0.79296875,
501
+ "learning_rate": 0.00014142752562923988,
502
+ "loss": 0.9502,
503
+ "step": 325
504
+ },
505
+ {
506
+ "epoch": 4.342105263157895,
507
+ "grad_norm": 1.2578125,
508
+ "learning_rate": 0.0001393266353260583,
509
+ "loss": 0.9626,
510
+ "step": 330
511
+ },
512
+ {
513
+ "epoch": 4.407894736842105,
514
+ "grad_norm": 0.59765625,
515
+ "learning_rate": 0.00013720500566743362,
516
+ "loss": 0.9511,
517
+ "step": 335
518
+ },
519
+ {
520
+ "epoch": 4.473684210526316,
521
+ "grad_norm": 1.65625,
522
+ "learning_rate": 0.00013506375551927547,
523
+ "loss": 0.9456,
524
+ "step": 340
525
+ },
526
+ {
527
+ "epoch": 4.5394736842105265,
528
+ "grad_norm": 1.4765625,
529
+ "learning_rate": 0.00013290401409458532,
530
+ "loss": 0.9461,
531
+ "step": 345
532
+ },
533
+ {
534
+ "epoch": 4.605263157894737,
535
+ "grad_norm": 1.875,
536
+ "learning_rate": 0.00013072692035795305,
537
+ "loss": 0.9418,
538
+ "step": 350
539
+ },
540
+ {
541
+ "epoch": 4.671052631578947,
542
+ "grad_norm": 0.98828125,
543
+ "learning_rate": 0.00012853362242491053,
544
+ "loss": 0.9481,
545
+ "step": 355
546
+ },
547
+ {
548
+ "epoch": 4.7368421052631575,
549
+ "grad_norm": 0.87890625,
550
+ "learning_rate": 0.00012632527695645993,
551
+ "loss": 0.9461,
552
+ "step": 360
553
+ },
554
+ {
555
+ "epoch": 4.802631578947368,
556
+ "grad_norm": 1.140625,
557
+ "learning_rate": 0.00012410304854909495,
558
+ "loss": 0.9493,
559
+ "step": 365
560
+ },
561
+ {
562
+ "epoch": 4.868421052631579,
563
+ "grad_norm": 0.87890625,
564
+ "learning_rate": 0.0001218681091206376,
565
+ "loss": 0.9283,
566
+ "step": 370
567
+ },
568
+ {
569
+ "epoch": 4.934210526315789,
570
+ "grad_norm": 0.62109375,
571
+ "learning_rate": 0.0001196216372922136,
572
+ "loss": 0.9395,
573
+ "step": 375
574
+ },
575
+ {
576
+ "epoch": 5.0,
577
+ "grad_norm": 1.6484375,
578
+ "learning_rate": 0.00011736481776669306,
579
+ "loss": 0.9419,
580
+ "step": 380
581
+ },
582
+ {
583
+ "epoch": 5.0,
584
+ "eval_loss": 2.5376482009887695,
585
+ "eval_runtime": 0.2349,
586
+ "eval_samples_per_second": 42.567,
587
+ "eval_steps_per_second": 4.257,
588
+ "step": 380
589
+ },
590
+ {
591
+ "epoch": 5.065789473684211,
592
+ "grad_norm": 0.95703125,
593
+ "learning_rate": 0.00011509884070392369,
594
+ "loss": 0.9265,
595
+ "step": 385
596
+ },
597
+ {
598
+ "epoch": 5.131578947368421,
599
+ "grad_norm": 0.60546875,
600
+ "learning_rate": 0.00011282490109308633,
601
+ "loss": 0.9248,
602
+ "step": 390
603
+ },
604
+ {
605
+ "epoch": 5.197368421052632,
606
+ "grad_norm": 0.6484375,
607
+ "learning_rate": 0.00011054419812250338,
608
+ "loss": 0.915,
609
+ "step": 395
610
+ },
611
+ {
612
+ "epoch": 5.2631578947368425,
613
+ "grad_norm": 0.61328125,
614
+ "learning_rate": 0.00010825793454723325,
615
+ "loss": 0.9219,
616
+ "step": 400
617
+ },
618
+ {
619
+ "epoch": 5.328947368421053,
620
+ "grad_norm": 0.73046875,
621
+ "learning_rate": 0.0001059673160547834,
622
+ "loss": 0.9223,
623
+ "step": 405
624
+ },
625
+ {
626
+ "epoch": 5.394736842105263,
627
+ "grad_norm": 0.64453125,
628
+ "learning_rate": 0.00010367355062927726,
629
+ "loss": 0.9236,
630
+ "step": 410
631
+ },
632
+ {
633
+ "epoch": 5.4605263157894735,
634
+ "grad_norm": 0.6640625,
635
+ "learning_rate": 0.00010137784791440965,
636
+ "loss": 0.9252,
637
+ "step": 415
638
+ },
639
+ {
640
+ "epoch": 5.526315789473684,
641
+ "grad_norm": 0.7890625,
642
+ "learning_rate": 9.908141857552737e-05,
643
+ "loss": 0.917,
644
+ "step": 420
645
+ },
646
+ {
647
+ "epoch": 5.592105263157895,
648
+ "grad_norm": 0.671875,
649
+ "learning_rate": 9.678547366117083e-05,
650
+ "loss": 0.9213,
651
+ "step": 425
652
+ },
653
+ {
654
+ "epoch": 5.657894736842105,
655
+ "grad_norm": 0.84375,
656
+ "learning_rate": 9.449122396441345e-05,
657
+ "loss": 0.9098,
658
+ "step": 430
659
+ },
660
+ {
661
+ "epoch": 5.723684210526316,
662
+ "grad_norm": 0.8203125,
663
+ "learning_rate": 9.219987938433621e-05,
664
+ "loss": 0.9233,
665
+ "step": 435
666
+ },
667
+ {
668
+ "epoch": 5.7894736842105265,
669
+ "grad_norm": 0.77734375,
670
+ "learning_rate": 8.991264828797319e-05,
671
+ "loss": 0.9113,
672
+ "step": 440
673
+ },
674
+ {
675
+ "epoch": 5.855263157894737,
676
+ "grad_norm": 0.55859375,
677
+ "learning_rate": 8.763073687306524e-05,
678
+ "loss": 0.9198,
679
+ "step": 445
680
+ },
681
+ {
682
+ "epoch": 5.921052631578947,
683
+ "grad_norm": 1.1328125,
684
+ "learning_rate": 8.535534853195786e-05,
685
+ "loss": 0.9186,
686
+ "step": 450
687
+ },
688
+ {
689
+ "epoch": 5.9868421052631575,
690
+ "grad_norm": 0.6796875,
691
+ "learning_rate": 8.308768321697815e-05,
692
+ "loss": 0.9117,
693
+ "step": 455
694
+ },
695
+ {
696
+ "epoch": 6.0,
697
+ "eval_loss": 2.5333404541015625,
698
+ "eval_runtime": 0.2353,
699
+ "eval_samples_per_second": 42.503,
700
+ "eval_steps_per_second": 4.25,
701
+ "step": 456
702
+ },
703
+ {
704
+ "epoch": 6.052631578947368,
705
+ "grad_norm": 0.53515625,
706
+ "learning_rate": 8.082893680762619e-05,
707
+ "loss": 0.9067,
708
+ "step": 460
709
+ },
710
+ {
711
+ "epoch": 6.118421052631579,
712
+ "grad_norm": 0.73828125,
713
+ "learning_rate": 7.858030047991411e-05,
714
+ "loss": 0.9008,
715
+ "step": 465
716
+ },
717
+ {
718
+ "epoch": 6.184210526315789,
719
+ "grad_norm": 0.5546875,
720
+ "learning_rate": 7.634296007818576e-05,
721
+ "loss": 0.8891,
722
+ "step": 470
723
+ },
724
+ {
725
+ "epoch": 6.25,
726
+ "grad_norm": 0.59375,
727
+ "learning_rate": 7.411809548974792e-05,
728
+ "loss": 0.9044,
729
+ "step": 475
730
+ },
731
+ {
732
+ "epoch": 6.315789473684211,
733
+ "grad_norm": 0.53515625,
734
+ "learning_rate": 7.190688002264308e-05,
735
+ "loss": 0.8954,
736
+ "step": 480
737
+ },
738
+ {
739
+ "epoch": 6.381578947368421,
740
+ "grad_norm": 0.55859375,
741
+ "learning_rate": 6.971047978689189e-05,
742
+ "loss": 0.895,
743
+ "step": 485
744
+ },
745
+ {
746
+ "epoch": 6.447368421052632,
747
+ "grad_norm": 0.54296875,
748
+ "learning_rate": 6.753005307953167e-05,
749
+ "loss": 0.9084,
750
+ "step": 490
751
+ },
752
+ {
753
+ "epoch": 6.5131578947368425,
754
+ "grad_norm": 0.55859375,
755
+ "learning_rate": 6.536674977377496e-05,
756
+ "loss": 0.8972,
757
+ "step": 495
758
+ },
759
+ {
760
+ "epoch": 6.578947368421053,
761
+ "grad_norm": 0.66796875,
762
+ "learning_rate": 6.322171071261071e-05,
763
+ "loss": 0.8986,
764
+ "step": 500
765
+ },
766
+ {
767
+ "epoch": 6.644736842105263,
768
+ "grad_norm": 0.5234375,
769
+ "learning_rate": 6.109606710716741e-05,
770
+ "loss": 0.8889,
771
+ "step": 505
772
+ },
773
+ {
774
+ "epoch": 6.7105263157894735,
775
+ "grad_norm": 0.859375,
776
+ "learning_rate": 5.8990939940156e-05,
777
+ "loss": 0.9018,
778
+ "step": 510
779
+ },
780
+ {
781
+ "epoch": 6.776315789473684,
782
+ "grad_norm": 0.53515625,
783
+ "learning_rate": 5.690743937470657e-05,
784
+ "loss": 0.9085,
785
+ "step": 515
786
+ },
787
+ {
788
+ "epoch": 6.842105263157895,
789
+ "grad_norm": 0.640625,
790
+ "learning_rate": 5.484666416891109e-05,
791
+ "loss": 0.9082,
792
+ "step": 520
793
+ },
794
+ {
795
+ "epoch": 6.907894736842105,
796
+ "grad_norm": 0.54296875,
797
+ "learning_rate": 5.280970109638047e-05,
798
+ "loss": 0.8921,
799
+ "step": 525
800
+ },
801
+ {
802
+ "epoch": 6.973684210526316,
803
+ "grad_norm": 0.58203125,
804
+ "learning_rate": 5.079762437312219e-05,
805
+ "loss": 0.8944,
806
+ "step": 530
807
+ },
808
+ {
809
+ "epoch": 7.0,
810
+ "eval_loss": 2.5417115688323975,
811
+ "eval_runtime": 0.2355,
812
+ "eval_samples_per_second": 42.467,
813
+ "eval_steps_per_second": 4.247,
814
+ "step": 532
815
+ },
816
+ {
817
+ "epoch": 7.0394736842105265,
818
+ "grad_norm": 0.5078125,
819
+ "learning_rate": 4.8811495091039926e-05,
820
+ "loss": 0.8912,
821
+ "step": 535
822
+ },
823
+ {
824
+ "epoch": 7.105263157894737,
825
+ "grad_norm": 0.58984375,
826
+ "learning_rate": 4.685236065835443e-05,
827
+ "loss": 0.8893,
828
+ "step": 540
829
+ },
830
+ {
831
+ "epoch": 7.171052631578948,
832
+ "grad_norm": 0.5078125,
833
+ "learning_rate": 4.492125424724086e-05,
834
+ "loss": 0.8881,
835
+ "step": 545
836
+ },
837
+ {
838
+ "epoch": 7.2368421052631575,
839
+ "grad_norm": 0.5234375,
840
+ "learning_rate": 4.301919424897338e-05,
841
+ "loss": 0.8913,
842
+ "step": 550
843
+ },
844
+ {
845
+ "epoch": 7.302631578947368,
846
+ "grad_norm": 0.498046875,
847
+ "learning_rate": 4.114718373686481e-05,
848
+ "loss": 0.8871,
849
+ "step": 555
850
+ },
851
+ {
852
+ "epoch": 7.368421052631579,
853
+ "grad_norm": 0.55859375,
854
+ "learning_rate": 3.9306209937284346e-05,
855
+ "loss": 0.8854,
856
+ "step": 560
857
+ },
858
+ {
859
+ "epoch": 7.434210526315789,
860
+ "grad_norm": 0.6328125,
861
+ "learning_rate": 3.749724370903216e-05,
862
+ "loss": 0.8908,
863
+ "step": 565
864
+ },
865
+ {
866
+ "epoch": 7.5,
867
+ "grad_norm": 0.51171875,
868
+ "learning_rate": 3.5721239031346066e-05,
869
+ "loss": 0.8887,
870
+ "step": 570
871
+ },
872
+ {
873
+ "epoch": 7.565789473684211,
874
+ "grad_norm": 0.494140625,
875
+ "learning_rate": 3.3979132500809405e-05,
876
+ "loss": 0.8858,
877
+ "step": 575
878
+ },
879
+ {
880
+ "epoch": 7.631578947368421,
881
+ "grad_norm": 0.49609375,
882
+ "learning_rate": 3.227184283742591e-05,
883
+ "loss": 0.8847,
884
+ "step": 580
885
+ },
886
+ {
887
+ "epoch": 7.697368421052632,
888
+ "grad_norm": 0.48046875,
889
+ "learning_rate": 3.0600270400122335e-05,
890
+ "loss": 0.8808,
891
+ "step": 585
892
+ },
893
+ {
894
+ "epoch": 7.7631578947368425,
895
+ "grad_norm": 0.498046875,
896
+ "learning_rate": 2.89652967119336e-05,
897
+ "loss": 0.8825,
898
+ "step": 590
899
+ },
900
+ {
901
+ "epoch": 7.828947368421053,
902
+ "grad_norm": 0.470703125,
903
+ "learning_rate": 2.73677839951215e-05,
904
+ "loss": 0.8923,
905
+ "step": 595
906
+ },
907
+ {
908
+ "epoch": 7.894736842105263,
909
+ "grad_norm": 0.447265625,
910
+ "learning_rate": 2.5808574716471856e-05,
911
+ "loss": 0.8809,
912
+ "step": 600
913
+ },
914
+ {
915
+ "epoch": 7.9605263157894735,
916
+ "grad_norm": 0.490234375,
917
+ "learning_rate": 2.4288491143009795e-05,
918
+ "loss": 0.8824,
919
+ "step": 605
920
+ },
921
+ {
922
+ "epoch": 8.0,
923
+ "eval_loss": 2.547354221343994,
924
+ "eval_runtime": 0.2365,
925
+ "eval_samples_per_second": 42.286,
926
+ "eval_steps_per_second": 4.229,
927
+ "step": 608
928
+ },
929
+ {
930
+ "epoch": 8.026315789473685,
931
+ "grad_norm": 0.478515625,
932
+ "learning_rate": 2.2808334908367914e-05,
933
+ "loss": 0.8926,
934
+ "step": 610
935
+ },
936
+ {
937
+ "epoch": 8.092105263157896,
938
+ "grad_norm": 0.48828125,
939
+ "learning_rate": 2.1368886590035443e-05,
940
+ "loss": 0.8774,
941
+ "step": 615
942
+ },
943
+ {
944
+ "epoch": 8.157894736842104,
945
+ "grad_norm": 0.4921875,
946
+ "learning_rate": 1.9970905297711606e-05,
947
+ "loss": 0.8834,
948
+ "step": 620
949
+ },
950
+ {
951
+ "epoch": 8.223684210526315,
952
+ "grad_norm": 0.53125,
953
+ "learning_rate": 1.861512827298051e-05,
954
+ "loss": 0.8845,
955
+ "step": 625
956
+ },
957
+ {
958
+ "epoch": 8.289473684210526,
959
+ "grad_norm": 0.5859375,
960
+ "learning_rate": 1.7302270500518182e-05,
961
+ "loss": 0.8801,
962
+ "step": 630
963
+ },
964
+ {
965
+ "epoch": 8.355263157894736,
966
+ "grad_norm": 0.49609375,
967
+ "learning_rate": 1.6033024331037138e-05,
968
+ "loss": 0.873,
969
+ "step": 635
970
+ },
971
+ {
972
+ "epoch": 8.421052631578947,
973
+ "grad_norm": 0.5078125,
974
+ "learning_rate": 1.4808059116167305e-05,
975
+ "loss": 0.8795,
976
+ "step": 640
977
+ },
978
+ {
979
+ "epoch": 8.486842105263158,
980
+ "grad_norm": 0.466796875,
981
+ "learning_rate": 1.3628020855465572e-05,
982
+ "loss": 0.8807,
983
+ "step": 645
984
+ },
985
+ {
986
+ "epoch": 8.552631578947368,
987
+ "grad_norm": 0.46484375,
988
+ "learning_rate": 1.2493531855740625e-05,
989
+ "loss": 0.8821,
990
+ "step": 650
991
+ },
992
+ {
993
+ "epoch": 8.618421052631579,
994
+ "grad_norm": 0.47265625,
995
+ "learning_rate": 1.1405190402872202e-05,
996
+ "loss": 0.8731,
997
+ "step": 655
998
+ },
999
+ {
1000
+ "epoch": 8.68421052631579,
1001
+ "grad_norm": 0.466796875,
1002
+ "learning_rate": 1.0363570446297999e-05,
1003
+ "loss": 0.8878,
1004
+ "step": 660
1005
+ },
1006
+ {
1007
+ "epoch": 8.75,
1008
+ "grad_norm": 0.53125,
1009
+ "learning_rate": 9.369221296335006e-06,
1010
+ "loss": 0.8827,
1011
+ "step": 665
1012
+ },
1013
+ {
1014
+ "epoch": 8.81578947368421,
1015
+ "grad_norm": 0.46484375,
1016
+ "learning_rate": 8.422667334494249e-06,
1017
+ "loss": 0.8847,
1018
+ "step": 670
1019
+ },
1020
+ {
1021
+ "epoch": 8.881578947368421,
1022
+ "grad_norm": 0.48046875,
1023
+ "learning_rate": 7.524407736942174e-06,
1024
+ "loss": 0.8844,
1025
+ "step": 675
1026
+ },
1027
+ {
1028
+ "epoch": 8.947368421052632,
1029
+ "grad_norm": 0.53125,
1030
+ "learning_rate": 6.674916211254289e-06,
1031
+ "loss": 0.8759,
1032
+ "step": 680
1033
+ },
1034
+ {
1035
+ "epoch": 9.0,
1036
+ "eval_loss": 2.5541369915008545,
1037
+ "eval_runtime": 0.2345,
1038
+ "eval_samples_per_second": 42.638,
1039
+ "eval_steps_per_second": 4.264,
1040
+ "step": 684
1041
+ },
1042
+ {
1043
+ "epoch": 9.013157894736842,
1044
+ "grad_norm": 0.455078125,
1045
+ "learning_rate": 5.8746407466000464e-06,
1046
+ "loss": 0.8768,
1047
+ "step": 685
1048
+ },
1049
+ {
1050
+ "epoch": 9.078947368421053,
1051
+ "grad_norm": 0.4765625,
1052
+ "learning_rate": 5.124003377490582e-06,
1053
+ "loss": 0.8787,
1054
+ "step": 690
1055
+ },
1056
+ {
1057
+ "epoch": 9.144736842105264,
1058
+ "grad_norm": 0.515625,
1059
+ "learning_rate": 4.423399961213892e-06,
1060
+ "loss": 0.8874,
1061
+ "step": 695
1062
+ },
1063
+ {
1064
+ "epoch": 9.210526315789474,
1065
+ "grad_norm": 0.462890625,
1066
+ "learning_rate": 3.7731999690749585e-06,
1067
+ "loss": 0.875,
1068
+ "step": 700
1069
+ },
1070
+ {
1071
+ "epoch": 9.276315789473685,
1072
+ "grad_norm": 0.46875,
1073
+ "learning_rate": 3.1737462915508277e-06,
1074
+ "loss": 0.8777,
1075
+ "step": 705
1076
+ },
1077
+ {
1078
+ "epoch": 9.342105263157896,
1079
+ "grad_norm": 0.46875,
1080
+ "learning_rate": 2.6253550574632303e-06,
1081
+ "loss": 0.8728,
1082
+ "step": 710
1083
+ },
1084
+ {
1085
+ "epoch": 9.407894736842104,
1086
+ "grad_norm": 0.470703125,
1087
+ "learning_rate": 2.128315467264552e-06,
1088
+ "loss": 0.8791,
1089
+ "step": 715
1090
+ },
1091
+ {
1092
+ "epoch": 9.473684210526315,
1093
+ "grad_norm": 0.4765625,
1094
+ "learning_rate": 1.6828896405244988e-06,
1095
+ "loss": 0.8809,
1096
+ "step": 720
1097
+ },
1098
+ {
1099
+ "epoch": 9.539473684210526,
1100
+ "grad_norm": 0.458984375,
1101
+ "learning_rate": 1.28931247769839e-06,
1102
+ "loss": 0.8836,
1103
+ "step": 725
1104
+ },
1105
+ {
1106
+ "epoch": 9.605263157894736,
1107
+ "grad_norm": 0.455078125,
1108
+ "learning_rate": 9.477915362496758e-07,
1109
+ "loss": 0.8819,
1110
+ "step": 730
1111
+ },
1112
+ {
1113
+ "epoch": 9.671052631578947,
1114
+ "grad_norm": 0.4609375,
1115
+ "learning_rate": 6.585069211921035e-07,
1116
+ "loss": 0.874,
1117
+ "step": 735
1118
+ },
1119
+ {
1120
+ "epoch": 9.736842105263158,
1121
+ "grad_norm": 0.458984375,
1122
+ "learning_rate": 4.216111901092501e-07,
1123
+ "loss": 0.8871,
1124
+ "step": 740
1125
+ },
1126
+ {
1127
+ "epoch": 9.802631578947368,
1128
+ "grad_norm": 0.490234375,
1129
+ "learning_rate": 2.372292727015557e-07,
1130
+ "loss": 0.8811,
1131
+ "step": 745
1132
+ },
1133
+ {
1134
+ "epoch": 9.868421052631579,
1135
+ "grad_norm": 0.45703125,
1136
+ "learning_rate": 1.0545840490313596e-07,
1137
+ "loss": 0.8768,
1138
+ "step": 750
1139
+ },
1140
+ {
1141
+ "epoch": 9.93421052631579,
1142
+ "grad_norm": 0.46875,
1143
+ "learning_rate": 2.6368077603367015e-08,
1144
+ "loss": 0.8774,
1145
+ "step": 755
1146
+ },
1147
+ {
1148
+ "epoch": 10.0,
1149
+ "grad_norm": 0.47265625,
1150
+ "learning_rate": 0.0,
1151
+ "loss": 0.8735,
1152
+ "step": 760
1153
+ },
1154
+ {
1155
+ "epoch": 10.0,
1156
+ "eval_loss": 2.5523786544799805,
1157
+ "eval_runtime": 0.2353,
1158
+ "eval_samples_per_second": 42.494,
1159
+ "eval_steps_per_second": 4.249,
1160
+ "step": 760
1161
+ },
1162
+ {
1163
+ "epoch": 10.0,
1164
+ "step": 760,
1165
+ "total_flos": 2.318334860390826e+18,
1166
+ "train_loss": 2.2621336485210217,
1167
+ "train_runtime": 1856.2046,
1168
+ "train_samples_per_second": 26.156,
1169
+ "train_steps_per_second": 0.409
1170
+ }
1171
+ ],
1172
+ "logging_steps": 5,
1173
+ "max_steps": 760,
1174
+ "num_input_tokens_seen": 0,
1175
+ "num_train_epochs": 10,
1176
+ "save_steps": 100,
1177
+ "total_flos": 2.318334860390826e+18,
1178
+ "train_batch_size": 4,
1179
+ "trial_name": null,
1180
+ "trial_params": null
1181
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e72d22e36be57e700734a4d42088c5a29b0893c81375cd69c80ac04f2224952
3
+ size 5176