amazingvince commited on
Commit
ca671b7
·
verified ·
1 Parent(s): bcf6ffc

Upload folder using huggingface_hub

Browse files
Files changed (40) hide show
  1. .gitattributes +1 -0
  2. .hydra/config.yaml +63 -0
  3. .hydra/hydra.yaml +156 -0
  4. .hydra/overrides.yaml +1 -0
  5. .ipynb_checkpoints/config-checkpoint.json +34 -0
  6. .ipynb_checkpoints/test-checkpoint.py +25 -0
  7. README.md +18 -0
  8. amazingvince/ul3-base/added_tokens.json +102 -0
  9. amazingvince/ul3-base/config.json +35 -0
  10. amazingvince/ul3-base/model.safetensors +3 -0
  11. amazingvince/ul3-base/special_tokens_map.json +132 -0
  12. amazingvince/ul3-base/tokenizer.json +0 -0
  13. amazingvince/ul3-base/tokenizer.model +3 -0
  14. amazingvince/ul3-base/tokenizer_config.json +952 -0
  15. checkpoint-pt-21000/model.safetensors +3 -0
  16. checkpoint-pt-21000/random_states_0.pkl +3 -0
  17. checkpoint-pt-22500/model.safetensors +3 -0
  18. checkpoint-pt-22500/random_states_0.pkl +3 -0
  19. checkpoint-pt-25500/model.safetensors +3 -0
  20. checkpoint-pt-25500/random_states_0.pkl +3 -0
  21. checkpoint-pt-27000/config.json +34 -0
  22. checkpoint-pt-27000/model.safetensors +3 -0
  23. checkpoint-pt-27000/random_states_0.pkl +3 -0
  24. checkpoint-pt-28500/config.json +34 -0
  25. checkpoint-pt-28500/model.safetensors +3 -0
  26. checkpoint-pt-28500/random_states_0.pkl +3 -0
  27. config.json +87 -0
  28. main.log +189 -0
  29. test.py +25 -0
  30. wandb/debug-internal.log +8 -0
  31. wandb/debug.log +28 -0
  32. wandb/run-20241020_182518-i0qk9v3k/files/config.yaml +123 -0
  33. wandb/run-20241020_182518-i0qk9v3k/files/output.log +259 -0
  34. wandb/run-20241020_182518-i0qk9v3k/files/requirements.txt +194 -0
  35. wandb/run-20241020_182518-i0qk9v3k/files/wandb-metadata.json +41 -0
  36. wandb/run-20241020_182518-i0qk9v3k/files/wandb-summary.json +1 -0
  37. wandb/run-20241020_182518-i0qk9v3k/logs/debug-core.log +14 -0
  38. wandb/run-20241020_182518-i0qk9v3k/logs/debug-internal.log +8 -0
  39. wandb/run-20241020_182518-i0qk9v3k/logs/debug.log +28 -0
  40. wandb/run-20241020_182518-i0qk9v3k/run-i0qk9v3k.wandb +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ wandb/run-20241020_182518-i0qk9v3k/run-i0qk9v3k.wandb filter=lfs diff=lfs merge=lfs -text
.hydra/config.yaml ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ mode: pt
2
+ device: gpu
3
+ precision: bf16
4
+ eval_only: false
5
+ predict_only: false
6
+ seed: 93789
7
+ tokenizer:
8
+ name: BEE-spoke-data/hf_slimpajama-6B-28672-BPE-forT5
9
+ working_dir: null
10
+ model:
11
+ liger: true
12
+ klass: local_t5
13
+ name: pszemraj/tFINE-850m-24x24-1024ctx
14
+ overwrite:
15
+ dropout_rate: 0.0
16
+ num_decoder_layers: 16
17
+ num_key_value_heads: 4
18
+ num_layers: 16
19
+ use_gqa: true
20
+ add_config:
21
+ is_bf16: false
22
+ checkpoint_path: ''
23
+ random_init: true
24
+ compile: true
25
+ data:
26
+ multi_task: true
27
+ NTP: 0.3
28
+ input_length: 512
29
+ max_seq_len: 512
30
+ mlm_probability: 0.15
31
+ mean_noise_span_length: 3.0
32
+ num_workers: 0
33
+ optim:
34
+ name: adamwscale
35
+ base_lr: 0.001
36
+ batch_size: 128
37
+ total_steps: 65536
38
+ epochs: -1
39
+ warmup_steps: 5000
40
+ lr_scheduler: cosine
41
+ weight_decay: 0.01
42
+ grad_clip: 1.0
43
+ grad_acc: 16
44
+ final_cosine: 2.0e-05
45
+ eval:
46
+ every_steps: 500
47
+ steps: 0
48
+ checkpoint:
49
+ every_steps: 1500
50
+ logging:
51
+ every_steps: 25
52
+ grad_l2: true
53
+ weights_l2: true
54
+ use_wandb: true
55
+ wandb_config:
56
+ project: nanoT5
57
+ entity: amazingvince
58
+ tags:
59
+ - gqa
60
+ - large
61
+ - e32-d16
62
+ - 512 ctx
63
+ mode: online
.hydra/hydra.yaml ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ./logs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task: []
115
+ job:
116
+ name: main
117
+ chdir: true
118
+ override_dirname: ''
119
+ id: ???
120
+ num: ???
121
+ config_name: default
122
+ env_set: {}
123
+ env_copy: []
124
+ config:
125
+ override_dirname:
126
+ kv_sep: '='
127
+ item_sep: ','
128
+ exclude_keys: []
129
+ runtime:
130
+ version: 1.3.2
131
+ version_base: '1.1'
132
+ cwd: /workspace/nanoT5
133
+ config_sources:
134
+ - path: hydra.conf
135
+ schema: pkg
136
+ provider: hydra
137
+ - path: /workspace/nanoT5/nanoT5/configs
138
+ schema: file
139
+ provider: main
140
+ - path: ''
141
+ schema: structured
142
+ provider: schema
143
+ output_dir: /workspace/nanoT5/logs/2024-10-20/18-25-17
144
+ choices:
145
+ local_env: default
146
+ task: pt
147
+ hydra/env: default
148
+ hydra/callbacks: null
149
+ hydra/job_logging: default
150
+ hydra/hydra_logging: default
151
+ hydra/hydra_help: default
152
+ hydra/help: default
153
+ hydra/sweeper: basic
154
+ hydra/launcher: basic
155
+ hydra/output: default
156
+ verbose: false
.hydra/overrides.yaml ADDED
@@ -0,0 +1 @@
 
 
1
+ []
.ipynb_checkpoints/config-checkpoint.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "pszemraj/tFINE-850m-24x24-1024ctx",
3
+ "architectures": [
4
+ "T5ForConditionalGeneration"
5
+ ],
6
+ "classifier_dropout": 0.0,
7
+ "d_ff": 3072,
8
+ "d_kv": 64,
9
+ "d_model": 1024,
10
+ "decoder_start_token_id": 3,
11
+ "dense_act_fn": "silu",
12
+ "dropout_rate": 0.0,
13
+ "eos_token_id": 2,
14
+ "feed_forward_proj": "gated-silu",
15
+ "initializer_factor": 1.0,
16
+ "is_bf16": true,
17
+ "is_encoder_decoder": false,
18
+ "is_gated_act": true,
19
+ "layer_norm_epsilon": 1e-06,
20
+ "model_type": "t5",
21
+ "num_decoder_layers": 16,
22
+ "num_heads": 16,
23
+ "num_key_value_heads": 4,
24
+ "num_layers": 16,
25
+ "output_past": true,
26
+ "pad_token_id": 3,
27
+ "relative_attention_max_distance": 128,
28
+ "relative_attention_num_buckets": 48,
29
+ "tie_word_embeddings": false,
30
+ "transformers_version": "4.46.0.dev0",
31
+ "use_cache": true,
32
+ "use_gqa": true,
33
+ "vocab_size": 28776
34
+ }
.ipynb_checkpoints/test-checkpoint.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
2
+ import torch
3
+
4
+ tokenizer = AutoTokenizer.from_pretrained("BEE-spoke-data/hf_slimpajama-6B-28672-BPE-forT5")
5
+
6
+ special_tokens_dict = {'additional_special_tokens': ['[R]', '[S]', '[X]', '[NTP]']}
7
+ tokenizer.add_special_tokens(special_tokens_dict)
8
+
9
+ model = AutoModelForSeq2SeqLM.from_pretrained("/workspace/nanoT5/logs/2024-10-20/18-25-17/checkpoint-pt-27000").to("cuda")
10
+ prompt = "The "
11
+ inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
12
+ # Add decoder_input_ids
13
+ # decoder_input_ids = torch.ones((inputs.input_ids.shape[0], 1), dtype=torch.long) * model.config.decoder_start_token_id
14
+
15
+ # Generate
16
+ generated_ids = model.generate(
17
+ **inputs,
18
+ # decoder_input_ids=decoder_input_ids,
19
+ max_new_tokens=20,
20
+ no_repeat_ngram_size=5
21
+ )
22
+
23
+ # Decode the output
24
+ generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
25
+ print(generated_text)
README.md ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ license: mit
4
+ tags: ['t5', 'transformers']
5
+ ---
6
+
7
+ # amazingvince/ul3-base
8
+
9
+ Description of your model
10
+
11
+ ## Usage
12
+
13
+ ```python
14
+ from transformers import AutoModel, AutoTokenizer
15
+
16
+ tokenizer = AutoTokenizer.from_pretrained("amazingvince/ul3-base")
17
+ model = AutoModel.from_pretrained("amazingvince/ul3-base")
18
+ ```
amazingvince/ul3-base/added_tokens.json ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<extra_id_0>": 28672,
3
+ "<extra_id_10>": 28682,
4
+ "<extra_id_11>": 28683,
5
+ "<extra_id_12>": 28684,
6
+ "<extra_id_13>": 28685,
7
+ "<extra_id_14>": 28686,
8
+ "<extra_id_15>": 28687,
9
+ "<extra_id_16>": 28688,
10
+ "<extra_id_17>": 28689,
11
+ "<extra_id_18>": 28690,
12
+ "<extra_id_19>": 28691,
13
+ "<extra_id_1>": 28673,
14
+ "<extra_id_20>": 28692,
15
+ "<extra_id_21>": 28693,
16
+ "<extra_id_22>": 28694,
17
+ "<extra_id_23>": 28695,
18
+ "<extra_id_24>": 28696,
19
+ "<extra_id_25>": 28697,
20
+ "<extra_id_26>": 28698,
21
+ "<extra_id_27>": 28699,
22
+ "<extra_id_28>": 28700,
23
+ "<extra_id_29>": 28701,
24
+ "<extra_id_2>": 28674,
25
+ "<extra_id_30>": 28702,
26
+ "<extra_id_31>": 28703,
27
+ "<extra_id_32>": 28704,
28
+ "<extra_id_33>": 28705,
29
+ "<extra_id_34>": 28706,
30
+ "<extra_id_35>": 28707,
31
+ "<extra_id_36>": 28708,
32
+ "<extra_id_37>": 28709,
33
+ "<extra_id_38>": 28710,
34
+ "<extra_id_39>": 28711,
35
+ "<extra_id_3>": 28675,
36
+ "<extra_id_40>": 28712,
37
+ "<extra_id_41>": 28713,
38
+ "<extra_id_42>": 28714,
39
+ "<extra_id_43>": 28715,
40
+ "<extra_id_44>": 28716,
41
+ "<extra_id_45>": 28717,
42
+ "<extra_id_46>": 28718,
43
+ "<extra_id_47>": 28719,
44
+ "<extra_id_48>": 28720,
45
+ "<extra_id_49>": 28721,
46
+ "<extra_id_4>": 28676,
47
+ "<extra_id_50>": 28722,
48
+ "<extra_id_51>": 28723,
49
+ "<extra_id_52>": 28724,
50
+ "<extra_id_53>": 28725,
51
+ "<extra_id_54>": 28726,
52
+ "<extra_id_55>": 28727,
53
+ "<extra_id_56>": 28728,
54
+ "<extra_id_57>": 28729,
55
+ "<extra_id_58>": 28730,
56
+ "<extra_id_59>": 28731,
57
+ "<extra_id_5>": 28677,
58
+ "<extra_id_60>": 28732,
59
+ "<extra_id_61>": 28733,
60
+ "<extra_id_62>": 28734,
61
+ "<extra_id_63>": 28735,
62
+ "<extra_id_64>": 28736,
63
+ "<extra_id_65>": 28737,
64
+ "<extra_id_66>": 28738,
65
+ "<extra_id_67>": 28739,
66
+ "<extra_id_68>": 28740,
67
+ "<extra_id_69>": 28741,
68
+ "<extra_id_6>": 28678,
69
+ "<extra_id_70>": 28742,
70
+ "<extra_id_71>": 28743,
71
+ "<extra_id_72>": 28744,
72
+ "<extra_id_73>": 28745,
73
+ "<extra_id_74>": 28746,
74
+ "<extra_id_75>": 28747,
75
+ "<extra_id_76>": 28748,
76
+ "<extra_id_77>": 28749,
77
+ "<extra_id_78>": 28750,
78
+ "<extra_id_79>": 28751,
79
+ "<extra_id_7>": 28679,
80
+ "<extra_id_80>": 28752,
81
+ "<extra_id_81>": 28753,
82
+ "<extra_id_82>": 28754,
83
+ "<extra_id_83>": 28755,
84
+ "<extra_id_84>": 28756,
85
+ "<extra_id_85>": 28757,
86
+ "<extra_id_86>": 28758,
87
+ "<extra_id_87>": 28759,
88
+ "<extra_id_88>": 28760,
89
+ "<extra_id_89>": 28761,
90
+ "<extra_id_8>": 28680,
91
+ "<extra_id_90>": 28762,
92
+ "<extra_id_91>": 28763,
93
+ "<extra_id_92>": 28764,
94
+ "<extra_id_93>": 28765,
95
+ "<extra_id_94>": 28766,
96
+ "<extra_id_95>": 28767,
97
+ "<extra_id_96>": 28768,
98
+ "<extra_id_97>": 28769,
99
+ "<extra_id_98>": 28770,
100
+ "<extra_id_99>": 28771,
101
+ "<extra_id_9>": 28681
102
+ }
amazingvince/ul3-base/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/workspace/nanoT5/logs/2024-10-20/18-25-17/checkpoint-pt-27000",
3
+ "architectures": [
4
+ "T5Model"
5
+ ],
6
+ "classifier_dropout": 0.0,
7
+ "d_ff": 3072,
8
+ "d_kv": 64,
9
+ "d_model": 1024,
10
+ "decoder_start_token_id": 3,
11
+ "dense_act_fn": "silu",
12
+ "dropout_rate": 0.0,
13
+ "eos_token_id": 2,
14
+ "feed_forward_proj": "gated-silu",
15
+ "initializer_factor": 1.0,
16
+ "is_bf16": true,
17
+ "is_encoder_decoder": true,
18
+ "is_gated_act": true,
19
+ "layer_norm_epsilon": 1e-06,
20
+ "model_type": "t5",
21
+ "num_decoder_layers": 16,
22
+ "num_heads": 16,
23
+ "num_key_value_heads": 4,
24
+ "num_layers": 16,
25
+ "output_past": true,
26
+ "pad_token_id": 3,
27
+ "relative_attention_max_distance": 128,
28
+ "relative_attention_num_buckets": 48,
29
+ "tie_word_embeddings": false,
30
+ "torch_dtype": "float32",
31
+ "transformers_version": "4.46.0.dev0",
32
+ "use_cache": true,
33
+ "use_gqa": true,
34
+ "vocab_size": 28776
35
+ }
amazingvince/ul3-base/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bda83debb4d9f284d7be9c454e66a4c966c10e0919bc07c73ce47ad9a94ba11c
3
+ size 1829529944
amazingvince/ul3-base/special_tokens_map.json ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<extra_id_0>",
4
+ "<extra_id_1>",
5
+ "<extra_id_2>",
6
+ "<extra_id_3>",
7
+ "<extra_id_4>",
8
+ "<extra_id_5>",
9
+ "<extra_id_6>",
10
+ "<extra_id_7>",
11
+ "<extra_id_8>",
12
+ "<extra_id_9>",
13
+ "<extra_id_10>",
14
+ "<extra_id_11>",
15
+ "<extra_id_12>",
16
+ "<extra_id_13>",
17
+ "<extra_id_14>",
18
+ "<extra_id_15>",
19
+ "<extra_id_16>",
20
+ "<extra_id_17>",
21
+ "<extra_id_18>",
22
+ "<extra_id_19>",
23
+ "<extra_id_20>",
24
+ "<extra_id_21>",
25
+ "<extra_id_22>",
26
+ "<extra_id_23>",
27
+ "<extra_id_24>",
28
+ "<extra_id_25>",
29
+ "<extra_id_26>",
30
+ "<extra_id_27>",
31
+ "<extra_id_28>",
32
+ "<extra_id_29>",
33
+ "<extra_id_30>",
34
+ "<extra_id_31>",
35
+ "<extra_id_32>",
36
+ "<extra_id_33>",
37
+ "<extra_id_34>",
38
+ "<extra_id_35>",
39
+ "<extra_id_36>",
40
+ "<extra_id_37>",
41
+ "<extra_id_38>",
42
+ "<extra_id_39>",
43
+ "<extra_id_40>",
44
+ "<extra_id_41>",
45
+ "<extra_id_42>",
46
+ "<extra_id_43>",
47
+ "<extra_id_44>",
48
+ "<extra_id_45>",
49
+ "<extra_id_46>",
50
+ "<extra_id_47>",
51
+ "<extra_id_48>",
52
+ "<extra_id_49>",
53
+ "<extra_id_50>",
54
+ "<extra_id_51>",
55
+ "<extra_id_52>",
56
+ "<extra_id_53>",
57
+ "<extra_id_54>",
58
+ "<extra_id_55>",
59
+ "<extra_id_56>",
60
+ "<extra_id_57>",
61
+ "<extra_id_58>",
62
+ "<extra_id_59>",
63
+ "<extra_id_60>",
64
+ "<extra_id_61>",
65
+ "<extra_id_62>",
66
+ "<extra_id_63>",
67
+ "<extra_id_64>",
68
+ "<extra_id_65>",
69
+ "<extra_id_66>",
70
+ "<extra_id_67>",
71
+ "<extra_id_68>",
72
+ "<extra_id_69>",
73
+ "<extra_id_70>",
74
+ "<extra_id_71>",
75
+ "<extra_id_72>",
76
+ "<extra_id_73>",
77
+ "<extra_id_74>",
78
+ "<extra_id_75>",
79
+ "<extra_id_76>",
80
+ "<extra_id_77>",
81
+ "<extra_id_78>",
82
+ "<extra_id_79>",
83
+ "<extra_id_80>",
84
+ "<extra_id_81>",
85
+ "<extra_id_82>",
86
+ "<extra_id_83>",
87
+ "<extra_id_84>",
88
+ "<extra_id_85>",
89
+ "<extra_id_86>",
90
+ "<extra_id_87>",
91
+ "<extra_id_88>",
92
+ "<extra_id_89>",
93
+ "<extra_id_90>",
94
+ "<extra_id_91>",
95
+ "<extra_id_92>",
96
+ "<extra_id_93>",
97
+ "<extra_id_94>",
98
+ "<extra_id_95>",
99
+ "<extra_id_96>",
100
+ "<extra_id_97>",
101
+ "<extra_id_98>",
102
+ "<extra_id_99>"
103
+ ],
104
+ "bos_token": {
105
+ "content": "<s>",
106
+ "lstrip": false,
107
+ "normalized": true,
108
+ "rstrip": false,
109
+ "single_word": false
110
+ },
111
+ "eos_token": {
112
+ "content": "</s>",
113
+ "lstrip": false,
114
+ "normalized": true,
115
+ "rstrip": false,
116
+ "single_word": false
117
+ },
118
+ "pad_token": {
119
+ "content": "<pad>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false
124
+ },
125
+ "unk_token": {
126
+ "content": "<unk>",
127
+ "lstrip": false,
128
+ "normalized": true,
129
+ "rstrip": false,
130
+ "single_word": false
131
+ }
132
+ }
amazingvince/ul3-base/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
amazingvince/ul3-base/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b01740e92325d7d8300b8f4c4249cd4bcb70533fe06e5632a431b37d91d7a2a
3
+ size 711026
amazingvince/ul3-base/tokenizer_config.json ADDED
@@ -0,0 +1,952 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": true,
4
+ "add_prefix_space": true,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": true,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "3": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "28672": {
39
+ "content": "<extra_id_0>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "28673": {
47
+ "content": "<extra_id_1>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "28674": {
55
+ "content": "<extra_id_2>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "28675": {
63
+ "content": "<extra_id_3>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "28676": {
71
+ "content": "<extra_id_4>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "28677": {
79
+ "content": "<extra_id_5>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "28678": {
87
+ "content": "<extra_id_6>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "28679": {
95
+ "content": "<extra_id_7>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "28680": {
103
+ "content": "<extra_id_8>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "28681": {
111
+ "content": "<extra_id_9>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "28682": {
119
+ "content": "<extra_id_10>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": true
125
+ },
126
+ "28683": {
127
+ "content": "<extra_id_11>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": true
133
+ },
134
+ "28684": {
135
+ "content": "<extra_id_12>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": true
141
+ },
142
+ "28685": {
143
+ "content": "<extra_id_13>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": true
149
+ },
150
+ "28686": {
151
+ "content": "<extra_id_14>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": true
157
+ },
158
+ "28687": {
159
+ "content": "<extra_id_15>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": true
165
+ },
166
+ "28688": {
167
+ "content": "<extra_id_16>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": true
173
+ },
174
+ "28689": {
175
+ "content": "<extra_id_17>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": true
181
+ },
182
+ "28690": {
183
+ "content": "<extra_id_18>",
184
+ "lstrip": false,
185
+ "normalized": false,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": true
189
+ },
190
+ "28691": {
191
+ "content": "<extra_id_19>",
192
+ "lstrip": false,
193
+ "normalized": false,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": true
197
+ },
198
+ "28692": {
199
+ "content": "<extra_id_20>",
200
+ "lstrip": false,
201
+ "normalized": false,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": true
205
+ },
206
+ "28693": {
207
+ "content": "<extra_id_21>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": true
213
+ },
214
+ "28694": {
215
+ "content": "<extra_id_22>",
216
+ "lstrip": false,
217
+ "normalized": false,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": true
221
+ },
222
+ "28695": {
223
+ "content": "<extra_id_23>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": true
229
+ },
230
+ "28696": {
231
+ "content": "<extra_id_24>",
232
+ "lstrip": false,
233
+ "normalized": false,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": true
237
+ },
238
+ "28697": {
239
+ "content": "<extra_id_25>",
240
+ "lstrip": false,
241
+ "normalized": false,
242
+ "rstrip": false,
243
+ "single_word": false,
244
+ "special": true
245
+ },
246
+ "28698": {
247
+ "content": "<extra_id_26>",
248
+ "lstrip": false,
249
+ "normalized": false,
250
+ "rstrip": false,
251
+ "single_word": false,
252
+ "special": true
253
+ },
254
+ "28699": {
255
+ "content": "<extra_id_27>",
256
+ "lstrip": false,
257
+ "normalized": false,
258
+ "rstrip": false,
259
+ "single_word": false,
260
+ "special": true
261
+ },
262
+ "28700": {
263
+ "content": "<extra_id_28>",
264
+ "lstrip": false,
265
+ "normalized": false,
266
+ "rstrip": false,
267
+ "single_word": false,
268
+ "special": true
269
+ },
270
+ "28701": {
271
+ "content": "<extra_id_29>",
272
+ "lstrip": false,
273
+ "normalized": false,
274
+ "rstrip": false,
275
+ "single_word": false,
276
+ "special": true
277
+ },
278
+ "28702": {
279
+ "content": "<extra_id_30>",
280
+ "lstrip": false,
281
+ "normalized": false,
282
+ "rstrip": false,
283
+ "single_word": false,
284
+ "special": true
285
+ },
286
+ "28703": {
287
+ "content": "<extra_id_31>",
288
+ "lstrip": false,
289
+ "normalized": false,
290
+ "rstrip": false,
291
+ "single_word": false,
292
+ "special": true
293
+ },
294
+ "28704": {
295
+ "content": "<extra_id_32>",
296
+ "lstrip": false,
297
+ "normalized": false,
298
+ "rstrip": false,
299
+ "single_word": false,
300
+ "special": true
301
+ },
302
+ "28705": {
303
+ "content": "<extra_id_33>",
304
+ "lstrip": false,
305
+ "normalized": false,
306
+ "rstrip": false,
307
+ "single_word": false,
308
+ "special": true
309
+ },
310
+ "28706": {
311
+ "content": "<extra_id_34>",
312
+ "lstrip": false,
313
+ "normalized": false,
314
+ "rstrip": false,
315
+ "single_word": false,
316
+ "special": true
317
+ },
318
+ "28707": {
319
+ "content": "<extra_id_35>",
320
+ "lstrip": false,
321
+ "normalized": false,
322
+ "rstrip": false,
323
+ "single_word": false,
324
+ "special": true
325
+ },
326
+ "28708": {
327
+ "content": "<extra_id_36>",
328
+ "lstrip": false,
329
+ "normalized": false,
330
+ "rstrip": false,
331
+ "single_word": false,
332
+ "special": true
333
+ },
334
+ "28709": {
335
+ "content": "<extra_id_37>",
336
+ "lstrip": false,
337
+ "normalized": false,
338
+ "rstrip": false,
339
+ "single_word": false,
340
+ "special": true
341
+ },
342
+ "28710": {
343
+ "content": "<extra_id_38>",
344
+ "lstrip": false,
345
+ "normalized": false,
346
+ "rstrip": false,
347
+ "single_word": false,
348
+ "special": true
349
+ },
350
+ "28711": {
351
+ "content": "<extra_id_39>",
352
+ "lstrip": false,
353
+ "normalized": false,
354
+ "rstrip": false,
355
+ "single_word": false,
356
+ "special": true
357
+ },
358
+ "28712": {
359
+ "content": "<extra_id_40>",
360
+ "lstrip": false,
361
+ "normalized": false,
362
+ "rstrip": false,
363
+ "single_word": false,
364
+ "special": true
365
+ },
366
+ "28713": {
367
+ "content": "<extra_id_41>",
368
+ "lstrip": false,
369
+ "normalized": false,
370
+ "rstrip": false,
371
+ "single_word": false,
372
+ "special": true
373
+ },
374
+ "28714": {
375
+ "content": "<extra_id_42>",
376
+ "lstrip": false,
377
+ "normalized": false,
378
+ "rstrip": false,
379
+ "single_word": false,
380
+ "special": true
381
+ },
382
+ "28715": {
383
+ "content": "<extra_id_43>",
384
+ "lstrip": false,
385
+ "normalized": false,
386
+ "rstrip": false,
387
+ "single_word": false,
388
+ "special": true
389
+ },
390
+ "28716": {
391
+ "content": "<extra_id_44>",
392
+ "lstrip": false,
393
+ "normalized": false,
394
+ "rstrip": false,
395
+ "single_word": false,
396
+ "special": true
397
+ },
398
+ "28717": {
399
+ "content": "<extra_id_45>",
400
+ "lstrip": false,
401
+ "normalized": false,
402
+ "rstrip": false,
403
+ "single_word": false,
404
+ "special": true
405
+ },
406
+ "28718": {
407
+ "content": "<extra_id_46>",
408
+ "lstrip": false,
409
+ "normalized": false,
410
+ "rstrip": false,
411
+ "single_word": false,
412
+ "special": true
413
+ },
414
+ "28719": {
415
+ "content": "<extra_id_47>",
416
+ "lstrip": false,
417
+ "normalized": false,
418
+ "rstrip": false,
419
+ "single_word": false,
420
+ "special": true
421
+ },
422
+ "28720": {
423
+ "content": "<extra_id_48>",
424
+ "lstrip": false,
425
+ "normalized": false,
426
+ "rstrip": false,
427
+ "single_word": false,
428
+ "special": true
429
+ },
430
+ "28721": {
431
+ "content": "<extra_id_49>",
432
+ "lstrip": false,
433
+ "normalized": false,
434
+ "rstrip": false,
435
+ "single_word": false,
436
+ "special": true
437
+ },
438
+ "28722": {
439
+ "content": "<extra_id_50>",
440
+ "lstrip": false,
441
+ "normalized": false,
442
+ "rstrip": false,
443
+ "single_word": false,
444
+ "special": true
445
+ },
446
+ "28723": {
447
+ "content": "<extra_id_51>",
448
+ "lstrip": false,
449
+ "normalized": false,
450
+ "rstrip": false,
451
+ "single_word": false,
452
+ "special": true
453
+ },
454
+ "28724": {
455
+ "content": "<extra_id_52>",
456
+ "lstrip": false,
457
+ "normalized": false,
458
+ "rstrip": false,
459
+ "single_word": false,
460
+ "special": true
461
+ },
462
+ "28725": {
463
+ "content": "<extra_id_53>",
464
+ "lstrip": false,
465
+ "normalized": false,
466
+ "rstrip": false,
467
+ "single_word": false,
468
+ "special": true
469
+ },
470
+ "28726": {
471
+ "content": "<extra_id_54>",
472
+ "lstrip": false,
473
+ "normalized": false,
474
+ "rstrip": false,
475
+ "single_word": false,
476
+ "special": true
477
+ },
478
+ "28727": {
479
+ "content": "<extra_id_55>",
480
+ "lstrip": false,
481
+ "normalized": false,
482
+ "rstrip": false,
483
+ "single_word": false,
484
+ "special": true
485
+ },
486
+ "28728": {
487
+ "content": "<extra_id_56>",
488
+ "lstrip": false,
489
+ "normalized": false,
490
+ "rstrip": false,
491
+ "single_word": false,
492
+ "special": true
493
+ },
494
+ "28729": {
495
+ "content": "<extra_id_57>",
496
+ "lstrip": false,
497
+ "normalized": false,
498
+ "rstrip": false,
499
+ "single_word": false,
500
+ "special": true
501
+ },
502
+ "28730": {
503
+ "content": "<extra_id_58>",
504
+ "lstrip": false,
505
+ "normalized": false,
506
+ "rstrip": false,
507
+ "single_word": false,
508
+ "special": true
509
+ },
510
+ "28731": {
511
+ "content": "<extra_id_59>",
512
+ "lstrip": false,
513
+ "normalized": false,
514
+ "rstrip": false,
515
+ "single_word": false,
516
+ "special": true
517
+ },
518
+ "28732": {
519
+ "content": "<extra_id_60>",
520
+ "lstrip": false,
521
+ "normalized": false,
522
+ "rstrip": false,
523
+ "single_word": false,
524
+ "special": true
525
+ },
526
+ "28733": {
527
+ "content": "<extra_id_61>",
528
+ "lstrip": false,
529
+ "normalized": false,
530
+ "rstrip": false,
531
+ "single_word": false,
532
+ "special": true
533
+ },
534
+ "28734": {
535
+ "content": "<extra_id_62>",
536
+ "lstrip": false,
537
+ "normalized": false,
538
+ "rstrip": false,
539
+ "single_word": false,
540
+ "special": true
541
+ },
542
+ "28735": {
543
+ "content": "<extra_id_63>",
544
+ "lstrip": false,
545
+ "normalized": false,
546
+ "rstrip": false,
547
+ "single_word": false,
548
+ "special": true
549
+ },
550
+ "28736": {
551
+ "content": "<extra_id_64>",
552
+ "lstrip": false,
553
+ "normalized": false,
554
+ "rstrip": false,
555
+ "single_word": false,
556
+ "special": true
557
+ },
558
+ "28737": {
559
+ "content": "<extra_id_65>",
560
+ "lstrip": false,
561
+ "normalized": false,
562
+ "rstrip": false,
563
+ "single_word": false,
564
+ "special": true
565
+ },
566
+ "28738": {
567
+ "content": "<extra_id_66>",
568
+ "lstrip": false,
569
+ "normalized": false,
570
+ "rstrip": false,
571
+ "single_word": false,
572
+ "special": true
573
+ },
574
+ "28739": {
575
+ "content": "<extra_id_67>",
576
+ "lstrip": false,
577
+ "normalized": false,
578
+ "rstrip": false,
579
+ "single_word": false,
580
+ "special": true
581
+ },
582
+ "28740": {
583
+ "content": "<extra_id_68>",
584
+ "lstrip": false,
585
+ "normalized": false,
586
+ "rstrip": false,
587
+ "single_word": false,
588
+ "special": true
589
+ },
590
+ "28741": {
591
+ "content": "<extra_id_69>",
592
+ "lstrip": false,
593
+ "normalized": false,
594
+ "rstrip": false,
595
+ "single_word": false,
596
+ "special": true
597
+ },
598
+ "28742": {
599
+ "content": "<extra_id_70>",
600
+ "lstrip": false,
601
+ "normalized": false,
602
+ "rstrip": false,
603
+ "single_word": false,
604
+ "special": true
605
+ },
606
+ "28743": {
607
+ "content": "<extra_id_71>",
608
+ "lstrip": false,
609
+ "normalized": false,
610
+ "rstrip": false,
611
+ "single_word": false,
612
+ "special": true
613
+ },
614
+ "28744": {
615
+ "content": "<extra_id_72>",
616
+ "lstrip": false,
617
+ "normalized": false,
618
+ "rstrip": false,
619
+ "single_word": false,
620
+ "special": true
621
+ },
622
+ "28745": {
623
+ "content": "<extra_id_73>",
624
+ "lstrip": false,
625
+ "normalized": false,
626
+ "rstrip": false,
627
+ "single_word": false,
628
+ "special": true
629
+ },
630
+ "28746": {
631
+ "content": "<extra_id_74>",
632
+ "lstrip": false,
633
+ "normalized": false,
634
+ "rstrip": false,
635
+ "single_word": false,
636
+ "special": true
637
+ },
638
+ "28747": {
639
+ "content": "<extra_id_75>",
640
+ "lstrip": false,
641
+ "normalized": false,
642
+ "rstrip": false,
643
+ "single_word": false,
644
+ "special": true
645
+ },
646
+ "28748": {
647
+ "content": "<extra_id_76>",
648
+ "lstrip": false,
649
+ "normalized": false,
650
+ "rstrip": false,
651
+ "single_word": false,
652
+ "special": true
653
+ },
654
+ "28749": {
655
+ "content": "<extra_id_77>",
656
+ "lstrip": false,
657
+ "normalized": false,
658
+ "rstrip": false,
659
+ "single_word": false,
660
+ "special": true
661
+ },
662
+ "28750": {
663
+ "content": "<extra_id_78>",
664
+ "lstrip": false,
665
+ "normalized": false,
666
+ "rstrip": false,
667
+ "single_word": false,
668
+ "special": true
669
+ },
670
+ "28751": {
671
+ "content": "<extra_id_79>",
672
+ "lstrip": false,
673
+ "normalized": false,
674
+ "rstrip": false,
675
+ "single_word": false,
676
+ "special": true
677
+ },
678
+ "28752": {
679
+ "content": "<extra_id_80>",
680
+ "lstrip": false,
681
+ "normalized": false,
682
+ "rstrip": false,
683
+ "single_word": false,
684
+ "special": true
685
+ },
686
+ "28753": {
687
+ "content": "<extra_id_81>",
688
+ "lstrip": false,
689
+ "normalized": false,
690
+ "rstrip": false,
691
+ "single_word": false,
692
+ "special": true
693
+ },
694
+ "28754": {
695
+ "content": "<extra_id_82>",
696
+ "lstrip": false,
697
+ "normalized": false,
698
+ "rstrip": false,
699
+ "single_word": false,
700
+ "special": true
701
+ },
702
+ "28755": {
703
+ "content": "<extra_id_83>",
704
+ "lstrip": false,
705
+ "normalized": false,
706
+ "rstrip": false,
707
+ "single_word": false,
708
+ "special": true
709
+ },
710
+ "28756": {
711
+ "content": "<extra_id_84>",
712
+ "lstrip": false,
713
+ "normalized": false,
714
+ "rstrip": false,
715
+ "single_word": false,
716
+ "special": true
717
+ },
718
+ "28757": {
719
+ "content": "<extra_id_85>",
720
+ "lstrip": false,
721
+ "normalized": false,
722
+ "rstrip": false,
723
+ "single_word": false,
724
+ "special": true
725
+ },
726
+ "28758": {
727
+ "content": "<extra_id_86>",
728
+ "lstrip": false,
729
+ "normalized": false,
730
+ "rstrip": false,
731
+ "single_word": false,
732
+ "special": true
733
+ },
734
+ "28759": {
735
+ "content": "<extra_id_87>",
736
+ "lstrip": false,
737
+ "normalized": false,
738
+ "rstrip": false,
739
+ "single_word": false,
740
+ "special": true
741
+ },
742
+ "28760": {
743
+ "content": "<extra_id_88>",
744
+ "lstrip": false,
745
+ "normalized": false,
746
+ "rstrip": false,
747
+ "single_word": false,
748
+ "special": true
749
+ },
750
+ "28761": {
751
+ "content": "<extra_id_89>",
752
+ "lstrip": false,
753
+ "normalized": false,
754
+ "rstrip": false,
755
+ "single_word": false,
756
+ "special": true
757
+ },
758
+ "28762": {
759
+ "content": "<extra_id_90>",
760
+ "lstrip": false,
761
+ "normalized": false,
762
+ "rstrip": false,
763
+ "single_word": false,
764
+ "special": true
765
+ },
766
+ "28763": {
767
+ "content": "<extra_id_91>",
768
+ "lstrip": false,
769
+ "normalized": false,
770
+ "rstrip": false,
771
+ "single_word": false,
772
+ "special": true
773
+ },
774
+ "28764": {
775
+ "content": "<extra_id_92>",
776
+ "lstrip": false,
777
+ "normalized": false,
778
+ "rstrip": false,
779
+ "single_word": false,
780
+ "special": true
781
+ },
782
+ "28765": {
783
+ "content": "<extra_id_93>",
784
+ "lstrip": false,
785
+ "normalized": false,
786
+ "rstrip": false,
787
+ "single_word": false,
788
+ "special": true
789
+ },
790
+ "28766": {
791
+ "content": "<extra_id_94>",
792
+ "lstrip": false,
793
+ "normalized": false,
794
+ "rstrip": false,
795
+ "single_word": false,
796
+ "special": true
797
+ },
798
+ "28767": {
799
+ "content": "<extra_id_95>",
800
+ "lstrip": false,
801
+ "normalized": false,
802
+ "rstrip": false,
803
+ "single_word": false,
804
+ "special": true
805
+ },
806
+ "28768": {
807
+ "content": "<extra_id_96>",
808
+ "lstrip": false,
809
+ "normalized": false,
810
+ "rstrip": false,
811
+ "single_word": false,
812
+ "special": true
813
+ },
814
+ "28769": {
815
+ "content": "<extra_id_97>",
816
+ "lstrip": false,
817
+ "normalized": false,
818
+ "rstrip": false,
819
+ "single_word": false,
820
+ "special": true
821
+ },
822
+ "28770": {
823
+ "content": "<extra_id_98>",
824
+ "lstrip": false,
825
+ "normalized": false,
826
+ "rstrip": false,
827
+ "single_word": false,
828
+ "special": true
829
+ },
830
+ "28771": {
831
+ "content": "<extra_id_99>",
832
+ "lstrip": false,
833
+ "normalized": false,
834
+ "rstrip": false,
835
+ "single_word": false,
836
+ "special": true
837
+ }
838
+ },
839
+ "additional_special_tokens": [
840
+ "<extra_id_0>",
841
+ "<extra_id_1>",
842
+ "<extra_id_2>",
843
+ "<extra_id_3>",
844
+ "<extra_id_4>",
845
+ "<extra_id_5>",
846
+ "<extra_id_6>",
847
+ "<extra_id_7>",
848
+ "<extra_id_8>",
849
+ "<extra_id_9>",
850
+ "<extra_id_10>",
851
+ "<extra_id_11>",
852
+ "<extra_id_12>",
853
+ "<extra_id_13>",
854
+ "<extra_id_14>",
855
+ "<extra_id_15>",
856
+ "<extra_id_16>",
857
+ "<extra_id_17>",
858
+ "<extra_id_18>",
859
+ "<extra_id_19>",
860
+ "<extra_id_20>",
861
+ "<extra_id_21>",
862
+ "<extra_id_22>",
863
+ "<extra_id_23>",
864
+ "<extra_id_24>",
865
+ "<extra_id_25>",
866
+ "<extra_id_26>",
867
+ "<extra_id_27>",
868
+ "<extra_id_28>",
869
+ "<extra_id_29>",
870
+ "<extra_id_30>",
871
+ "<extra_id_31>",
872
+ "<extra_id_32>",
873
+ "<extra_id_33>",
874
+ "<extra_id_34>",
875
+ "<extra_id_35>",
876
+ "<extra_id_36>",
877
+ "<extra_id_37>",
878
+ "<extra_id_38>",
879
+ "<extra_id_39>",
880
+ "<extra_id_40>",
881
+ "<extra_id_41>",
882
+ "<extra_id_42>",
883
+ "<extra_id_43>",
884
+ "<extra_id_44>",
885
+ "<extra_id_45>",
886
+ "<extra_id_46>",
887
+ "<extra_id_47>",
888
+ "<extra_id_48>",
889
+ "<extra_id_49>",
890
+ "<extra_id_50>",
891
+ "<extra_id_51>",
892
+ "<extra_id_52>",
893
+ "<extra_id_53>",
894
+ "<extra_id_54>",
895
+ "<extra_id_55>",
896
+ "<extra_id_56>",
897
+ "<extra_id_57>",
898
+ "<extra_id_58>",
899
+ "<extra_id_59>",
900
+ "<extra_id_60>",
901
+ "<extra_id_61>",
902
+ "<extra_id_62>",
903
+ "<extra_id_63>",
904
+ "<extra_id_64>",
905
+ "<extra_id_65>",
906
+ "<extra_id_66>",
907
+ "<extra_id_67>",
908
+ "<extra_id_68>",
909
+ "<extra_id_69>",
910
+ "<extra_id_70>",
911
+ "<extra_id_71>",
912
+ "<extra_id_72>",
913
+ "<extra_id_73>",
914
+ "<extra_id_74>",
915
+ "<extra_id_75>",
916
+ "<extra_id_76>",
917
+ "<extra_id_77>",
918
+ "<extra_id_78>",
919
+ "<extra_id_79>",
920
+ "<extra_id_80>",
921
+ "<extra_id_81>",
922
+ "<extra_id_82>",
923
+ "<extra_id_83>",
924
+ "<extra_id_84>",
925
+ "<extra_id_85>",
926
+ "<extra_id_86>",
927
+ "<extra_id_87>",
928
+ "<extra_id_88>",
929
+ "<extra_id_89>",
930
+ "<extra_id_90>",
931
+ "<extra_id_91>",
932
+ "<extra_id_92>",
933
+ "<extra_id_93>",
934
+ "<extra_id_94>",
935
+ "<extra_id_95>",
936
+ "<extra_id_96>",
937
+ "<extra_id_97>",
938
+ "<extra_id_98>",
939
+ "<extra_id_99>"
940
+ ],
941
+ "bos_token": "<s>",
942
+ "clean_up_tokenization_spaces": false,
943
+ "eos_token": "</s>",
944
+ "legacy": false,
945
+ "model_max_length": 1000000000000000019884624838656,
946
+ "pad_token": "<pad>",
947
+ "sp_model_kwargs": {},
948
+ "spaces_between_special_tokens": false,
949
+ "tokenizer_class": "LlamaTokenizer",
950
+ "unk_token": "<unk>",
951
+ "use_default_system_prompt": false
952
+ }
checkpoint-pt-21000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa771dbc4a63d23c6984ff0cccd661741ea6930436c81ed2c6039f23c106d67f
3
+ size 1947396528
checkpoint-pt-21000/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed31fc694e66aee6e3b9dbdf92b28ce62f42fa820a4906b89f7190e10867013f
3
+ size 14344
checkpoint-pt-22500/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f948773759059489ac93361a2ceb7c702cc465b93711956d49e9672baff82ff
3
+ size 1947396528
checkpoint-pt-22500/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c40b6245fe0cef1a5b1ebee7127ee5b84f049c2675fc9c64a2e2105955dcea53
3
+ size 14408
checkpoint-pt-25500/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf9c8d29874fed7dd897f58cfa1dd048db2b7211aec88c8a31a8d289152994c3
3
+ size 1947396528
checkpoint-pt-25500/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:147819f6dc53370e7f045048178c9994c85f69245c8311221b151d13be4d89f2
3
+ size 14344
checkpoint-pt-27000/config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "pszemraj/tFINE-850m-24x24-1024ctx",
3
+ "architectures": [
4
+ "T5ForConditionalGeneration"
5
+ ],
6
+ "classifier_dropout": 0.0,
7
+ "d_ff": 3072,
8
+ "d_kv": 64,
9
+ "d_model": 1024,
10
+ "decoder_start_token_id": 3,
11
+ "dense_act_fn": "silu",
12
+ "dropout_rate": 0.0,
13
+ "eos_token_id": 2,
14
+ "feed_forward_proj": "gated-silu",
15
+ "initializer_factor": 1.0,
16
+ "is_bf16": true,
17
+ "is_encoder_decoder": true,
18
+ "is_gated_act": true,
19
+ "layer_norm_epsilon": 1e-06,
20
+ "model_type": "t5",
21
+ "num_decoder_layers": 16,
22
+ "num_heads": 16,
23
+ "num_key_value_heads": 4,
24
+ "num_layers": 16,
25
+ "output_past": true,
26
+ "pad_token_id": 3,
27
+ "relative_attention_max_distance": 128,
28
+ "relative_attention_num_buckets": 48,
29
+ "tie_word_embeddings": false,
30
+ "transformers_version": "4.46.0.dev0",
31
+ "use_cache": true,
32
+ "use_gqa": true,
33
+ "vocab_size": 28776
34
+ }
checkpoint-pt-27000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:adcba1a78a20dd132302933f9d140ab1b661ac822744324543966cf47f3ae3f6
3
+ size 1947396528
checkpoint-pt-27000/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e5fcbdeef6dc6ae50b48fd2a81e8ff69cc9337d8cff59adb7fb983b993cefca
3
+ size 14344
checkpoint-pt-28500/config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "pszemraj/tFINE-850m-24x24-1024ctx",
3
+ "architectures": [
4
+ "T5ForConditionalGeneration"
5
+ ],
6
+ "classifier_dropout": 0.0,
7
+ "d_ff": 3072,
8
+ "d_kv": 64,
9
+ "d_model": 1024,
10
+ "decoder_start_token_id": 3,
11
+ "dense_act_fn": "silu",
12
+ "dropout_rate": 0.0,
13
+ "eos_token_id": 2,
14
+ "feed_forward_proj": "gated-silu",
15
+ "initializer_factor": 1.0,
16
+ "is_bf16": true,
17
+ "is_encoder_decoder": true,
18
+ "is_gated_act": true,
19
+ "layer_norm_epsilon": 1e-06,
20
+ "model_type": "t5",
21
+ "num_decoder_layers": 16,
22
+ "num_heads": 16,
23
+ "num_key_value_heads": 4,
24
+ "num_layers": 16,
25
+ "output_past": true,
26
+ "pad_token_id": 3,
27
+ "relative_attention_max_distance": 128,
28
+ "relative_attention_num_buckets": 48,
29
+ "tie_word_embeddings": false,
30
+ "transformers_version": "4.46.0.dev0",
31
+ "use_cache": true,
32
+ "use_gqa": true,
33
+ "vocab_size": 28776
34
+ }
checkpoint-pt-28500/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09cbe1fc9ff42f93d50c984bd424ec74cb9bc3913c3c4f35cdf2020cea49f824
3
+ size 1947396528
checkpoint-pt-28500/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11d506418cc06a51a3b061e1996bb290626c5a36d9d02c89b87c4c18b1c09d5a
3
+ size 14408
config.json ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "vocab_size": 28776,
3
+ "d_model": 1024,
4
+ "d_kv": 64,
5
+ "d_ff": 3072,
6
+ "num_layers": 16,
7
+ "num_decoder_layers": 16,
8
+ "num_heads": 16,
9
+ "num_key_value_heads": 4,
10
+ "relative_attention_num_buckets": 48,
11
+ "relative_attention_max_distance": 128,
12
+ "dropout_rate": 0.0,
13
+ "classifier_dropout": 0.0,
14
+ "layer_norm_epsilon": 1e-06,
15
+ "initializer_factor": 1.0,
16
+ "feed_forward_proj": "gated-silu",
17
+ "use_cache": true,
18
+ "use_gqa": true,
19
+ "dense_act_fn": "silu",
20
+ "is_gated_act": true,
21
+ "return_dict": true,
22
+ "output_hidden_states": false,
23
+ "output_attentions": false,
24
+ "torchscript": false,
25
+ "torch_dtype": null,
26
+ "use_bfloat16": false,
27
+ "tf_legacy_loss": false,
28
+ "pruned_heads": {},
29
+ "tie_word_embeddings": false,
30
+ "chunk_size_feed_forward": 0,
31
+ "is_encoder_decoder": true,
32
+ "is_decoder": false,
33
+ "cross_attention_hidden_size": null,
34
+ "add_cross_attention": false,
35
+ "tie_encoder_decoder": false,
36
+ "max_length": 20,
37
+ "min_length": 0,
38
+ "do_sample": false,
39
+ "early_stopping": false,
40
+ "num_beams": 1,
41
+ "num_beam_groups": 1,
42
+ "diversity_penalty": 0.0,
43
+ "temperature": 1.0,
44
+ "top_k": 50,
45
+ "top_p": 1.0,
46
+ "typical_p": 1.0,
47
+ "repetition_penalty": 1.0,
48
+ "length_penalty": 1.0,
49
+ "no_repeat_ngram_size": 0,
50
+ "encoder_no_repeat_ngram_size": 0,
51
+ "bad_words_ids": null,
52
+ "num_return_sequences": 1,
53
+ "output_scores": false,
54
+ "return_dict_in_generate": false,
55
+ "forced_bos_token_id": null,
56
+ "forced_eos_token_id": null,
57
+ "remove_invalid_values": false,
58
+ "exponential_decay_length_penalty": null,
59
+ "suppress_tokens": null,
60
+ "begin_suppress_tokens": null,
61
+ "architectures": [
62
+ "T5ForConditionalGeneration"
63
+ ],
64
+ "finetuning_task": null,
65
+ "id2label": {
66
+ "0": "LABEL_0",
67
+ "1": "LABEL_1"
68
+ },
69
+ "label2id": {
70
+ "LABEL_0": 0,
71
+ "LABEL_1": 1
72
+ },
73
+ "tokenizer_class": null,
74
+ "prefix": null,
75
+ "bos_token_id": null,
76
+ "pad_token_id": 3,
77
+ "eos_token_id": 2,
78
+ "sep_token_id": null,
79
+ "decoder_start_token_id": 3,
80
+ "task_specific_params": null,
81
+ "problem_type": null,
82
+ "_name_or_path": "/workspace/nanoT5/logs/2024-10-20/18-25-17/checkpoint-pt-27000",
83
+ "transformers_version": "4.46.0.dev0",
84
+ "is_bf16": true,
85
+ "model_type": "t5",
86
+ "output_past": true
87
+ }
main.log ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2024-10-20 18:25:17,510][accelerate.utils.other][WARNING] - Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
2
+ [2024-10-20 18:25:17,521][Main][INFO] - Distributed environment: DistributedType.NO
3
+ Num processes: 1
4
+ Process index: 0
5
+ Local process index: 0
6
+ Device: cuda
7
+
8
+ Mixed precision type: bf16
9
+
10
+ [2024-10-20 18:25:17,522][Main][INFO] - Working directory is /workspace/nanoT5/logs/2024-10-20/18-25-17
11
+ [2024-10-20 18:31:35,111][Main][INFO] - [train] Step 25 out of 65536 | Loss --> 155.837 | Loss_ntp --> 76.275 | Loss_mlm --> 79.561 | Grad_l2 --> 476.354 | Weights_l2 --> 7701.821 | Lr --> 0.001 | Seconds_per_step --> 14.044 |
12
+ [2024-10-20 18:35:35,171][Main][INFO] - [train] Step 50 out of 65536 | Loss --> 98.644 | Loss_ntp --> 48.540 | Loss_mlm --> 50.105 | Grad_l2 --> 234.932 | Weights_l2 --> 7701.813 | Lr --> 0.001 | Seconds_per_step --> 9.602 |
13
+ [2024-10-20 18:39:35,197][Main][INFO] - [train] Step 75 out of 65536 | Loss --> 86.994 | Loss_ntp --> 42.861 | Loss_mlm --> 44.133 | Grad_l2 --> 180.388 | Weights_l2 --> 7701.806 | Lr --> 0.001 | Seconds_per_step --> 9.601 |
14
+ [2024-10-20 18:43:35,733][Main][INFO] - [train] Step 100 out of 65536 | Loss --> 80.568 | Loss_ntp --> 39.806 | Loss_mlm --> 40.762 | Grad_l2 --> 156.732 | Weights_l2 --> 7701.800 | Lr --> 0.001 | Seconds_per_step --> 9.621 |
15
+ [2024-10-20 18:47:37,016][Main][INFO] - [train] Step 125 out of 65536 | Loss --> 77.131 | Loss_ntp --> 38.127 | Loss_mlm --> 39.004 | Grad_l2 --> 179.590 | Weights_l2 --> 7701.794 | Lr --> 0.001 | Seconds_per_step --> 9.651 |
16
+ [2024-10-20 18:51:38,437][Main][INFO] - [train] Step 150 out of 65536 | Loss --> 73.900 | Loss_ntp --> 36.620 | Loss_mlm --> 37.281 | Grad_l2 --> 161.591 | Weights_l2 --> 7701.789 | Lr --> 0.001 | Seconds_per_step --> 9.657 |
17
+ [2024-10-20 18:55:39,020][Main][INFO] - [train] Step 175 out of 65536 | Loss --> 72.118 | Loss_ntp --> 35.763 | Loss_mlm --> 36.355 | Grad_l2 --> 161.741 | Weights_l2 --> 7701.783 | Lr --> 0.001 | Seconds_per_step --> 9.623 |
18
+ [2024-10-20 18:59:40,344][Main][INFO] - [train] Step 200 out of 65536 | Loss --> 70.712 | Loss_ntp --> 35.041 | Loss_mlm --> 35.671 | Grad_l2 --> 154.736 | Weights_l2 --> 7701.778 | Lr --> 0.001 | Seconds_per_step --> 9.653 |
19
+ [2024-10-20 19:03:39,817][Main][INFO] - [train] Step 225 out of 65536 | Loss --> 69.050 | Loss_ntp --> 34.233 | Loss_mlm --> 34.817 | Grad_l2 --> 106.908 | Weights_l2 --> 7701.772 | Lr --> 0.001 | Seconds_per_step --> 9.579 |
20
+ [2024-10-20 19:07:41,876][Main][INFO] - [train] Step 250 out of 65536 | Loss --> 68.595 | Loss_ntp --> 33.970 | Loss_mlm --> 34.625 | Grad_l2 --> 126.557 | Weights_l2 --> 7701.767 | Lr --> 0.001 | Seconds_per_step --> 9.682 |
21
+ [2024-10-20 19:11:43,944][Main][INFO] - [train] Step 275 out of 65536 | Loss --> 67.141 | Loss_ntp --> 33.297 | Loss_mlm --> 33.844 | Grad_l2 --> 114.874 | Weights_l2 --> 7701.762 | Lr --> 0.001 | Seconds_per_step --> 9.683 |
22
+ [2024-10-20 19:15:43,786][Main][INFO] - [train] Step 300 out of 65536 | Loss --> 65.916 | Loss_ntp --> 32.693 | Loss_mlm --> 33.223 | Grad_l2 --> 89.430 | Weights_l2 --> 7701.757 | Lr --> 0.001 | Seconds_per_step --> 9.594 |
23
+ [2024-10-20 19:19:45,206][Main][INFO] - [train] Step 325 out of 65536 | Loss --> 65.322 | Loss_ntp --> 32.362 | Loss_mlm --> 32.960 | Grad_l2 --> 97.785 | Weights_l2 --> 7701.751 | Lr --> 0.001 | Seconds_per_step --> 9.657 |
24
+ [2024-10-20 19:23:45,072][Main][INFO] - [train] Step 350 out of 65536 | Loss --> 64.367 | Loss_ntp --> 31.937 | Loss_mlm --> 32.430 | Grad_l2 --> 83.882 | Weights_l2 --> 7701.746 | Lr --> 0.001 | Seconds_per_step --> 9.595 |
25
+ [2024-10-20 19:27:46,534][Main][INFO] - [train] Step 375 out of 65536 | Loss --> 63.409 | Loss_ntp --> 31.433 | Loss_mlm --> 31.975 | Grad_l2 --> 75.548 | Weights_l2 --> 7701.741 | Lr --> 0.001 | Seconds_per_step --> 9.658 |
26
+ [2024-10-20 19:31:45,390][Main][INFO] - [train] Step 400 out of 65536 | Loss --> 62.292 | Loss_ntp --> 30.925 | Loss_mlm --> 31.367 | Grad_l2 --> 72.299 | Weights_l2 --> 7701.736 | Lr --> 0.001 | Seconds_per_step --> 9.554 |
27
+ [2024-10-20 19:35:46,689][Main][INFO] - [train] Step 425 out of 65536 | Loss --> 61.685 | Loss_ntp --> 30.585 | Loss_mlm --> 31.100 | Grad_l2 --> 73.838 | Weights_l2 --> 7701.731 | Lr --> 0.001 | Seconds_per_step --> 9.652 |
28
+ [2024-10-20 19:39:46,030][Main][INFO] - [train] Step 450 out of 65536 | Loss --> 61.416 | Loss_ntp --> 30.509 | Loss_mlm --> 30.907 | Grad_l2 --> 79.820 | Weights_l2 --> 7701.726 | Lr --> 0.001 | Seconds_per_step --> 9.573 |
29
+ [2024-10-20 19:43:47,298][Main][INFO] - [train] Step 475 out of 65536 | Loss --> 60.536 | Loss_ntp --> 30.069 | Loss_mlm --> 30.467 | Grad_l2 --> 59.074 | Weights_l2 --> 7701.722 | Lr --> 0.001 | Seconds_per_step --> 9.651 |
30
+ [2024-10-20 19:47:48,778][Main][INFO] - [train] Step 500 out of 65536 | Loss --> 60.085 | Loss_ntp --> 29.838 | Loss_mlm --> 30.246 | Grad_l2 --> 71.417 | Weights_l2 --> 7701.717 | Lr --> 0.001 | Seconds_per_step --> 9.659 |
31
+ [2024-10-20 19:49:25,862][Main][INFO] - [eval] Step 500 out of 65536 | Loss --> 57.611 | Loss_ntp --> 28.694 | Loss_mlm --> 28.917 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 97.080 |
32
+ [2024-10-20 19:53:26,482][Main][INFO] - [train] Step 525 out of 65536 | Loss --> 59.106 | Loss_ntp --> 29.371 | Loss_mlm --> 29.735 | Grad_l2 --> 56.829 | Weights_l2 --> 7701.712 | Lr --> 0.001 | Seconds_per_step --> 9.625 |
33
+ [2024-10-20 19:57:25,811][Main][INFO] - [train] Step 550 out of 65536 | Loss --> 58.185 | Loss_ntp --> 28.950 | Loss_mlm --> 29.235 | Grad_l2 --> 56.368 | Weights_l2 --> 7701.707 | Lr --> 0.001 | Seconds_per_step --> 9.573 |
34
+ [2024-10-20 20:01:26,095][Main][INFO] - [train] Step 575 out of 65536 | Loss --> 57.301 | Loss_ntp --> 28.480 | Loss_mlm --> 28.821 | Grad_l2 --> 39.860 | Weights_l2 --> 7701.703 | Lr --> 0.001 | Seconds_per_step --> 9.611 |
35
+ [2024-10-20 20:05:26,649][Main][INFO] - [train] Step 600 out of 65536 | Loss --> 56.020 | Loss_ntp --> 27.906 | Loss_mlm --> 28.115 | Grad_l2 --> 35.414 | Weights_l2 --> 7701.698 | Lr --> 0.001 | Seconds_per_step --> 9.622 |
36
+ [2024-10-20 20:09:28,597][Main][INFO] - [train] Step 625 out of 65536 | Loss --> 55.363 | Loss_ntp --> 27.524 | Loss_mlm --> 27.840 | Grad_l2 --> 50.531 | Weights_l2 --> 7701.694 | Lr --> 0.001 | Seconds_per_step --> 9.678 |
37
+ [2024-10-20 20:13:29,399][Main][INFO] - [train] Step 650 out of 65536 | Loss --> 54.803 | Loss_ntp --> 27.252 | Loss_mlm --> 27.551 | Grad_l2 --> 56.108 | Weights_l2 --> 7701.689 | Lr --> 0.001 | Seconds_per_step --> 9.632 |
38
+ [2024-10-20 20:17:31,948][Main][INFO] - [train] Step 675 out of 65536 | Loss --> 53.970 | Loss_ntp --> 26.793 | Loss_mlm --> 27.176 | Grad_l2 --> 46.473 | Weights_l2 --> 7701.685 | Lr --> 0.001 | Seconds_per_step --> 9.702 |
39
+ [2024-10-20 20:21:31,196][Main][INFO] - [train] Step 700 out of 65536 | Loss --> 53.056 | Loss_ntp --> 26.359 | Loss_mlm --> 26.697 | Grad_l2 --> 37.435 | Weights_l2 --> 7701.680 | Lr --> 0.001 | Seconds_per_step --> 9.570 |
40
+ [2024-10-20 20:25:33,347][Main][INFO] - [train] Step 725 out of 65536 | Loss --> 52.070 | Loss_ntp --> 25.876 | Loss_mlm --> 26.194 | Grad_l2 --> 43.881 | Weights_l2 --> 7701.676 | Lr --> 0.001 | Seconds_per_step --> 9.686 |
41
+ [2024-10-20 20:29:33,004][Main][INFO] - [train] Step 750 out of 65536 | Loss --> 51.191 | Loss_ntp --> 25.456 | Loss_mlm --> 25.735 | Grad_l2 --> 44.855 | Weights_l2 --> 7701.672 | Lr --> 0.001 | Seconds_per_step --> 9.586 |
42
+ [2024-10-20 20:33:34,557][Main][INFO] - [train] Step 775 out of 65536 | Loss --> 50.129 | Loss_ntp --> 24.891 | Loss_mlm --> 25.239 | Grad_l2 --> 40.117 | Weights_l2 --> 7701.667 | Lr --> 0.001 | Seconds_per_step --> 9.662 |
43
+ [2024-10-20 20:37:33,242][Main][INFO] - [train] Step 800 out of 65536 | Loss --> 49.019 | Loss_ntp --> 24.361 | Loss_mlm --> 24.658 | Grad_l2 --> 39.953 | Weights_l2 --> 7701.663 | Lr --> 0.001 | Seconds_per_step --> 9.547 |
44
+ [2024-10-20 20:41:33,285][Main][INFO] - [train] Step 825 out of 65536 | Loss --> 48.160 | Loss_ntp --> 23.923 | Loss_mlm --> 24.238 | Grad_l2 --> 42.816 | Weights_l2 --> 7701.659 | Lr --> 0.001 | Seconds_per_step --> 9.602 |
45
+ [2024-10-20 20:45:34,352][Main][INFO] - [train] Step 850 out of 65536 | Loss --> 46.672 | Loss_ntp --> 23.149 | Loss_mlm --> 23.522 | Grad_l2 --> 42.230 | Weights_l2 --> 7701.654 | Lr --> 0.001 | Seconds_per_step --> 9.643 |
46
+ [2024-10-20 20:49:34,963][Main][INFO] - [train] Step 875 out of 65536 | Loss --> 44.855 | Loss_ntp --> 22.279 | Loss_mlm --> 22.575 | Grad_l2 --> 39.123 | Weights_l2 --> 7701.650 | Lr --> 0.001 | Seconds_per_step --> 9.624 |
47
+ [2024-10-20 20:53:36,677][Main][INFO] - [train] Step 900 out of 65536 | Loss --> 42.480 | Loss_ntp --> 21.057 | Loss_mlm --> 21.423 | Grad_l2 --> 50.501 | Weights_l2 --> 7701.645 | Lr --> 0.001 | Seconds_per_step --> 9.668 |
48
+ [2024-10-20 20:57:37,186][Main][INFO] - [train] Step 925 out of 65536 | Loss --> 40.028 | Loss_ntp --> 19.877 | Loss_mlm --> 20.151 | Grad_l2 --> 57.109 | Weights_l2 --> 7701.640 | Lr --> 0.001 | Seconds_per_step --> 9.620 |
49
+ [2024-10-20 21:01:38,800][Main][INFO] - [train] Step 950 out of 65536 | Loss --> 37.058 | Loss_ntp --> 18.359 | Loss_mlm --> 18.699 | Grad_l2 --> 78.443 | Weights_l2 --> 7701.634 | Lr --> 0.001 | Seconds_per_step --> 9.664 |
50
+ [2024-10-20 21:05:38,405][Main][INFO] - [train] Step 975 out of 65536 | Loss --> 33.534 | Loss_ntp --> 16.618 | Loss_mlm --> 16.917 | Grad_l2 --> 87.220 | Weights_l2 --> 7701.628 | Lr --> 0.001 | Seconds_per_step --> 9.584 |
51
+ [2024-10-20 21:09:41,153][Main][INFO] - [train] Step 1000 out of 65536 | Loss --> 29.988 | Loss_ntp --> 14.857 | Loss_mlm --> 15.131 | Grad_l2 --> 88.279 | Weights_l2 --> 7701.622 | Lr --> 0.001 | Seconds_per_step --> 9.710 |
52
+ [2024-10-20 21:10:10,310][Main][INFO] - [eval] Step 1000 out of 65536 | Loss --> 28.033 | Loss_ntp --> 13.938 | Loss_mlm --> 14.095 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 29.143 |
53
+ [2024-10-20 21:14:10,580][Main][INFO] - [train] Step 1025 out of 65536 | Loss --> 26.588 | Loss_ntp --> 13.166 | Loss_mlm --> 13.423 | Grad_l2 --> 109.226 | Weights_l2 --> 7701.616 | Lr --> 0.001 | Seconds_per_step --> 9.611 |
54
+ [2024-10-20 21:18:12,558][Main][INFO] - [train] Step 1050 out of 65536 | Loss --> 23.850 | Loss_ntp --> 11.830 | Loss_mlm --> 12.020 | Grad_l2 --> 98.666 | Weights_l2 --> 7701.610 | Lr --> 0.001 | Seconds_per_step --> 9.679 |
55
+ [2024-10-20 21:22:11,593][Main][INFO] - [train] Step 1075 out of 65536 | Loss --> 21.589 | Loss_ntp --> 10.697 | Loss_mlm --> 10.892 | Grad_l2 --> 104.858 | Weights_l2 --> 7701.605 | Lr --> 0.001 | Seconds_per_step --> 9.561 |
56
+ [2024-10-20 21:26:13,779][Main][INFO] - [train] Step 1100 out of 65536 | Loss --> 19.443 | Loss_ntp --> 9.626 | Loss_mlm --> 9.817 | Grad_l2 --> 75.473 | Weights_l2 --> 7701.599 | Lr --> 0.001 | Seconds_per_step --> 9.687 |
57
+ [2024-10-20 21:30:13,762][Main][INFO] - [train] Step 1125 out of 65536 | Loss --> 17.771 | Loss_ntp --> 8.793 | Loss_mlm --> 8.978 | Grad_l2 --> 55.492 | Weights_l2 --> 7701.593 | Lr --> 0.001 | Seconds_per_step --> 9.599 |
58
+ [2024-10-20 21:34:14,478][Main][INFO] - [train] Step 1150 out of 65536 | Loss --> 17.092 | Loss_ntp --> 8.462 | Loss_mlm --> 8.630 | Grad_l2 --> 72.673 | Weights_l2 --> 7701.587 | Lr --> 0.001 | Seconds_per_step --> 9.629 |
59
+ [2024-10-20 21:38:14,797][Main][INFO] - [train] Step 1175 out of 65536 | Loss --> 16.731 | Loss_ntp --> 8.294 | Loss_mlm --> 8.437 | Grad_l2 --> 60.718 | Weights_l2 --> 7701.582 | Lr --> 0.001 | Seconds_per_step --> 9.613 |
60
+ [2024-10-20 21:42:15,467][Main][INFO] - [train] Step 1200 out of 65536 | Loss --> 16.522 | Loss_ntp --> 8.188 | Loss_mlm --> 8.334 | Grad_l2 --> 62.414 | Weights_l2 --> 7701.577 | Lr --> 0.001 | Seconds_per_step --> 9.627 |
61
+ [2024-10-20 21:46:15,957][Main][INFO] - [train] Step 1225 out of 65536 | Loss --> 16.336 | Loss_ntp --> 8.096 | Loss_mlm --> 8.240 | Grad_l2 --> 57.944 | Weights_l2 --> 7701.572 | Lr --> 0.001 | Seconds_per_step --> 9.619 |
62
+ [2024-10-20 21:50:15,276][Main][INFO] - [train] Step 1250 out of 65536 | Loss --> 16.167 | Loss_ntp --> 8.006 | Loss_mlm --> 8.161 | Grad_l2 --> 42.899 | Weights_l2 --> 7701.567 | Lr --> 0.001 | Seconds_per_step --> 9.573 |
63
+ [2024-10-20 21:54:18,039][Main][INFO] - [train] Step 1275 out of 65536 | Loss --> 16.183 | Loss_ntp --> 8.017 | Loss_mlm --> 8.166 | Grad_l2 --> 48.492 | Weights_l2 --> 7701.563 | Lr --> 0.001 | Seconds_per_step --> 9.710 |
64
+ [2024-10-20 21:58:18,396][Main][INFO] - [train] Step 1300 out of 65536 | Loss --> 15.988 | Loss_ntp --> 7.926 | Loss_mlm --> 8.063 | Grad_l2 --> 42.852 | Weights_l2 --> 7701.558 | Lr --> 0.001 | Seconds_per_step --> 9.614 |
65
+ [2024-10-20 22:02:20,263][Main][INFO] - [train] Step 1325 out of 65536 | Loss --> 15.982 | Loss_ntp --> 7.916 | Loss_mlm --> 8.066 | Grad_l2 --> 47.218 | Weights_l2 --> 7701.553 | Lr --> 0.001 | Seconds_per_step --> 9.675 |
66
+ [2024-10-20 22:06:20,739][Main][INFO] - [train] Step 1350 out of 65536 | Loss --> 15.830 | Loss_ntp --> 7.838 | Loss_mlm --> 7.992 | Grad_l2 --> 28.805 | Weights_l2 --> 7701.549 | Lr --> 0.001 | Seconds_per_step --> 9.619 |
67
+ [2024-10-20 22:10:23,190][Main][INFO] - [train] Step 1375 out of 65536 | Loss --> 15.806 | Loss_ntp --> 7.839 | Loss_mlm --> 7.967 | Grad_l2 --> 37.388 | Weights_l2 --> 7701.544 | Lr --> 0.001 | Seconds_per_step --> 9.698 |
68
+ [2024-10-20 22:14:23,525][Main][INFO] - [train] Step 1400 out of 65536 | Loss --> 15.775 | Loss_ntp --> 7.813 | Loss_mlm --> 7.962 | Grad_l2 --> 35.380 | Weights_l2 --> 7701.540 | Lr --> 0.001 | Seconds_per_step --> 9.613 |
69
+ [2024-10-20 22:18:25,080][Main][INFO] - [train] Step 1425 out of 65536 | Loss --> 15.722 | Loss_ntp --> 7.794 | Loss_mlm --> 7.928 | Grad_l2 --> 34.978 | Weights_l2 --> 7701.535 | Lr --> 0.001 | Seconds_per_step --> 9.662 |
70
+ [2024-10-20 22:22:24,651][Main][INFO] - [train] Step 1450 out of 65536 | Loss --> 15.638 | Loss_ntp --> 7.739 | Loss_mlm --> 7.899 | Grad_l2 --> 24.003 | Weights_l2 --> 7701.530 | Lr --> 0.001 | Seconds_per_step --> 9.583 |
71
+ [2024-10-20 22:26:24,495][Main][INFO] - [train] Step 1475 out of 65536 | Loss --> 15.682 | Loss_ntp --> 7.768 | Loss_mlm --> 7.913 | Grad_l2 --> 27.599 | Weights_l2 --> 7701.526 | Lr --> 0.001 | Seconds_per_step --> 9.594 |
72
+ [2024-10-20 22:30:25,992][Main][INFO] - [train] Step 1500 out of 65536 | Loss --> 15.638 | Loss_ntp --> 7.754 | Loss_mlm --> 7.884 | Grad_l2 --> 22.985 | Weights_l2 --> 7701.521 | Lr --> 0.001 | Seconds_per_step --> 9.660 |
73
+ [2024-10-20 22:30:54,697][Main][INFO] - [eval] Step 1500 out of 65536 | Loss --> 15.664 | Loss_ntp --> 7.782 | Loss_mlm --> 7.882 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 28.700 |
74
+ [2024-10-20 22:30:54,709][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-1500
75
+ [2024-10-20 22:30:54,719][accelerate.utils.other][WARNING] - Removed shared tensor {'encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
76
+ [2024-10-20 22:30:59,988][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-1500/model.safetensors
77
+ [2024-10-20 22:31:08,673][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-1500/optimizer.bin
78
+ [2024-10-20 22:31:08,682][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-1500/scheduler.bin
79
+ [2024-10-20 22:31:08,684][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-1500/sampler.bin
80
+ [2024-10-20 22:31:08,686][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-1500/sampler_1.bin
81
+ [2024-10-20 22:31:08,694][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-1500/random_states_0.pkl
82
+ [2024-10-20 22:35:09,885][Main][INFO] - [train] Step 1525 out of 65536 | Loss --> 15.740 | Loss_ntp --> 7.803 | Loss_mlm --> 7.937 | Grad_l2 --> 35.476 | Weights_l2 --> 7701.516 | Lr --> 0.001 | Seconds_per_step --> 10.207 |
83
+ [2024-10-20 22:39:10,189][Main][INFO] - [train] Step 1550 out of 65536 | Loss --> 15.717 | Loss_ntp --> 7.796 | Loss_mlm --> 7.921 | Grad_l2 --> 32.209 | Weights_l2 --> 7701.511 | Lr --> 0.001 | Seconds_per_step --> 9.612 |
84
+ [2024-10-20 22:43:12,020][Main][INFO] - [train] Step 1575 out of 65536 | Loss --> 15.723 | Loss_ntp --> 7.805 | Loss_mlm --> 7.918 | Grad_l2 --> 35.393 | Weights_l2 --> 7701.506 | Lr --> 0.001 | Seconds_per_step --> 9.673 |
85
+ [2024-10-20 22:47:13,492][Main][INFO] - [train] Step 1600 out of 65536 | Loss --> 15.617 | Loss_ntp --> 7.752 | Loss_mlm --> 7.865 | Grad_l2 --> 29.357 | Weights_l2 --> 7701.502 | Lr --> 0.001 | Seconds_per_step --> 9.659 |
86
+ [2024-10-20 22:51:13,978][Main][INFO] - [train] Step 1625 out of 65536 | Loss --> 15.532 | Loss_ntp --> 7.709 | Loss_mlm --> 7.822 | Grad_l2 --> 18.501 | Weights_l2 --> 7701.497 | Lr --> 0.001 | Seconds_per_step --> 9.619 |
87
+ [2024-10-20 22:55:14,600][Main][INFO] - [train] Step 1650 out of 65536 | Loss --> 15.565 | Loss_ntp --> 7.720 | Loss_mlm --> 7.845 | Grad_l2 --> 17.546 | Weights_l2 --> 7701.493 | Lr --> 0.001 | Seconds_per_step --> 9.625 |
88
+ [2024-10-20 22:59:14,384][Main][INFO] - [train] Step 1675 out of 65536 | Loss --> 15.576 | Loss_ntp --> 7.737 | Loss_mlm --> 7.838 | Grad_l2 --> 23.599 | Weights_l2 --> 7701.489 | Lr --> 0.001 | Seconds_per_step --> 9.591 |
89
+ [2024-10-20 23:03:16,878][Main][INFO] - [train] Step 1700 out of 65536 | Loss --> 15.612 | Loss_ntp --> 7.757 | Loss_mlm --> 7.855 | Grad_l2 --> 28.685 | Weights_l2 --> 7701.484 | Lr --> 0.001 | Seconds_per_step --> 9.700 |
90
+ [2024-10-20 23:07:16,611][Main][INFO] - [train] Step 1725 out of 65536 | Loss --> 15.590 | Loss_ntp --> 7.728 | Loss_mlm --> 7.861 | Grad_l2 --> 22.357 | Weights_l2 --> 7701.479 | Lr --> 0.001 | Seconds_per_step --> 9.589 |
91
+ [2024-10-20 23:11:18,435][Main][INFO] - [train] Step 1750 out of 65536 | Loss --> 15.475 | Loss_ntp --> 7.683 | Loss_mlm --> 7.792 | Grad_l2 --> 20.808 | Weights_l2 --> 7701.475 | Lr --> 0.001 | Seconds_per_step --> 9.673 |
92
+ [2024-10-20 23:15:17,324][Main][INFO] - [train] Step 1775 out of 65536 | Loss --> 15.422 | Loss_ntp --> 7.655 | Loss_mlm --> 7.767 | Grad_l2 --> 16.928 | Weights_l2 --> 7701.470 | Lr --> 0.001 | Seconds_per_step --> 9.555 |
93
+ [2024-10-20 23:19:17,823][Main][INFO] - [train] Step 1800 out of 65536 | Loss --> 15.370 | Loss_ntp --> 7.625 | Loss_mlm --> 7.745 | Grad_l2 --> 16.147 | Weights_l2 --> 7701.466 | Lr --> 0.001 | Seconds_per_step --> 9.620 |
94
+ [2024-10-20 23:23:19,005][Main][INFO] - [train] Step 1825 out of 65536 | Loss --> 15.363 | Loss_ntp --> 7.629 | Loss_mlm --> 7.734 | Grad_l2 --> 19.934 | Weights_l2 --> 7701.462 | Lr --> 0.001 | Seconds_per_step --> 9.647 |
95
+ [2024-10-20 23:27:17,933][Main][INFO] - [train] Step 1850 out of 65536 | Loss --> 15.347 | Loss_ntp --> 7.616 | Loss_mlm --> 7.732 | Grad_l2 --> 25.592 | Weights_l2 --> 7701.457 | Lr --> 0.001 | Seconds_per_step --> 9.557 |
96
+ [2024-10-20 23:31:19,805][Main][INFO] - [train] Step 1875 out of 65536 | Loss --> 15.254 | Loss_ntp --> 7.577 | Loss_mlm --> 7.677 | Grad_l2 --> 19.500 | Weights_l2 --> 7701.453 | Lr --> 0.001 | Seconds_per_step --> 9.675 |
97
+ [2024-10-20 23:35:18,582][Main][INFO] - [train] Step 1900 out of 65536 | Loss --> 15.204 | Loss_ntp --> 7.550 | Loss_mlm --> 7.653 | Grad_l2 --> 15.358 | Weights_l2 --> 7701.448 | Lr --> 0.001 | Seconds_per_step --> 9.551 |
98
+ [2024-10-20 23:39:20,300][Main][INFO] - [train] Step 1925 out of 65536 | Loss --> 15.153 | Loss_ntp --> 7.525 | Loss_mlm --> 7.628 | Grad_l2 --> 13.241 | Weights_l2 --> 7701.445 | Lr --> 0.001 | Seconds_per_step --> 9.669 |
99
+ [2024-10-20 23:43:21,680][Main][INFO] - [train] Step 1950 out of 65536 | Loss --> 15.111 | Loss_ntp --> 7.497 | Loss_mlm --> 7.614 | Grad_l2 --> 13.357 | Weights_l2 --> 7701.441 | Lr --> 0.001 | Seconds_per_step --> 9.655 |
100
+ [2024-10-20 23:47:22,111][Main][INFO] - [train] Step 1975 out of 65536 | Loss --> 15.072 | Loss_ntp --> 7.475 | Loss_mlm --> 7.597 | Grad_l2 --> 15.485 | Weights_l2 --> 7701.437 | Lr --> 0.001 | Seconds_per_step --> 9.617 |
101
+ [2024-10-20 23:51:21,960][Main][INFO] - [train] Step 2000 out of 65536 | Loss --> 15.061 | Loss_ntp --> 7.470 | Loss_mlm --> 7.591 | Grad_l2 --> 15.511 | Weights_l2 --> 7701.432 | Lr --> 0.001 | Seconds_per_step --> 9.594 |
102
+ [2024-10-20 23:51:50,849][Main][INFO] - [eval] Step 2000 out of 65536 | Loss --> 15.092 | Loss_ntp --> 7.501 | Loss_mlm --> 7.591 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 28.883 |
103
+ [2024-10-20 23:55:53,490][Main][INFO] - [train] Step 2025 out of 65536 | Loss --> 15.080 | Loss_ntp --> 7.479 | Loss_mlm --> 7.601 | Grad_l2 --> 17.451 | Weights_l2 --> 7701.428 | Lr --> 0.001 | Seconds_per_step --> 9.705 |
104
+ [2024-10-20 23:59:53,747][Main][INFO] - [train] Step 2050 out of 65536 | Loss --> 14.998 | Loss_ntp --> 7.447 | Loss_mlm --> 7.551 | Grad_l2 --> 13.242 | Weights_l2 --> 7701.424 | Lr --> 0.001 | Seconds_per_step --> 9.610 |
105
+ [2024-10-21 00:03:57,114][Main][INFO] - [train] Step 2075 out of 65536 | Loss --> 14.994 | Loss_ntp --> 7.431 | Loss_mlm --> 7.562 | Grad_l2 --> 17.409 | Weights_l2 --> 7701.419 | Lr --> 0.001 | Seconds_per_step --> 9.735 |
106
+ [2024-10-21 00:07:56,557][Main][INFO] - [train] Step 2100 out of 65536 | Loss --> 14.993 | Loss_ntp --> 7.437 | Loss_mlm --> 7.556 | Grad_l2 --> 23.374 | Weights_l2 --> 7701.414 | Lr --> 0.001 | Seconds_per_step --> 9.578 |
107
+ [2024-10-21 00:11:56,818][Main][INFO] - [train] Step 2125 out of 65536 | Loss --> 14.963 | Loss_ntp --> 7.428 | Loss_mlm --> 7.535 | Grad_l2 --> 24.857 | Weights_l2 --> 7701.410 | Lr --> 0.001 | Seconds_per_step --> 9.610 |
108
+ [2024-10-21 00:15:56,927][Main][INFO] - [train] Step 2150 out of 65536 | Loss --> 14.829 | Loss_ntp --> 7.354 | Loss_mlm --> 7.474 | Grad_l2 --> 14.538 | Weights_l2 --> 7701.405 | Lr --> 0.001 | Seconds_per_step --> 9.604 |
109
+ [2024-10-21 00:19:57,089][Main][INFO] - [train] Step 2175 out of 65536 | Loss --> 14.797 | Loss_ntp --> 7.344 | Loss_mlm --> 7.453 | Grad_l2 --> 13.598 | Weights_l2 --> 7701.400 | Lr --> 0.001 | Seconds_per_step --> 9.606 |
110
+ [2024-10-21 00:23:58,135][Main][INFO] - [train] Step 2200 out of 65536 | Loss --> 14.774 | Loss_ntp --> 7.321 | Loss_mlm --> 7.454 | Grad_l2 --> 13.339 | Weights_l2 --> 7701.396 | Lr --> 0.001 | Seconds_per_step --> 9.642 |
111
+ [2024-10-21 00:27:58,499][Main][INFO] - [train] Step 2225 out of 65536 | Loss --> 14.671 | Loss_ntp --> 7.284 | Loss_mlm --> 7.387 | Grad_l2 --> 13.884 | Weights_l2 --> 7701.392 | Lr --> 0.001 | Seconds_per_step --> 9.614 |
112
+ [2024-10-21 00:31:59,596][Main][INFO] - [train] Step 2250 out of 65536 | Loss --> 14.635 | Loss_ntp --> 7.264 | Loss_mlm --> 7.371 | Grad_l2 --> 11.527 | Weights_l2 --> 7701.388 | Lr --> 0.001 | Seconds_per_step --> 9.644 |
113
+ [2024-10-21 00:35:58,256][Main][INFO] - [train] Step 2275 out of 65536 | Loss --> 14.593 | Loss_ntp --> 7.247 | Loss_mlm --> 7.345 | Grad_l2 --> 9.993 | Weights_l2 --> 7701.384 | Lr --> 0.001 | Seconds_per_step --> 9.546 |
114
+ [2024-10-21 00:39:59,379][Main][INFO] - [train] Step 2300 out of 65536 | Loss --> 14.543 | Loss_ntp --> 7.216 | Loss_mlm --> 7.327 | Grad_l2 --> 12.147 | Weights_l2 --> 7701.381 | Lr --> 0.001 | Seconds_per_step --> 9.644 |
115
+ [2024-10-21 00:43:59,080][Main][INFO] - [train] Step 2325 out of 65536 | Loss --> 14.577 | Loss_ntp --> 7.231 | Loss_mlm --> 7.345 | Grad_l2 --> 12.365 | Weights_l2 --> 7701.376 | Lr --> 0.001 | Seconds_per_step --> 9.588 |
116
+ [2024-10-21 00:47:59,811][Main][INFO] - [train] Step 2350 out of 65536 | Loss --> 14.512 | Loss_ntp --> 7.202 | Loss_mlm --> 7.310 | Grad_l2 --> 12.472 | Weights_l2 --> 7701.372 | Lr --> 0.001 | Seconds_per_step --> 9.629 |
117
+ [2024-10-21 00:51:58,749][Main][INFO] - [train] Step 2375 out of 65536 | Loss --> 14.434 | Loss_ntp --> 7.166 | Loss_mlm --> 7.268 | Grad_l2 --> 12.198 | Weights_l2 --> 7701.368 | Lr --> 0.001 | Seconds_per_step --> 9.557 |
118
+ [2024-10-21 00:55:58,527][Main][INFO] - [train] Step 2400 out of 65536 | Loss --> 14.390 | Loss_ntp --> 7.141 | Loss_mlm --> 7.249 | Grad_l2 --> 11.488 | Weights_l2 --> 7701.365 | Lr --> 0.001 | Seconds_per_step --> 9.591 |
119
+ [2024-10-21 00:59:59,746][Main][INFO] - [train] Step 2425 out of 65536 | Loss --> 14.396 | Loss_ntp --> 7.142 | Loss_mlm --> 7.253 | Grad_l2 --> 11.924 | Weights_l2 --> 7701.361 | Lr --> 0.001 | Seconds_per_step --> 9.649 |
120
+ [2024-10-21 01:03:58,922][Main][INFO] - [train] Step 2450 out of 65536 | Loss --> 14.319 | Loss_ntp --> 7.108 | Loss_mlm --> 7.211 | Grad_l2 --> 11.587 | Weights_l2 --> 7701.357 | Lr --> 0.001 | Seconds_per_step --> 9.567 |
121
+ [2024-10-21 01:08:00,577][Main][INFO] - [train] Step 2475 out of 65536 | Loss --> 14.363 | Loss_ntp --> 7.132 | Loss_mlm --> 7.231 | Grad_l2 --> 11.854 | Weights_l2 --> 7701.353 | Lr --> 0.001 | Seconds_per_step --> 9.666 |
122
+ [2024-10-21 01:12:00,070][Main][INFO] - [train] Step 2500 out of 65536 | Loss --> 14.333 | Loss_ntp --> 7.121 | Loss_mlm --> 7.212 | Grad_l2 --> 10.363 | Weights_l2 --> 7701.349 | Lr --> 0.001 | Seconds_per_step --> 9.580 |
123
+ [2024-10-21 01:12:28,480][Main][INFO] - [eval] Step 2500 out of 65536 | Loss --> 14.573 | Loss_ntp --> 7.286 | Loss_mlm --> 7.287 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 28.404 |
124
+ [2024-10-21 01:16:30,064][Main][INFO] - [train] Step 2525 out of 65536 | Loss --> 14.280 | Loss_ntp --> 7.089 | Loss_mlm --> 7.192 | Grad_l2 --> 13.178 | Weights_l2 --> 7701.345 | Lr --> 0.001 | Seconds_per_step --> 9.663 |
125
+ [2024-10-21 01:20:29,018][Main][INFO] - [train] Step 2550 out of 65536 | Loss --> 14.260 | Loss_ntp --> 7.091 | Loss_mlm --> 7.169 | Grad_l2 --> 12.381 | Weights_l2 --> 7701.341 | Lr --> 0.001 | Seconds_per_step --> 9.558 |
126
+ [2024-10-21 01:24:31,253][Main][INFO] - [train] Step 2575 out of 65536 | Loss --> 14.259 | Loss_ntp --> 7.078 | Loss_mlm --> 7.182 | Grad_l2 --> 11.247 | Weights_l2 --> 7701.337 | Lr --> 0.001 | Seconds_per_step --> 9.689 |
127
+ [2024-10-21 01:28:31,446][Main][INFO] - [train] Step 2600 out of 65536 | Loss --> 14.259 | Loss_ntp --> 7.080 | Loss_mlm --> 7.179 | Grad_l2 --> 12.524 | Weights_l2 --> 7701.333 | Lr --> 0.001 | Seconds_per_step --> 9.608 |
128
+ [2024-10-21 01:32:31,794][Main][INFO] - [train] Step 2625 out of 65536 | Loss --> 14.245 | Loss_ntp --> 7.068 | Loss_mlm --> 7.178 | Grad_l2 --> 12.087 | Weights_l2 --> 7701.330 | Lr --> 0.001 | Seconds_per_step --> 9.614 |
129
+ [2024-10-21 01:36:32,411][Main][INFO] - [train] Step 2650 out of 65536 | Loss --> 14.247 | Loss_ntp --> 7.074 | Loss_mlm --> 7.173 | Grad_l2 --> 11.638 | Weights_l2 --> 7701.326 | Lr --> 0.001 | Seconds_per_step --> 9.625 |
130
+ [2024-10-21 01:40:33,462][Main][INFO] - [train] Step 2675 out of 65536 | Loss --> 14.274 | Loss_ntp --> 7.086 | Loss_mlm --> 7.189 | Grad_l2 --> 10.415 | Weights_l2 --> 7701.322 | Lr --> 0.001 | Seconds_per_step --> 9.642 |
131
+ [2024-10-21 01:44:33,254][Main][INFO] - [train] Step 2700 out of 65536 | Loss --> 14.276 | Loss_ntp --> 7.097 | Loss_mlm --> 7.179 | Grad_l2 --> 10.830 | Weights_l2 --> 7701.318 | Lr --> 0.001 | Seconds_per_step --> 9.592 |
132
+ [2024-10-21 01:48:34,104][Main][INFO] - [train] Step 2725 out of 65536 | Loss --> 14.322 | Loss_ntp --> 7.117 | Loss_mlm --> 7.205 | Grad_l2 --> 11.668 | Weights_l2 --> 7701.314 | Lr --> 0.001 | Seconds_per_step --> 9.634 |
133
+ [2024-10-21 01:52:33,834][Main][INFO] - [train] Step 2750 out of 65536 | Loss --> 14.393 | Loss_ntp --> 7.149 | Loss_mlm --> 7.244 | Grad_l2 --> 10.585 | Weights_l2 --> 7701.310 | Lr --> 0.001 | Seconds_per_step --> 9.589 |
134
+ [2024-10-21 01:56:33,130][Main][INFO] - [train] Step 2775 out of 65536 | Loss --> 14.326 | Loss_ntp --> 7.124 | Loss_mlm --> 7.202 | Grad_l2 --> 9.862 | Weights_l2 --> 7701.306 | Lr --> 0.001 | Seconds_per_step --> 9.572 |
135
+ [2024-10-21 02:00:34,375][Main][INFO] - [train] Step 2800 out of 65536 | Loss --> 14.354 | Loss_ntp --> 7.134 | Loss_mlm --> 7.220 | Grad_l2 --> 8.484 | Weights_l2 --> 7701.302 | Lr --> 0.001 | Seconds_per_step --> 9.650 |
136
+ [2024-10-21 02:04:34,763][Main][INFO] - [train] Step 2825 out of 65536 | Loss --> 14.320 | Loss_ntp --> 7.118 | Loss_mlm --> 7.202 | Grad_l2 --> 11.118 | Weights_l2 --> 7701.298 | Lr --> 0.001 | Seconds_per_step --> 9.615 |
137
+ [2024-10-21 02:08:35,157][Main][INFO] - [train] Step 2850 out of 65536 | Loss --> 14.323 | Loss_ntp --> 7.124 | Loss_mlm --> 7.199 | Grad_l2 --> 10.821 | Weights_l2 --> 7701.294 | Lr --> 0.001 | Seconds_per_step --> 9.616 |
138
+ [2024-10-21 02:12:34,860][Main][INFO] - [train] Step 2875 out of 65536 | Loss --> 14.348 | Loss_ntp --> 7.129 | Loss_mlm --> 7.219 | Grad_l2 --> 9.481 | Weights_l2 --> 7701.291 | Lr --> 0.001 | Seconds_per_step --> 9.588 |
139
+ [2024-10-21 02:16:36,448][Main][INFO] - [train] Step 2900 out of 65536 | Loss --> 14.413 | Loss_ntp --> 7.163 | Loss_mlm --> 7.250 | Grad_l2 --> 10.586 | Weights_l2 --> 7701.287 | Lr --> 0.001 | Seconds_per_step --> 9.663 |
140
+ [2024-10-21 02:20:36,563][Main][INFO] - [train] Step 2925 out of 65536 | Loss --> 14.319 | Loss_ntp --> 7.113 | Loss_mlm --> 7.206 | Grad_l2 --> 9.175 | Weights_l2 --> 7701.283 | Lr --> 0.001 | Seconds_per_step --> 9.604 |
141
+ [2024-10-21 02:24:36,522][Main][INFO] - [train] Step 2950 out of 65536 | Loss --> 14.292 | Loss_ntp --> 7.112 | Loss_mlm --> 7.179 | Grad_l2 --> 10.380 | Weights_l2 --> 7701.279 | Lr --> 0.001 | Seconds_per_step --> 9.598 |
142
+ [2024-10-21 02:28:36,510][Main][INFO] - [train] Step 2975 out of 65536 | Loss --> 14.202 | Loss_ntp --> 7.068 | Loss_mlm --> 7.134 | Grad_l2 --> 9.622 | Weights_l2 --> 7701.276 | Lr --> 0.001 | Seconds_per_step --> 9.599 |
143
+ [2024-10-21 02:32:38,120][Main][INFO] - [train] Step 3000 out of 65536 | Loss --> 14.214 | Loss_ntp --> 7.066 | Loss_mlm --> 7.147 | Grad_l2 --> 10.228 | Weights_l2 --> 7701.272 | Lr --> 0.001 | Seconds_per_step --> 9.664 |
144
+ [2024-10-21 02:33:06,984][Main][INFO] - [eval] Step 3000 out of 65536 | Loss --> 14.236 | Loss_ntp --> 7.111 | Loss_mlm --> 7.125 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 28.858 |
145
+ [2024-10-21 02:33:06,988][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-3000
146
+ [2024-10-21 02:33:07,000][accelerate.utils.other][WARNING] - Removed shared tensor {'encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
147
+ [2024-10-21 02:33:13,140][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-3000/model.safetensors
148
+ [2024-10-21 02:33:21,968][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-3000/optimizer.bin
149
+ [2024-10-21 02:33:21,978][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-3000/scheduler.bin
150
+ [2024-10-21 02:33:21,979][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-3000/sampler.bin
151
+ [2024-10-21 02:33:21,981][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-3000/sampler_1.bin
152
+ [2024-10-21 02:33:21,990][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-3000/random_states_0.pkl
153
+ [2024-10-21 02:37:21,949][Main][INFO] - [train] Step 3025 out of 65536 | Loss --> 14.180 | Loss_ntp --> 7.041 | Loss_mlm --> 7.138 | Grad_l2 --> 9.928 | Weights_l2 --> 7701.268 | Lr --> 0.001 | Seconds_per_step --> 10.198 |
154
+ [2024-10-21 02:41:23,436][Main][INFO] - [train] Step 3050 out of 65536 | Loss --> 14.163 | Loss_ntp --> 7.032 | Loss_mlm --> 7.130 | Grad_l2 --> 9.909 | Weights_l2 --> 7701.264 | Lr --> 0.001 | Seconds_per_step --> 9.659 |
155
+ [2024-10-21 02:45:23,362][Main][INFO] - [train] Step 3075 out of 65536 | Loss --> 14.109 | Loss_ntp --> 7.016 | Loss_mlm --> 7.093 | Grad_l2 --> 10.119 | Weights_l2 --> 7701.260 | Lr --> 0.001 | Seconds_per_step --> 9.597 |
156
+ [2024-10-21 02:49:23,828][Main][INFO] - [train] Step 3100 out of 65536 | Loss --> 14.053 | Loss_ntp --> 6.981 | Loss_mlm --> 7.072 | Grad_l2 --> 8.917 | Weights_l2 --> 7701.256 | Lr --> 0.001 | Seconds_per_step --> 9.619 |
157
+ [2024-10-21 02:53:26,144][Main][INFO] - [train] Step 3125 out of 65536 | Loss --> 14.045 | Loss_ntp --> 6.975 | Loss_mlm --> 7.069 | Grad_l2 --> 11.184 | Weights_l2 --> 7701.252 | Lr --> 0.001 | Seconds_per_step --> 9.692 |
158
+ [2024-10-21 02:57:25,035][Main][INFO] - [train] Step 3150 out of 65536 | Loss --> 14.006 | Loss_ntp --> 6.959 | Loss_mlm --> 7.047 | Grad_l2 --> 9.280 | Weights_l2 --> 7701.248 | Lr --> 0.001 | Seconds_per_step --> 9.555 |
159
+ [2024-10-21 03:01:27,283][Main][INFO] - [train] Step 3175 out of 65536 | Loss --> 13.943 | Loss_ntp --> 6.924 | Loss_mlm --> 7.020 | Grad_l2 --> 8.769 | Weights_l2 --> 7701.245 | Lr --> 0.001 | Seconds_per_step --> 9.690 |
160
+ [2024-10-21 03:05:27,701][Main][INFO] - [train] Step 3200 out of 65536 | Loss --> 13.956 | Loss_ntp --> 6.916 | Loss_mlm --> 7.040 | Grad_l2 --> 8.625 | Weights_l2 --> 7701.241 | Lr --> 0.001 | Seconds_per_step --> 9.617 |
161
+ [2024-10-21 03:09:28,530][Main][INFO] - [train] Step 3225 out of 65536 | Loss --> 13.916 | Loss_ntp --> 6.906 | Loss_mlm --> 7.010 | Grad_l2 --> 9.378 | Weights_l2 --> 7701.238 | Lr --> 0.001 | Seconds_per_step --> 9.633 |
162
+ [2024-10-21 03:13:28,937][Main][INFO] - [train] Step 3250 out of 65536 | Loss --> 13.849 | Loss_ntp --> 6.867 | Loss_mlm --> 6.982 | Grad_l2 --> 9.221 | Weights_l2 --> 7701.234 | Lr --> 0.001 | Seconds_per_step --> 9.616 |
163
+ [2024-10-21 03:17:29,597][Main][INFO] - [train] Step 3275 out of 65536 | Loss --> 13.854 | Loss_ntp --> 6.869 | Loss_mlm --> 6.985 | Grad_l2 --> 8.561 | Weights_l2 --> 7701.230 | Lr --> 0.001 | Seconds_per_step --> 9.626 |
164
+ [2024-10-21 03:21:30,034][Main][INFO] - [train] Step 3300 out of 65536 | Loss --> 13.781 | Loss_ntp --> 6.843 | Loss_mlm --> 6.938 | Grad_l2 --> 8.919 | Weights_l2 --> 7701.226 | Lr --> 0.001 | Seconds_per_step --> 9.617 |
165
+ [2024-10-21 03:25:29,815][Main][INFO] - [train] Step 3325 out of 65536 | Loss --> 13.766 | Loss_ntp --> 6.836 | Loss_mlm --> 6.930 | Grad_l2 --> 8.129 | Weights_l2 --> 7701.223 | Lr --> 0.001 | Seconds_per_step --> 9.591 |
166
+ [2024-10-21 03:29:30,344][Main][INFO] - [train] Step 3350 out of 65536 | Loss --> 13.726 | Loss_ntp --> 6.809 | Loss_mlm --> 6.917 | Grad_l2 --> 9.145 | Weights_l2 --> 7701.219 | Lr --> 0.001 | Seconds_per_step --> 9.620 |
167
+ [2024-10-21 03:33:30,171][Main][INFO] - [train] Step 3375 out of 65536 | Loss --> 13.751 | Loss_ntp --> 6.819 | Loss_mlm --> 6.932 | Grad_l2 --> 11.666 | Weights_l2 --> 7701.215 | Lr --> 0.001 | Seconds_per_step --> 9.593 |
168
+ [2024-10-21 03:37:32,111][Main][INFO] - [train] Step 3400 out of 65536 | Loss --> 13.700 | Loss_ntp --> 6.796 | Loss_mlm --> 6.905 | Grad_l2 --> 8.776 | Weights_l2 --> 7701.211 | Lr --> 0.001 | Seconds_per_step --> 9.677 |
169
+ [2024-10-21 03:41:31,530][Main][INFO] - [train] Step 3425 out of 65536 | Loss --> 13.641 | Loss_ntp --> 6.774 | Loss_mlm --> 6.868 | Grad_l2 --> 9.206 | Weights_l2 --> 7701.207 | Lr --> 0.001 | Seconds_per_step --> 9.577 |
170
+ [2024-10-21 03:45:33,625][Main][INFO] - [train] Step 3450 out of 65536 | Loss --> 13.588 | Loss_ntp --> 6.735 | Loss_mlm --> 6.852 | Grad_l2 --> 6.293 | Weights_l2 --> 7701.204 | Lr --> 0.001 | Seconds_per_step --> 9.684 |
171
+ [2024-10-21 03:49:34,400][Main][INFO] - [train] Step 3475 out of 65536 | Loss --> 13.615 | Loss_ntp --> 6.748 | Loss_mlm --> 6.868 | Grad_l2 --> 9.161 | Weights_l2 --> 7701.201 | Lr --> 0.001 | Seconds_per_step --> 9.631 |
172
+ [2024-10-21 03:53:35,824][Main][INFO] - [train] Step 3500 out of 65536 | Loss --> 13.532 | Loss_ntp --> 6.707 | Loss_mlm --> 6.825 | Grad_l2 --> 9.556 | Weights_l2 --> 7701.197 | Lr --> 0.001 | Seconds_per_step --> 9.657 |
173
+ [2024-10-21 03:54:04,713][Main][INFO] - [eval] Step 3500 out of 65536 | Loss --> 13.912 | Loss_ntp --> 6.950 | Loss_mlm --> 6.962 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 28.883 |
174
+ [2024-10-21 03:58:05,620][Main][INFO] - [train] Step 3525 out of 65536 | Loss --> 13.463 | Loss_ntp --> 6.677 | Loss_mlm --> 6.786 | Grad_l2 --> 9.458 | Weights_l2 --> 7701.193 | Lr --> 0.001 | Seconds_per_step --> 9.636 |
175
+ [2024-10-21 04:02:06,516][Main][INFO] - [train] Step 3550 out of 65536 | Loss --> 13.419 | Loss_ntp --> 6.654 | Loss_mlm --> 6.766 | Grad_l2 --> 9.819 | Weights_l2 --> 7701.188 | Lr --> 0.001 | Seconds_per_step --> 9.636 |
176
+ [2024-10-21 04:06:07,229][Main][INFO] - [train] Step 3575 out of 65536 | Loss --> 13.362 | Loss_ntp --> 6.626 | Loss_mlm --> 6.736 | Grad_l2 --> 8.944 | Weights_l2 --> 7701.184 | Lr --> 0.001 | Seconds_per_step --> 9.628 |
177
+ [2024-10-21 04:10:08,761][Main][INFO] - [train] Step 3600 out of 65536 | Loss --> 13.401 | Loss_ntp --> 6.628 | Loss_mlm --> 6.773 | Grad_l2 --> 9.904 | Weights_l2 --> 7701.180 | Lr --> 0.001 | Seconds_per_step --> 9.661 |
178
+ [2024-10-21 04:14:09,815][Main][INFO] - [train] Step 3625 out of 65536 | Loss --> 13.361 | Loss_ntp --> 6.625 | Loss_mlm --> 6.736 | Grad_l2 --> 8.507 | Weights_l2 --> 7701.176 | Lr --> 0.001 | Seconds_per_step --> 9.642 |
179
+ [2024-10-21 04:18:10,037][Main][INFO] - [train] Step 3650 out of 65536 | Loss --> 13.355 | Loss_ntp --> 6.614 | Loss_mlm --> 6.741 | Grad_l2 --> 9.056 | Weights_l2 --> 7701.172 | Lr --> 0.001 | Seconds_per_step --> 9.609 |
180
+ [2024-10-21 04:22:10,677][Main][INFO] - [train] Step 3675 out of 65536 | Loss --> 13.306 | Loss_ntp --> 6.586 | Loss_mlm --> 6.720 | Grad_l2 --> 9.057 | Weights_l2 --> 7701.168 | Lr --> 0.001 | Seconds_per_step --> 9.625 |
181
+ [2024-10-21 04:26:12,857][Main][INFO] - [train] Step 3700 out of 65536 | Loss --> 13.325 | Loss_ntp --> 6.596 | Loss_mlm --> 6.729 | Grad_l2 --> 10.732 | Weights_l2 --> 7701.163 | Lr --> 0.001 | Seconds_per_step --> 9.687 |
182
+ [2024-10-21 04:30:11,816][Main][INFO] - [train] Step 3725 out of 65536 | Loss --> 13.239 | Loss_ntp --> 6.561 | Loss_mlm --> 6.678 | Grad_l2 --> 9.810 | Weights_l2 --> 7701.160 | Lr --> 0.001 | Seconds_per_step --> 9.558 |
183
+ [2024-10-21 04:34:12,167][Main][INFO] - [train] Step 3750 out of 65536 | Loss --> 13.211 | Loss_ntp --> 6.534 | Loss_mlm --> 6.677 | Grad_l2 --> 10.011 | Weights_l2 --> 7701.156 | Lr --> 0.001 | Seconds_per_step --> 9.614 |
184
+ [2024-10-21 04:38:14,046][Main][INFO] - [train] Step 3775 out of 65536 | Loss --> 13.214 | Loss_ntp --> 6.537 | Loss_mlm --> 6.678 | Grad_l2 --> 8.939 | Weights_l2 --> 7701.152 | Lr --> 0.001 | Seconds_per_step --> 9.675 |
185
+ [2024-10-21 04:42:14,454][Main][INFO] - [train] Step 3800 out of 65536 | Loss --> 13.148 | Loss_ntp --> 6.508 | Loss_mlm --> 6.640 | Grad_l2 --> 9.513 | Weights_l2 --> 7701.148 | Lr --> 0.001 | Seconds_per_step --> 9.616 |
186
+ [2024-10-21 04:46:14,554][Main][INFO] - [train] Step 3825 out of 65536 | Loss --> 13.172 | Loss_ntp --> 6.514 | Loss_mlm --> 6.658 | Grad_l2 --> 9.295 | Weights_l2 --> 7701.144 | Lr --> 0.001 | Seconds_per_step --> 9.604 |
187
+ [2024-10-21 04:50:14,762][Main][INFO] - [train] Step 3850 out of 65536 | Loss --> 13.118 | Loss_ntp --> 6.494 | Loss_mlm --> 6.624 | Grad_l2 --> 7.890 | Weights_l2 --> 7701.140 | Lr --> 0.001 | Seconds_per_step --> 9.608 |
188
+ [2024-10-21 04:54:16,032][Main][INFO] - [train] Step 3875 out of 65536 | Loss --> 13.179 | Loss_ntp --> 6.521 | Loss_mlm --> 6.657 | Grad_l2 --> 9.901 | Weights_l2 --> 7701.136 | Lr --> 0.001 | Seconds_per_step --> 9.651 |
189
+ [2024-10-21 04:58:16,128][Main][INFO] - [train] Step 3900 out of 65536 | Loss --> 13.259 | Loss_ntp --> 6.571 | Loss_mlm --> 6.687 | Grad_l2 --> 8.910 | Weights_l2 --> 7701.132 | Lr --> 0.001 | Seconds_per_step --> 9.604 |
test.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Smoke-test script: load a nanoT5 pretraining checkpoint and generate a short
# continuation for a fixed prompt.

# Tokenizer used during pretraining.
tokenizer = AutoTokenizer.from_pretrained("BEE-spoke-data/hf_slimpajama-6B-28672-BPE-forT5")

# Register the UL2-style task/sentinel tokens used by the training objectives.
special_tokens_dict = {'additional_special_tokens': ['[R]', '[S]', '[X]', '[NTP]']}
tokenizer.add_special_tokens(special_tokens_dict)

# NOTE(review): if these special tokens were not already part of the
# checkpoint's vocabulary, the embedding matrix must be resized to match:
#   model.resize_token_embeddings(len(tokenizer))
# Otherwise encoding the new ids will index past the embedding table.
# Presumably the pretraining tokenizer already contained them — TODO confirm.

model = AutoModelForSeq2SeqLM.from_pretrained(
    "/workspace/nanoT5/logs/2024-10-20/18-25-17/checkpoint-pt-27000"
).to("cuda")

prompt = "The "
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# generate() derives decoder_input_ids from config.decoder_start_token_id for
# seq2seq models, so passing them explicitly is unnecessary (the previously
# commented-out manual construction — and its torch import — were removed).
generated_ids = model.generate(
    **inputs,
    max_new_tokens=20,
    no_repeat_ngram_size=5,
)

# Decode the first (only) sequence, dropping sentinel/special tokens.
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(generated_text)
wandb/debug-internal.log ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {"time":"2024-10-20T18:25:18.130390854Z","level":"INFO","msg":"using version","core version":"0.18.5"}
2
+ {"time":"2024-10-20T18:25:18.131160825Z","level":"INFO","msg":"created symlink","path":"/workspace/nanoT5/logs/2024-10-20/18-25-17/wandb/run-20241020_182518-i0qk9v3k/logs/debug-core.log"}
3
+ {"time":"2024-10-20T18:25:18.247302473Z","level":"INFO","msg":"created new stream","id":"i0qk9v3k"}
4
+ {"time":"2024-10-20T18:25:18.247577857Z","level":"INFO","msg":"stream: started","id":"i0qk9v3k"}
5
+ {"time":"2024-10-20T18:25:18.247668586Z","level":"INFO","msg":"handler: started","stream_id":{"value":"i0qk9v3k"}}
6
+ {"time":"2024-10-20T18:25:18.247659857Z","level":"INFO","msg":"sender: started","stream_id":"i0qk9v3k"}
7
+ {"time":"2024-10-20T18:25:18.247631762Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"i0qk9v3k"}}
8
+ {"time":"2024-10-20T18:25:19.59293904Z","level":"INFO","msg":"Starting system monitor"}
wandb/debug.log ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-10-20 18:25:18,064 INFO MainThread:4102 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
2
+ 2024-10-20 18:25:18,064 INFO MainThread:4102 [wandb_setup.py:_flush():79] Configure stats pid to 4102
3
+ 2024-10-20 18:25:18,065 INFO MainThread:4102 [wandb_setup.py:_flush():79] Loading settings from /root/.config/wandb/settings
4
+ 2024-10-20 18:25:18,065 INFO MainThread:4102 [wandb_setup.py:_flush():79] Loading settings from /workspace/nanoT5/logs/2024-10-20/18-25-17/wandb/settings
5
+ 2024-10-20 18:25:18,066 INFO MainThread:4102 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
6
+ 2024-10-20 18:25:18,066 INFO MainThread:4102 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
7
+ 2024-10-20 18:25:18,067 WARNING MainThread:4102 [wandb_setup.py:_flush():79] Could not find program at -m nanoT5.main
8
+ 2024-10-20 18:25:18,067 INFO MainThread:4102 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m nanoT5.main'}
9
+ 2024-10-20 18:25:18,068 INFO MainThread:4102 [wandb_setup.py:_flush():79] Applying login settings: {}
10
+ 2024-10-20 18:25:18,069 INFO MainThread:4102 [wandb_init.py:_log_setup():534] Logging user logs to /workspace/nanoT5/logs/2024-10-20/18-25-17/wandb/run-20241020_182518-i0qk9v3k/logs/debug.log
11
+ 2024-10-20 18:25:18,071 INFO MainThread:4102 [wandb_init.py:_log_setup():535] Logging internal logs to /workspace/nanoT5/logs/2024-10-20/18-25-17/wandb/run-20241020_182518-i0qk9v3k/logs/debug-internal.log
12
+ 2024-10-20 18:25:18,071 INFO MainThread:4102 [wandb_init.py:init():621] calling init triggers
13
+ 2024-10-20 18:25:18,072 INFO MainThread:4102 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
14
+ config: {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 93789, 'tokenizer': {'name': 'BEE-spoke-data/hf_slimpajama-6B-28672-BPE-forT5'}, 'working_dir': '/workspace/nanoT5/logs/2024-10-20/18-25-17', 'model': {'liger': True, 'klass': 'local_t5', 'name': 'pszemraj/tFINE-850m-24x24-1024ctx', 'overwrite': {'dropout_rate': 0.0, 'num_decoder_layers': 16, 'num_key_value_heads': 4, 'num_layers': 16, 'use_gqa': True}, 'add_config': {'is_bf16': True}, 'checkpoint_path': '', 'random_init': True, 'compile': True}, 'data': {'multi_task': True, 'NTP': 0.3, 'input_length': 512, 'max_seq_len': 512, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 0}, 'optim': {'name': 'adamwscale', 'base_lr': 0.001, 'batch_size': 128, 'total_steps': 65536, 'epochs': -1, 'warmup_steps': 5000, 'lr_scheduler': 'cosine', 'weight_decay': 0.01, 'grad_clip': 1.0, 'grad_acc': 16, 'final_cosine': 2e-05}, 'eval': {'every_steps': 500, 'steps': 0}, 'checkpoint': {'every_steps': 1500}, 'logging': {'every_steps': 25, 'grad_l2': True, 'weights_l2': True, 'use_wandb': True, 'wandb_config': {'project': 'nanoT5', 'entity': 'amazingvince', 'tags': ['gqa', 'large', 'e32-d16', '512 ctx'], 'mode': 'online'}}, 'slurm_id': 'none'}
15
+ 2024-10-20 18:25:18,073 INFO MainThread:4102 [wandb_init.py:init():671] starting backend
16
+ 2024-10-20 18:25:18,074 INFO MainThread:4102 [wandb_init.py:init():675] sending inform_init request
17
+ 2024-10-20 18:25:18,121 INFO MainThread:4102 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
18
+ 2024-10-20 18:25:18,122 INFO MainThread:4102 [wandb_init.py:init():688] backend started and connected
19
+ 2024-10-20 18:25:18,198 INFO MainThread:4102 [wandb_init.py:init():783] updated telemetry
20
+ 2024-10-20 18:25:18,256 INFO MainThread:4102 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
21
+ 2024-10-20 18:25:19,558 INFO MainThread:4102 [wandb_init.py:init():867] starting run threads in backend
22
+ 2024-10-20 18:25:19,755 INFO MainThread:4102 [wandb_run.py:_console_start():2463] atexit reg
23
+ 2024-10-20 18:25:19,756 INFO MainThread:4102 [wandb_run.py:_redirect():2311] redirect: wrap_raw
24
+ 2024-10-20 18:25:19,757 INFO MainThread:4102 [wandb_run.py:_redirect():2376] Wrapping output streams.
25
+ 2024-10-20 18:25:19,759 INFO MainThread:4102 [wandb_run.py:_redirect():2401] Redirects installed.
26
+ 2024-10-20 18:25:19,763 INFO MainThread:4102 [wandb_init.py:init():911] run started, returning control to user process
27
+ 2024-10-20 18:25:41,763 INFO MainThread:4102 [wandb_run.py:_config_callback():1390] config_cb None None {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 93789, 'tokenizer': {'name': 'BEE-spoke-data/hf_slimpajama-6B-28672-BPE-forT5'}, 'working_dir': '/workspace/nanoT5/logs/2024-10-20/18-25-17', 'model': {'liger': True, 'klass': 'local_t5', 'name': 'pszemraj/tFINE-850m-24x24-1024ctx', 'overwrite': {'dropout_rate': 0.0, 'num_decoder_layers': 16, 'num_key_value_heads': 4, 'num_layers': 16, 'use_gqa': True}, 'add_config': {'is_bf16': True}, 'checkpoint_path': '', 'random_init': True, 'compile': True}, 'data': {'multi_task': True, 'NTP': 0.3, 'input_length': 512, 'max_seq_len': 512, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 0, 'before_mask_input_length': 568, 'target_length': 114}, 'optim': {'name': 'adamwscale', 'base_lr': 0.001, 'batch_size': 128, 'total_steps': 65536, 'epochs': -1, 'warmup_steps': 5000, 'lr_scheduler': 'cosine', 'weight_decay': 0.01, 'grad_clip': 1.0, 'grad_acc': 16, 'final_cosine': 2e-05}, 'eval': {'every_steps': 500, 'steps': 0, 'corrected_steps': 0}, 'checkpoint': {'every_steps': 1500}, 'logging': {'every_steps': 25, 'grad_l2': True, 'weights_l2': True, 'use_wandb': True, 'wandb_config': {'project': 'nanoT5', 'entity': 'amazingvince', 'tags': ['gqa', 'large', 'e32-d16', '512 ctx'], 'mode': 'online'}}, 'slurm_id': 'none', 'n_all_param': 486886912}
28
+ 2024-10-24 02:27:45,254 WARNING MsgRouterThr:4102 [router.py:message_loop():77] message_loop has been closed
wandb/run-20241020_182518-i0qk9v3k/files/config.yaml ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.18.5
4
+ m: []
5
+ python_version: 3.11.10
6
+ t:
7
+ "1":
8
+ - 1
9
+ - 11
10
+ - 41
11
+ - 49
12
+ - 50
13
+ - 51
14
+ - 55
15
+ - 71
16
+ - 100
17
+ "2":
18
+ - 1
19
+ - 11
20
+ - 41
21
+ - 49
22
+ - 50
23
+ - 51
24
+ - 55
25
+ - 71
26
+ - 100
27
+ "3":
28
+ - 15
29
+ - 16
30
+ - 23
31
+ - 55
32
+ - 61
33
+ "4": 3.11.10
34
+ "5": 0.18.5
35
+ "6": 4.46.0.dev0
36
+ "8":
37
+ - 5
38
+ "12": 0.18.5
39
+ "13": linux-x86_64
40
+ checkpoint:
41
+ value:
42
+ every_steps: 1500
43
+ data:
44
+ value:
45
+ NTP: 0.3
46
+ before_mask_input_length: 568
47
+ input_length: 512
48
+ max_seq_len: 512
49
+ mean_noise_span_length: 3
50
+ mlm_probability: 0.15
51
+ multi_task: true
52
+ num_workers: 0
53
+ target_length: 114
54
+ device:
55
+ value: gpu
56
+ eval:
57
+ value:
58
+ corrected_steps: 0
59
+ every_steps: 500
60
+ steps: 0
61
+ eval_only:
62
+ value: false
63
+ logging:
64
+ value:
65
+ every_steps: 25
66
+ grad_l2: true
67
+ use_wandb: true
68
+ wandb_config:
69
+ entity: amazingvince
70
+ mode: online
71
+ project: nanoT5
72
+ tags:
73
+ - gqa
74
+ - large
75
+ - e32-d16
76
+ - 512 ctx
77
+ weights_l2: true
78
+ mode:
79
+ value: pt
80
+ model:
81
+ value:
82
+ add_config:
83
+ is_bf16: true
84
+ checkpoint_path: ""
85
+ compile: true
86
+ klass: local_t5
87
+ liger: true
88
+ name: pszemraj/tFINE-850m-24x24-1024ctx
89
+ overwrite:
90
+ dropout_rate: 0
91
+ num_decoder_layers: 16
92
+ num_key_value_heads: 4
93
+ num_layers: 16
94
+ use_gqa: true
95
+ random_init: true
96
+ n_all_param:
97
+ value: 486886912
98
+ optim:
99
+ value:
100
+ base_lr: 0.001
101
+ batch_size: 128
102
+ epochs: -1
103
+ final_cosine: 2e-05
104
+ grad_acc: 16
105
+ grad_clip: 1
106
+ lr_scheduler: cosine
107
+ name: adamwscale
108
+ total_steps: 65536
109
+ warmup_steps: 5000
110
+ weight_decay: 0.01
111
+ precision:
112
+ value: bf16
113
+ predict_only:
114
+ value: false
115
+ seed:
116
+ value: 93789
117
+ slurm_id:
118
+ value: none
119
+ tokenizer:
120
+ value:
121
+ name: BEE-spoke-data/hf_slimpajama-6B-28672-BPE-forT5
122
+ working_dir:
123
+ value: /workspace/nanoT5/logs/2024-10-20/18-25-17
wandb/run-20241020_182518-i0qk9v3k/files/output.log ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Using tokenizer: BEE-spoke-data/hf_slimpajama-6B-28672-BPE-forT5
2
+ loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--hf_slimpajama-6B-28672-BPE-forT5/snapshots/1ed7c49f137e7b3e2f211669f66c8f51dd392f20/tokenizer.model
3
+ loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--hf_slimpajama-6B-28672-BPE-forT5/snapshots/1ed7c49f137e7b3e2f211669f66c8f51dd392f20/tokenizer.json
4
+ loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--hf_slimpajama-6B-28672-BPE-forT5/snapshots/1ed7c49f137e7b3e2f211669f66c8f51dd392f20/added_tokens.json
5
+ loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--hf_slimpajama-6B-28672-BPE-forT5/snapshots/1ed7c49f137e7b3e2f211669f66c8f51dd392f20/special_tokens_map.json
6
+ loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--hf_slimpajama-6B-28672-BPE-forT5/snapshots/1ed7c49f137e7b3e2f211669f66c8f51dd392f20/tokenizer_config.json
7
+ Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
8
+ loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--pszemraj--tFINE-850m-24x24-1024ctx/snapshots/bbbb8d2ac68f72ce0129f29dd22428c4b219224c/config.json
9
+ Model config T5Config {
10
+ "_name_or_path": "pszemraj/tFINE-850m-24x24-1024ctx",
11
+ "architectures": [
12
+ "T5ForConditionalGeneration"
13
+ ],
14
+ "classifier_dropout": 0.0,
15
+ "d_ff": 3072,
16
+ "d_kv": 64,
17
+ "d_model": 1024,
18
+ "decoder_start_token_id": 3,
19
+ "dense_act_fn": "silu",
20
+ "dropout_rate": 0.0,
21
+ "eos_token_id": 2,
22
+ "feed_forward_proj": "gated-silu",
23
+ "initializer_factor": 1.0,
24
+ "is_encoder_decoder": true,
25
+ "is_gated_act": true,
26
+ "layer_norm_epsilon": 1e-06,
27
+ "model_type": "t5",
28
+ "num_decoder_layers": 24,
29
+ "num_heads": 16,
30
+ "num_key_value_heads": 8,
31
+ "num_layers": 24,
32
+ "output_past": true,
33
+ "pad_token_id": 3,
34
+ "relative_attention_max_distance": 128,
35
+ "relative_attention_num_buckets": 48,
36
+ "tie_word_embeddings": false,
37
+ "transformers_version": "4.46.0.dev0",
38
+ "use_cache": true,
39
+ "use_gqa": false,
40
+ "vocab_size": 48256
41
+ }
42
+
43
+ Resolving data files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 104/104 [00:00<00:00, 485.45it/s]
44
+ Resolving data files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 234/234 [00:00<00:00, 14950.45it/s]
45
+ Resolving data files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1024/1024 [00:04<00:00, 207.69it/s]
46
+ Resolving data files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1024/1024 [00:00<00:00, 170394.64it/s]
47
+ =========================================================================
48
+ Layer (type:depth-idx) Output Shape Param # Trainable
49
+ =========================================================================
50
+ MyT5 486,837,760 True
51
+ Embedding 29,466,624 True
52
+ T5Stack 222,439,168 True
53
+ Embedding 29,466,624 True
54
+ ModuleList 192,971,520 True
55
+ T5LayerNorm 1,024 True
56
+ Dropout -- False
57
+ T5Stack 264,398,592 True
58
+ Embedding 29,466,624 True
59
+ ModuleList 234,930,944 True
60
+ T5LayerNorm 1,024 True
61
+ Dropout -- False
62
+ Linear 29,466,624 True
63
+ LigerCrossEntropyLoss -- False
64
+ =========================================================================
65
+ Total params: 486,837,760
66
+ Trainable params: 486,837,760
67
+ Non-trainable params: --
68
+ =========================================================================
69
+ Configuration saved in ./config.json
70
+ W1020 18:25:54.835000 139930872341632 torch/_dynamo/variables/tensor.py:715] [0/0] Graph break from `Tensor.item()`, consider setting:
71
+ W1020 18:25:54.835000 139930872341632 torch/_dynamo/variables/tensor.py:715] [0/0] torch._dynamo.config.capture_scalar_outputs = True
72
+ W1020 18:25:54.835000 139930872341632 torch/_dynamo/variables/tensor.py:715] [0/0] or:
73
+ W1020 18:25:54.835000 139930872341632 torch/_dynamo/variables/tensor.py:715] [0/0] env TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1
74
+ W1020 18:25:54.835000 139930872341632 torch/_dynamo/variables/tensor.py:715] [0/0] to include these operations in the captured graph.
75
+ W1020 18:25:54.835000 139930872341632 torch/_dynamo/variables/tensor.py:715] [0/0]
76
+ W1020 18:27:14.737000 139920144193088 torch/fx/experimental/symbolic_shapes.py:4449] [2/0_1] r0 is not in var_ranges, defaulting to unknown range.
77
+ W1020 18:27:21.491000 139920144193088 torch/fx/experimental/symbolic_shapes.py:4449] [2/0_1] q0 is not in var_ranges, defaulting to unknown range.
78
+ W1020 18:27:21.545000 139920144193088 torch/fx/experimental/symbolic_shapes.py:4449] [2/0_1] z0 is not in var_ranges, defaulting to unknown range.
79
+ W1020 18:27:27.722000 139920144193088 torch/fx/experimental/symbolic_shapes.py:4449] [2/0_1] x1 is not in var_ranges, defaulting to unknown range.
80
+ [2024-10-20 18:31:35,111][Main][INFO] - [train] Step 25 out of 65536 | Loss --> 155.837 | Loss_ntp --> 76.275 | Loss_mlm --> 79.561 | Grad_l2 --> 476.354 | Weights_l2 --> 7701.821 | Lr --> 0.001 | Seconds_per_step --> 14.044 |
81
+ [2024-10-20 18:35:35,171][Main][INFO] - [train] Step 50 out of 65536 | Loss --> 98.644 | Loss_ntp --> 48.540 | Loss_mlm --> 50.105 | Grad_l2 --> 234.932 | Weights_l2 --> 7701.813 | Lr --> 0.001 | Seconds_per_step --> 9.602 |
82
+ [2024-10-20 18:39:35,197][Main][INFO] - [train] Step 75 out of 65536 | Loss --> 86.994 | Loss_ntp --> 42.861 | Loss_mlm --> 44.133 | Grad_l2 --> 180.388 | Weights_l2 --> 7701.806 | Lr --> 0.001 | Seconds_per_step --> 9.601 |
83
+ [2024-10-20 18:43:35,733][Main][INFO] - [train] Step 100 out of 65536 | Loss --> 80.568 | Loss_ntp --> 39.806 | Loss_mlm --> 40.762 | Grad_l2 --> 156.732 | Weights_l2 --> 7701.800 | Lr --> 0.001 | Seconds_per_step --> 9.621 |
84
+ [2024-10-20 18:47:37,016][Main][INFO] - [train] Step 125 out of 65536 | Loss --> 77.131 | Loss_ntp --> 38.127 | Loss_mlm --> 39.004 | Grad_l2 --> 179.590 | Weights_l2 --> 7701.794 | Lr --> 0.001 | Seconds_per_step --> 9.651 |
85
+ [2024-10-20 18:51:38,437][Main][INFO] - [train] Step 150 out of 65536 | Loss --> 73.900 | Loss_ntp --> 36.620 | Loss_mlm --> 37.281 | Grad_l2 --> 161.591 | Weights_l2 --> 7701.789 | Lr --> 0.001 | Seconds_per_step --> 9.657 |
86
+ [2024-10-20 18:55:39,020][Main][INFO] - [train] Step 175 out of 65536 | Loss --> 72.118 | Loss_ntp --> 35.763 | Loss_mlm --> 36.355 | Grad_l2 --> 161.741 | Weights_l2 --> 7701.783 | Lr --> 0.001 | Seconds_per_step --> 9.623 |
87
+ [2024-10-20 18:59:40,344][Main][INFO] - [train] Step 200 out of 65536 | Loss --> 70.712 | Loss_ntp --> 35.041 | Loss_mlm --> 35.671 | Grad_l2 --> 154.736 | Weights_l2 --> 7701.778 | Lr --> 0.001 | Seconds_per_step --> 9.653 |
88
+ [2024-10-20 19:03:39,817][Main][INFO] - [train] Step 225 out of 65536 | Loss --> 69.050 | Loss_ntp --> 34.233 | Loss_mlm --> 34.817 | Grad_l2 --> 106.908 | Weights_l2 --> 7701.772 | Lr --> 0.001 | Seconds_per_step --> 9.579 |
89
+ [2024-10-20 19:07:41,876][Main][INFO] - [train] Step 250 out of 65536 | Loss --> 68.595 | Loss_ntp --> 33.970 | Loss_mlm --> 34.625 | Grad_l2 --> 126.557 | Weights_l2 --> 7701.767 | Lr --> 0.001 | Seconds_per_step --> 9.682 |
90
+ [2024-10-20 19:11:43,944][Main][INFO] - [train] Step 275 out of 65536 | Loss --> 67.141 | Loss_ntp --> 33.297 | Loss_mlm --> 33.844 | Grad_l2 --> 114.874 | Weights_l2 --> 7701.762 | Lr --> 0.001 | Seconds_per_step --> 9.683 |
91
+ [2024-10-20 19:15:43,786][Main][INFO] - [train] Step 300 out of 65536 | Loss --> 65.916 | Loss_ntp --> 32.693 | Loss_mlm --> 33.223 | Grad_l2 --> 89.430 | Weights_l2 --> 7701.757 | Lr --> 0.001 | Seconds_per_step --> 9.594 |
92
+ [2024-10-20 19:19:45,206][Main][INFO] - [train] Step 325 out of 65536 | Loss --> 65.322 | Loss_ntp --> 32.362 | Loss_mlm --> 32.960 | Grad_l2 --> 97.785 | Weights_l2 --> 7701.751 | Lr --> 0.001 | Seconds_per_step --> 9.657 |
93
+ [2024-10-20 19:23:45,072][Main][INFO] - [train] Step 350 out of 65536 | Loss --> 64.367 | Loss_ntp --> 31.937 | Loss_mlm --> 32.430 | Grad_l2 --> 83.882 | Weights_l2 --> 7701.746 | Lr --> 0.001 | Seconds_per_step --> 9.595 |
94
+ [2024-10-20 19:27:46,534][Main][INFO] - [train] Step 375 out of 65536 | Loss --> 63.409 | Loss_ntp --> 31.433 | Loss_mlm --> 31.975 | Grad_l2 --> 75.548 | Weights_l2 --> 7701.741 | Lr --> 0.001 | Seconds_per_step --> 9.658 |
95
+ [2024-10-20 19:31:45,390][Main][INFO] - [train] Step 400 out of 65536 | Loss --> 62.292 | Loss_ntp --> 30.925 | Loss_mlm --> 31.367 | Grad_l2 --> 72.299 | Weights_l2 --> 7701.736 | Lr --> 0.001 | Seconds_per_step --> 9.554 |
96
+ [2024-10-20 19:35:46,689][Main][INFO] - [train] Step 425 out of 65536 | Loss --> 61.685 | Loss_ntp --> 30.585 | Loss_mlm --> 31.100 | Grad_l2 --> 73.838 | Weights_l2 --> 7701.731 | Lr --> 0.001 | Seconds_per_step --> 9.652 |
97
+ [2024-10-20 19:39:46,030][Main][INFO] - [train] Step 450 out of 65536 | Loss --> 61.416 | Loss_ntp --> 30.509 | Loss_mlm --> 30.907 | Grad_l2 --> 79.820 | Weights_l2 --> 7701.726 | Lr --> 0.001 | Seconds_per_step --> 9.573 |
98
+ [2024-10-20 19:43:47,298][Main][INFO] - [train] Step 475 out of 65536 | Loss --> 60.536 | Loss_ntp --> 30.069 | Loss_mlm --> 30.467 | Grad_l2 --> 59.074 | Weights_l2 --> 7701.722 | Lr --> 0.001 | Seconds_per_step --> 9.651 |
99
+ [2024-10-20 19:47:48,778][Main][INFO] - [train] Step 500 out of 65536 | Loss --> 60.085 | Loss_ntp --> 29.838 | Loss_mlm --> 30.246 | Grad_l2 --> 71.417 | Weights_l2 --> 7701.717 | Lr --> 0.001 | Seconds_per_step --> 9.659 |
100
+ [2024-10-20 19:49:25,862][Main][INFO] - [eval] Step 500 out of 65536 | Loss --> 57.611 | Loss_ntp --> 28.694 | Loss_mlm --> 28.917 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 97.080 |
101
+ [2024-10-20 19:53:26,482][Main][INFO] - [train] Step 525 out of 65536 | Loss --> 59.106 | Loss_ntp --> 29.371 | Loss_mlm --> 29.735 | Grad_l2 --> 56.829 | Weights_l2 --> 7701.712 | Lr --> 0.001 | Seconds_per_step --> 9.625 |
102
+ [2024-10-20 19:57:25,811][Main][INFO] - [train] Step 550 out of 65536 | Loss --> 58.185 | Loss_ntp --> 28.950 | Loss_mlm --> 29.235 | Grad_l2 --> 56.368 | Weights_l2 --> 7701.707 | Lr --> 0.001 | Seconds_per_step --> 9.573 |
103
+ [2024-10-20 20:01:26,095][Main][INFO] - [train] Step 575 out of 65536 | Loss --> 57.301 | Loss_ntp --> 28.480 | Loss_mlm --> 28.821 | Grad_l2 --> 39.860 | Weights_l2 --> 7701.703 | Lr --> 0.001 | Seconds_per_step --> 9.611 |
104
+ [2024-10-20 20:05:26,649][Main][INFO] - [train] Step 600 out of 65536 | Loss --> 56.020 | Loss_ntp --> 27.906 | Loss_mlm --> 28.115 | Grad_l2 --> 35.414 | Weights_l2 --> 7701.698 | Lr --> 0.001 | Seconds_per_step --> 9.622 |
105
+ [2024-10-20 20:09:28,597][Main][INFO] - [train] Step 625 out of 65536 | Loss --> 55.363 | Loss_ntp --> 27.524 | Loss_mlm --> 27.840 | Grad_l2 --> 50.531 | Weights_l2 --> 7701.694 | Lr --> 0.001 | Seconds_per_step --> 9.678 |
106
+ [2024-10-20 20:13:29,399][Main][INFO] - [train] Step 650 out of 65536 | Loss --> 54.803 | Loss_ntp --> 27.252 | Loss_mlm --> 27.551 | Grad_l2 --> 56.108 | Weights_l2 --> 7701.689 | Lr --> 0.001 | Seconds_per_step --> 9.632 |
107
+ [2024-10-20 20:17:31,948][Main][INFO] - [train] Step 675 out of 65536 | Loss --> 53.970 | Loss_ntp --> 26.793 | Loss_mlm --> 27.176 | Grad_l2 --> 46.473 | Weights_l2 --> 7701.685 | Lr --> 0.001 | Seconds_per_step --> 9.702 |
108
+ [2024-10-20 20:21:31,196][Main][INFO] - [train] Step 700 out of 65536 | Loss --> 53.056 | Loss_ntp --> 26.359 | Loss_mlm --> 26.697 | Grad_l2 --> 37.435 | Weights_l2 --> 7701.680 | Lr --> 0.001 | Seconds_per_step --> 9.570 |
109
+ [2024-10-20 20:25:33,347][Main][INFO] - [train] Step 725 out of 65536 | Loss --> 52.070 | Loss_ntp --> 25.876 | Loss_mlm --> 26.194 | Grad_l2 --> 43.881 | Weights_l2 --> 7701.676 | Lr --> 0.001 | Seconds_per_step --> 9.686 |
110
+ [2024-10-20 20:29:33,004][Main][INFO] - [train] Step 750 out of 65536 | Loss --> 51.191 | Loss_ntp --> 25.456 | Loss_mlm --> 25.735 | Grad_l2 --> 44.855 | Weights_l2 --> 7701.672 | Lr --> 0.001 | Seconds_per_step --> 9.586 |
111
+ [2024-10-20 20:33:34,557][Main][INFO] - [train] Step 775 out of 65536 | Loss --> 50.129 | Loss_ntp --> 24.891 | Loss_mlm --> 25.239 | Grad_l2 --> 40.117 | Weights_l2 --> 7701.667 | Lr --> 0.001 | Seconds_per_step --> 9.662 |
112
+ [2024-10-20 20:37:33,242][Main][INFO] - [train] Step 800 out of 65536 | Loss --> 49.019 | Loss_ntp --> 24.361 | Loss_mlm --> 24.658 | Grad_l2 --> 39.953 | Weights_l2 --> 7701.663 | Lr --> 0.001 | Seconds_per_step --> 9.547 |
113
+ [2024-10-20 20:41:33,285][Main][INFO] - [train] Step 825 out of 65536 | Loss --> 48.160 | Loss_ntp --> 23.923 | Loss_mlm --> 24.238 | Grad_l2 --> 42.816 | Weights_l2 --> 7701.659 | Lr --> 0.001 | Seconds_per_step --> 9.602 |
114
+ [2024-10-20 20:45:34,352][Main][INFO] - [train] Step 850 out of 65536 | Loss --> 46.672 | Loss_ntp --> 23.149 | Loss_mlm --> 23.522 | Grad_l2 --> 42.230 | Weights_l2 --> 7701.654 | Lr --> 0.001 | Seconds_per_step --> 9.643 |
115
+ [2024-10-20 20:49:34,963][Main][INFO] - [train] Step 875 out of 65536 | Loss --> 44.855 | Loss_ntp --> 22.279 | Loss_mlm --> 22.575 | Grad_l2 --> 39.123 | Weights_l2 --> 7701.650 | Lr --> 0.001 | Seconds_per_step --> 9.624 |
116
+ [2024-10-20 20:53:36,677][Main][INFO] - [train] Step 900 out of 65536 | Loss --> 42.480 | Loss_ntp --> 21.057 | Loss_mlm --> 21.423 | Grad_l2 --> 50.501 | Weights_l2 --> 7701.645 | Lr --> 0.001 | Seconds_per_step --> 9.668 |
117
+ [2024-10-20 20:57:37,186][Main][INFO] - [train] Step 925 out of 65536 | Loss --> 40.028 | Loss_ntp --> 19.877 | Loss_mlm --> 20.151 | Grad_l2 --> 57.109 | Weights_l2 --> 7701.640 | Lr --> 0.001 | Seconds_per_step --> 9.620 |
118
+ [2024-10-20 21:01:38,800][Main][INFO] - [train] Step 950 out of 65536 | Loss --> 37.058 | Loss_ntp --> 18.359 | Loss_mlm --> 18.699 | Grad_l2 --> 78.443 | Weights_l2 --> 7701.634 | Lr --> 0.001 | Seconds_per_step --> 9.664 |
119
+ [2024-10-20 21:05:38,405][Main][INFO] - [train] Step 975 out of 65536 | Loss --> 33.534 | Loss_ntp --> 16.618 | Loss_mlm --> 16.917 | Grad_l2 --> 87.220 | Weights_l2 --> 7701.628 | Lr --> 0.001 | Seconds_per_step --> 9.584 |
120
+ [2024-10-20 21:09:41,153][Main][INFO] - [train] Step 1000 out of 65536 | Loss --> 29.988 | Loss_ntp --> 14.857 | Loss_mlm --> 15.131 | Grad_l2 --> 88.279 | Weights_l2 --> 7701.622 | Lr --> 0.001 | Seconds_per_step --> 9.710 |
121
+ [2024-10-20 21:10:10,310][Main][INFO] - [eval] Step 1000 out of 65536 | Loss --> 28.033 | Loss_ntp --> 13.938 | Loss_mlm --> 14.095 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 29.143 |
122
+ [2024-10-20 21:14:10,580][Main][INFO] - [train] Step 1025 out of 65536 | Loss --> 26.588 | Loss_ntp --> 13.166 | Loss_mlm --> 13.423 | Grad_l2 --> 109.226 | Weights_l2 --> 7701.616 | Lr --> 0.001 | Seconds_per_step --> 9.611 |
123
+ [2024-10-20 21:18:12,558][Main][INFO] - [train] Step 1050 out of 65536 | Loss --> 23.850 | Loss_ntp --> 11.830 | Loss_mlm --> 12.020 | Grad_l2 --> 98.666 | Weights_l2 --> 7701.610 | Lr --> 0.001 | Seconds_per_step --> 9.679 |
124
+ [2024-10-20 21:22:11,593][Main][INFO] - [train] Step 1075 out of 65536 | Loss --> 21.589 | Loss_ntp --> 10.697 | Loss_mlm --> 10.892 | Grad_l2 --> 104.858 | Weights_l2 --> 7701.605 | Lr --> 0.001 | Seconds_per_step --> 9.561 |
125
+ [2024-10-20 21:26:13,779][Main][INFO] - [train] Step 1100 out of 65536 | Loss --> 19.443 | Loss_ntp --> 9.626 | Loss_mlm --> 9.817 | Grad_l2 --> 75.473 | Weights_l2 --> 7701.599 | Lr --> 0.001 | Seconds_per_step --> 9.687 |
126
+ [2024-10-20 21:30:13,762][Main][INFO] - [train] Step 1125 out of 65536 | Loss --> 17.771 | Loss_ntp --> 8.793 | Loss_mlm --> 8.978 | Grad_l2 --> 55.492 | Weights_l2 --> 7701.593 | Lr --> 0.001 | Seconds_per_step --> 9.599 |
127
+ [2024-10-20 21:34:14,478][Main][INFO] - [train] Step 1150 out of 65536 | Loss --> 17.092 | Loss_ntp --> 8.462 | Loss_mlm --> 8.630 | Grad_l2 --> 72.673 | Weights_l2 --> 7701.587 | Lr --> 0.001 | Seconds_per_step --> 9.629 |
128
+ [2024-10-20 21:38:14,797][Main][INFO] - [train] Step 1175 out of 65536 | Loss --> 16.731 | Loss_ntp --> 8.294 | Loss_mlm --> 8.437 | Grad_l2 --> 60.718 | Weights_l2 --> 7701.582 | Lr --> 0.001 | Seconds_per_step --> 9.613 |
129
+ [2024-10-20 21:42:15,467][Main][INFO] - [train] Step 1200 out of 65536 | Loss --> 16.522 | Loss_ntp --> 8.188 | Loss_mlm --> 8.334 | Grad_l2 --> 62.414 | Weights_l2 --> 7701.577 | Lr --> 0.001 | Seconds_per_step --> 9.627 |
130
+ [2024-10-20 21:46:15,957][Main][INFO] - [train] Step 1225 out of 65536 | Loss --> 16.336 | Loss_ntp --> 8.096 | Loss_mlm --> 8.240 | Grad_l2 --> 57.944 | Weights_l2 --> 7701.572 | Lr --> 0.001 | Seconds_per_step --> 9.619 |
131
+ [2024-10-20 21:50:15,276][Main][INFO] - [train] Step 1250 out of 65536 | Loss --> 16.167 | Loss_ntp --> 8.006 | Loss_mlm --> 8.161 | Grad_l2 --> 42.899 | Weights_l2 --> 7701.567 | Lr --> 0.001 | Seconds_per_step --> 9.573 |
132
+ [2024-10-20 21:54:18,039][Main][INFO] - [train] Step 1275 out of 65536 | Loss --> 16.183 | Loss_ntp --> 8.017 | Loss_mlm --> 8.166 | Grad_l2 --> 48.492 | Weights_l2 --> 7701.563 | Lr --> 0.001 | Seconds_per_step --> 9.710 |
133
+ [2024-10-20 21:58:18,396][Main][INFO] - [train] Step 1300 out of 65536 | Loss --> 15.988 | Loss_ntp --> 7.926 | Loss_mlm --> 8.063 | Grad_l2 --> 42.852 | Weights_l2 --> 7701.558 | Lr --> 0.001 | Seconds_per_step --> 9.614 |
134
+ [2024-10-20 22:02:20,263][Main][INFO] - [train] Step 1325 out of 65536 | Loss --> 15.982 | Loss_ntp --> 7.916 | Loss_mlm --> 8.066 | Grad_l2 --> 47.218 | Weights_l2 --> 7701.553 | Lr --> 0.001 | Seconds_per_step --> 9.675 |
135
+ [2024-10-20 22:06:20,739][Main][INFO] - [train] Step 1350 out of 65536 | Loss --> 15.830 | Loss_ntp --> 7.838 | Loss_mlm --> 7.992 | Grad_l2 --> 28.805 | Weights_l2 --> 7701.549 | Lr --> 0.001 | Seconds_per_step --> 9.619 |
136
+ [2024-10-20 22:10:23,190][Main][INFO] - [train] Step 1375 out of 65536 | Loss --> 15.806 | Loss_ntp --> 7.839 | Loss_mlm --> 7.967 | Grad_l2 --> 37.388 | Weights_l2 --> 7701.544 | Lr --> 0.001 | Seconds_per_step --> 9.698 |
137
+ [2024-10-20 22:14:23,525][Main][INFO] - [train] Step 1400 out of 65536 | Loss --> 15.775 | Loss_ntp --> 7.813 | Loss_mlm --> 7.962 | Grad_l2 --> 35.380 | Weights_l2 --> 7701.540 | Lr --> 0.001 | Seconds_per_step --> 9.613 |
138
+ [2024-10-20 22:18:25,080][Main][INFO] - [train] Step 1425 out of 65536 | Loss --> 15.722 | Loss_ntp --> 7.794 | Loss_mlm --> 7.928 | Grad_l2 --> 34.978 | Weights_l2 --> 7701.535 | Lr --> 0.001 | Seconds_per_step --> 9.662 |
139
+ [2024-10-20 22:22:24,651][Main][INFO] - [train] Step 1450 out of 65536 | Loss --> 15.638 | Loss_ntp --> 7.739 | Loss_mlm --> 7.899 | Grad_l2 --> 24.003 | Weights_l2 --> 7701.530 | Lr --> 0.001 | Seconds_per_step --> 9.583 |
140
+ [2024-10-20 22:26:24,495][Main][INFO] - [train] Step 1475 out of 65536 | Loss --> 15.682 | Loss_ntp --> 7.768 | Loss_mlm --> 7.913 | Grad_l2 --> 27.599 | Weights_l2 --> 7701.526 | Lr --> 0.001 | Seconds_per_step --> 9.594 |
141
+ [2024-10-20 22:30:25,992][Main][INFO] - [train] Step 1500 out of 65536 | Loss --> 15.638 | Loss_ntp --> 7.754 | Loss_mlm --> 7.884 | Grad_l2 --> 22.985 | Weights_l2 --> 7701.521 | Lr --> 0.001 | Seconds_per_step --> 9.660 |
142
+ [2024-10-20 22:30:54,697][Main][INFO] - [eval] Step 1500 out of 65536 | Loss --> 15.664 | Loss_ntp --> 7.782 | Loss_mlm --> 7.882 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 28.700 |
143
+ [2024-10-20 22:30:54,709][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-1500
144
+ [2024-10-20 22:30:54,719][accelerate.utils.other][WARNING] - Removed shared tensor {'encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
145
+ [2024-10-20 22:30:59,988][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-1500/model.safetensors
146
+ [2024-10-20 22:31:08,673][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-1500/optimizer.bin
147
+ [2024-10-20 22:31:08,682][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-1500/scheduler.bin
148
+ [2024-10-20 22:31:08,684][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-1500/sampler.bin
149
+ [2024-10-20 22:31:08,686][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-1500/sampler_1.bin
150
+ [2024-10-20 22:31:08,694][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-1500/random_states_0.pkl
151
+ [2024-10-20 22:35:09,885][Main][INFO] - [train] Step 1525 out of 65536 | Loss --> 15.740 | Loss_ntp --> 7.803 | Loss_mlm --> 7.937 | Grad_l2 --> 35.476 | Weights_l2 --> 7701.516 | Lr --> 0.001 | Seconds_per_step --> 10.207 |
152
+ [2024-10-20 22:39:10,189][Main][INFO] - [train] Step 1550 out of 65536 | Loss --> 15.717 | Loss_ntp --> 7.796 | Loss_mlm --> 7.921 | Grad_l2 --> 32.209 | Weights_l2 --> 7701.511 | Lr --> 0.001 | Seconds_per_step --> 9.612 |
153
+ [2024-10-20 22:43:12,020][Main][INFO] - [train] Step 1575 out of 65536 | Loss --> 15.723 | Loss_ntp --> 7.805 | Loss_mlm --> 7.918 | Grad_l2 --> 35.393 | Weights_l2 --> 7701.506 | Lr --> 0.001 | Seconds_per_step --> 9.673 |
154
+ [2024-10-20 22:47:13,492][Main][INFO] - [train] Step 1600 out of 65536 | Loss --> 15.617 | Loss_ntp --> 7.752 | Loss_mlm --> 7.865 | Grad_l2 --> 29.357 | Weights_l2 --> 7701.502 | Lr --> 0.001 | Seconds_per_step --> 9.659 |
155
+ [2024-10-20 22:51:13,978][Main][INFO] - [train] Step 1625 out of 65536 | Loss --> 15.532 | Loss_ntp --> 7.709 | Loss_mlm --> 7.822 | Grad_l2 --> 18.501 | Weights_l2 --> 7701.497 | Lr --> 0.001 | Seconds_per_step --> 9.619 |
156
+ [2024-10-20 22:55:14,600][Main][INFO] - [train] Step 1650 out of 65536 | Loss --> 15.565 | Loss_ntp --> 7.720 | Loss_mlm --> 7.845 | Grad_l2 --> 17.546 | Weights_l2 --> 7701.493 | Lr --> 0.001 | Seconds_per_step --> 9.625 |
157
+ [2024-10-20 22:59:14,384][Main][INFO] - [train] Step 1675 out of 65536 | Loss --> 15.576 | Loss_ntp --> 7.737 | Loss_mlm --> 7.838 | Grad_l2 --> 23.599 | Weights_l2 --> 7701.489 | Lr --> 0.001 | Seconds_per_step --> 9.591 |
158
+ [2024-10-20 23:03:16,878][Main][INFO] - [train] Step 1700 out of 65536 | Loss --> 15.612 | Loss_ntp --> 7.757 | Loss_mlm --> 7.855 | Grad_l2 --> 28.685 | Weights_l2 --> 7701.484 | Lr --> 0.001 | Seconds_per_step --> 9.700 |
159
+ [2024-10-20 23:07:16,611][Main][INFO] - [train] Step 1725 out of 65536 | Loss --> 15.590 | Loss_ntp --> 7.728 | Loss_mlm --> 7.861 | Grad_l2 --> 22.357 | Weights_l2 --> 7701.479 | Lr --> 0.001 | Seconds_per_step --> 9.589 |
160
+ [2024-10-20 23:11:18,435][Main][INFO] - [train] Step 1750 out of 65536 | Loss --> 15.475 | Loss_ntp --> 7.683 | Loss_mlm --> 7.792 | Grad_l2 --> 20.808 | Weights_l2 --> 7701.475 | Lr --> 0.001 | Seconds_per_step --> 9.673 |
161
+ [2024-10-20 23:15:17,324][Main][INFO] - [train] Step 1775 out of 65536 | Loss --> 15.422 | Loss_ntp --> 7.655 | Loss_mlm --> 7.767 | Grad_l2 --> 16.928 | Weights_l2 --> 7701.470 | Lr --> 0.001 | Seconds_per_step --> 9.555 |
162
+ [2024-10-20 23:19:17,823][Main][INFO] - [train] Step 1800 out of 65536 | Loss --> 15.370 | Loss_ntp --> 7.625 | Loss_mlm --> 7.745 | Grad_l2 --> 16.147 | Weights_l2 --> 7701.466 | Lr --> 0.001 | Seconds_per_step --> 9.620 |
163
+ [2024-10-20 23:23:19,005][Main][INFO] - [train] Step 1825 out of 65536 | Loss --> 15.363 | Loss_ntp --> 7.629 | Loss_mlm --> 7.734 | Grad_l2 --> 19.934 | Weights_l2 --> 7701.462 | Lr --> 0.001 | Seconds_per_step --> 9.647 |
164
+ [2024-10-20 23:27:17,933][Main][INFO] - [train] Step 1850 out of 65536 | Loss --> 15.347 | Loss_ntp --> 7.616 | Loss_mlm --> 7.732 | Grad_l2 --> 25.592 | Weights_l2 --> 7701.457 | Lr --> 0.001 | Seconds_per_step --> 9.557 |
165
+ [2024-10-20 23:31:19,805][Main][INFO] - [train] Step 1875 out of 65536 | Loss --> 15.254 | Loss_ntp --> 7.577 | Loss_mlm --> 7.677 | Grad_l2 --> 19.500 | Weights_l2 --> 7701.453 | Lr --> 0.001 | Seconds_per_step --> 9.675 |
166
+ [2024-10-20 23:35:18,582][Main][INFO] - [train] Step 1900 out of 65536 | Loss --> 15.204 | Loss_ntp --> 7.550 | Loss_mlm --> 7.653 | Grad_l2 --> 15.358 | Weights_l2 --> 7701.448 | Lr --> 0.001 | Seconds_per_step --> 9.551 |
167
+ [2024-10-20 23:39:20,300][Main][INFO] - [train] Step 1925 out of 65536 | Loss --> 15.153 | Loss_ntp --> 7.525 | Loss_mlm --> 7.628 | Grad_l2 --> 13.241 | Weights_l2 --> 7701.445 | Lr --> 0.001 | Seconds_per_step --> 9.669 |
168
+ [2024-10-20 23:43:21,680][Main][INFO] - [train] Step 1950 out of 65536 | Loss --> 15.111 | Loss_ntp --> 7.497 | Loss_mlm --> 7.614 | Grad_l2 --> 13.357 | Weights_l2 --> 7701.441 | Lr --> 0.001 | Seconds_per_step --> 9.655 |
169
+ [2024-10-20 23:47:22,111][Main][INFO] - [train] Step 1975 out of 65536 | Loss --> 15.072 | Loss_ntp --> 7.475 | Loss_mlm --> 7.597 | Grad_l2 --> 15.485 | Weights_l2 --> 7701.437 | Lr --> 0.001 | Seconds_per_step --> 9.617 |
170
+ [2024-10-20 23:51:21,960][Main][INFO] - [train] Step 2000 out of 65536 | Loss --> 15.061 | Loss_ntp --> 7.470 | Loss_mlm --> 7.591 | Grad_l2 --> 15.511 | Weights_l2 --> 7701.432 | Lr --> 0.001 | Seconds_per_step --> 9.594 |
171
+ [2024-10-20 23:51:50,849][Main][INFO] - [eval] Step 2000 out of 65536 | Loss --> 15.092 | Loss_ntp --> 7.501 | Loss_mlm --> 7.591 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 28.883 |
172
+ [2024-10-20 23:55:53,490][Main][INFO] - [train] Step 2025 out of 65536 | Loss --> 15.080 | Loss_ntp --> 7.479 | Loss_mlm --> 7.601 | Grad_l2 --> 17.451 | Weights_l2 --> 7701.428 | Lr --> 0.001 | Seconds_per_step --> 9.705 |
173
+ [2024-10-20 23:59:53,747][Main][INFO] - [train] Step 2050 out of 65536 | Loss --> 14.998 | Loss_ntp --> 7.447 | Loss_mlm --> 7.551 | Grad_l2 --> 13.242 | Weights_l2 --> 7701.424 | Lr --> 0.001 | Seconds_per_step --> 9.610 |
174
+ [2024-10-21 00:03:57,114][Main][INFO] - [train] Step 2075 out of 65536 | Loss --> 14.994 | Loss_ntp --> 7.431 | Loss_mlm --> 7.562 | Grad_l2 --> 17.409 | Weights_l2 --> 7701.419 | Lr --> 0.001 | Seconds_per_step --> 9.735 |
175
+ [2024-10-21 00:07:56,557][Main][INFO] - [train] Step 2100 out of 65536 | Loss --> 14.993 | Loss_ntp --> 7.437 | Loss_mlm --> 7.556 | Grad_l2 --> 23.374 | Weights_l2 --> 7701.414 | Lr --> 0.001 | Seconds_per_step --> 9.578 |
176
+ [2024-10-21 00:11:56,818][Main][INFO] - [train] Step 2125 out of 65536 | Loss --> 14.963 | Loss_ntp --> 7.428 | Loss_mlm --> 7.535 | Grad_l2 --> 24.857 | Weights_l2 --> 7701.410 | Lr --> 0.001 | Seconds_per_step --> 9.610 |
177
+ [2024-10-21 00:15:56,927][Main][INFO] - [train] Step 2150 out of 65536 | Loss --> 14.829 | Loss_ntp --> 7.354 | Loss_mlm --> 7.474 | Grad_l2 --> 14.538 | Weights_l2 --> 7701.405 | Lr --> 0.001 | Seconds_per_step --> 9.604 |
178
+ [2024-10-21 00:19:57,089][Main][INFO] - [train] Step 2175 out of 65536 | Loss --> 14.797 | Loss_ntp --> 7.344 | Loss_mlm --> 7.453 | Grad_l2 --> 13.598 | Weights_l2 --> 7701.400 | Lr --> 0.001 | Seconds_per_step --> 9.606 |
179
+ [2024-10-21 00:23:58,135][Main][INFO] - [train] Step 2200 out of 65536 | Loss --> 14.774 | Loss_ntp --> 7.321 | Loss_mlm --> 7.454 | Grad_l2 --> 13.339 | Weights_l2 --> 7701.396 | Lr --> 0.001 | Seconds_per_step --> 9.642 |
180
+ [2024-10-21 00:27:58,499][Main][INFO] - [train] Step 2225 out of 65536 | Loss --> 14.671 | Loss_ntp --> 7.284 | Loss_mlm --> 7.387 | Grad_l2 --> 13.884 | Weights_l2 --> 7701.392 | Lr --> 0.001 | Seconds_per_step --> 9.614 |
181
+ [2024-10-21 00:31:59,596][Main][INFO] - [train] Step 2250 out of 65536 | Loss --> 14.635 | Loss_ntp --> 7.264 | Loss_mlm --> 7.371 | Grad_l2 --> 11.527 | Weights_l2 --> 7701.388 | Lr --> 0.001 | Seconds_per_step --> 9.644 |
182
+ [2024-10-21 00:35:58,256][Main][INFO] - [train] Step 2275 out of 65536 | Loss --> 14.593 | Loss_ntp --> 7.247 | Loss_mlm --> 7.345 | Grad_l2 --> 9.993 | Weights_l2 --> 7701.384 | Lr --> 0.001 | Seconds_per_step --> 9.546 |
183
+ [2024-10-21 00:39:59,379][Main][INFO] - [train] Step 2300 out of 65536 | Loss --> 14.543 | Loss_ntp --> 7.216 | Loss_mlm --> 7.327 | Grad_l2 --> 12.147 | Weights_l2 --> 7701.381 | Lr --> 0.001 | Seconds_per_step --> 9.644 |
184
+ [2024-10-21 00:43:59,080][Main][INFO] - [train] Step 2325 out of 65536 | Loss --> 14.577 | Loss_ntp --> 7.231 | Loss_mlm --> 7.345 | Grad_l2 --> 12.365 | Weights_l2 --> 7701.376 | Lr --> 0.001 | Seconds_per_step --> 9.588 |
185
+ [2024-10-21 00:47:59,811][Main][INFO] - [train] Step 2350 out of 65536 | Loss --> 14.512 | Loss_ntp --> 7.202 | Loss_mlm --> 7.310 | Grad_l2 --> 12.472 | Weights_l2 --> 7701.372 | Lr --> 0.001 | Seconds_per_step --> 9.629 |
186
+ [2024-10-21 00:51:58,749][Main][INFO] - [train] Step 2375 out of 65536 | Loss --> 14.434 | Loss_ntp --> 7.166 | Loss_mlm --> 7.268 | Grad_l2 --> 12.198 | Weights_l2 --> 7701.368 | Lr --> 0.001 | Seconds_per_step --> 9.557 |
187
+ [2024-10-21 00:55:58,527][Main][INFO] - [train] Step 2400 out of 65536 | Loss --> 14.390 | Loss_ntp --> 7.141 | Loss_mlm --> 7.249 | Grad_l2 --> 11.488 | Weights_l2 --> 7701.365 | Lr --> 0.001 | Seconds_per_step --> 9.591 |
188
+ [2024-10-21 00:59:59,746][Main][INFO] - [train] Step 2425 out of 65536 | Loss --> 14.396 | Loss_ntp --> 7.142 | Loss_mlm --> 7.253 | Grad_l2 --> 11.924 | Weights_l2 --> 7701.361 | Lr --> 0.001 | Seconds_per_step --> 9.649 |
189
+ [2024-10-21 01:03:58,922][Main][INFO] - [train] Step 2450 out of 65536 | Loss --> 14.319 | Loss_ntp --> 7.108 | Loss_mlm --> 7.211 | Grad_l2 --> 11.587 | Weights_l2 --> 7701.357 | Lr --> 0.001 | Seconds_per_step --> 9.567 |
190
+ [2024-10-21 01:08:00,577][Main][INFO] - [train] Step 2475 out of 65536 | Loss --> 14.363 | Loss_ntp --> 7.132 | Loss_mlm --> 7.231 | Grad_l2 --> 11.854 | Weights_l2 --> 7701.353 | Lr --> 0.001 | Seconds_per_step --> 9.666 |
191
+ [2024-10-21 01:12:00,070][Main][INFO] - [train] Step 2500 out of 65536 | Loss --> 14.333 | Loss_ntp --> 7.121 | Loss_mlm --> 7.212 | Grad_l2 --> 10.363 | Weights_l2 --> 7701.349 | Lr --> 0.001 | Seconds_per_step --> 9.580 |
192
+ [2024-10-21 01:12:28,480][Main][INFO] - [eval] Step 2500 out of 65536 | Loss --> 14.573 | Loss_ntp --> 7.286 | Loss_mlm --> 7.287 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 28.404 |
193
+ [2024-10-21 01:16:30,064][Main][INFO] - [train] Step 2525 out of 65536 | Loss --> 14.280 | Loss_ntp --> 7.089 | Loss_mlm --> 7.192 | Grad_l2 --> 13.178 | Weights_l2 --> 7701.345 | Lr --> 0.001 | Seconds_per_step --> 9.663 |
194
+ [2024-10-21 01:20:29,018][Main][INFO] - [train] Step 2550 out of 65536 | Loss --> 14.260 | Loss_ntp --> 7.091 | Loss_mlm --> 7.169 | Grad_l2 --> 12.381 | Weights_l2 --> 7701.341 | Lr --> 0.001 | Seconds_per_step --> 9.558 |
195
+ [2024-10-21 01:24:31,253][Main][INFO] - [train] Step 2575 out of 65536 | Loss --> 14.259 | Loss_ntp --> 7.078 | Loss_mlm --> 7.182 | Grad_l2 --> 11.247 | Weights_l2 --> 7701.337 | Lr --> 0.001 | Seconds_per_step --> 9.689 |
196
+ [2024-10-21 01:28:31,446][Main][INFO] - [train] Step 2600 out of 65536 | Loss --> 14.259 | Loss_ntp --> 7.080 | Loss_mlm --> 7.179 | Grad_l2 --> 12.524 | Weights_l2 --> 7701.333 | Lr --> 0.001 | Seconds_per_step --> 9.608 |
197
+ [2024-10-21 01:32:31,794][Main][INFO] - [train] Step 2625 out of 65536 | Loss --> 14.245 | Loss_ntp --> 7.068 | Loss_mlm --> 7.178 | Grad_l2 --> 12.087 | Weights_l2 --> 7701.330 | Lr --> 0.001 | Seconds_per_step --> 9.614 |
198
+ [2024-10-21 01:36:32,411][Main][INFO] - [train] Step 2650 out of 65536 | Loss --> 14.247 | Loss_ntp --> 7.074 | Loss_mlm --> 7.173 | Grad_l2 --> 11.638 | Weights_l2 --> 7701.326 | Lr --> 0.001 | Seconds_per_step --> 9.625 |
199
+ [2024-10-21 01:40:33,462][Main][INFO] - [train] Step 2675 out of 65536 | Loss --> 14.274 | Loss_ntp --> 7.086 | Loss_mlm --> 7.189 | Grad_l2 --> 10.415 | Weights_l2 --> 7701.322 | Lr --> 0.001 | Seconds_per_step --> 9.642 |
200
+ [2024-10-21 01:44:33,254][Main][INFO] - [train] Step 2700 out of 65536 | Loss --> 14.276 | Loss_ntp --> 7.097 | Loss_mlm --> 7.179 | Grad_l2 --> 10.830 | Weights_l2 --> 7701.318 | Lr --> 0.001 | Seconds_per_step --> 9.592 |
201
+ [2024-10-21 01:48:34,104][Main][INFO] - [train] Step 2725 out of 65536 | Loss --> 14.322 | Loss_ntp --> 7.117 | Loss_mlm --> 7.205 | Grad_l2 --> 11.668 | Weights_l2 --> 7701.314 | Lr --> 0.001 | Seconds_per_step --> 9.634 |
202
+ [2024-10-21 01:52:33,834][Main][INFO] - [train] Step 2750 out of 65536 | Loss --> 14.393 | Loss_ntp --> 7.149 | Loss_mlm --> 7.244 | Grad_l2 --> 10.585 | Weights_l2 --> 7701.310 | Lr --> 0.001 | Seconds_per_step --> 9.589 |
203
+ [2024-10-21 01:56:33,130][Main][INFO] - [train] Step 2775 out of 65536 | Loss --> 14.326 | Loss_ntp --> 7.124 | Loss_mlm --> 7.202 | Grad_l2 --> 9.862 | Weights_l2 --> 7701.306 | Lr --> 0.001 | Seconds_per_step --> 9.572 |
204
+ [2024-10-21 02:00:34,375][Main][INFO] - [train] Step 2800 out of 65536 | Loss --> 14.354 | Loss_ntp --> 7.134 | Loss_mlm --> 7.220 | Grad_l2 --> 8.484 | Weights_l2 --> 7701.302 | Lr --> 0.001 | Seconds_per_step --> 9.650 |
205
+ [2024-10-21 02:04:34,763][Main][INFO] - [train] Step 2825 out of 65536 | Loss --> 14.320 | Loss_ntp --> 7.118 | Loss_mlm --> 7.202 | Grad_l2 --> 11.118 | Weights_l2 --> 7701.298 | Lr --> 0.001 | Seconds_per_step --> 9.615 |
206
+ [2024-10-21 02:08:35,157][Main][INFO] - [train] Step 2850 out of 65536 | Loss --> 14.323 | Loss_ntp --> 7.124 | Loss_mlm --> 7.199 | Grad_l2 --> 10.821 | Weights_l2 --> 7701.294 | Lr --> 0.001 | Seconds_per_step --> 9.616 |
207
+ [2024-10-21 02:12:34,860][Main][INFO] - [train] Step 2875 out of 65536 | Loss --> 14.348 | Loss_ntp --> 7.129 | Loss_mlm --> 7.219 | Grad_l2 --> 9.481 | Weights_l2 --> 7701.291 | Lr --> 0.001 | Seconds_per_step --> 9.588 |
208
+ [2024-10-21 02:16:36,448][Main][INFO] - [train] Step 2900 out of 65536 | Loss --> 14.413 | Loss_ntp --> 7.163 | Loss_mlm --> 7.250 | Grad_l2 --> 10.586 | Weights_l2 --> 7701.287 | Lr --> 0.001 | Seconds_per_step --> 9.663 |
209
+ [2024-10-21 02:20:36,563][Main][INFO] - [train] Step 2925 out of 65536 | Loss --> 14.319 | Loss_ntp --> 7.113 | Loss_mlm --> 7.206 | Grad_l2 --> 9.175 | Weights_l2 --> 7701.283 | Lr --> 0.001 | Seconds_per_step --> 9.604 |
210
+ [2024-10-21 02:24:36,522][Main][INFO] - [train] Step 2950 out of 65536 | Loss --> 14.292 | Loss_ntp --> 7.112 | Loss_mlm --> 7.179 | Grad_l2 --> 10.380 | Weights_l2 --> 7701.279 | Lr --> 0.001 | Seconds_per_step --> 9.598 |
211
+ [2024-10-21 02:28:36,510][Main][INFO] - [train] Step 2975 out of 65536 | Loss --> 14.202 | Loss_ntp --> 7.068 | Loss_mlm --> 7.134 | Grad_l2 --> 9.622 | Weights_l2 --> 7701.276 | Lr --> 0.001 | Seconds_per_step --> 9.599 |
212
+ [2024-10-21 02:32:38,120][Main][INFO] - [train] Step 3000 out of 65536 | Loss --> 14.214 | Loss_ntp --> 7.066 | Loss_mlm --> 7.147 | Grad_l2 --> 10.228 | Weights_l2 --> 7701.272 | Lr --> 0.001 | Seconds_per_step --> 9.664 |
213
+ [2024-10-21 02:33:06,984][Main][INFO] - [eval] Step 3000 out of 65536 | Loss --> 14.236 | Loss_ntp --> 7.111 | Loss_mlm --> 7.125 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 28.858 |
214
+ [2024-10-21 02:33:06,988][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-3000
215
+ [2024-10-21 02:33:07,000][accelerate.utils.other][WARNING] - Removed shared tensor {'encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
216
+ [2024-10-21 02:33:13,140][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-3000/model.safetensors
217
+ [2024-10-21 02:33:21,968][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-3000/optimizer.bin
218
+ [2024-10-21 02:33:21,978][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-3000/scheduler.bin
219
+ [2024-10-21 02:33:21,979][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-3000/sampler.bin
220
+ [2024-10-21 02:33:21,981][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-3000/sampler_1.bin
221
+ [2024-10-21 02:33:21,990][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-3000/random_states_0.pkl
222
+ [2024-10-21 02:37:21,949][Main][INFO] - [train] Step 3025 out of 65536 | Loss --> 14.180 | Loss_ntp --> 7.041 | Loss_mlm --> 7.138 | Grad_l2 --> 9.928 | Weights_l2 --> 7701.268 | Lr --> 0.001 | Seconds_per_step --> 10.198 |
223
+ [2024-10-21 02:41:23,436][Main][INFO] - [train] Step 3050 out of 65536 | Loss --> 14.163 | Loss_ntp --> 7.032 | Loss_mlm --> 7.130 | Grad_l2 --> 9.909 | Weights_l2 --> 7701.264 | Lr --> 0.001 | Seconds_per_step --> 9.659 |
224
+ [2024-10-21 02:45:23,362][Main][INFO] - [train] Step 3075 out of 65536 | Loss --> 14.109 | Loss_ntp --> 7.016 | Loss_mlm --> 7.093 | Grad_l2 --> 10.119 | Weights_l2 --> 7701.260 | Lr --> 0.001 | Seconds_per_step --> 9.597 |
225
+ [2024-10-21 02:49:23,828][Main][INFO] - [train] Step 3100 out of 65536 | Loss --> 14.053 | Loss_ntp --> 6.981 | Loss_mlm --> 7.072 | Grad_l2 --> 8.917 | Weights_l2 --> 7701.256 | Lr --> 0.001 | Seconds_per_step --> 9.619 |
226
+ [2024-10-21 02:53:26,144][Main][INFO] - [train] Step 3125 out of 65536 | Loss --> 14.045 | Loss_ntp --> 6.975 | Loss_mlm --> 7.069 | Grad_l2 --> 11.184 | Weights_l2 --> 7701.252 | Lr --> 0.001 | Seconds_per_step --> 9.692 |
227
+ [2024-10-21 02:57:25,035][Main][INFO] - [train] Step 3150 out of 65536 | Loss --> 14.006 | Loss_ntp --> 6.959 | Loss_mlm --> 7.047 | Grad_l2 --> 9.280 | Weights_l2 --> 7701.248 | Lr --> 0.001 | Seconds_per_step --> 9.555 |
228
+ [2024-10-21 03:01:27,283][Main][INFO] - [train] Step 3175 out of 65536 | Loss --> 13.943 | Loss_ntp --> 6.924 | Loss_mlm --> 7.020 | Grad_l2 --> 8.769 | Weights_l2 --> 7701.245 | Lr --> 0.001 | Seconds_per_step --> 9.690 |
229
+ [2024-10-21 03:05:27,701][Main][INFO] - [train] Step 3200 out of 65536 | Loss --> 13.956 | Loss_ntp --> 6.916 | Loss_mlm --> 7.040 | Grad_l2 --> 8.625 | Weights_l2 --> 7701.241 | Lr --> 0.001 | Seconds_per_step --> 9.617 |
230
+ [2024-10-21 03:09:28,530][Main][INFO] - [train] Step 3225 out of 65536 | Loss --> 13.916 | Loss_ntp --> 6.906 | Loss_mlm --> 7.010 | Grad_l2 --> 9.378 | Weights_l2 --> 7701.238 | Lr --> 0.001 | Seconds_per_step --> 9.633 |
231
+ [2024-10-21 03:13:28,937][Main][INFO] - [train] Step 3250 out of 65536 | Loss --> 13.849 | Loss_ntp --> 6.867 | Loss_mlm --> 6.982 | Grad_l2 --> 9.221 | Weights_l2 --> 7701.234 | Lr --> 0.001 | Seconds_per_step --> 9.616 |
232
+ [2024-10-21 03:17:29,597][Main][INFO] - [train] Step 3275 out of 65536 | Loss --> 13.854 | Loss_ntp --> 6.869 | Loss_mlm --> 6.985 | Grad_l2 --> 8.561 | Weights_l2 --> 7701.230 | Lr --> 0.001 | Seconds_per_step --> 9.626 |
233
+ [2024-10-21 03:21:30,034][Main][INFO] - [train] Step 3300 out of 65536 | Loss --> 13.781 | Loss_ntp --> 6.843 | Loss_mlm --> 6.938 | Grad_l2 --> 8.919 | Weights_l2 --> 7701.226 | Lr --> 0.001 | Seconds_per_step --> 9.617 |
234
+ [2024-10-21 03:25:29,815][Main][INFO] - [train] Step 3325 out of 65536 | Loss --> 13.766 | Loss_ntp --> 6.836 | Loss_mlm --> 6.930 | Grad_l2 --> 8.129 | Weights_l2 --> 7701.223 | Lr --> 0.001 | Seconds_per_step --> 9.591 |
235
+ [2024-10-21 03:29:30,344][Main][INFO] - [train] Step 3350 out of 65536 | Loss --> 13.726 | Loss_ntp --> 6.809 | Loss_mlm --> 6.917 | Grad_l2 --> 9.145 | Weights_l2 --> 7701.219 | Lr --> 0.001 | Seconds_per_step --> 9.620 |
236
+ [2024-10-21 03:33:30,171][Main][INFO] - [train] Step 3375 out of 65536 | Loss --> 13.751 | Loss_ntp --> 6.819 | Loss_mlm --> 6.932 | Grad_l2 --> 11.666 | Weights_l2 --> 7701.215 | Lr --> 0.001 | Seconds_per_step --> 9.593 |
237
+ [2024-10-21 03:37:32,111][Main][INFO] - [train] Step 3400 out of 65536 | Loss --> 13.700 | Loss_ntp --> 6.796 | Loss_mlm --> 6.905 | Grad_l2 --> 8.776 | Weights_l2 --> 7701.211 | Lr --> 0.001 | Seconds_per_step --> 9.677 |
238
+ [2024-10-21 03:41:31,530][Main][INFO] - [train] Step 3425 out of 65536 | Loss --> 13.641 | Loss_ntp --> 6.774 | Loss_mlm --> 6.868 | Grad_l2 --> 9.206 | Weights_l2 --> 7701.207 | Lr --> 0.001 | Seconds_per_step --> 9.577 |
239
+ [2024-10-21 03:45:33,625][Main][INFO] - [train] Step 3450 out of 65536 | Loss --> 13.588 | Loss_ntp --> 6.735 | Loss_mlm --> 6.852 | Grad_l2 --> 6.293 | Weights_l2 --> 7701.204 | Lr --> 0.001 | Seconds_per_step --> 9.684 |
240
+ [2024-10-21 03:49:34,400][Main][INFO] - [train] Step 3475 out of 65536 | Loss --> 13.615 | Loss_ntp --> 6.748 | Loss_mlm --> 6.868 | Grad_l2 --> 9.161 | Weights_l2 --> 7701.201 | Lr --> 0.001 | Seconds_per_step --> 9.631 |
241
+ [2024-10-21 03:53:35,824][Main][INFO] - [train] Step 3500 out of 65536 | Loss --> 13.532 | Loss_ntp --> 6.707 | Loss_mlm --> 6.825 | Grad_l2 --> 9.556 | Weights_l2 --> 7701.197 | Lr --> 0.001 | Seconds_per_step --> 9.657 |
242
+ [2024-10-21 03:54:04,713][Main][INFO] - [eval] Step 3500 out of 65536 | Loss --> 13.912 | Loss_ntp --> 6.950 | Loss_mlm --> 6.962 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 28.883 |
243
+ [2024-10-21 03:58:05,620][Main][INFO] - [train] Step 3525 out of 65536 | Loss --> 13.463 | Loss_ntp --> 6.677 | Loss_mlm --> 6.786 | Grad_l2 --> 9.458 | Weights_l2 --> 7701.193 | Lr --> 0.001 | Seconds_per_step --> 9.636 |
244
+ [2024-10-21 04:02:06,516][Main][INFO] - [train] Step 3550 out of 65536 | Loss --> 13.419 | Loss_ntp --> 6.654 | Loss_mlm --> 6.766 | Grad_l2 --> 9.819 | Weights_l2 --> 7701.188 | Lr --> 0.001 | Seconds_per_step --> 9.636 |
245
+ [2024-10-21 04:06:07,229][Main][INFO] - [train] Step 3575 out of 65536 | Loss --> 13.362 | Loss_ntp --> 6.626 | Loss_mlm --> 6.736 | Grad_l2 --> 8.944 | Weights_l2 --> 7701.184 | Lr --> 0.001 | Seconds_per_step --> 9.628 |
246
+ [2024-10-21 04:10:08,761][Main][INFO] - [train] Step 3600 out of 65536 | Loss --> 13.401 | Loss_ntp --> 6.628 | Loss_mlm --> 6.773 | Grad_l2 --> 9.904 | Weights_l2 --> 7701.180 | Lr --> 0.001 | Seconds_per_step --> 9.661 |
247
+ [2024-10-21 04:14:09,815][Main][INFO] - [train] Step 3625 out of 65536 | Loss --> 13.361 | Loss_ntp --> 6.625 | Loss_mlm --> 6.736 | Grad_l2 --> 8.507 | Weights_l2 --> 7701.176 | Lr --> 0.001 | Seconds_per_step --> 9.642 |
248
+ [2024-10-21 04:18:10,037][Main][INFO] - [train] Step 3650 out of 65536 | Loss --> 13.355 | Loss_ntp --> 6.614 | Loss_mlm --> 6.741 | Grad_l2 --> 9.056 | Weights_l2 --> 7701.172 | Lr --> 0.001 | Seconds_per_step --> 9.609 |
249
+ [2024-10-21 04:22:10,677][Main][INFO] - [train] Step 3675 out of 65536 | Loss --> 13.306 | Loss_ntp --> 6.586 | Loss_mlm --> 6.720 | Grad_l2 --> 9.057 | Weights_l2 --> 7701.168 | Lr --> 0.001 | Seconds_per_step --> 9.625 |
250
+ [2024-10-21 04:26:12,857][Main][INFO] - [train] Step 3700 out of 65536 | Loss --> 13.325 | Loss_ntp --> 6.596 | Loss_mlm --> 6.729 | Grad_l2 --> 10.732 | Weights_l2 --> 7701.163 | Lr --> 0.001 | Seconds_per_step --> 9.687 |
251
+ [2024-10-21 04:30:11,816][Main][INFO] - [train] Step 3725 out of 65536 | Loss --> 13.239 | Loss_ntp --> 6.561 | Loss_mlm --> 6.678 | Grad_l2 --> 9.810 | Weights_l2 --> 7701.160 | Lr --> 0.001 | Seconds_per_step --> 9.558 |
252
+ [2024-10-21 04:34:12,167][Main][INFO] - [train] Step 3750 out of 65536 | Loss --> 13.211 | Loss_ntp --> 6.534 | Loss_mlm --> 6.677 | Grad_l2 --> 10.011 | Weights_l2 --> 7701.156 | Lr --> 0.001 | Seconds_per_step --> 9.614 |
253
+ [2024-10-21 04:38:14,046][Main][INFO] - [train] Step 3775 out of 65536 | Loss --> 13.214 | Loss_ntp --> 6.537 | Loss_mlm --> 6.678 | Grad_l2 --> 8.939 | Weights_l2 --> 7701.152 | Lr --> 0.001 | Seconds_per_step --> 9.675 |
254
+ [2024-10-21 04:42:14,454][Main][INFO] - [train] Step 3800 out of 65536 | Loss --> 13.148 | Loss_ntp --> 6.508 | Loss_mlm --> 6.640 | Grad_l2 --> 9.513 | Weights_l2 --> 7701.148 | Lr --> 0.001 | Seconds_per_step --> 9.616 |
255
+ [2024-10-21 04:46:14,554][Main][INFO] - [train] Step 3825 out of 65536 | Loss --> 13.172 | Loss_ntp --> 6.514 | Loss_mlm --> 6.658 | Grad_l2 --> 9.295 | Weights_l2 --> 7701.144 | Lr --> 0.001 | Seconds_per_step --> 9.604 |
256
+ [2024-10-21 04:50:14,762][Main][INFO] - [train] Step 3850 out of 65536 | Loss --> 13.118 | Loss_ntp --> 6.494 | Loss_mlm --> 6.624 | Grad_l2 --> 7.890 | Weights_l2 --> 7701.140 | Lr --> 0.001 | Seconds_per_step --> 9.608 |
257
+ [2024-10-21 04:54:16,032][Main][INFO] - [train] Step 3875 out of 65536 | Loss --> 13.179 | Loss_ntp --> 6.521 | Loss_mlm --> 6.657 | Grad_l2 --> 9.901 | Weights_l2 --> 7701.136 | Lr --> 0.001 | Seconds_per_step --> 9.651 |
258
+ [2024-10-21 04:58:16,128][Main][INFO] - [train] Step 3900 out of 65536 | Loss --> 13.259 | Loss_ntp --> 6.571 | Loss_mlm --> 6.687 | Grad_l2 --> 8.910 | Weights_l2 --> 7701.132 | Lr --> 0.001 | Seconds_per_step --> 9.604 |
259
+ [2024-10-21 05:02:17,997][Main][INFO] - [train] Step 3925 out of 65536 | Loss --> 13.323 | Loss_ntp --> 6.594 | Loss_mlm --> 6.729 | Grad_l2 -
wandb/run-20241020_182518-i0qk9v3k/files/requirements.txt ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ sentencepiece==0.2.0
2
+ pytz==2024.2
3
+ pyrepl==0.9.0
4
+ antlr4-python3-runtime==4.9.3
5
+ xxhash==3.5.0
6
+ wmctrl==0.5
7
+ tzdata==2024.2
8
+ tqdm==4.66.5
9
+ smmap==5.0.1
10
+ setproctitle==1.3.3
11
+ sentry-sdk==2.17.0
12
+ safetensors==0.4.5
13
+ regex==2024.9.11
14
+ pynvml==11.5.3
15
+ pyarrow==17.0.0
16
+ protobuf==3.20.3
17
+ propcache==0.2.0
18
+ omegaconf==2.3.0
19
+ multidict==6.1.0
20
+ joblib==1.4.2
21
+ frozenlist==1.4.1
22
+ fancycompleter==0.9.1
23
+ docker-pycreds==0.4.0
24
+ dill==0.3.8
25
+ click==8.1.7
26
+ aiohappyeyeballs==2.4.3
27
+ absl-py==2.1.0
28
+ yarl==1.15.5
29
+ pdbpp==0.10.3
30
+ pandas==2.2.3
31
+ nltk==3.9.1
32
+ multiprocess==0.70.16
33
+ hydra-core==1.3.2
34
+ huggingface-hub==0.26.0
35
+ gitdb==4.0.11
36
+ aiosignal==1.3.1
37
+ tokenizers==0.20.1
38
+ rouge_score==0.1.2
39
+ GitPython==3.1.43
40
+ aiohttp==3.10.10
41
+ wandb==0.18.5
42
+ transformers==4.46.0.dev0
43
+ accelerate==1.0.1
44
+ datasets==3.0.1
45
+ evaluate==0.4.3
46
+ liger_kernel==0.3.1
47
+ entrypoints==0.4
48
+ jupyter_client==7.4.9
49
+ nbclassic==1.1.0
50
+ notebook==6.5.5
51
+ pyzmq==24.0.1
52
+ PyYAML==6.0.2
53
+ Send2Trash==1.8.3
54
+ anyio==4.6.0
55
+ argon2-cffi==23.1.0
56
+ argon2-cffi-bindings==21.2.0
57
+ arrow==1.3.0
58
+ asttokens==2.4.1
59
+ async-lru==2.0.4
60
+ attrs==24.2.0
61
+ babel==2.16.0
62
+ beautifulsoup4==4.12.3
63
+ bleach==6.1.0
64
+ certifi==2024.8.30
65
+ cffi==1.17.1
66
+ charset-normalizer==3.3.2
67
+ comm==0.2.2
68
+ debugpy==1.8.5
69
+ decorator==5.1.1
70
+ defusedxml==0.7.1
71
+ executing==2.1.0
72
+ fastjsonschema==2.20.0
73
+ fqdn==1.5.1
74
+ h11==0.14.0
75
+ httpcore==1.0.5
76
+ httpx==0.27.2
77
+ idna==3.10
78
+ ipykernel==6.29.5
79
+ ipython==8.27.0
80
+ ipython-genutils==0.2.0
81
+ ipywidgets==8.1.5
82
+ isoduration==20.11.0
83
+ jedi==0.19.1
84
+ json5==0.9.25
85
+ jsonpointer==3.0.0
86
+ jsonschema==4.23.0
87
+ jsonschema-specifications==2023.12.1
88
+ jupyter-archive==3.4.0
89
+ jupyter_contrib_core==0.4.2
90
+ jupyter_contrib_nbextensions==0.7.0
91
+ jupyter_core==5.7.2
92
+ jupyter-events==0.10.0
93
+ jupyter-highlight-selected-word==0.2.0
94
+ jupyter-lsp==2.2.5
95
+ jupyter_nbextensions_configurator==0.6.4
96
+ jupyter_server==2.14.2
97
+ jupyter_server_terminals==0.5.3
98
+ jupyterlab==4.2.5
99
+ jupyterlab_pygments==0.3.0
100
+ jupyterlab_server==2.27.3
101
+ jupyterlab_widgets==3.0.13
102
+ lxml==5.3.0
103
+ matplotlib-inline==0.1.7
104
+ mistune==3.0.2
105
+ nbclient==0.10.0
106
+ nbconvert==7.16.4
107
+ nbformat==5.10.4
108
+ nest-asyncio==1.6.0
109
+ notebook_shim==0.2.4
110
+ overrides==7.7.0
111
+ packaging==24.1
112
+ pandocfilters==1.5.1
113
+ parso==0.8.4
114
+ pexpect==4.9.0
115
+ platformdirs==4.3.6
116
+ prometheus_client==0.21.0
117
+ prompt_toolkit==3.0.47
118
+ psutil==6.0.0
119
+ ptyprocess==0.7.0
120
+ pure_eval==0.2.3
121
+ pycparser==2.22
122
+ Pygments==2.18.0
123
+ python-dateutil==2.9.0.post0
124
+ python-json-logger==2.0.7
125
+ referencing==0.35.1
126
+ requests==2.32.3
127
+ rfc3339-validator==0.1.4
128
+ rfc3986-validator==0.1.1
129
+ rpds-py==0.20.0
130
+ sniffio==1.3.1
131
+ soupsieve==2.6
132
+ stack-data==0.6.3
133
+ terminado==0.18.1
134
+ tinycss2==1.3.0
135
+ tornado==6.4.1
136
+ traitlets==5.14.3
137
+ types-python-dateutil==2.9.0.20240906
138
+ uri-template==1.3.0
139
+ urllib3==2.2.3
140
+ wcwidth==0.2.13
141
+ webcolors==24.8.0
142
+ webencodings==0.5.1
143
+ websocket-client==1.8.0
144
+ widgetsnbextension==4.0.13
145
+ Jinja2==3.1.3
146
+ MarkupSafe==2.1.5
147
+ filelock==3.13.1
148
+ fsspec==2024.2.0
149
+ mpmath==1.3.0
150
+ networkx==3.2.1
151
+ numpy==1.26.3
152
+ nvidia-cublas-cu12==12.4.2.65
153
+ nvidia-cuda-cupti-cu12==12.4.99
154
+ nvidia-cuda-nvrtc-cu12==12.4.99
155
+ nvidia-cuda-runtime-cu12==12.4.99
156
+ nvidia-cudnn-cu12==9.1.0.70
157
+ nvidia-cufft-cu12==11.2.0.44
158
+ nvidia-curand-cu12==10.3.5.119
159
+ nvidia-cusolver-cu12==11.6.0.99
160
+ nvidia-cusparse-cu12==12.3.0.142
161
+ nvidia-nccl-cu12==2.20.5
162
+ nvidia-nvjitlink-cu12==12.4.99
163
+ nvidia-nvtx-cu12==12.4.99
164
+ pillow==10.2.0
165
+ sympy==1.12
166
+ torch==2.4.1+cu124
167
+ torchaudio==2.4.1+cu124
168
+ torchvision==0.19.1+cu124
169
+ triton==3.0.0
170
+ typing_extensions==4.9.0
171
+ pip==24.2
172
+ setuptools==75.1.0
173
+ wheel==0.44.0
174
+ PyGObject==3.42.1
175
+ PyJWT==2.3.0
176
+ SecretStorage==3.3.1
177
+ blinker==1.4
178
+ cryptography==3.4.8
179
+ dbus-python==1.2.18
180
+ distro==1.7.0
181
+ httplib2==0.20.2
182
+ importlib-metadata==4.6.4
183
+ jeepney==0.7.1
184
+ keyring==23.5.0
185
+ launchpadlib==1.10.16
186
+ lazr.restfulclient==0.14.4
187
+ lazr.uri==1.0.6
188
+ more-itertools==8.10.0
189
+ oauthlib==3.2.0
190
+ pyparsing==2.4.7
191
+ python-apt==2.4.0+ubuntu4
192
+ six==1.16.0
193
+ wadllib==1.3.6
194
+ zipp==1.0.0
wandb/run-20241020_182518-i0qk9v3k/files/wandb-metadata.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.4.0-196-generic-x86_64-with-glibc2.35",
3
+ "python": "3.11.10",
4
+ "startedAt": "2024-10-20T18:25:18.123718Z",
5
+ "program": "-m nanoT5.main",
6
+ "git": {
7
+ "remote": "https://github.com/pszemraj/nanoT5.git",
8
+ "commit": "c9a96f3716604dae057adb04996323bd32fcc58e"
9
+ },
10
+ "email": "amazingvince@gmail.com",
11
+ "root": "/workspace/nanoT5/logs/2024-10-20/18-25-17",
12
+ "host": "2c2cdba3fdca",
13
+ "username": "root",
14
+ "executable": "/usr/bin/python",
15
+ "cpu_count": 48,
16
+ "cpu_count_logical": 96,
17
+ "gpu": "NVIDIA A40",
18
+ "gpu_count": 1,
19
+ "disk": {
20
+ "/": {
21
+ "total": "53687091200",
22
+ "used": "584318976"
23
+ }
24
+ },
25
+ "memory": {
26
+ "total": "540662628352"
27
+ },
28
+ "cpu": {
29
+ "count": 48,
30
+ "countLogical": 96
31
+ },
32
+ "gpu_nvidia": [
33
+ {
34
+ "name": "NVIDIA A40",
35
+ "memoryTotal": "48305799168",
36
+ "cudaCores": 10752,
37
+ "architecture": "Ampere"
38
+ }
39
+ ],
40
+ "cudaVersion": "12.4"
41
+ }
wandb/run-20241020_182518-i0qk9v3k/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_timestamp":1.7297368335104377e+09,"eval/accuracy_mlm":0.06220587782972441,"eval/accuracy":0.031102938914862203,"train/seconds_per_step":9.632421855926514,"train/weights_l2":7697.9996299951235,"eval/loss_ntp":0.14020523258785564,"train/grad_l2":30.740209579467773,"_step":29525,"eval/time":28.52281665802002,"train/loss":3.5201581421494486,"eval/loss":4.0174038550985145,"train/loss_ntp":0.14717130114790053,"_runtime":288147.130780562,"_wandb":{"runtime":288147},"train/loss_mlm":3.3729868379235266,"eval/loss_mlm":3.877198635123846,"eval/accuracy_ntp":0,"train/lr":0.0006538872513957004}
wandb/run-20241020_182518-i0qk9v3k/logs/debug-core.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2024-10-20T18:25:17.702790895Z","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmptqsfbw3z/port-4102.txt","pid":4102,"debug":false,"disable-analytics":false}
2
+ {"time":"2024-10-20T18:25:17.702844368Z","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
3
+ {"time":"2024-10-20T18:25:17.703798232Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":4102}
4
+ {"time":"2024-10-20T18:25:17.703819799Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":42971,"Zone":""}}
5
+ {"time":"2024-10-20T18:25:17.871771876Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:58724"}
6
+ {"time":"2024-10-20T18:25:18.127264011Z","level":"INFO","msg":"handleInformInit: received","streamId":"i0qk9v3k","id":"127.0.0.1:58724"}
7
+ {"time":"2024-10-20T18:25:18.24781459Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"i0qk9v3k","id":"127.0.0.1:58724"}
8
+ {"time":"2024-10-24T02:27:45.25401773Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:58724"}
9
+ {"time":"2024-10-24T02:27:45.254421548Z","level":"INFO","msg":"server is shutting down"}
10
+ {"time":"2024-10-24T02:27:45.254423515Z","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:58724"}
11
+ {"time":"2024-10-24T02:27:45.254678436Z","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:58724"}
12
+ {"time":"2024-10-24T02:27:45.867647487Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:58724"}
13
+ {"time":"2024-10-24T02:27:45.867719893Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:58724"}
14
+ {"time":"2024-10-24T02:27:45.86773086Z","level":"INFO","msg":"server is closed"}
wandb/run-20241020_182518-i0qk9v3k/logs/debug-internal.log ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {"time":"2024-10-20T18:25:18.130390854Z","level":"INFO","msg":"using version","core version":"0.18.5"}
2
+ {"time":"2024-10-20T18:25:18.131160825Z","level":"INFO","msg":"created symlink","path":"/workspace/nanoT5/logs/2024-10-20/18-25-17/wandb/run-20241020_182518-i0qk9v3k/logs/debug-core.log"}
3
+ {"time":"2024-10-20T18:25:18.247302473Z","level":"INFO","msg":"created new stream","id":"i0qk9v3k"}
4
+ {"time":"2024-10-20T18:25:18.247577857Z","level":"INFO","msg":"stream: started","id":"i0qk9v3k"}
5
+ {"time":"2024-10-20T18:25:18.247668586Z","level":"INFO","msg":"handler: started","stream_id":{"value":"i0qk9v3k"}}
6
+ {"time":"2024-10-20T18:25:18.247659857Z","level":"INFO","msg":"sender: started","stream_id":"i0qk9v3k"}
7
+ {"time":"2024-10-20T18:25:18.247631762Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"i0qk9v3k"}}
8
+ {"time":"2024-10-20T18:25:19.59293904Z","level":"INFO","msg":"Starting system monitor"}
wandb/run-20241020_182518-i0qk9v3k/logs/debug.log ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-10-20 18:25:18,064 INFO MainThread:4102 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
2
+ 2024-10-20 18:25:18,064 INFO MainThread:4102 [wandb_setup.py:_flush():79] Configure stats pid to 4102
3
+ 2024-10-20 18:25:18,065 INFO MainThread:4102 [wandb_setup.py:_flush():79] Loading settings from /root/.config/wandb/settings
4
+ 2024-10-20 18:25:18,065 INFO MainThread:4102 [wandb_setup.py:_flush():79] Loading settings from /workspace/nanoT5/logs/2024-10-20/18-25-17/wandb/settings
5
+ 2024-10-20 18:25:18,066 INFO MainThread:4102 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
6
+ 2024-10-20 18:25:18,066 INFO MainThread:4102 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
7
+ 2024-10-20 18:25:18,067 WARNING MainThread:4102 [wandb_setup.py:_flush():79] Could not find program at -m nanoT5.main
8
+ 2024-10-20 18:25:18,067 INFO MainThread:4102 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m nanoT5.main'}
9
+ 2024-10-20 18:25:18,068 INFO MainThread:4102 [wandb_setup.py:_flush():79] Applying login settings: {}
10
+ 2024-10-20 18:25:18,069 INFO MainThread:4102 [wandb_init.py:_log_setup():534] Logging user logs to /workspace/nanoT5/logs/2024-10-20/18-25-17/wandb/run-20241020_182518-i0qk9v3k/logs/debug.log
11
+ 2024-10-20 18:25:18,071 INFO MainThread:4102 [wandb_init.py:_log_setup():535] Logging internal logs to /workspace/nanoT5/logs/2024-10-20/18-25-17/wandb/run-20241020_182518-i0qk9v3k/logs/debug-internal.log
12
+ 2024-10-20 18:25:18,071 INFO MainThread:4102 [wandb_init.py:init():621] calling init triggers
13
+ 2024-10-20 18:25:18,072 INFO MainThread:4102 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
14
+ config: {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 93789, 'tokenizer': {'name': 'BEE-spoke-data/hf_slimpajama-6B-28672-BPE-forT5'}, 'working_dir': '/workspace/nanoT5/logs/2024-10-20/18-25-17', 'model': {'liger': True, 'klass': 'local_t5', 'name': 'pszemraj/tFINE-850m-24x24-1024ctx', 'overwrite': {'dropout_rate': 0.0, 'num_decoder_layers': 16, 'num_key_value_heads': 4, 'num_layers': 16, 'use_gqa': True}, 'add_config': {'is_bf16': True}, 'checkpoint_path': '', 'random_init': True, 'compile': True}, 'data': {'multi_task': True, 'NTP': 0.3, 'input_length': 512, 'max_seq_len': 512, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 0}, 'optim': {'name': 'adamwscale', 'base_lr': 0.001, 'batch_size': 128, 'total_steps': 65536, 'epochs': -1, 'warmup_steps': 5000, 'lr_scheduler': 'cosine', 'weight_decay': 0.01, 'grad_clip': 1.0, 'grad_acc': 16, 'final_cosine': 2e-05}, 'eval': {'every_steps': 500, 'steps': 0}, 'checkpoint': {'every_steps': 1500}, 'logging': {'every_steps': 25, 'grad_l2': True, 'weights_l2': True, 'use_wandb': True, 'wandb_config': {'project': 'nanoT5', 'entity': 'amazingvince', 'tags': ['gqa', 'large', 'e32-d16', '512 ctx'], 'mode': 'online'}}, 'slurm_id': 'none'}
15
+ 2024-10-20 18:25:18,073 INFO MainThread:4102 [wandb_init.py:init():671] starting backend
16
+ 2024-10-20 18:25:18,074 INFO MainThread:4102 [wandb_init.py:init():675] sending inform_init request
17
+ 2024-10-20 18:25:18,121 INFO MainThread:4102 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
18
+ 2024-10-20 18:25:18,122 INFO MainThread:4102 [wandb_init.py:init():688] backend started and connected
19
+ 2024-10-20 18:25:18,198 INFO MainThread:4102 [wandb_init.py:init():783] updated telemetry
20
+ 2024-10-20 18:25:18,256 INFO MainThread:4102 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
21
+ 2024-10-20 18:25:19,558 INFO MainThread:4102 [wandb_init.py:init():867] starting run threads in backend
22
+ 2024-10-20 18:25:19,755 INFO MainThread:4102 [wandb_run.py:_console_start():2463] atexit reg
23
+ 2024-10-20 18:25:19,756 INFO MainThread:4102 [wandb_run.py:_redirect():2311] redirect: wrap_raw
24
+ 2024-10-20 18:25:19,757 INFO MainThread:4102 [wandb_run.py:_redirect():2376] Wrapping output streams.
25
+ 2024-10-20 18:25:19,759 INFO MainThread:4102 [wandb_run.py:_redirect():2401] Redirects installed.
26
+ 2024-10-20 18:25:19,763 INFO MainThread:4102 [wandb_init.py:init():911] run started, returning control to user process
27
+ 2024-10-20 18:25:41,763 INFO MainThread:4102 [wandb_run.py:_config_callback():1390] config_cb None None {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 93789, 'tokenizer': {'name': 'BEE-spoke-data/hf_slimpajama-6B-28672-BPE-forT5'}, 'working_dir': '/workspace/nanoT5/logs/2024-10-20/18-25-17', 'model': {'liger': True, 'klass': 'local_t5', 'name': 'pszemraj/tFINE-850m-24x24-1024ctx', 'overwrite': {'dropout_rate': 0.0, 'num_decoder_layers': 16, 'num_key_value_heads': 4, 'num_layers': 16, 'use_gqa': True}, 'add_config': {'is_bf16': True}, 'checkpoint_path': '', 'random_init': True, 'compile': True}, 'data': {'multi_task': True, 'NTP': 0.3, 'input_length': 512, 'max_seq_len': 512, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 0, 'before_mask_input_length': 568, 'target_length': 114}, 'optim': {'name': 'adamwscale', 'base_lr': 0.001, 'batch_size': 128, 'total_steps': 65536, 'epochs': -1, 'warmup_steps': 5000, 'lr_scheduler': 'cosine', 'weight_decay': 0.01, 'grad_clip': 1.0, 'grad_acc': 16, 'final_cosine': 2e-05}, 'eval': {'every_steps': 500, 'steps': 0, 'corrected_steps': 0}, 'checkpoint': {'every_steps': 1500}, 'logging': {'every_steps': 25, 'grad_l2': True, 'weights_l2': True, 'use_wandb': True, 'wandb_config': {'project': 'nanoT5', 'entity': 'amazingvince', 'tags': ['gqa', 'large', 'e32-d16', '512 ctx'], 'mode': 'online'}}, 'slurm_id': 'none', 'n_all_param': 486886912}
28
+ 2024-10-24 02:27:45,254 WARNING MsgRouterThr:4102 [router.py:message_loop():77] message_loop has been closed
wandb/run-20241020_182518-i0qk9v3k/run-i0qk9v3k.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a7c6447c6a15329124db78b2c333583d47f717079c22331f7e91b92b357127a
3
+ size 152734877