amazingvince
committed on
Upload folder using huggingface_hub
Browse files

- .gitattributes +1 -0
- .hydra/config.yaml +63 -0
- .hydra/hydra.yaml +156 -0
- .hydra/overrides.yaml +1 -0
- .ipynb_checkpoints/config-checkpoint.json +34 -0
- .ipynb_checkpoints/test-checkpoint.py +25 -0
- README.md +18 -0
- amazingvince/ul3-base/added_tokens.json +102 -0
- amazingvince/ul3-base/config.json +35 -0
- amazingvince/ul3-base/model.safetensors +3 -0
- amazingvince/ul3-base/special_tokens_map.json +132 -0
- amazingvince/ul3-base/tokenizer.json +0 -0
- amazingvince/ul3-base/tokenizer.model +3 -0
- amazingvince/ul3-base/tokenizer_config.json +952 -0
- checkpoint-pt-21000/model.safetensors +3 -0
- checkpoint-pt-21000/random_states_0.pkl +3 -0
- checkpoint-pt-22500/model.safetensors +3 -0
- checkpoint-pt-22500/random_states_0.pkl +3 -0
- checkpoint-pt-25500/model.safetensors +3 -0
- checkpoint-pt-25500/random_states_0.pkl +3 -0
- checkpoint-pt-27000/config.json +34 -0
- checkpoint-pt-27000/model.safetensors +3 -0
- checkpoint-pt-27000/random_states_0.pkl +3 -0
- checkpoint-pt-28500/config.json +34 -0
- checkpoint-pt-28500/model.safetensors +3 -0
- checkpoint-pt-28500/random_states_0.pkl +3 -0
- config.json +87 -0
- main.log +189 -0
- test.py +25 -0
- wandb/debug-internal.log +8 -0
- wandb/debug.log +28 -0
- wandb/run-20241020_182518-i0qk9v3k/files/config.yaml +123 -0
- wandb/run-20241020_182518-i0qk9v3k/files/output.log +259 -0
- wandb/run-20241020_182518-i0qk9v3k/files/requirements.txt +194 -0
- wandb/run-20241020_182518-i0qk9v3k/files/wandb-metadata.json +41 -0
- wandb/run-20241020_182518-i0qk9v3k/files/wandb-summary.json +1 -0
- wandb/run-20241020_182518-i0qk9v3k/logs/debug-core.log +14 -0
- wandb/run-20241020_182518-i0qk9v3k/logs/debug-internal.log +8 -0
- wandb/run-20241020_182518-i0qk9v3k/logs/debug.log +28 -0
- wandb/run-20241020_182518-i0qk9v3k/run-i0qk9v3k.wandb +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241020_182518-i0qk9v3k/run-i0qk9v3k.wandb filter=lfs diff=lfs merge=lfs -text
.hydra/config.yaml
ADDED
@@ -0,0 +1,63 @@
mode: pt
device: gpu
precision: bf16
eval_only: false
predict_only: false
seed: 93789
tokenizer:
  name: BEE-spoke-data/hf_slimpajama-6B-28672-BPE-forT5
working_dir: null
model:
  liger: true
  klass: local_t5
  name: pszemraj/tFINE-850m-24x24-1024ctx
  overwrite:
    dropout_rate: 0.0
    num_decoder_layers: 16
    num_key_value_heads: 4
    num_layers: 16
    use_gqa: true
  add_config:
    is_bf16: false
  checkpoint_path: ''
  random_init: true
  compile: true
data:
  multi_task: true
  NTP: 0.3
  input_length: 512
  max_seq_len: 512
  mlm_probability: 0.15
  mean_noise_span_length: 3.0
  num_workers: 0
optim:
  name: adamwscale
  base_lr: 0.001
  batch_size: 128
  total_steps: 65536
  epochs: -1
  warmup_steps: 5000
  lr_scheduler: cosine
  weight_decay: 0.01
  grad_clip: 1.0
  grad_acc: 16
  final_cosine: 2.0e-05
eval:
  every_steps: 500
  steps: 0
checkpoint:
  every_steps: 1500
logging:
  every_steps: 25
  grad_l2: true
  weights_l2: true
  use_wandb: true
  wandb_config:
    project: nanoT5
    entity: amazingvince
    tags:
    - gqa
    - large
    - e32-d16
    - 512 ctx
    mode: online
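Hydra archives this fully resolved run config next to the outputs, so the key hyperparameters can be read back programmatically. A minimal sketch (assuming `omegaconf` is installed, which Hydra depends on):

```python
# A minimal sketch: load the archived Hydra run config and inspect
# the training hyperparameters recorded above.
from omegaconf import OmegaConf

cfg = OmegaConf.load(".hydra/config.yaml")
print(cfg.model.name)                                # pszemraj/tFINE-850m-24x24-1024ctx
print(cfg.optim.base_lr, cfg.optim.lr_scheduler)     # 0.001 cosine
print(cfg.data.input_length, cfg.optim.total_steps)  # 512 65536
```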
.hydra/hydra.yaml
ADDED
@@ -0,0 +1,156 @@
hydra:
  run:
    dir: ./logs/${now:%Y-%m-%d}/${now:%H-%M-%S}
  sweep:
    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
    subdir: ${hydra.job.num}
  launcher:
    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
  sweeper:
    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
    max_batch_size: null
    params: null
  help:
    app_name: ${hydra.job.name}
    header: '${hydra.help.app_name} is powered by Hydra.

      '
    footer: 'Powered by Hydra (https://hydra.cc)

      Use --hydra-help to view Hydra specific help

      '
    template: '${hydra.help.header}

      == Configuration groups ==

      Compose your configuration from those groups (group=option)


      $APP_CONFIG_GROUPS


      == Config ==

      Override anything in the config (foo.bar=value)


      $CONFIG


      ${hydra.help.footer}

      '
  hydra_help:
    template: 'Hydra (${hydra.runtime.version})

      See https://hydra.cc for more info.


      == Flags ==

      $FLAGS_HELP


      == Configuration groups ==

      Compose your configuration from those groups (For example, append hydra/job_logging=disabled

      to command line)


      $HYDRA_CONFIG_GROUPS


      Use ''--cfg hydra'' to Show the Hydra config.

      '
    hydra_help: ???
  hydra_logging:
    version: 1
    formatters:
      simple:
        format: '[%(asctime)s][HYDRA] %(message)s'
    handlers:
      console:
        class: logging.StreamHandler
        formatter: simple
        stream: ext://sys.stdout
    root:
      level: INFO
      handlers:
      - console
    loggers:
      logging_example:
        level: DEBUG
    disable_existing_loggers: false
  job_logging:
    version: 1
    formatters:
      simple:
        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
    handlers:
      console:
        class: logging.StreamHandler
        formatter: simple
        stream: ext://sys.stdout
      file:
        class: logging.FileHandler
        formatter: simple
        filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
    root:
      level: INFO
      handlers:
      - console
      - file
    disable_existing_loggers: false
  env: {}
  mode: RUN
  searchpath: []
  callbacks: {}
  output_subdir: .hydra
  overrides:
    hydra:
    - hydra.mode=RUN
    task: []
  job:
    name: main
    chdir: true
    override_dirname: ''
    id: ???
    num: ???
    config_name: default
    env_set: {}
    env_copy: []
    config:
      override_dirname:
        kv_sep: '='
        item_sep: ','
        exclude_keys: []
  runtime:
    version: 1.3.2
    version_base: '1.1'
    cwd: /workspace/nanoT5
    config_sources:
    - path: hydra.conf
      schema: pkg
      provider: hydra
    - path: /workspace/nanoT5/nanoT5/configs
      schema: file
      provider: main
    - path: ''
      schema: structured
      provider: schema
    output_dir: /workspace/nanoT5/logs/2024-10-20/18-25-17
    choices:
      local_env: default
      task: pt
      hydra/env: default
      hydra/callbacks: null
      hydra/job_logging: default
      hydra/hydra_logging: default
      hydra/hydra_help: default
      hydra/help: default
      hydra/sweeper: basic
      hydra/launcher: basic
      hydra/output: default
  verbose: false
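Per this file, the job was named `main`, composed from `config_name: default` under `/workspace/nanoT5/nanoT5/configs`, with Hydra 1.3.2 and `version_base: '1.1'`. A hedged sketch of recomposing that config offline with Hydra's Compose API, assuming a local nanoT5 checkout at the recorded path:

```python
# A hedged sketch (assumes Hydra >= 1.2 and a nanoT5 checkout at the
# path recorded above): rebuild the run's config without launching a job.
from hydra import compose, initialize_config_dir

with initialize_config_dir(config_dir="/workspace/nanoT5/nanoT5/configs",
                           version_base="1.1"):
    cfg = compose(config_name="default")
print(cfg.optim.total_steps)  # expected 65536, matching .hydra/config.yaml
```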
.hydra/overrides.yaml
ADDED
@@ -0,0 +1 @@
[]
.ipynb_checkpoints/config-checkpoint.json
ADDED
@@ -0,0 +1,34 @@
{
  "_name_or_path": "pszemraj/tFINE-850m-24x24-1024ctx",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 3,
  "dense_act_fn": "silu",
  "dropout_rate": 0.0,
  "eos_token_id": 2,
  "feed_forward_proj": "gated-silu",
  "initializer_factor": 1.0,
  "is_bf16": true,
  "is_encoder_decoder": false,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 16,
  "num_heads": 16,
  "num_key_value_heads": 4,
  "num_layers": 16,
  "output_past": true,
  "pad_token_id": 3,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 48,
  "tie_word_embeddings": false,
  "transformers_version": "4.46.0.dev0",
  "use_cache": true,
  "use_gqa": true,
  "vocab_size": 28776
}
.ipynb_checkpoints/test-checkpoint.py
ADDED
@@ -0,0 +1,25 @@
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

tokenizer = AutoTokenizer.from_pretrained("BEE-spoke-data/hf_slimpajama-6B-28672-BPE-forT5")

# Register the extra task-prefix tokens ([R]/[S]/[X]/[NTP]) used by the
# multi-task pretraining setup.
special_tokens_dict = {'additional_special_tokens': ['[R]', '[S]', '[X]', '[NTP]']}
tokenizer.add_special_tokens(special_tokens_dict)

# Load the raw pretraining checkpoint from the Hydra run directory.
model = AutoModelForSeq2SeqLM.from_pretrained("/workspace/nanoT5/logs/2024-10-20/18-25-17/checkpoint-pt-27000").to("cuda")
prompt = "The "
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# Add decoder_input_ids
# decoder_input_ids = torch.ones((inputs.input_ids.shape[0], 1), dtype=torch.long) * model.config.decoder_start_token_id

# Generate
generated_ids = model.generate(
    **inputs,
    # decoder_input_ids=decoder_input_ids,
    max_new_tokens=20,
    no_repeat_ngram_size=5
)

# Decode the output
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(generated_text)
README.md
ADDED
@@ -0,0 +1,18 @@
---
language: en
license: mit
tags: ['t5', 'transformers']
---

# amazingvince/ul3-base

Description of your model

## Usage

```python
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("amazingvince/ul3-base")
model = AutoModel.from_pretrained("amazingvince/ul3-base")
```
amazingvince/ul3-base/added_tokens.json
ADDED
@@ -0,0 +1,102 @@
{
  "<extra_id_0>": 28672,
  "<extra_id_10>": 28682,
  "<extra_id_11>": 28683,
  "<extra_id_12>": 28684,
  "<extra_id_13>": 28685,
  "<extra_id_14>": 28686,
  "<extra_id_15>": 28687,
  "<extra_id_16>": 28688,
  "<extra_id_17>": 28689,
  "<extra_id_18>": 28690,
  "<extra_id_19>": 28691,
  "<extra_id_1>": 28673,
  "<extra_id_20>": 28692,
  "<extra_id_21>": 28693,
  "<extra_id_22>": 28694,
  "<extra_id_23>": 28695,
  "<extra_id_24>": 28696,
  "<extra_id_25>": 28697,
  "<extra_id_26>": 28698,
  "<extra_id_27>": 28699,
  "<extra_id_28>": 28700,
  "<extra_id_29>": 28701,
  "<extra_id_2>": 28674,
  "<extra_id_30>": 28702,
  "<extra_id_31>": 28703,
  "<extra_id_32>": 28704,
  "<extra_id_33>": 28705,
  "<extra_id_34>": 28706,
  "<extra_id_35>": 28707,
  "<extra_id_36>": 28708,
  "<extra_id_37>": 28709,
  "<extra_id_38>": 28710,
  "<extra_id_39>": 28711,
  "<extra_id_3>": 28675,
  "<extra_id_40>": 28712,
  "<extra_id_41>": 28713,
  "<extra_id_42>": 28714,
  "<extra_id_43>": 28715,
  "<extra_id_44>": 28716,
  "<extra_id_45>": 28717,
  "<extra_id_46>": 28718,
  "<extra_id_47>": 28719,
  "<extra_id_48>": 28720,
  "<extra_id_49>": 28721,
  "<extra_id_4>": 28676,
  "<extra_id_50>": 28722,
  "<extra_id_51>": 28723,
  "<extra_id_52>": 28724,
  "<extra_id_53>": 28725,
  "<extra_id_54>": 28726,
  "<extra_id_55>": 28727,
  "<extra_id_56>": 28728,
  "<extra_id_57>": 28729,
  "<extra_id_58>": 28730,
  "<extra_id_59>": 28731,
  "<extra_id_5>": 28677,
  "<extra_id_60>": 28732,
  "<extra_id_61>": 28733,
  "<extra_id_62>": 28734,
  "<extra_id_63>": 28735,
  "<extra_id_64>": 28736,
  "<extra_id_65>": 28737,
  "<extra_id_66>": 28738,
  "<extra_id_67>": 28739,
  "<extra_id_68>": 28740,
  "<extra_id_69>": 28741,
  "<extra_id_6>": 28678,
  "<extra_id_70>": 28742,
  "<extra_id_71>": 28743,
  "<extra_id_72>": 28744,
  "<extra_id_73>": 28745,
  "<extra_id_74>": 28746,
  "<extra_id_75>": 28747,
  "<extra_id_76>": 28748,
  "<extra_id_77>": 28749,
  "<extra_id_78>": 28750,
  "<extra_id_79>": 28751,
  "<extra_id_7>": 28679,
  "<extra_id_80>": 28752,
  "<extra_id_81>": 28753,
  "<extra_id_82>": 28754,
  "<extra_id_83>": 28755,
  "<extra_id_84>": 28756,
  "<extra_id_85>": 28757,
  "<extra_id_86>": 28758,
  "<extra_id_87>": 28759,
  "<extra_id_88>": 28760,
  "<extra_id_89>": 28761,
  "<extra_id_8>": 28680,
  "<extra_id_90>": 28762,
  "<extra_id_91>": 28763,
  "<extra_id_92>": 28764,
  "<extra_id_93>": 28765,
  "<extra_id_94>": 28766,
  "<extra_id_95>": 28767,
  "<extra_id_96>": 28768,
  "<extra_id_97>": 28769,
  "<extra_id_98>": 28770,
  "<extra_id_99>": 28771,
  "<extra_id_9>": 28681
}
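The sentinel IDs form one contiguous block: `<extra_id_N>` maps to `28672 + N`. A quick sanity check against the published tokenizer (assumes Hub access):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("amazingvince/ul3-base")
assert tok.convert_tokens_to_ids("<extra_id_0>") == 28672
assert tok.convert_tokens_to_ids("<extra_id_99>") == 28771  # 28672 + 99
```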
amazingvince/ul3-base/config.json
ADDED
@@ -0,0 +1,35 @@
{
  "_name_or_path": "/workspace/nanoT5/logs/2024-10-20/18-25-17/checkpoint-pt-27000",
  "architectures": [
    "T5Model"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 3,
  "dense_act_fn": "silu",
  "dropout_rate": 0.0,
  "eos_token_id": 2,
  "feed_forward_proj": "gated-silu",
  "initializer_factor": 1.0,
  "is_bf16": true,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 16,
  "num_heads": 16,
  "num_key_value_heads": 4,
  "num_layers": 16,
  "output_past": true,
  "pad_token_id": 3,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 48,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.46.0.dev0",
  "use_cache": true,
  "use_gqa": true,
  "vocab_size": 28776
}
amazingvince/ul3-base/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bda83debb4d9f284d7be9c454e66a4c966c10e0919bc07c73ce47ad9a94ba11c
size 1829529944
amazingvince/ul3-base/special_tokens_map.json
ADDED
@@ -0,0 +1,132 @@
{
  "additional_special_tokens": [
    "<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>",
    "<extra_id_5>", "<extra_id_6>", "<extra_id_7>", "<extra_id_8>", "<extra_id_9>",
    "<extra_id_10>", "<extra_id_11>", "<extra_id_12>", "<extra_id_13>", "<extra_id_14>",
    "<extra_id_15>", "<extra_id_16>", "<extra_id_17>", "<extra_id_18>", "<extra_id_19>",
    "<extra_id_20>", "<extra_id_21>", "<extra_id_22>", "<extra_id_23>", "<extra_id_24>",
    "<extra_id_25>", "<extra_id_26>", "<extra_id_27>", "<extra_id_28>", "<extra_id_29>",
    "<extra_id_30>", "<extra_id_31>", "<extra_id_32>", "<extra_id_33>", "<extra_id_34>",
    "<extra_id_35>", "<extra_id_36>", "<extra_id_37>", "<extra_id_38>", "<extra_id_39>",
    "<extra_id_40>", "<extra_id_41>", "<extra_id_42>", "<extra_id_43>", "<extra_id_44>",
    "<extra_id_45>", "<extra_id_46>", "<extra_id_47>", "<extra_id_48>", "<extra_id_49>",
    "<extra_id_50>", "<extra_id_51>", "<extra_id_52>", "<extra_id_53>", "<extra_id_54>",
    "<extra_id_55>", "<extra_id_56>", "<extra_id_57>", "<extra_id_58>", "<extra_id_59>",
    "<extra_id_60>", "<extra_id_61>", "<extra_id_62>", "<extra_id_63>", "<extra_id_64>",
    "<extra_id_65>", "<extra_id_66>", "<extra_id_67>", "<extra_id_68>", "<extra_id_69>",
    "<extra_id_70>", "<extra_id_71>", "<extra_id_72>", "<extra_id_73>", "<extra_id_74>",
    "<extra_id_75>", "<extra_id_76>", "<extra_id_77>", "<extra_id_78>", "<extra_id_79>",
    "<extra_id_80>", "<extra_id_81>", "<extra_id_82>", "<extra_id_83>", "<extra_id_84>",
    "<extra_id_85>", "<extra_id_86>", "<extra_id_87>", "<extra_id_88>", "<extra_id_89>",
    "<extra_id_90>", "<extra_id_91>", "<extra_id_92>", "<extra_id_93>", "<extra_id_94>",
    "<extra_id_95>", "<extra_id_96>", "<extra_id_97>", "<extra_id_98>", "<extra_id_99>"
  ],
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
amazingvince/ul3-base/tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
amazingvince/ul3-base/tokenizer.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5b01740e92325d7d8300b8f4c4249cd4bcb70533fe06e5632a431b37d91d7a2a
size 711026
amazingvince/ul3-base/tokenizer_config.json
ADDED
@@ -0,0 +1,952 @@
{
  "add_bos_token": false,
  "add_eos_token": true,
  "add_prefix_space": true,
  "added_tokens_decoder": {
    "0": {"content": "<unk>", "lstrip": false, "normalized": true, "rstrip": false, "single_word": false, "special": true},
    "1": {"content": "<s>", "lstrip": false, "normalized": true, "rstrip": false, "single_word": false, "special": true},
    "2": {"content": "</s>", "lstrip": false, "normalized": true, "rstrip": false, "single_word": false, "special": true},
    "3": {"content": "<pad>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28672": {"content": "<extra_id_0>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28673": {"content": "<extra_id_1>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28674": {"content": "<extra_id_2>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28675": {"content": "<extra_id_3>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28676": {"content": "<extra_id_4>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28677": {"content": "<extra_id_5>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28678": {"content": "<extra_id_6>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28679": {"content": "<extra_id_7>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28680": {"content": "<extra_id_8>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28681": {"content": "<extra_id_9>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28682": {"content": "<extra_id_10>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28683": {"content": "<extra_id_11>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28684": {"content": "<extra_id_12>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28685": {"content": "<extra_id_13>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28686": {"content": "<extra_id_14>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28687": {"content": "<extra_id_15>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28688": {"content": "<extra_id_16>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28689": {"content": "<extra_id_17>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28690": {"content": "<extra_id_18>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28691": {"content": "<extra_id_19>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28692": {"content": "<extra_id_20>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28693": {"content": "<extra_id_21>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28694": {"content": "<extra_id_22>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28695": {"content": "<extra_id_23>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28696": {"content": "<extra_id_24>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28697": {"content": "<extra_id_25>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28698": {"content": "<extra_id_26>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28699": {"content": "<extra_id_27>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28700": {"content": "<extra_id_28>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28701": {"content": "<extra_id_29>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28702": {"content": "<extra_id_30>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28703": {"content": "<extra_id_31>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28704": {"content": "<extra_id_32>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28705": {"content": "<extra_id_33>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28706": {"content": "<extra_id_34>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28707": {"content": "<extra_id_35>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28708": {"content": "<extra_id_36>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28709": {"content": "<extra_id_37>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28710": {"content": "<extra_id_38>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28711": {"content": "<extra_id_39>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28712": {"content": "<extra_id_40>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28713": {"content": "<extra_id_41>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28714": {"content": "<extra_id_42>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28715": {"content": "<extra_id_43>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28716": {"content": "<extra_id_44>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28717": {"content": "<extra_id_45>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28718": {"content": "<extra_id_46>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28719": {"content": "<extra_id_47>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28720": {"content": "<extra_id_48>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28721": {"content": "<extra_id_49>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28722": {"content": "<extra_id_50>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28723": {"content": "<extra_id_51>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28724": {"content": "<extra_id_52>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28725": {"content": "<extra_id_53>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28726": {"content": "<extra_id_54>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28727": {"content": "<extra_id_55>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28728": {"content": "<extra_id_56>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28729": {"content": "<extra_id_57>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28730": {"content": "<extra_id_58>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28731": {"content": "<extra_id_59>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28732": {"content": "<extra_id_60>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28733": {"content": "<extra_id_61>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28734": {"content": "<extra_id_62>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28735": {"content": "<extra_id_63>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28736": {"content": "<extra_id_64>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28737": {"content": "<extra_id_65>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28738": {"content": "<extra_id_66>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28739": {"content": "<extra_id_67>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28740": {"content": "<extra_id_68>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28741": {"content": "<extra_id_69>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28742": {"content": "<extra_id_70>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28743": {"content": "<extra_id_71>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28744": {"content": "<extra_id_72>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28745": {"content": "<extra_id_73>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28746": {"content": "<extra_id_74>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28747": {"content": "<extra_id_75>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28748": {"content": "<extra_id_76>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28749": {"content": "<extra_id_77>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28750": {"content": "<extra_id_78>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28751": {"content": "<extra_id_79>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28752": {"content": "<extra_id_80>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28753": {"content": "<extra_id_81>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28754": {"content": "<extra_id_82>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28755": {"content": "<extra_id_83>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28756": {"content": "<extra_id_84>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28757": {"content": "<extra_id_85>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28758": {"content": "<extra_id_86>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28759": {"content": "<extra_id_87>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28760": {"content": "<extra_id_88>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28761": {"content": "<extra_id_89>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28762": {"content": "<extra_id_90>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28763": {"content": "<extra_id_91>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28764": {"content": "<extra_id_92>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28765": {"content": "<extra_id_93>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28766": {"content": "<extra_id_94>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28767": {"content": "<extra_id_95>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28768": {"content": "<extra_id_96>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28769": {"content": "<extra_id_97>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28770": {"content": "<extra_id_98>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
    "28771": {"content": "<extra_id_99>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}
  },
  "additional_special_tokens": [
    "<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>",
    "<extra_id_5>", "<extra_id_6>", "<extra_id_7>", "<extra_id_8>", "<extra_id_9>",
    "<extra_id_10>", "<extra_id_11>", "<extra_id_12>", "<extra_id_13>", "<extra_id_14>",
    "<extra_id_15>", "<extra_id_16>", "<extra_id_17>", "<extra_id_18>", "<extra_id_19>",
    "<extra_id_20>", "<extra_id_21>", "<extra_id_22>", "<extra_id_23>", "<extra_id_24>",
    "<extra_id_25>", "<extra_id_26>", "<extra_id_27>", "<extra_id_28>", "<extra_id_29>",
    "<extra_id_30>", "<extra_id_31>", "<extra_id_32>", "<extra_id_33>", "<extra_id_34>",
    "<extra_id_35>", "<extra_id_36>", "<extra_id_37>", "<extra_id_38>", "<extra_id_39>",
    "<extra_id_40>", "<extra_id_41>", "<extra_id_42>", "<extra_id_43>", "<extra_id_44>",
    "<extra_id_45>", "<extra_id_46>", "<extra_id_47>", "<extra_id_48>", "<extra_id_49>",
    "<extra_id_50>", "<extra_id_51>", "<extra_id_52>", "<extra_id_53>", "<extra_id_54>",
    "<extra_id_55>", "<extra_id_56>", "<extra_id_57>", "<extra_id_58>", "<extra_id_59>",
    "<extra_id_60>", "<extra_id_61>", "<extra_id_62>", "<extra_id_63>", "<extra_id_64>",
    "<extra_id_65>", "<extra_id_66>", "<extra_id_67>", "<extra_id_68>", "<extra_id_69>",
    "<extra_id_70>", "<extra_id_71>", "<extra_id_72>", "<extra_id_73>", "<extra_id_74>",
    "<extra_id_75>", "<extra_id_76>", "<extra_id_77>", "<extra_id_78>", "<extra_id_79>",
    "<extra_id_80>", "<extra_id_81>", "<extra_id_82>", "<extra_id_83>", "<extra_id_84>",
    "<extra_id_85>", "<extra_id_86>", "<extra_id_87>", "<extra_id_88>", "<extra_id_89>",
    "<extra_id_90>", "<extra_id_91>", "<extra_id_92>", "<extra_id_93>", "<extra_id_94>",
    "<extra_id_95>", "<extra_id_96>", "<extra_id_97>", "<extra_id_98>", "<extra_id_99>"
  ],
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": false,
  "eos_token": "</s>",
  "legacy": false,
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "<pad>",
  "sp_model_kwargs": {},
  "spaces_between_special_tokens": false,
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": "<unk>",
  "use_default_system_prompt": false
}
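With `"add_bos_token": false` and `"add_eos_token": true`, encodings should end with `</s>` (id 2) and not start with `<s>` (id 1). A minimal sanity-check sketch; whether the flags are honored can depend on the slow vs. fast `LlamaTokenizer` path actually loaded, so treat this as a check rather than a guarantee:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("amazingvince/ul3-base")
ids = tok("hello world").input_ids
print(ids[0] != tok.bos_token_id, ids[-1] == tok.eos_token_id)  # expected: True True
```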
checkpoint-pt-21000/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:aa771dbc4a63d23c6984ff0cccd661741ea6930436c81ed2c6039f23c106d67f
size 1947396528
checkpoint-pt-21000/random_states_0.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ed31fc694e66aee6e3b9dbdf92b28ce62f42fa820a4906b89f7190e10867013f
size 14344
checkpoint-pt-22500/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8f948773759059489ac93361a2ceb7c702cc465b93711956d49e9672baff82ff
size 1947396528
checkpoint-pt-22500/random_states_0.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c40b6245fe0cef1a5b1ebee7127ee5b84f049c2675fc9c64a2e2105955dcea53
size 14408
checkpoint-pt-25500/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cf9c8d29874fed7dd897f58cfa1dd048db2b7211aec88c8a31a8d289152994c3
size 1947396528
checkpoint-pt-25500/random_states_0.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:147819f6dc53370e7f045048178c9994c85f69245c8311221b151d13be4d89f2
size 14344
checkpoint-pt-27000/config.json
ADDED
@@ -0,0 +1,34 @@
{
  "_name_or_path": "pszemraj/tFINE-850m-24x24-1024ctx",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 3,
  "dense_act_fn": "silu",
  "dropout_rate": 0.0,
  "eos_token_id": 2,
  "feed_forward_proj": "gated-silu",
  "initializer_factor": 1.0,
  "is_bf16": true,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 16,
  "num_heads": 16,
  "num_key_value_heads": 4,
  "num_layers": 16,
  "output_past": true,
  "pad_token_id": 3,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 48,
  "tie_word_embeddings": false,
  "transformers_version": "4.46.0.dev0",
  "use_cache": true,
  "use_gqa": true,
  "vocab_size": 28776
}
checkpoint-pt-27000/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:adcba1a78a20dd132302933f9d140ab1b661ac822744324543966cf47f3ae3f6
size 1947396528
checkpoint-pt-27000/random_states_0.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3e5fcbdeef6dc6ae50b48fd2a81e8ff69cc9337d8cff59adb7fb983b993cefca
size 14344
checkpoint-pt-28500/config.json
ADDED
@@ -0,0 +1,34 @@
{
  "_name_or_path": "pszemraj/tFINE-850m-24x24-1024ctx",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 3,
  "dense_act_fn": "silu",
  "dropout_rate": 0.0,
  "eos_token_id": 2,
  "feed_forward_proj": "gated-silu",
  "initializer_factor": 1.0,
  "is_bf16": true,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 16,
  "num_heads": 16,
  "num_key_value_heads": 4,
  "num_layers": 16,
  "output_past": true,
  "pad_token_id": 3,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 48,
  "tie_word_embeddings": false,
  "transformers_version": "4.46.0.dev0",
  "use_cache": true,
  "use_gqa": true,
  "vocab_size": 28776
}
checkpoint-pt-28500/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:09cbe1fc9ff42f93d50c984bd424ec74cb9bc3913c3c4f35cdf2020cea49f824
size 1947396528
checkpoint-pt-28500/random_states_0.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:11d506418cc06a51a3b061e1996bb290626c5a36d9d02c89b87c4c18b1c09d5a
size 14408
config.json
ADDED
@@ -0,0 +1,87 @@
{
  "vocab_size": 28776,
  "d_model": 1024,
  "d_kv": 64,
  "d_ff": 3072,
  "num_layers": 16,
  "num_decoder_layers": 16,
  "num_heads": 16,
  "num_key_value_heads": 4,
  "relative_attention_num_buckets": 48,
  "relative_attention_max_distance": 128,
  "dropout_rate": 0.0,
  "classifier_dropout": 0.0,
  "layer_norm_epsilon": 1e-06,
  "initializer_factor": 1.0,
  "feed_forward_proj": "gated-silu",
  "use_cache": true,
  "use_gqa": true,
  "dense_act_fn": "silu",
  "is_gated_act": true,
  "return_dict": true,
  "output_hidden_states": false,
  "output_attentions": false,
  "torchscript": false,
  "torch_dtype": null,
  "use_bfloat16": false,
  "tf_legacy_loss": false,
  "pruned_heads": {},
  "tie_word_embeddings": false,
  "chunk_size_feed_forward": 0,
  "is_encoder_decoder": true,
  "is_decoder": false,
  "cross_attention_hidden_size": null,
  "add_cross_attention": false,
  "tie_encoder_decoder": false,
  "max_length": 20,
  "min_length": 0,
  "do_sample": false,
  "early_stopping": false,
  "num_beams": 1,
  "num_beam_groups": 1,
  "diversity_penalty": 0.0,
  "temperature": 1.0,
  "top_k": 50,
  "top_p": 1.0,
  "typical_p": 1.0,
  "repetition_penalty": 1.0,
  "length_penalty": 1.0,
  "no_repeat_ngram_size": 0,
  "encoder_no_repeat_ngram_size": 0,
  "bad_words_ids": null,
  "num_return_sequences": 1,
  "output_scores": false,
  "return_dict_in_generate": false,
  "forced_bos_token_id": null,
  "forced_eos_token_id": null,
  "remove_invalid_values": false,
  "exponential_decay_length_penalty": null,
  "suppress_tokens": null,
  "begin_suppress_tokens": null,
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "finetuning_task": null,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "tokenizer_class": null,
  "prefix": null,
  "bos_token_id": null,
  "pad_token_id": 3,
  "eos_token_id": 2,
  "sep_token_id": null,
  "decoder_start_token_id": 3,
  "task_specific_params": null,
  "problem_type": null,
  "_name_or_path": "/workspace/nanoT5/logs/2024-10-20/18-25-17/checkpoint-pt-27000",
  "transformers_version": "4.46.0.dev0",
  "is_bf16": true,
  "model_type": "t5",
  "output_past": true
}
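
The top-level config.json above serializes the full T5Config for the exported model: a 16-layer encoder and decoder with 16 attention heads, 4 key-value heads (grouped-query attention), gated-SiLU feed-forward blocks, and a 28,776-token vocabulary. A minimal inspection sketch follows; note that "use_gqa" and "num_key_value_heads" are custom fields consumed by nanoT5's local_t5 class, and stock transformers T5 will carry them as plain config attributes without actually applying GQA:

from transformers import T5Config

# Sketch: load the uploaded config from a local clone of this repo ("." is
# illustrative) and inspect the GQA-related fields.
config = T5Config.from_pretrained(".")

print(config.num_layers, config.num_decoder_layers)   # 16 16
print(config.num_heads, config.num_key_value_heads)   # 16 4
print(config.feed_forward_proj, config.dense_act_fn)  # gated-silu silu
print(config.vocab_size)                               # 28776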
main.log
ADDED
@@ -0,0 +1,189 @@
[2024-10-20 18:25:17,510][accelerate.utils.other][WARNING] - Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
[2024-10-20 18:25:17,521][Main][INFO] - Distributed environment: DistributedType.NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda

Mixed precision type: bf16

[2024-10-20 18:25:17,522][Main][INFO] - Working directory is /workspace/nanoT5/logs/2024-10-20/18-25-17
[2024-10-20 18:31:35,111][Main][INFO] - [train] Step 25 out of 65536 | Loss --> 155.837 | Loss_ntp --> 76.275 | Loss_mlm --> 79.561 | Grad_l2 --> 476.354 | Weights_l2 --> 7701.821 | Lr --> 0.001 | Seconds_per_step --> 14.044 |
[2024-10-20 18:35:35,171][Main][INFO] - [train] Step 50 out of 65536 | Loss --> 98.644 | Loss_ntp --> 48.540 | Loss_mlm --> 50.105 | Grad_l2 --> 234.932 | Weights_l2 --> 7701.813 | Lr --> 0.001 | Seconds_per_step --> 9.602 |
[2024-10-20 18:39:35,197][Main][INFO] - [train] Step 75 out of 65536 | Loss --> 86.994 | Loss_ntp --> 42.861 | Loss_mlm --> 44.133 | Grad_l2 --> 180.388 | Weights_l2 --> 7701.806 | Lr --> 0.001 | Seconds_per_step --> 9.601 |
[2024-10-20 18:43:35,733][Main][INFO] - [train] Step 100 out of 65536 | Loss --> 80.568 | Loss_ntp --> 39.806 | Loss_mlm --> 40.762 | Grad_l2 --> 156.732 | Weights_l2 --> 7701.800 | Lr --> 0.001 | Seconds_per_step --> 9.621 |
[2024-10-20 18:47:37,016][Main][INFO] - [train] Step 125 out of 65536 | Loss --> 77.131 | Loss_ntp --> 38.127 | Loss_mlm --> 39.004 | Grad_l2 --> 179.590 | Weights_l2 --> 7701.794 | Lr --> 0.001 | Seconds_per_step --> 9.651 |
[2024-10-20 18:51:38,437][Main][INFO] - [train] Step 150 out of 65536 | Loss --> 73.900 | Loss_ntp --> 36.620 | Loss_mlm --> 37.281 | Grad_l2 --> 161.591 | Weights_l2 --> 7701.789 | Lr --> 0.001 | Seconds_per_step --> 9.657 |
[2024-10-20 18:55:39,020][Main][INFO] - [train] Step 175 out of 65536 | Loss --> 72.118 | Loss_ntp --> 35.763 | Loss_mlm --> 36.355 | Grad_l2 --> 161.741 | Weights_l2 --> 7701.783 | Lr --> 0.001 | Seconds_per_step --> 9.623 |
[2024-10-20 18:59:40,344][Main][INFO] - [train] Step 200 out of 65536 | Loss --> 70.712 | Loss_ntp --> 35.041 | Loss_mlm --> 35.671 | Grad_l2 --> 154.736 | Weights_l2 --> 7701.778 | Lr --> 0.001 | Seconds_per_step --> 9.653 |
[2024-10-20 19:03:39,817][Main][INFO] - [train] Step 225 out of 65536 | Loss --> 69.050 | Loss_ntp --> 34.233 | Loss_mlm --> 34.817 | Grad_l2 --> 106.908 | Weights_l2 --> 7701.772 | Lr --> 0.001 | Seconds_per_step --> 9.579 |
[2024-10-20 19:07:41,876][Main][INFO] - [train] Step 250 out of 65536 | Loss --> 68.595 | Loss_ntp --> 33.970 | Loss_mlm --> 34.625 | Grad_l2 --> 126.557 | Weights_l2 --> 7701.767 | Lr --> 0.001 | Seconds_per_step --> 9.682 |
[2024-10-20 19:11:43,944][Main][INFO] - [train] Step 275 out of 65536 | Loss --> 67.141 | Loss_ntp --> 33.297 | Loss_mlm --> 33.844 | Grad_l2 --> 114.874 | Weights_l2 --> 7701.762 | Lr --> 0.001 | Seconds_per_step --> 9.683 |
[2024-10-20 19:15:43,786][Main][INFO] - [train] Step 300 out of 65536 | Loss --> 65.916 | Loss_ntp --> 32.693 | Loss_mlm --> 33.223 | Grad_l2 --> 89.430 | Weights_l2 --> 7701.757 | Lr --> 0.001 | Seconds_per_step --> 9.594 |
[2024-10-20 19:19:45,206][Main][INFO] - [train] Step 325 out of 65536 | Loss --> 65.322 | Loss_ntp --> 32.362 | Loss_mlm --> 32.960 | Grad_l2 --> 97.785 | Weights_l2 --> 7701.751 | Lr --> 0.001 | Seconds_per_step --> 9.657 |
[2024-10-20 19:23:45,072][Main][INFO] - [train] Step 350 out of 65536 | Loss --> 64.367 | Loss_ntp --> 31.937 | Loss_mlm --> 32.430 | Grad_l2 --> 83.882 | Weights_l2 --> 7701.746 | Lr --> 0.001 | Seconds_per_step --> 9.595 |
[2024-10-20 19:27:46,534][Main][INFO] - [train] Step 375 out of 65536 | Loss --> 63.409 | Loss_ntp --> 31.433 | Loss_mlm --> 31.975 | Grad_l2 --> 75.548 | Weights_l2 --> 7701.741 | Lr --> 0.001 | Seconds_per_step --> 9.658 |
[2024-10-20 19:31:45,390][Main][INFO] - [train] Step 400 out of 65536 | Loss --> 62.292 | Loss_ntp --> 30.925 | Loss_mlm --> 31.367 | Grad_l2 --> 72.299 | Weights_l2 --> 7701.736 | Lr --> 0.001 | Seconds_per_step --> 9.554 |
[2024-10-20 19:35:46,689][Main][INFO] - [train] Step 425 out of 65536 | Loss --> 61.685 | Loss_ntp --> 30.585 | Loss_mlm --> 31.100 | Grad_l2 --> 73.838 | Weights_l2 --> 7701.731 | Lr --> 0.001 | Seconds_per_step --> 9.652 |
[2024-10-20 19:39:46,030][Main][INFO] - [train] Step 450 out of 65536 | Loss --> 61.416 | Loss_ntp --> 30.509 | Loss_mlm --> 30.907 | Grad_l2 --> 79.820 | Weights_l2 --> 7701.726 | Lr --> 0.001 | Seconds_per_step --> 9.573 |
[2024-10-20 19:43:47,298][Main][INFO] - [train] Step 475 out of 65536 | Loss --> 60.536 | Loss_ntp --> 30.069 | Loss_mlm --> 30.467 | Grad_l2 --> 59.074 | Weights_l2 --> 7701.722 | Lr --> 0.001 | Seconds_per_step --> 9.651 |
[2024-10-20 19:47:48,778][Main][INFO] - [train] Step 500 out of 65536 | Loss --> 60.085 | Loss_ntp --> 29.838 | Loss_mlm --> 30.246 | Grad_l2 --> 71.417 | Weights_l2 --> 7701.717 | Lr --> 0.001 | Seconds_per_step --> 9.659 |
[2024-10-20 19:49:25,862][Main][INFO] - [eval] Step 500 out of 65536 | Loss --> 57.611 | Loss_ntp --> 28.694 | Loss_mlm --> 28.917 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 97.080 |
[2024-10-20 19:53:26,482][Main][INFO] - [train] Step 525 out of 65536 | Loss --> 59.106 | Loss_ntp --> 29.371 | Loss_mlm --> 29.735 | Grad_l2 --> 56.829 | Weights_l2 --> 7701.712 | Lr --> 0.001 | Seconds_per_step --> 9.625 |
[2024-10-20 19:57:25,811][Main][INFO] - [train] Step 550 out of 65536 | Loss --> 58.185 | Loss_ntp --> 28.950 | Loss_mlm --> 29.235 | Grad_l2 --> 56.368 | Weights_l2 --> 7701.707 | Lr --> 0.001 | Seconds_per_step --> 9.573 |
[2024-10-20 20:01:26,095][Main][INFO] - [train] Step 575 out of 65536 | Loss --> 57.301 | Loss_ntp --> 28.480 | Loss_mlm --> 28.821 | Grad_l2 --> 39.860 | Weights_l2 --> 7701.703 | Lr --> 0.001 | Seconds_per_step --> 9.611 |
[2024-10-20 20:05:26,649][Main][INFO] - [train] Step 600 out of 65536 | Loss --> 56.020 | Loss_ntp --> 27.906 | Loss_mlm --> 28.115 | Grad_l2 --> 35.414 | Weights_l2 --> 7701.698 | Lr --> 0.001 | Seconds_per_step --> 9.622 |
[2024-10-20 20:09:28,597][Main][INFO] - [train] Step 625 out of 65536 | Loss --> 55.363 | Loss_ntp --> 27.524 | Loss_mlm --> 27.840 | Grad_l2 --> 50.531 | Weights_l2 --> 7701.694 | Lr --> 0.001 | Seconds_per_step --> 9.678 |
[2024-10-20 20:13:29,399][Main][INFO] - [train] Step 650 out of 65536 | Loss --> 54.803 | Loss_ntp --> 27.252 | Loss_mlm --> 27.551 | Grad_l2 --> 56.108 | Weights_l2 --> 7701.689 | Lr --> 0.001 | Seconds_per_step --> 9.632 |
[2024-10-20 20:17:31,948][Main][INFO] - [train] Step 675 out of 65536 | Loss --> 53.970 | Loss_ntp --> 26.793 | Loss_mlm --> 27.176 | Grad_l2 --> 46.473 | Weights_l2 --> 7701.685 | Lr --> 0.001 | Seconds_per_step --> 9.702 |
[2024-10-20 20:21:31,196][Main][INFO] - [train] Step 700 out of 65536 | Loss --> 53.056 | Loss_ntp --> 26.359 | Loss_mlm --> 26.697 | Grad_l2 --> 37.435 | Weights_l2 --> 7701.680 | Lr --> 0.001 | Seconds_per_step --> 9.570 |
[2024-10-20 20:25:33,347][Main][INFO] - [train] Step 725 out of 65536 | Loss --> 52.070 | Loss_ntp --> 25.876 | Loss_mlm --> 26.194 | Grad_l2 --> 43.881 | Weights_l2 --> 7701.676 | Lr --> 0.001 | Seconds_per_step --> 9.686 |
[2024-10-20 20:29:33,004][Main][INFO] - [train] Step 750 out of 65536 | Loss --> 51.191 | Loss_ntp --> 25.456 | Loss_mlm --> 25.735 | Grad_l2 --> 44.855 | Weights_l2 --> 7701.672 | Lr --> 0.001 | Seconds_per_step --> 9.586 |
[2024-10-20 20:33:34,557][Main][INFO] - [train] Step 775 out of 65536 | Loss --> 50.129 | Loss_ntp --> 24.891 | Loss_mlm --> 25.239 | Grad_l2 --> 40.117 | Weights_l2 --> 7701.667 | Lr --> 0.001 | Seconds_per_step --> 9.662 |
[2024-10-20 20:37:33,242][Main][INFO] - [train] Step 800 out of 65536 | Loss --> 49.019 | Loss_ntp --> 24.361 | Loss_mlm --> 24.658 | Grad_l2 --> 39.953 | Weights_l2 --> 7701.663 | Lr --> 0.001 | Seconds_per_step --> 9.547 |
[2024-10-20 20:41:33,285][Main][INFO] - [train] Step 825 out of 65536 | Loss --> 48.160 | Loss_ntp --> 23.923 | Loss_mlm --> 24.238 | Grad_l2 --> 42.816 | Weights_l2 --> 7701.659 | Lr --> 0.001 | Seconds_per_step --> 9.602 |
[2024-10-20 20:45:34,352][Main][INFO] - [train] Step 850 out of 65536 | Loss --> 46.672 | Loss_ntp --> 23.149 | Loss_mlm --> 23.522 | Grad_l2 --> 42.230 | Weights_l2 --> 7701.654 | Lr --> 0.001 | Seconds_per_step --> 9.643 |
[2024-10-20 20:49:34,963][Main][INFO] - [train] Step 875 out of 65536 | Loss --> 44.855 | Loss_ntp --> 22.279 | Loss_mlm --> 22.575 | Grad_l2 --> 39.123 | Weights_l2 --> 7701.650 | Lr --> 0.001 | Seconds_per_step --> 9.624 |
[2024-10-20 20:53:36,677][Main][INFO] - [train] Step 900 out of 65536 | Loss --> 42.480 | Loss_ntp --> 21.057 | Loss_mlm --> 21.423 | Grad_l2 --> 50.501 | Weights_l2 --> 7701.645 | Lr --> 0.001 | Seconds_per_step --> 9.668 |
[2024-10-20 20:57:37,186][Main][INFO] - [train] Step 925 out of 65536 | Loss --> 40.028 | Loss_ntp --> 19.877 | Loss_mlm --> 20.151 | Grad_l2 --> 57.109 | Weights_l2 --> 7701.640 | Lr --> 0.001 | Seconds_per_step --> 9.620 |
[2024-10-20 21:01:38,800][Main][INFO] - [train] Step 950 out of 65536 | Loss --> 37.058 | Loss_ntp --> 18.359 | Loss_mlm --> 18.699 | Grad_l2 --> 78.443 | Weights_l2 --> 7701.634 | Lr --> 0.001 | Seconds_per_step --> 9.664 |
[2024-10-20 21:05:38,405][Main][INFO] - [train] Step 975 out of 65536 | Loss --> 33.534 | Loss_ntp --> 16.618 | Loss_mlm --> 16.917 | Grad_l2 --> 87.220 | Weights_l2 --> 7701.628 | Lr --> 0.001 | Seconds_per_step --> 9.584 |
[2024-10-20 21:09:41,153][Main][INFO] - [train] Step 1000 out of 65536 | Loss --> 29.988 | Loss_ntp --> 14.857 | Loss_mlm --> 15.131 | Grad_l2 --> 88.279 | Weights_l2 --> 7701.622 | Lr --> 0.001 | Seconds_per_step --> 9.710 |
[2024-10-20 21:10:10,310][Main][INFO] - [eval] Step 1000 out of 65536 | Loss --> 28.033 | Loss_ntp --> 13.938 | Loss_mlm --> 14.095 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 29.143 |
[2024-10-20 21:14:10,580][Main][INFO] - [train] Step 1025 out of 65536 | Loss --> 26.588 | Loss_ntp --> 13.166 | Loss_mlm --> 13.423 | Grad_l2 --> 109.226 | Weights_l2 --> 7701.616 | Lr --> 0.001 | Seconds_per_step --> 9.611 |
[2024-10-20 21:18:12,558][Main][INFO] - [train] Step 1050 out of 65536 | Loss --> 23.850 | Loss_ntp --> 11.830 | Loss_mlm --> 12.020 | Grad_l2 --> 98.666 | Weights_l2 --> 7701.610 | Lr --> 0.001 | Seconds_per_step --> 9.679 |
[2024-10-20 21:22:11,593][Main][INFO] - [train] Step 1075 out of 65536 | Loss --> 21.589 | Loss_ntp --> 10.697 | Loss_mlm --> 10.892 | Grad_l2 --> 104.858 | Weights_l2 --> 7701.605 | Lr --> 0.001 | Seconds_per_step --> 9.561 |
[2024-10-20 21:26:13,779][Main][INFO] - [train] Step 1100 out of 65536 | Loss --> 19.443 | Loss_ntp --> 9.626 | Loss_mlm --> 9.817 | Grad_l2 --> 75.473 | Weights_l2 --> 7701.599 | Lr --> 0.001 | Seconds_per_step --> 9.687 |
[2024-10-20 21:30:13,762][Main][INFO] - [train] Step 1125 out of 65536 | Loss --> 17.771 | Loss_ntp --> 8.793 | Loss_mlm --> 8.978 | Grad_l2 --> 55.492 | Weights_l2 --> 7701.593 | Lr --> 0.001 | Seconds_per_step --> 9.599 |
[2024-10-20 21:34:14,478][Main][INFO] - [train] Step 1150 out of 65536 | Loss --> 17.092 | Loss_ntp --> 8.462 | Loss_mlm --> 8.630 | Grad_l2 --> 72.673 | Weights_l2 --> 7701.587 | Lr --> 0.001 | Seconds_per_step --> 9.629 |
[2024-10-20 21:38:14,797][Main][INFO] - [train] Step 1175 out of 65536 | Loss --> 16.731 | Loss_ntp --> 8.294 | Loss_mlm --> 8.437 | Grad_l2 --> 60.718 | Weights_l2 --> 7701.582 | Lr --> 0.001 | Seconds_per_step --> 9.613 |
[2024-10-20 21:42:15,467][Main][INFO] - [train] Step 1200 out of 65536 | Loss --> 16.522 | Loss_ntp --> 8.188 | Loss_mlm --> 8.334 | Grad_l2 --> 62.414 | Weights_l2 --> 7701.577 | Lr --> 0.001 | Seconds_per_step --> 9.627 |
[2024-10-20 21:46:15,957][Main][INFO] - [train] Step 1225 out of 65536 | Loss --> 16.336 | Loss_ntp --> 8.096 | Loss_mlm --> 8.240 | Grad_l2 --> 57.944 | Weights_l2 --> 7701.572 | Lr --> 0.001 | Seconds_per_step --> 9.619 |
[2024-10-20 21:50:15,276][Main][INFO] - [train] Step 1250 out of 65536 | Loss --> 16.167 | Loss_ntp --> 8.006 | Loss_mlm --> 8.161 | Grad_l2 --> 42.899 | Weights_l2 --> 7701.567 | Lr --> 0.001 | Seconds_per_step --> 9.573 |
[2024-10-20 21:54:18,039][Main][INFO] - [train] Step 1275 out of 65536 | Loss --> 16.183 | Loss_ntp --> 8.017 | Loss_mlm --> 8.166 | Grad_l2 --> 48.492 | Weights_l2 --> 7701.563 | Lr --> 0.001 | Seconds_per_step --> 9.710 |
[2024-10-20 21:58:18,396][Main][INFO] - [train] Step 1300 out of 65536 | Loss --> 15.988 | Loss_ntp --> 7.926 | Loss_mlm --> 8.063 | Grad_l2 --> 42.852 | Weights_l2 --> 7701.558 | Lr --> 0.001 | Seconds_per_step --> 9.614 |
[2024-10-20 22:02:20,263][Main][INFO] - [train] Step 1325 out of 65536 | Loss --> 15.982 | Loss_ntp --> 7.916 | Loss_mlm --> 8.066 | Grad_l2 --> 47.218 | Weights_l2 --> 7701.553 | Lr --> 0.001 | Seconds_per_step --> 9.675 |
[2024-10-20 22:06:20,739][Main][INFO] - [train] Step 1350 out of 65536 | Loss --> 15.830 | Loss_ntp --> 7.838 | Loss_mlm --> 7.992 | Grad_l2 --> 28.805 | Weights_l2 --> 7701.549 | Lr --> 0.001 | Seconds_per_step --> 9.619 |
[2024-10-20 22:10:23,190][Main][INFO] - [train] Step 1375 out of 65536 | Loss --> 15.806 | Loss_ntp --> 7.839 | Loss_mlm --> 7.967 | Grad_l2 --> 37.388 | Weights_l2 --> 7701.544 | Lr --> 0.001 | Seconds_per_step --> 9.698 |
[2024-10-20 22:14:23,525][Main][INFO] - [train] Step 1400 out of 65536 | Loss --> 15.775 | Loss_ntp --> 7.813 | Loss_mlm --> 7.962 | Grad_l2 --> 35.380 | Weights_l2 --> 7701.540 | Lr --> 0.001 | Seconds_per_step --> 9.613 |
[2024-10-20 22:18:25,080][Main][INFO] - [train] Step 1425 out of 65536 | Loss --> 15.722 | Loss_ntp --> 7.794 | Loss_mlm --> 7.928 | Grad_l2 --> 34.978 | Weights_l2 --> 7701.535 | Lr --> 0.001 | Seconds_per_step --> 9.662 |
[2024-10-20 22:22:24,651][Main][INFO] - [train] Step 1450 out of 65536 | Loss --> 15.638 | Loss_ntp --> 7.739 | Loss_mlm --> 7.899 | Grad_l2 --> 24.003 | Weights_l2 --> 7701.530 | Lr --> 0.001 | Seconds_per_step --> 9.583 |
[2024-10-20 22:26:24,495][Main][INFO] - [train] Step 1475 out of 65536 | Loss --> 15.682 | Loss_ntp --> 7.768 | Loss_mlm --> 7.913 | Grad_l2 --> 27.599 | Weights_l2 --> 7701.526 | Lr --> 0.001 | Seconds_per_step --> 9.594 |
[2024-10-20 22:30:25,992][Main][INFO] - [train] Step 1500 out of 65536 | Loss --> 15.638 | Loss_ntp --> 7.754 | Loss_mlm --> 7.884 | Grad_l2 --> 22.985 | Weights_l2 --> 7701.521 | Lr --> 0.001 | Seconds_per_step --> 9.660 |
[2024-10-20 22:30:54,697][Main][INFO] - [eval] Step 1500 out of 65536 | Loss --> 15.664 | Loss_ntp --> 7.782 | Loss_mlm --> 7.882 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 28.700 |
[2024-10-20 22:30:54,709][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-1500
[2024-10-20 22:30:54,719][accelerate.utils.other][WARNING] - Removed shared tensor {'encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
[2024-10-20 22:30:59,988][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-1500/model.safetensors
[2024-10-20 22:31:08,673][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-1500/optimizer.bin
[2024-10-20 22:31:08,682][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-1500/scheduler.bin
[2024-10-20 22:31:08,684][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-1500/sampler.bin
[2024-10-20 22:31:08,686][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-1500/sampler_1.bin
[2024-10-20 22:31:08,694][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-1500/random_states_0.pkl
[2024-10-20 22:35:09,885][Main][INFO] - [train] Step 1525 out of 65536 | Loss --> 15.740 | Loss_ntp --> 7.803 | Loss_mlm --> 7.937 | Grad_l2 --> 35.476 | Weights_l2 --> 7701.516 | Lr --> 0.001 | Seconds_per_step --> 10.207 |
[2024-10-20 22:39:10,189][Main][INFO] - [train] Step 1550 out of 65536 | Loss --> 15.717 | Loss_ntp --> 7.796 | Loss_mlm --> 7.921 | Grad_l2 --> 32.209 | Weights_l2 --> 7701.511 | Lr --> 0.001 | Seconds_per_step --> 9.612 |
[2024-10-20 22:43:12,020][Main][INFO] - [train] Step 1575 out of 65536 | Loss --> 15.723 | Loss_ntp --> 7.805 | Loss_mlm --> 7.918 | Grad_l2 --> 35.393 | Weights_l2 --> 7701.506 | Lr --> 0.001 | Seconds_per_step --> 9.673 |
[2024-10-20 22:47:13,492][Main][INFO] - [train] Step 1600 out of 65536 | Loss --> 15.617 | Loss_ntp --> 7.752 | Loss_mlm --> 7.865 | Grad_l2 --> 29.357 | Weights_l2 --> 7701.502 | Lr --> 0.001 | Seconds_per_step --> 9.659 |
[2024-10-20 22:51:13,978][Main][INFO] - [train] Step 1625 out of 65536 | Loss --> 15.532 | Loss_ntp --> 7.709 | Loss_mlm --> 7.822 | Grad_l2 --> 18.501 | Weights_l2 --> 7701.497 | Lr --> 0.001 | Seconds_per_step --> 9.619 |
[2024-10-20 22:55:14,600][Main][INFO] - [train] Step 1650 out of 65536 | Loss --> 15.565 | Loss_ntp --> 7.720 | Loss_mlm --> 7.845 | Grad_l2 --> 17.546 | Weights_l2 --> 7701.493 | Lr --> 0.001 | Seconds_per_step --> 9.625 |
[2024-10-20 22:59:14,384][Main][INFO] - [train] Step 1675 out of 65536 | Loss --> 15.576 | Loss_ntp --> 7.737 | Loss_mlm --> 7.838 | Grad_l2 --> 23.599 | Weights_l2 --> 7701.489 | Lr --> 0.001 | Seconds_per_step --> 9.591 |
[2024-10-20 23:03:16,878][Main][INFO] - [train] Step 1700 out of 65536 | Loss --> 15.612 | Loss_ntp --> 7.757 | Loss_mlm --> 7.855 | Grad_l2 --> 28.685 | Weights_l2 --> 7701.484 | Lr --> 0.001 | Seconds_per_step --> 9.700 |
[2024-10-20 23:07:16,611][Main][INFO] - [train] Step 1725 out of 65536 | Loss --> 15.590 | Loss_ntp --> 7.728 | Loss_mlm --> 7.861 | Grad_l2 --> 22.357 | Weights_l2 --> 7701.479 | Lr --> 0.001 | Seconds_per_step --> 9.589 |
[2024-10-20 23:11:18,435][Main][INFO] - [train] Step 1750 out of 65536 | Loss --> 15.475 | Loss_ntp --> 7.683 | Loss_mlm --> 7.792 | Grad_l2 --> 20.808 | Weights_l2 --> 7701.475 | Lr --> 0.001 | Seconds_per_step --> 9.673 |
[2024-10-20 23:15:17,324][Main][INFO] - [train] Step 1775 out of 65536 | Loss --> 15.422 | Loss_ntp --> 7.655 | Loss_mlm --> 7.767 | Grad_l2 --> 16.928 | Weights_l2 --> 7701.470 | Lr --> 0.001 | Seconds_per_step --> 9.555 |
[2024-10-20 23:19:17,823][Main][INFO] - [train] Step 1800 out of 65536 | Loss --> 15.370 | Loss_ntp --> 7.625 | Loss_mlm --> 7.745 | Grad_l2 --> 16.147 | Weights_l2 --> 7701.466 | Lr --> 0.001 | Seconds_per_step --> 9.620 |
[2024-10-20 23:23:19,005][Main][INFO] - [train] Step 1825 out of 65536 | Loss --> 15.363 | Loss_ntp --> 7.629 | Loss_mlm --> 7.734 | Grad_l2 --> 19.934 | Weights_l2 --> 7701.462 | Lr --> 0.001 | Seconds_per_step --> 9.647 |
[2024-10-20 23:27:17,933][Main][INFO] - [train] Step 1850 out of 65536 | Loss --> 15.347 | Loss_ntp --> 7.616 | Loss_mlm --> 7.732 | Grad_l2 --> 25.592 | Weights_l2 --> 7701.457 | Lr --> 0.001 | Seconds_per_step --> 9.557 |
[2024-10-20 23:31:19,805][Main][INFO] - [train] Step 1875 out of 65536 | Loss --> 15.254 | Loss_ntp --> 7.577 | Loss_mlm --> 7.677 | Grad_l2 --> 19.500 | Weights_l2 --> 7701.453 | Lr --> 0.001 | Seconds_per_step --> 9.675 |
[2024-10-20 23:35:18,582][Main][INFO] - [train] Step 1900 out of 65536 | Loss --> 15.204 | Loss_ntp --> 7.550 | Loss_mlm --> 7.653 | Grad_l2 --> 15.358 | Weights_l2 --> 7701.448 | Lr --> 0.001 | Seconds_per_step --> 9.551 |
[2024-10-20 23:39:20,300][Main][INFO] - [train] Step 1925 out of 65536 | Loss --> 15.153 | Loss_ntp --> 7.525 | Loss_mlm --> 7.628 | Grad_l2 --> 13.241 | Weights_l2 --> 7701.445 | Lr --> 0.001 | Seconds_per_step --> 9.669 |
[2024-10-20 23:43:21,680][Main][INFO] - [train] Step 1950 out of 65536 | Loss --> 15.111 | Loss_ntp --> 7.497 | Loss_mlm --> 7.614 | Grad_l2 --> 13.357 | Weights_l2 --> 7701.441 | Lr --> 0.001 | Seconds_per_step --> 9.655 |
[2024-10-20 23:47:22,111][Main][INFO] - [train] Step 1975 out of 65536 | Loss --> 15.072 | Loss_ntp --> 7.475 | Loss_mlm --> 7.597 | Grad_l2 --> 15.485 | Weights_l2 --> 7701.437 | Lr --> 0.001 | Seconds_per_step --> 9.617 |
[2024-10-20 23:51:21,960][Main][INFO] - [train] Step 2000 out of 65536 | Loss --> 15.061 | Loss_ntp --> 7.470 | Loss_mlm --> 7.591 | Grad_l2 --> 15.511 | Weights_l2 --> 7701.432 | Lr --> 0.001 | Seconds_per_step --> 9.594 |
[2024-10-20 23:51:50,849][Main][INFO] - [eval] Step 2000 out of 65536 | Loss --> 15.092 | Loss_ntp --> 7.501 | Loss_mlm --> 7.591 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 28.883 |
[2024-10-20 23:55:53,490][Main][INFO] - [train] Step 2025 out of 65536 | Loss --> 15.080 | Loss_ntp --> 7.479 | Loss_mlm --> 7.601 | Grad_l2 --> 17.451 | Weights_l2 --> 7701.428 | Lr --> 0.001 | Seconds_per_step --> 9.705 |
[2024-10-20 23:59:53,747][Main][INFO] - [train] Step 2050 out of 65536 | Loss --> 14.998 | Loss_ntp --> 7.447 | Loss_mlm --> 7.551 | Grad_l2 --> 13.242 | Weights_l2 --> 7701.424 | Lr --> 0.001 | Seconds_per_step --> 9.610 |
[2024-10-21 00:03:57,114][Main][INFO] - [train] Step 2075 out of 65536 | Loss --> 14.994 | Loss_ntp --> 7.431 | Loss_mlm --> 7.562 | Grad_l2 --> 17.409 | Weights_l2 --> 7701.419 | Lr --> 0.001 | Seconds_per_step --> 9.735 |
[2024-10-21 00:07:56,557][Main][INFO] - [train] Step 2100 out of 65536 | Loss --> 14.993 | Loss_ntp --> 7.437 | Loss_mlm --> 7.556 | Grad_l2 --> 23.374 | Weights_l2 --> 7701.414 | Lr --> 0.001 | Seconds_per_step --> 9.578 |
[2024-10-21 00:11:56,818][Main][INFO] - [train] Step 2125 out of 65536 | Loss --> 14.963 | Loss_ntp --> 7.428 | Loss_mlm --> 7.535 | Grad_l2 --> 24.857 | Weights_l2 --> 7701.410 | Lr --> 0.001 | Seconds_per_step --> 9.610 |
[2024-10-21 00:15:56,927][Main][INFO] - [train] Step 2150 out of 65536 | Loss --> 14.829 | Loss_ntp --> 7.354 | Loss_mlm --> 7.474 | Grad_l2 --> 14.538 | Weights_l2 --> 7701.405 | Lr --> 0.001 | Seconds_per_step --> 9.604 |
[2024-10-21 00:19:57,089][Main][INFO] - [train] Step 2175 out of 65536 | Loss --> 14.797 | Loss_ntp --> 7.344 | Loss_mlm --> 7.453 | Grad_l2 --> 13.598 | Weights_l2 --> 7701.400 | Lr --> 0.001 | Seconds_per_step --> 9.606 |
[2024-10-21 00:23:58,135][Main][INFO] - [train] Step 2200 out of 65536 | Loss --> 14.774 | Loss_ntp --> 7.321 | Loss_mlm --> 7.454 | Grad_l2 --> 13.339 | Weights_l2 --> 7701.396 | Lr --> 0.001 | Seconds_per_step --> 9.642 |
[2024-10-21 00:27:58,499][Main][INFO] - [train] Step 2225 out of 65536 | Loss --> 14.671 | Loss_ntp --> 7.284 | Loss_mlm --> 7.387 | Grad_l2 --> 13.884 | Weights_l2 --> 7701.392 | Lr --> 0.001 | Seconds_per_step --> 9.614 |
[2024-10-21 00:31:59,596][Main][INFO] - [train] Step 2250 out of 65536 | Loss --> 14.635 | Loss_ntp --> 7.264 | Loss_mlm --> 7.371 | Grad_l2 --> 11.527 | Weights_l2 --> 7701.388 | Lr --> 0.001 | Seconds_per_step --> 9.644 |
[2024-10-21 00:35:58,256][Main][INFO] - [train] Step 2275 out of 65536 | Loss --> 14.593 | Loss_ntp --> 7.247 | Loss_mlm --> 7.345 | Grad_l2 --> 9.993 | Weights_l2 --> 7701.384 | Lr --> 0.001 | Seconds_per_step --> 9.546 |
[2024-10-21 00:39:59,379][Main][INFO] - [train] Step 2300 out of 65536 | Loss --> 14.543 | Loss_ntp --> 7.216 | Loss_mlm --> 7.327 | Grad_l2 --> 12.147 | Weights_l2 --> 7701.381 | Lr --> 0.001 | Seconds_per_step --> 9.644 |
[2024-10-21 00:43:59,080][Main][INFO] - [train] Step 2325 out of 65536 | Loss --> 14.577 | Loss_ntp --> 7.231 | Loss_mlm --> 7.345 | Grad_l2 --> 12.365 | Weights_l2 --> 7701.376 | Lr --> 0.001 | Seconds_per_step --> 9.588 |
[2024-10-21 00:47:59,811][Main][INFO] - [train] Step 2350 out of 65536 | Loss --> 14.512 | Loss_ntp --> 7.202 | Loss_mlm --> 7.310 | Grad_l2 --> 12.472 | Weights_l2 --> 7701.372 | Lr --> 0.001 | Seconds_per_step --> 9.629 |
[2024-10-21 00:51:58,749][Main][INFO] - [train] Step 2375 out of 65536 | Loss --> 14.434 | Loss_ntp --> 7.166 | Loss_mlm --> 7.268 | Grad_l2 --> 12.198 | Weights_l2 --> 7701.368 | Lr --> 0.001 | Seconds_per_step --> 9.557 |
[2024-10-21 00:55:58,527][Main][INFO] - [train] Step 2400 out of 65536 | Loss --> 14.390 | Loss_ntp --> 7.141 | Loss_mlm --> 7.249 | Grad_l2 --> 11.488 | Weights_l2 --> 7701.365 | Lr --> 0.001 | Seconds_per_step --> 9.591 |
[2024-10-21 00:59:59,746][Main][INFO] - [train] Step 2425 out of 65536 | Loss --> 14.396 | Loss_ntp --> 7.142 | Loss_mlm --> 7.253 | Grad_l2 --> 11.924 | Weights_l2 --> 7701.361 | Lr --> 0.001 | Seconds_per_step --> 9.649 |
[2024-10-21 01:03:58,922][Main][INFO] - [train] Step 2450 out of 65536 | Loss --> 14.319 | Loss_ntp --> 7.108 | Loss_mlm --> 7.211 | Grad_l2 --> 11.587 | Weights_l2 --> 7701.357 | Lr --> 0.001 | Seconds_per_step --> 9.567 |
[2024-10-21 01:08:00,577][Main][INFO] - [train] Step 2475 out of 65536 | Loss --> 14.363 | Loss_ntp --> 7.132 | Loss_mlm --> 7.231 | Grad_l2 --> 11.854 | Weights_l2 --> 7701.353 | Lr --> 0.001 | Seconds_per_step --> 9.666 |
[2024-10-21 01:12:00,070][Main][INFO] - [train] Step 2500 out of 65536 | Loss --> 14.333 | Loss_ntp --> 7.121 | Loss_mlm --> 7.212 | Grad_l2 --> 10.363 | Weights_l2 --> 7701.349 | Lr --> 0.001 | Seconds_per_step --> 9.580 |
[2024-10-21 01:12:28,480][Main][INFO] - [eval] Step 2500 out of 65536 | Loss --> 14.573 | Loss_ntp --> 7.286 | Loss_mlm --> 7.287 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 28.404 |
[2024-10-21 01:16:30,064][Main][INFO] - [train] Step 2525 out of 65536 | Loss --> 14.280 | Loss_ntp --> 7.089 | Loss_mlm --> 7.192 | Grad_l2 --> 13.178 | Weights_l2 --> 7701.345 | Lr --> 0.001 | Seconds_per_step --> 9.663 |
[2024-10-21 01:20:29,018][Main][INFO] - [train] Step 2550 out of 65536 | Loss --> 14.260 | Loss_ntp --> 7.091 | Loss_mlm --> 7.169 | Grad_l2 --> 12.381 | Weights_l2 --> 7701.341 | Lr --> 0.001 | Seconds_per_step --> 9.558 |
[2024-10-21 01:24:31,253][Main][INFO] - [train] Step 2575 out of 65536 | Loss --> 14.259 | Loss_ntp --> 7.078 | Loss_mlm --> 7.182 | Grad_l2 --> 11.247 | Weights_l2 --> 7701.337 | Lr --> 0.001 | Seconds_per_step --> 9.689 |
[2024-10-21 01:28:31,446][Main][INFO] - [train] Step 2600 out of 65536 | Loss --> 14.259 | Loss_ntp --> 7.080 | Loss_mlm --> 7.179 | Grad_l2 --> 12.524 | Weights_l2 --> 7701.333 | Lr --> 0.001 | Seconds_per_step --> 9.608 |
[2024-10-21 01:32:31,794][Main][INFO] - [train] Step 2625 out of 65536 | Loss --> 14.245 | Loss_ntp --> 7.068 | Loss_mlm --> 7.178 | Grad_l2 --> 12.087 | Weights_l2 --> 7701.330 | Lr --> 0.001 | Seconds_per_step --> 9.614 |
[2024-10-21 01:36:32,411][Main][INFO] - [train] Step 2650 out of 65536 | Loss --> 14.247 | Loss_ntp --> 7.074 | Loss_mlm --> 7.173 | Grad_l2 --> 11.638 | Weights_l2 --> 7701.326 | Lr --> 0.001 | Seconds_per_step --> 9.625 |
[2024-10-21 01:40:33,462][Main][INFO] - [train] Step 2675 out of 65536 | Loss --> 14.274 | Loss_ntp --> 7.086 | Loss_mlm --> 7.189 | Grad_l2 --> 10.415 | Weights_l2 --> 7701.322 | Lr --> 0.001 | Seconds_per_step --> 9.642 |
[2024-10-21 01:44:33,254][Main][INFO] - [train] Step 2700 out of 65536 | Loss --> 14.276 | Loss_ntp --> 7.097 | Loss_mlm --> 7.179 | Grad_l2 --> 10.830 | Weights_l2 --> 7701.318 | Lr --> 0.001 | Seconds_per_step --> 9.592 |
[2024-10-21 01:48:34,104][Main][INFO] - [train] Step 2725 out of 65536 | Loss --> 14.322 | Loss_ntp --> 7.117 | Loss_mlm --> 7.205 | Grad_l2 --> 11.668 | Weights_l2 --> 7701.314 | Lr --> 0.001 | Seconds_per_step --> 9.634 |
[2024-10-21 01:52:33,834][Main][INFO] - [train] Step 2750 out of 65536 | Loss --> 14.393 | Loss_ntp --> 7.149 | Loss_mlm --> 7.244 | Grad_l2 --> 10.585 | Weights_l2 --> 7701.310 | Lr --> 0.001 | Seconds_per_step --> 9.589 |
[2024-10-21 01:56:33,130][Main][INFO] - [train] Step 2775 out of 65536 | Loss --> 14.326 | Loss_ntp --> 7.124 | Loss_mlm --> 7.202 | Grad_l2 --> 9.862 | Weights_l2 --> 7701.306 | Lr --> 0.001 | Seconds_per_step --> 9.572 |
[2024-10-21 02:00:34,375][Main][INFO] - [train] Step 2800 out of 65536 | Loss --> 14.354 | Loss_ntp --> 7.134 | Loss_mlm --> 7.220 | Grad_l2 --> 8.484 | Weights_l2 --> 7701.302 | Lr --> 0.001 | Seconds_per_step --> 9.650 |
[2024-10-21 02:04:34,763][Main][INFO] - [train] Step 2825 out of 65536 | Loss --> 14.320 | Loss_ntp --> 7.118 | Loss_mlm --> 7.202 | Grad_l2 --> 11.118 | Weights_l2 --> 7701.298 | Lr --> 0.001 | Seconds_per_step --> 9.615 |
[2024-10-21 02:08:35,157][Main][INFO] - [train] Step 2850 out of 65536 | Loss --> 14.323 | Loss_ntp --> 7.124 | Loss_mlm --> 7.199 | Grad_l2 --> 10.821 | Weights_l2 --> 7701.294 | Lr --> 0.001 | Seconds_per_step --> 9.616 |
[2024-10-21 02:12:34,860][Main][INFO] - [train] Step 2875 out of 65536 | Loss --> 14.348 | Loss_ntp --> 7.129 | Loss_mlm --> 7.219 | Grad_l2 --> 9.481 | Weights_l2 --> 7701.291 | Lr --> 0.001 | Seconds_per_step --> 9.588 |
[2024-10-21 02:16:36,448][Main][INFO] - [train] Step 2900 out of 65536 | Loss --> 14.413 | Loss_ntp --> 7.163 | Loss_mlm --> 7.250 | Grad_l2 --> 10.586 | Weights_l2 --> 7701.287 | Lr --> 0.001 | Seconds_per_step --> 9.663 |
[2024-10-21 02:20:36,563][Main][INFO] - [train] Step 2925 out of 65536 | Loss --> 14.319 | Loss_ntp --> 7.113 | Loss_mlm --> 7.206 | Grad_l2 --> 9.175 | Weights_l2 --> 7701.283 | Lr --> 0.001 | Seconds_per_step --> 9.604 |
[2024-10-21 02:24:36,522][Main][INFO] - [train] Step 2950 out of 65536 | Loss --> 14.292 | Loss_ntp --> 7.112 | Loss_mlm --> 7.179 | Grad_l2 --> 10.380 | Weights_l2 --> 7701.279 | Lr --> 0.001 | Seconds_per_step --> 9.598 |
[2024-10-21 02:28:36,510][Main][INFO] - [train] Step 2975 out of 65536 | Loss --> 14.202 | Loss_ntp --> 7.068 | Loss_mlm --> 7.134 | Grad_l2 --> 9.622 | Weights_l2 --> 7701.276 | Lr --> 0.001 | Seconds_per_step --> 9.599 |
[2024-10-21 02:32:38,120][Main][INFO] - [train] Step 3000 out of 65536 | Loss --> 14.214 | Loss_ntp --> 7.066 | Loss_mlm --> 7.147 | Grad_l2 --> 10.228 | Weights_l2 --> 7701.272 | Lr --> 0.001 | Seconds_per_step --> 9.664 |
[2024-10-21 02:33:06,984][Main][INFO] - [eval] Step 3000 out of 65536 | Loss --> 14.236 | Loss_ntp --> 7.111 | Loss_mlm --> 7.125 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 28.858 |
[2024-10-21 02:33:06,988][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-3000
[2024-10-21 02:33:07,000][accelerate.utils.other][WARNING] - Removed shared tensor {'encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
[2024-10-21 02:33:13,140][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-3000/model.safetensors
[2024-10-21 02:33:21,968][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-3000/optimizer.bin
[2024-10-21 02:33:21,978][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-3000/scheduler.bin
[2024-10-21 02:33:21,979][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-3000/sampler.bin
[2024-10-21 02:33:21,981][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-3000/sampler_1.bin
[2024-10-21 02:33:21,990][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-3000/random_states_0.pkl
[2024-10-21 02:37:21,949][Main][INFO] - [train] Step 3025 out of 65536 | Loss --> 14.180 | Loss_ntp --> 7.041 | Loss_mlm --> 7.138 | Grad_l2 --> 9.928 | Weights_l2 --> 7701.268 | Lr --> 0.001 | Seconds_per_step --> 10.198 |
[2024-10-21 02:41:23,436][Main][INFO] - [train] Step 3050 out of 65536 | Loss --> 14.163 | Loss_ntp --> 7.032 | Loss_mlm --> 7.130 | Grad_l2 --> 9.909 | Weights_l2 --> 7701.264 | Lr --> 0.001 | Seconds_per_step --> 9.659 |
[2024-10-21 02:45:23,362][Main][INFO] - [train] Step 3075 out of 65536 | Loss --> 14.109 | Loss_ntp --> 7.016 | Loss_mlm --> 7.093 | Grad_l2 --> 10.119 | Weights_l2 --> 7701.260 | Lr --> 0.001 | Seconds_per_step --> 9.597 |
[2024-10-21 02:49:23,828][Main][INFO] - [train] Step 3100 out of 65536 | Loss --> 14.053 | Loss_ntp --> 6.981 | Loss_mlm --> 7.072 | Grad_l2 --> 8.917 | Weights_l2 --> 7701.256 | Lr --> 0.001 | Seconds_per_step --> 9.619 |
[2024-10-21 02:53:26,144][Main][INFO] - [train] Step 3125 out of 65536 | Loss --> 14.045 | Loss_ntp --> 6.975 | Loss_mlm --> 7.069 | Grad_l2 --> 11.184 | Weights_l2 --> 7701.252 | Lr --> 0.001 | Seconds_per_step --> 9.692 |
[2024-10-21 02:57:25,035][Main][INFO] - [train] Step 3150 out of 65536 | Loss --> 14.006 | Loss_ntp --> 6.959 | Loss_mlm --> 7.047 | Grad_l2 --> 9.280 | Weights_l2 --> 7701.248 | Lr --> 0.001 | Seconds_per_step --> 9.555 |
[2024-10-21 03:01:27,283][Main][INFO] - [train] Step 3175 out of 65536 | Loss --> 13.943 | Loss_ntp --> 6.924 | Loss_mlm --> 7.020 | Grad_l2 --> 8.769 | Weights_l2 --> 7701.245 | Lr --> 0.001 | Seconds_per_step --> 9.690 |
[2024-10-21 03:05:27,701][Main][INFO] - [train] Step 3200 out of 65536 | Loss --> 13.956 | Loss_ntp --> 6.916 | Loss_mlm --> 7.040 | Grad_l2 --> 8.625 | Weights_l2 --> 7701.241 | Lr --> 0.001 | Seconds_per_step --> 9.617 |
[2024-10-21 03:09:28,530][Main][INFO] - [train] Step 3225 out of 65536 | Loss --> 13.916 | Loss_ntp --> 6.906 | Loss_mlm --> 7.010 | Grad_l2 --> 9.378 | Weights_l2 --> 7701.238 | Lr --> 0.001 | Seconds_per_step --> 9.633 |
[2024-10-21 03:13:28,937][Main][INFO] - [train] Step 3250 out of 65536 | Loss --> 13.849 | Loss_ntp --> 6.867 | Loss_mlm --> 6.982 | Grad_l2 --> 9.221 | Weights_l2 --> 7701.234 | Lr --> 0.001 | Seconds_per_step --> 9.616 |
[2024-10-21 03:17:29,597][Main][INFO] - [train] Step 3275 out of 65536 | Loss --> 13.854 | Loss_ntp --> 6.869 | Loss_mlm --> 6.985 | Grad_l2 --> 8.561 | Weights_l2 --> 7701.230 | Lr --> 0.001 | Seconds_per_step --> 9.626 |
[2024-10-21 03:21:30,034][Main][INFO] - [train] Step 3300 out of 65536 | Loss --> 13.781 | Loss_ntp --> 6.843 | Loss_mlm --> 6.938 | Grad_l2 --> 8.919 | Weights_l2 --> 7701.226 | Lr --> 0.001 | Seconds_per_step --> 9.617 |
[2024-10-21 03:25:29,815][Main][INFO] - [train] Step 3325 out of 65536 | Loss --> 13.766 | Loss_ntp --> 6.836 | Loss_mlm --> 6.930 | Grad_l2 --> 8.129 | Weights_l2 --> 7701.223 | Lr --> 0.001 | Seconds_per_step --> 9.591 |
[2024-10-21 03:29:30,344][Main][INFO] - [train] Step 3350 out of 65536 | Loss --> 13.726 | Loss_ntp --> 6.809 | Loss_mlm --> 6.917 | Grad_l2 --> 9.145 | Weights_l2 --> 7701.219 | Lr --> 0.001 | Seconds_per_step --> 9.620 |
[2024-10-21 03:33:30,171][Main][INFO] - [train] Step 3375 out of 65536 | Loss --> 13.751 | Loss_ntp --> 6.819 | Loss_mlm --> 6.932 | Grad_l2 --> 11.666 | Weights_l2 --> 7701.215 | Lr --> 0.001 | Seconds_per_step --> 9.593 |
[2024-10-21 03:37:32,111][Main][INFO] - [train] Step 3400 out of 65536 | Loss --> 13.700 | Loss_ntp --> 6.796 | Loss_mlm --> 6.905 | Grad_l2 --> 8.776 | Weights_l2 --> 7701.211 | Lr --> 0.001 | Seconds_per_step --> 9.677 |
[2024-10-21 03:41:31,530][Main][INFO] - [train] Step 3425 out of 65536 | Loss --> 13.641 | Loss_ntp --> 6.774 | Loss_mlm --> 6.868 | Grad_l2 --> 9.206 | Weights_l2 --> 7701.207 | Lr --> 0.001 | Seconds_per_step --> 9.577 |
[2024-10-21 03:45:33,625][Main][INFO] - [train] Step 3450 out of 65536 | Loss --> 13.588 | Loss_ntp --> 6.735 | Loss_mlm --> 6.852 | Grad_l2 --> 6.293 | Weights_l2 --> 7701.204 | Lr --> 0.001 | Seconds_per_step --> 9.684 |
[2024-10-21 03:49:34,400][Main][INFO] - [train] Step 3475 out of 65536 | Loss --> 13.615 | Loss_ntp --> 6.748 | Loss_mlm --> 6.868 | Grad_l2 --> 9.161 | Weights_l2 --> 7701.201 | Lr --> 0.001 | Seconds_per_step --> 9.631 |
[2024-10-21 03:53:35,824][Main][INFO] - [train] Step 3500 out of 65536 | Loss --> 13.532 | Loss_ntp --> 6.707 | Loss_mlm --> 6.825 | Grad_l2 --> 9.556 | Weights_l2 --> 7701.197 | Lr --> 0.001 | Seconds_per_step --> 9.657 |
[2024-10-21 03:54:04,713][Main][INFO] - [eval] Step 3500 out of 65536 | Loss --> 13.912 | Loss_ntp --> 6.950 | Loss_mlm --> 6.962 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 28.883 |
[2024-10-21 03:58:05,620][Main][INFO] - [train] Step 3525 out of 65536 | Loss --> 13.463 | Loss_ntp --> 6.677 | Loss_mlm --> 6.786 | Grad_l2 --> 9.458 | Weights_l2 --> 7701.193 | Lr --> 0.001 | Seconds_per_step --> 9.636 |
[2024-10-21 04:02:06,516][Main][INFO] - [train] Step 3550 out of 65536 | Loss --> 13.419 | Loss_ntp --> 6.654 | Loss_mlm --> 6.766 | Grad_l2 --> 9.819 | Weights_l2 --> 7701.188 | Lr --> 0.001 | Seconds_per_step --> 9.636 |
[2024-10-21 04:06:07,229][Main][INFO] - [train] Step 3575 out of 65536 | Loss --> 13.362 | Loss_ntp --> 6.626 | Loss_mlm --> 6.736 | Grad_l2 --> 8.944 | Weights_l2 --> 7701.184 | Lr --> 0.001 | Seconds_per_step --> 9.628 |
[2024-10-21 04:10:08,761][Main][INFO] - [train] Step 3600 out of 65536 | Loss --> 13.401 | Loss_ntp --> 6.628 | Loss_mlm --> 6.773 | Grad_l2 --> 9.904 | Weights_l2 --> 7701.180 | Lr --> 0.001 | Seconds_per_step --> 9.661 |
[2024-10-21 04:14:09,815][Main][INFO] - [train] Step 3625 out of 65536 | Loss --> 13.361 | Loss_ntp --> 6.625 | Loss_mlm --> 6.736 | Grad_l2 --> 8.507 | Weights_l2 --> 7701.176 | Lr --> 0.001 | Seconds_per_step --> 9.642 |
[2024-10-21 04:18:10,037][Main][INFO] - [train] Step 3650 out of 65536 | Loss --> 13.355 | Loss_ntp --> 6.614 | Loss_mlm --> 6.741 | Grad_l2 --> 9.056 | Weights_l2 --> 7701.172 | Lr --> 0.001 | Seconds_per_step --> 9.609 |
[2024-10-21 04:22:10,677][Main][INFO] - [train] Step 3675 out of 65536 | Loss --> 13.306 | Loss_ntp --> 6.586 | Loss_mlm --> 6.720 | Grad_l2 --> 9.057 | Weights_l2 --> 7701.168 | Lr --> 0.001 | Seconds_per_step --> 9.625 |
[2024-10-21 04:26:12,857][Main][INFO] - [train] Step 3700 out of 65536 | Loss --> 13.325 | Loss_ntp --> 6.596 | Loss_mlm --> 6.729 | Grad_l2 --> 10.732 | Weights_l2 --> 7701.163 | Lr --> 0.001 | Seconds_per_step --> 9.687 |
[2024-10-21 04:30:11,816][Main][INFO] - [train] Step 3725 out of 65536 | Loss --> 13.239 | Loss_ntp --> 6.561 | Loss_mlm --> 6.678 | Grad_l2 --> 9.810 | Weights_l2 --> 7701.160 | Lr --> 0.001 | Seconds_per_step --> 9.558 |
[2024-10-21 04:34:12,167][Main][INFO] - [train] Step 3750 out of 65536 | Loss --> 13.211 | Loss_ntp --> 6.534 | Loss_mlm --> 6.677 | Grad_l2 --> 10.011 | Weights_l2 --> 7701.156 | Lr --> 0.001 | Seconds_per_step --> 9.614 |
[2024-10-21 04:38:14,046][Main][INFO] - [train] Step 3775 out of 65536 | Loss --> 13.214 | Loss_ntp --> 6.537 | Loss_mlm --> 6.678 | Grad_l2 --> 8.939 | Weights_l2 --> 7701.152 | Lr --> 0.001 | Seconds_per_step --> 9.675 |
[2024-10-21 04:42:14,454][Main][INFO] - [train] Step 3800 out of 65536 | Loss --> 13.148 | Loss_ntp --> 6.508 | Loss_mlm --> 6.640 | Grad_l2 --> 9.513 | Weights_l2 --> 7701.148 | Lr --> 0.001 | Seconds_per_step --> 9.616 |
[2024-10-21 04:46:14,554][Main][INFO] - [train] Step 3825 out of 65536 | Loss --> 13.172 | Loss_ntp --> 6.514 | Loss_mlm --> 6.658 | Grad_l2 --> 9.295 | Weights_l2 --> 7701.144 | Lr --> 0.001 | Seconds_per_step --> 9.604 |
[2024-10-21 04:50:14,762][Main][INFO] - [train] Step 3850 out of 65536 | Loss --> 13.118 | Loss_ntp --> 6.494 | Loss_mlm --> 6.624 | Grad_l2 --> 7.890 | Weights_l2 --> 7701.140 | Lr --> 0.001 | Seconds_per_step --> 9.608 |
[2024-10-21 04:54:16,032][Main][INFO] - [train] Step 3875 out of 65536 | Loss --> 13.179 | Loss_ntp --> 6.521 | Loss_mlm --> 6.657 | Grad_l2 --> 9.901 | Weights_l2 --> 7701.136 | Lr --> 0.001 | Seconds_per_step --> 9.651 |
[2024-10-21 04:58:16,128][Main][INFO] - [train] Step 3900 out of 65536 | Loss --> 13.259 | Loss_ntp --> 6.571 | Loss_mlm --> 6.687 | Grad_l2 --> 8.910 | Weights_l2 --> 7701.132 | Lr --> 0.001 | Seconds_per_step --> 9.604 |
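
main.log emits one line every 25 optimizer steps in a fixed `key --> value` format, which makes the loss curve easy to recover offline. A minimal parsing sketch, assuming only the line format shown above:

import re

# Sketch: parse "[train] Step N out of M | Loss --> X | ..." lines from
# main.log into (step, loss) pairs, split by train/eval.
pattern = re.compile(r"\[(train|eval)\] Step (\d+) out of \d+ \| Loss --> ([\d.]+)")

points = {"train": [], "eval": []}
with open("main.log") as f:
    for line in f:
        m = pattern.search(line)
        if m:
            split, step, loss = m.group(1), int(m.group(2)), float(m.group(3))
            points[split].append((step, loss))

print(points["train"][:3])  # e.g. [(25, 155.837), (50, 98.644), (75, 86.994)]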
test.py
ADDED
@@ -0,0 +1,25 @@
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

tokenizer = AutoTokenizer.from_pretrained("BEE-spoke-data/hf_slimpajama-6B-28672-BPE-forT5")

special_tokens_dict = {'additional_special_tokens': ['[R]', '[S]', '[X]', '[NTP]']}
tokenizer.add_special_tokens(special_tokens_dict)

model = AutoModelForSeq2SeqLM.from_pretrained("/workspace/nanoT5/logs/2024-10-20/18-25-17/checkpoint-pt-27000").to("cuda")
prompt = "The "
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# Add decoder_input_ids
# decoder_input_ids = torch.ones((inputs.input_ids.shape[0], 1), dtype=torch.long) * model.config.decoder_start_token_id

# Generate
generated_ids = model.generate(
    **inputs,
    # decoder_input_ids=decoder_input_ids,
    max_new_tokens=20,
    no_repeat_ngram_size=5
)

# Decode the output
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(generated_text)
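
One caveat about the smoke test above: it adds four sentinel tokens ('[R]', '[S]', '[X]', '[NTP]') to the tokenizer but never resizes the model's embedding matrix, which is only safe if those ids already fit inside the checkpoint's vocab_size of 28776. A hedged follow-up sketch that guards against a mismatch, reusing the tokenizer and model objects defined in test.py:

# Sketch (not part of the repo): if the added sentinels pushed the tokenizer
# past the checkpoint's vocabulary, resize the embeddings before generating.
if len(tokenizer) > model.config.vocab_size:
    model.resize_token_embeddings(len(tokenizer))
print(len(tokenizer), model.config.vocab_size)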
wandb/debug-internal.log
ADDED
@@ -0,0 +1,8 @@
{"time":"2024-10-20T18:25:18.130390854Z","level":"INFO","msg":"using version","core version":"0.18.5"}
{"time":"2024-10-20T18:25:18.131160825Z","level":"INFO","msg":"created symlink","path":"/workspace/nanoT5/logs/2024-10-20/18-25-17/wandb/run-20241020_182518-i0qk9v3k/logs/debug-core.log"}
{"time":"2024-10-20T18:25:18.247302473Z","level":"INFO","msg":"created new stream","id":"i0qk9v3k"}
{"time":"2024-10-20T18:25:18.247577857Z","level":"INFO","msg":"stream: started","id":"i0qk9v3k"}
{"time":"2024-10-20T18:25:18.247668586Z","level":"INFO","msg":"handler: started","stream_id":{"value":"i0qk9v3k"}}
{"time":"2024-10-20T18:25:18.247659857Z","level":"INFO","msg":"sender: started","stream_id":"i0qk9v3k"}
{"time":"2024-10-20T18:25:18.247631762Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"i0qk9v3k"}}
{"time":"2024-10-20T18:25:19.59293904Z","level":"INFO","msg":"Starting system monitor"}
wandb/debug.log
ADDED
@@ -0,0 +1,28 @@
2024-10-20 18:25:18,064 INFO MainThread:4102 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
2024-10-20 18:25:18,064 INFO MainThread:4102 [wandb_setup.py:_flush():79] Configure stats pid to 4102
2024-10-20 18:25:18,065 INFO MainThread:4102 [wandb_setup.py:_flush():79] Loading settings from /root/.config/wandb/settings
2024-10-20 18:25:18,065 INFO MainThread:4102 [wandb_setup.py:_flush():79] Loading settings from /workspace/nanoT5/logs/2024-10-20/18-25-17/wandb/settings
2024-10-20 18:25:18,066 INFO MainThread:4102 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
2024-10-20 18:25:18,066 INFO MainThread:4102 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
2024-10-20 18:25:18,067 WARNING MainThread:4102 [wandb_setup.py:_flush():79] Could not find program at -m nanoT5.main
2024-10-20 18:25:18,067 INFO MainThread:4102 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m nanoT5.main'}
2024-10-20 18:25:18,068 INFO MainThread:4102 [wandb_setup.py:_flush():79] Applying login settings: {}
2024-10-20 18:25:18,069 INFO MainThread:4102 [wandb_init.py:_log_setup():534] Logging user logs to /workspace/nanoT5/logs/2024-10-20/18-25-17/wandb/run-20241020_182518-i0qk9v3k/logs/debug.log
2024-10-20 18:25:18,071 INFO MainThread:4102 [wandb_init.py:_log_setup():535] Logging internal logs to /workspace/nanoT5/logs/2024-10-20/18-25-17/wandb/run-20241020_182518-i0qk9v3k/logs/debug-internal.log
2024-10-20 18:25:18,071 INFO MainThread:4102 [wandb_init.py:init():621] calling init triggers
2024-10-20 18:25:18,072 INFO MainThread:4102 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
config: {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 93789, 'tokenizer': {'name': 'BEE-spoke-data/hf_slimpajama-6B-28672-BPE-forT5'}, 'working_dir': '/workspace/nanoT5/logs/2024-10-20/18-25-17', 'model': {'liger': True, 'klass': 'local_t5', 'name': 'pszemraj/tFINE-850m-24x24-1024ctx', 'overwrite': {'dropout_rate': 0.0, 'num_decoder_layers': 16, 'num_key_value_heads': 4, 'num_layers': 16, 'use_gqa': True}, 'add_config': {'is_bf16': True}, 'checkpoint_path': '', 'random_init': True, 'compile': True}, 'data': {'multi_task': True, 'NTP': 0.3, 'input_length': 512, 'max_seq_len': 512, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 0}, 'optim': {'name': 'adamwscale', 'base_lr': 0.001, 'batch_size': 128, 'total_steps': 65536, 'epochs': -1, 'warmup_steps': 5000, 'lr_scheduler': 'cosine', 'weight_decay': 0.01, 'grad_clip': 1.0, 'grad_acc': 16, 'final_cosine': 2e-05}, 'eval': {'every_steps': 500, 'steps': 0}, 'checkpoint': {'every_steps': 1500}, 'logging': {'every_steps': 25, 'grad_l2': True, 'weights_l2': True, 'use_wandb': True, 'wandb_config': {'project': 'nanoT5', 'entity': 'amazingvince', 'tags': ['gqa', 'large', 'e32-d16', '512 ctx'], 'mode': 'online'}}, 'slurm_id': 'none'}
2024-10-20 18:25:18,073 INFO MainThread:4102 [wandb_init.py:init():671] starting backend
2024-10-20 18:25:18,074 INFO MainThread:4102 [wandb_init.py:init():675] sending inform_init request
2024-10-20 18:25:18,121 INFO MainThread:4102 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
2024-10-20 18:25:18,122 INFO MainThread:4102 [wandb_init.py:init():688] backend started and connected
2024-10-20 18:25:18,198 INFO MainThread:4102 [wandb_init.py:init():783] updated telemetry
2024-10-20 18:25:18,256 INFO MainThread:4102 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
2024-10-20 18:25:19,558 INFO MainThread:4102 [wandb_init.py:init():867] starting run threads in backend
2024-10-20 18:25:19,755 INFO MainThread:4102 [wandb_run.py:_console_start():2463] atexit reg
2024-10-20 18:25:19,756 INFO MainThread:4102 [wandb_run.py:_redirect():2311] redirect: wrap_raw
2024-10-20 18:25:19,757 INFO MainThread:4102 [wandb_run.py:_redirect():2376] Wrapping output streams.
2024-10-20 18:25:19,759 INFO MainThread:4102 [wandb_run.py:_redirect():2401] Redirects installed.
2024-10-20 18:25:19,763 INFO MainThread:4102 [wandb_init.py:init():911] run started, returning control to user process
2024-10-20 18:25:41,763 INFO MainThread:4102 [wandb_run.py:_config_callback():1390] config_cb None None {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 93789, 'tokenizer': {'name': 'BEE-spoke-data/hf_slimpajama-6B-28672-BPE-forT5'}, 'working_dir': '/workspace/nanoT5/logs/2024-10-20/18-25-17', 'model': {'liger': True, 'klass': 'local_t5', 'name': 'pszemraj/tFINE-850m-24x24-1024ctx', 'overwrite': {'dropout_rate': 0.0, 'num_decoder_layers': 16, 'num_key_value_heads': 4, 'num_layers': 16, 'use_gqa': True}, 'add_config': {'is_bf16': True}, 'checkpoint_path': '', 'random_init': True, 'compile': True}, 'data': {'multi_task': True, 'NTP': 0.3, 'input_length': 512, 'max_seq_len': 512, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 0, 'before_mask_input_length': 568, 'target_length': 114}, 'optim': {'name': 'adamwscale', 'base_lr': 0.001, 'batch_size': 128, 'total_steps': 65536, 'epochs': -1, 'warmup_steps': 5000, 'lr_scheduler': 'cosine', 'weight_decay': 0.01, 'grad_clip': 1.0, 'grad_acc': 16, 'final_cosine': 2e-05}, 'eval': {'every_steps': 500, 'steps': 0, 'corrected_steps': 0}, 'checkpoint': {'every_steps': 1500}, 'logging': {'every_steps': 25, 'grad_l2': True, 'weights_l2': True, 'use_wandb': True, 'wandb_config': {'project': 'nanoT5', 'entity': 'amazingvince', 'tags': ['gqa', 'large', 'e32-d16', '512 ctx'], 'mode': 'online'}}, 'slurm_id': 'none', 'n_all_param': 486886912}
2024-10-24 02:27:45,254 WARNING MsgRouterThr:4102 [router.py:message_loop():77] message_loop has been closed
wandb/run-20241020_182518-i0qk9v3k/files/config.yaml
ADDED
@@ -0,0 +1,123 @@
_wandb:
  value:
    cli_version: 0.18.5
    m: []
    python_version: 3.11.10
    t:
      "1":
        - 1
        - 11
        - 41
        - 49
        - 50
        - 51
        - 55
        - 71
        - 100
      "2":
        - 1
        - 11
        - 41
        - 49
        - 50
        - 51
        - 55
        - 71
        - 100
      "3":
        - 15
        - 16
        - 23
        - 55
        - 61
      "4": 3.11.10
      "5": 0.18.5
      "6": 4.46.0.dev0
      "8":
        - 5
      "12": 0.18.5
      "13": linux-x86_64
checkpoint:
  value:
    every_steps: 1500
data:
  value:
    NTP: 0.3
    before_mask_input_length: 568
    input_length: 512
    max_seq_len: 512
    mean_noise_span_length: 3
    mlm_probability: 0.15
    multi_task: true
    num_workers: 0
    target_length: 114
device:
  value: gpu
eval:
  value:
    corrected_steps: 0
    every_steps: 500
    steps: 0
eval_only:
  value: false
logging:
  value:
    every_steps: 25
    grad_l2: true
    use_wandb: true
    wandb_config:
      entity: amazingvince
      mode: online
      project: nanoT5
      tags:
        - gqa
        - large
        - e32-d16
        - 512 ctx
    weights_l2: true
mode:
  value: pt
model:
  value:
    add_config:
      is_bf16: true
    checkpoint_path: ""
    compile: true
    klass: local_t5
    liger: true
    name: pszemraj/tFINE-850m-24x24-1024ctx
    overwrite:
      dropout_rate: 0
      num_decoder_layers: 16
      num_key_value_heads: 4
      num_layers: 16
      use_gqa: true
    random_init: true
n_all_param:
  value: 486886912
optim:
  value:
    base_lr: 0.001
    batch_size: 128
    epochs: -1
    final_cosine: 2e-05
    grad_acc: 16
    grad_clip: 1
    lr_scheduler: cosine
    name: adamwscale
    total_steps: 65536
    warmup_steps: 5000
    weight_decay: 0.01
precision:
  value: bf16
predict_only:
  value: false
seed:
  value: 93789
slurm_id:
  value: none
tokenizer:
  value:
    name: BEE-spoke-data/hf_slimpajama-6B-28672-BPE-forT5
working_dir:
  value: /workspace/nanoT5/logs/2024-10-20/18-25-17
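
wandb wraps every top-level key of this dump in a {"value": ...} envelope, so recovering the run's hyperparameters takes one unwrapping step. A minimal sketch, assuming only the file layout shown above:

import yaml

# Sketch: pull the run's hyperparameters back out of wandb's config dump.
with open("wandb/run-20241020_182518-i0qk9v3k/files/config.yaml") as f:
    raw = yaml.safe_load(f)

# Drop wandb's internal telemetry and strip the "value" envelope.
cfg = {k: v["value"] for k, v in raw.items() if k != "_wandb"}
print(cfg["optim"]["base_lr"], cfg["optim"]["total_steps"])  # 0.001 65536
print(cfg["model"]["overwrite"]["num_key_value_heads"])      # 4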
wandb/run-20241020_182518-i0qk9v3k/files/output.log
ADDED
@@ -0,0 +1,259 @@
+Using tokenizer: BEE-spoke-data/hf_slimpajama-6B-28672-BPE-forT5
+loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--hf_slimpajama-6B-28672-BPE-forT5/snapshots/1ed7c49f137e7b3e2f211669f66c8f51dd392f20/tokenizer.model
+loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--hf_slimpajama-6B-28672-BPE-forT5/snapshots/1ed7c49f137e7b3e2f211669f66c8f51dd392f20/tokenizer.json
+loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--hf_slimpajama-6B-28672-BPE-forT5/snapshots/1ed7c49f137e7b3e2f211669f66c8f51dd392f20/added_tokens.json
+loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--hf_slimpajama-6B-28672-BPE-forT5/snapshots/1ed7c49f137e7b3e2f211669f66c8f51dd392f20/special_tokens_map.json
+loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--hf_slimpajama-6B-28672-BPE-forT5/snapshots/1ed7c49f137e7b3e2f211669f66c8f51dd392f20/tokenizer_config.json
+Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--pszemraj--tFINE-850m-24x24-1024ctx/snapshots/bbbb8d2ac68f72ce0129f29dd22428c4b219224c/config.json
+Model config T5Config {
+  "_name_or_path": "pszemraj/tFINE-850m-24x24-1024ctx",
+  "architectures": [
+    "T5ForConditionalGeneration"
+  ],
+  "classifier_dropout": 0.0,
+  "d_ff": 3072,
+  "d_kv": 64,
+  "d_model": 1024,
+  "decoder_start_token_id": 3,
+  "dense_act_fn": "silu",
+  "dropout_rate": 0.0,
+  "eos_token_id": 2,
+  "feed_forward_proj": "gated-silu",
+  "initializer_factor": 1.0,
+  "is_encoder_decoder": true,
+  "is_gated_act": true,
+  "layer_norm_epsilon": 1e-06,
+  "model_type": "t5",
+  "num_decoder_layers": 24,
+  "num_heads": 16,
+  "num_key_value_heads": 8,
+  "num_layers": 24,
+  "output_past": true,
+  "pad_token_id": 3,
+  "relative_attention_max_distance": 128,
+  "relative_attention_num_buckets": 48,
+  "tie_word_embeddings": false,
+  "transformers_version": "4.46.0.dev0",
+  "use_cache": true,
+  "use_gqa": false,
+  "vocab_size": 48256
+}
+
+Resolving data files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 104/104 [00:00<00:00, 485.45it/s]
+Resolving data files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 234/234 [00:00<00:00, 14950.45it/s]
+Resolving data files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1024/1024 [00:04<00:00, 207.69it/s]
+Resolving data files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1024/1024 [00:00<00:00, 170394.64it/s]
+=========================================================================
+Layer (type:depth-idx)                   Output Shape   Param #        Trainable
+=========================================================================
+MyT5                                                    486,837,760    True
+Embedding                                               29,466,624     True
+T5Stack                                                 222,439,168    True
+Embedding                                               29,466,624     True
+ModuleList                                              192,971,520    True
+T5LayerNorm                                             1,024          True
+Dropout                                                 --             False
+T5Stack                                                 264,398,592    True
+Embedding                                               29,466,624     True
+ModuleList                                              234,930,944    True
+T5LayerNorm                                             1,024          True
+Dropout                                                 --             False
+Linear                                                  29,466,624     True
+LigerCrossEntropyLoss                                   --             False
+=========================================================================
+Total params: 486,837,760
+Trainable params: 486,837,760
+Non-trainable params: --
+=========================================================================
+Configuration saved in ./config.json
+W1020 18:25:54.835000 139930872341632 torch/_dynamo/variables/tensor.py:715] [0/0] Graph break from `Tensor.item()`, consider setting:
+W1020 18:25:54.835000 139930872341632 torch/_dynamo/variables/tensor.py:715] [0/0] torch._dynamo.config.capture_scalar_outputs = True
+W1020 18:25:54.835000 139930872341632 torch/_dynamo/variables/tensor.py:715] [0/0] or:
+W1020 18:25:54.835000 139930872341632 torch/_dynamo/variables/tensor.py:715] [0/0] env TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1
+W1020 18:25:54.835000 139930872341632 torch/_dynamo/variables/tensor.py:715] [0/0] to include these operations in the captured graph.
+W1020 18:25:54.835000 139930872341632 torch/_dynamo/variables/tensor.py:715] [0/0]
+W1020 18:27:14.737000 139920144193088 torch/fx/experimental/symbolic_shapes.py:4449] [2/0_1] r0 is not in var_ranges, defaulting to unknown range.
+W1020 18:27:21.491000 139920144193088 torch/fx/experimental/symbolic_shapes.py:4449] [2/0_1] q0 is not in var_ranges, defaulting to unknown range.
+W1020 18:27:21.545000 139920144193088 torch/fx/experimental/symbolic_shapes.py:4449] [2/0_1] z0 is not in var_ranges, defaulting to unknown range.
+W1020 18:27:27.722000 139920144193088 torch/fx/experimental/symbolic_shapes.py:4449] [2/0_1] x1 is not in var_ranges, defaulting to unknown range.
+[2024-10-20 18:31:35,111][Main][INFO] - [train] Step 25 out of 65536 | Loss --> 155.837 | Loss_ntp --> 76.275 | Loss_mlm --> 79.561 | Grad_l2 --> 476.354 | Weights_l2 --> 7701.821 | Lr --> 0.001 | Seconds_per_step --> 14.044 |
+[2024-10-20 18:35:35,171][Main][INFO] - [train] Step 50 out of 65536 | Loss --> 98.644 | Loss_ntp --> 48.540 | Loss_mlm --> 50.105 | Grad_l2 --> 234.932 | Weights_l2 --> 7701.813 | Lr --> 0.001 | Seconds_per_step --> 9.602 |
+[2024-10-20 18:39:35,197][Main][INFO] - [train] Step 75 out of 65536 | Loss --> 86.994 | Loss_ntp --> 42.861 | Loss_mlm --> 44.133 | Grad_l2 --> 180.388 | Weights_l2 --> 7701.806 | Lr --> 0.001 | Seconds_per_step --> 9.601 |
+[2024-10-20 18:43:35,733][Main][INFO] - [train] Step 100 out of 65536 | Loss --> 80.568 | Loss_ntp --> 39.806 | Loss_mlm --> 40.762 | Grad_l2 --> 156.732 | Weights_l2 --> 7701.800 | Lr --> 0.001 | Seconds_per_step --> 9.621 |
+[2024-10-20 18:47:37,016][Main][INFO] - [train] Step 125 out of 65536 | Loss --> 77.131 | Loss_ntp --> 38.127 | Loss_mlm --> 39.004 | Grad_l2 --> 179.590 | Weights_l2 --> 7701.794 | Lr --> 0.001 | Seconds_per_step --> 9.651 |
+[2024-10-20 18:51:38,437][Main][INFO] - [train] Step 150 out of 65536 | Loss --> 73.900 | Loss_ntp --> 36.620 | Loss_mlm --> 37.281 | Grad_l2 --> 161.591 | Weights_l2 --> 7701.789 | Lr --> 0.001 | Seconds_per_step --> 9.657 |
+[2024-10-20 18:55:39,020][Main][INFO] - [train] Step 175 out of 65536 | Loss --> 72.118 | Loss_ntp --> 35.763 | Loss_mlm --> 36.355 | Grad_l2 --> 161.741 | Weights_l2 --> 7701.783 | Lr --> 0.001 | Seconds_per_step --> 9.623 |
+[2024-10-20 18:59:40,344][Main][INFO] - [train] Step 200 out of 65536 | Loss --> 70.712 | Loss_ntp --> 35.041 | Loss_mlm --> 35.671 | Grad_l2 --> 154.736 | Weights_l2 --> 7701.778 | Lr --> 0.001 | Seconds_per_step --> 9.653 |
+[2024-10-20 19:03:39,817][Main][INFO] - [train] Step 225 out of 65536 | Loss --> 69.050 | Loss_ntp --> 34.233 | Loss_mlm --> 34.817 | Grad_l2 --> 106.908 | Weights_l2 --> 7701.772 | Lr --> 0.001 | Seconds_per_step --> 9.579 |
+[2024-10-20 19:07:41,876][Main][INFO] - [train] Step 250 out of 65536 | Loss --> 68.595 | Loss_ntp --> 33.970 | Loss_mlm --> 34.625 | Grad_l2 --> 126.557 | Weights_l2 --> 7701.767 | Lr --> 0.001 | Seconds_per_step --> 9.682 |
+[2024-10-20 19:11:43,944][Main][INFO] - [train] Step 275 out of 65536 | Loss --> 67.141 | Loss_ntp --> 33.297 | Loss_mlm --> 33.844 | Grad_l2 --> 114.874 | Weights_l2 --> 7701.762 | Lr --> 0.001 | Seconds_per_step --> 9.683 |
+[2024-10-20 19:15:43,786][Main][INFO] - [train] Step 300 out of 65536 | Loss --> 65.916 | Loss_ntp --> 32.693 | Loss_mlm --> 33.223 | Grad_l2 --> 89.430 | Weights_l2 --> 7701.757 | Lr --> 0.001 | Seconds_per_step --> 9.594 |
+[2024-10-20 19:19:45,206][Main][INFO] - [train] Step 325 out of 65536 | Loss --> 65.322 | Loss_ntp --> 32.362 | Loss_mlm --> 32.960 | Grad_l2 --> 97.785 | Weights_l2 --> 7701.751 | Lr --> 0.001 | Seconds_per_step --> 9.657 |
+[2024-10-20 19:23:45,072][Main][INFO] - [train] Step 350 out of 65536 | Loss --> 64.367 | Loss_ntp --> 31.937 | Loss_mlm --> 32.430 | Grad_l2 --> 83.882 | Weights_l2 --> 7701.746 | Lr --> 0.001 | Seconds_per_step --> 9.595 |
+[2024-10-20 19:27:46,534][Main][INFO] - [train] Step 375 out of 65536 | Loss --> 63.409 | Loss_ntp --> 31.433 | Loss_mlm --> 31.975 | Grad_l2 --> 75.548 | Weights_l2 --> 7701.741 | Lr --> 0.001 | Seconds_per_step --> 9.658 |
+[2024-10-20 19:31:45,390][Main][INFO] - [train] Step 400 out of 65536 | Loss --> 62.292 | Loss_ntp --> 30.925 | Loss_mlm --> 31.367 | Grad_l2 --> 72.299 | Weights_l2 --> 7701.736 | Lr --> 0.001 | Seconds_per_step --> 9.554 |
+[2024-10-20 19:35:46,689][Main][INFO] - [train] Step 425 out of 65536 | Loss --> 61.685 | Loss_ntp --> 30.585 | Loss_mlm --> 31.100 | Grad_l2 --> 73.838 | Weights_l2 --> 7701.731 | Lr --> 0.001 | Seconds_per_step --> 9.652 |
+[2024-10-20 19:39:46,030][Main][INFO] - [train] Step 450 out of 65536 | Loss --> 61.416 | Loss_ntp --> 30.509 | Loss_mlm --> 30.907 | Grad_l2 --> 79.820 | Weights_l2 --> 7701.726 | Lr --> 0.001 | Seconds_per_step --> 9.573 |
+[2024-10-20 19:43:47,298][Main][INFO] - [train] Step 475 out of 65536 | Loss --> 60.536 | Loss_ntp --> 30.069 | Loss_mlm --> 30.467 | Grad_l2 --> 59.074 | Weights_l2 --> 7701.722 | Lr --> 0.001 | Seconds_per_step --> 9.651 |
+[2024-10-20 19:47:48,778][Main][INFO] - [train] Step 500 out of 65536 | Loss --> 60.085 | Loss_ntp --> 29.838 | Loss_mlm --> 30.246 | Grad_l2 --> 71.417 | Weights_l2 --> 7701.717 | Lr --> 0.001 | Seconds_per_step --> 9.659 |
+[2024-10-20 19:49:25,862][Main][INFO] - [eval] Step 500 out of 65536 | Loss --> 57.611 | Loss_ntp --> 28.694 | Loss_mlm --> 28.917 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 97.080 |
+[2024-10-20 19:53:26,482][Main][INFO] - [train] Step 525 out of 65536 | Loss --> 59.106 | Loss_ntp --> 29.371 | Loss_mlm --> 29.735 | Grad_l2 --> 56.829 | Weights_l2 --> 7701.712 | Lr --> 0.001 | Seconds_per_step --> 9.625 |
+[2024-10-20 19:57:25,811][Main][INFO] - [train] Step 550 out of 65536 | Loss --> 58.185 | Loss_ntp --> 28.950 | Loss_mlm --> 29.235 | Grad_l2 --> 56.368 | Weights_l2 --> 7701.707 | Lr --> 0.001 | Seconds_per_step --> 9.573 |
+[2024-10-20 20:01:26,095][Main][INFO] - [train] Step 575 out of 65536 | Loss --> 57.301 | Loss_ntp --> 28.480 | Loss_mlm --> 28.821 | Grad_l2 --> 39.860 | Weights_l2 --> 7701.703 | Lr --> 0.001 | Seconds_per_step --> 9.611 |
+[2024-10-20 20:05:26,649][Main][INFO] - [train] Step 600 out of 65536 | Loss --> 56.020 | Loss_ntp --> 27.906 | Loss_mlm --> 28.115 | Grad_l2 --> 35.414 | Weights_l2 --> 7701.698 | Lr --> 0.001 | Seconds_per_step --> 9.622 |
+[2024-10-20 20:09:28,597][Main][INFO] - [train] Step 625 out of 65536 | Loss --> 55.363 | Loss_ntp --> 27.524 | Loss_mlm --> 27.840 | Grad_l2 --> 50.531 | Weights_l2 --> 7701.694 | Lr --> 0.001 | Seconds_per_step --> 9.678 |
+[2024-10-20 20:13:29,399][Main][INFO] - [train] Step 650 out of 65536 | Loss --> 54.803 | Loss_ntp --> 27.252 | Loss_mlm --> 27.551 | Grad_l2 --> 56.108 | Weights_l2 --> 7701.689 | Lr --> 0.001 | Seconds_per_step --> 9.632 |
+[2024-10-20 20:17:31,948][Main][INFO] - [train] Step 675 out of 65536 | Loss --> 53.970 | Loss_ntp --> 26.793 | Loss_mlm --> 27.176 | Grad_l2 --> 46.473 | Weights_l2 --> 7701.685 | Lr --> 0.001 | Seconds_per_step --> 9.702 |
+[2024-10-20 20:21:31,196][Main][INFO] - [train] Step 700 out of 65536 | Loss --> 53.056 | Loss_ntp --> 26.359 | Loss_mlm --> 26.697 | Grad_l2 --> 37.435 | Weights_l2 --> 7701.680 | Lr --> 0.001 | Seconds_per_step --> 9.570 |
+[2024-10-20 20:25:33,347][Main][INFO] - [train] Step 725 out of 65536 | Loss --> 52.070 | Loss_ntp --> 25.876 | Loss_mlm --> 26.194 | Grad_l2 --> 43.881 | Weights_l2 --> 7701.676 | Lr --> 0.001 | Seconds_per_step --> 9.686 |
+[2024-10-20 20:29:33,004][Main][INFO] - [train] Step 750 out of 65536 | Loss --> 51.191 | Loss_ntp --> 25.456 | Loss_mlm --> 25.735 | Grad_l2 --> 44.855 | Weights_l2 --> 7701.672 | Lr --> 0.001 | Seconds_per_step --> 9.586 |
+[2024-10-20 20:33:34,557][Main][INFO] - [train] Step 775 out of 65536 | Loss --> 50.129 | Loss_ntp --> 24.891 | Loss_mlm --> 25.239 | Grad_l2 --> 40.117 | Weights_l2 --> 7701.667 | Lr --> 0.001 | Seconds_per_step --> 9.662 |
+[2024-10-20 20:37:33,242][Main][INFO] - [train] Step 800 out of 65536 | Loss --> 49.019 | Loss_ntp --> 24.361 | Loss_mlm --> 24.658 | Grad_l2 --> 39.953 | Weights_l2 --> 7701.663 | Lr --> 0.001 | Seconds_per_step --> 9.547 |
+[2024-10-20 20:41:33,285][Main][INFO] - [train] Step 825 out of 65536 | Loss --> 48.160 | Loss_ntp --> 23.923 | Loss_mlm --> 24.238 | Grad_l2 --> 42.816 | Weights_l2 --> 7701.659 | Lr --> 0.001 | Seconds_per_step --> 9.602 |
+[2024-10-20 20:45:34,352][Main][INFO] - [train] Step 850 out of 65536 | Loss --> 46.672 | Loss_ntp --> 23.149 | Loss_mlm --> 23.522 | Grad_l2 --> 42.230 | Weights_l2 --> 7701.654 | Lr --> 0.001 | Seconds_per_step --> 9.643 |
+[2024-10-20 20:49:34,963][Main][INFO] - [train] Step 875 out of 65536 | Loss --> 44.855 | Loss_ntp --> 22.279 | Loss_mlm --> 22.575 | Grad_l2 --> 39.123 | Weights_l2 --> 7701.650 | Lr --> 0.001 | Seconds_per_step --> 9.624 |
+[2024-10-20 20:53:36,677][Main][INFO] - [train] Step 900 out of 65536 | Loss --> 42.480 | Loss_ntp --> 21.057 | Loss_mlm --> 21.423 | Grad_l2 --> 50.501 | Weights_l2 --> 7701.645 | Lr --> 0.001 | Seconds_per_step --> 9.668 |
+[2024-10-20 20:57:37,186][Main][INFO] - [train] Step 925 out of 65536 | Loss --> 40.028 | Loss_ntp --> 19.877 | Loss_mlm --> 20.151 | Grad_l2 --> 57.109 | Weights_l2 --> 7701.640 | Lr --> 0.001 | Seconds_per_step --> 9.620 |
+[2024-10-20 21:01:38,800][Main][INFO] - [train] Step 950 out of 65536 | Loss --> 37.058 | Loss_ntp --> 18.359 | Loss_mlm --> 18.699 | Grad_l2 --> 78.443 | Weights_l2 --> 7701.634 | Lr --> 0.001 | Seconds_per_step --> 9.664 |
+[2024-10-20 21:05:38,405][Main][INFO] - [train] Step 975 out of 65536 | Loss --> 33.534 | Loss_ntp --> 16.618 | Loss_mlm --> 16.917 | Grad_l2 --> 87.220 | Weights_l2 --> 7701.628 | Lr --> 0.001 | Seconds_per_step --> 9.584 |
+[2024-10-20 21:09:41,153][Main][INFO] - [train] Step 1000 out of 65536 | Loss --> 29.988 | Loss_ntp --> 14.857 | Loss_mlm --> 15.131 | Grad_l2 --> 88.279 | Weights_l2 --> 7701.622 | Lr --> 0.001 | Seconds_per_step --> 9.710 |
+[2024-10-20 21:10:10,310][Main][INFO] - [eval] Step 1000 out of 65536 | Loss --> 28.033 | Loss_ntp --> 13.938 | Loss_mlm --> 14.095 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 29.143 |
+[2024-10-20 21:14:10,580][Main][INFO] - [train] Step 1025 out of 65536 | Loss --> 26.588 | Loss_ntp --> 13.166 | Loss_mlm --> 13.423 | Grad_l2 --> 109.226 | Weights_l2 --> 7701.616 | Lr --> 0.001 | Seconds_per_step --> 9.611 |
+[2024-10-20 21:18:12,558][Main][INFO] - [train] Step 1050 out of 65536 | Loss --> 23.850 | Loss_ntp --> 11.830 | Loss_mlm --> 12.020 | Grad_l2 --> 98.666 | Weights_l2 --> 7701.610 | Lr --> 0.001 | Seconds_per_step --> 9.679 |
+[2024-10-20 21:22:11,593][Main][INFO] - [train] Step 1075 out of 65536 | Loss --> 21.589 | Loss_ntp --> 10.697 | Loss_mlm --> 10.892 | Grad_l2 --> 104.858 | Weights_l2 --> 7701.605 | Lr --> 0.001 | Seconds_per_step --> 9.561 |
+[2024-10-20 21:26:13,779][Main][INFO] - [train] Step 1100 out of 65536 | Loss --> 19.443 | Loss_ntp --> 9.626 | Loss_mlm --> 9.817 | Grad_l2 --> 75.473 | Weights_l2 --> 7701.599 | Lr --> 0.001 | Seconds_per_step --> 9.687 |
+[2024-10-20 21:30:13,762][Main][INFO] - [train] Step 1125 out of 65536 | Loss --> 17.771 | Loss_ntp --> 8.793 | Loss_mlm --> 8.978 | Grad_l2 --> 55.492 | Weights_l2 --> 7701.593 | Lr --> 0.001 | Seconds_per_step --> 9.599 |
+[2024-10-20 21:34:14,478][Main][INFO] - [train] Step 1150 out of 65536 | Loss --> 17.092 | Loss_ntp --> 8.462 | Loss_mlm --> 8.630 | Grad_l2 --> 72.673 | Weights_l2 --> 7701.587 | Lr --> 0.001 | Seconds_per_step --> 9.629 |
+[2024-10-20 21:38:14,797][Main][INFO] - [train] Step 1175 out of 65536 | Loss --> 16.731 | Loss_ntp --> 8.294 | Loss_mlm --> 8.437 | Grad_l2 --> 60.718 | Weights_l2 --> 7701.582 | Lr --> 0.001 | Seconds_per_step --> 9.613 |
+[2024-10-20 21:42:15,467][Main][INFO] - [train] Step 1200 out of 65536 | Loss --> 16.522 | Loss_ntp --> 8.188 | Loss_mlm --> 8.334 | Grad_l2 --> 62.414 | Weights_l2 --> 7701.577 | Lr --> 0.001 | Seconds_per_step --> 9.627 |
+[2024-10-20 21:46:15,957][Main][INFO] - [train] Step 1225 out of 65536 | Loss --> 16.336 | Loss_ntp --> 8.096 | Loss_mlm --> 8.240 | Grad_l2 --> 57.944 | Weights_l2 --> 7701.572 | Lr --> 0.001 | Seconds_per_step --> 9.619 |
+[2024-10-20 21:50:15,276][Main][INFO] - [train] Step 1250 out of 65536 | Loss --> 16.167 | Loss_ntp --> 8.006 | Loss_mlm --> 8.161 | Grad_l2 --> 42.899 | Weights_l2 --> 7701.567 | Lr --> 0.001 | Seconds_per_step --> 9.573 |
+[2024-10-20 21:54:18,039][Main][INFO] - [train] Step 1275 out of 65536 | Loss --> 16.183 | Loss_ntp --> 8.017 | Loss_mlm --> 8.166 | Grad_l2 --> 48.492 | Weights_l2 --> 7701.563 | Lr --> 0.001 | Seconds_per_step --> 9.710 |
+[2024-10-20 21:58:18,396][Main][INFO] - [train] Step 1300 out of 65536 | Loss --> 15.988 | Loss_ntp --> 7.926 | Loss_mlm --> 8.063 | Grad_l2 --> 42.852 | Weights_l2 --> 7701.558 | Lr --> 0.001 | Seconds_per_step --> 9.614 |
+[2024-10-20 22:02:20,263][Main][INFO] - [train] Step 1325 out of 65536 | Loss --> 15.982 | Loss_ntp --> 7.916 | Loss_mlm --> 8.066 | Grad_l2 --> 47.218 | Weights_l2 --> 7701.553 | Lr --> 0.001 | Seconds_per_step --> 9.675 |
+[2024-10-20 22:06:20,739][Main][INFO] - [train] Step 1350 out of 65536 | Loss --> 15.830 | Loss_ntp --> 7.838 | Loss_mlm --> 7.992 | Grad_l2 --> 28.805 | Weights_l2 --> 7701.549 | Lr --> 0.001 | Seconds_per_step --> 9.619 |
+[2024-10-20 22:10:23,190][Main][INFO] - [train] Step 1375 out of 65536 | Loss --> 15.806 | Loss_ntp --> 7.839 | Loss_mlm --> 7.967 | Grad_l2 --> 37.388 | Weights_l2 --> 7701.544 | Lr --> 0.001 | Seconds_per_step --> 9.698 |
+[2024-10-20 22:14:23,525][Main][INFO] - [train] Step 1400 out of 65536 | Loss --> 15.775 | Loss_ntp --> 7.813 | Loss_mlm --> 7.962 | Grad_l2 --> 35.380 | Weights_l2 --> 7701.540 | Lr --> 0.001 | Seconds_per_step --> 9.613 |
+[2024-10-20 22:18:25,080][Main][INFO] - [train] Step 1425 out of 65536 | Loss --> 15.722 | Loss_ntp --> 7.794 | Loss_mlm --> 7.928 | Grad_l2 --> 34.978 | Weights_l2 --> 7701.535 | Lr --> 0.001 | Seconds_per_step --> 9.662 |
+[2024-10-20 22:22:24,651][Main][INFO] - [train] Step 1450 out of 65536 | Loss --> 15.638 | Loss_ntp --> 7.739 | Loss_mlm --> 7.899 | Grad_l2 --> 24.003 | Weights_l2 --> 7701.530 | Lr --> 0.001 | Seconds_per_step --> 9.583 |
+[2024-10-20 22:26:24,495][Main][INFO] - [train] Step 1475 out of 65536 | Loss --> 15.682 | Loss_ntp --> 7.768 | Loss_mlm --> 7.913 | Grad_l2 --> 27.599 | Weights_l2 --> 7701.526 | Lr --> 0.001 | Seconds_per_step --> 9.594 |
+[2024-10-20 22:30:25,992][Main][INFO] - [train] Step 1500 out of 65536 | Loss --> 15.638 | Loss_ntp --> 7.754 | Loss_mlm --> 7.884 | Grad_l2 --> 22.985 | Weights_l2 --> 7701.521 | Lr --> 0.001 | Seconds_per_step --> 9.660 |
+[2024-10-20 22:30:54,697][Main][INFO] - [eval] Step 1500 out of 65536 | Loss --> 15.664 | Loss_ntp --> 7.782 | Loss_mlm --> 7.882 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 28.700 |
+[2024-10-20 22:30:54,709][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-1500
+[2024-10-20 22:30:54,719][accelerate.utils.other][WARNING] - Removed shared tensor {'encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
+[2024-10-20 22:30:59,988][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-1500/model.safetensors
+[2024-10-20 22:31:08,673][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-1500/optimizer.bin
+[2024-10-20 22:31:08,682][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-1500/scheduler.bin
+[2024-10-20 22:31:08,684][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-1500/sampler.bin
+[2024-10-20 22:31:08,686][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-1500/sampler_1.bin
+[2024-10-20 22:31:08,694][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-1500/random_states_0.pkl
+[2024-10-20 22:35:09,885][Main][INFO] - [train] Step 1525 out of 65536 | Loss --> 15.740 | Loss_ntp --> 7.803 | Loss_mlm --> 7.937 | Grad_l2 --> 35.476 | Weights_l2 --> 7701.516 | Lr --> 0.001 | Seconds_per_step --> 10.207 |
+[2024-10-20 22:39:10,189][Main][INFO] - [train] Step 1550 out of 65536 | Loss --> 15.717 | Loss_ntp --> 7.796 | Loss_mlm --> 7.921 | Grad_l2 --> 32.209 | Weights_l2 --> 7701.511 | Lr --> 0.001 | Seconds_per_step --> 9.612 |
+[2024-10-20 22:43:12,020][Main][INFO] - [train] Step 1575 out of 65536 | Loss --> 15.723 | Loss_ntp --> 7.805 | Loss_mlm --> 7.918 | Grad_l2 --> 35.393 | Weights_l2 --> 7701.506 | Lr --> 0.001 | Seconds_per_step --> 9.673 |
+[2024-10-20 22:47:13,492][Main][INFO] - [train] Step 1600 out of 65536 | Loss --> 15.617 | Loss_ntp --> 7.752 | Loss_mlm --> 7.865 | Grad_l2 --> 29.357 | Weights_l2 --> 7701.502 | Lr --> 0.001 | Seconds_per_step --> 9.659 |
+[2024-10-20 22:51:13,978][Main][INFO] - [train] Step 1625 out of 65536 | Loss --> 15.532 | Loss_ntp --> 7.709 | Loss_mlm --> 7.822 | Grad_l2 --> 18.501 | Weights_l2 --> 7701.497 | Lr --> 0.001 | Seconds_per_step --> 9.619 |
+[2024-10-20 22:55:14,600][Main][INFO] - [train] Step 1650 out of 65536 | Loss --> 15.565 | Loss_ntp --> 7.720 | Loss_mlm --> 7.845 | Grad_l2 --> 17.546 | Weights_l2 --> 7701.493 | Lr --> 0.001 | Seconds_per_step --> 9.625 |
+[2024-10-20 22:59:14,384][Main][INFO] - [train] Step 1675 out of 65536 | Loss --> 15.576 | Loss_ntp --> 7.737 | Loss_mlm --> 7.838 | Grad_l2 --> 23.599 | Weights_l2 --> 7701.489 | Lr --> 0.001 | Seconds_per_step --> 9.591 |
+[2024-10-20 23:03:16,878][Main][INFO] - [train] Step 1700 out of 65536 | Loss --> 15.612 | Loss_ntp --> 7.757 | Loss_mlm --> 7.855 | Grad_l2 --> 28.685 | Weights_l2 --> 7701.484 | Lr --> 0.001 | Seconds_per_step --> 9.700 |
+[2024-10-20 23:07:16,611][Main][INFO] - [train] Step 1725 out of 65536 | Loss --> 15.590 | Loss_ntp --> 7.728 | Loss_mlm --> 7.861 | Grad_l2 --> 22.357 | Weights_l2 --> 7701.479 | Lr --> 0.001 | Seconds_per_step --> 9.589 |
+[2024-10-20 23:11:18,435][Main][INFO] - [train] Step 1750 out of 65536 | Loss --> 15.475 | Loss_ntp --> 7.683 | Loss_mlm --> 7.792 | Grad_l2 --> 20.808 | Weights_l2 --> 7701.475 | Lr --> 0.001 | Seconds_per_step --> 9.673 |
+[2024-10-20 23:15:17,324][Main][INFO] - [train] Step 1775 out of 65536 | Loss --> 15.422 | Loss_ntp --> 7.655 | Loss_mlm --> 7.767 | Grad_l2 --> 16.928 | Weights_l2 --> 7701.470 | Lr --> 0.001 | Seconds_per_step --> 9.555 |
+[2024-10-20 23:19:17,823][Main][INFO] - [train] Step 1800 out of 65536 | Loss --> 15.370 | Loss_ntp --> 7.625 | Loss_mlm --> 7.745 | Grad_l2 --> 16.147 | Weights_l2 --> 7701.466 | Lr --> 0.001 | Seconds_per_step --> 9.620 |
+[2024-10-20 23:23:19,005][Main][INFO] - [train] Step 1825 out of 65536 | Loss --> 15.363 | Loss_ntp --> 7.629 | Loss_mlm --> 7.734 | Grad_l2 --> 19.934 | Weights_l2 --> 7701.462 | Lr --> 0.001 | Seconds_per_step --> 9.647 |
+[2024-10-20 23:27:17,933][Main][INFO] - [train] Step 1850 out of 65536 | Loss --> 15.347 | Loss_ntp --> 7.616 | Loss_mlm --> 7.732 | Grad_l2 --> 25.592 | Weights_l2 --> 7701.457 | Lr --> 0.001 | Seconds_per_step --> 9.557 |
+[2024-10-20 23:31:19,805][Main][INFO] - [train] Step 1875 out of 65536 | Loss --> 15.254 | Loss_ntp --> 7.577 | Loss_mlm --> 7.677 | Grad_l2 --> 19.500 | Weights_l2 --> 7701.453 | Lr --> 0.001 | Seconds_per_step --> 9.675 |
+[2024-10-20 23:35:18,582][Main][INFO] - [train] Step 1900 out of 65536 | Loss --> 15.204 | Loss_ntp --> 7.550 | Loss_mlm --> 7.653 | Grad_l2 --> 15.358 | Weights_l2 --> 7701.448 | Lr --> 0.001 | Seconds_per_step --> 9.551 |
+[2024-10-20 23:39:20,300][Main][INFO] - [train] Step 1925 out of 65536 | Loss --> 15.153 | Loss_ntp --> 7.525 | Loss_mlm --> 7.628 | Grad_l2 --> 13.241 | Weights_l2 --> 7701.445 | Lr --> 0.001 | Seconds_per_step --> 9.669 |
+[2024-10-20 23:43:21,680][Main][INFO] - [train] Step 1950 out of 65536 | Loss --> 15.111 | Loss_ntp --> 7.497 | Loss_mlm --> 7.614 | Grad_l2 --> 13.357 | Weights_l2 --> 7701.441 | Lr --> 0.001 | Seconds_per_step --> 9.655 |
+[2024-10-20 23:47:22,111][Main][INFO] - [train] Step 1975 out of 65536 | Loss --> 15.072 | Loss_ntp --> 7.475 | Loss_mlm --> 7.597 | Grad_l2 --> 15.485 | Weights_l2 --> 7701.437 | Lr --> 0.001 | Seconds_per_step --> 9.617 |
+[2024-10-20 23:51:21,960][Main][INFO] - [train] Step 2000 out of 65536 | Loss --> 15.061 | Loss_ntp --> 7.470 | Loss_mlm --> 7.591 | Grad_l2 --> 15.511 | Weights_l2 --> 7701.432 | Lr --> 0.001 | Seconds_per_step --> 9.594 |
+[2024-10-20 23:51:50,849][Main][INFO] - [eval] Step 2000 out of 65536 | Loss --> 15.092 | Loss_ntp --> 7.501 | Loss_mlm --> 7.591 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 28.883 |
+[2024-10-20 23:55:53,490][Main][INFO] - [train] Step 2025 out of 65536 | Loss --> 15.080 | Loss_ntp --> 7.479 | Loss_mlm --> 7.601 | Grad_l2 --> 17.451 | Weights_l2 --> 7701.428 | Lr --> 0.001 | Seconds_per_step --> 9.705 |
+[2024-10-20 23:59:53,747][Main][INFO] - [train] Step 2050 out of 65536 | Loss --> 14.998 | Loss_ntp --> 7.447 | Loss_mlm --> 7.551 | Grad_l2 --> 13.242 | Weights_l2 --> 7701.424 | Lr --> 0.001 | Seconds_per_step --> 9.610 |
+[2024-10-21 00:03:57,114][Main][INFO] - [train] Step 2075 out of 65536 | Loss --> 14.994 | Loss_ntp --> 7.431 | Loss_mlm --> 7.562 | Grad_l2 --> 17.409 | Weights_l2 --> 7701.419 | Lr --> 0.001 | Seconds_per_step --> 9.735 |
+[2024-10-21 00:07:56,557][Main][INFO] - [train] Step 2100 out of 65536 | Loss --> 14.993 | Loss_ntp --> 7.437 | Loss_mlm --> 7.556 | Grad_l2 --> 23.374 | Weights_l2 --> 7701.414 | Lr --> 0.001 | Seconds_per_step --> 9.578 |
+[2024-10-21 00:11:56,818][Main][INFO] - [train] Step 2125 out of 65536 | Loss --> 14.963 | Loss_ntp --> 7.428 | Loss_mlm --> 7.535 | Grad_l2 --> 24.857 | Weights_l2 --> 7701.410 | Lr --> 0.001 | Seconds_per_step --> 9.610 |
+[2024-10-21 00:15:56,927][Main][INFO] - [train] Step 2150 out of 65536 | Loss --> 14.829 | Loss_ntp --> 7.354 | Loss_mlm --> 7.474 | Grad_l2 --> 14.538 | Weights_l2 --> 7701.405 | Lr --> 0.001 | Seconds_per_step --> 9.604 |
+[2024-10-21 00:19:57,089][Main][INFO] - [train] Step 2175 out of 65536 | Loss --> 14.797 | Loss_ntp --> 7.344 | Loss_mlm --> 7.453 | Grad_l2 --> 13.598 | Weights_l2 --> 7701.400 | Lr --> 0.001 | Seconds_per_step --> 9.606 |
+[2024-10-21 00:23:58,135][Main][INFO] - [train] Step 2200 out of 65536 | Loss --> 14.774 | Loss_ntp --> 7.321 | Loss_mlm --> 7.454 | Grad_l2 --> 13.339 | Weights_l2 --> 7701.396 | Lr --> 0.001 | Seconds_per_step --> 9.642 |
+[2024-10-21 00:27:58,499][Main][INFO] - [train] Step 2225 out of 65536 | Loss --> 14.671 | Loss_ntp --> 7.284 | Loss_mlm --> 7.387 | Grad_l2 --> 13.884 | Weights_l2 --> 7701.392 | Lr --> 0.001 | Seconds_per_step --> 9.614 |
+[2024-10-21 00:31:59,596][Main][INFO] - [train] Step 2250 out of 65536 | Loss --> 14.635 | Loss_ntp --> 7.264 | Loss_mlm --> 7.371 | Grad_l2 --> 11.527 | Weights_l2 --> 7701.388 | Lr --> 0.001 | Seconds_per_step --> 9.644 |
+[2024-10-21 00:35:58,256][Main][INFO] - [train] Step 2275 out of 65536 | Loss --> 14.593 | Loss_ntp --> 7.247 | Loss_mlm --> 7.345 | Grad_l2 --> 9.993 | Weights_l2 --> 7701.384 | Lr --> 0.001 | Seconds_per_step --> 9.546 |
+[2024-10-21 00:39:59,379][Main][INFO] - [train] Step 2300 out of 65536 | Loss --> 14.543 | Loss_ntp --> 7.216 | Loss_mlm --> 7.327 | Grad_l2 --> 12.147 | Weights_l2 --> 7701.381 | Lr --> 0.001 | Seconds_per_step --> 9.644 |
+[2024-10-21 00:43:59,080][Main][INFO] - [train] Step 2325 out of 65536 | Loss --> 14.577 | Loss_ntp --> 7.231 | Loss_mlm --> 7.345 | Grad_l2 --> 12.365 | Weights_l2 --> 7701.376 | Lr --> 0.001 | Seconds_per_step --> 9.588 |
+[2024-10-21 00:47:59,811][Main][INFO] - [train] Step 2350 out of 65536 | Loss --> 14.512 | Loss_ntp --> 7.202 | Loss_mlm --> 7.310 | Grad_l2 --> 12.472 | Weights_l2 --> 7701.372 | Lr --> 0.001 | Seconds_per_step --> 9.629 |
+[2024-10-21 00:51:58,749][Main][INFO] - [train] Step 2375 out of 65536 | Loss --> 14.434 | Loss_ntp --> 7.166 | Loss_mlm --> 7.268 | Grad_l2 --> 12.198 | Weights_l2 --> 7701.368 | Lr --> 0.001 | Seconds_per_step --> 9.557 |
+[2024-10-21 00:55:58,527][Main][INFO] - [train] Step 2400 out of 65536 | Loss --> 14.390 | Loss_ntp --> 7.141 | Loss_mlm --> 7.249 | Grad_l2 --> 11.488 | Weights_l2 --> 7701.365 | Lr --> 0.001 | Seconds_per_step --> 9.591 |
+[2024-10-21 00:59:59,746][Main][INFO] - [train] Step 2425 out of 65536 | Loss --> 14.396 | Loss_ntp --> 7.142 | Loss_mlm --> 7.253 | Grad_l2 --> 11.924 | Weights_l2 --> 7701.361 | Lr --> 0.001 | Seconds_per_step --> 9.649 |
+[2024-10-21 01:03:58,922][Main][INFO] - [train] Step 2450 out of 65536 | Loss --> 14.319 | Loss_ntp --> 7.108 | Loss_mlm --> 7.211 | Grad_l2 --> 11.587 | Weights_l2 --> 7701.357 | Lr --> 0.001 | Seconds_per_step --> 9.567 |
+[2024-10-21 01:08:00,577][Main][INFO] - [train] Step 2475 out of 65536 | Loss --> 14.363 | Loss_ntp --> 7.132 | Loss_mlm --> 7.231 | Grad_l2 --> 11.854 | Weights_l2 --> 7701.353 | Lr --> 0.001 | Seconds_per_step --> 9.666 |
+[2024-10-21 01:12:00,070][Main][INFO] - [train] Step 2500 out of 65536 | Loss --> 14.333 | Loss_ntp --> 7.121 | Loss_mlm --> 7.212 | Grad_l2 --> 10.363 | Weights_l2 --> 7701.349 | Lr --> 0.001 | Seconds_per_step --> 9.580 |
+[2024-10-21 01:12:28,480][Main][INFO] - [eval] Step 2500 out of 65536 | Loss --> 14.573 | Loss_ntp --> 7.286 | Loss_mlm --> 7.287 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 28.404 |
+[2024-10-21 01:16:30,064][Main][INFO] - [train] Step 2525 out of 65536 | Loss --> 14.280 | Loss_ntp --> 7.089 | Loss_mlm --> 7.192 | Grad_l2 --> 13.178 | Weights_l2 --> 7701.345 | Lr --> 0.001 | Seconds_per_step --> 9.663 |
+[2024-10-21 01:20:29,018][Main][INFO] - [train] Step 2550 out of 65536 | Loss --> 14.260 | Loss_ntp --> 7.091 | Loss_mlm --> 7.169 | Grad_l2 --> 12.381 | Weights_l2 --> 7701.341 | Lr --> 0.001 | Seconds_per_step --> 9.558 |
+[2024-10-21 01:24:31,253][Main][INFO] - [train] Step 2575 out of 65536 | Loss --> 14.259 | Loss_ntp --> 7.078 | Loss_mlm --> 7.182 | Grad_l2 --> 11.247 | Weights_l2 --> 7701.337 | Lr --> 0.001 | Seconds_per_step --> 9.689 |
+[2024-10-21 01:28:31,446][Main][INFO] - [train] Step 2600 out of 65536 | Loss --> 14.259 | Loss_ntp --> 7.080 | Loss_mlm --> 7.179 | Grad_l2 --> 12.524 | Weights_l2 --> 7701.333 | Lr --> 0.001 | Seconds_per_step --> 9.608 |
+[2024-10-21 01:32:31,794][Main][INFO] - [train] Step 2625 out of 65536 | Loss --> 14.245 | Loss_ntp --> 7.068 | Loss_mlm --> 7.178 | Grad_l2 --> 12.087 | Weights_l2 --> 7701.330 | Lr --> 0.001 | Seconds_per_step --> 9.614 |
+[2024-10-21 01:36:32,411][Main][INFO] - [train] Step 2650 out of 65536 | Loss --> 14.247 | Loss_ntp --> 7.074 | Loss_mlm --> 7.173 | Grad_l2 --> 11.638 | Weights_l2 --> 7701.326 | Lr --> 0.001 | Seconds_per_step --> 9.625 |
+[2024-10-21 01:40:33,462][Main][INFO] - [train] Step 2675 out of 65536 | Loss --> 14.274 | Loss_ntp --> 7.086 | Loss_mlm --> 7.189 | Grad_l2 --> 10.415 | Weights_l2 --> 7701.322 | Lr --> 0.001 | Seconds_per_step --> 9.642 |
+[2024-10-21 01:44:33,254][Main][INFO] - [train] Step 2700 out of 65536 | Loss --> 14.276 | Loss_ntp --> 7.097 | Loss_mlm --> 7.179 | Grad_l2 --> 10.830 | Weights_l2 --> 7701.318 | Lr --> 0.001 | Seconds_per_step --> 9.592 |
+[2024-10-21 01:48:34,104][Main][INFO] - [train] Step 2725 out of 65536 | Loss --> 14.322 | Loss_ntp --> 7.117 | Loss_mlm --> 7.205 | Grad_l2 --> 11.668 | Weights_l2 --> 7701.314 | Lr --> 0.001 | Seconds_per_step --> 9.634 |
+[2024-10-21 01:52:33,834][Main][INFO] - [train] Step 2750 out of 65536 | Loss --> 14.393 | Loss_ntp --> 7.149 | Loss_mlm --> 7.244 | Grad_l2 --> 10.585 | Weights_l2 --> 7701.310 | Lr --> 0.001 | Seconds_per_step --> 9.589 |
+[2024-10-21 01:56:33,130][Main][INFO] - [train] Step 2775 out of 65536 | Loss --> 14.326 | Loss_ntp --> 7.124 | Loss_mlm --> 7.202 | Grad_l2 --> 9.862 | Weights_l2 --> 7701.306 | Lr --> 0.001 | Seconds_per_step --> 9.572 |
+[2024-10-21 02:00:34,375][Main][INFO] - [train] Step 2800 out of 65536 | Loss --> 14.354 | Loss_ntp --> 7.134 | Loss_mlm --> 7.220 | Grad_l2 --> 8.484 | Weights_l2 --> 7701.302 | Lr --> 0.001 | Seconds_per_step --> 9.650 |
+[2024-10-21 02:04:34,763][Main][INFO] - [train] Step 2825 out of 65536 | Loss --> 14.320 | Loss_ntp --> 7.118 | Loss_mlm --> 7.202 | Grad_l2 --> 11.118 | Weights_l2 --> 7701.298 | Lr --> 0.001 | Seconds_per_step --> 9.615 |
+[2024-10-21 02:08:35,157][Main][INFO] - [train] Step 2850 out of 65536 | Loss --> 14.323 | Loss_ntp --> 7.124 | Loss_mlm --> 7.199 | Grad_l2 --> 10.821 | Weights_l2 --> 7701.294 | Lr --> 0.001 | Seconds_per_step --> 9.616 |
+[2024-10-21 02:12:34,860][Main][INFO] - [train] Step 2875 out of 65536 | Loss --> 14.348 | Loss_ntp --> 7.129 | Loss_mlm --> 7.219 | Grad_l2 --> 9.481 | Weights_l2 --> 7701.291 | Lr --> 0.001 | Seconds_per_step --> 9.588 |
+[2024-10-21 02:16:36,448][Main][INFO] - [train] Step 2900 out of 65536 | Loss --> 14.413 | Loss_ntp --> 7.163 | Loss_mlm --> 7.250 | Grad_l2 --> 10.586 | Weights_l2 --> 7701.287 | Lr --> 0.001 | Seconds_per_step --> 9.663 |
+[2024-10-21 02:20:36,563][Main][INFO] - [train] Step 2925 out of 65536 | Loss --> 14.319 | Loss_ntp --> 7.113 | Loss_mlm --> 7.206 | Grad_l2 --> 9.175 | Weights_l2 --> 7701.283 | Lr --> 0.001 | Seconds_per_step --> 9.604 |
+[2024-10-21 02:24:36,522][Main][INFO] - [train] Step 2950 out of 65536 | Loss --> 14.292 | Loss_ntp --> 7.112 | Loss_mlm --> 7.179 | Grad_l2 --> 10.380 | Weights_l2 --> 7701.279 | Lr --> 0.001 | Seconds_per_step --> 9.598 |
+[2024-10-21 02:28:36,510][Main][INFO] - [train] Step 2975 out of 65536 | Loss --> 14.202 | Loss_ntp --> 7.068 | Loss_mlm --> 7.134 | Grad_l2 --> 9.622 | Weights_l2 --> 7701.276 | Lr --> 0.001 | Seconds_per_step --> 9.599 |
+[2024-10-21 02:32:38,120][Main][INFO] - [train] Step 3000 out of 65536 | Loss --> 14.214 | Loss_ntp --> 7.066 | Loss_mlm --> 7.147 | Grad_l2 --> 10.228 | Weights_l2 --> 7701.272 | Lr --> 0.001 | Seconds_per_step --> 9.664 |
+[2024-10-21 02:33:06,984][Main][INFO] - [eval] Step 3000 out of 65536 | Loss --> 14.236 | Loss_ntp --> 7.111 | Loss_mlm --> 7.125 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 28.858 |
+[2024-10-21 02:33:06,988][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-3000
+[2024-10-21 02:33:07,000][accelerate.utils.other][WARNING] - Removed shared tensor {'encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
+[2024-10-21 02:33:13,140][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-3000/model.safetensors
+[2024-10-21 02:33:21,968][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-3000/optimizer.bin
+[2024-10-21 02:33:21,978][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-3000/scheduler.bin
+[2024-10-21 02:33:21,979][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-3000/sampler.bin
+[2024-10-21 02:33:21,981][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-3000/sampler_1.bin
+[2024-10-21 02:33:21,990][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-3000/random_states_0.pkl
+[2024-10-21 02:37:21,949][Main][INFO] - [train] Step 3025 out of 65536 | Loss --> 14.180 | Loss_ntp --> 7.041 | Loss_mlm --> 7.138 | Grad_l2 --> 9.928 | Weights_l2 --> 7701.268 | Lr --> 0.001 | Seconds_per_step --> 10.198 |
+[2024-10-21 02:41:23,436][Main][INFO] - [train] Step 3050 out of 65536 | Loss --> 14.163 | Loss_ntp --> 7.032 | Loss_mlm --> 7.130 | Grad_l2 --> 9.909 | Weights_l2 --> 7701.264 | Lr --> 0.001 | Seconds_per_step --> 9.659 |
+[2024-10-21 02:45:23,362][Main][INFO] - [train] Step 3075 out of 65536 | Loss --> 14.109 | Loss_ntp --> 7.016 | Loss_mlm --> 7.093 | Grad_l2 --> 10.119 | Weights_l2 --> 7701.260 | Lr --> 0.001 | Seconds_per_step --> 9.597 |
+[2024-10-21 02:49:23,828][Main][INFO] - [train] Step 3100 out of 65536 | Loss --> 14.053 | Loss_ntp --> 6.981 | Loss_mlm --> 7.072 | Grad_l2 --> 8.917 | Weights_l2 --> 7701.256 | Lr --> 0.001 | Seconds_per_step --> 9.619 |
+[2024-10-21 02:53:26,144][Main][INFO] - [train] Step 3125 out of 65536 | Loss --> 14.045 | Loss_ntp --> 6.975 | Loss_mlm --> 7.069 | Grad_l2 --> 11.184 | Weights_l2 --> 7701.252 | Lr --> 0.001 | Seconds_per_step --> 9.692 |
+[2024-10-21 02:57:25,035][Main][INFO] - [train] Step 3150 out of 65536 | Loss --> 14.006 | Loss_ntp --> 6.959 | Loss_mlm --> 7.047 | Grad_l2 --> 9.280 | Weights_l2 --> 7701.248 | Lr --> 0.001 | Seconds_per_step --> 9.555 |
+[2024-10-21 03:01:27,283][Main][INFO] - [train] Step 3175 out of 65536 | Loss --> 13.943 | Loss_ntp --> 6.924 | Loss_mlm --> 7.020 | Grad_l2 --> 8.769 | Weights_l2 --> 7701.245 | Lr --> 0.001 | Seconds_per_step --> 9.690 |
+[2024-10-21 03:05:27,701][Main][INFO] - [train] Step 3200 out of 65536 | Loss --> 13.956 | Loss_ntp --> 6.916 | Loss_mlm --> 7.040 | Grad_l2 --> 8.625 | Weights_l2 --> 7701.241 | Lr --> 0.001 | Seconds_per_step --> 9.617 |
+[2024-10-21 03:09:28,530][Main][INFO] - [train] Step 3225 out of 65536 | Loss --> 13.916 | Loss_ntp --> 6.906 | Loss_mlm --> 7.010 | Grad_l2 --> 9.378 | Weights_l2 --> 7701.238 | Lr --> 0.001 | Seconds_per_step --> 9.633 |
+[2024-10-21 03:13:28,937][Main][INFO] - [train] Step 3250 out of 65536 | Loss --> 13.849 | Loss_ntp --> 6.867 | Loss_mlm --> 6.982 | Grad_l2 --> 9.221 | Weights_l2 --> 7701.234 | Lr --> 0.001 | Seconds_per_step --> 9.616 |
+[2024-10-21 03:17:29,597][Main][INFO] - [train] Step 3275 out of 65536 | Loss --> 13.854 | Loss_ntp --> 6.869 | Loss_mlm --> 6.985 | Grad_l2 --> 8.561 | Weights_l2 --> 7701.230 | Lr --> 0.001 | Seconds_per_step --> 9.626 |
+[2024-10-21 03:21:30,034][Main][INFO] - [train] Step 3300 out of 65536 | Loss --> 13.781 | Loss_ntp --> 6.843 | Loss_mlm --> 6.938 | Grad_l2 --> 8.919 | Weights_l2 --> 7701.226 | Lr --> 0.001 | Seconds_per_step --> 9.617 |
+[2024-10-21 03:25:29,815][Main][INFO] - [train] Step 3325 out of 65536 | Loss --> 13.766 | Loss_ntp --> 6.836 | Loss_mlm --> 6.930 | Grad_l2 --> 8.129 | Weights_l2 --> 7701.223 | Lr --> 0.001 | Seconds_per_step --> 9.591 |
+[2024-10-21 03:29:30,344][Main][INFO] - [train] Step 3350 out of 65536 | Loss --> 13.726 | Loss_ntp --> 6.809 | Loss_mlm --> 6.917 | Grad_l2 --> 9.145 | Weights_l2 --> 7701.219 | Lr --> 0.001 | Seconds_per_step --> 9.620 |
+[2024-10-21 03:33:30,171][Main][INFO] - [train] Step 3375 out of 65536 | Loss --> 13.751 | Loss_ntp --> 6.819 | Loss_mlm --> 6.932 | Grad_l2 --> 11.666 | Weights_l2 --> 7701.215 | Lr --> 0.001 | Seconds_per_step --> 9.593 |
+[2024-10-21 03:37:32,111][Main][INFO] - [train] Step 3400 out of 65536 | Loss --> 13.700 | Loss_ntp --> 6.796 | Loss_mlm --> 6.905 | Grad_l2 --> 8.776 | Weights_l2 --> 7701.211 | Lr --> 0.001 | Seconds_per_step --> 9.677 |
+[2024-10-21 03:41:31,530][Main][INFO] - [train] Step 3425 out of 65536 | Loss --> 13.641 | Loss_ntp --> 6.774 | Loss_mlm --> 6.868 | Grad_l2 --> 9.206 | Weights_l2 --> 7701.207 | Lr --> 0.001 | Seconds_per_step --> 9.577 |
+[2024-10-21 03:45:33,625][Main][INFO] - [train] Step 3450 out of 65536 | Loss --> 13.588 | Loss_ntp --> 6.735 | Loss_mlm --> 6.852 | Grad_l2 --> 6.293 | Weights_l2 --> 7701.204 | Lr --> 0.001 | Seconds_per_step --> 9.684 |
+[2024-10-21 03:49:34,400][Main][INFO] - [train] Step 3475 out of 65536 | Loss --> 13.615 | Loss_ntp --> 6.748 | Loss_mlm --> 6.868 | Grad_l2 --> 9.161 | Weights_l2 --> 7701.201 | Lr --> 0.001 | Seconds_per_step --> 9.631 |
+[2024-10-21 03:53:35,824][Main][INFO] - [train] Step 3500 out of 65536 | Loss --> 13.532 | Loss_ntp --> 6.707 | Loss_mlm --> 6.825 | Grad_l2 --> 9.556 | Weights_l2 --> 7701.197 | Lr --> 0.001 | Seconds_per_step --> 9.657 |
+[2024-10-21 03:54:04,713][Main][INFO] - [eval] Step 3500 out of 65536 | Loss --> 13.912 | Loss_ntp --> 6.950 | Loss_mlm --> 6.962 | Accuracy_mlm --> 0.000 | Accuracy_ntp --> 0.000 | Accuracy --> 0.000 | Time --> 28.883 |
+[2024-10-21 03:58:05,620][Main][INFO] - [train] Step 3525 out of 65536 | Loss --> 13.463 | Loss_ntp --> 6.677 | Loss_mlm --> 6.786 | Grad_l2 --> 9.458 | Weights_l2 --> 7701.193 | Lr --> 0.001 | Seconds_per_step --> 9.636 |
+[2024-10-21 04:02:06,516][Main][INFO] - [train] Step 3550 out of 65536 | Loss --> 13.419 | Loss_ntp --> 6.654 | Loss_mlm --> 6.766 | Grad_l2 --> 9.819 | Weights_l2 --> 7701.188 | Lr --> 0.001 | Seconds_per_step --> 9.636 |
+[2024-10-21 04:06:07,229][Main][INFO] - [train] Step 3575 out of 65536 | Loss --> 13.362 | Loss_ntp --> 6.626 | Loss_mlm --> 6.736 | Grad_l2 --> 8.944 | Weights_l2 --> 7701.184 | Lr --> 0.001 | Seconds_per_step --> 9.628 |
+[2024-10-21 04:10:08,761][Main][INFO] - [train] Step 3600 out of 65536 | Loss --> 13.401 | Loss_ntp --> 6.628 | Loss_mlm --> 6.773 | Grad_l2 --> 9.904 | Weights_l2 --> 7701.180 | Lr --> 0.001 | Seconds_per_step --> 9.661 |
+[2024-10-21 04:14:09,815][Main][INFO] - [train] Step 3625 out of 65536 | Loss --> 13.361 | Loss_ntp --> 6.625 | Loss_mlm --> 6.736 | Grad_l2 --> 8.507 | Weights_l2 --> 7701.176 | Lr --> 0.001 | Seconds_per_step --> 9.642 |
+[2024-10-21 04:18:10,037][Main][INFO] - [train] Step 3650 out of 65536 | Loss --> 13.355 | Loss_ntp --> 6.614 | Loss_mlm --> 6.741 | Grad_l2 --> 9.056 | Weights_l2 --> 7701.172 | Lr --> 0.001 | Seconds_per_step --> 9.609 |
+[2024-10-21 04:22:10,677][Main][INFO] - [train] Step 3675 out of 65536 | Loss --> 13.306 | Loss_ntp --> 6.586 | Loss_mlm --> 6.720 | Grad_l2 --> 9.057 | Weights_l2 --> 7701.168 | Lr --> 0.001 | Seconds_per_step --> 9.625 |
+[2024-10-21 04:26:12,857][Main][INFO] - [train] Step 3700 out of 65536 | Loss --> 13.325 | Loss_ntp --> 6.596 | Loss_mlm --> 6.729 | Grad_l2 --> 10.732 | Weights_l2 --> 7701.163 | Lr --> 0.001 | Seconds_per_step --> 9.687 |
+[2024-10-21 04:30:11,816][Main][INFO] - [train] Step 3725 out of 65536 | Loss --> 13.239 | Loss_ntp --> 6.561 | Loss_mlm --> 6.678 | Grad_l2 --> 9.810 | Weights_l2 --> 7701.160 | Lr --> 0.001 | Seconds_per_step --> 9.558 |
+[2024-10-21 04:34:12,167][Main][INFO] - [train] Step 3750 out of 65536 | Loss --> 13.211 | Loss_ntp --> 6.534 | Loss_mlm --> 6.677 | Grad_l2 --> 10.011 | Weights_l2 --> 7701.156 | Lr --> 0.001 | Seconds_per_step --> 9.614 |
+[2024-10-21 04:38:14,046][Main][INFO] - [train] Step 3775 out of 65536 | Loss --> 13.214 | Loss_ntp --> 6.537 | Loss_mlm --> 6.678 | Grad_l2 --> 8.939 | Weights_l2 --> 7701.152 | Lr --> 0.001 | Seconds_per_step --> 9.675 |
+[2024-10-21 04:42:14,454][Main][INFO] - [train] Step 3800 out of 65536 | Loss --> 13.148 | Loss_ntp --> 6.508 | Loss_mlm --> 6.640 | Grad_l2 --> 9.513 | Weights_l2 --> 7701.148 | Lr --> 0.001 | Seconds_per_step --> 9.616 |
+[2024-10-21 04:46:14,554][Main][INFO] - [train] Step 3825 out of 65536 | Loss --> 13.172 | Loss_ntp --> 6.514 | Loss_mlm --> 6.658 | Grad_l2 --> 9.295 | Weights_l2 --> 7701.144 | Lr --> 0.001 | Seconds_per_step --> 9.604 |
+[2024-10-21 04:50:14,762][Main][INFO] - [train] Step 3850 out of 65536 | Loss --> 13.118 | Loss_ntp --> 6.494 | Loss_mlm --> 6.624 | Grad_l2 --> 7.890 | Weights_l2 --> 7701.140 | Lr --> 0.001 | Seconds_per_step --> 9.608 |
+[2024-10-21 04:54:16,032][Main][INFO] - [train] Step 3875 out of 65536 | Loss --> 13.179 | Loss_ntp --> 6.521 | Loss_mlm --> 6.657 | Grad_l2 --> 9.901 | Weights_l2 --> 7701.136 | Lr --> 0.001 | Seconds_per_step --> 9.651 |
+[2024-10-21 04:58:16,128][Main][INFO] - [train] Step 3900 out of 65536 | Loss --> 13.259 | Loss_ntp --> 6.571 | Loss_mlm --> 6.687 | Grad_l2 --> 8.910 | Weights_l2 --> 7701.132 | Lr --> 0.001 | Seconds_per_step --> 9.604 |
+[2024-10-21 05:02:17,997][Main][INFO] - [train] Step 3925 out of 65536 | Loss --> 13.323 | Loss_ntp --> 6.594 | Loss_mlm --> 6.729 | Grad_l2 -
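
The `[train]`/`[eval]` lines above follow a fixed `Key --> value` layout, so the whole log can be scraped back into records for plotting. A minimal sketch; the regex is inferred from the format shown here and is not a utility shipped with nanoT5:

```python
# Minimal sketch: parse the training log above into a list of dicts.
import re

STEP = re.compile(r"\[(?P<split>train|eval)\] Step (?P<step>\d+) out of \d+ \| (?P<rest>.*)")

def parse_log(path="wandb/run-20241020_182518-i0qk9v3k/files/output.log"):
    records = []
    for line in open(path, encoding="utf-8"):
        m = STEP.search(line)
        if m is None:
            continue  # skip warnings, checkpoint messages, progress bars
        rec = {"split": m["split"], "step": int(m["step"])}
        # Each metric is rendered as "Name --> number |"
        for key, val in re.findall(r"(\w+) --> ([\d.]+)", m["rest"]):
            rec[key.lower()] = float(val)
        records.append(rec)
    return records

# parse_log()[0] -> {'split': 'train', 'step': 25, 'loss': 155.837, ...}
```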
wandb/run-20241020_182518-i0qk9v3k/files/requirements.txt
ADDED
@@ -0,0 +1,194 @@
+sentencepiece==0.2.0
+pytz==2024.2
+pyrepl==0.9.0
+antlr4-python3-runtime==4.9.3
+xxhash==3.5.0
+wmctrl==0.5
+tzdata==2024.2
+tqdm==4.66.5
+smmap==5.0.1
+setproctitle==1.3.3
+sentry-sdk==2.17.0
+safetensors==0.4.5
+regex==2024.9.11
+pynvml==11.5.3
+pyarrow==17.0.0
+protobuf==3.20.3
+propcache==0.2.0
+omegaconf==2.3.0
+multidict==6.1.0
+joblib==1.4.2
+frozenlist==1.4.1
+fancycompleter==0.9.1
+docker-pycreds==0.4.0
+dill==0.3.8
+click==8.1.7
+aiohappyeyeballs==2.4.3
+absl-py==2.1.0
+yarl==1.15.5
+pdbpp==0.10.3
+pandas==2.2.3
+nltk==3.9.1
+multiprocess==0.70.16
+hydra-core==1.3.2
+huggingface-hub==0.26.0
+gitdb==4.0.11
+aiosignal==1.3.1
+tokenizers==0.20.1
+rouge_score==0.1.2
+GitPython==3.1.43
+aiohttp==3.10.10
+wandb==0.18.5
+transformers==4.46.0.dev0
+accelerate==1.0.1
+datasets==3.0.1
+evaluate==0.4.3
+liger_kernel==0.3.1
+entrypoints==0.4
+jupyter_client==7.4.9
+nbclassic==1.1.0
+notebook==6.5.5
+pyzmq==24.0.1
+PyYAML==6.0.2
+Send2Trash==1.8.3
+anyio==4.6.0
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beautifulsoup4==4.12.3
+bleach==6.1.0
+certifi==2024.8.30
+cffi==1.17.1
+charset-normalizer==3.3.2
+comm==0.2.2
+debugpy==1.8.5
+decorator==5.1.1
+defusedxml==0.7.1
+executing==2.1.0
+fastjsonschema==2.20.0
+fqdn==1.5.1
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.2
+idna==3.10
+ipykernel==6.29.5
+ipython==8.27.0
+ipython-genutils==0.2.0
+ipywidgets==8.1.5
+isoduration==20.11.0
+jedi==0.19.1
+json5==0.9.25
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2023.12.1
+jupyter-archive==3.4.0
+jupyter_contrib_core==0.4.2
+jupyter_contrib_nbextensions==0.7.0
+jupyter_core==5.7.2
+jupyter-events==0.10.0
+jupyter-highlight-selected-word==0.2.0
+jupyter-lsp==2.2.5
+jupyter_nbextensions_configurator==0.6.4
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.13
+lxml==5.3.0
+matplotlib-inline==0.1.7
+mistune==3.0.2
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+notebook_shim==0.2.4
+overrides==7.7.0
+packaging==24.1
+pandocfilters==1.5.1
+parso==0.8.4
+pexpect==4.9.0
+platformdirs==4.3.6
+prometheus_client==0.21.0
+prompt_toolkit==3.0.47
+psutil==6.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pycparser==2.22
+Pygments==2.18.0
+python-dateutil==2.9.0.post0
+python-json-logger==2.0.7
+referencing==0.35.1
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rpds-py==0.20.0
+sniffio==1.3.1
+soupsieve==2.6
+stack-data==0.6.3
+terminado==0.18.1
+tinycss2==1.3.0
+tornado==6.4.1
+traitlets==5.14.3
+types-python-dateutil==2.9.0.20240906
+uri-template==1.3.0
+urllib3==2.2.3
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+websocket-client==1.8.0
+widgetsnbextension==4.0.13
+Jinja2==3.1.3
+MarkupSafe==2.1.5
+filelock==3.13.1
+fsspec==2024.2.0
+mpmath==1.3.0
+networkx==3.2.1
+numpy==1.26.3
+nvidia-cublas-cu12==12.4.2.65
+nvidia-cuda-cupti-cu12==12.4.99
+nvidia-cuda-nvrtc-cu12==12.4.99
+nvidia-cuda-runtime-cu12==12.4.99
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.0.44
+nvidia-curand-cu12==10.3.5.119
+nvidia-cusolver-cu12==11.6.0.99
+nvidia-cusparse-cu12==12.3.0.142
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.4.99
+nvidia-nvtx-cu12==12.4.99
+pillow==10.2.0
+sympy==1.12
+torch==2.4.1+cu124
+torchaudio==2.4.1+cu124
+torchvision==0.19.1+cu124
+triton==3.0.0
+typing_extensions==4.9.0
+pip==24.2
+setuptools==75.1.0
+wheel==0.44.0
+PyGObject==3.42.1
+PyJWT==2.3.0
+SecretStorage==3.3.1
+blinker==1.4
+cryptography==3.4.8
+dbus-python==1.2.18
+distro==1.7.0
+httplib2==0.20.2
+importlib-metadata==4.6.4
+jeepney==0.7.1
+keyring==23.5.0
+launchpadlib==1.10.16
+lazr.restfulclient==0.14.4
+lazr.uri==1.0.6
+more-itertools==8.10.0
+oauthlib==3.2.0
+pyparsing==2.4.7
+python-apt==2.4.0+ubuntu4
+six==1.16.0
+wadllib==1.3.6
+zipp==1.0.0
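
To sanity-check a local environment against these pins, the standard library is enough. A minimal sketch; the handful of packages checked below are the ones this run exercises most, chosen here only for illustration:

```python
# Minimal sketch: compare installed versions with the pins above.
from importlib.metadata import version

for pkg in ("torch", "transformers", "accelerate", "datasets", "wandb", "liger_kernel"):
    print(f"{pkg}=={version(pkg)}")
```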
wandb/run-20241020_182518-i0qk9v3k/files/wandb-metadata.json
ADDED
@@ -0,0 +1,41 @@
+{
+  "os": "Linux-5.4.0-196-generic-x86_64-with-glibc2.35",
+  "python": "3.11.10",
+  "startedAt": "2024-10-20T18:25:18.123718Z",
+  "program": "-m nanoT5.main",
+  "git": {
+    "remote": "https://github.com/pszemraj/nanoT5.git",
+    "commit": "c9a96f3716604dae057adb04996323bd32fcc58e"
+  },
+  "email": "amazingvince@gmail.com",
+  "root": "/workspace/nanoT5/logs/2024-10-20/18-25-17",
+  "host": "2c2cdba3fdca",
+  "username": "root",
+  "executable": "/usr/bin/python",
+  "cpu_count": 48,
+  "cpu_count_logical": 96,
+  "gpu": "NVIDIA A40",
+  "gpu_count": 1,
+  "disk": {
+    "/": {
+      "total": "53687091200",
+      "used": "584318976"
+    }
+  },
+  "memory": {
+    "total": "540662628352"
+  },
+  "cpu": {
+    "count": 48,
+    "countLogical": 96
+  },
+  "gpu_nvidia": [
+    {
+      "name": "NVIDIA A40",
+      "memoryTotal": "48305799168",
+      "cudaCores": 10752,
+      "architecture": "Ampere"
+    }
+  ],
+  "cudaVersion": "12.4"
+}
wandb/run-20241020_182518-i0qk9v3k/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
+{"_timestamp":1.7297368335104377e+09,"eval/accuracy_mlm":0.06220587782972441,"eval/accuracy":0.031102938914862203,"train/seconds_per_step":9.632421855926514,"train/weights_l2":7697.9996299951235,"eval/loss_ntp":0.14020523258785564,"train/grad_l2":30.740209579467773,"_step":29525,"eval/time":28.52281665802002,"train/loss":3.5201581421494486,"eval/loss":4.0174038550985145,"train/loss_ntp":0.14717130114790053,"_runtime":288147.130780562,"_wandb":{"runtime":288147},"train/loss_mlm":3.3729868379235266,"eval/loss_mlm":3.877198635123846,"eval/accuracy_ntp":0,"train/lr":0.0006538872513957004}
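
The summary file is a single JSON object holding the last logged value of every metric, so the end-of-run numbers can be pulled out with the standard library alone; a minimal sketch:

```python
# Minimal sketch: read the final metrics recorded for this run.
import json

with open("wandb/run-20241020_182518-i0qk9v3k/files/wandb-summary.json") as f:
    summary = json.load(f)

print(summary["_step"])              # 29525
print(summary["eval/loss"])          # ~4.017
print(summary["eval/accuracy_mlm"])  # ~0.062
```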
wandb/run-20241020_182518-i0qk9v3k/logs/debug-core.log
ADDED
@@ -0,0 +1,14 @@
+{"time":"2024-10-20T18:25:17.702790895Z","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmptqsfbw3z/port-4102.txt","pid":4102,"debug":false,"disable-analytics":false}
+{"time":"2024-10-20T18:25:17.702844368Z","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
+{"time":"2024-10-20T18:25:17.703798232Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":4102}
+{"time":"2024-10-20T18:25:17.703819799Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":42971,"Zone":""}}
+{"time":"2024-10-20T18:25:17.871771876Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:58724"}
+{"time":"2024-10-20T18:25:18.127264011Z","level":"INFO","msg":"handleInformInit: received","streamId":"i0qk9v3k","id":"127.0.0.1:58724"}
+{"time":"2024-10-20T18:25:18.24781459Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"i0qk9v3k","id":"127.0.0.1:58724"}
+{"time":"2024-10-24T02:27:45.25401773Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:58724"}
+{"time":"2024-10-24T02:27:45.254421548Z","level":"INFO","msg":"server is shutting down"}
+{"time":"2024-10-24T02:27:45.254423515Z","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:58724"}
+{"time":"2024-10-24T02:27:45.254678436Z","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:58724"}
+{"time":"2024-10-24T02:27:45.867647487Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:58724"}
+{"time":"2024-10-24T02:27:45.867719893Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:58724"}
+{"time":"2024-10-24T02:27:45.86773086Z","level":"INFO","msg":"server is closed"}
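The first and last timestamps in debug-core.log bracket the whole run, so their difference should agree with the _runtime field in the summary above. A minimal sketch of that cross-check; the fractional seconds are truncated to microseconds so that strptime can parse them:

    from datetime import datetime, timezone

    def parse(ts: str) -> datetime:
        # wandb logs nanosecond precision; keep only the first 6 fractional digits
        head, frac = ts.rstrip("Z").split(".")
        dt = datetime.strptime(f"{head}.{frac[:6]}", "%Y-%m-%dT%H:%M:%S.%f")
        return dt.replace(tzinfo=timezone.utc)

    start = parse("2024-10-20T18:25:17.702790895Z")
    end = parse("2024-10-24T02:27:45.86773086Z")
    print((end - start).total_seconds())  # ~288148 s, within a second of _runtime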
wandb/run-20241020_182518-i0qk9v3k/logs/debug-internal.log
ADDED
@@ -0,0 +1,8 @@
+{"time":"2024-10-20T18:25:18.130390854Z","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-10-20T18:25:18.131160825Z","level":"INFO","msg":"created symlink","path":"/workspace/nanoT5/logs/2024-10-20/18-25-17/wandb/run-20241020_182518-i0qk9v3k/logs/debug-core.log"}
+{"time":"2024-10-20T18:25:18.247302473Z","level":"INFO","msg":"created new stream","id":"i0qk9v3k"}
+{"time":"2024-10-20T18:25:18.247577857Z","level":"INFO","msg":"stream: started","id":"i0qk9v3k"}
+{"time":"2024-10-20T18:25:18.247668586Z","level":"INFO","msg":"handler: started","stream_id":{"value":"i0qk9v3k"}}
+{"time":"2024-10-20T18:25:18.247659857Z","level":"INFO","msg":"sender: started","stream_id":"i0qk9v3k"}
+{"time":"2024-10-20T18:25:18.247631762Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"i0qk9v3k"}}
+{"time":"2024-10-20T18:25:19.59293904Z","level":"INFO","msg":"Starting system monitor"}
wandb/run-20241020_182518-i0qk9v3k/logs/debug.log
ADDED
@@ -0,0 +1,28 @@
+2024-10-20 18:25:18,064 INFO MainThread:4102 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-10-20 18:25:18,064 INFO MainThread:4102 [wandb_setup.py:_flush():79] Configure stats pid to 4102
+2024-10-20 18:25:18,065 INFO MainThread:4102 [wandb_setup.py:_flush():79] Loading settings from /root/.config/wandb/settings
+2024-10-20 18:25:18,065 INFO MainThread:4102 [wandb_setup.py:_flush():79] Loading settings from /workspace/nanoT5/logs/2024-10-20/18-25-17/wandb/settings
+2024-10-20 18:25:18,066 INFO MainThread:4102 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-10-20 18:25:18,066 INFO MainThread:4102 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
+2024-10-20 18:25:18,067 WARNING MainThread:4102 [wandb_setup.py:_flush():79] Could not find program at -m nanoT5.main
+2024-10-20 18:25:18,067 INFO MainThread:4102 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m nanoT5.main'}
+2024-10-20 18:25:18,068 INFO MainThread:4102 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-10-20 18:25:18,069 INFO MainThread:4102 [wandb_init.py:_log_setup():534] Logging user logs to /workspace/nanoT5/logs/2024-10-20/18-25-17/wandb/run-20241020_182518-i0qk9v3k/logs/debug.log
+2024-10-20 18:25:18,071 INFO MainThread:4102 [wandb_init.py:_log_setup():535] Logging internal logs to /workspace/nanoT5/logs/2024-10-20/18-25-17/wandb/run-20241020_182518-i0qk9v3k/logs/debug-internal.log
+2024-10-20 18:25:18,071 INFO MainThread:4102 [wandb_init.py:init():621] calling init triggers
+2024-10-20 18:25:18,072 INFO MainThread:4102 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 93789, 'tokenizer': {'name': 'BEE-spoke-data/hf_slimpajama-6B-28672-BPE-forT5'}, 'working_dir': '/workspace/nanoT5/logs/2024-10-20/18-25-17', 'model': {'liger': True, 'klass': 'local_t5', 'name': 'pszemraj/tFINE-850m-24x24-1024ctx', 'overwrite': {'dropout_rate': 0.0, 'num_decoder_layers': 16, 'num_key_value_heads': 4, 'num_layers': 16, 'use_gqa': True}, 'add_config': {'is_bf16': True}, 'checkpoint_path': '', 'random_init': True, 'compile': True}, 'data': {'multi_task': True, 'NTP': 0.3, 'input_length': 512, 'max_seq_len': 512, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 0}, 'optim': {'name': 'adamwscale', 'base_lr': 0.001, 'batch_size': 128, 'total_steps': 65536, 'epochs': -1, 'warmup_steps': 5000, 'lr_scheduler': 'cosine', 'weight_decay': 0.01, 'grad_clip': 1.0, 'grad_acc': 16, 'final_cosine': 2e-05}, 'eval': {'every_steps': 500, 'steps': 0}, 'checkpoint': {'every_steps': 1500}, 'logging': {'every_steps': 25, 'grad_l2': True, 'weights_l2': True, 'use_wandb': True, 'wandb_config': {'project': 'nanoT5', 'entity': 'amazingvince', 'tags': ['gqa', 'large', 'e32-d16', '512 ctx'], 'mode': 'online'}}, 'slurm_id': 'none'}
+2024-10-20 18:25:18,073 INFO MainThread:4102 [wandb_init.py:init():671] starting backend
+2024-10-20 18:25:18,074 INFO MainThread:4102 [wandb_init.py:init():675] sending inform_init request
+2024-10-20 18:25:18,121 INFO MainThread:4102 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-10-20 18:25:18,122 INFO MainThread:4102 [wandb_init.py:init():688] backend started and connected
+2024-10-20 18:25:18,198 INFO MainThread:4102 [wandb_init.py:init():783] updated telemetry
+2024-10-20 18:25:18,256 INFO MainThread:4102 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-10-20 18:25:19,558 INFO MainThread:4102 [wandb_init.py:init():867] starting run threads in backend
+2024-10-20 18:25:19,755 INFO MainThread:4102 [wandb_run.py:_console_start():2463] atexit reg
+2024-10-20 18:25:19,756 INFO MainThread:4102 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-10-20 18:25:19,757 INFO MainThread:4102 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-10-20 18:25:19,759 INFO MainThread:4102 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-10-20 18:25:19,763 INFO MainThread:4102 [wandb_init.py:init():911] run started, returning control to user process
+2024-10-20 18:25:41,763 INFO MainThread:4102 [wandb_run.py:_config_callback():1390] config_cb None None {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 93789, 'tokenizer': {'name': 'BEE-spoke-data/hf_slimpajama-6B-28672-BPE-forT5'}, 'working_dir': '/workspace/nanoT5/logs/2024-10-20/18-25-17', 'model': {'liger': True, 'klass': 'local_t5', 'name': 'pszemraj/tFINE-850m-24x24-1024ctx', 'overwrite': {'dropout_rate': 0.0, 'num_decoder_layers': 16, 'num_key_value_heads': 4, 'num_layers': 16, 'use_gqa': True}, 'add_config': {'is_bf16': True}, 'checkpoint_path': '', 'random_init': True, 'compile': True}, 'data': {'multi_task': True, 'NTP': 0.3, 'input_length': 512, 'max_seq_len': 512, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 0, 'before_mask_input_length': 568, 'target_length': 114}, 'optim': {'name': 'adamwscale', 'base_lr': 0.001, 'batch_size': 128, 'total_steps': 65536, 'epochs': -1, 'warmup_steps': 5000, 'lr_scheduler': 'cosine', 'weight_decay': 0.01, 'grad_clip': 1.0, 'grad_acc': 16, 'final_cosine': 2e-05}, 'eval': {'every_steps': 500, 'steps': 0, 'corrected_steps': 0}, 'checkpoint': {'every_steps': 1500}, 'logging': {'every_steps': 25, 'grad_l2': True, 'weights_l2': True, 'use_wandb': True, 'wandb_config': {'project': 'nanoT5', 'entity': 'amazingvince', 'tags': ['gqa', 'large', 'e32-d16', '512 ctx'], 'mode': 'online'}}, 'slurm_id': 'none', 'n_all_param': 486886912}
+2024-10-24 02:27:45,254 WARNING MsgRouterThr:4102 [router.py:message_loop():77] message_loop has been closed
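Two fields in the final config callback were not in the launch config and are derived at runtime: before_mask_input_length: 568 and target_length: 114. They follow from the standard T5 span-corruption arithmetic applied to input_length=512, mlm_probability=0.15 and mean_noise_span_length=3.0: the collator needs 568 raw tokens so that, after ~15% of them are collapsed into one sentinel per noise span, the encoder input is exactly 512 tokens. A minimal sketch of that computation (it mirrors the usual compute_input_and_target_lengths helper from T5-style pretraining code, not necessarily nanoT5's exact source):

    def compute_input_and_target_lengths(inputs_length, noise_density, mean_noise_span_length):
        def lengths(tokens_length):
            num_noise = int(round(tokens_length * noise_density))
            num_spans = int(round(num_noise / mean_noise_span_length))
            # encoder input: non-noise tokens + one sentinel per span + EOS
            # decoder target: noise tokens + one sentinel per span + EOS
            return (tokens_length - num_noise + num_spans + 1,
                    num_noise + num_spans + 1)

        tokens_length = inputs_length
        # grow the raw length until the masked input would exceed the budget
        while lengths(tokens_length + 1)[0] <= inputs_length:
            tokens_length += 1
        return tokens_length, lengths(tokens_length)[1]

    print(compute_input_and_target_lengths(512, 0.15, 3.0))  # (568, 114)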
wandb/run-20241020_182518-i0qk9v3k/run-i0qk9v3k.wandb
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a7c6447c6a15329124db78b2c333583d47f717079c22331f7e91b92b357127a
+size 152734877
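The .wandb run file itself is stored through Git LFS, so the diff shows only the pointer: a SHA-256 object id and a size of 152,734,877 bytes (~146 MiB). In a clone, git lfs pull fetches the real content; from Python, a minimal sketch with huggingface_hub, where REPO_ID is a placeholder for whichever repository this commit was pushed to:

    from huggingface_hub import hf_hub_download

    local_path = hf_hub_download(
        repo_id="REPO_ID",  # placeholder: the repo this folder was uploaded to
        filename="wandb/run-20241020_182518-i0qk9v3k/run-i0qk9v3k.wandb",
    )
    print(local_path)  # cached local copy of the ~146 MiB wandb run file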