Upload folder using huggingface_hub
- .gitattributes +1 -0
- checkpoints/.hydra/config.yaml +50 -0
- checkpoints/.hydra/hydra.yaml +154 -0
- checkpoints/.hydra/overrides.yaml +1 -0
- checkpoints/checkpoint-pt-10000/model.safetensors +3 -0
- checkpoints/checkpoint-pt-10000/random_states_0.pkl +3 -0
- checkpoints/checkpoint-pt-12500/model.safetensors +3 -0
- checkpoints/checkpoint-pt-12500/random_states_0.pkl +3 -0
- checkpoints/checkpoint-pt-15000/model.safetensors +3 -0
- checkpoints/checkpoint-pt-15000/random_states_0.pkl +3 -0
- checkpoints/checkpoint-pt-2500/model.safetensors +3 -0
- checkpoints/checkpoint-pt-2500/random_states_0.pkl +3 -0
- checkpoints/checkpoint-pt-5000/model.safetensors +3 -0
- checkpoints/checkpoint-pt-5000/random_states_0.pkl +3 -0
- checkpoints/checkpoint-pt-7500/model.safetensors +3 -0
- checkpoints/checkpoint-pt-7500/random_states_0.pkl +3 -0
- checkpoints/config.json +32 -0
- checkpoints/main.log +0 -0
- checkpoints/wandb/debug-internal.log +0 -0
- checkpoints/wandb/debug.log +27 -0
- checkpoints/wandb/run-20240830_195924-mao0tqjy/files/config.yaml +132 -0
- checkpoints/wandb/run-20240830_195924-mao0tqjy/files/diff.patch +163 -0
- checkpoints/wandb/run-20240830_195924-mao0tqjy/files/output.log +0 -0
- checkpoints/wandb/run-20240830_195924-mao0tqjy/files/requirements.txt +200 -0
- checkpoints/wandb/run-20240830_195924-mao0tqjy/files/wandb-metadata.json +547 -0
- checkpoints/wandb/run-20240830_195924-mao0tqjy/files/wandb-summary.json +1 -0
- checkpoints/wandb/run-20240830_195924-mao0tqjy/logs/debug-internal.log +0 -0
- checkpoints/wandb/run-20240830_195924-mao0tqjy/logs/debug.log +27 -0
- checkpoints/wandb/run-20240830_195924-mao0tqjy/run-mao0tqjy.wandb +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoints/wandb/run-20240830_195924-mao0tqjy/run-mao0tqjy.wandb filter=lfs diff=lfs merge=lfs -text
checkpoints/.hydra/config.yaml
ADDED
@@ -0,0 +1,50 @@
+mode: pt
+device: gpu
+precision: bf16
+eval_only: false
+predict_only: false
+seed: 34534
+model:
+  klass: hf_t5
+  name: pszemraj/tFINE-900m-e16-d32
+  overwrite:
+    dropout_rate: 0.0
+  checkpoint_path: ''
+  random_init: false
+  compile: true
+tokenizer:
+  name: BEE-spoke-data/slimpajama_tok-48128-BPE-forT5
+data:
+  input_length: 1024
+  mlm_probability: 0.15
+  mean_noise_span_length: 3.0
+  num_workers: 16
+optim:
+  name: adamwscale
+  base_lr: 0.01
+  batch_size: 128
+  total_steps: 20000
+  epochs: -1
+  warmup_steps: 5000
+  lr_scheduler: cosine
+  weight_decay: 0.0001
+  grad_clip: 1.0
+  grad_acc: 8
+  final_cosine: 2.0e-05
+eval:
+  every_steps: 1000000000
+  steps: 500
+checkpoint:
+  every_steps: 2500
+logging:
+  use_wandb: true
+  wandb_config:
+    project: nanoT5
+    entity: pszemraj
+    tags:
+    - 900m
+    - '1024'
+    mode: online
+  every_steps: 25
+  grad_l2: true
+  weights_l2: true
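For reference (not part of the commit): .hydra/config.yaml is the fully resolved Hydra config for this run, so it can be reloaded directly with OmegaConf to inspect or reproduce the settings. A minimal sketch, assuming omegaconf is installed and the repo is checked out locally:

    from omegaconf import OmegaConf

    # Load the resolved config that Hydra wrote alongside the run outputs.
    cfg = OmegaConf.load("checkpoints/.hydra/config.yaml")

    print(cfg.model.name)      # pszemraj/tFINE-900m-e16-d32
    print(cfg.tokenizer.name)  # BEE-spoke-data/slimpajama_tok-48128-BPE-forT5
    print(cfg.optim.total_steps, cfg.optim.warmup_steps)  # 20000 5000

    # Command-line style overrides merge the same way Hydra applies them:
    cfg = OmegaConf.merge(cfg, OmegaConf.from_dotlist(["optim.base_lr=0.005"]))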
checkpoints/.hydra/hydra.yaml
ADDED
@@ -0,0 +1,154 @@
+hydra:
+  run:
+    dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+  sweep:
+    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    subdir: ${hydra.job.num}
+  launcher:
+    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+  sweeper:
+    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+    max_batch_size: null
+    params: null
+  help:
+    app_name: ${hydra.job.name}
+    header: '${hydra.help.app_name} is powered by Hydra.
+
+      '
+    footer: 'Powered by Hydra (https://hydra.cc)
+
+      Use --hydra-help to view Hydra specific help
+
+      '
+    template: '${hydra.help.header}
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (group=option)
+
+
+      $APP_CONFIG_GROUPS
+
+
+      == Config ==
+
+      Override anything in the config (foo.bar=value)
+
+
+      $CONFIG
+
+
+      ${hydra.help.footer}
+
+      '
+  hydra_help:
+    template: 'Hydra (${hydra.runtime.version})
+
+      See https://hydra.cc for more info.
+
+
+      == Flags ==
+
+      $FLAGS_HELP
+
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+
+      to command line)
+
+
+      $HYDRA_CONFIG_GROUPS
+
+
+      Use ''--cfg hydra'' to Show the Hydra config.
+
+      '
+    hydra_help: ???
+  hydra_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][HYDRA] %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+    root:
+      level: INFO
+      handlers:
+      - console
+    loggers:
+      logging_example:
+        level: DEBUG
+    disable_existing_loggers: false
+  job_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+      file:
+        class: logging.FileHandler
+        formatter: simple
+        filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+    root:
+      level: INFO
+      handlers:
+      - console
+      - file
+    disable_existing_loggers: false
+  env: {}
+  mode: RUN
+  searchpath: []
+  callbacks: {}
+  output_subdir: .hydra
+  overrides:
+    hydra:
+    - hydra.mode=RUN
+    task: []
+  job:
+    name: main
+    chdir: null
+    override_dirname: ''
+    id: ???
+    num: ???
+    config_name: default
+    env_set: {}
+    env_copy: []
+    config:
+      override_dirname:
+        kv_sep: '='
+        item_sep: ','
+        exclude_keys: []
+  runtime:
+    version: 1.3.2
+    version_base: '1.1'
+    cwd: /workspace/nanoT5
+    config_sources:
+    - path: hydra.conf
+      schema: pkg
+      provider: hydra
+    - path: /workspace/nanoT5/nanoT5/configs
+      schema: file
+      provider: main
+    - path: ''
+      schema: structured
+      provider: schema
+    output_dir: /workspace/nanoT5/outputs/2024-08-30/19-59-22
+    choices:
+      hydra/env: default
+      hydra/callbacks: null
+      hydra/job_logging: default
+      hydra/hydra_logging: default
+      hydra/hydra_help: default
+      hydra/help: default
+      hydra/sweeper: basic
+      hydra/launcher: basic
+      hydra/output: default
+  verbose: false
checkpoints/.hydra/overrides.yaml
ADDED
@@ -0,0 +1 @@
+[]
checkpoints/checkpoint-pt-10000/model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b04238335d2e95c6bfa1c92a501bef0bf99434e8e3475d41216cdc74d3d7a76
+size 3550041880
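Each model.safetensors entry in this commit is a git-lfs pointer to a ~3.55 GB shard (note that all six random_states_0.pkl pointers share one sha256 oid, i.e. identical RNG-state files). Once the repo is cloned with git-lfs so the real weights are present, a checkpoint can be loaded roughly along these lines; a sketch, assuming transformers and safetensors are installed, and state-dict keys may need un-prefixing if the model was saved wrapped or compiled:

    from safetensors.torch import load_file
    from transformers import T5Config, T5ForConditionalGeneration

    # Rebuild the architecture from the committed config, then load one
    # intermediate pre-training checkpoint into it.
    config = T5Config.from_json_file("checkpoints/config.json")
    model = T5ForConditionalGeneration(config)

    state_dict = load_file("checkpoints/checkpoint-pt-10000/model.safetensors")
    missing, unexpected = model.load_state_dict(state_dict, strict=False)
    print(f"missing: {len(missing)}  unexpected: {len(unexpected)}")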
checkpoints/checkpoint-pt-10000/random_states_0.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3b2ce7e08081f282216ae6d35b3e3e08ec13a6874529efce924280e2d044c09
+size 14344
checkpoints/checkpoint-pt-12500/model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f034c61cb3eac83e1c6a7ea881e34d255ffba59e0ca7746df690bba59229a687
+size 3550041880
checkpoints/checkpoint-pt-12500/random_states_0.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3b2ce7e08081f282216ae6d35b3e3e08ec13a6874529efce924280e2d044c09
+size 14344
checkpoints/checkpoint-pt-15000/model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f5cd29ef789593d59329578d0b2f454238da7789c2b5a9bee1c3d139c64a5e2
+size 3550041880
checkpoints/checkpoint-pt-15000/random_states_0.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3b2ce7e08081f282216ae6d35b3e3e08ec13a6874529efce924280e2d044c09
+size 14344
checkpoints/checkpoint-pt-2500/model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e6fb5ed5e9042405a8d4d053759f532abe0d167456785d277c71e18fa74c29a4
+size 3550041880
checkpoints/checkpoint-pt-2500/random_states_0.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3b2ce7e08081f282216ae6d35b3e3e08ec13a6874529efce924280e2d044c09
+size 14344
checkpoints/checkpoint-pt-5000/model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:48e5e3d34b17f5fb15734e0dc5f17878d3cf58a9b748b31341c993f3e5e94f3e
+size 3550041880
checkpoints/checkpoint-pt-5000/random_states_0.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3b2ce7e08081f282216ae6d35b3e3e08ec13a6874529efce924280e2d044c09
+size 14344
checkpoints/checkpoint-pt-7500/model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c720f1fc7d3a669689be8dcae3d1137518b37665abda21edf4b547a0e7b1abe4
+size 3550041880
checkpoints/checkpoint-pt-7500/random_states_0.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3b2ce7e08081f282216ae6d35b3e3e08ec13a6874529efce924280e2d044c09
+size 14344
checkpoints/config.json
ADDED
@@ -0,0 +1,32 @@
+{
+  "_name_or_path": "pszemraj/tFINE-900m-e16-d32",
+  "architectures": [
+    "T5ForConditionalGeneration"
+  ],
+  "classifier_dropout": 0.0,
+  "d_ff": 3072,
+  "d_kv": 64,
+  "d_model": 1024,
+  "decoder_start_token_id": 3,
+  "dense_act_fn": "silu",
+  "dropout_rate": 0.0,
+  "eos_token_id": 2,
+  "feed_forward_proj": "gated-silu",
+  "initializer_factor": 1.0,
+  "is_bf16": true,
+  "is_encoder_decoder": false,
+  "is_gated_act": true,
+  "layer_norm_epsilon": 1e-06,
+  "model_type": "t5",
+  "num_decoder_layers": 32,
+  "num_heads": 16,
+  "num_layers": 16,
+  "output_past": true,
+  "pad_token_id": 3,
+  "relative_attention_max_distance": 128,
+  "relative_attention_num_buckets": 48,
+  "tie_word_embeddings": false,
+  "transformers_version": "4.44.2",
+  "use_cache": true,
+  "vocab_size": 48256
+}
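The config describes an asymmetric T5 variant: 16 encoder layers and 32 decoder layers (the "e16-d32" in the model name), d_model 1024, gated-SiLU feed-forward, untied embeddings, and a 48256-token vocab. A quick sketch (assuming transformers is installed) that instantiates it and checks the size against the n_all_param value this run logs to wandb:

    from transformers import T5Config, T5ForConditionalGeneration

    config = T5Config.from_json_file("checkpoints/config.json")
    model = T5ForConditionalGeneration(config)

    n_params = sum(p.numel() for p in model.parameters())
    print(f"{n_params:,}")  # expected to match n_all_param = 887,492,096 (~0.9B)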
checkpoints/main.log
ADDED
The diff for this file is too large to render. See raw diff.
checkpoints/wandb/debug-internal.log
ADDED
The diff for this file is too large to render. See raw diff.
checkpoints/wandb/debug.log
ADDED
@@ -0,0 +1,27 @@
+2024-08-30 19:59:24,178 INFO MainThread:29052 [wandb_setup.py:_flush():77] Current SDK version is 0.17.8
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Configure stats pid to 29052
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Loading settings from /workspace/nanoT5/outputs/2024-08-30/19-59-22/wandb/settings
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Applying setup settings: {'_disable_service': False}
+2024-08-30 19:59:24,179 WARNING MainThread:29052 [wandb_setup.py:_flush():77] Could not find program at -m nanoT5.main
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m nanoT5.main'}
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Applying login settings: {}
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:_log_setup():524] Logging user logs to /workspace/nanoT5/outputs/2024-08-30/19-59-22/wandb/run-20240830_195924-mao0tqjy/logs/debug.log
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:_log_setup():525] Logging internal logs to /workspace/nanoT5/outputs/2024-08-30/19-59-22/wandb/run-20240830_195924-mao0tqjy/logs/debug-internal.log
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:init():607] calling init triggers
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:init():614] wandb.init called with sweep_config: {}
+config: {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 34534, 'model': {'klass': 'hf_t5', 'name': 'pszemraj/tFINE-900m-e16-d32', 'overwrite': {'dropout_rate': 0.0}, 'checkpoint_path': '', 'random_init': False, 'compile': True}, 'tokenizer': {'name': 'BEE-spoke-data/slimpajama_tok-48128-BPE-forT5'}, 'data': {'input_length': 1024, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 16}, 'optim': {'name': 'adamwscale', 'base_lr': 0.01, 'batch_size': 128, 'total_steps': 20000, 'epochs': -1, 'warmup_steps': 5000, 'lr_scheduler': 'cosine', 'weight_decay': 0.0001, 'grad_clip': 1.0, 'grad_acc': 8, 'final_cosine': 2e-05}, 'eval': {'every_steps': 1000000000, 'steps': 500}, 'checkpoint': {'every_steps': 2500}, 'logging': {'use_wandb': True, 'wandb_config': {'project': 'nanoT5', 'entity': 'pszemraj', 'tags': ['900m', '1024'], 'mode': 'online'}, 'every_steps': 25, 'grad_l2': True, 'weights_l2': True}, 'slurm_id': 'none', 'working_dir': '/workspace/nanoT5/outputs/2024-08-30/19-59-22'}
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:init():657] starting backend
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:init():661] setting up manager
+2024-08-30 19:59:24,185 INFO MainThread:29052 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-08-30 19:59:24,187 INFO MainThread:29052 [wandb_init.py:init():669] backend started and connected
+2024-08-30 19:59:24,192 INFO MainThread:29052 [wandb_init.py:init():767] updated telemetry
+2024-08-30 19:59:24,198 INFO MainThread:29052 [wandb_init.py:init():800] communicating run to backend with 90.0 second timeout
+2024-08-30 19:59:24,583 INFO MainThread:29052 [wandb_init.py:init():851] starting run threads in backend
+2024-08-30 19:59:24,814 INFO MainThread:29052 [wandb_run.py:_console_start():2463] atexit reg
+2024-08-30 19:59:24,814 INFO MainThread:29052 [wandb_run.py:_redirect():2309] redirect: wrap_raw
+2024-08-30 19:59:24,814 INFO MainThread:29052 [wandb_run.py:_redirect():2374] Wrapping output streams.
+2024-08-30 19:59:24,815 INFO MainThread:29052 [wandb_run.py:_redirect():2399] Redirects installed.
+2024-08-30 19:59:24,818 INFO MainThread:29052 [wandb_init.py:init():894] run started, returning control to user process
+2024-08-30 19:59:44,796 INFO MainThread:29052 [wandb_run.py:_config_callback():1392] config_cb None None {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 34534, 'model': {'klass': 'hf_t5', 'name': 'pszemraj/tFINE-900m-e16-d32', 'overwrite': {'dropout_rate': 0.0}, 'checkpoint_path': '', 'random_init': False, 'compile': True}, 'tokenizer': {'name': 'BEE-spoke-data/slimpajama_tok-48128-BPE-forT5'}, 'data': {'input_length': 1024, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 16, 'before_mask_input_length': 1137, 'target_length': 229}, 'optim': {'name': 'adamwscale', 'base_lr': 0.01, 'batch_size': 128, 'total_steps': 20000, 'epochs': -1, 'warmup_steps': 5000, 'lr_scheduler': 'cosine', 'weight_decay': 0.0001, 'grad_clip': 1.0, 'grad_acc': 8, 'final_cosine': 2e-05}, 'eval': {'every_steps': 1000000000, 'steps': 500, 'corrected_steps': 500}, 'checkpoint': {'every_steps': 2500}, 'logging': {'use_wandb': True, 'wandb_config': {'project': 'nanoT5', 'entity': 'pszemraj', 'tags': ['900m', '1024'], 'mode': 'online'}, 'every_steps': 25, 'grad_l2': True, 'weights_l2': True}, 'slurm_id': 'none', 'working_dir': '/workspace/nanoT5/outputs/2024-08-30/19-59-22', 'n_all_param': 887492096}
checkpoints/wandb/run-20240830_195924-mao0tqjy/files/config.yaml
ADDED
@@ -0,0 +1,132 @@
+wandb_version: 1
+
+mode:
+  desc: null
+  value: pt
+device:
+  desc: null
+  value: gpu
+precision:
+  desc: null
+  value: bf16
+eval_only:
+  desc: null
+  value: false
+predict_only:
+  desc: null
+  value: false
+seed:
+  desc: null
+  value: 34534
+model:
+  desc: null
+  value:
+    klass: hf_t5
+    name: pszemraj/tFINE-900m-e16-d32
+    overwrite:
+      dropout_rate: 0.0
+    checkpoint_path: ''
+    random_init: false
+    compile: true
+tokenizer:
+  desc: null
+  value:
+    name: BEE-spoke-data/slimpajama_tok-48128-BPE-forT5
+data:
+  desc: null
+  value:
+    input_length: 1024
+    mlm_probability: 0.15
+    mean_noise_span_length: 3.0
+    num_workers: 16
+    before_mask_input_length: 1137
+    target_length: 229
+optim:
+  desc: null
+  value:
+    name: adamwscale
+    base_lr: 0.01
+    batch_size: 128
+    total_steps: 20000
+    epochs: -1
+    warmup_steps: 5000
+    lr_scheduler: cosine
+    weight_decay: 0.0001
+    grad_clip: 1.0
+    grad_acc: 8
+    final_cosine: 2.0e-05
+eval:
+  desc: null
+  value:
+    every_steps: 1000000000
+    steps: 500
+    corrected_steps: 500
+checkpoint:
+  desc: null
+  value:
+    every_steps: 2500
+logging:
+  desc: null
+  value:
+    use_wandb: true
+    wandb_config:
+      project: nanoT5
+      entity: pszemraj
+      tags:
+      - 900m
+      - '1024'
+      mode: online
+    every_steps: 25
+    grad_l2: true
+    weights_l2: true
+slurm_id:
+  desc: null
+  value: none
+working_dir:
+  desc: null
+  value: /workspace/nanoT5/outputs/2024-08-30/19-59-22
+_wandb:
+  desc: null
+  value:
+    python_version: 3.11.9
+    cli_version: 0.17.8
+    framework: huggingface
+    huggingface_version: 4.44.2
+    is_jupyter_run: false
+    is_kaggle_kernel: false
+    start_time: 1725047964
+    t:
+      1:
+      - 1
+      - 11
+      - 41
+      - 49
+      - 50
+      - 51
+      - 55
+      - 71
+      - 100
+      2:
+      - 1
+      - 11
+      - 41
+      - 49
+      - 50
+      - 51
+      - 55
+      - 71
+      - 100
+      3:
+      - 15
+      - 16
+      - 23
+      - 61
+      4: 3.11.9
+      5: 0.17.8
+      6: 4.44.2
+      8:
+      - 5
+      13: linux-x86_64
+n_all_param:
+  desc: null
+  value: 887492096
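Two derived fields appear here that are absent from the static config: before_mask_input_length: 1137 and target_length: 229. They follow from the standard T5 span-corruption length calculation, given input_length 1024, mlm_probability 0.15, and mean_noise_span_length 3.0. A sketch of that calculation, modeled on the compute_input_and_target_lengths helper from HF's T5 MLM example (which nanoT5 adapts; assumed here, not shown in this commit):

    def span_corruption_lengths(inputs_length, noise_density, mean_span_length):
        # Find the raw ("before mask") length whose corrupted encoder input
        # fits exactly in `inputs_length` tokens.
        def helper(tokens_length):
            num_noise_tokens = max(int(round(tokens_length * noise_density)), 1)
            num_spans = max(int(round(num_noise_tokens / mean_span_length)), 1)
            num_nonnoise_tokens = tokens_length - num_noise_tokens
            # inputs: non-noise tokens + one sentinel per span + EOS
            # targets: noise tokens + one sentinel per span + EOS
            return num_nonnoise_tokens + num_spans + 1, num_noise_tokens + num_spans + 1

        tokens_length = inputs_length
        while helper(tokens_length + 1)[0] <= inputs_length:
            tokens_length += 1
        return tokens_length, helper(tokens_length)[1]

    print(span_corruption_lengths(1024, 0.15, 3.0))  # -> (1137, 229)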
checkpoints/wandb/run-20240830_195924-mao0tqjy/files/diff.patch
ADDED
@@ -0,0 +1,163 @@
+diff --git a/nanoT5/configs/default.yaml b/nanoT5/configs/default.yaml
+index 6e10cc4..240ff3d 100644
+--- a/nanoT5/configs/default.yaml
++++ b/nanoT5/configs/default.yaml
+@@ -1,65 +1,51 @@
+-defaults:
+-  - _self_
+-  - task: pt
+-  - local_env: default
+-
+-# Experiment args
+-mode: 'pt'
++mode: pt
+ device: gpu
+-precision: 'bf16'
++precision: bf16
+ eval_only: false
+ predict_only: false
+-seed: 2137
++seed: 34534
+ 
+ model:
+-  klass: local_t5
+-  name: 'google/t5-v1_1-base'
+-  overwrite: # overwrite config with these values
+-    dropout_rate: 0.0
+-  add_config: # add these values to the config
+-    is_bf16: false
+-  checkpoint_path: ''
+-  random_init: true
+-  compile: true # Pytorch 2.0
+-
++  klass: hf_t5
++  name: pszemraj/tFINE-900m-e16-d32
++  overwrite:
++    dropout_rate: 0.0
++#  add_config:
++#    is_bf16: false
++  checkpoint_path: ''
++  random_init: false
++  compile: true
++tokenizer:
++  name: BEE-spoke-data/slimpajama_tok-48128-BPE-forT5
+ data:
+-  input_length: 512
+-  mlm_probability: 0.15
+-  mean_noise_span_length: 3.0
+-  num_workers: 8
+-
++  input_length: 1024
++  mlm_probability: 0.15
++  mean_noise_span_length: 3.0
++  num_workers: 16
+ optim:
+-  name: adamwscale
+-  base_lr: 2e-2
+-  batch_size: 128
+-  total_steps: 65536
+-  epochs: -1 # If it's > 0 it overwrites total_steps
+-  warmup_steps: 10000
+-  lr_scheduler: cosine
+-  weight_decay: 0.0
+-  grad_clip: 1.0
+-  grad_acc: 1
+-  final_cosine: 1e-5
+-
++  name: adamwscale
++  base_lr: 0.01
++  batch_size: 128
++  total_steps: 20000
++  epochs: -1
++  warmup_steps: 5000
++  lr_scheduler: cosine
++  weight_decay: 0.0001
++  grad_clip: 1.0
++  grad_acc: 8
++  final_cosine: 2.0e-05
+ eval:
+-  every_steps: 100000 # Eval once in the end
+-  steps: 500
+-
++  every_steps: 1000000000
++  steps: 500
+ checkpoint:
+-  every_steps: 100000 # Save checkpoint once in the end
+-
++  every_steps: 2500
+ logging:
+-  every_steps: 100
+-  grad_l2: true
+-  weights_l2: true
+-  use_wandb: false
+-  # Can remove or comment out the below if not using Weights & Biases
+-  wandb_config:
+-    project: nanoT5
+-    entity: 'your_wandb_username'
+-    tags: ['nanoT5', 'my_tag']
+-    mode: 'online'
+-
+-hydra:
+-  job:
+-    chdir: True
++  use_wandb: true
++  wandb_config:
++    project: nanoT5
++    entity: 'pszemraj'
++    tags: ['900m', '1024',]
++    mode: 'online'
++  every_steps: 25
++  grad_l2: true
++  weights_l2: true
+diff --git a/nanoT5/main.py b/nanoT5/main.py
+index 12dfbae..c4ba985 100644
+--- a/nanoT5/main.py
++++ b/nanoT5/main.py
+@@ -19,6 +19,40 @@ from .utils import (
+     train,
+ )
+ 
++# >>> DYNAMO UPDATES
++
++# Torch compile arguments
++torch_compile_arguments = [
++    "config.dce = True",
++    "config.memory_planning = True",
++    "config.memory_pool = 'combined'",
++    "config.coordinate_descent_tuning = True",
++    "config.max_autotune_gemm = False", # GEMM is unnecessary
++    "config.autotune_multi_device = False",
++    "config.max_autotune_gemm_backends = 'ATEN'", # Not much faster
++    "config.aggressive_fusion = False", # Careful changes results!
++    "config.cuda.enable_cuda_lto = True",
++    "config.cuda.use_fast_math = True",
++    "config.cuda.compile_opt_level = '-O3'",
++]
++# Torch dynamo arguments
++torch_dynamo_arguments = [
++    "config.accumulated_cache_size_limit = 1024", # Bump up a bit from 256
++    "config.suppress_errors = True", # Supress errors for now
++    "config.do_not_emit_runtime_asserts = True",
++]
++import torch._inductor.config as config
++for _try_compile_argument in torch_compile_arguments:
++    try: exec(_try_compile_argument)
++    except: pass
++pass
++import torch._dynamo.config as config
++for _try_dynamo_argument in torch_dynamo_arguments:
++    try: exec(_try_dynamo_argument)
++    except: pass
++pass
++
++# >>> DYNAMO UPDATES
+ 
+ @hydra.main(config_path="configs", config_name="default", version_base="1.1")
+ def main(args):
+@@ -83,4 +117,4 @@ def main(args):
+ 
+ 
+ if __name__ == "__main__":
+-    main()
++    main()
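The main.py hunk above sets torch._inductor and torch._dynamo options by exec-ing each assignment string inside try/except, so flags missing from a given PyTorch build are skipped silently. A more direct sketch of the same guarded-flag pattern (illustrative only, not part of the commit; the available flag set varies across PyTorch releases):

    import torch._dynamo.config as dynamo_config
    import torch._inductor.config as inductor_config

    def set_if_present(cfg, name, value):
        # Only set options this torch build actually exposes,
        # mirroring the try/exec/except behavior in the patch.
        if hasattr(cfg, name):
            setattr(cfg, name, value)

    set_if_present(inductor_config, "memory_planning", True)
    set_if_present(inductor_config, "coordinate_descent_tuning", True)
    cuda_cfg = getattr(inductor_config, "cuda", None)
    if cuda_cfg is not None:
        set_if_present(cuda_cfg, "use_fast_math", True)

    set_if_present(dynamo_config, "accumulated_cache_size_limit", 1024)
    set_if_present(dynamo_config, "suppress_errors", True)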
checkpoints/wandb/run-20240830_195924-mao0tqjy/files/output.log
ADDED
The diff for this file is too large to render. See raw diff.
checkpoints/wandb/run-20240830_195924-mao0tqjy/files/requirements.txt
ADDED
@@ -0,0 +1,200 @@
+GitPython==3.1.43
+Jinja2==3.1.4
+MarkupSafe==2.1.5
+PyGObject==3.42.1
+PyJWT==2.3.0
+PyYAML==5.4.1
+PyYAML==6.0.2
+Pygments==2.11.2
+Pygments==2.18.0
+SecretStorage==3.3.1
+Send2Trash==1.8.3
+absl-py==2.1.0
+accelerate==0.33.0
+aiohappyeyeballs==2.4.0
+aiohttp==3.10.5
+aiosignal==1.3.1
+antlr4-python3-runtime==4.9.3
+anyio==4.4.0
+argon2-cffi-bindings==21.2.0
+argon2-cffi==23.1.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beautifulsoup4==4.12.3
+bleach==6.1.0
+blessed==1.20.0
+blinker==1.4
+certifi==2024.7.4
+cffi==1.17.0
+charset-normalizer==3.3.2
+click==8.1.7
+comm==0.2.2
+cryptography==3.4.8
+datasets==2.21.0
+dbus-python==1.2.18
+debugpy==1.8.5
+decorator==5.1.1
+defusedxml==0.7.1
+dill==0.3.8
+distro==1.7.0
+docker-pycreds==0.4.0
+entrypoints==0.4
+evaluate==0.4.2
+executing==2.0.1
+fancycompleter==0.9.1
+fastjsonschema==2.20.0
+filelock==3.15.4
+fire==0.6.0
+fqdn==1.5.1
+frozenlist==1.4.1
+fsspec==2024.6.1
+gitdb==4.0.11
+gpustat==1.1.1
+h11==0.14.0
+httpcore==1.0.5
+httplib2==0.20.2
+httpx==0.27.0
+huggingface-hub==0.24.6
+hydra-core==1.3.2
+idna==3.7
+importlib-metadata==4.6.4
+ipykernel==6.29.5
+ipython-genutils==0.2.0
+ipython==8.26.0
+ipywidgets==8.1.3
+isoduration==20.11.0
+jedi==0.19.1
+jeepney==0.7.1
+joblib==1.4.2
+json5==0.9.25
+jsonpointer==3.0.0
+jsonschema-specifications==2023.12.1
+jsonschema==4.23.0
+jupyter-archive==3.4.0
+jupyter-events==0.10.0
+jupyter-highlight-selected-word==0.2.0
+jupyter-lsp==2.2.5
+jupyter_client==7.4.9
+jupyter_contrib_core==0.4.2
+jupyter_contrib_nbextensions==0.7.0
+jupyter_core==5.7.2
+jupyter_nbextensions_configurator==0.6.4
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.4
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.11
+keyring==23.5.0
+launchpadlib==1.10.16
+lazr.restfulclient==0.14.4
+lazr.uri==1.0.6
+lxml==5.3.0
+matplotlib-inline==0.1.7
+mistune==3.0.2
+more-itertools==8.10.0
+mpmath==1.3.0
+multidict==6.0.5
+multiprocess==0.70.16
+nbclassic==1.1.0
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+networkx==3.3
+ninja==1.11.1.1
+nltk==3.9.1
+notebook==6.5.5
+notebook_shim==0.2.4
+numpy==1.26.4
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-ml-py==12.560.30
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.6.20
+nvidia-nvtx-cu12==12.1.105
+oauthlib==3.2.0
+omegaconf==2.3.0
+overrides==7.7.0
+packaging==24.1
+pandas==2.2.2
+pandocfilters==1.5.1
+parso==0.8.4
+pdbpp==0.10.3
+pexpect==4.9.0
+pillow==10.4.0
+pip==24.2
+platformdirs==4.2.2
+prometheus_client==0.20.0
+prompt_toolkit==3.0.47
+protobuf==3.20.3
+psutil==6.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pyarrow==17.0.0
+pycparser==2.22
+pynvml==11.5.3
+pyparsing==2.4.7
+pyrepl==0.9.0
+python-apt==2.4.0+ubuntu3
+python-dateutil==2.9.0.post0
+python-json-logger==2.0.7
+pytz==2024.1
+pyzmq==24.0.1
+referencing==0.35.1
+regex==2024.7.24
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rouge_score==0.1.2
+rpds-py==0.20.0
+safetensors==0.4.4
+sentencepiece==0.2.0
+sentry-sdk==2.13.0
+setproctitle==1.3.3
+setuptools==73.0.1
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.1
+soupsieve==2.6
+stack-data==0.6.3
+sympy==1.13.2
+termcolor==2.4.0
+terminado==0.18.1
+tinycss2==1.3.0
+tokenizers==0.19.1
+torch==2.4.0
+torchaudio==2.4.0
+torchvision==0.19.0
+tornado==6.4.1
+tqdm==4.66.5
+traitlets==5.14.3
+transformers==4.44.2
+triton==3.0.0
+types-python-dateutil==2.9.0.20240821
+typing_extensions==4.12.2
+tzdata==2024.1
+uri-template==1.3.0
+urllib3==2.2.2
+wadllib==1.3.6
+wandb==0.17.8
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+websocket-client==1.8.0
+wheel==0.44.0
+widgetsnbextension==4.0.11
+wmctrl==0.5
+xxhash==3.5.0
+yarl==1.9.4
+zipp==1.0.0
checkpoints/wandb/run-20240830_195924-mao0tqjy/files/wandb-metadata.json
ADDED
@@ -0,0 +1,547 @@
+{
+  "os": "Linux-6.5.0-41-generic-x86_64-with-glibc2.35",
+  "python": "3.11.9",
+  "heartbeatAt": "2024-08-30T19:59:24.641329",
+  "startedAt": "2024-08-30T19:59:24.177472",
+  "docker": null,
+  "cuda": null,
+  "args": [],
+  "state": "running",
+  "program": "-m nanoT5.main",
+  "codePathLocal": null,
+  "git": {
+    "remote": "https://github.com/pszemraj/nanoT5.git",
+    "commit": "58834d398cca39b8344c83490f8b1bec71116423"
+  },
+  "email": null,
+  "root": "/workspace/nanoT5",
+  "host": "cf696b887dc2",
+  "username": "root",
+  "executable": "/usr/bin/python",
+  "cpu_count": 100,
+  "cpu_count_logical": 100,
+  "cpu_freq": {
+    "current": 2249.869999999997,
+    "min": 0.0,
+    "max": 0.0
+  },
+  "cpu_freq_per_core": [
+    {
+      "current": 2249.87,
+      "min": 0.0,
+      "max": 0.0
+    },
(the same {"current": 2249.87, "min": 0.0, "max": 0.0} entry repeats identically for all 100 cores; elided here)
+  ],
+  "disk": {
+    "/": {
+      "total": 200.0,
+      "used": 1.7721595764160156
+    }
+  },
+  "gpu": "NVIDIA A100 80GB PCIe",
+  "gpu_count": 1,
+  "gpu_devices": [
+    {
+      "name": "NVIDIA A100 80GB PCIe",
+      "memory_total": 85899345920
+    }
+  ],
+  "memory": {
+    "total": 668.8548545837402
+  }
+}
checkpoints/wandb/run-20240830_195924-mao0tqjy/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
+{"train/loss": 1.777546563744545, "train/grad_l2": 0.1973351389169693, "train/weights_l2": 11272.363778775605, "train/lr": 0.0020558542377918645, "train/seconds_per_step": 4.877207107543946, "_timestamp": 1725124200.3571296, "_runtime": 76236.1699206829, "_step": 15525}
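When this summary was written the run was at step 15,525 of the configured 20,000, averaging about 4.88 s per optimizer step. A trivial sketch that reads the summary and estimates the remaining wall-clock time (total_steps taken from the training config above):

    import json

    path = "checkpoints/wandb/run-20240830_195924-mao0tqjy/files/wandb-summary.json"
    with open(path) as f:
        summary = json.load(f)

    remaining_steps = 20000 - summary["_step"]  # optim.total_steps: 20000
    eta_hours = remaining_steps * summary["train/seconds_per_step"] / 3600
    print(f"step {summary['_step']}, loss {summary['train/loss']:.3f}, ~{eta_hours:.1f} h remaining")  # ~6.1 h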
checkpoints/wandb/run-20240830_195924-mao0tqjy/logs/debug-internal.log
ADDED
The diff for this file is too large to render. See raw diff.
checkpoints/wandb/run-20240830_195924-mao0tqjy/logs/debug.log
ADDED
@@ -0,0 +1,27 @@
+2024-08-30 19:59:24,178 INFO MainThread:29052 [wandb_setup.py:_flush():77] Current SDK version is 0.17.8
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Configure stats pid to 29052
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Loading settings from /workspace/nanoT5/outputs/2024-08-30/19-59-22/wandb/settings
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Applying setup settings: {'_disable_service': False}
+2024-08-30 19:59:24,179 WARNING MainThread:29052 [wandb_setup.py:_flush():77] Could not find program at -m nanoT5.main
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m nanoT5.main'}
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Applying login settings: {}
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:_log_setup():524] Logging user logs to /workspace/nanoT5/outputs/2024-08-30/19-59-22/wandb/run-20240830_195924-mao0tqjy/logs/debug.log
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:_log_setup():525] Logging internal logs to /workspace/nanoT5/outputs/2024-08-30/19-59-22/wandb/run-20240830_195924-mao0tqjy/logs/debug-internal.log
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:init():607] calling init triggers
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:init():614] wandb.init called with sweep_config: {}
+config: {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 34534, 'model': {'klass': 'hf_t5', 'name': 'pszemraj/tFINE-900m-e16-d32', 'overwrite': {'dropout_rate': 0.0}, 'checkpoint_path': '', 'random_init': False, 'compile': True}, 'tokenizer': {'name': 'BEE-spoke-data/slimpajama_tok-48128-BPE-forT5'}, 'data': {'input_length': 1024, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 16}, 'optim': {'name': 'adamwscale', 'base_lr': 0.01, 'batch_size': 128, 'total_steps': 20000, 'epochs': -1, 'warmup_steps': 5000, 'lr_scheduler': 'cosine', 'weight_decay': 0.0001, 'grad_clip': 1.0, 'grad_acc': 8, 'final_cosine': 2e-05}, 'eval': {'every_steps': 1000000000, 'steps': 500}, 'checkpoint': {'every_steps': 2500}, 'logging': {'use_wandb': True, 'wandb_config': {'project': 'nanoT5', 'entity': 'pszemraj', 'tags': ['900m', '1024'], 'mode': 'online'}, 'every_steps': 25, 'grad_l2': True, 'weights_l2': True}, 'slurm_id': 'none', 'working_dir': '/workspace/nanoT5/outputs/2024-08-30/19-59-22'}
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:init():657] starting backend
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:init():661] setting up manager
+2024-08-30 19:59:24,185 INFO MainThread:29052 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-08-30 19:59:24,187 INFO MainThread:29052 [wandb_init.py:init():669] backend started and connected
+2024-08-30 19:59:24,192 INFO MainThread:29052 [wandb_init.py:init():767] updated telemetry
+2024-08-30 19:59:24,198 INFO MainThread:29052 [wandb_init.py:init():800] communicating run to backend with 90.0 second timeout
+2024-08-30 19:59:24,583 INFO MainThread:29052 [wandb_init.py:init():851] starting run threads in backend
+2024-08-30 19:59:24,814 INFO MainThread:29052 [wandb_run.py:_console_start():2463] atexit reg
+2024-08-30 19:59:24,814 INFO MainThread:29052 [wandb_run.py:_redirect():2309] redirect: wrap_raw
+2024-08-30 19:59:24,814 INFO MainThread:29052 [wandb_run.py:_redirect():2374] Wrapping output streams.
+2024-08-30 19:59:24,815 INFO MainThread:29052 [wandb_run.py:_redirect():2399] Redirects installed.
+2024-08-30 19:59:24,818 INFO MainThread:29052 [wandb_init.py:init():894] run started, returning control to user process
+2024-08-30 19:59:44,796 INFO MainThread:29052 [wandb_run.py:_config_callback():1392] config_cb None None {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 34534, 'model': {'klass': 'hf_t5', 'name': 'pszemraj/tFINE-900m-e16-d32', 'overwrite': {'dropout_rate': 0.0}, 'checkpoint_path': '', 'random_init': False, 'compile': True}, 'tokenizer': {'name': 'BEE-spoke-data/slimpajama_tok-48128-BPE-forT5'}, 'data': {'input_length': 1024, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 16, 'before_mask_input_length': 1137, 'target_length': 229}, 'optim': {'name': 'adamwscale', 'base_lr': 0.01, 'batch_size': 128, 'total_steps': 20000, 'epochs': -1, 'warmup_steps': 5000, 'lr_scheduler': 'cosine', 'weight_decay': 0.0001, 'grad_clip': 1.0, 'grad_acc': 8, 'final_cosine': 2e-05}, 'eval': {'every_steps': 1000000000, 'steps': 500, 'corrected_steps': 500}, 'checkpoint': {'every_steps': 2500}, 'logging': {'use_wandb': True, 'wandb_config': {'project': 'nanoT5', 'entity': 'pszemraj', 'tags': ['900m', '1024'], 'mode': 'online'}, 'every_steps': 25, 'grad_l2': True, 'weights_l2': True}, 'slurm_id': 'none', 'working_dir': '/workspace/nanoT5/outputs/2024-08-30/19-59-22', 'n_all_param': 887492096}
checkpoints/wandb/run-20240830_195924-mao0tqjy/run-mao0tqjy.wandb
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:34c4d82ee5fa3daa21587d65efb7972e3a2447cce764ad1cd0eaec8aa61ffb19
+size 9030581