pszemraj committed on
Commit
4dfcb10
1 Parent(s): 9e7a8cf

Upload folder using huggingface_hub

Files changed (29)
  1. .gitattributes +1 -0
  2. checkpoints/.hydra/config.yaml +50 -0
  3. checkpoints/.hydra/hydra.yaml +154 -0
  4. checkpoints/.hydra/overrides.yaml +1 -0
  5. checkpoints/checkpoint-pt-10000/model.safetensors +3 -0
  6. checkpoints/checkpoint-pt-10000/random_states_0.pkl +3 -0
  7. checkpoints/checkpoint-pt-12500/model.safetensors +3 -0
  8. checkpoints/checkpoint-pt-12500/random_states_0.pkl +3 -0
  9. checkpoints/checkpoint-pt-15000/model.safetensors +3 -0
  10. checkpoints/checkpoint-pt-15000/random_states_0.pkl +3 -0
  11. checkpoints/checkpoint-pt-2500/model.safetensors +3 -0
  12. checkpoints/checkpoint-pt-2500/random_states_0.pkl +3 -0
  13. checkpoints/checkpoint-pt-5000/model.safetensors +3 -0
  14. checkpoints/checkpoint-pt-5000/random_states_0.pkl +3 -0
  15. checkpoints/checkpoint-pt-7500/model.safetensors +3 -0
  16. checkpoints/checkpoint-pt-7500/random_states_0.pkl +3 -0
  17. checkpoints/config.json +32 -0
  18. checkpoints/main.log +0 -0
  19. checkpoints/wandb/debug-internal.log +0 -0
  20. checkpoints/wandb/debug.log +27 -0
  21. checkpoints/wandb/run-20240830_195924-mao0tqjy/files/config.yaml +132 -0
  22. checkpoints/wandb/run-20240830_195924-mao0tqjy/files/diff.patch +163 -0
  23. checkpoints/wandb/run-20240830_195924-mao0tqjy/files/output.log +0 -0
  24. checkpoints/wandb/run-20240830_195924-mao0tqjy/files/requirements.txt +200 -0
  25. checkpoints/wandb/run-20240830_195924-mao0tqjy/files/wandb-metadata.json +547 -0
  26. checkpoints/wandb/run-20240830_195924-mao0tqjy/files/wandb-summary.json +1 -0
  27. checkpoints/wandb/run-20240830_195924-mao0tqjy/logs/debug-internal.log +0 -0
  28. checkpoints/wandb/run-20240830_195924-mao0tqjy/logs/debug.log +27 -0
  29. checkpoints/wandb/run-20240830_195924-mao0tqjy/run-mao0tqjy.wandb +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoints/wandb/run-20240830_195924-mao0tqjy/run-mao0tqjy.wandb filter=lfs diff=lfs merge=lfs -text
checkpoints/.hydra/config.yaml ADDED
@@ -0,0 +1,50 @@
+mode: pt
+device: gpu
+precision: bf16
+eval_only: false
+predict_only: false
+seed: 34534
+model:
+  klass: hf_t5
+  name: pszemraj/tFINE-900m-e16-d32
+  overwrite:
+    dropout_rate: 0.0
+  checkpoint_path: ''
+  random_init: false
+  compile: true
+tokenizer:
+  name: BEE-spoke-data/slimpajama_tok-48128-BPE-forT5
+data:
+  input_length: 1024
+  mlm_probability: 0.15
+  mean_noise_span_length: 3.0
+  num_workers: 16
+optim:
+  name: adamwscale
+  base_lr: 0.01
+  batch_size: 128
+  total_steps: 20000
+  epochs: -1
+  warmup_steps: 5000
+  lr_scheduler: cosine
+  weight_decay: 0.0001
+  grad_clip: 1.0
+  grad_acc: 8
+  final_cosine: 2.0e-05
+eval:
+  every_steps: 1000000000
+  steps: 500
+checkpoint:
+  every_steps: 2500
+logging:
+  use_wandb: true
+  wandb_config:
+    project: nanoT5
+    entity: pszemraj
+    tags:
+    - 900m
+    - '1024'
+    mode: online
+  every_steps: 25
+  grad_l2: true
+  weights_l2: true
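For reference, the saved config above can be loaded back with OmegaConf (hydra-core 1.3.2 and omegaconf 2.3.0 are pinned in this run's requirements.txt below). A minimal sketch, assuming the repository has been cloned so the file sits at checkpoints/.hydra/config.yaml:

# Minimal sketch: reload the run config and inspect a few hyperparameters.
# The path is assumed from this repo's layout; values in comments come from
# the YAML above.
from omegaconf import OmegaConf

cfg = OmegaConf.load("checkpoints/.hydra/config.yaml")
print(cfg.model.name)         # pszemraj/tFINE-900m-e16-d32
print(cfg.optim.total_steps)  # 20000
print(cfg.optim.warmup_steps / cfg.optim.total_steps)  # 0.25 warmup fraction
print(OmegaConf.to_yaml(cfg.optim))  # dump one subtree back out as YAML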
checkpoints/.hydra/hydra.yaml ADDED
@@ -0,0 +1,154 @@
+hydra:
+  run:
+    dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+  sweep:
+    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    subdir: ${hydra.job.num}
+  launcher:
+    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+  sweeper:
+    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+    max_batch_size: null
+    params: null
+  help:
+    app_name: ${hydra.job.name}
+    header: '${hydra.help.app_name} is powered by Hydra.
+
+      '
+    footer: 'Powered by Hydra (https://hydra.cc)
+
+      Use --hydra-help to view Hydra specific help
+
+      '
+    template: '${hydra.help.header}
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (group=option)
+
+
+      $APP_CONFIG_GROUPS
+
+
+      == Config ==
+
+      Override anything in the config (foo.bar=value)
+
+
+      $CONFIG
+
+
+      ${hydra.help.footer}
+
+      '
+  hydra_help:
+    template: 'Hydra (${hydra.runtime.version})
+
+      See https://hydra.cc for more info.
+
+
+      == Flags ==
+
+      $FLAGS_HELP
+
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+      to command line)
+
+
+      $HYDRA_CONFIG_GROUPS
+
+
+      Use ''--cfg hydra'' to Show the Hydra config.
+
+      '
+    hydra_help: ???
+  hydra_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][HYDRA] %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+    root:
+      level: INFO
+      handlers:
+      - console
+    loggers:
+      logging_example:
+        level: DEBUG
+    disable_existing_loggers: false
+  job_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+      file:
+        class: logging.FileHandler
+        formatter: simple
+        filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+    root:
+      level: INFO
+      handlers:
+      - console
+      - file
+    disable_existing_loggers: false
+  env: {}
+  mode: RUN
+  searchpath: []
+  callbacks: {}
+  output_subdir: .hydra
+  overrides:
+    hydra:
+    - hydra.mode=RUN
+    task: []
+  job:
+    name: main
+    chdir: null
+    override_dirname: ''
+    id: ???
+    num: ???
+    config_name: default
+    env_set: {}
+    env_copy: []
+    config:
+      override_dirname:
+        kv_sep: '='
+        item_sep: ','
+        exclude_keys: []
+  runtime:
+    version: 1.3.2
+    version_base: '1.1'
+    cwd: /workspace/nanoT5
+    config_sources:
+    - path: hydra.conf
+      schema: pkg
+      provider: hydra
+    - path: /workspace/nanoT5/nanoT5/configs
+      schema: file
+      provider: main
+    - path: ''
+      schema: structured
+      provider: schema
+    output_dir: /workspace/nanoT5/outputs/2024-08-30/19-59-22
+  choices:
+    hydra/env: default
+    hydra/callbacks: null
+    hydra/job_logging: default
+    hydra/hydra_logging: default
+    hydra/hydra_help: default
+    hydra/help: default
+    hydra/sweeper: basic
+    hydra/launcher: basic
+    hydra/output: default
+  verbose: false
checkpoints/.hydra/overrides.yaml ADDED
@@ -0,0 +1 @@
+[]
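The empty list records that the run was launched with no command-line overrides: every value in config.yaml above came from the patched default.yaml (see diff.patch below). Had overrides been passed, Hydra would list them here verbatim; for example, a run started with optim.base_lr=0.005 on the command line would record "- optim.base_lr=0.005" here (hypothetical illustration).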
checkpoints/checkpoint-pt-10000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b04238335d2e95c6bfa1c92a501bef0bf99434e8e3475d41216cdc74d3d7a76
+size 3550041880
checkpoints/checkpoint-pt-10000/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3b2ce7e08081f282216ae6d35b3e3e08ec13a6874529efce924280e2d044c09
+size 14344
checkpoints/checkpoint-pt-12500/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f034c61cb3eac83e1c6a7ea881e34d255ffba59e0ca7746df690bba59229a687
+size 3550041880
checkpoints/checkpoint-pt-12500/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3b2ce7e08081f282216ae6d35b3e3e08ec13a6874529efce924280e2d044c09
+size 14344
checkpoints/checkpoint-pt-15000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f5cd29ef789593d59329578d0b2f454238da7789c2b5a9bee1c3d139c64a5e2
+size 3550041880
checkpoints/checkpoint-pt-15000/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3b2ce7e08081f282216ae6d35b3e3e08ec13a6874529efce924280e2d044c09
+size 14344
checkpoints/checkpoint-pt-2500/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e6fb5ed5e9042405a8d4d053759f532abe0d167456785d277c71e18fa74c29a4
+size 3550041880
checkpoints/checkpoint-pt-2500/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3b2ce7e08081f282216ae6d35b3e3e08ec13a6874529efce924280e2d044c09
+size 14344
checkpoints/checkpoint-pt-5000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:48e5e3d34b17f5fb15734e0dc5f17878d3cf58a9b748b31341c993f3e5e94f3e
+size 3550041880
checkpoints/checkpoint-pt-5000/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3b2ce7e08081f282216ae6d35b3e3e08ec13a6874529efce924280e2d044c09
+size 14344
checkpoints/checkpoint-pt-7500/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c720f1fc7d3a669689be8dcae3d1137518b37665abda21edf4b547a0e7b1abe4
+size 3550041880
checkpoints/checkpoint-pt-7500/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3b2ce7e08081f282216ae6d35b3e3e08ec13a6874529efce924280e2d044c09
+size 14344
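Each entry above is a Git LFS pointer: the repository stores only a SHA-256 oid and a byte size, and the real file is fetched on demand. Note that every random_states_0.pkl resolves to the same oid, so the saved RNG snapshot is byte-identical across all six checkpoints. A minimal sketch for verifying a materialized file against its pointer, assuming the weights have been pulled (e.g. via git lfs pull) into the paths shown above:

# Minimal sketch: hash a downloaded checkpoint and compare it to the oid in
# its LFS pointer. Pure stdlib; path and oid are taken from the pointer for
# checkpoint-pt-10000 above.
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk_size):
            h.update(block)
    return h.hexdigest()

digest = sha256_of("checkpoints/checkpoint-pt-10000/model.safetensors")
assert digest == "3b04238335d2e95c6bfa1c92a501bef0bf99434e8e3475d41216cdc74d3d7a76"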
checkpoints/config.json ADDED
@@ -0,0 +1,32 @@
+{
+  "_name_or_path": "pszemraj/tFINE-900m-e16-d32",
+  "architectures": [
+    "T5ForConditionalGeneration"
+  ],
+  "classifier_dropout": 0.0,
+  "d_ff": 3072,
+  "d_kv": 64,
+  "d_model": 1024,
+  "decoder_start_token_id": 3,
+  "dense_act_fn": "silu",
+  "dropout_rate": 0.0,
+  "eos_token_id": 2,
+  "feed_forward_proj": "gated-silu",
+  "initializer_factor": 1.0,
+  "is_bf16": true,
+  "is_encoder_decoder": false,
+  "is_gated_act": true,
+  "layer_norm_epsilon": 1e-06,
+  "model_type": "t5",
+  "num_decoder_layers": 32,
+  "num_heads": 16,
+  "num_layers": 16,
+  "output_past": true,
+  "pad_token_id": 3,
+  "relative_attention_max_distance": 128,
+  "relative_attention_num_buckets": 48,
+  "tie_word_embeddings": false,
+  "transformers_version": "4.44.2",
+  "use_cache": true,
+  "vocab_size": 48256
+}
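Since each checkpoint directory above holds only model.safetensors, this config.json (one level up) is what pairs with the raw weights. A minimal sketch for loading an intermediate checkpoint with transformers 4.44.2 and safetensors 0.4.4 (both pinned below), assuming the LFS files are materialized; the key-prefix caveat in the comments is an assumption about torch.compile'd checkpoints, not something this repo documents:

# Minimal sketch (assumed layout): pair checkpoints/config.json with one of
# the weight-only checkpoint directories above.
from safetensors.torch import load_file
from transformers import T5Config, T5ForConditionalGeneration

config = T5Config.from_pretrained("checkpoints")  # reads checkpoints/config.json
model = T5ForConditionalGeneration(config)        # ~887M params (n_all_param: 887492096)

state = load_file("checkpoints/checkpoint-pt-15000/model.safetensors")
# Checkpoints saved from a torch.compile'd model can carry an "_orig_mod."
# key prefix; strip it if load_state_dict reports every key as unexpected.
state = {k.removeprefix("_orig_mod."): v for k, v in state.items()}
missing, unexpected = model.load_state_dict(state, strict=False)
print(len(missing), len(unexpected))  # inspect these before trusting the load
model.eval()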
checkpoints/main.log ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/wandb/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/wandb/debug.log ADDED
@@ -0,0 +1,27 @@
+2024-08-30 19:59:24,178 INFO MainThread:29052 [wandb_setup.py:_flush():77] Current SDK version is 0.17.8
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Configure stats pid to 29052
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Loading settings from /workspace/nanoT5/outputs/2024-08-30/19-59-22/wandb/settings
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Applying setup settings: {'_disable_service': False}
+2024-08-30 19:59:24,179 WARNING MainThread:29052 [wandb_setup.py:_flush():77] Could not find program at -m nanoT5.main
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m nanoT5.main'}
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Applying login settings: {}
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:_log_setup():524] Logging user logs to /workspace/nanoT5/outputs/2024-08-30/19-59-22/wandb/run-20240830_195924-mao0tqjy/logs/debug.log
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:_log_setup():525] Logging internal logs to /workspace/nanoT5/outputs/2024-08-30/19-59-22/wandb/run-20240830_195924-mao0tqjy/logs/debug-internal.log
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:init():607] calling init triggers
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:init():614] wandb.init called with sweep_config: {}
+config: {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 34534, 'model': {'klass': 'hf_t5', 'name': 'pszemraj/tFINE-900m-e16-d32', 'overwrite': {'dropout_rate': 0.0}, 'checkpoint_path': '', 'random_init': False, 'compile': True}, 'tokenizer': {'name': 'BEE-spoke-data/slimpajama_tok-48128-BPE-forT5'}, 'data': {'input_length': 1024, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 16}, 'optim': {'name': 'adamwscale', 'base_lr': 0.01, 'batch_size': 128, 'total_steps': 20000, 'epochs': -1, 'warmup_steps': 5000, 'lr_scheduler': 'cosine', 'weight_decay': 0.0001, 'grad_clip': 1.0, 'grad_acc': 8, 'final_cosine': 2e-05}, 'eval': {'every_steps': 1000000000, 'steps': 500}, 'checkpoint': {'every_steps': 2500}, 'logging': {'use_wandb': True, 'wandb_config': {'project': 'nanoT5', 'entity': 'pszemraj', 'tags': ['900m', '1024'], 'mode': 'online'}, 'every_steps': 25, 'grad_l2': True, 'weights_l2': True}, 'slurm_id': 'none', 'working_dir': '/workspace/nanoT5/outputs/2024-08-30/19-59-22'}
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:init():657] starting backend
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:init():661] setting up manager
+2024-08-30 19:59:24,185 INFO MainThread:29052 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-08-30 19:59:24,187 INFO MainThread:29052 [wandb_init.py:init():669] backend started and connected
+2024-08-30 19:59:24,192 INFO MainThread:29052 [wandb_init.py:init():767] updated telemetry
+2024-08-30 19:59:24,198 INFO MainThread:29052 [wandb_init.py:init():800] communicating run to backend with 90.0 second timeout
+2024-08-30 19:59:24,583 INFO MainThread:29052 [wandb_init.py:init():851] starting run threads in backend
+2024-08-30 19:59:24,814 INFO MainThread:29052 [wandb_run.py:_console_start():2463] atexit reg
+2024-08-30 19:59:24,814 INFO MainThread:29052 [wandb_run.py:_redirect():2309] redirect: wrap_raw
+2024-08-30 19:59:24,814 INFO MainThread:29052 [wandb_run.py:_redirect():2374] Wrapping output streams.
+2024-08-30 19:59:24,815 INFO MainThread:29052 [wandb_run.py:_redirect():2399] Redirects installed.
+2024-08-30 19:59:24,818 INFO MainThread:29052 [wandb_init.py:init():894] run started, returning control to user process
+2024-08-30 19:59:44,796 INFO MainThread:29052 [wandb_run.py:_config_callback():1392] config_cb None None {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 34534, 'model': {'klass': 'hf_t5', 'name': 'pszemraj/tFINE-900m-e16-d32', 'overwrite': {'dropout_rate': 0.0}, 'checkpoint_path': '', 'random_init': False, 'compile': True}, 'tokenizer': {'name': 'BEE-spoke-data/slimpajama_tok-48128-BPE-forT5'}, 'data': {'input_length': 1024, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 16, 'before_mask_input_length': 1137, 'target_length': 229}, 'optim': {'name': 'adamwscale', 'base_lr': 0.01, 'batch_size': 128, 'total_steps': 20000, 'epochs': -1, 'warmup_steps': 5000, 'lr_scheduler': 'cosine', 'weight_decay': 0.0001, 'grad_clip': 1.0, 'grad_acc': 8, 'final_cosine': 2e-05}, 'eval': {'every_steps': 1000000000, 'steps': 500, 'corrected_steps': 500}, 'checkpoint': {'every_steps': 2500}, 'logging': {'use_wandb': True, 'wandb_config': {'project': 'nanoT5', 'entity': 'pszemraj', 'tags': ['900m', '1024'], 'mode': 'online'}, 'every_steps': 25, 'grad_l2': True, 'weights_l2': True}, 'slurm_id': 'none', 'working_dir': '/workspace/nanoT5/outputs/2024-08-30/19-59-22', 'n_all_param': 887492096}
checkpoints/wandb/run-20240830_195924-mao0tqjy/files/config.yaml ADDED
@@ -0,0 +1,132 @@
+wandb_version: 1
+
+mode:
+  desc: null
+  value: pt
+device:
+  desc: null
+  value: gpu
+precision:
+  desc: null
+  value: bf16
+eval_only:
+  desc: null
+  value: false
+predict_only:
+  desc: null
+  value: false
+seed:
+  desc: null
+  value: 34534
+model:
+  desc: null
+  value:
+    klass: hf_t5
+    name: pszemraj/tFINE-900m-e16-d32
+    overwrite:
+      dropout_rate: 0.0
+    checkpoint_path: ''
+    random_init: false
+    compile: true
+tokenizer:
+  desc: null
+  value:
+    name: BEE-spoke-data/slimpajama_tok-48128-BPE-forT5
+data:
+  desc: null
+  value:
+    input_length: 1024
+    mlm_probability: 0.15
+    mean_noise_span_length: 3.0
+    num_workers: 16
+    before_mask_input_length: 1137
+    target_length: 229
+optim:
+  desc: null
+  value:
+    name: adamwscale
+    base_lr: 0.01
+    batch_size: 128
+    total_steps: 20000
+    epochs: -1
+    warmup_steps: 5000
+    lr_scheduler: cosine
+    weight_decay: 0.0001
+    grad_clip: 1.0
+    grad_acc: 8
+    final_cosine: 2.0e-05
+eval:
+  desc: null
+  value:
+    every_steps: 1000000000
+    steps: 500
+    corrected_steps: 500
+checkpoint:
+  desc: null
+  value:
+    every_steps: 2500
+logging:
+  desc: null
+  value:
+    use_wandb: true
+    wandb_config:
+      project: nanoT5
+      entity: pszemraj
+      tags:
+      - 900m
+      - '1024'
+      mode: online
+    every_steps: 25
+    grad_l2: true
+    weights_l2: true
+slurm_id:
+  desc: null
+  value: none
+working_dir:
+  desc: null
+  value: /workspace/nanoT5/outputs/2024-08-30/19-59-22
+_wandb:
+  desc: null
+  value:
+    python_version: 3.11.9
+    cli_version: 0.17.8
+    framework: huggingface
+    huggingface_version: 4.44.2
+    is_jupyter_run: false
+    is_kaggle_kernel: false
+    start_time: 1725047964
+    t:
+      1:
+      - 1
+      - 11
+      - 41
+      - 49
+      - 50
+      - 51
+      - 55
+      - 71
+      - 100
+      2:
+      - 1
+      - 11
+      - 41
+      - 49
+      - 50
+      - 51
+      - 55
+      - 71
+      - 100
+      3:
+      - 15
+      - 16
+      - 23
+      - 61
+      4: 3.11.9
+      5: 0.17.8
+      6: 4.44.2
+      8:
+      - 5
+      13: linux-x86_64
+n_all_param:
+  desc: null
+  value: 887492096
checkpoints/wandb/run-20240830_195924-mao0tqjy/files/diff.patch ADDED
@@ -0,0 +1,163 @@
+diff --git a/nanoT5/configs/default.yaml b/nanoT5/configs/default.yaml
+index 6e10cc4..240ff3d 100644
+--- a/nanoT5/configs/default.yaml
++++ b/nanoT5/configs/default.yaml
+@@ -1,65 +1,51 @@
+-defaults:
+-  - _self_
+-  - task: pt
+-  - local_env: default
+-
+-# Experiment args
+-mode: 'pt'
++mode: pt
+ device: gpu
+-precision: 'bf16'
++precision: bf16
+ eval_only: false
+ predict_only: false
+-seed: 2137
++seed: 34534
+
+ model:
+-  klass: local_t5
+-  name: 'google/t5-v1_1-base'
+-  overwrite: # overwrite config with these values
+-    dropout_rate: 0.0
+-  add_config: # add these values to the config
+-    is_bf16: false
+-  checkpoint_path: ''
+-  random_init: true
+-  compile: true # Pytorch 2.0
+-
++  klass: hf_t5
++  name: pszemraj/tFINE-900m-e16-d32
++  overwrite:
++    dropout_rate: 0.0
++# add_config:
++#   is_bf16: false
++  checkpoint_path: ''
++  random_init: false
++  compile: true
++tokenizer:
++  name: BEE-spoke-data/slimpajama_tok-48128-BPE-forT5
+ data:
+-  input_length: 512
+-  mlm_probability: 0.15
+-  mean_noise_span_length: 3.0
+-  num_workers: 8
+-
++  input_length: 1024
++  mlm_probability: 0.15
++  mean_noise_span_length: 3.0
++  num_workers: 16
+ optim:
+-  name: adamwscale
+-  base_lr: 2e-2
+-  batch_size: 128
+-  total_steps: 65536
+-  epochs: -1 # If it's > 0 it overwrites total_steps
+-  warmup_steps: 10000
+-  lr_scheduler: cosine
+-  weight_decay: 0.0
+-  grad_clip: 1.0
+-  grad_acc: 1
+-  final_cosine: 1e-5
+-
++  name: adamwscale
++  base_lr: 0.01
++  batch_size: 128
++  total_steps: 20000
++  epochs: -1
++  warmup_steps: 5000
++  lr_scheduler: cosine
++  weight_decay: 0.0001
++  grad_clip: 1.0
++  grad_acc: 8
++  final_cosine: 2.0e-05
+ eval:
+-  every_steps: 100000 # Eval once in the end
+-  steps: 500
+-
++  every_steps: 1000000000
++  steps: 500
+ checkpoint:
+-  every_steps: 100000 # Save checkpoint once in the end
+-
++  every_steps: 2500
+ logging:
+-  every_steps: 100
+-  grad_l2: true
+-  weights_l2: true
+-  use_wandb: false
+-  # Can remove or comment out the below if not using Weights & Biases
+-  wandb_config:
+-    project: nanoT5
+-    entity: 'your_wandb_username'
+-    tags: ['nanoT5', 'my_tag']
+-    mode: 'online'
+-
+-hydra:
+-  job:
+-    chdir: True
++  use_wandb: true
++  wandb_config:
++    project: nanoT5
++    entity: 'pszemraj'
++    tags: ['900m', '1024',]
++    mode: 'online'
++  every_steps: 25
++  grad_l2: true
++  weights_l2: true
+diff --git a/nanoT5/main.py b/nanoT5/main.py
+index 12dfbae..c4ba985 100644
+--- a/nanoT5/main.py
++++ b/nanoT5/main.py
+@@ -19,6 +19,40 @@ from .utils import (
+     train,
+ )
+
++# >>> DYNAMO UPDATES
++
++# Torch compile arguments
++torch_compile_arguments = [
++    "config.dce = True",
++    "config.memory_planning = True",
++    "config.memory_pool = 'combined'",
++    "config.coordinate_descent_tuning = True",
++    "config.max_autotune_gemm = False", # GEMM is unnecessary
++    "config.autotune_multi_device = False",
++    "config.max_autotune_gemm_backends = 'ATEN'", # Not much faster
++    "config.aggressive_fusion = False", # Careful changes results!
++    "config.cuda.enable_cuda_lto = True",
++    "config.cuda.use_fast_math = True",
++    "config.cuda.compile_opt_level = '-O3'",
++]
++# Torch dynamo arguments
++torch_dynamo_arguments = [
++    "config.accumulated_cache_size_limit = 1024", # Bump up a bit from 256
++    "config.suppress_errors = True", # Supress errors for now
++    "config.do_not_emit_runtime_asserts = True",
++]
++import torch._inductor.config as config
++for _try_compile_argument in torch_compile_arguments:
++    try: exec(_try_compile_argument)
++    except: pass
++pass
++import torch._dynamo.config as config
++for _try_dynamo_argument in torch_dynamo_arguments:
++    try: exec(_try_dynamo_argument)
++    except: pass
++pass
++
++# >>> DYNAMO UPDATES
+
+ @hydra.main(config_path="configs", config_name="default", version_base="1.1")
+ def main(args):
+@@ -83,4 +117,4 @@ def main(args):
+
+
+ if __name__ == "__main__":
+-    main()
++    main()
+
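The exec-based block added to nanoT5/main.py above sets TorchInductor and TorchDynamo options defensively: each assignment is a string run under try/except, so a flag missing from the installed torch (2.4.0 here) is silently skipped rather than crashing the run. A sketch of the same defensive idea without exec, using only flags named in the patch (availability of each flag still varies by torch version):

# Sketch: set an inductor/dynamo config attribute only if this torch build
# exposes it; nested names such as "cuda.use_fast_math" are walked
# attribute by attribute.
import torch._dynamo.config as dynamo_config
import torch._inductor.config as inductor_config

def set_if_present(cfg, dotted_name: str, value) -> None:
    *parents, leaf = dotted_name.split(".")
    for part in parents:
        cfg = getattr(cfg, part, None)
        if cfg is None:
            return
    if hasattr(cfg, leaf):
        setattr(cfg, leaf, value)

set_if_present(inductor_config, "coordinate_descent_tuning", True)
set_if_present(inductor_config, "cuda.use_fast_math", True)
set_if_present(dynamo_config, "accumulated_cache_size_limit", 1024)
set_if_present(dynamo_config, "suppress_errors", True)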
checkpoints/wandb/run-20240830_195924-mao0tqjy/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/wandb/run-20240830_195924-mao0tqjy/files/requirements.txt ADDED
@@ -0,0 +1,200 @@
+GitPython==3.1.43
+Jinja2==3.1.4
+MarkupSafe==2.1.5
+PyGObject==3.42.1
+PyJWT==2.3.0
+PyYAML==5.4.1
+PyYAML==6.0.2
+Pygments==2.11.2
+Pygments==2.18.0
+SecretStorage==3.3.1
+Send2Trash==1.8.3
+absl-py==2.1.0
+accelerate==0.33.0
+aiohappyeyeballs==2.4.0
+aiohttp==3.10.5
+aiosignal==1.3.1
+antlr4-python3-runtime==4.9.3
+anyio==4.4.0
+argon2-cffi-bindings==21.2.0
+argon2-cffi==23.1.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beautifulsoup4==4.12.3
+bleach==6.1.0
+blessed==1.20.0
+blinker==1.4
+certifi==2024.7.4
+cffi==1.17.0
+charset-normalizer==3.3.2
+click==8.1.7
+comm==0.2.2
+cryptography==3.4.8
+datasets==2.21.0
+dbus-python==1.2.18
+debugpy==1.8.5
+decorator==5.1.1
+defusedxml==0.7.1
+dill==0.3.8
+distro==1.7.0
+docker-pycreds==0.4.0
+entrypoints==0.4
+evaluate==0.4.2
+executing==2.0.1
+fancycompleter==0.9.1
+fastjsonschema==2.20.0
+filelock==3.15.4
+fire==0.6.0
+fqdn==1.5.1
+frozenlist==1.4.1
+fsspec==2024.6.1
+gitdb==4.0.11
+gpustat==1.1.1
+h11==0.14.0
+httpcore==1.0.5
+httplib2==0.20.2
+httpx==0.27.0
+huggingface-hub==0.24.6
+hydra-core==1.3.2
+idna==3.7
+importlib-metadata==4.6.4
+ipykernel==6.29.5
+ipython-genutils==0.2.0
+ipython==8.26.0
+ipywidgets==8.1.3
+isoduration==20.11.0
+jedi==0.19.1
+jeepney==0.7.1
+joblib==1.4.2
+json5==0.9.25
+jsonpointer==3.0.0
+jsonschema-specifications==2023.12.1
+jsonschema==4.23.0
+jupyter-archive==3.4.0
+jupyter-events==0.10.0
+jupyter-highlight-selected-word==0.2.0
+jupyter-lsp==2.2.5
+jupyter_client==7.4.9
+jupyter_contrib_core==0.4.2
+jupyter_contrib_nbextensions==0.7.0
+jupyter_core==5.7.2
+jupyter_nbextensions_configurator==0.6.4
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.4
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.11
+keyring==23.5.0
+launchpadlib==1.10.16
+lazr.restfulclient==0.14.4
+lazr.uri==1.0.6
+lxml==5.3.0
+matplotlib-inline==0.1.7
+mistune==3.0.2
+more-itertools==8.10.0
+mpmath==1.3.0
+multidict==6.0.5
+multiprocess==0.70.16
+nbclassic==1.1.0
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+networkx==3.3
+ninja==1.11.1.1
+nltk==3.9.1
+notebook==6.5.5
+notebook_shim==0.2.4
+numpy==1.26.4
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-ml-py==12.560.30
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.6.20
+nvidia-nvtx-cu12==12.1.105
+oauthlib==3.2.0
+omegaconf==2.3.0
+overrides==7.7.0
+packaging==24.1
+pandas==2.2.2
+pandocfilters==1.5.1
+parso==0.8.4
+pdbpp==0.10.3
+pexpect==4.9.0
+pillow==10.4.0
+pip==24.2
+platformdirs==4.2.2
+prometheus_client==0.20.0
+prompt_toolkit==3.0.47
+protobuf==3.20.3
+psutil==6.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pyarrow==17.0.0
+pycparser==2.22
+pynvml==11.5.3
+pyparsing==2.4.7
+pyrepl==0.9.0
+python-apt==2.4.0+ubuntu3
+python-dateutil==2.9.0.post0
+python-json-logger==2.0.7
+pytz==2024.1
+pyzmq==24.0.1
+referencing==0.35.1
+regex==2024.7.24
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rouge_score==0.1.2
+rpds-py==0.20.0
+safetensors==0.4.4
+sentencepiece==0.2.0
+sentry-sdk==2.13.0
+setproctitle==1.3.3
+setuptools==73.0.1
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.1
+soupsieve==2.6
+stack-data==0.6.3
+sympy==1.13.2
+termcolor==2.4.0
+terminado==0.18.1
+tinycss2==1.3.0
+tokenizers==0.19.1
+torch==2.4.0
+torchaudio==2.4.0
+torchvision==0.19.0
+tornado==6.4.1
+tqdm==4.66.5
+traitlets==5.14.3
+transformers==4.44.2
+triton==3.0.0
+types-python-dateutil==2.9.0.20240821
+typing_extensions==4.12.2
+tzdata==2024.1
+uri-template==1.3.0
+urllib3==2.2.2
+wadllib==1.3.6
+wandb==0.17.8
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+websocket-client==1.8.0
+wheel==0.44.0
+widgetsnbextension==4.0.11
+wmctrl==0.5
+xxhash==3.5.0
+yarl==1.9.4
+zipp==1.0.0
@@ -0,0 +1,547 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.5.0-41-generic-x86_64-with-glibc2.35",
3
+ "python": "3.11.9",
4
+ "heartbeatAt": "2024-08-30T19:59:24.641329",
5
+ "startedAt": "2024-08-30T19:59:24.177472",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [],
9
+ "state": "running",
10
+ "program": "-m nanoT5.main",
11
+ "codePathLocal": null,
12
+ "git": {
13
+ "remote": "https://github.com/pszemraj/nanoT5.git",
14
+ "commit": "58834d398cca39b8344c83490f8b1bec71116423"
15
+ },
16
+ "email": null,
17
+ "root": "/workspace/nanoT5",
18
+ "host": "cf696b887dc2",
19
+ "username": "root",
20
+ "executable": "/usr/bin/python",
21
+ "cpu_count": 100,
22
+ "cpu_count_logical": 100,
23
+ "cpu_freq": {
24
+ "current": 2249.869999999997,
25
+ "min": 0.0,
26
+ "max": 0.0
27
+ },
28
+ "cpu_freq_per_core": [
29
+ {
30
+ "current": 2249.87,
31
+ "min": 0.0,
32
+ "max": 0.0
33
+ },
34
+ {
35
+ "current": 2249.87,
36
+ "min": 0.0,
37
+ "max": 0.0
38
+ },
39
+ {
40
+ "current": 2249.87,
41
+ "min": 0.0,
42
+ "max": 0.0
43
+ },
44
+ {
45
+ "current": 2249.87,
46
+ "min": 0.0,
47
+ "max": 0.0
48
+ },
49
+ {
50
+ "current": 2249.87,
51
+ "min": 0.0,
52
+ "max": 0.0
53
+ },
54
+ {
55
+ "current": 2249.87,
56
+ "min": 0.0,
57
+ "max": 0.0
58
+ },
59
+ {
60
+ "current": 2249.87,
61
+ "min": 0.0,
62
+ "max": 0.0
63
+ },
64
+ {
65
+ "current": 2249.87,
66
+ "min": 0.0,
67
+ "max": 0.0
68
+ },
69
+ {
70
+ "current": 2249.87,
71
+ "min": 0.0,
72
+ "max": 0.0
73
+ },
74
+ {
75
+ "current": 2249.87,
76
+ "min": 0.0,
77
+ "max": 0.0
78
+ },
79
+ {
80
+ "current": 2249.87,
81
+ "min": 0.0,
82
+ "max": 0.0
83
+ },
84
+ {
85
+ "current": 2249.87,
86
+ "min": 0.0,
87
+ "max": 0.0
88
+ },
89
+ {
90
+ "current": 2249.87,
91
+ "min": 0.0,
92
+ "max": 0.0
93
+ },
94
+ {
95
+ "current": 2249.87,
96
+ "min": 0.0,
97
+ "max": 0.0
98
+ },
99
+ {
100
+ "current": 2249.87,
101
+ "min": 0.0,
102
+ "max": 0.0
103
+ },
104
+ {
105
+ "current": 2249.87,
106
+ "min": 0.0,
107
+ "max": 0.0
108
+ },
109
+ {
110
+ "current": 2249.87,
111
+ "min": 0.0,
112
+ "max": 0.0
113
+ },
114
+ {
115
+ "current": 2249.87,
116
+ "min": 0.0,
117
+ "max": 0.0
118
+ },
119
+ {
120
+ "current": 2249.87,
121
+ "min": 0.0,
122
+ "max": 0.0
123
+ },
124
+ {
125
+ "current": 2249.87,
126
+ "min": 0.0,
127
+ "max": 0.0
128
+ },
129
+ {
130
+ "current": 2249.87,
131
+ "min": 0.0,
132
+ "max": 0.0
133
+ },
134
+ {
135
+ "current": 2249.87,
136
+ "min": 0.0,
137
+ "max": 0.0
138
+ },
139
+ {
140
+ "current": 2249.87,
141
+ "min": 0.0,
142
+ "max": 0.0
143
+ },
144
+ {
145
+ "current": 2249.87,
146
+ "min": 0.0,
147
+ "max": 0.0
148
+ },
149
+ {
150
+ "current": 2249.87,
151
+ "min": 0.0,
152
+ "max": 0.0
153
+ },
154
+ {
155
+ "current": 2249.87,
156
+ "min": 0.0,
157
+ "max": 0.0
158
+ },
159
+ {
160
+ "current": 2249.87,
161
+ "min": 0.0,
162
+ "max": 0.0
163
+ },
164
+ {
165
+ "current": 2249.87,
166
+ "min": 0.0,
167
+ "max": 0.0
168
+ },
169
+ {
170
+ "current": 2249.87,
171
+ "min": 0.0,
172
+ "max": 0.0
173
+ },
174
+ {
175
+ "current": 2249.87,
176
+ "min": 0.0,
177
+ "max": 0.0
178
+ },
179
+ {
180
+ "current": 2249.87,
181
+ "min": 0.0,
182
+ "max": 0.0
183
+ },
184
+ {
185
+ "current": 2249.87,
186
+ "min": 0.0,
187
+ "max": 0.0
188
+ },
189
+ {
190
+ "current": 2249.87,
191
+ "min": 0.0,
192
+ "max": 0.0
193
+ },
194
+ {
195
+ "current": 2249.87,
196
+ "min": 0.0,
197
+ "max": 0.0
198
+ },
199
+ {
200
+ "current": 2249.87,
201
+ "min": 0.0,
202
+ "max": 0.0
203
+ },
204
+ {
205
+ "current": 2249.87,
206
+ "min": 0.0,
207
+ "max": 0.0
208
+ },
209
+ {
210
+ "current": 2249.87,
211
+ "min": 0.0,
212
+ "max": 0.0
213
+ },
214
+ {
215
+ "current": 2249.87,
216
+ "min": 0.0,
217
+ "max": 0.0
218
+ },
219
+ {
220
+ "current": 2249.87,
221
+ "min": 0.0,
222
+ "max": 0.0
223
+ },
224
+ {
225
+ "current": 2249.87,
226
+ "min": 0.0,
227
+ "max": 0.0
228
+ },
229
+ {
230
+ "current": 2249.87,
231
+ "min": 0.0,
232
+ "max": 0.0
233
+ },
234
+ {
235
+ "current": 2249.87,
236
+ "min": 0.0,
237
+ "max": 0.0
238
+ },
239
+ {
240
+ "current": 2249.87,
241
+ "min": 0.0,
242
+ "max": 0.0
243
+ },
244
+ {
245
+ "current": 2249.87,
246
+ "min": 0.0,
247
+ "max": 0.0
248
+ },
249
+ {
250
+ "current": 2249.87,
251
+ "min": 0.0,
252
+ "max": 0.0
253
+ },
254
+ {
255
+ "current": 2249.87,
256
+ "min": 0.0,
257
+ "max": 0.0
258
+ },
259
+ {
260
+ "current": 2249.87,
261
+ "min": 0.0,
262
+ "max": 0.0
263
+ },
264
+ {
265
+ "current": 2249.87,
266
+ "min": 0.0,
267
+ "max": 0.0
268
+ },
269
+ {
270
+ "current": 2249.87,
271
+ "min": 0.0,
272
+ "max": 0.0
273
+ },
274
+ {
275
+ "current": 2249.87,
276
+ "min": 0.0,
277
+ "max": 0.0
278
+ },
279
+ {
280
+ "current": 2249.87,
281
+ "min": 0.0,
282
+ "max": 0.0
283
+ },
284
+ {
285
+ "current": 2249.87,
286
+ "min": 0.0,
287
+ "max": 0.0
288
+ },
289
+ {
290
+ "current": 2249.87,
291
+ "min": 0.0,
292
+ "max": 0.0
293
+ },
294
+ {
295
+ "current": 2249.87,
296
+ "min": 0.0,
297
+ "max": 0.0
298
+ },
299
+ {
300
+ "current": 2249.87,
301
+ "min": 0.0,
302
+ "max": 0.0
303
+ },
304
+ {
305
+ "current": 2249.87,
306
+ "min": 0.0,
307
+ "max": 0.0
308
+ },
309
+ {
310
+ "current": 2249.87,
311
+ "min": 0.0,
312
+ "max": 0.0
313
+ },
314
+ {
315
+ "current": 2249.87,
316
+ "min": 0.0,
317
+ "max": 0.0
318
+ },
319
+ {
320
+ "current": 2249.87,
321
+ "min": 0.0,
322
+ "max": 0.0
323
+ },
324
+ {
325
+ "current": 2249.87,
326
+ "min": 0.0,
327
+ "max": 0.0
328
+ },
329
+ {
330
+ "current": 2249.87,
331
+ "min": 0.0,
332
+ "max": 0.0
333
+ },
334
+ {
335
+ "current": 2249.87,
336
+ "min": 0.0,
337
+ "max": 0.0
338
+ },
339
+ {
340
+ "current": 2249.87,
341
+ "min": 0.0,
342
+ "max": 0.0
343
+ },
344
+ {
345
+ "current": 2249.87,
346
+ "min": 0.0,
347
+ "max": 0.0
348
+ },
349
+ {
350
+ "current": 2249.87,
351
+ "min": 0.0,
352
+ "max": 0.0
353
+ },
354
+ {
355
+ "current": 2249.87,
356
+ "min": 0.0,
357
+ "max": 0.0
358
+ },
359
+ {
360
+ "current": 2249.87,
361
+ "min": 0.0,
362
+ "max": 0.0
363
+ },
364
+ {
365
+ "current": 2249.87,
366
+ "min": 0.0,
367
+ "max": 0.0
368
+ },
369
+ {
370
+ "current": 2249.87,
371
+ "min": 0.0,
372
+ "max": 0.0
373
+ },
374
+ {
375
+ "current": 2249.87,
376
+ "min": 0.0,
377
+ "max": 0.0
378
+ },
379
+ {
380
+ "current": 2249.87,
381
+ "min": 0.0,
382
+ "max": 0.0
383
+ },
384
+ {
385
+ "current": 2249.87,
386
+ "min": 0.0,
387
+ "max": 0.0
388
+ },
389
+ {
390
+ "current": 2249.87,
391
+ "min": 0.0,
392
+ "max": 0.0
393
+ },
394
+ {
395
+ "current": 2249.87,
396
+ "min": 0.0,
397
+ "max": 0.0
398
+ },
399
+ {
400
+ "current": 2249.87,
401
+ "min": 0.0,
402
+ "max": 0.0
403
+ },
404
+ {
405
+ "current": 2249.87,
406
+ "min": 0.0,
407
+ "max": 0.0
408
+ },
409
+ {
410
+ "current": 2249.87,
411
+ "min": 0.0,
412
+ "max": 0.0
413
+ },
414
+ {
415
+ "current": 2249.87,
416
+ "min": 0.0,
417
+ "max": 0.0
418
+ },
419
+ {
420
+ "current": 2249.87,
421
+ "min": 0.0,
422
+ "max": 0.0
423
+ },
424
+ {
425
+ "current": 2249.87,
426
+ "min": 0.0,
427
+ "max": 0.0
428
+ },
429
+ {
430
+ "current": 2249.87,
431
+ "min": 0.0,
432
+ "max": 0.0
433
+ },
434
+ {
435
+ "current": 2249.87,
436
+ "min": 0.0,
437
+ "max": 0.0
438
+ },
439
+ {
440
+ "current": 2249.87,
441
+ "min": 0.0,
442
+ "max": 0.0
443
+ },
444
+ {
445
+ "current": 2249.87,
446
+ "min": 0.0,
447
+ "max": 0.0
448
+ },
449
+ {
450
+ "current": 2249.87,
451
+ "min": 0.0,
452
+ "max": 0.0
453
+ },
454
+ {
455
+ "current": 2249.87,
456
+ "min": 0.0,
457
+ "max": 0.0
458
+ },
459
+ {
460
+ "current": 2249.87,
461
+ "min": 0.0,
462
+ "max": 0.0
463
+ },
464
+ {
465
+ "current": 2249.87,
466
+ "min": 0.0,
467
+ "max": 0.0
468
+ },
469
+ {
470
+ "current": 2249.87,
471
+ "min": 0.0,
472
+ "max": 0.0
473
+ },
474
+ {
475
+ "current": 2249.87,
476
+ "min": 0.0,
477
+ "max": 0.0
478
+ },
479
+ {
480
+ "current": 2249.87,
481
+ "min": 0.0,
482
+ "max": 0.0
483
+ },
484
+ {
485
+ "current": 2249.87,
486
+ "min": 0.0,
487
+ "max": 0.0
488
+ },
489
+ {
490
+ "current": 2249.87,
491
+ "min": 0.0,
492
+ "max": 0.0
493
+ },
494
+ {
495
+ "current": 2249.87,
496
+ "min": 0.0,
497
+ "max": 0.0
498
+ },
499
+ {
500
+ "current": 2249.87,
501
+ "min": 0.0,
502
+ "max": 0.0
503
+ },
504
+ {
505
+ "current": 2249.87,
506
+ "min": 0.0,
507
+ "max": 0.0
508
+ },
509
+ {
510
+ "current": 2249.87,
511
+ "min": 0.0,
512
+ "max": 0.0
513
+ },
514
+ {
515
+ "current": 2249.87,
516
+ "min": 0.0,
517
+ "max": 0.0
518
+ },
519
+ {
520
+ "current": 2249.87,
521
+ "min": 0.0,
522
+ "max": 0.0
523
+ },
524
+ {
525
+ "current": 2249.87,
526
+ "min": 0.0,
527
+ "max": 0.0
528
+ }
529
+ ],
530
+ "disk": {
531
+ "/": {
532
+ "total": 200.0,
533
+ "used": 1.7721595764160156
534
+ }
535
+ },
536
+ "gpu": "NVIDIA A100 80GB PCIe",
537
+ "gpu_count": 1,
538
+ "gpu_devices": [
539
+ {
540
+ "name": "NVIDIA A100 80GB PCIe",
541
+ "memory_total": 85899345920
542
+ }
543
+ ],
544
+ "memory": {
545
+ "total": 668.8548545837402
546
+ }
547
+ }
checkpoints/wandb/run-20240830_195924-mao0tqjy/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+{"train/loss": 1.777546563744545, "train/grad_l2": 0.1973351389169693, "train/weights_l2": 11272.363778775605, "train/lr": 0.0020558542377918645, "train/seconds_per_step": 4.877207107543946, "_timestamp": 1725124200.3571296, "_runtime": 76236.1699206829, "_step": 15525}
checkpoints/wandb/run-20240830_195924-mao0tqjy/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/wandb/run-20240830_195924-mao0tqjy/logs/debug.log ADDED
@@ -0,0 +1,27 @@
+2024-08-30 19:59:24,178 INFO MainThread:29052 [wandb_setup.py:_flush():77] Current SDK version is 0.17.8
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Configure stats pid to 29052
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Loading settings from /workspace/nanoT5/outputs/2024-08-30/19-59-22/wandb/settings
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Applying setup settings: {'_disable_service': False}
+2024-08-30 19:59:24,179 WARNING MainThread:29052 [wandb_setup.py:_flush():77] Could not find program at -m nanoT5.main
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m nanoT5.main'}
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Applying login settings: {}
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:_log_setup():524] Logging user logs to /workspace/nanoT5/outputs/2024-08-30/19-59-22/wandb/run-20240830_195924-mao0tqjy/logs/debug.log
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:_log_setup():525] Logging internal logs to /workspace/nanoT5/outputs/2024-08-30/19-59-22/wandb/run-20240830_195924-mao0tqjy/logs/debug-internal.log
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:init():607] calling init triggers
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:init():614] wandb.init called with sweep_config: {}
+config: {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 34534, 'model': {'klass': 'hf_t5', 'name': 'pszemraj/tFINE-900m-e16-d32', 'overwrite': {'dropout_rate': 0.0}, 'checkpoint_path': '', 'random_init': False, 'compile': True}, 'tokenizer': {'name': 'BEE-spoke-data/slimpajama_tok-48128-BPE-forT5'}, 'data': {'input_length': 1024, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 16}, 'optim': {'name': 'adamwscale', 'base_lr': 0.01, 'batch_size': 128, 'total_steps': 20000, 'epochs': -1, 'warmup_steps': 5000, 'lr_scheduler': 'cosine', 'weight_decay': 0.0001, 'grad_clip': 1.0, 'grad_acc': 8, 'final_cosine': 2e-05}, 'eval': {'every_steps': 1000000000, 'steps': 500}, 'checkpoint': {'every_steps': 2500}, 'logging': {'use_wandb': True, 'wandb_config': {'project': 'nanoT5', 'entity': 'pszemraj', 'tags': ['900m', '1024'], 'mode': 'online'}, 'every_steps': 25, 'grad_l2': True, 'weights_l2': True}, 'slurm_id': 'none', 'working_dir': '/workspace/nanoT5/outputs/2024-08-30/19-59-22'}
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:init():657] starting backend
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:init():661] setting up manager
+2024-08-30 19:59:24,185 INFO MainThread:29052 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-08-30 19:59:24,187 INFO MainThread:29052 [wandb_init.py:init():669] backend started and connected
+2024-08-30 19:59:24,192 INFO MainThread:29052 [wandb_init.py:init():767] updated telemetry
+2024-08-30 19:59:24,198 INFO MainThread:29052 [wandb_init.py:init():800] communicating run to backend with 90.0 second timeout
+2024-08-30 19:59:24,583 INFO MainThread:29052 [wandb_init.py:init():851] starting run threads in backend
+2024-08-30 19:59:24,814 INFO MainThread:29052 [wandb_run.py:_console_start():2463] atexit reg
+2024-08-30 19:59:24,814 INFO MainThread:29052 [wandb_run.py:_redirect():2309] redirect: wrap_raw
+2024-08-30 19:59:24,814 INFO MainThread:29052 [wandb_run.py:_redirect():2374] Wrapping output streams.
+2024-08-30 19:59:24,815 INFO MainThread:29052 [wandb_run.py:_redirect():2399] Redirects installed.
+2024-08-30 19:59:24,818 INFO MainThread:29052 [wandb_init.py:init():894] run started, returning control to user process
+2024-08-30 19:59:44,796 INFO MainThread:29052 [wandb_run.py:_config_callback():1392] config_cb None None {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 34534, 'model': {'klass': 'hf_t5', 'name': 'pszemraj/tFINE-900m-e16-d32', 'overwrite': {'dropout_rate': 0.0}, 'checkpoint_path': '', 'random_init': False, 'compile': True}, 'tokenizer': {'name': 'BEE-spoke-data/slimpajama_tok-48128-BPE-forT5'}, 'data': {'input_length': 1024, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 16, 'before_mask_input_length': 1137, 'target_length': 229}, 'optim': {'name': 'adamwscale', 'base_lr': 0.01, 'batch_size': 128, 'total_steps': 20000, 'epochs': -1, 'warmup_steps': 5000, 'lr_scheduler': 'cosine', 'weight_decay': 0.0001, 'grad_clip': 1.0, 'grad_acc': 8, 'final_cosine': 2e-05}, 'eval': {'every_steps': 1000000000, 'steps': 500, 'corrected_steps': 500}, 'checkpoint': {'every_steps': 2500}, 'logging': {'use_wandb': True, 'wandb_config': {'project': 'nanoT5', 'entity': 'pszemraj', 'tags': ['900m', '1024'], 'mode': 'online'}, 'every_steps': 25, 'grad_l2': True, 'weights_l2': True}, 'slurm_id': 'none', 'working_dir': '/workspace/nanoT5/outputs/2024-08-30/19-59-22', 'n_all_param': 887492096}
checkpoints/wandb/run-20240830_195924-mao0tqjy/run-mao0tqjy.wandb ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:34c4d82ee5fa3daa21587d65efb7972e3a2447cce764ad1cd0eaec8aa61ffb19
+size 9030581