pszemraj committed on
Commit
4dfcb10
1 Parent(s): 9e7a8cf

Upload folder using huggingface_hub

Files changed (29)
  1. .gitattributes +1 -0
  2. checkpoints/.hydra/config.yaml +50 -0
  3. checkpoints/.hydra/hydra.yaml +154 -0
  4. checkpoints/.hydra/overrides.yaml +1 -0
  5. checkpoints/checkpoint-pt-10000/model.safetensors +3 -0
  6. checkpoints/checkpoint-pt-10000/random_states_0.pkl +3 -0
  7. checkpoints/checkpoint-pt-12500/model.safetensors +3 -0
  8. checkpoints/checkpoint-pt-12500/random_states_0.pkl +3 -0
  9. checkpoints/checkpoint-pt-15000/model.safetensors +3 -0
  10. checkpoints/checkpoint-pt-15000/random_states_0.pkl +3 -0
  11. checkpoints/checkpoint-pt-2500/model.safetensors +3 -0
  12. checkpoints/checkpoint-pt-2500/random_states_0.pkl +3 -0
  13. checkpoints/checkpoint-pt-5000/model.safetensors +3 -0
  14. checkpoints/checkpoint-pt-5000/random_states_0.pkl +3 -0
  15. checkpoints/checkpoint-pt-7500/model.safetensors +3 -0
  16. checkpoints/checkpoint-pt-7500/random_states_0.pkl +3 -0
  17. checkpoints/config.json +32 -0
  18. checkpoints/main.log +0 -0
  19. checkpoints/wandb/debug-internal.log +0 -0
  20. checkpoints/wandb/debug.log +27 -0
  21. checkpoints/wandb/run-20240830_195924-mao0tqjy/files/config.yaml +132 -0
  22. checkpoints/wandb/run-20240830_195924-mao0tqjy/files/diff.patch +163 -0
  23. checkpoints/wandb/run-20240830_195924-mao0tqjy/files/output.log +0 -0
  24. checkpoints/wandb/run-20240830_195924-mao0tqjy/files/requirements.txt +200 -0
  25. checkpoints/wandb/run-20240830_195924-mao0tqjy/files/wandb-metadata.json +547 -0
  26. checkpoints/wandb/run-20240830_195924-mao0tqjy/files/wandb-summary.json +1 -0
  27. checkpoints/wandb/run-20240830_195924-mao0tqjy/logs/debug-internal.log +0 -0
  28. checkpoints/wandb/run-20240830_195924-mao0tqjy/logs/debug.log +27 -0
  29. checkpoints/wandb/run-20240830_195924-mao0tqjy/run-mao0tqjy.wandb +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoints/wandb/run-20240830_195924-mao0tqjy/run-mao0tqjy.wandb filter=lfs diff=lfs merge=lfs -text
checkpoints/.hydra/config.yaml ADDED
@@ -0,0 +1,50 @@
+mode: pt
+device: gpu
+precision: bf16
+eval_only: false
+predict_only: false
+seed: 34534
+model:
+  klass: hf_t5
+  name: pszemraj/tFINE-900m-e16-d32
+  overwrite:
+    dropout_rate: 0.0
+  checkpoint_path: ''
+  random_init: false
+  compile: true
+tokenizer:
+  name: BEE-spoke-data/slimpajama_tok-48128-BPE-forT5
+data:
+  input_length: 1024
+  mlm_probability: 0.15
+  mean_noise_span_length: 3.0
+  num_workers: 16
+optim:
+  name: adamwscale
+  base_lr: 0.01
+  batch_size: 128
+  total_steps: 20000
+  epochs: -1
+  warmup_steps: 5000
+  lr_scheduler: cosine
+  weight_decay: 0.0001
+  grad_clip: 1.0
+  grad_acc: 8
+  final_cosine: 2.0e-05
+eval:
+  every_steps: 1000000000
+  steps: 500
+checkpoint:
+  every_steps: 2500
+logging:
+  use_wandb: true
+  wandb_config:
+    project: nanoT5
+    entity: pszemraj
+    tags:
+    - 900m
+    - '1024'
+    mode: online
+  every_steps: 25
+  grad_l2: true
+  weights_l2: true
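For reference, the saved config above can be loaded back with OmegaConf (hydra-core 1.3.2 and omegaconf 2.3.0 are pinned in this run's requirements.txt below). A minimal sketch, assuming the repository has been cloned so the file sits at checkpoints/.hydra/config.yaml:

# Minimal sketch: reload the run config and inspect a few hyperparameters.
# The path is assumed from this repo's layout; values in comments come from
# the YAML above.
from omegaconf import OmegaConf

cfg = OmegaConf.load("checkpoints/.hydra/config.yaml")
print(cfg.model.name)         # pszemraj/tFINE-900m-e16-d32
print(cfg.optim.total_steps)  # 20000
print(cfg.optim.warmup_steps / cfg.optim.total_steps)  # 0.25 warmup fraction
print(OmegaConf.to_yaml(cfg.optim))  # dump one subtree back out as YAML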
checkpoints/.hydra/hydra.yaml ADDED
@@ -0,0 +1,154 @@
+hydra:
+  run:
+    dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+  sweep:
+    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    subdir: ${hydra.job.num}
+  launcher:
+    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+  sweeper:
+    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+    max_batch_size: null
+    params: null
+  help:
+    app_name: ${hydra.job.name}
+    header: '${hydra.help.app_name} is powered by Hydra.
+
+      '
+    footer: 'Powered by Hydra (https://hydra.cc)
+
+      Use --hydra-help to view Hydra specific help
+
+      '
+    template: '${hydra.help.header}
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (group=option)
+
+
+      $APP_CONFIG_GROUPS
+
+
+      == Config ==
+
+      Override anything in the config (foo.bar=value)
+
+
+      $CONFIG
+
+
+      ${hydra.help.footer}
+
+      '
+  hydra_help:
+    template: 'Hydra (${hydra.runtime.version})
+
+      See https://hydra.cc for more info.
+
+
+      == Flags ==
+
+      $FLAGS_HELP
+
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+      to command line)
+
+
+      $HYDRA_CONFIG_GROUPS
+
+
+      Use ''--cfg hydra'' to Show the Hydra config.
+
+      '
+    hydra_help: ???
+  hydra_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][HYDRA] %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+    root:
+      level: INFO
+      handlers:
+      - console
+    loggers:
+      logging_example:
+        level: DEBUG
+    disable_existing_loggers: false
+  job_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+      file:
+        class: logging.FileHandler
+        formatter: simple
+        filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+    root:
+      level: INFO
+      handlers:
+      - console
+      - file
+    disable_existing_loggers: false
+  env: {}
+  mode: RUN
+  searchpath: []
+  callbacks: {}
+  output_subdir: .hydra
+  overrides:
+    hydra:
+    - hydra.mode=RUN
+    task: []
+  job:
+    name: main
+    chdir: null
+    override_dirname: ''
+    id: ???
+    num: ???
+    config_name: default
+    env_set: {}
+    env_copy: []
+    config:
+      override_dirname:
+        kv_sep: '='
+        item_sep: ','
+        exclude_keys: []
+  runtime:
+    version: 1.3.2
+    version_base: '1.1'
+    cwd: /workspace/nanoT5
+    config_sources:
+    - path: hydra.conf
+      schema: pkg
+      provider: hydra
+    - path: /workspace/nanoT5/nanoT5/configs
+      schema: file
+      provider: main
+    - path: ''
+      schema: structured
+      provider: schema
+    output_dir: /workspace/nanoT5/outputs/2024-08-30/19-59-22
+  choices:
+    hydra/env: default
+    hydra/callbacks: null
+    hydra/job_logging: default
+    hydra/hydra_logging: default
+    hydra/hydra_help: default
+    hydra/help: default
+    hydra/sweeper: basic
+    hydra/launcher: basic
+    hydra/output: default
+  verbose: false
checkpoints/.hydra/overrides.yaml ADDED
@@ -0,0 +1 @@
+[]
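The empty list records that the run was launched with no command-line overrides: every value in config.yaml above came from the patched default.yaml (see diff.patch below). Had overrides been passed, Hydra would list them here verbatim; for example, a run started with optim.base_lr=0.005 on the command line would record "- optim.base_lr=0.005" here (hypothetical illustration).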
checkpoints/checkpoint-pt-10000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b04238335d2e95c6bfa1c92a501bef0bf99434e8e3475d41216cdc74d3d7a76
+size 3550041880
checkpoints/checkpoint-pt-10000/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3b2ce7e08081f282216ae6d35b3e3e08ec13a6874529efce924280e2d044c09
+size 14344
checkpoints/checkpoint-pt-12500/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f034c61cb3eac83e1c6a7ea881e34d255ffba59e0ca7746df690bba59229a687
+size 3550041880
checkpoints/checkpoint-pt-12500/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3b2ce7e08081f282216ae6d35b3e3e08ec13a6874529efce924280e2d044c09
+size 14344
checkpoints/checkpoint-pt-15000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f5cd29ef789593d59329578d0b2f454238da7789c2b5a9bee1c3d139c64a5e2
+size 3550041880
checkpoints/checkpoint-pt-15000/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3b2ce7e08081f282216ae6d35b3e3e08ec13a6874529efce924280e2d044c09
+size 14344
checkpoints/checkpoint-pt-2500/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e6fb5ed5e9042405a8d4d053759f532abe0d167456785d277c71e18fa74c29a4
+size 3550041880
checkpoints/checkpoint-pt-2500/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3b2ce7e08081f282216ae6d35b3e3e08ec13a6874529efce924280e2d044c09
+size 14344
checkpoints/checkpoint-pt-5000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:48e5e3d34b17f5fb15734e0dc5f17878d3cf58a9b748b31341c993f3e5e94f3e
+size 3550041880
checkpoints/checkpoint-pt-5000/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3b2ce7e08081f282216ae6d35b3e3e08ec13a6874529efce924280e2d044c09
+size 14344
checkpoints/checkpoint-pt-7500/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c720f1fc7d3a669689be8dcae3d1137518b37665abda21edf4b547a0e7b1abe4
+size 3550041880
checkpoints/checkpoint-pt-7500/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3b2ce7e08081f282216ae6d35b3e3e08ec13a6874529efce924280e2d044c09
+size 14344
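Each entry above is a Git LFS pointer: the repository stores only a SHA-256 oid and a byte size, and the real file is fetched on demand. Note that every random_states_0.pkl resolves to the same oid, so the saved RNG snapshot is byte-identical across all six checkpoints. A minimal sketch for verifying a materialized file against its pointer, assuming the weights have been pulled (e.g. via git lfs pull) into the paths shown above:

# Minimal sketch: hash a downloaded checkpoint and compare it to the oid in
# its LFS pointer. Pure stdlib; path and oid are taken from the pointer for
# checkpoint-pt-10000 above.
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk_size):
            h.update(block)
    return h.hexdigest()

digest = sha256_of("checkpoints/checkpoint-pt-10000/model.safetensors")
assert digest == "3b04238335d2e95c6bfa1c92a501bef0bf99434e8e3475d41216cdc74d3d7a76"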
checkpoints/config.json ADDED
@@ -0,0 +1,32 @@
+{
+  "_name_or_path": "pszemraj/tFINE-900m-e16-d32",
+  "architectures": [
+    "T5ForConditionalGeneration"
+  ],
+  "classifier_dropout": 0.0,
+  "d_ff": 3072,
+  "d_kv": 64,
+  "d_model": 1024,
+  "decoder_start_token_id": 3,
+  "dense_act_fn": "silu",
+  "dropout_rate": 0.0,
+  "eos_token_id": 2,
+  "feed_forward_proj": "gated-silu",
+  "initializer_factor": 1.0,
+  "is_bf16": true,
+  "is_encoder_decoder": false,
+  "is_gated_act": true,
+  "layer_norm_epsilon": 1e-06,
+  "model_type": "t5",
+  "num_decoder_layers": 32,
+  "num_heads": 16,
+  "num_layers": 16,
+  "output_past": true,
+  "pad_token_id": 3,
+  "relative_attention_max_distance": 128,
+  "relative_attention_num_buckets": 48,
+  "tie_word_embeddings": false,
+  "transformers_version": "4.44.2",
+  "use_cache": true,
+  "vocab_size": 48256
+}
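Since each checkpoint directory above holds only model.safetensors, this config.json (one level up) is what pairs with the raw weights. A minimal sketch for loading an intermediate checkpoint with transformers 4.44.2 and safetensors 0.4.4 (both pinned below), assuming the LFS files are materialized; the key-prefix caveat in the comments is an assumption about torch.compile'd checkpoints, not something this repo documents:

# Minimal sketch (assumed layout): pair checkpoints/config.json with one of
# the weight-only checkpoint directories above.
from safetensors.torch import load_file
from transformers import T5Config, T5ForConditionalGeneration

config = T5Config.from_pretrained("checkpoints")  # reads checkpoints/config.json
model = T5ForConditionalGeneration(config)        # ~887M params (n_all_param: 887492096)

state = load_file("checkpoints/checkpoint-pt-15000/model.safetensors")
# Checkpoints saved from a torch.compile'd model can carry an "_orig_mod."
# key prefix; strip it if load_state_dict reports every key as unexpected.
state = {k.removeprefix("_orig_mod."): v for k, v in state.items()}
missing, unexpected = model.load_state_dict(state, strict=False)
print(len(missing), len(unexpected))  # inspect these before trusting the load
model.eval()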
checkpoints/main.log ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/wandb/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/wandb/debug.log ADDED
@@ -0,0 +1,27 @@
+2024-08-30 19:59:24,178 INFO MainThread:29052 [wandb_setup.py:_flush():77] Current SDK version is 0.17.8
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Configure stats pid to 29052
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Loading settings from /workspace/nanoT5/outputs/2024-08-30/19-59-22/wandb/settings
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Applying setup settings: {'_disable_service': False}
+2024-08-30 19:59:24,179 WARNING MainThread:29052 [wandb_setup.py:_flush():77] Could not find program at -m nanoT5.main
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m nanoT5.main'}
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Applying login settings: {}
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:_log_setup():524] Logging user logs to /workspace/nanoT5/outputs/2024-08-30/19-59-22/wandb/run-20240830_195924-mao0tqjy/logs/debug.log
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:_log_setup():525] Logging internal logs to /workspace/nanoT5/outputs/2024-08-30/19-59-22/wandb/run-20240830_195924-mao0tqjy/logs/debug-internal.log
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:init():607] calling init triggers
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:init():614] wandb.init called with sweep_config: {}
+config: {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 34534, 'model': {'klass': 'hf_t5', 'name': 'pszemraj/tFINE-900m-e16-d32', 'overwrite': {'dropout_rate': 0.0}, 'checkpoint_path': '', 'random_init': False, 'compile': True}, 'tokenizer': {'name': 'BEE-spoke-data/slimpajama_tok-48128-BPE-forT5'}, 'data': {'input_length': 1024, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 16}, 'optim': {'name': 'adamwscale', 'base_lr': 0.01, 'batch_size': 128, 'total_steps': 20000, 'epochs': -1, 'warmup_steps': 5000, 'lr_scheduler': 'cosine', 'weight_decay': 0.0001, 'grad_clip': 1.0, 'grad_acc': 8, 'final_cosine': 2e-05}, 'eval': {'every_steps': 1000000000, 'steps': 500}, 'checkpoint': {'every_steps': 2500}, 'logging': {'use_wandb': True, 'wandb_config': {'project': 'nanoT5', 'entity': 'pszemraj', 'tags': ['900m', '1024'], 'mode': 'online'}, 'every_steps': 25, 'grad_l2': True, 'weights_l2': True}, 'slurm_id': 'none', 'working_dir': '/workspace/nanoT5/outputs/2024-08-30/19-59-22'}
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:init():657] starting backend
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:init():661] setting up manager
+2024-08-30 19:59:24,185 INFO MainThread:29052 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-08-30 19:59:24,187 INFO MainThread:29052 [wandb_init.py:init():669] backend started and connected
+2024-08-30 19:59:24,192 INFO MainThread:29052 [wandb_init.py:init():767] updated telemetry
+2024-08-30 19:59:24,198 INFO MainThread:29052 [wandb_init.py:init():800] communicating run to backend with 90.0 second timeout
+2024-08-30 19:59:24,583 INFO MainThread:29052 [wandb_init.py:init():851] starting run threads in backend
+2024-08-30 19:59:24,814 INFO MainThread:29052 [wandb_run.py:_console_start():2463] atexit reg
+2024-08-30 19:59:24,814 INFO MainThread:29052 [wandb_run.py:_redirect():2309] redirect: wrap_raw
+2024-08-30 19:59:24,814 INFO MainThread:29052 [wandb_run.py:_redirect():2374] Wrapping output streams.
+2024-08-30 19:59:24,815 INFO MainThread:29052 [wandb_run.py:_redirect():2399] Redirects installed.
+2024-08-30 19:59:24,818 INFO MainThread:29052 [wandb_init.py:init():894] run started, returning control to user process
+2024-08-30 19:59:44,796 INFO MainThread:29052 [wandb_run.py:_config_callback():1392] config_cb None None {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 34534, 'model': {'klass': 'hf_t5', 'name': 'pszemraj/tFINE-900m-e16-d32', 'overwrite': {'dropout_rate': 0.0}, 'checkpoint_path': '', 'random_init': False, 'compile': True}, 'tokenizer': {'name': 'BEE-spoke-data/slimpajama_tok-48128-BPE-forT5'}, 'data': {'input_length': 1024, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 16, 'before_mask_input_length': 1137, 'target_length': 229}, 'optim': {'name': 'adamwscale', 'base_lr': 0.01, 'batch_size': 128, 'total_steps': 20000, 'epochs': -1, 'warmup_steps': 5000, 'lr_scheduler': 'cosine', 'weight_decay': 0.0001, 'grad_clip': 1.0, 'grad_acc': 8, 'final_cosine': 2e-05}, 'eval': {'every_steps': 1000000000, 'steps': 500, 'corrected_steps': 500}, 'checkpoint': {'every_steps': 2500}, 'logging': {'use_wandb': True, 'wandb_config': {'project': 'nanoT5', 'entity': 'pszemraj', 'tags': ['900m', '1024'], 'mode': 'online'}, 'every_steps': 25, 'grad_l2': True, 'weights_l2': True}, 'slurm_id': 'none', 'working_dir': '/workspace/nanoT5/outputs/2024-08-30/19-59-22', 'n_all_param': 887492096}
checkpoints/wandb/run-20240830_195924-mao0tqjy/files/config.yaml ADDED
@@ -0,0 +1,132 @@
+wandb_version: 1
+
+mode:
+  desc: null
+  value: pt
+device:
+  desc: null
+  value: gpu
+precision:
+  desc: null
+  value: bf16
+eval_only:
+  desc: null
+  value: false
+predict_only:
+  desc: null
+  value: false
+seed:
+  desc: null
+  value: 34534
+model:
+  desc: null
+  value:
+    klass: hf_t5
+    name: pszemraj/tFINE-900m-e16-d32
+    overwrite:
+      dropout_rate: 0.0
+    checkpoint_path: ''
+    random_init: false
+    compile: true
+tokenizer:
+  desc: null
+  value:
+    name: BEE-spoke-data/slimpajama_tok-48128-BPE-forT5
+data:
+  desc: null
+  value:
+    input_length: 1024
+    mlm_probability: 0.15
+    mean_noise_span_length: 3.0
+    num_workers: 16
+    before_mask_input_length: 1137
+    target_length: 229
+optim:
+  desc: null
+  value:
+    name: adamwscale
+    base_lr: 0.01
+    batch_size: 128
+    total_steps: 20000
+    epochs: -1
+    warmup_steps: 5000
+    lr_scheduler: cosine
+    weight_decay: 0.0001
+    grad_clip: 1.0
+    grad_acc: 8
+    final_cosine: 2.0e-05
+eval:
+  desc: null
+  value:
+    every_steps: 1000000000
+    steps: 500
+    corrected_steps: 500
+checkpoint:
+  desc: null
+  value:
+    every_steps: 2500
+logging:
+  desc: null
+  value:
+    use_wandb: true
+    wandb_config:
+      project: nanoT5
+      entity: pszemraj
+      tags:
+      - 900m
+      - '1024'
+      mode: online
+    every_steps: 25
+    grad_l2: true
+    weights_l2: true
+slurm_id:
+  desc: null
+  value: none
+working_dir:
+  desc: null
+  value: /workspace/nanoT5/outputs/2024-08-30/19-59-22
+_wandb:
+  desc: null
+  value:
+    python_version: 3.11.9
+    cli_version: 0.17.8
+    framework: huggingface
+    huggingface_version: 4.44.2
+    is_jupyter_run: false
+    is_kaggle_kernel: false
+    start_time: 1725047964
+    t:
+      1:
+      - 1
+      - 11
+      - 41
+      - 49
+      - 50
+      - 51
+      - 55
+      - 71
+      - 100
+      2:
+      - 1
+      - 11
+      - 41
+      - 49
+      - 50
+      - 51
+      - 55
+      - 71
+      - 100
+      3:
+      - 15
+      - 16
+      - 23
+      - 61
+      4: 3.11.9
+      5: 0.17.8
+      6: 4.44.2
+      8:
+      - 5
+      13: linux-x86_64
+n_all_param:
+  desc: null
+  value: 887492096
checkpoints/wandb/run-20240830_195924-mao0tqjy/files/diff.patch ADDED
@@ -0,0 +1,163 @@
+diff --git a/nanoT5/configs/default.yaml b/nanoT5/configs/default.yaml
+index 6e10cc4..240ff3d 100644
+--- a/nanoT5/configs/default.yaml
++++ b/nanoT5/configs/default.yaml
+@@ -1,65 +1,51 @@
+-defaults:
+-  - _self_
+-  - task: pt
+-  - local_env: default
+-
+-# Experiment args
+-mode: 'pt'
++mode: pt
+ device: gpu
+-precision: 'bf16'
++precision: bf16
+ eval_only: false
+ predict_only: false
+-seed: 2137
++seed: 34534
+
+ model:
+-  klass: local_t5
+-  name: 'google/t5-v1_1-base'
+-  overwrite: # overwrite config with these values
+-    dropout_rate: 0.0
+-  add_config: # add these values to the config
+-    is_bf16: false
+-  checkpoint_path: ''
+-  random_init: true
+-  compile: true # Pytorch 2.0
+-
++  klass: hf_t5
++  name: pszemraj/tFINE-900m-e16-d32
++  overwrite:
++    dropout_rate: 0.0
++# add_config:
++#   is_bf16: false
++  checkpoint_path: ''
++  random_init: false
++  compile: true
++tokenizer:
++  name: BEE-spoke-data/slimpajama_tok-48128-BPE-forT5
+ data:
+-  input_length: 512
+-  mlm_probability: 0.15
+-  mean_noise_span_length: 3.0
+-  num_workers: 8
+-
++  input_length: 1024
++  mlm_probability: 0.15
++  mean_noise_span_length: 3.0
++  num_workers: 16
+ optim:
+-  name: adamwscale
+-  base_lr: 2e-2
+-  batch_size: 128
+-  total_steps: 65536
+-  epochs: -1 # If it's > 0 it overwrites total_steps
+-  warmup_steps: 10000
+-  lr_scheduler: cosine
+-  weight_decay: 0.0
+-  grad_clip: 1.0
+-  grad_acc: 1
+-  final_cosine: 1e-5
+-
++  name: adamwscale
++  base_lr: 0.01
++  batch_size: 128
++  total_steps: 20000
++  epochs: -1
++  warmup_steps: 5000
++  lr_scheduler: cosine
++  weight_decay: 0.0001
++  grad_clip: 1.0
++  grad_acc: 8
++  final_cosine: 2.0e-05
+ eval:
+-  every_steps: 100000 # Eval once in the end
+-  steps: 500
+-
++  every_steps: 1000000000
++  steps: 500
+ checkpoint:
+-  every_steps: 100000 # Save checkpoint once in the end
+-
++  every_steps: 2500
+ logging:
+-  every_steps: 100
+-  grad_l2: true
+-  weights_l2: true
+-  use_wandb: false
+-  # Can remove or comment out the below if not using Weights & Biases
+-  wandb_config:
+-    project: nanoT5
+-    entity: 'your_wandb_username'
+-    tags: ['nanoT5', 'my_tag']
+-    mode: 'online'
+-
+-hydra:
+-  job:
+-    chdir: True
++  use_wandb: true
++  wandb_config:
++    project: nanoT5
++    entity: 'pszemraj'
++    tags: ['900m', '1024',]
++    mode: 'online'
++  every_steps: 25
++  grad_l2: true
++  weights_l2: true
+diff --git a/nanoT5/main.py b/nanoT5/main.py
+index 12dfbae..c4ba985 100644
+--- a/nanoT5/main.py
++++ b/nanoT5/main.py
+@@ -19,6 +19,40 @@ from .utils import (
+     train,
+ )
+
++# >>> DYNAMO UPDATES
++
++# Torch compile arguments
++torch_compile_arguments = [
++    "config.dce = True",
++    "config.memory_planning = True",
++    "config.memory_pool = 'combined'",
++    "config.coordinate_descent_tuning = True",
++    "config.max_autotune_gemm = False", # GEMM is unnecessary
++    "config.autotune_multi_device = False",
++    "config.max_autotune_gemm_backends = 'ATEN'", # Not much faster
++    "config.aggressive_fusion = False", # Careful changes results!
++    "config.cuda.enable_cuda_lto = True",
++    "config.cuda.use_fast_math = True",
++    "config.cuda.compile_opt_level = '-O3'",
++]
++# Torch dynamo arguments
++torch_dynamo_arguments = [
++    "config.accumulated_cache_size_limit = 1024", # Bump up a bit from 256
++    "config.suppress_errors = True", # Supress errors for now
++    "config.do_not_emit_runtime_asserts = True",
++]
++import torch._inductor.config as config
++for _try_compile_argument in torch_compile_arguments:
++    try: exec(_try_compile_argument)
++    except: pass
++pass
++import torch._dynamo.config as config
++for _try_dynamo_argument in torch_dynamo_arguments:
++    try: exec(_try_dynamo_argument)
++    except: pass
++pass
++
++# >>> DYNAMO UPDATES
+
+ @hydra.main(config_path="configs", config_name="default", version_base="1.1")
+ def main(args):
+@@ -83,4 +117,4 @@ def main(args):
+
+
+ if __name__ == "__main__":
+-    main()
++    main()
+
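The exec-based block added to nanoT5/main.py above sets TorchInductor and TorchDynamo options defensively: each assignment is a string run under try/except, so a flag missing from the installed torch (2.4.0 here) is silently skipped rather than crashing the run. A sketch of the same defensive idea without exec, using only flags named in the patch (availability of each flag still varies by torch version):

# Sketch: set an inductor/dynamo config attribute only if this torch build
# exposes it; nested names such as "cuda.use_fast_math" are walked
# attribute by attribute.
import torch._dynamo.config as dynamo_config
import torch._inductor.config as inductor_config

def set_if_present(cfg, dotted_name: str, value) -> None:
    *parents, leaf = dotted_name.split(".")
    for part in parents:
        cfg = getattr(cfg, part, None)
        if cfg is None:
            return
    if hasattr(cfg, leaf):
        setattr(cfg, leaf, value)

set_if_present(inductor_config, "coordinate_descent_tuning", True)
set_if_present(inductor_config, "cuda.use_fast_math", True)
set_if_present(dynamo_config, "accumulated_cache_size_limit", 1024)
set_if_present(dynamo_config, "suppress_errors", True)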
checkpoints/wandb/run-20240830_195924-mao0tqjy/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/wandb/run-20240830_195924-mao0tqjy/files/requirements.txt ADDED
@@ -0,0 +1,200 @@
+GitPython==3.1.43
+Jinja2==3.1.4
+MarkupSafe==2.1.5
+PyGObject==3.42.1
+PyJWT==2.3.0
+PyYAML==5.4.1
+PyYAML==6.0.2
+Pygments==2.11.2
+Pygments==2.18.0
+SecretStorage==3.3.1
+Send2Trash==1.8.3
+absl-py==2.1.0
+accelerate==0.33.0
+aiohappyeyeballs==2.4.0
+aiohttp==3.10.5
+aiosignal==1.3.1
+antlr4-python3-runtime==4.9.3
+anyio==4.4.0
+argon2-cffi-bindings==21.2.0
+argon2-cffi==23.1.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beautifulsoup4==4.12.3
+bleach==6.1.0
+blessed==1.20.0
+blinker==1.4
+certifi==2024.7.4
+cffi==1.17.0
+charset-normalizer==3.3.2
+click==8.1.7
+comm==0.2.2
+cryptography==3.4.8
+datasets==2.21.0
+dbus-python==1.2.18
+debugpy==1.8.5
+decorator==5.1.1
+defusedxml==0.7.1
+dill==0.3.8
+distro==1.7.0
+docker-pycreds==0.4.0
+entrypoints==0.4
+evaluate==0.4.2
+executing==2.0.1
+fancycompleter==0.9.1
+fastjsonschema==2.20.0
+filelock==3.15.4
+fire==0.6.0
+fqdn==1.5.1
+frozenlist==1.4.1
+fsspec==2024.6.1
+gitdb==4.0.11
+gpustat==1.1.1
+h11==0.14.0
+httpcore==1.0.5
+httplib2==0.20.2
+httpx==0.27.0
+huggingface-hub==0.24.6
+hydra-core==1.3.2
+idna==3.7
+importlib-metadata==4.6.4
+ipykernel==6.29.5
+ipython-genutils==0.2.0
+ipython==8.26.0
+ipywidgets==8.1.3
+isoduration==20.11.0
+jedi==0.19.1
+jeepney==0.7.1
+joblib==1.4.2
+json5==0.9.25
+jsonpointer==3.0.0
+jsonschema-specifications==2023.12.1
+jsonschema==4.23.0
+jupyter-archive==3.4.0
+jupyter-events==0.10.0
+jupyter-highlight-selected-word==0.2.0
+jupyter-lsp==2.2.5
+jupyter_client==7.4.9
+jupyter_contrib_core==0.4.2
+jupyter_contrib_nbextensions==0.7.0
+jupyter_core==5.7.2
+jupyter_nbextensions_configurator==0.6.4
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.4
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.11
+keyring==23.5.0
+launchpadlib==1.10.16
+lazr.restfulclient==0.14.4
+lazr.uri==1.0.6
+lxml==5.3.0
+matplotlib-inline==0.1.7
+mistune==3.0.2
+more-itertools==8.10.0
+mpmath==1.3.0
+multidict==6.0.5
+multiprocess==0.70.16
+nbclassic==1.1.0
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+networkx==3.3
+ninja==1.11.1.1
+nltk==3.9.1
+notebook==6.5.5
+notebook_shim==0.2.4
+numpy==1.26.4
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-ml-py==12.560.30
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.6.20
+nvidia-nvtx-cu12==12.1.105
+oauthlib==3.2.0
+omegaconf==2.3.0
+overrides==7.7.0
+packaging==24.1
+pandas==2.2.2
+pandocfilters==1.5.1
+parso==0.8.4
+pdbpp==0.10.3
+pexpect==4.9.0
+pillow==10.4.0
+pip==24.2
+platformdirs==4.2.2
+prometheus_client==0.20.0
+prompt_toolkit==3.0.47
+protobuf==3.20.3
+psutil==6.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pyarrow==17.0.0
+pycparser==2.22
+pynvml==11.5.3
+pyparsing==2.4.7
+pyrepl==0.9.0
+python-apt==2.4.0+ubuntu3
+python-dateutil==2.9.0.post0
+python-json-logger==2.0.7
+pytz==2024.1
+pyzmq==24.0.1
+referencing==0.35.1
+regex==2024.7.24
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rouge_score==0.1.2
+rpds-py==0.20.0
+safetensors==0.4.4
+sentencepiece==0.2.0
+sentry-sdk==2.13.0
+setproctitle==1.3.3
+setuptools==73.0.1
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.1
+soupsieve==2.6
+stack-data==0.6.3
+sympy==1.13.2
+termcolor==2.4.0
+terminado==0.18.1
+tinycss2==1.3.0
+tokenizers==0.19.1
+torch==2.4.0
+torchaudio==2.4.0
+torchvision==0.19.0
+tornado==6.4.1
+tqdm==4.66.5
+traitlets==5.14.3
+transformers==4.44.2
+triton==3.0.0
+types-python-dateutil==2.9.0.20240821
+typing_extensions==4.12.2
+tzdata==2024.1
+uri-template==1.3.0
+urllib3==2.2.2
+wadllib==1.3.6
+wandb==0.17.8
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+websocket-client==1.8.0
+wheel==0.44.0
+widgetsnbextension==4.0.11
+wmctrl==0.5
+xxhash==3.5.0
+yarl==1.9.4
+zipp==1.0.0
@@ -0,0 +1,547 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.5.0-41-generic-x86_64-with-glibc2.35",
3
+ "python": "3.11.9",
4
+ "heartbeatAt": "2024-08-30T19:59:24.641329",
5
+ "startedAt": "2024-08-30T19:59:24.177472",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [],
9
+ "state": "running",
10
+ "program": "-m nanoT5.main",
11
+ "codePathLocal": null,
12
+ "git": {
13
+ "remote": "https://github.com/pszemraj/nanoT5.git",
14
+ "commit": "58834d398cca39b8344c83490f8b1bec71116423"
15
+ },
16
+ "email": null,
17
+ "root": "/workspace/nanoT5",
18
+ "host": "cf696b887dc2",
19
+ "username": "root",
20
+ "executable": "/usr/bin/python",
21
+ "cpu_count": 100,
22
+ "cpu_count_logical": 100,
23
+ "cpu_freq": {
24
+ "current": 2249.869999999997,
25
+ "min": 0.0,
26
+ "max": 0.0
27
+ },
28
+ "cpu_freq_per_core": [
29
+ {
30
+ "current": 2249.87,
31
+ "min": 0.0,
32
+ "max": 0.0
33
+ },
34
+ {
35
+ "current": 2249.87,
36
+ "min": 0.0,
37
+ "max": 0.0
38
+ },
39
+ {
40
+ "current": 2249.87,
41
+ "min": 0.0,
42
+ "max": 0.0
43
+ },
44
+ {
45
+ "current": 2249.87,
46
+ "min": 0.0,
47
+ "max": 0.0
48
+ },
49
+ {
50
+ "current": 2249.87,
51
+ "min": 0.0,
52
+ "max": 0.0
53
+ },
54
+ {
55
+ "current": 2249.87,
56
+ "min": 0.0,
57
+ "max": 0.0
58
+ },
59
+ {
60
+ "current": 2249.87,
61
+ "min": 0.0,
62
+ "max": 0.0
63
+ },
64
+ {
65
+ "current": 2249.87,
66
+ "min": 0.0,
67
+ "max": 0.0
68
+ },
69
+ {
70
+ "current": 2249.87,
71
+ "min": 0.0,
72
+ "max": 0.0
73
+ },
74
+ {
75
+ "current": 2249.87,
76
+ "min": 0.0,
77
+ "max": 0.0
78
+ },
79
+ {
80
+ "current": 2249.87,
81
+ "min": 0.0,
82
+ "max": 0.0
83
+ },
84
+ {
85
+ "current": 2249.87,
86
+ "min": 0.0,
87
+ "max": 0.0
88
+ },
89
+ {
90
+ "current": 2249.87,
91
+ "min": 0.0,
92
+ "max": 0.0
93
+ },
94
+ {
95
+ "current": 2249.87,
96
+ "min": 0.0,
97
+ "max": 0.0
98
+ },
99
+ {
100
+ "current": 2249.87,
101
+ "min": 0.0,
102
+ "max": 0.0
103
+ },
104
+ {
105
+ "current": 2249.87,
106
+ "min": 0.0,
107
+ "max": 0.0
108
+ },
109
+ {
110
+ "current": 2249.87,
111
+ "min": 0.0,
112
+ "max": 0.0
113
+ },
114
+ {
115
+ "current": 2249.87,
116
+ "min": 0.0,
117
+ "max": 0.0
118
+ },
119
+ {
120
+ "current": 2249.87,
121
+ "min": 0.0,
122
+ "max": 0.0
123
+ },
124
+ {
125
+ "current": 2249.87,
126
+ "min": 0.0,
127
+ "max": 0.0
128
+ },
129
+ {
130
+ "current": 2249.87,
131
+ "min": 0.0,
132
+ "max": 0.0
133
+ },
134
+ {
135
+ "current": 2249.87,
136
+ "min": 0.0,
137
+ "max": 0.0
138
+ },
139
+ {
140
+ "current": 2249.87,
141
+ "min": 0.0,
142
+ "max": 0.0
143
+ },
144
+ {
145
+ "current": 2249.87,
146
+ "min": 0.0,
147
+ "max": 0.0
148
+ },
149
+ {
150
+ "current": 2249.87,
151
+ "min": 0.0,
152
+ "max": 0.0
153
+ },
154
+ {
155
+ "current": 2249.87,
156
+ "min": 0.0,
157
+ "max": 0.0
158
+ },
159
+ {
160
+ "current": 2249.87,
161
+ "min": 0.0,
162
+ "max": 0.0
163
+ },
164
+ {
165
+ "current": 2249.87,
166
+ "min": 0.0,
167
+ "max": 0.0
168
+ },
169
+ {
170
+ "current": 2249.87,
171
+ "min": 0.0,
172
+ "max": 0.0
173
+ },
174
+ {
175
+ "current": 2249.87,
176
+ "min": 0.0,
177
+ "max": 0.0
178
+ },
179
+ {
180
+ "current": 2249.87,
181
+ "min": 0.0,
182
+ "max": 0.0
183
+ },
184
+ {
185
+ "current": 2249.87,
186
+ "min": 0.0,
187
+ "max": 0.0
188
+ },
189
+ {
190
+ "current": 2249.87,
191
+ "min": 0.0,
192
+ "max": 0.0
193
+ },
194
+ {
195
+ "current": 2249.87,
196
+ "min": 0.0,
197
+ "max": 0.0
198
+ },
199
+ {
200
+ "current": 2249.87,
201
+ "min": 0.0,
202
+ "max": 0.0
203
+ },
204
+ {
205
+ "current": 2249.87,
206
+ "min": 0.0,
207
+ "max": 0.0
208
+ },
209
+ {
210
+ "current": 2249.87,
211
+ "min": 0.0,
212
+ "max": 0.0
213
+ },
214
+ {
215
+ "current": 2249.87,
216
+ "min": 0.0,
217
+ "max": 0.0
218
+ },
219
+ {
220
+ "current": 2249.87,
221
+ "min": 0.0,
222
+ "max": 0.0
223
+ },
224
+ {
225
+ "current": 2249.87,
226
+ "min": 0.0,
227
+ "max": 0.0
228
+ },
229
+ {
230
+ "current": 2249.87,
231
+ "min": 0.0,
232
+ "max": 0.0
233
+ },
234
+ {
235
+ "current": 2249.87,
236
+ "min": 0.0,
237
+ "max": 0.0
238
+ },
239
+ {
240
+ "current": 2249.87,
241
+ "min": 0.0,
242
+ "max": 0.0
243
+ },
244
+ {
245
+ "current": 2249.87,
246
+ "min": 0.0,
247
+ "max": 0.0
248
+ },
249
+ {
250
+ "current": 2249.87,
251
+ "min": 0.0,
252
+ "max": 0.0
253
+ },
254
+ {
255
+ "current": 2249.87,
256
+ "min": 0.0,
257
+ "max": 0.0
258
+ },
259
+ {
260
+ "current": 2249.87,
261
+ "min": 0.0,
262
+ "max": 0.0
263
+ },
264
+ {
265
+ "current": 2249.87,
266
+ "min": 0.0,
267
+ "max": 0.0
268
+ },
269
+ {
270
+ "current": 2249.87,
271
+ "min": 0.0,
272
+ "max": 0.0
273
+ },
274
+ {
275
+ "current": 2249.87,
276
+ "min": 0.0,
277
+ "max": 0.0
278
+ },
279
+ {
280
+ "current": 2249.87,
281
+ "min": 0.0,
282
+ "max": 0.0
283
+ },
284
+ {
285
+ "current": 2249.87,
286
+ "min": 0.0,
287
+ "max": 0.0
288
+ },
289
+ {
290
+ "current": 2249.87,
291
+ "min": 0.0,
292
+ "max": 0.0
293
+ },
294
+ {
295
+ "current": 2249.87,
296
+ "min": 0.0,
297
+ "max": 0.0
298
+ },
299
+ {
300
+ "current": 2249.87,
301
+ "min": 0.0,
302
+ "max": 0.0
303
+ },
304
+ {
305
+ "current": 2249.87,
306
+ "min": 0.0,
307
+ "max": 0.0
308
+ },
309
+ {
310
+ "current": 2249.87,
311
+ "min": 0.0,
312
+ "max": 0.0
313
+ },
314
+ {
315
+ "current": 2249.87,
316
+ "min": 0.0,
317
+ "max": 0.0
318
+ },
319
+ {
320
+ "current": 2249.87,
321
+ "min": 0.0,
322
+ "max": 0.0
323
+ },
324
+ {
325
+ "current": 2249.87,
326
+ "min": 0.0,
327
+ "max": 0.0
328
+ },
329
+ {
330
+ "current": 2249.87,
331
+ "min": 0.0,
332
+ "max": 0.0
333
+ },
334
+ {
335
+ "current": 2249.87,
336
+ "min": 0.0,
337
+ "max": 0.0
338
+ },
339
+ {
340
+ "current": 2249.87,
341
+ "min": 0.0,
342
+ "max": 0.0
343
+ },
344
+ {
345
+ "current": 2249.87,
346
+ "min": 0.0,
347
+ "max": 0.0
348
+ },
349
+ {
350
+ "current": 2249.87,
351
+ "min": 0.0,
352
+ "max": 0.0
353
+ },
354
+ {
355
+ "current": 2249.87,
356
+ "min": 0.0,
357
+ "max": 0.0
358
+ },
359
+ {
360
+ "current": 2249.87,
361
+ "min": 0.0,
362
+ "max": 0.0
363
+ },
364
+ {
365
+ "current": 2249.87,
366
+ "min": 0.0,
367
+ "max": 0.0
368
+ },
369
+ {
370
+ "current": 2249.87,
371
+ "min": 0.0,
372
+ "max": 0.0
373
+ },
374
+ {
375
+ "current": 2249.87,
376
+ "min": 0.0,
377
+ "max": 0.0
378
+ },
379
+ {
380
+ "current": 2249.87,
381
+ "min": 0.0,
382
+ "max": 0.0
383
+ },
384
+ {
385
+ "current": 2249.87,
386
+ "min": 0.0,
387
+ "max": 0.0
388
+ },
389
+ {
390
+ "current": 2249.87,
391
+ "min": 0.0,
392
+ "max": 0.0
393
+ },
394
+ {
395
+ "current": 2249.87,
396
+ "min": 0.0,
397
+ "max": 0.0
398
+ },
399
+ {
400
+ "current": 2249.87,
401
+ "min": 0.0,
402
+ "max": 0.0
403
+ },
404
+ {
405
+ "current": 2249.87,
406
+ "min": 0.0,
407
+ "max": 0.0
408
+ },
409
+ {
410
+ "current": 2249.87,
411
+ "min": 0.0,
412
+ "max": 0.0
413
+ },
414
+ {
415
+ "current": 2249.87,
416
+ "min": 0.0,
417
+ "max": 0.0
418
+ },
419
+ {
420
+ "current": 2249.87,
421
+ "min": 0.0,
422
+ "max": 0.0
423
+ },
424
+ {
425
+ "current": 2249.87,
426
+ "min": 0.0,
427
+ "max": 0.0
428
+ },
429
+ {
430
+ "current": 2249.87,
431
+ "min": 0.0,
432
+ "max": 0.0
433
+ },
434
+ {
435
+ "current": 2249.87,
436
+ "min": 0.0,
437
+ "max": 0.0
438
+ },
439
+ {
440
+ "current": 2249.87,
441
+ "min": 0.0,
442
+ "max": 0.0
443
+ },
444
+ {
445
+ "current": 2249.87,
446
+ "min": 0.0,
447
+ "max": 0.0
448
+ },
449
+ {
450
+ "current": 2249.87,
451
+ "min": 0.0,
452
+ "max": 0.0
453
+ },
454
+ {
455
+ "current": 2249.87,
456
+ "min": 0.0,
457
+ "max": 0.0
458
+ },
459
+ {
460
+ "current": 2249.87,
461
+ "min": 0.0,
462
+ "max": 0.0
463
+ },
464
+ {
465
+ "current": 2249.87,
466
+ "min": 0.0,
467
+ "max": 0.0
468
+ },
469
+ {
470
+ "current": 2249.87,
471
+ "min": 0.0,
472
+ "max": 0.0
473
+ },
474
+ {
475
+ "current": 2249.87,
476
+ "min": 0.0,
477
+ "max": 0.0
478
+ },
479
+ {
480
+ "current": 2249.87,
481
+ "min": 0.0,
482
+ "max": 0.0
483
+ },
484
+ {
485
+ "current": 2249.87,
486
+ "min": 0.0,
487
+ "max": 0.0
488
+ },
489
+ {
490
+ "current": 2249.87,
491
+ "min": 0.0,
492
+ "max": 0.0
493
+ },
494
+ {
495
+ "current": 2249.87,
496
+ "min": 0.0,
497
+ "max": 0.0
498
+ },
499
+ {
500
+ "current": 2249.87,
501
+ "min": 0.0,
502
+ "max": 0.0
503
+ },
504
+ {
505
+ "current": 2249.87,
506
+ "min": 0.0,
507
+ "max": 0.0
508
+ },
509
+ {
510
+ "current": 2249.87,
511
+ "min": 0.0,
512
+ "max": 0.0
513
+ },
514
+ {
515
+ "current": 2249.87,
516
+ "min": 0.0,
517
+ "max": 0.0
518
+ },
519
+ {
520
+ "current": 2249.87,
521
+ "min": 0.0,
522
+ "max": 0.0
523
+ },
524
+ {
525
+ "current": 2249.87,
526
+ "min": 0.0,
527
+ "max": 0.0
528
+ }
529
+ ],
530
+ "disk": {
531
+ "/": {
532
+ "total": 200.0,
533
+ "used": 1.7721595764160156
534
+ }
535
+ },
536
+ "gpu": "NVIDIA A100 80GB PCIe",
537
+ "gpu_count": 1,
538
+ "gpu_devices": [
539
+ {
540
+ "name": "NVIDIA A100 80GB PCIe",
541
+ "memory_total": 85899345920
542
+ }
543
+ ],
544
+ "memory": {
545
+ "total": 668.8548545837402
546
+ }
547
+ }
checkpoints/wandb/run-20240830_195924-mao0tqjy/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+{"train/loss": 1.777546563744545, "train/grad_l2": 0.1973351389169693, "train/weights_l2": 11272.363778775605, "train/lr": 0.0020558542377918645, "train/seconds_per_step": 4.877207107543946, "_timestamp": 1725124200.3571296, "_runtime": 76236.1699206829, "_step": 15525}
checkpoints/wandb/run-20240830_195924-mao0tqjy/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/wandb/run-20240830_195924-mao0tqjy/logs/debug.log ADDED
@@ -0,0 +1,27 @@
+2024-08-30 19:59:24,178 INFO MainThread:29052 [wandb_setup.py:_flush():77] Current SDK version is 0.17.8
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Configure stats pid to 29052
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Loading settings from /workspace/nanoT5/outputs/2024-08-30/19-59-22/wandb/settings
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Applying setup settings: {'_disable_service': False}
+2024-08-30 19:59:24,179 WARNING MainThread:29052 [wandb_setup.py:_flush():77] Could not find program at -m nanoT5.main
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m nanoT5.main'}
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_setup.py:_flush():77] Applying login settings: {}
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:_log_setup():524] Logging user logs to /workspace/nanoT5/outputs/2024-08-30/19-59-22/wandb/run-20240830_195924-mao0tqjy/logs/debug.log
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:_log_setup():525] Logging internal logs to /workspace/nanoT5/outputs/2024-08-30/19-59-22/wandb/run-20240830_195924-mao0tqjy/logs/debug-internal.log
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:init():607] calling init triggers
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:init():614] wandb.init called with sweep_config: {}
+config: {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 34534, 'model': {'klass': 'hf_t5', 'name': 'pszemraj/tFINE-900m-e16-d32', 'overwrite': {'dropout_rate': 0.0}, 'checkpoint_path': '', 'random_init': False, 'compile': True}, 'tokenizer': {'name': 'BEE-spoke-data/slimpajama_tok-48128-BPE-forT5'}, 'data': {'input_length': 1024, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 16}, 'optim': {'name': 'adamwscale', 'base_lr': 0.01, 'batch_size': 128, 'total_steps': 20000, 'epochs': -1, 'warmup_steps': 5000, 'lr_scheduler': 'cosine', 'weight_decay': 0.0001, 'grad_clip': 1.0, 'grad_acc': 8, 'final_cosine': 2e-05}, 'eval': {'every_steps': 1000000000, 'steps': 500}, 'checkpoint': {'every_steps': 2500}, 'logging': {'use_wandb': True, 'wandb_config': {'project': 'nanoT5', 'entity': 'pszemraj', 'tags': ['900m', '1024'], 'mode': 'online'}, 'every_steps': 25, 'grad_l2': True, 'weights_l2': True}, 'slurm_id': 'none', 'working_dir': '/workspace/nanoT5/outputs/2024-08-30/19-59-22'}
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:init():657] starting backend
+2024-08-30 19:59:24,179 INFO MainThread:29052 [wandb_init.py:init():661] setting up manager
+2024-08-30 19:59:24,185 INFO MainThread:29052 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-08-30 19:59:24,187 INFO MainThread:29052 [wandb_init.py:init():669] backend started and connected
+2024-08-30 19:59:24,192 INFO MainThread:29052 [wandb_init.py:init():767] updated telemetry
+2024-08-30 19:59:24,198 INFO MainThread:29052 [wandb_init.py:init():800] communicating run to backend with 90.0 second timeout
+2024-08-30 19:59:24,583 INFO MainThread:29052 [wandb_init.py:init():851] starting run threads in backend
+2024-08-30 19:59:24,814 INFO MainThread:29052 [wandb_run.py:_console_start():2463] atexit reg
+2024-08-30 19:59:24,814 INFO MainThread:29052 [wandb_run.py:_redirect():2309] redirect: wrap_raw
+2024-08-30 19:59:24,814 INFO MainThread:29052 [wandb_run.py:_redirect():2374] Wrapping output streams.
+2024-08-30 19:59:24,815 INFO MainThread:29052 [wandb_run.py:_redirect():2399] Redirects installed.
+2024-08-30 19:59:24,818 INFO MainThread:29052 [wandb_init.py:init():894] run started, returning control to user process
+2024-08-30 19:59:44,796 INFO MainThread:29052 [wandb_run.py:_config_callback():1392] config_cb None None {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 34534, 'model': {'klass': 'hf_t5', 'name': 'pszemraj/tFINE-900m-e16-d32', 'overwrite': {'dropout_rate': 0.0}, 'checkpoint_path': '', 'random_init': False, 'compile': True}, 'tokenizer': {'name': 'BEE-spoke-data/slimpajama_tok-48128-BPE-forT5'}, 'data': {'input_length': 1024, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 16, 'before_mask_input_length': 1137, 'target_length': 229}, 'optim': {'name': 'adamwscale', 'base_lr': 0.01, 'batch_size': 128, 'total_steps': 20000, 'epochs': -1, 'warmup_steps': 5000, 'lr_scheduler': 'cosine', 'weight_decay': 0.0001, 'grad_clip': 1.0, 'grad_acc': 8, 'final_cosine': 2e-05}, 'eval': {'every_steps': 1000000000, 'steps': 500, 'corrected_steps': 500}, 'checkpoint': {'every_steps': 2500}, 'logging': {'use_wandb': True, 'wandb_config': {'project': 'nanoT5', 'entity': 'pszemraj', 'tags': ['900m', '1024'], 'mode': 'online'}, 'every_steps': 25, 'grad_l2': True, 'weights_l2': True}, 'slurm_id': 'none', 'working_dir': '/workspace/nanoT5/outputs/2024-08-30/19-59-22', 'n_all_param': 887492096}
checkpoints/wandb/run-20240830_195924-mao0tqjy/run-mao0tqjy.wandb ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:34c4d82ee5fa3daa21587d65efb7972e3a2447cce764ad1cd0eaec8aa61ffb19
+size 9030581