Edit model card

Pythia-2.8B Deduped 4K is a Pythia-2.8B Deduped model fine-tuned with a 4096-token context length. Training resumed from the base model's 143,000-step checkpoint and continued on The Pile v1 Deduped (threshold=0.87). This particular model is from a checkpoint captured at step 175,500, for an extra 134,217,728,000 tokens of training.

Note: Sequence length warmup was not used to move up from 2048 but, in hindsight, should have been applied.


  # 8 Nodes 8xA100 40GB
  "eval_batch_size": 2,

  # Parallelism settings
  "pipe-parallel-size": 1,
  "model-parallel-size": 1,

  # Model architecture. Sequence length and the position-embedding limit are
  # both 4096, matching the 4096 context length this fine-tune targets.
  "num-layers": 32,
  "hidden-size": 2560,
  "num-attention-heads": 32,
  "seq-length": 4096,
  "max-position-embeddings": 4096,

  # Normalization, positional embedding, and residual/output-layer options
  "norm": "layernorm",
  "pos-emb": "rotary",
  "rotary-pct": 0.25,
  "no-weight-tying": true,
  "gpt-j-residual": true,
  "output-layer-parallelism": "column",

  # Weight initialization, attention implementation, and fused-kernel switches
  "init_method": "small_init",
  "output_layer_init_method": "wang_init",
  # NOTE(review): presumably selects flash attention for all 32 layers
  # (the count matches "num-layers") - confirm against the GPT-NeoX docs.
  "attention-config": [[["flash"], 32]],
  "scaled-upper-triang-masked-softmax-fusion": true,
  "bias-gelu-fusion": true,

  # Adam optimizer settings. The closing braces for "params" and "optimizer"
  # are required so that the keys that follow stay at the top level.
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 1.6e-5,
      "betas": [0.9, 0.95],
      "eps": 1.0e-08
    }
  },
  "min_lr": 8.0e-06,

  # DeepSpeed ZeRO settings (stage 1). These keys belong under the
  # "zero_optimization" mapping; they are not valid as bare top-level entries.
  "zero_optimization": {
      "stage": 1,
      "allgather_partitions": true,
      "allgather_bucket_size": 500000000,
      "overlap_comm": true,
      "reduce_scatter": true,
      "reduce_bucket_size": 500000000,
      "contiguous_gradients": true,
      "cpu_offload": false,
  },
  # Batch sizing and dataset access
  "train_micro_batch_size_per_gpu": 4,
  "gradient-accumulation-steps": 4,
  "data-impl": "mmap",

  # Activation checkpointing
  "checkpoint-activations": true,
  "checkpoint-num-layers": 1,
  "partition-activations": true,
  "synchronize-each-layer": true,

  # Regularization: gradient clipping and weight decay are set; both dropouts are 0
  "gradient_clipping": 1.0,
  "weight-decay": 0.1,
  "hidden-dropout": 0,
  "attention-dropout": 0,

  # Mixed-precision (fp16) settings. Per the DeepSpeed config reference, a
  # "loss_scale" of 0 selects dynamic loss scaling, which the window, initial
  # power, hysteresis, and minimum-scale keys below then tune.
  "fp16": {
      "fp16": true,
      "enabled": true,
      "loss_scale": 0,
      "loss_scale_window": 1000,
      "initial_scale_power": 12,
      "hysteresis": 2,
      "min_loss_scale": 1,
  },

  # Schedule: "lr-decay-iters" equals "train-iters", with a cosine decay style.
  "train-iters": 318000, 
  "lr-decay-iters": 318000,
  "distributed-backend": "nccl",
  "lr-decay-style": "cosine",
  "warmup": 0.01,
  # Checkpointing and evaluation cadence
  "checkpoint-factor": 500,
  "eval-interval": 50000,
  "eval-iters": 10,
  "extra-save-iters": [0, 512, 152001],

  # Data: train, validation, and test all point at the same document path.
  "train-data-paths": ["pile_0.87_deduped_text_document"],
  "valid-data-paths": ["pile_0.87_deduped_text_document"],
  "test-data-paths": ["pile_0.87_deduped_text_document"],

  # Tokenizer
  "tokenizer_type": "HFTokenizer",
  "vocab-file": "20B_tokenizer.json",

  # Logging
  "log-interval": 10,
  "steps_per_print": 10,
  "wall_clock_breakdown": true,
  "log-grad-norm": true,

  # Job launcher
  "launcher": "slurm",
  "deepspeed_slurm": true,


This work would not have been possible without the support of Stability AI.

Downloads last month
Inference API
Model is too large to load in Inference API (serverless). To try the model, launch it on Inference Endpoints (dedicated) instead.

Dataset used to train CarperAI/pythia-2.8b-deduped-4k