{ "architectures": [ "LLaMAForHF" ], "batch_size": 64, "clip_grad_norm": 0.5, "context_window": 64, "dim": 192, "dropout": 0.1, "epochs": 5, "learning_rate": 0.0001, "max_lr": 0.0003, "max_seq_len": 128, "model_type": "llama", "num_heads": 8, "num_layers": 4, "test_split": 0.1, "torch_dtype": "float32", "train_split": 0.8, "transformers_version": "4.41.2", "val_split": 0.1, "vocab_size": 2000, "warmup_steps": 1000, "weight_decay": 0.1 }