{ "architectures": [ "LLaMAForHF" ], "batch_size": 64, "clip_grad_norm": 0.5, "context_window": 64, "dim": 192, "dropout": 0.1, "epochs": 5, "learning_rate": 0.0001, "max_lr": 0.0003, "max_seq_len": 128, "model_type": "llama", "num_heads": 8, "num_layers": 4, "test_split": 0.1, "torch_dtype": "float32", "train_split": 0.8, "transformers_version": "4.41.2", "val_split": 0.1, "vocab_size": 2000, "warmup_steps": 1000, "weight_decay": 0.1 }