{
  "architectures": [
    "LLaMAForHF"
  ],
  "batch_size": 64,
  "clip_grad_norm": 0.5,
  "context_window": 64,
  "dim": 192,
  "dropout": 0.1,
  "epochs": 5,
  "learning_rate": 0.0001,
  "max_lr": 0.0003,
  "max_seq_len": 128,
  "model_type": "llama",
  "num_heads": 8,
  "num_layers": 4,
  "test_split": 0.1,
  "torch_dtype": "float32",
  "train_split": 0.8,
  "transformers_version": "4.41.2",
  "val_split": 0.1,
  "vocab_size": 2000,
  "warmup_steps": 1000,
  "weight_decay": 0.1
}