"""Fine-tune Mistral-7B-v0.1 on a Turkish legal chatbot dataset and push the
result to the Hugging Face Hub.

Pipeline: load model + tokenizer -> tokenize dataset -> causal-LM fine-tune
with Trainer -> push model and tokenizer to the Hub.
"""

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)


def main():
    """Run the full fine-tuning pipeline (network- and GPU-heavy)."""
    model_name = "mistralai/Mistral-7B-v0.1"
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # BUG FIX: Mistral's tokenizer defines no pad token, so
    # padding="max_length" below would raise a ValueError. Reusing EOS as
    # the pad token is the standard workaround for causal LMs.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load Turkish legal dataset
    dataset = load_dataset("Renicames/turkish-law-chatbot")

    def preprocess_function(examples):
        # NOTE(review): assumes the dataset exposes a "text" column — verify
        # against the dataset card; some chat datasets use question/answer
        # columns that must be joined first.
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    tokenized_dataset = dataset.map(preprocess_function, batched=True)

    # BUG FIX: the tokenized examples carry no "labels", so Trainer would
    # fail with "model did not return a loss". This collator clones
    # input_ids into labels (mlm=False => causal-LM objective).
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    # BUG FIX: evaluation_strategy="epoch" with no eval_dataset makes
    # Trainer.__init__ raise — only evaluate when a validation split exists.
    has_eval = "validation" in tokenized_dataset

    # Training configuration
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch" if has_eval else "no",
        per_device_train_batch_size=2,
        learning_rate=2e-5,
        num_train_epochs=3,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"] if has_eval else None,
        data_collator=data_collator,
    )
    trainer.train()

    model.push_to_hub("akinuman/turkish-legal-mistral-v2")
    tokenizer.push_to_hub("akinuman/turkish-legal-mistral-v2")


if __name__ == "__main__":
    main()