from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Load the dataset; load_dataset puts a single JSON-lines file under the
# "train" split, so hold out 10% of it for evaluation
dataset = load_dataset("json", data_files="dataset.jsonl")
dataset = dataset["train"].train_test_split(test_size=0.1)

# Load the model and tokenizer
model_name = "Salesforce/codegen-2B-multi"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# CodeGen's tokenizer has no pad token; reuse EOS so batches can be padded
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset. CodeGen is a causal (decoder-only) LM, so prompt and
# completion are concatenated into one sequence; text_target is meant for
# seq2seq models and would produce labels of the wrong length here.
def tokenize_function(examples):
    texts = [
        inp + out + tokenizer.eos_token
        for inp, out in zip(examples["input"], examples["output"])
    ]
    return tokenizer(texts, truncation=True, max_length=1024)

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,  # drop the raw string columns
)

# Pads each batch and copies input_ids into labels (mlm=False = causal LM loss)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_strategy="epoch",
    logging_dir="./logs",
)

# Train the model, evaluating on the held-out split instead of the training data
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
)
trainer.train()

trainer.save_model("./fine_tuned_codegen")
tokenizer.save_pretrained("./fine_tuned_codegen")
print("Training complete. Model saved.")
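
# --- Inference sketch (not part of the original script) ---------------------
# A minimal example of reloading the saved model and generating a completion.
# Assumption: the prompt below is a hypothetical instance of the dataset's
# "input" format; adjust it to match your actual training data.
from transformers import pipeline

generator = pipeline(
    "text-generation",
    model="./fine_tuned_codegen",
    tokenizer="./fine_tuned_codegen",
)

prompt = "# Write a function that reverses a string\ndef reverse_string(s):"
print(generator(prompt, max_new_tokens=64)[0]["generated_text"])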