# t5-medsum / fine_tune_model.py
from transformers import Trainer, TrainingArguments, T5ForConditionalGeneration, T5Tokenizer
from datasets import load_dataset
import json
from pathlib import Path
# Load your dataset
dataset = load_dataset("./files/")
# Assuming your dataset has a 'train' split
# (note: train_dataset is redefined below from the manually formatted JSON
# examples, which is what the Trainer actually trains on)
train_dataset = dataset["train"]
# Load the T5 model and tokenizer from a local directory
model_path = "t5-small-model"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)
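# Note: "t5-small-model" is assumed to be a local directory that already contains
# the model weights and tokenizer files (for example, a downloaded copy of "t5-small").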
# Define the training arguments
training_args = TrainingArguments(
    output_dir="./output1",         # Output directory for model checkpoints and predictions
    save_steps=100,                 # Save a checkpoint every 100 optimization steps
    per_device_train_batch_size=4,  # Adjust the batch size based on your GPU memory
    save_total_limit=2,             # Limit the total number of checkpoints to keep
    num_train_epochs=3,             # Number of training epochs
    logging_dir="./logs",           # Directory for TensorBoard logs
)
# Read one JSON file and turn each record into a tokenized training example
def format_dataset(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    print(f"File content:\n{content}\n")

    try:
        data_list = json.loads(content)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        return None

    formatted_examples = []
    for data in data_list:
        input_texts = data.get("input")
        targets = data.get("target")

        # Convert to lists if not already
        if not isinstance(input_texts, list):
            input_texts = [input_texts]
        if not isinstance(targets, list):
            targets = [targets]

        # Concatenate the texts in each list into a single string
        input_text_concatenated = " ".join(input_texts)
        target_text_concatenated = " ".join(targets)

        # Encode the concatenated texts with padding and truncation
        inputs = tokenizer(
            input_text_concatenated,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=512,
        )
        labels = tokenizer(
            target_text_concatenated,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=512,
        )

        # Use the target token ids as labels, replacing padding ids with -100
        # so the loss function ignores the padded positions
        label_ids = labels["input_ids"]
        label_ids[label_ids == tokenizer.pad_token_id] = -100
        inputs["labels"] = label_ids

        formatted_examples.append(inputs)
    return formatted_examples
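
# For reference, format_dataset expects each JSON file under ./files/ to contain a
# top-level list of objects with "input" and "target" fields (each a string or a
# list of strings). The example below is illustrative only; the clinical text and
# summaries are made up.
#
# [
#   {"input": "Patient presents with a dry cough and low-grade fever ...",
#    "target": "Dry cough and low-grade fever; supportive care advised."},
#   {"input": ["History of hypertension.", "Blood pressure 150/95 at today's visit."],
#    "target": "Hypertensive; BP 150/95 at visit."}
# ]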
# Process each JSON file, calling format_dataset once per file
data_files = Path("./files/").rglob("*.json")
formatted_per_file = [format_dataset(file_path) for file_path in data_files]
# Flatten the per-file lists into a single list of examples, skipping files that failed to parse
formatted_examples = [example for sublist in formatted_per_file if sublist is not None for example in sublist]
# Create the final dataset: drop the batch dimension added by return_tensors="pt"
train_dataset = [
    {
        "input_ids": example["input_ids"][0],
        "attention_mask": example["attention_mask"][0],
        "labels": example["labels"][0],
    }
    for example in formatted_examples
]
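# A plain Python list works as the Trainer's train_dataset because it supports
# len() and integer indexing, and every example is already padded to a fixed
# length of 512, so the collator only needs to stack the tensors into batches.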
# Instantiate the Trainer (the tokenizer is passed so it is saved alongside checkpoints)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # the manually formatted examples built above
    tokenizer=tokenizer,          # pass the tokenizer to the Trainer
    # ... other Trainer configurations ...
)
print(f"Number of examples in the training dataset: {len(dataset['train'])}")
# Print model configuration
print("Model Configuration:")
print(model.config)
# Run training
trainer.train()
# Save the model after training
model.save_pretrained("./output/fine-tuned-model")
tokenizer.save_pretrained("./output/fine-tuned-model")
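
# Optional sanity check (illustrative sketch, not part of the original training flow):
# reload the fine-tuned weights and generate a summary for one made-up input to
# confirm the saved model and tokenizer load correctly.
finetuned_model = T5ForConditionalGeneration.from_pretrained("./output/fine-tuned-model")
finetuned_tokenizer = T5Tokenizer.from_pretrained("./output/fine-tuned-model")

sample_text = "Patient reports a persistent dry cough and mild fever for three days."
sample_ids = finetuned_tokenizer(
    sample_text, return_tensors="pt", truncation=True, max_length=512
).input_ids
summary_ids = finetuned_model.generate(sample_ids, max_length=64, num_beams=4)
print(finetuned_tokenizer.decode(summary_ids[0], skip_special_tokens=True))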