Spaces:
Sleeping
Sleeping
from transformers import Trainer, TrainingArguments, T5ForConditionalGeneration, T5Tokenizer | |
from datasets import load_dataset | |
import base64 | |
import json | |
from pathlib import Path | |
# Load your dataset | |
dataset = load_dataset("./files/") | |
# Assuming your dataset has 'train' split | |
train_dataset = dataset["train"] | |
# Load the T5 model and tokenizer from a local directory | |
model_path = "t5-small-model" | |
tokenizer = T5Tokenizer.from_pretrained(model_path) | |
model = T5ForConditionalGeneration.from_pretrained(model_path) | |
# Define the training arguments | |
training_args = TrainingArguments( | |
output_dir="./output1", # Specify the output directory for model checkpoints and predictions | |
save_steps=100, | |
per_device_train_batch_size=4, # Adjust the batch size based on your GPU memory | |
save_total_limit=2, # Limit the total number of checkpoints to save | |
num_train_epochs=3, # Specify the number of training epochs | |
logging_dir="./logs", # Specify the directory for Tensorboard logs | |
) | |
# Define format_dataset function | |
def format_dataset(file_path): | |
with open(file_path, 'r', encoding='utf-8') as f: | |
content = f.read() | |
print(f"File content:\n{content}\n") | |
try: | |
data_list = json.loads(content) | |
except json.JSONDecodeError as e: | |
print(f"Error decoding JSON: {e}") | |
return None | |
formatted_examples = [] | |
for data in data_list: | |
input_texts = data.get("input") | |
targets = data.get("target") | |
# Convert to lists if not already | |
if not isinstance(input_texts, list): | |
input_texts = [input_texts] | |
if not isinstance(targets, list): | |
targets = [targets] | |
# Concatenate the texts in the list | |
input_text_concatenated = " ".join(input_texts) | |
target_text_concatenated = " ".join(targets) | |
# Encode concatenated texts | |
#inputs = tokenizer(input_text_concatenated, padding=True, truncation=True, return_tensors="pt", max_length=512) | |
#labels = tokenizer(target_text_concatenated, padding=True, truncation=True, return_tensors="pt", max_length=512) | |
# Encode concatenated texts with padding and truncation | |
inputs = tokenizer( | |
input_text_concatenated, | |
padding="max_length", | |
truncation=True, | |
return_tensors="pt", | |
max_length=512 | |
) | |
labels = tokenizer( | |
target_text_concatenated, | |
padding="max_length", | |
truncation=True, | |
return_tensors="pt", | |
max_length=512 | |
) | |
# Update the inputs dictionary with the labels | |
inputs["labels"] = labels["input_ids"] | |
formatted_examples.append(inputs) | |
return formatted_examples | |
# Process each example individually | |
data_files = Path("./files/").rglob("*.json") | |
formatted_examples = [format_dataset(file_path) for file_path in data_files if format_dataset(file_path) is not None] | |
# Flatten the list of examples | |
formatted_examples = [example for sublist in formatted_examples for example in sublist] | |
# Create the final dataset | |
train_dataset = [{"input_ids": example["input_ids"][0], "attention_mask": example["attention_mask"][0], "labels": example["labels"][0]} for example in formatted_examples] | |
# Instantiate the Trainer with save_tokenizer | |
trainer = Trainer( | |
model=model, | |
args=training_args, | |
train_dataset=train_dataset, #dataset["train"], | |
tokenizer=tokenizer, # Pass the tokenizer to the Trainer | |
# ... other Trainer configurations ... | |
) | |
print(f"Number of examples in the training dataset: {len(dataset['train'])}") | |
# Print model configuration | |
print("Model Configuration:") | |
print(model.config) | |
# Training loop | |
trainer.train() | |
# Save the model after training | |
model.save_pretrained("./output/fine-tuned-model") | |
tokenizer.save_pretrained("./output/fine-tuned-model") | |