from transformers import Trainer, TrainingArguments, T5ForConditionalGeneration, T5Tokenizer
import json
from pathlib import Path

# Load the T5 model and tokenizer from a local directory
model_path = "t5-small-model"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./output1",         # Output directory for model checkpoints
    save_steps=100,
    per_device_train_batch_size=4,  # Adjust the batch size to fit your GPU memory
    save_total_limit=2,             # Keep at most two checkpoints on disk
    num_train_epochs=3,             # Number of training epochs
    logging_dir="./logs",           # Directory for TensorBoard logs
)

# Read one JSON file and turn each record into tokenized model inputs.
# Each file is expected to contain a list of objects with "input" and "target" fields.
def format_dataset(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()
    print(f"File content:\n{content}\n")

    try:
        data_list = json.loads(content)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON in {file_path}: {e}")
        return None

    formatted_examples = []
    for data in data_list:
        input_texts = data.get("input")
        targets = data.get("target")

        # Convert to lists if not already
        if not isinstance(input_texts, list):
            input_texts = [input_texts]
        if not isinstance(targets, list):
            targets = [targets]

        # Concatenate the texts in each list into a single string
        input_text_concatenated = " ".join(input_texts)
        target_text_concatenated = " ".join(targets)

        # Encode the concatenated texts with padding and truncation
        inputs = tokenizer(
            input_text_concatenated,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=512,
        )
        labels = tokenizer(
            target_text_concatenated,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=512,
        )

        # Use the target token ids as labels, masking padding positions with -100
        # so they are ignored by the loss
        label_ids = labels["input_ids"]
        label_ids[label_ids == tokenizer.pad_token_id] = -100
        inputs["labels"] = label_ids

        formatted_examples.append(inputs)

    return formatted_examples


# Process each JSON file, skipping files that failed to parse
data_files = Path("./files/").rglob("*.json")
formatted_examples = []
for file_path in data_files:
    examples = format_dataset(file_path)
    if examples is not None:
        formatted_examples.extend(examples)

# Create the final dataset: one dict of 1-D tensors per example
train_dataset = [
    {
        "input_ids": example["input_ids"][0],
        "attention_mask": example["attention_mask"][0],
        "labels": example["labels"][0],
    }
    for example in formatted_examples
]

# Instantiate the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,  # Pass the tokenizer so it is saved alongside checkpoints
    # ... other Trainer configurations ...
)

print(f"Number of examples in the training dataset: {len(train_dataset)}")

# Print the model configuration
print("Model Configuration:")
print(model.config)

# Run training
trainer.train()

# Save the model and tokenizer after training
model.save_pretrained("./output/fine-tuned-model")
tokenizer.save_pretrained("./output/fine-tuned-model")
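
# Optional sanity check after training (a minimal sketch, not part of the training
# run itself): reload the fine-tuned weights and generate a prediction for one input.
# The sample text below is a hypothetical placeholder; replace it with something
# that matches the "input" format of your JSON files.
fine_tuned_model = T5ForConditionalGeneration.from_pretrained("./output/fine-tuned-model")
fine_tuned_tokenizer = T5Tokenizer.from_pretrained("./output/fine-tuned-model")
sample = fine_tuned_tokenizer("example input text", return_tensors="pt")
generated_ids = fine_tuned_model.generate(**sample, max_length=64)
print(fine_tuned_tokenizer.decode(generated_ids[0], skip_special_tokens=True))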