from pathlib import Path
import json

from transformers import Trainer, TrainingArguments, T5ForConditionalGeneration, T5Tokenizer
# Load the T5 model and tokenizer from a local directory
model_path = "t5-small-model"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)
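# Note: "t5-small-model" is assumed to be a local directory holding a saved
# checkpoint; from_pretrained also accepts a Hub model id (e.g. "t5-small").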
# Define the training arguments
training_args = TrainingArguments(
    output_dir="./output1",          # Directory for model checkpoints and predictions
    save_steps=100,
    per_device_train_batch_size=4,   # Adjust the batch size to fit your GPU memory
    save_total_limit=2,              # Keep at most two checkpoints on disk
    num_train_epochs=3,
    logging_dir="./logs",            # Directory for TensorBoard logs
)
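# The loader below assumes each *.json file under ./files/ holds a list of
# records with "input" and "target" fields, where each value is a string or
# a list of strings -- a sketch of one such file, inferred from the parsing
# code:
#
#   [
#     {"input": "translate English to German: Hello", "target": "Hallo"},
#     {"input": ["summarize:", "Some longer passage ..."], "target": "A short summary"}
#   ]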
# Read one JSON file and turn each record into tokenized model inputs.
def format_dataset(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    print(f"File content:\n{content}\n")  # Debug: show the raw file contents
    try:
        data_list = json.loads(content)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON in {file_path}: {e}")
        return None
    formatted_examples = []
    for data in data_list:
        input_texts = data.get("input")
        targets = data.get("target")
        # Convert to lists if not already
        if not isinstance(input_texts, list):
            input_texts = [input_texts]
        if not isinstance(targets, list):
            targets = [targets]
        # Concatenate the texts in each list
        input_text_concatenated = " ".join(input_texts)
        target_text_concatenated = " ".join(targets)
        # Encode the concatenated texts with padding and truncation
        inputs = tokenizer(
            input_text_concatenated,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=512,
        )
        labels = tokenizer(
            target_text_concatenated,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=512,
        )
        # Mask padding positions in the labels with -100 so they are
        # ignored by the cross-entropy loss during training.
        label_ids = labels["input_ids"]
        label_ids[label_ids == tokenizer.pad_token_id] = -100
        inputs["labels"] = label_ids
        formatted_examples.append(inputs)
    return formatted_examples
# Process each JSON file and collect the tokenized examples
# (call format_dataset once per file, not twice as before)
data_files = Path("./files/").rglob("*.json")
formatted_per_file = [format_dataset(file_path) for file_path in data_files]
# Flatten the per-file lists, skipping files that failed to parse
formatted_examples = [
    example
    for sublist in formatted_per_file
    if sublist is not None
    for example in sublist
]
# Build the final dataset: a plain list of dicts works as a map-style
# dataset for Trainer, since it only needs len() and indexing.
train_dataset = [
    {
        "input_ids": example["input_ids"][0],
        "attention_mask": example["attention_mask"][0],
        "labels": example["labels"][0],
    }
    for example in formatted_examples
]
# Instantiate the Trainer; passing the tokenizer lets it be saved
# alongside the model checkpoints
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    # ... other Trainer configurations ...
)
print(f"Number of examples in the training dataset: {len(dataset['train'])}") | |
# Print the model configuration
print("Model Configuration:")
print(model.config)
# Run the training loop
trainer.train()
# Save the fine-tuned model and tokenizer after training
model.save_pretrained("./output/fine-tuned-model")
tokenizer.save_pretrained("./output/fine-tuned-model")
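# A quick sanity check (a minimal sketch, assuming the output path above):
# reload the fine-tuned model and generate from a sample prompt. The prompt
# text here is hypothetical -- substitute an input matching your data.
tuned_tokenizer = T5Tokenizer.from_pretrained("./output/fine-tuned-model")
tuned_model = T5ForConditionalGeneration.from_pretrained("./output/fine-tuned-model")
sample = tuned_tokenizer("your task prefix: some input text", return_tensors="pt")
generated = tuned_model.generate(**sample, max_new_tokens=64)
print(tuned_tokenizer.decode(generated[0], skip_special_tokens=True))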