import pandas as pd
import streamlit as st
from datasets import Dataset
from transformers import (
    MT5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
)
# Load the pretrained mT5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("google/mt5-base")
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-base")
# Load the proverbs dataset (expects 'Proverb' and 'Meaning' columns)
df = pd.read_csv("proverbs.csv")
st.dataframe(df)  # preview the data in the app
dataset = Dataset.from_pandas(df)
def preprocess_function(examples):
    inputs = examples["Proverb"]
    targets = examples["Meaning"]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    # Tokenize the targets; text_target replaces the deprecated as_target_tokenizer() context
    labels = tokenizer(text_target=targets, max_length=128, truncation=True, padding="max_length")
    # Mask padding in the labels with -100 so it is ignored by the loss
    model_inputs["labels"] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Hold out 20% of the data for evaluation
dataset_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = dataset_split["train"]
test_dataset = dataset_split["test"]
print(f"Training dataset size: {len(train_dataset)}")
print(f"Testing dataset size: {len(test_dataset)}")
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=500,
)
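# Note: for a sequence-to-sequence task like this, Seq2SeqTrainingArguments and
# Seq2SeqTrainer (with predict_with_generate=True) are an alternative that enables
# generation-based evaluation metrics; the plain Trainer below optimizes and
# reports only the cross-entropy loss.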
# Initialize Trainer with the held-out split for evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine-tuned-mt5-marathi-proverbs")
tokenizer.save_pretrained("./fine-tuned-mt5-marathi-proverbs")
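
# A minimal inference sketch (an assumption: the original script did not include
# this part). It lets a user enter a proverb in the Streamlit UI and shows the
# generated meaning; the widget label and generation parameters are illustrative
# choices, not taken from the original app.
proverb = st.text_input("Enter a Marathi proverb")
if proverb:
    input_ids = tokenizer(proverb, return_tensors="pt").input_ids
    output_ids = model.generate(input_ids, max_new_tokens=128, num_beams=4)
    st.write(tokenizer.decode(output_ids[0], skip_special_tokens=True))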