from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import Dataset, DatasetDict
import pandas as pd
# Load the dataset
file_path = "hindi_dataset.tsv"  # Update with your actual file path
data = pd.read_csv(file_path, delimiter="\t")
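# The script assumes the TSV has two columns named "english" and "hindi"
# (those names are read in tokenize_function below). A minimal sketch of the
# expected layout, with made-up example rows:
#
#   english	hindi
#   How are you?	आप कैसे हैं?
#   Good morning	सुप्रभात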
# Convert the dataset to a Hugging Face Dataset
hf_dataset = Dataset.from_pandas(data)

# Split the dataset into train and test subsets
split_dataset = hf_dataset.train_test_split(test_size=0.2)

# Create a DatasetDict with train and test splits
dataset = DatasetDict({
    "train": split_dataset["train"],
    "test": split_dataset["test"]
})
# Load the tokenizer and model
model_name = "Helsinki-NLP/opus-mt-en-hi"  # Pre-trained English-to-Hindi model
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
# Tokenize source and target text. Padding is left to the data collator:
# padding to max_length here would bake pad tokens into the labels, and those
# positions would then count toward the loss instead of being masked.
def tokenize_function(examples):
    # text_target tokenizes the Hindi side as labels (transformers >= 4.22;
    # on older versions use the deprecated tokenizer.as_target_tokenizer()).
    model_inputs = tokenizer(
        examples["english"],
        text_target=examples["hindi"],
        truncation=True,
        max_length=128,
    )
    return model_inputs
# Apply tokenization to the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)
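# Optional sanity check: each tokenized example should now carry input_ids,
# attention_mask, and labels alongside the raw text columns.
print(tokenized_datasets["train"][0].keys())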
# Define the training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # renamed to eval_strategy in newer transformers releases
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=3,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500
)
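# predict_with_generate=True only pays off if a generation-based metric is
# computed at eval time. A minimal sacreBLEU sketch, assuming the `evaluate`
# and `sacrebleu` packages are installed (not part of the original script);
# wire it in by passing compute_metrics=compute_metrics to the trainer below.
import numpy as np
import evaluate

bleu = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # The collator masks label padding with -100; swap it back before decoding.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = bleu.compute(predictions=decoded_preds,
                          references=[[label] for label in decoded_labels])
    return {"bleu": result["score"]}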
# Use DataCollatorForSeq2Seq for dynamic padding; it also pads labels with
# -100 so that padding positions are ignored by the loss
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
# Define the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator
)
# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)
# Test the model with sample inputs
def translate_text(text):
    # Move inputs onto the same device as the model; after GPU training the
    # model lives on CUDA while freshly created tensors default to CPU.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(model.device)
    translated = model.generate(**inputs)
    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
# Test translation
sample_text = "How are you?"
hindi_translation = translate_text(sample_text)
print(f"English: {sample_text}")
print(f"Hindi: {hindi_translation[0]}")