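"""Fine-tune the Helsinki-NLP/opus-mt-en-hi MarianMT model on a custom
English-to-Hindi parallel corpus stored as a tab-separated file."""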
from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import Dataset, DatasetDict
import pandas as pd
# Load the dataset; the TSV is expected to have 'english' and 'hindi' columns
file_path = "hindi_dataset.tsv"  # Update with your actual file path
data = pd.read_csv(file_path, delimiter="\t")
# Convert the dataset to Hugging Face Dataset
hf_dataset = Dataset.from_pandas(data)
# Split the dataset into train and test subsets (fixed seed for reproducibility)
split_dataset = hf_dataset.train_test_split(test_size=0.2, seed=42)
# Create a DatasetDict with train and test splits
dataset = DatasetDict({
    "train": split_dataset["train"],
    "test": split_dataset["test"],
})
# Load the tokenizer and model
model_name = "Helsinki-NLP/opus-mt-en-hi" # Pre-trained English-to-Hindi model
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
# Tokenize source and target text in one pass; `text_target` replaces the
# deprecated `as_target_tokenizer` context manager. Padding is left to the
# data collator so that label padding uses -100 (ignored by the loss)
# rather than the pad token id.
def tokenize_function(examples):
    model_inputs = tokenizer(
        examples['english'],
        text_target=examples['hindi'],
        truncation=True,
        max_length=128,
    )
    return model_inputs
# Apply tokenization to the dataset (the raw text columns are no longer needed)
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['english', 'hindi'])
# Define the training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # renamed to `eval_strategy` in newer transformers releases
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=3,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
)
# Use DataCollatorForSeq2Seq for dynamic padding (labels are padded with -100)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
# Define the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)
# Train the model
trainer.train()
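# Persist the fine-tuned weights and tokenizer (the output path is illustrative)
trainer.save_model("./finetuned-opus-mt-en-hi")
tokenizer.save_pretrained("./finetuned-opus-mt-en-hi")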
# Evaluate the model
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)
# Test the model with sample inputs; tensors are moved to the model's device
# so generation also works after training on a GPU
def translate_text(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    translated = model.generate(**inputs)
    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
# Test translation
sample_text = "How are you?"
hindi_translation = translate_text(sample_text)
print(f"English: {sample_text}")
print(f"Hindi: {hindi_translation[0]}")