from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import Dataset, DatasetDict
import pandas as pd

# Load the dataset
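# (assumes a TSV file with 'english' and 'hindi' text columns, matching the names used in tokenize_function below)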
file_path = "hindi_dataset.tsv"  # Update with your actual file path
data = pd.read_csv(file_path, delimiter="\t")

# Convert the dataset to Hugging Face Dataset
hf_dataset = Dataset.from_pandas(data)

# Split the dataset into train and test subsets
split_dataset = hf_dataset.train_test_split(test_size=0.2)

# Create a DatasetDict with train and test splits
dataset = DatasetDict({
    "train": split_dataset["train"],
    "test": split_dataset["test"]
})

# Load the tokenizer and model
model_name = "Helsinki-NLP/opus-mt-en-hi"  # Pre-trained English-to-Hindi model
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Tokenize source and target text
def tokenize_function(examples):
    model_inputs = tokenizer(examples['english'], truncation=True, padding='max_length', max_length=128)
    # Tokenize targets in target-language mode (newer tokenizer versions also accept text_target= directly)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['hindi'], truncation=True, padding='max_length', max_length=128)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Apply tokenization to the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Define the training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
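    # Note: this argument is renamed to eval_strategy in newer transformers releases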
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=3,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500
)

# Use the DataCollatorForSeq2Seq for padding
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Define the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

# Test the model with sample inputs
def translate_text(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # Move inputs to the model's device (the Trainer may have moved the model to GPU during training)
    inputs = inputs.to(model.device)
    translated = model.generate(**inputs)
    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

# Test translation
sample_text = "How are you?"
hindi_translation = translate_text(sample_text)
print(f"English: {sample_text}")
print(f"Hindi: {hindi_translation[0]}")