Translation · Malayalam · English

This model fine-tunes google/mt5-small for English-to-Malayalam translation. The full training script is shown below.

```python
import os
import argparse

import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

from utils import compute_metrics


def load_dataset(file_path):
    """Load the CSV corpus and prepare the dataset."""
    df = pd.read_csv(file_path)
    dataset = Dataset.from_pandas(df)
    # Split dataset into train (90%) and validation (10%)
    split_dataset = dataset.train_test_split(test_size=0.1)
    return split_dataset


def preprocess_function(examples, tokenizer, max_length=128):
    """Tokenize source (English) and target (Malayalam) texts."""
    inputs = [ex for ex in examples["english_text"]]
    targets = [ex for ex in examples["malayalam_text"]]

    model_inputs = tokenizer(
        inputs,
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=max_length,
            truncation=True,
            padding="max_length",
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def main(args):
    # Load tokenizer and model
    model_name = "google/mt5-small"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # Load and preprocess dataset
    dataset = load_dataset("dataset/malayalam_dataset.csv")

    # Tokenize datasets
    tokenized_datasets = dataset.map(
        lambda x: preprocess_function(x, tokenizer),
        batched=True,
        remove_columns=dataset["train"].column_names,
    )

    # Define training arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir="./model",
        evaluation_strategy="epoch",
        learning_rate=args.learning_rate,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        num_train_epochs=args.epochs,
        weight_decay=0.01,
        save_total_limit=2,
        predict_with_generate=True,
        logging_dir="./logs",
        logging_steps=100,
        push_to_hub=True,
    )

    # Create data collator
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    # Initialize trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Save the model
    trainer.save_model("./model")
    tokenizer.save_pretrained("./model")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--learning_rate", type=float, default=2e-5)
    args = parser.parse_args()
    main(args)
```
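The script imports `compute_metrics` from a local `utils` module that is not included in this card. A minimal sketch of what it could look like, assuming a sacreBLEU-based metric via the `evaluate` library (the module contents below are an assumption, not the author's code):

```python
# Hypothetical utils.py: a sketch of compute_metrics, assuming a
# sacreBLEU-based evaluation via the `evaluate` library.
import numpy as np
import evaluate
from transformers import AutoTokenizer

# NOTE: must match the tokenizer used in the training script.
tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
bleu = evaluate.load("sacrebleu")


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    # predict_with_generate=True yields token IDs; decode them to text.
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace any -100 padding in the labels before decoding.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = bleu.compute(
        predictions=decoded_preds,
        references=[[label] for label in decoded_labels],
    )
    return {"bleu": result["score"]}
```

Because `predict_with_generate=True` is set, the Trainer passes generated token IDs to this function during evaluation, so the decode step is what maps them back to text.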
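Training can be launched with, for example, `python train.py --epochs 3 --batch_size 8 --learning_rate 2e-5` (the filename `train.py` is an assumption; the card does not name the script). After training, the model and tokenizer are saved to `./model`. A minimal inference sketch against that checkpoint (paths and generation settings are assumptions):

```python
# Minimal inference sketch, assuming the model was trained and saved
# to ./model by the script above.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("./model")
model = AutoModelForSeq2SeqLM.from_pretrained("./model")

text = "How are you?"  # English source sentence
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
outputs = model.generate(**inputs, max_length=128, num_beams=4)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

No task prefix is added during training, so none is used here either; the model is fed plain English text and generates the Malayalam translation.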
