# translator / app.py
# Hugging Face Space by breadlicker45 — commit 9d0782b (verified)
# (header recovered from HF file-viewer scrape residue: "raw / history / blame / 2.06 kB")
import gradio as gr
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import tiktoken
# Pretrained mBART-50 many-to-many translation checkpoint; the same id is
# used for both the model weights and the matching fast tokenizer.
_CHECKPOINT = "facebook/mbart-large-50-many-to-many-mmt"
model = MBartForConditionalGeneration.from_pretrained(_CHECKPOINT)
tokenizer = MBart50TokenizerFast.from_pretrained(_CHECKPOINT)
# Human-readable language name -> mBART-50 language code.
# Keys populate both Gradio dropdowns (insertion order is the display order);
# values are the special language tokens the tokenizer/model expect
# (e.g. "en_XX" as source language and as the forced BOS target token).
language_codes = {
"Arabic": "ar_AR",
"Czech": "cs_CZ",
"German": "de_DE",
"English": "en_XX",
"Spanish": "es_XX",
"Estonian": "et_EE",
"Finnish": "fi_FI",
"French": "fr_XX",
"Gujarati": "gu_IN",
"Hindi": "hi_IN",
"Italian": "it_IT",
"Japanese": "ja_XX",
"Kazakh": "kk_KZ",
"Korean": "ko_KR",
"Lithuanian": "lt_LT",
"Latvian": "lv_LV",
"Burmese": "my_MM",
"Nepali": "ne_NP",
"Dutch": "nl_XX",
"Romanian": "ro_RO",
"Russian": "ru_RU",
"Sinhala": "si_LK",
"Turkish": "tr_TR",
"Vietnamese": "vi_VN",
"Chinese": "zh_CN",
}
def translate(text, src_lang, tgt_lang):
    """Translate *text* from *src_lang* to *tgt_lang* using mBART-50.

    Args:
        text: Input text to translate.
        src_lang: Human-readable source language name (a key of ``language_codes``).
        tgt_lang: Human-readable target language name (a key of ``language_codes``).

    Returns:
        The translated text, or "" when the input is empty/whitespace.
    """
    # Guard: the model produces garbage for empty prompts; short-circuit instead.
    if not text or not text.strip():
        return ""
    # Tell the tokenizer which language token to prepend to the source.
    tokenizer.src_lang = language_codes[src_lang]
    # Tokenize the input text
    encoded = tokenizer(text, return_tensors="pt")
    # Force decoding to start with the target-language token.
    # NOTE: `tokenizer.lang_code_to_id[...]` was removed from
    # MBart50TokenizerFast in recent transformers releases; the supported
    # replacement is convert_tokens_to_ids on the language code token.
    generated_tokens = model.generate(
        **encoded,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(language_codes[tgt_lang]),
    )
    # Decode the generated tokens (single batch item -> take element 0).
    translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    return translation
# Build the web UI: one text input, two language selectors, one text output.
# Both dropdowns share the same set of language names.
_LANG_NAMES = list(language_codes.keys())
iface = gr.Interface(
    fn=translate,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Dropdown(choices=_LANG_NAMES, label="Source Language"),
        gr.Dropdown(choices=_LANG_NAMES, label="Target Language"),
    ],
    outputs=gr.Textbox(label="Translated Text"),
    title="Multilingual Translation with MBart",
    description="Translate text between multiple languages using the MBart model.",
)

# Start the Gradio server.
iface.launch()