# chinhon's picture
# Update app.py
# e631c98
# raw
# history blame contribute delete
# No virus
# 2.18 kB
import gradio as gr
import nltk
import numpy as np
import re
import warnings
from nltk.tokenize import sent_tokenize
from transformers import (
MarianTokenizer,
MarianMTModel,
)
# Fetch the NLTK "punkt" resource when the module is loaded. Presumably this is
# the sentence-splitting data that sent_tokenize (used in translate) relies on;
# confirm the resource name against the NLTK version this app is pinned to.
nltk.download('punkt')
# Promotional footer targeted for removal; covers both the "coronavirus" and
# the "COVID-19" wording. The pattern ends with the footer's own URL.
_FOOTER_PATTERN = re.compile(
    r"Download our app or subscribe to our Telegram channel for the latest "
    r"updates on the (?:coronavirus|COVID-19) outbreak: "
    r"https://cna\.asia/telegram"
)


def clean_text(text):
    """Normalise raw article text before it is sentence-split and translated.

    Processing order:
      1. drop every non-ASCII character;
      2. turn newlines and tabs into spaces;
      3. remove the promotional footer (including its URL);
      4. remove any remaining ``http...`` URLs;
      5. remove the literal marker ``ADVERTISEMENT``;
      6. collapse runs of spaces and trim the ends.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string; empty if nothing survives the cleaning.
    """
    # Encoding to ASCII with errors="ignore" silently discards anything
    # outside ASCII, which also removes any Chinese characters in the input.
    text = text.encode("ascii", errors="ignore").decode("ascii")
    text = re.sub(r"[\n\t]", " ", text)
    # The footer must be removed BEFORE generic URL stripping: its pattern
    # ends with a URL, so stripping URLs first would leave the sentence
    # unmatched and it would stay in the output.
    text = _FOOTER_PATTERN.sub(" ", text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"ADVERTISEMENT", " ", text)
    # Collapse multiple spaces into one, then trim surrounding whitespace.
    return re.sub(" +", " ", text).strip()
# Model identifier passed to from_pretrained for both tokenizer and model.
modchoice = "Helsinki-NLP/opus-mt-en-zh"

# Loaded (tokenizer, model) pairs keyed by model identifier, filled on first
# use so the weights are not re-loaded on every request.
_TRANSLATOR_CACHE = {}


def _load_translator(model_name):
    """Return the ``(tokenizer, model)`` pair for *model_name*, loading it once."""
    if model_name not in _TRANSLATOR_CACHE:
        _TRANSLATOR_CACHE[model_name] = (
            MarianTokenizer.from_pretrained(model_name),
            MarianMTModel.from_pretrained(model_name),
        )
    return _TRANSLATOR_CACHE[model_name]


def translate(text):
    """Translate English *text* with the MarianMT model named by ``modchoice``.

    The input is cleaned with ``clean_text``, split into sentences, translated
    as one padded batch, and the decoded sentences are joined with spaces.

    Args:
        text: English text to translate. May be ``None`` or empty.

    Returns:
        The translated text as a single string, or the string ``"Error"``
        when there is nothing to translate (missing input, or input that is
        empty after cleaning). Both paths return ``str``.
    """
    # Reject missing input before touching clean_text, which would fail on None.
    if not text:
        return "Error"
    input_text = clean_text(text)
    # Cleaning can leave nothing (e.g. whitespace-only or non-ASCII-only
    # input); an empty sentence list must not reach the tokenizer.
    if not input_text:
        return "Error"

    tokenizer, model = _load_translator(modchoice)
    # Calling the tokenizer directly replaces the deprecated
    # prepare_seq2seq_batch helper; the keyword arguments are unchanged.
    batch = tokenizer(
        sent_tokenize(input_text),
        truncation=True,
        padding="longest",
        return_tensors="pt",
    )
    translated = model.generate(**batch)
    tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
    return " ".join(tgt_text)
# Build the input and output widgets first, then wire them to translate().
english_box = gr.inputs.Textbox(lines=20, label="Paste English text here")
chinese_box = gr.outputs.Textbox(label="Chinese translation")

gradio_ui = gr.Interface(
    fn=translate,
    inputs=english_box,
    outputs=chinese_box,
    title="English-to-Chinese translation",
    description="Translate English text into Chinese using MarianMT's opus-mt-en-zh model.",
    theme="huggingface",
)

# Start the web app with request queueing enabled.
gradio_ui.launch(enable_queue=True)