Spaces:

chinhon
/

translation_eng2ch

Running

App Files Files Community

translation_eng2ch / app.py

chinhon

Update app.py

e631c98 over 2 years ago

raw

history blame contribute delete

No virus

2.18 kB

	import gradio as gr
	import nltk
	import numpy as np
	import re
	import warnings

	from nltk.tokenize import sent_tokenize
	from transformers import (
	MarianTokenizer,
	MarianMTModel,
	)

	# Fetch NLTK's "punkt" resource at start-up; the translation step splits
	# its input with sent_tokenize (imported above), which presumably relies on
	# it -- TODO confirm the resource name for the pinned NLTK version.
	nltk.download('punkt')

	# define function for text cleaning
	def clean_text(text):
	    """Normalise raw English article text before it is translated.

	    The steps are, in order:

	    1. drop every non-ASCII character (e.g. Chinese characters);
	    2. turn newlines and tabs into spaces;
	    3. remove the known "Download our app ..." Telegram footers;
	    4. remove any remaining URLs and the literal "ADVERTISEMENT" marker;
	    5. collapse runs of spaces and trim the ends.

	    Args:
	        text: The raw input string.

	    Returns:
	        The cleaned, single-line string (possibly empty).
	    """
	    # Round-trip through ASCII to discard anything that is not plain ASCII.
	    text = text.encode("ascii", errors="ignore").decode("ascii")

	    # One pass covers "\n", "\n\n" and "\t"; the extra spaces this leaves
	    # behind are collapsed at the end.
	    text = re.sub(r"[\n\t]", " ", text)

	    # The footers end in a URL, so they must be removed BEFORE the generic
	    # URL stripping below; otherwise they can never match and the sentence
	    # stub is left in the text.
	    footers = (
	        "Download our app or subscribe to our Telegram channel for the latest updates on the coronavirus outbreak: https://cna.asia/telegram",
	        "Download our app or subscribe to our Telegram channel for the latest updates on the COVID-19 outbreak: https://cna.asia/telegram",
	    )
	    for footer in footers:
	        text = text.replace(footer, " ")

	    text = re.sub(r"http\S+", "", text)
	    text = re.sub(r"ADVERTISEMENT", " ", text)

	    # Squeeze multiple spaces into one and trim leading/trailing whitespace.
	    return re.sub(r" +", " ", text).strip()


	# define function for translation
	modchoice = "Helsinki-NLP/opus-mt-en-zh"

	# Filled on first use so the tokenizer and model are loaded once per
	# process instead of once per request.
	_loaded_translators = {}


	def _get_translator(model_name):
	    """Return the (tokenizer, model) pair for model_name, loading it at most once."""
	    if model_name not in _loaded_translators:
	        _loaded_translators[model_name] = (
	            MarianTokenizer.from_pretrained(model_name),
	            MarianMTModel.from_pretrained(model_name),
	        )
	    return _loaded_translators[model_name]


	def translate(text):
	    """Translate English text to Chinese with the model named by modchoice.

	    The input is cleaned with clean_text, split into sentences with
	    sent_tokenize, translated as one padded batch, and the decoded
	    sentences are joined with single spaces.

	    Args:
	        text: English text to translate.

	    Returns:
	        The translated string, or the string "Error" when there is nothing
	        to translate (None, empty, or empty after cleaning).
	    """
	    # Validate on the CLEANED text: whitespace-only or non-ASCII-only input
	    # cleans down to "" and would otherwise reach the tokenizer as an empty
	    # batch. Guarding on `text` first also avoids calling clean_text(None).
	    input_text = clean_text(text) if text else ""
	    if not input_text:
	        # Plain string (not a 1-tuple) so both paths return the same type.
	        return "Error"

	    tokenizer, model = _get_translator(modchoice)

	    # Calling the tokenizer directly replaces the deprecated
	    # prepare_seq2seq_batch helper.
	    batch = tokenizer(
	        sent_tokenize(input_text),
	        truncation=True,
	        padding="longest",
	        return_tensors="pt",
	    )
	    translated = model.generate(**batch)

	    tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

	    return " ".join(tgt_text)


	# Build the UI components first, then wire them to translate().
	# NOTE(review): the gr.inputs / gr.outputs namespaces and launch(enable_queue=...)
	# look like legacy Gradio API -- confirm against the pinned Gradio version
	# before upgrading.
	english_box = gr.inputs.Textbox(lines=20, label="Paste English text here")
	chinese_box = gr.outputs.Textbox(label="Chinese translation")

	gradio_ui = gr.Interface(
	    fn=translate,
	    inputs=english_box,
	    outputs=chinese_box,
	    title="English-to-Chinese translation",
	    description="Translate English text into Chinese using MarianMT's opus-mt-en-zh model.",
	    theme="huggingface",
	)

	# Start the app with request queueing switched on.
	gradio_ui.launch(enable_queue=True)