Spaces:

pdltiet
/

PDL_translate

Sleeping

App Files Files Community

PDL_translate / app.py

vtiw

split text to batches

6d1e318 verified 11 months ago

raw

history blame

5.96 kB

	import gradio as gr
	import nltk

	# Fetch the Punkt data at import time; this module later calls
	# nltk.sent_tokenize() to split input text into sentences.
	nltk.download('punkt')

	from lang_list import (
	    LANGUAGE_NAME_TO_CODE,
	    T2TT_TARGET_LANGUAGE_NAMES,
	    TEXT_SOURCE_LANGUAGE_NAMES,
	)
	from transformers import AutoProcessor, SeamlessM4TForTextToText

	# Target language pre-selected in the "Target language" dropdown.
	DEFAULT_TARGET_LANGUAGE = "English"

	# The model and its processor are loaded from the same checkpoint.
	_CHECKPOINT = "facebook/hf-seamless-m4t-medium"
	model = SeamlessM4TForTextToText.from_pretrained(_CHECKPOINT)
	processor = AutoProcessor.from_pretrained(_CHECKPOINT)

	# Reference example (disabled): translate one English sentence to Punjabi
	# ("eng" -> "pan") with the model and processor loaded above.
	# text_inputs = processor(text = "Hello, my dog is cute", src_lang="eng", return_tensors="pt")
	# output_tokens = model.generate(**text_inputs, tgt_lang="pan")
	# translated_text_from_text = processor.decode(output_tokens[0].tolist(), skip_special_tokens=True)
	# print(translated_text_from_text)

	def split_text_into_batches(text, max_tokens_per_batch, sentence_splitter=None):
	    """Group the sentences of *text* into batches of bounded size.

	    Sentences are packed greedily, in order, and joined with single spaces.
	    A sentence is never cut: one that is longer than the limit on its own
	    becomes a batch by itself.

	    Args:
	        text: The text to split.
	        max_tokens_per_batch: Size limit of one batch. Despite the name, the
	            limit is measured in characters (``len()`` of the sentences plus
	            one separator each), not in model tokens.
	        sentence_splitter: Optional callable mapping the text to an iterable
	            of sentence strings. Defaults to ``nltk.sent_tokenize``; pass a
	            custom splitter for text the default tokenizer may not segment
	            well.

	    Returns:
	        A list of non-empty batch strings; empty when *text* has no sentences.
	    """
	    if sentence_splitter is None:
	        # Resolved lazily so a caller-supplied splitter never touches NLTK.
	        sentence_splitter = nltk.sent_tokenize

	    batches = []
	    pending = []      # sentences of the batch currently being filled
	    pending_len = 0   # characters used so far, one separator per sentence

	    for sentence in sentence_splitter(text):
	        if not sentence.strip():
	            continue  # blank fragments would only produce empty batches
	        cost = len(sentence) + 1  # +1 for the joining space
	        # Flush only when something is pending: an oversized first sentence
	        # must not emit an empty batch ahead of itself.
	        if pending and pending_len + cost > max_tokens_per_batch:
	            batches.append(" ".join(pending).strip())
	            pending, pending_len = [], 0
	        pending.append(sentence)
	        pending_len += cost

	    if pending:
	        batches.append(" ".join(pending).strip())
	    return batches

	def run_t2tt(file_uploader, input_text: str, source_language: str, target_language: str) -> "tuple[str, str]":
	    """Translate typed or uploaded text and also save the result to a file.

	    Args:
	        file_uploader: Path of an uploaded text file, or ``None``. It is passed
	            straight to ``open()``; when given, the file's contents replace
	            ``input_text``. The file is read as UTF-8.
	        input_text: Text to translate when no file is uploaded.
	        source_language: Language name used as a key of
	            ``LANGUAGE_NAME_TO_CODE`` for the input text.
	        target_language: Language name used as a key of
	            ``LANGUAGE_NAME_TO_CODE`` for the translation.

	    Returns:
	        A ``(translated_text, output_path)`` pair, where ``output_path`` is the
	        name of the UTF-8 text file the translation was written to.

	    Raises:
	        KeyError: If a language name is not in ``LANGUAGE_NAME_TO_CODE``.
	        OSError: If the uploaded file cannot be read or the result file
	            cannot be written.
	        UnicodeDecodeError: If the uploaded file is not valid UTF-8.
	    """
	    if file_uploader is not None:
	        # Explicit encoding: the app handles non-ASCII scripts, so the
	        # platform's default locale encoding must not be relied upon.
	        with open(file_uploader, "r", encoding="utf-8") as file:
	            input_text = file.read()
	    input_text = input_text or ""

	    source_language_code = LANGUAGE_NAME_TO_CODE[source_language]
	    target_language_code = LANGUAGE_NAME_TO_CODE[target_language]

	    # The splitter compares this limit against character counts.
	    max_tokens_per_batch = 256
	    batches = split_text_into_batches(input_text, max_tokens_per_batch)

	    translated_parts = []
	    for batch in batches:
	        if not batch:
	            continue  # nothing to translate; avoid a pointless model call
	        text_inputs = processor(text=batch, src_lang=source_language_code, return_tensors="pt")
	        output_tokens = model.generate(**text_inputs, tgt_lang=target_language_code)
	        translated_parts.append(
	            processor.decode(output_tokens[0].tolist(), skip_special_tokens=True)
	        )
	    output = " ".join(translated_parts).strip()

	    _output_name = "result.txt"
	    # Context manager guarantees the handle is flushed and closed before the
	    # path is handed back to the caller.
	    with open(_output_name, "w", encoding="utf-8") as out_file:
	        out_file.write(output)
	    return output, _output_name

	with gr.Blocks() as demo_t2tt:
	    with gr.Row():
	        # Input side: optional file, free text, language pickers, trigger.
	        with gr.Column():
	            with gr.Group():
	                file_uploader = gr.File(label="Upload a text file (Optional)")
	                input_text = gr.Textbox(label="Input text")
	                with gr.Row():
	                    source_language = gr.Dropdown(
	                        choices=TEXT_SOURCE_LANGUAGE_NAMES,
	                        value="Punjabi",
	                        label="Source language",
	                    )
	                    target_language = gr.Dropdown(
	                        choices=T2TT_TARGET_LANGUAGE_NAMES,
	                        value=DEFAULT_TARGET_LANGUAGE,
	                        label="Target language",
	                    )
	            btn = gr.Button("Translate")
	        # Output side: translated text and the file produced by run_t2tt.
	        with gr.Column():
	            output_text = gr.Textbox(label="Translated text")
	            output_file = gr.File(label="Translated text file")

	    # Each row follows the order of the input components:
	    # [uploaded file, input text, source language, target language].
	    example_rows = [
	        [
	            None,
	            "The sinister destruction of the holy Akal Takht and the ruthless massacre of thousands of innocent pilgrims had unmasked the deep-seated hatred and animosity that the Indian Government had been nurturing against Sikhs ever since independence",
	            "English",
	            "Punjabi",
	        ],
	        [
	            None,
	            "It contains. much useful information about administrative, revenue, judicial and ecclesiastical activities in various areas which, it is hoped, would supplement the information available in official records.",
	            "English",
	            "Hindi",
	        ],
	        [
	            None,
	            "दुनिया में बहुत सी अलग-अलग भाषाएं हैं और उनमें अपने वर्ण और शब्दों का भंडार होता है. इसमें में कुछ उनके अपने शब्द होते हैं तो कुछ ऐसे भी हैं, जो दूसरी भाषाओं से लिए जाते हैं.",
	            "Hindi",
	            "Punjabi",
	        ],
	        [
	            None,
	            "ਸੂੂਬੇ ਦੇ ਕਈ ਜ਼ਿਲ੍ਹਿਆਂ ’ਚ ਬੁੱਧਵਾਰ ਸਵੇਰੇ ਸੰਘਣੀ ਧੁੰਦ ਛਾਈ ਰਹੀ ਤੇ ਤੇਜ਼ ਹਵਾਵਾਂ ਨੇ ਕਾਂਬਾ ਹੋਰ ਵਧਾ ਦਿੱਤਾ। ਸੱਤ ਸ਼ਹਿਰਾਂ ’ਚ ਦਿਨ ਦਾ ਤਾਪਮਾਨ ਦਸ ਡਿਗਰੀ ਸੈਲਸੀਅਸ ਦੇ ਆਸਪਾਸ ਰਿਹਾ। ਸੂਬੇ ’ਚ ਵੱਧ ਤੋਂ ਵੱਧ ਤਾਪਮਾਨ ’ਚ ਵੀ ਦਸ ਡਿਗਰੀ ਸੈਲਸੀਅਸ ਦੀ ਗਿਰਾਵਟ ਦਰਜ ਕੀਤੀ ਗਈ",
	            "Punjabi",
	            "English",
	        ],
	    ]
	    gr.Examples(
	        examples=example_rows,
	        fn=run_t2tt,
	        inputs=[file_uploader, input_text, source_language, target_language],
	        outputs=[output_text, output_file],
	        cache_examples=False,
	        api_name=False,
	    )

	    # Submitting the textbox and clicking the button both run the translation.
	    gr.on(
	        fn=run_t2tt,
	        triggers=[input_text.submit, btn.click],
	        inputs=[file_uploader, input_text, source_language, target_language],
	        outputs=[output_text, output_file],
	        api_name="t2tt",
	    )

	# Top-level app: a tab container with a single "Translate" tab that renders
	# the translation UI built above.
	with gr.Blocks() as demo, gr.Tabs(), gr.Tab(label="Translate"):
	    demo_t2tt.render()

	# Start the server only when the file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()