Spaces:
Sleeping
Sleeping
File size: 5,959 Bytes
71ad94c 6d1e318 af923d2 71ad94c af923d2 71ad94c af923d2 71ad94c 6d1e318 71ad94c 3134ca6 79868fd 24332df af923d2 6d1e318 0702769 af923d2 01e654b af923d2 79868fd af923d2 3134ca6 af923d2 8eb6a3c 79868fd af923d2 ef6d6f0 af923d2 8eb6a3c ef6d6f0 af923d2 ef6d6f0 af923d2 8eb6a3c ef6d6f0 af923d2 ef6d6f0 af923d2 8eb6a3c ef6d6f0 af923d2 79868fd 3134ca6 af923d2 79868fd af923d2 79868fd 3134ca6 af923d2 7a180a8 af923d2 509ee5f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
import gradio as gr
import nltk
# Fetch the Punkt sentence-tokenizer data used by nltk.sent_tokenize.
# Newer NLTK releases load the "punkt_tab" resource instead of the pickled
# "punkt" models; requesting both keeps the app working across versions
# (an unknown resource id only logs an error, it does not raise).
nltk.download('punkt')
nltk.download('punkt_tab')
from lang_list import (
LANGUAGE_NAME_TO_CODE,
T2TT_TARGET_LANGUAGE_NAMES,
TEXT_SOURCE_LANGUAGE_NAMES,
)
DEFAULT_TARGET_LANGUAGE = "English"
from transformers import SeamlessM4TForTextToText
from transformers import AutoProcessor
# Load the text-to-text translation model and its matching processor once, at
# import time; both are used as module-level globals by the code below.
_CHECKPOINT = "facebook/hf-seamless-m4t-medium"
model = SeamlessM4TForTextToText.from_pretrained(_CHECKPOINT)
processor = AutoProcessor.from_pretrained(_CHECKPOINT)

# Minimal usage example ("eng" -> "pan"), kept for reference only:
#   text_inputs = processor(text="Hello, my dog is cute", src_lang="eng", return_tensors="pt")
#   output_tokens = model.generate(**text_inputs, tgt_lang="pan")
#   translated_text_from_text = processor.decode(output_tokens[0].tolist(), skip_special_tokens=True)
#   print(translated_text_from_text)
def split_text_into_batches(text, max_tokens_per_batch, sentence_tokenizer=None):
    """Group the sentences of *text* into space-joined batches of bounded size.

    Sentences are never split: consecutive sentences are packed greedily into
    a batch until adding the next one would exceed the limit, at which point a
    new batch is started.

    Args:
        text: The text to split. Empty or ``None`` yields an empty list.
        max_tokens_per_batch: Size budget per batch. Despite the name, the
            budget is measured in *characters* (``len`` of each sentence plus
            one separator), not model tokens.
        sentence_tokenizer: Optional callable ``str -> iterable of str`` used
            to split *text* into sentences. Defaults to ``nltk.sent_tokenize``.
            Supplying one is useful when the default tokenizer does not
            recognise the sentence terminators of the input language.

    Returns:
        A list of non-empty, whitespace-stripped batch strings. A single
        sentence longer than the budget is emitted as its own (oversized)
        batch rather than being truncated.
    """
    if not text:
        return []
    if sentence_tokenizer is None:
        sentence_tokenizer = nltk.sent_tokenize

    batches = []
    pending = []      # sentences collected for the batch being built
    pending_len = 0   # characters used so far, one separator per sentence

    for sentence in sentence_tokenizer(text):
        cost = len(sentence) + 1  # +1 for the joining space
        # Only flush when there is something to flush; otherwise an oversized
        # first sentence would emit an empty batch ahead of itself.
        if pending and pending_len + cost > max_tokens_per_batch:
            batches.append(" ".join(pending).strip())
            pending = []
            pending_len = 0
        pending.append(sentence)
        pending_len += cost

    if pending:
        batches.append(" ".join(pending).strip())

    # Drop batches that are empty after stripping (whitespace-only sentences).
    return [batch for batch in batches if batch]
def run_t2tt(file_uploader, input_text: str, source_language: str, target_language: str) -> (str, str):
    """Translate text (typed or uploaded) and save the result to a text file.

    Args:
        file_uploader: Path of an uploaded text file, or ``None``. When given,
            its contents replace ``input_text``. The file is read as UTF-8.
        input_text: Text to translate when no file is supplied.
        source_language: Source language name; must be a key of
            ``LANGUAGE_NAME_TO_CODE``.
        target_language: Target language name; must be a key of
            ``LANGUAGE_NAME_TO_CODE``.

    Returns:
        A ``(translated_text, output_path)`` pair, where ``output_path`` is the
        name of the UTF-8 text file the translation was written to.

    Raises:
        KeyError: If either language name is not in ``LANGUAGE_NAME_TO_CODE``.
    """
    if file_uploader is not None:
        # Explicit encoding: the locale default may not be able to decode
        # non-ASCII input such as Hindi or Punjabi text.
        with open(file_uploader, "r", encoding="utf-8") as source_file:
            input_text = source_file.read()
    input_text = input_text or ""  # tolerate a missing/empty text box value

    source_language_code = LANGUAGE_NAME_TO_CODE[source_language]
    target_language_code = LANGUAGE_NAME_TO_CODE[target_language]

    # Translate in bounded chunks so each generate() call sees a short input.
    max_tokens_per_batch = 256
    translated_batches = []
    for batch in split_text_into_batches(input_text, max_tokens_per_batch):
        if not batch:
            continue  # nothing to translate in an empty chunk
        text_inputs = processor(text=batch, src_lang=source_language_code, return_tensors="pt")
        output_tokens = model.generate(**text_inputs, tgt_lang=target_language_code)
        translated_batches.append(
            processor.decode(output_tokens[0].tolist(), skip_special_tokens=True)
        )
    output = " ".join(translated_batches).strip()

    # NOTE(review): the output name is fixed, so overlapping requests would
    # overwrite each other's file — confirm whether concurrent use is expected.
    _output_name = "result.txt"
    with open(_output_name, "w", encoding="utf-8") as result_file:
        result_file.write(output)
    return output, _output_name
# Text-to-text translation UI.
# NOTE(review): the original indentation was lost; the nesting below is
# reconstructed (inputs on the left, outputs on the right) — confirm layout.

# Example rows, in the order: file, input text, source language, target language.
_T2TT_EXAMPLES = [
    [
        None,
        "The sinister destruction of the holy Akal Takht and the ruthless massacre of thousands of innocent pilgrims had unmasked the deep-seated hatred and animosity that the Indian Government had been nurturing against Sikhs ever since independence",
        "English",
        "Punjabi",
    ],
    [
        None,
        "It contains. much useful information about administrative, revenue, judicial and ecclesiastical activities in various areas which, it is hoped, would supplement the information available in official records.",
        "English",
        "Hindi",
    ],
    [
        None,
        "दुनिया में बहुत सी अलग-अलग भाषाएं हैं और उनमें अपने वर्ण और शब्दों का भंडार होता है. इसमें में कुछ उनके अपने शब्द होते हैं तो कुछ ऐसे भी हैं, जो दूसरी भाषाओं से लिए जाते हैं.",
        "Hindi",
        "Punjabi",
    ],
    [
        None,
        "ਸੂੂਬੇ ਦੇ ਕਈ ਜ਼ਿਲ੍ਹਿਆਂ ’ਚ ਬੁੱਧਵਾਰ ਸਵੇਰੇ ਸੰਘਣੀ ਧੁੰਦ ਛਾਈ ਰਹੀ ਤੇ ਤੇਜ਼ ਹਵਾਵਾਂ ਨੇ ਕਾਂਬਾ ਹੋਰ ਵਧਾ ਦਿੱਤਾ। ਸੱਤ ਸ਼ਹਿਰਾਂ ’ਚ ਦਿਨ ਦਾ ਤਾਪਮਾਨ ਦਸ ਡਿਗਰੀ ਸੈਲਸੀਅਸ ਦੇ ਆਸਪਾਸ ਰਿਹਾ। ਸੂਬੇ ’ਚ ਵੱਧ ਤੋਂ ਵੱਧ ਤਾਪਮਾਨ ’ਚ ਵੀ ਦਸ ਡਿਗਰੀ ਸੈਲਸੀਅਸ ਦੀ ਗਿਰਾਵਟ ਦਰਜ ਕੀਤੀ ਗਈ",
        "Punjabi",
        "English",
    ],
]

with gr.Blocks() as demo_t2tt:
    with gr.Row():
        # Left column: what to translate and between which languages.
        with gr.Column():
            with gr.Group():
                file_uploader = gr.File(label="Upload a text file (Optional)")
                input_text = gr.Textbox(label="Input text")
                with gr.Row():
                    source_language = gr.Dropdown(
                        label="Source language",
                        choices=TEXT_SOURCE_LANGUAGE_NAMES,
                        value="Punjabi",
                    )
                    target_language = gr.Dropdown(
                        label="Target language",
                        choices=T2TT_TARGET_LANGUAGE_NAMES,
                        value=DEFAULT_TARGET_LANGUAGE,
                    )
            btn = gr.Button("Translate")
        # Right column: translated text plus the file written by run_t2tt.
        with gr.Column():
            output_text = gr.Textbox(label="Translated text")
            output_file = gr.File(label="Translated text file")

    # The examples and the live handler share the same component wiring.
    _t2tt_inputs = [file_uploader, input_text, source_language, target_language]
    _t2tt_outputs = [output_text, output_file]

    gr.Examples(
        examples=_T2TT_EXAMPLES,
        inputs=_t2tt_inputs,
        outputs=_t2tt_outputs,
        fn=run_t2tt,
        cache_examples=False,
        api_name=False,
    )

    # Translate on Enter in the text box or on a button click.
    gr.on(
        triggers=[input_text.submit, btn.click],
        fn=run_t2tt,
        inputs=_t2tt_inputs,
        outputs=_t2tt_outputs,
        api_name="t2tt",
    )
# Top-level app: a single "Translate" tab that hosts the text-to-text UI.
with gr.Blocks() as demo, gr.Tabs(), gr.Tab(label="Translate"):
    demo_t2tt.render()
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()