Spaces:

pdltiet
/

PDL_translate

Sleeping

File size: 5,959 Bytes

71ad94c
6d1e318
 
af923d2
 
 
 
 
 
71ad94c
af923d2
71ad94c
 
 
af923d2
 
 
 
71ad94c
6d1e318
 
 
 
 
 
 
 
 
 
 
 
 
71ad94c
3134ca6
79868fd
24332df
 
af923d2
 
6d1e318
 
 
 
 
 
 
 
 
0702769
 
 
af923d2
 
 
 
 
01e654b
af923d2
 
 
 
 
79868fd
af923d2
 
 
 
 
 
 
 
 
3134ca6
af923d2
 
 
 
8eb6a3c
79868fd
af923d2
ef6d6f0
af923d2
 
8eb6a3c
ef6d6f0
af923d2
ef6d6f0
af923d2
 
8eb6a3c
ef6d6f0
af923d2
ef6d6f0
af923d2
 
8eb6a3c
ef6d6f0
 
af923d2
 
 
79868fd
3134ca6
af923d2
79868fd
af923d2
 
 
 
 
 
79868fd
3134ca6
af923d2
 
 
7a180a8
 
 
 
 
af923d2
509ee5f

import gradio as gr
import nltk
# Fetch the 'punkt' NLTK resource at import time; nltk.sent_tokenize (used by
# split_text_into_batches below) presumably relies on it — confirm against the
# installed NLTK version.
nltk.download('punkt')
# Project-local language tables: a name -> code mapping plus the lists of
# language names offered in the source/target dropdowns.
from lang_list import (
    LANGUAGE_NAME_TO_CODE,
    T2TT_TARGET_LANGUAGE_NAMES,
    TEXT_SOURCE_LANGUAGE_NAMES,
)
# Initial value of the "Target language" dropdown.
DEFAULT_TARGET_LANGUAGE = "English"
from transformers import SeamlessM4TForTextToText
from transformers import AutoProcessor
# Model and processor are loaded once, at import time, from the same
# checkpoint name and are shared by every call to run_t2tt.
model = SeamlessM4TForTextToText.from_pretrained("facebook/hf-seamless-m4t-medium")
processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")

# text_inputs = processor(text = "Hello, my dog is cute", src_lang="eng", return_tensors="pt")
# output_tokens = model.generate(**text_inputs, tgt_lang="pan")
# translated_text_from_text = processor.decode(output_tokens[0].tolist(), skip_special_tokens=True)
# print(translated_text_from_text)

def split_text_into_batches(text, max_tokens_per_batch):
    """Group the sentences of *text* into batches of bounded length.

    The text is split into sentences with ``nltk.sent_tokenize`` and the
    sentences are packed greedily, in order, into space-joined batches.

    Args:
        text: The text to split.
        max_tokens_per_batch: Upper bound for a batch. Despite the name, the
            size is measured in characters (``len`` of the sentence strings,
            plus one separator per sentence), not in model tokens.

    Returns:
        A list of non-empty batch strings. A single sentence longer than the
        limit is never split; it becomes a batch of its own.
    """
    batches = []
    pending = []      # sentences collected for the batch being built
    pending_size = 0  # characters in `pending`, one separator per sentence
    for sentence in nltk.sent_tokenize(text):
        sentence_size = len(sentence) + 1  # +1 for the joining space
        # Flush only when something is pending: an over-long first sentence
        # must not push an empty batch into the result.
        if pending and pending_size + sentence_size > max_tokens_per_batch:
            batches.append(" ".join(pending).strip())
            pending = []
            pending_size = 0
        pending.append(sentence)
        pending_size += sentence_size
    if pending:
        batches.append(" ".join(pending).strip())
    return batches

def run_t2tt(file_uploader, input_text: str, source_language: str, target_language: str) -> (str, str):
    """Translate text, or the contents of an uploaded text file.

    The input is split into batches with ``split_text_into_batches``, each
    batch is translated with the module-level ``processor``/``model``, and the
    translated batches are joined with single spaces.

    Args:
        file_uploader: Path of an uploaded text file, or ``None``. When given,
            the file's contents replace ``input_text``.
        input_text: Text to translate; ignored when a file is supplied.
        source_language: Language name; must be a key of
            ``LANGUAGE_NAME_TO_CODE``.
        target_language: Language name; must be a key of
            ``LANGUAGE_NAME_TO_CODE``.

    Returns:
        A ``(translated_text, output_path)`` pair, where ``output_path`` is
        the name of the file the translation was also written to.

    Raises:
        KeyError: If either language name is not in ``LANGUAGE_NAME_TO_CODE``.
    """
    if file_uploader is not None:
        # Read as UTF-8 explicitly: the platform default encoding may be
        # unable to decode non-ASCII text.
        with open(file_uploader, 'r', encoding="utf-8") as file:
            input_text = file.read()
    source_language_code = LANGUAGE_NAME_TO_CODE[source_language]
    target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
    max_tokens_per_batch = 256
    # Treat a missing text value as empty input rather than crashing.
    batches = split_text_into_batches(input_text or "", max_tokens_per_batch)
    translated_batches = []
    for batch in batches:
        if not batch:
            # Nothing to translate; skip the model call.
            continue
        text_inputs = processor(text=batch, src_lang=source_language_code, return_tensors="pt")
        output_tokens = model.generate(**text_inputs, tgt_lang=target_language_code)
        translated_batches.append(
            processor.decode(output_tokens[0].tolist(), skip_special_tokens=True)
        )
    output = " ".join(translated_batches).strip()
    # NOTE: fixed path in the working directory; every call overwrites it.
    _output_name = "result.txt"
    # Context manager guarantees the data is flushed and the handle closed
    # before the path is returned; UTF-8 so any target script can be written.
    with open(_output_name, 'w', encoding="utf-8") as out_file:
        out_file.write(output)
    return output, _output_name

# Example rows for the demo. Each row lines up with the inputs of run_t2tt:
# [uploaded file, input text, source language, target language].
_T2TT_EXAMPLES = [
    [
        None,
        "The sinister destruction of the holy Akal Takht and the ruthless massacre of thousands of innocent pilgrims had unmasked the deep-seated hatred and animosity that the Indian Government had been nurturing against Sikhs ever since independence",
        "English",
        "Punjabi",
    ],
    [
        None,
        "It contains. much useful information about administrative, revenue, judicial and ecclesiastical activities in various areas which, it is hoped, would supplement the information available in official records.",
        "English",
        "Hindi",
    ],
    [
        None,
        "दुनिया में बहुत सी अलग-अलग भाषाएं हैं और उनमें अपने वर्ण और शब्दों का भंडार होता है. इसमें में कुछ उनके अपने शब्द होते हैं तो कुछ ऐसे भी हैं, जो दूसरी भाषाओं से लिए जाते हैं.",
        "Hindi",
        "Punjabi",
    ],
    [
        None,
        "ਸੂੂਬੇ ਦੇ ਕਈ ਜ਼ਿਲ੍ਹਿਆਂ ’ਚ ਬੁੱਧਵਾਰ ਸਵੇਰੇ ਸੰਘਣੀ ਧੁੰਦ ਛਾਈ ਰਹੀ ਤੇ ਤੇਜ਼ ਹਵਾਵਾਂ ਨੇ ਕਾਂਬਾ ਹੋਰ ਵਧਾ ਦਿੱਤਾ। ਸੱਤ ਸ਼ਹਿਰਾਂ ’ਚ ਦਿਨ ਦਾ ਤਾਪਮਾਨ ਦਸ ਡਿਗਰੀ ਸੈਲਸੀਅਸ ਦੇ ਆਸਪਾਸ ਰਿਹਾ। ਸੂਬੇ ’ਚ ਵੱਧ ਤੋਂ ਵੱਧ ਤਾਪਮਾਨ ’ਚ ਵੀ ਦਸ ਡਿਗਰੀ ਸੈਲਸੀਅਸ ਦੀ ਗਿਰਾਵਟ ਦਰਜ ਕੀਤੀ ਗਈ",
        "Punjabi",
        "English",
    ],
]

with gr.Blocks() as demo_t2tt:
    with gr.Row():
        # First column: the inputs and the button that starts a translation.
        with gr.Column():
            with gr.Group():
                file_uploader = gr.File(label="Upload a text file (Optional)")
                input_text = gr.Textbox(label="Input text")
                with gr.Row():
                    source_language = gr.Dropdown(
                        choices=TEXT_SOURCE_LANGUAGE_NAMES,
                        value="Punjabi",
                        label="Source language",
                    )
                    target_language = gr.Dropdown(
                        choices=T2TT_TARGET_LANGUAGE_NAMES,
                        value=DEFAULT_TARGET_LANGUAGE,
                        label="Target language",
                    )
            btn = gr.Button("Translate")
        # Second column: the translated text and the file holding it.
        with gr.Column():
            output_text = gr.Textbox(label="Translated text")
            output_file = gr.File(label="Translated text file")

    # Examples are wired to run_t2tt without caching and without an API name.
    gr.Examples(
        examples=_T2TT_EXAMPLES,
        fn=run_t2tt,
        inputs=[file_uploader, input_text, source_language, target_language],
        outputs=[output_text, output_file],
        cache_examples=False,
        api_name=False,
    )

    # Submitting the text box and clicking the button both call run_t2tt.
    gr.on(
        fn=run_t2tt,
        triggers=[input_text.submit, btn.click],
        inputs=[file_uploader, input_text, source_language, target_language],
        outputs=[output_text, output_file],
        api_name="t2tt",
    )

# Top-level app: one "Translate" tab that renders the translation UI.
with gr.Blocks() as demo, gr.Tabs(), gr.Tab(label="Translate"):
    demo_t2tt.render()

# Launch only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    demo.launch()