Spaces:

naofunyannn
/

LLaMAX_Translator

Sleeping

File size: 8,674 Bytes

cc69e73

import torch
import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import re
from polyglot.detect import Detector
from nltk.translate.bleu_score import sentence_bleu

# Hugging Face access token taken from the environment; None when unset.
# NOTE(review): not referenced anywhere else in the visible code.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Checkpoint used for both the tokenizer and the model loaded below.
MODEL = "LLaMAX/LLaMAX3-8B-Alpaca"
# NOTE(review): not referenced anywhere else in the visible code.
RELATIVE_MODEL = "LLaMAX/LLaMAX3-8B"

# HTML heading passed to gr.Markdown at the top of the page.
TITLE = "<h1><center>LLaMAX Translator</center></h1>"


# Load the tokenizer and the half-precision model once, at import time.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    device_map="auto",
    torch_dtype=torch.float16,
)


def lang_detector(text):
    """Guess the language of *text* with polyglot's ``Detector``.

    Returns:
        The text captured by ``name: (\\w+)`` in the string form of the
        detected language, the literal ``"Input text too short"`` when
        *text* has fewer than five characters, or an ``"ERROR：..."`` string
        carrying the exception message when anything in detection fails.
    """
    # Guard clause: very short inputs are rejected before detection runs.
    if len(text) < 5:
        return "Input text too short"

    try:
        description = str(Detector(text).language)
        match = re.search(r"name: (\w+)", description)
        # A missing match raises AttributeError here and is reported below.
        return match.group(1)
    except Exception as exc:
        return f"ERROR：{exc!s}"

def Prompt_template(inst, prompt, query, src_language, trg_language):
    """Build the complete model prompt for one translation request.

    Args:
        inst: Instruction template; ``{src_language}`` and ``{trg_language}``
            are filled in with ``str.format``.
        prompt: Either a plain preamble, or a full template that contains
            both ``{instruction}`` and ``{query}`` placeholders.
        query: The text to translate.
        src_language: Name of the source language.
        trg_language: Name of the target language.

    Returns:
        The prompt string. When *prompt* contains both placeholders they are
        substituted in place; otherwise the Instruction / Input / Response
        section is appended to *prompt*.

    Raises:
        KeyError, IndexError, ValueError: propagated from ``str.format`` when
            *inst* contains braces other than the two supported fields.
    """
    inst = inst.format(src_language=src_language, trg_language=trg_language)
    # The instruction is wrapped in backticks before being placed in the prompt.
    instruction = f"`{inst}`"

    if "{instruction}" in prompt and "{query}" in prompt:
        # Full template: fill the placeholders instead of appending a second
        # section, which would leave the literal "{instruction}" / "{query}"
        # text in the model input. A single-pass substitution keeps braces
        # inside the user's text from being re-interpreted.
        fields = {"instruction": instruction, "query": query}
        return re.sub(
            r"\{(instruction|query)\}",
            lambda match: fields[match.group(1)],
            prompt,
        )

    # Plain preamble: append the Instruction / Input / Response section.
    return (
        f'{prompt}'
        f'### Instruction:\n{instruction}\n'
        f'### Input:\n{query}\n### Response:'
    )

# TODO: unfinished placeholder; nothing in the visible code calls it.
def chunk_text():
    """Do nothing and return None; text chunking is not implemented yet."""
    return None

def calculate_bleu_score(candidate: str, references: list):
    """Score *candidate* against *references* with NLTK's ``sentence_bleu``.

    Args:
        candidate: Hypothesis text; it is split on whitespace into tokens.
        references: Passed to ``sentence_bleu`` unchanged. The caller in this
            file supplies a list of token lists.

    Returns:
        The value returned by ``sentence_bleu``.
    """
    return sentence_bleu(references, candidate.split())
    
@spaces.GPU(duration=60)
def translate(
    source_text: str,
    source_lang: str,
    target_lang: str,
    inst: str,
    prompt: str,
    max_length: int,
    temperature: float,
    top_p: float,
    rp: float,
):
    """Translate *source_text* with the loaded model and yield the result.

    Args:
        source_text: Text to translate.
        source_lang: Source language name inserted into the instruction.
        target_lang: Target language name inserted into the instruction.
        inst: Instruction template forwarded to ``Prompt_template``.
        prompt: Prompt preamble/template forwarded to ``Prompt_template``.
        max_length: ``max_length`` passed to ``model.generate``.
        temperature: Sampling temperature; ``0`` selects greedy decoding.
        top_p: Nucleus-sampling threshold (used only when sampling).
        rp: Repetition penalty.

    Yields:
        A single ``(translation, bleu_score)`` tuple.
    """
    print(f'Text is - {source_text}')

    prompt = Prompt_template(inst, prompt, source_text, source_lang, target_lang)
    # Keep the whole encoding so the attention mask is forwarded to generate().
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_token_count = inputs["input_ids"].shape[-1]

    # The UI slider allows temperature == 0, which generate() rejects when
    # sampling is on; fall back to greedy decoding in that case.
    sampling = temperature > 0
    generate_kwargs = dict(
        max_length=int(max_length),
        do_sample=sampling,
        repetition_penalty=rp,
    )
    if sampling:
        generate_kwargs.update(temperature=temperature, top_p=top_p)

    outputs = model.generate(**inputs, **generate_kwargs)

    # Decode only the newly generated tokens. Slicing the decoded text by
    # len(prompt) breaks whenever the tokenizer round-trip does not reproduce
    # the prompt character-for-character.
    translation = tokenizer.decode(
        outputs[0][prompt_token_count:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    # NOTE(review): the generated text is used as its own reference, so this
    # score does not measure translation quality; supply real references to
    # make it meaningful.
    references = [translation.split()]
    bleu_score = calculate_bleu_score(translation, references)

    yield translation, bleu_score

# Custom stylesheet handed to gr.Blocks(css=...): styles the <h1> title and
# hides the footer element.
# NOTE(review): the last `font-family` declaration sits outside any selector,
# so it is presumably ignored by browsers — confirm the intended selector.
CSS = """

    h1 {

        text-align: center;

        display: block;

        height: 10vh;

        align-content: center;

        font-family: Arial, Helvetica, sans-serif;

    }

    footer {

        visibility: hidden;

    }

    font-family: Arial, Helvetica, sans-serif;

"""

# Markdown/HTML snippet rendered below the buttons; links to the model card.
LICENSE = """

Model: <a href="https://huggingface.co/LLaMAX/LLaMAX3-8B-Alpaca">LLaMAX3-8B-Alpaca</a>

"""

# Language names offered in the source/target dropdowns. The selected name is
# interpolated verbatim into the translation instruction, so spelling matters.
LANG_LIST = [
    'Afrikaans', 'Amharic', 'Arabic', 'Armenian', 'Assamese', 'Asturian', 'Azerbaijani',
    'Belarusian', 'Bengali', 'Bosnian', 'Bulgarian', 'Burmese',
    'Catalan', 'Cebuano', 'Simplified Chinese', 'Traditional Chinese', 'Croatian', 'Czech',
    'Danish', 'Dutch', 'English', 'Estonian', 'Filipino', 'Finnish', 'French', 'Fulah',
    'Galician', 'Ganda', 'Georgian', 'German', 'Greek', 'Gujarati',
    'Hausa', 'Hebrew', 'Hindi', 'Hungarian',
    'Icelandic', 'Igbo', 'Indonesian', 'Irish', 'Italian',
    'Japanese', 'Javanese',
    'Kabuverdianu', 'Kamba', 'Kannada', 'Kazakh', 'Khmer', 'Korean', 'Kyrgyz',
    'Lao', 'Latvian', 'Lingala', 'Lithuanian', 'Luo', 'Luxembourgish',
    'Macedonian', 'Malay', 'Malayalam', 'Maltese', 'Maori', 'Marathi', 'Mongolian',
    'Nepali', 'Northern Sotho', 'Norwegian', 'Nyanja',
    'Occitan', 'Oriya', 'Oromo',
    'Pashto', 'Persian', 'Polish', 'Portuguese', 'Punjabi',
    'Romanian', 'Russian',
    'Serbian', 'Shona', 'Sindhi', 'Slovak', 'Slovenian', 'Somali', 'Sorani', 'Spanish', 'Swahili', 'Swedish',
    'Tajik', 'Tamil', 'Telugu', 'Thai', 'Turkish',
    'Ukrainian', 'Umbundu', 'Urdu', 'Uzbek',
    'Vietnamese', 'Welsh', 'Wolof', 'Xhosa', 'Yoruba', 'Zulu',
]

# NOTE(review): this component is created but never placed in the layout or
# referenced by any event in the visible code.
chatbot = gr.Chatbot(height=600)

# Page layout: a wide column with the text boxes and a narrow column with the
# language selectors and generation settings, followed by the action buttons.
with gr.Blocks(theme="soft", css=CSS) as demo:
    gr.Markdown(TITLE)
    with gr.Row():
        with gr.Column(scale=4):
            source_text = gr.Textbox(
                label="Văn bản gốc",
                value=(
                    "LLaMAX is a language model with powerful multilingual capabilities without loss instruction-following capabilities. "
                    "LLaMAX supports translation between more than 100 languages, "
                    "surpassing the performance of similarly scaled LLMs."
                ),
                lines=10,
            )
            output_text = gr.Textbox(
                label="Văn bản đã được dịch",
                lines=10,
                show_copy_button=True,
            )

            # Read-only box that receives the second value yielded by translate().
            bleu_score_output = gr.Textbox(
                label="BLEU Score",
                lines=10,
                interactive=False,
            )

        with gr.Column(scale=1):
            source_lang = gr.Dropdown(
                label="Ngôn ngữ nguồn",
                value="English",
                choices=LANG_LIST,
            )
            target_lang = gr.Dropdown(
                label="Ngôn ngữ đích",
                value="Vietnamese",
                choices=LANG_LIST,
            )
            max_length = gr.Slider(
                label="Độ dài tối đa",
                minimum=512,
                maximum=8192,
                value=4000,
                step=8,
            )
            temperature = gr.Slider(
                label="Temperature",
                minimum=0,
                maximum=1,
                value=0.3,
                step=0.1,
            )
            top_p = gr.Slider(
                label="top_p",
                minimum=0.0,
                maximum=1.0,
                step=0.1,
                value=1.0,
            )
            rp = gr.Slider(
                label="Repetition penalty",
                minimum=1.0,
                maximum=2.0,
                step=0.1,
                value=1.2,
            )
            with gr.Accordion("Tùy chọn nâng cao", open=False):
                inst = gr.Textbox(
                    label="Instruction",
                    value="Translate the following sentences from {src_language} to {trg_language}.",
                    lines=3,
                )
                # NOTE(review): this default contains {instruction} and {query}
                # placeholders; confirm that Prompt_template substitutes them
                # rather than appending its own Instruction/Input section.
                prompt = gr.Textbox(
                    label="Prompt",
                    value="""Below is an instruction that describes a task, paired with an input that provides further context.

Write a response that ensuring accuracy and maintaining the tone and style of the original text.

### Instruction:

{instruction}

### Input:

{query}

### Response:""",
                    lines=8,
                )

    with gr.Row():
        submit = gr.Button(value="Submit")
        # Clear the BLEU box together with the text boxes so a stale score is
        # not left on screen after the texts are wiped.
        clear = gr.ClearButton([source_text, output_text, bleu_score_output])
    gr.Markdown(LICENSE)

    # Run the translation on submit; translate() yields (text, BLEU score).
    submit.click(
        fn=translate,
        inputs=[source_text, source_lang, target_lang, inst, prompt, max_length, temperature, top_p, rp],
        outputs=[output_text, bleu_score_output],
    )

# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()