import torch
import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import re
from polyglot.detect import Detector
from nltk.translate.bleu_score import sentence_bleu
# Optional Hugging Face auth token (None when the env var is unset).
HF_TOKEN = os.environ.get("HF_TOKEN", None)
# Instruction-tuned model served by this app, and its base model.
MODEL = "LLaMAX/LLaMAX3-8B-Alpaca"
RELATIVE_MODEL = "LLaMAX/LLaMAX3-8B"
# Page title rendered via gr.Markdown below.
# NOTE(review): the original source had an unterminated one-line string here
# (syntax error). Reconstructed as a Markdown H1 so the CSS `h1` rule applies —
# confirm the intended markup.
TITLE = """
# LLaMAX Translator
"""
# Load the translation model in fp16 with accelerate's automatic device
# placement ("auto" device map); downloads the weights on first run.
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    torch_dtype=torch.float16,
    device_map="auto")
# Matching tokenizer for the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
def lang_detector(text):
    """Best-effort language detection for *text* via polyglot.

    Returns the detected language name, "Input text too short" for inputs
    shorter than 5 characters, or an "ERROR:<message>" string on any failure.
    """
    MIN_CHARS = 5
    if len(text) < MIN_CHARS:
        return "Input text too short"
    try:
        # polyglot's language object stringifies as e.g. "name: English ...";
        # pull the name out of that representation.
        info = str(Detector(text).language)
        return re.search(r"name: (\w+)", info).group(1)
    except Exception as e:
        return f"ERROR:{str(e)}"
def Prompt_template(inst, prompt, query, src_language, trg_language):
    """Build the full Alpaca-style prompt sent to the model.

    inst: instruction template with {src_language}/{trg_language} placeholders.
    prompt: outer prompt template. If it contains {instruction} and {query}
        placeholders (as the UI's default value does) they are filled in;
        otherwise the Instruction/Input/Response sections are appended,
        preserving the previous behavior.
    query: the source text to translate.
    Returns the formatted prompt string.
    """
    inst = inst.format(src_language=src_language, trg_language=trg_language)
    instruction = f"`{inst}`"
    if "{instruction}" in prompt and "{query}" in prompt:
        # Fill the placeholders the template declares. Previously they were
        # left verbatim and the sections below were appended on top, producing
        # duplicated headers and literal "{instruction}"/"{query}" text.
        # str.replace (not str.format) avoids KeyError on any other braces.
        return prompt.replace("{instruction}", instruction).replace("{query}", query)
    return (
        f'{prompt}'
        f'### Instruction:\n{instruction}\n'
        f'### Input:\n{query}\n### Response:'
    )
def chunk_text(text: str = "", max_chars: int = 2000) -> list:
    """Split *text* into consecutive chunks of at most *max_chars* characters.

    Previously an unfinished stub. Returns [] for empty input; concatenating
    the returned chunks reproduces *text* exactly.
    """
    if not text:
        return []
    return [text[i:i + max_chars] for i in range(0, len(text), max_chars)]
# Function to calculate BLEU score
def calculate_bleu_score(candidate: str, references: list) -> float:
    """Compute a smoothed sentence-level BLEU score.

    candidate: hypothesis sentence as a plain string (whitespace-tokenized here).
    references: list of reference sentences, each already tokenized into a
        list of strings (the shape nltk's sentence_bleu expects).
    Returns the BLEU score in [0.0, 1.0].
    """
    # Local import keeps this fix self-contained; nltk is already a file-level
    # dependency via sentence_bleu.
    from nltk.translate.bleu_score import SmoothingFunction

    candidate_tokens = candidate.split()  # tokenize the candidate output
    # method1 smoothing avoids the degenerate 0.0 score (and nltk warning)
    # when a short candidate has no higher-order n-gram overlap.
    return sentence_bleu(references, candidate_tokens,
                         smoothing_function=SmoothingFunction().method1)
@spaces.GPU(duration=60)
def translate(
        source_text: str,
        source_lang: str,
        target_lang: str,
        inst: str,
        prompt: str,
        max_length: int,
        temperature: float,
        top_p: float,
        rp: float):
    """Translate *source_text* from *source_lang* to *target_lang*.

    Generator used as a Gradio event handler: yields a single
    (translation, bleu_score) pair.
    """
    print(f'Text is - {source_text}')
    full_prompt = Prompt_template(inst, prompt, source_text, source_lang, target_lang)
    input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids.to(model.device)
    outputs = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        # Fall back to greedy decoding at temperature 0: the UI slider allows
        # 0, and sampling with temperature=0 raises in transformers.
        do_sample=temperature > 0,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=rp,
    )
    # Decode only the newly generated token ids. Slicing the decoded string by
    # len(prompt) is fragile: decode() does not always reproduce the prompt
    # text byte-for-byte.
    generated = tokenizer.decode(
        outputs[0][input_ids.shape[-1]:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    # NOTE(review): no gold reference is available here, so BLEU is scored
    # against the model's own output and is always ~1.0. Placeholder metric
    # until real references are wired in.
    references = [generated.split()]
    bleu_score = calculate_bleu_score(generated, references)
    yield generated, bleu_score
# Page styling: center the title and hide the Gradio footer.
# (A stray `font-family` declaration outside any selector was removed — it was
# invalid CSS and silently dropped by browsers.)
CSS = """
h1 {
text-align: center;
display: block;
height: 10vh;
align-content: center;
font-family: Arial, Helvetica, sans-serif;
}
footer {
visibility: hidden;
}
"""
# Footer text shown below the UI.
LICENSE = """
Model: LLaMAX3-8B-Alpaca
"""
# Languages offered in the source/target dropdowns. These names are also
# substituted into the instruction template, so spelling matters to the model.
# Fixed typo: 'Akrikaans' -> 'Afrikaans'.
# NOTE(review): 'Northern' looks truncated (perhaps 'Northern Sotho') — left
# unchanged; confirm against the LLaMAX model card.
LANG_LIST = [
    'Afrikaans', 'Amharic', 'Arabic', 'Armenian', 'Assamese', 'Asturian', 'Azerbaijani',
    'Belarusian', 'Bengali', 'Bosnian', 'Bulgarian', 'Burmese',
    'Catalan', 'Cebuano', 'Simplified Chinese', 'Traditional Chinese', 'Croatian', 'Czech',
    'Danish', 'Dutch', 'English', 'Estonian', 'Filipino', 'Finnish', 'French', 'Fulah',
    'Galician', 'Ganda', 'Georgian', 'German', 'Greek', 'Gujarati',
    'Hausa', 'Hebrew', 'Hindi', 'Hungarian',
    'Icelandic', 'Igbo', 'Indonesian', 'Irish', 'Italian',
    'Japanese', 'Javanese',
    'Kabuverdianu', 'Kamba', 'Kannada', 'Kazakh', 'Khmer', 'Korean', 'Kyrgyz',
    'Lao', 'Latvian', 'Lingala', 'Lithuanian', 'Luo', 'Luxembourgish',
    'Macedonian', 'Malay', 'Malayalam', 'Maltese', 'Maori', 'Marathi', 'Mongolian',
    'Nepali', 'Northern', 'Norwegian', 'Nyanja',
    'Occitan', 'Oriya', 'Oromo',
    'Pashto', 'Persian', 'Polish', 'Portuguese', 'Punjabi',
    'Romanian', 'Russian',
    'Serbian', 'Shona', 'Sindhi', 'Slovak', 'Slovenian', 'Somali', 'Sorani', 'Spanish', 'Swahili', 'Swedish',
    'Tajik', 'Tamil', 'Telugu', 'Thai', 'Turkish',
    'Ukrainian', 'Umbundu', 'Urdu', 'Uzbek',
    'Vietnamese', 'Welsh', 'Wolof', 'Xhosa', 'Yoruba', 'Zulu',
]
# Pre-built chat component; not placed in the layout below (currently unused).
chatbot = gr.Chatbot(height=600)

# UI definition. Component creation order inside the context managers
# determines the on-screen layout.
with gr.Blocks(theme="soft", css=CSS) as demo:
    gr.Markdown(TITLE)
    with gr.Row():
        with gr.Column(scale=4):
            # Input textbox (label is Vietnamese for "Original text").
            source_text = gr.Textbox(
                label="Văn bản gốc",
                value="LLaMAX is a language model with powerful multilingual capabilities without loss instruction-following capabilities. "+\
                "LLaMAX supports translation between more than 100 languages, "+\
                "surpassing the performance of similarly scaled LLMs.",
                lines=10,
            )
            # Output textbox (label is Vietnamese for "Translated text").
            output_text = gr.Textbox(
                label="Văn bản đã được dịch",
                lines=10,
                show_copy_button=True,
            )
            # Read-only holder area for the BLEU score yielded by translate().
            bleu_score_output = gr.Textbox(
                label="BLEU Score",
                lines=10,
                interactive=False,
            )
        with gr.Column(scale=1):
            # Source language dropdown (label: "Source language").
            source_lang = gr.Dropdown(
                label="Ngôn ngữ nguồn",
                value="English",
                choices=LANG_LIST,
            )
            # Target language dropdown (label: "Target language").
            target_lang = gr.Dropdown(
                label="Ngôn ngữ đích",
                value="Vietnamese",
                choices=LANG_LIST,
            )
            # Generation length cap (label: "Maximum length"), passed to
            # model.generate as max_length.
            max_length = gr.Slider(
                label="Độ dài tối đa",
                minimum=512,
                maximum=8192,
                value=4000,
                step=8,
            )
            temperature = gr.Slider(
                label="Temperature",
                minimum=0,
                maximum=1,
                value=0.3,
                step=0.1,
            )
            top_p = gr.Slider(
                label="top_p",
                minimum=0.0,
                maximum=1.0,
                step=0.1,
                value=1.0,
            )
            rp = gr.Slider(
                label="Repetition penalty",
                minimum=1.0,
                maximum=2.0,
                step=0.1,
                value=1.2,
            )
    # Advanced options (label: "Advanced options"): editable prompt templates.
    with gr.Accordion("Tùy chọn nâng cao", open=False):
        # Instruction template; {src_language}/{trg_language} are filled in
        # by Prompt_template().
        inst = gr.Textbox(
            label="Instruction",
            value="Translate the following sentences from {src_language} to {trg_language}.",
            lines=3,
        )
        prompt = gr.Textbox(
            label="Prompt",
            # Prompt 1 (earlier variant, kept for reference):
            #value="""Below is an instruction that describes a task, paired with an input that provides further context.
            #Write a response that appropriately completes the request.
            ### Instruction:
            #{instruction}
            ### Input:
            #{query}
            ### Response:""",#
            # Prompt 2 (current default):
            value="""Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that ensuring accuracy and maintaining the tone and style of the original text.
### Instruction:
{instruction}
### Input:
{query}
### Response:""",
            lines=8,
        )
    with gr.Row():
        submit = gr.Button(value="Submit")
        # Clears both the input and the translated output.
        clear = gr.ClearButton([source_text, output_text])
    gr.Markdown(LICENSE)
    # Auto language detection on input change — currently disabled:
    #source_text.change(lang_detector, source_text, source_lang)
    # Earlier wiring without the BLEU output, kept for reference:
    #submit.click(fn=translate, inputs=[source_text, source_lang, target_lang, inst, prompt, max_length, temperature, top_p, rp], outputs=[output_text])
    submit.click(fn=translate, inputs=[source_text, source_lang, target_lang, inst, prompt, max_length, temperature, top_p, rp], outputs=[output_text, bleu_score_output])

if __name__ == "__main__":
    demo.launch()