import torch
import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import re
from polyglot.detect import Detector
from nltk.translate.bleu_score import sentence_bleu
# Optional Hugging Face token taken from the environment (None when unset).
HF_TOKEN = os.getenv("HF_TOKEN")
# Checkpoint name handed to the model and tokenizer loaders.
MODEL = "LLaMAX/LLaMAX2-7B-Alpaca"
# Second checkpoint name; not referenced elsewhere in the visible code.
RELATIVE_MODEL = "LLaMAX/LLaMAX2-7B"
# Page heading rendered through gr.Markdown.
# Triple-quoted because the text spans several lines; a single-quoted literal
# cannot contain raw line breaks.
# NOTE(review): the stylesheet targets `h1`, so this may have been meant to be
# wrapped in a heading tag — confirm against the deployed app.
TITLE = """
LLaMAX Translator
"""
# Banner text shown under the title (Vietnamese). English gloss: "Because funding
# is limited, the project currently handles requests on CPU only. For faster,
# GPU-backed processing, please visit the following Kaggle notebook:
# LLaMAX3 Translator."
DESCRIPTION = (
    "Do tài chính có hạn nên dự án đang chỉ dùng CPU để xử lý yêu cầu. "
    "Để xử lý với tốc độ nhanh hơn thông qua GPU,\n"
    "vui lòng truy cập vào notebook Kaggle sau LLaMAX3 Translator"
)
# Load the causal language model as float16 with device_map="auto", then the
# tokenizer for the same checkpoint. Both calls run when the module is imported.
_load_options = {"torch_dtype": torch.float16, "device_map": "auto"}
model = AutoModelForCausalLM.from_pretrained(MODEL, **_load_options)
tokenizer = AutoTokenizer.from_pretrained(MODEL)
def lang_detector(text):
    """Guess the language name of ``text`` with polyglot's ``Detector``.

    Returns:
        The first word that follows ``name: `` in the string form of the
        detected language; ``"Input text too short"`` when ``text`` has fewer
        than five characters; or ``"ERROR:<message>"`` when anything raises.
    """
    shortest_allowed = 5
    if len(text) < shortest_allowed:
        return "Input text too short"

    try:
        description = str(Detector(text).language)
        # The name is parsed out of the textual description of the result.
        return re.search(r"name: (\w+)", description).group(1)
    except Exception as err:
        # Deliberately broad: every failure, including a regex that does not
        # match, is reported back as text instead of raising.
        return f"ERROR:{err}"
def Prompt_template(inst, prompt, query, src_language, trg_language):
    """Build the instruction-style prompt that is sent to the model.

    Args:
        inst: Instruction text. ``{src_language}`` and ``{trg_language}`` are
            filled in with ``str.format``.
        prompt: Either a full template containing both the ``{instruction}``
            and ``{query}`` placeholders, or a plain preamble without them.
        query: The text to translate.
        src_language: Source language name.
        trg_language: Target language name.

    Returns:
        When ``prompt`` contains both placeholders, the template with the
        back-quoted instruction and the query substituted in place. Otherwise
        ``prompt`` followed by ``### Instruction:``, ``### Input:`` and
        ``### Response:`` sections.

    Raises:
        KeyError, IndexError, ValueError: Propagated from ``str.format`` when
            ``inst`` contains other or malformed replacement fields.
    """
    inst = inst.format(src_language=src_language, trg_language=trg_language)
    instruction = f"`{inst}`"

    has_placeholders = (
        isinstance(prompt, str)
        and "{instruction}" in prompt
        and "{query}" in prompt
    )
    if has_placeholders:
        fields = {"instruction": instruction, "query": str(query)}
        # Single-pass substitution with a function replacement: inserted text
        # is never rescanned (a query that itself contains "{instruction}"
        # stays literal), backslashes in the query are not interpreted, and
        # any other braces in the template are left untouched.
        return re.sub(
            r"\{(instruction|query)\}",
            lambda match: fields[match.group(1)],
            prompt,
        )

    # Plain preamble: append the sections after it.
    return (
        f'{prompt}'
        f'### Instruction:\n{instruction}\n'
        f'### Input:\n{query}\n### Response:'
    )
# Placeholder: not implemented yet.
def chunk_text():
    """Do nothing and return ``None`` (unimplemented stub)."""
    return None
# BLEU scoring helper
def calculate_bleu_score(candidate: str, references: list):
    """Return the sentence-level BLEU of ``candidate`` against ``references``.

    ``candidate`` is split on whitespace into tokens. ``references`` is passed
    through unchanged as the first argument of ``sentence_bleu``.
    """
    hypothesis = candidate.split()
    return sentence_bleu(references, hypothesis)
@spaces.GPU(duration=60)
def translate(
    source_text: str,
    source_lang: str,
    target_lang: str,
    inst: str,
    prompt: str,
    max_length: int,
    temperature: float,
    top_p: float,
    rp: float,
):
    """Translate ``source_text`` with the loaded model and score the output.

    Args:
        source_text: Text to translate.
        source_lang: Source language name inserted into the instruction.
        target_lang: Target language name inserted into the instruction.
        inst: Instruction template forwarded to ``Prompt_template``.
        prompt: Prompt template or preamble forwarded to ``Prompt_template``.
        max_length: Forwarded to ``generate`` as ``max_length``.
        temperature: Sampling temperature; values ``<= 0`` select greedy
            decoding instead of sampling.
        top_p: Nucleus sampling threshold (only used when sampling).
        rp: Repetition penalty.

    Yields:
        A single ``(translation, bleu_score)`` tuple.
    """
    print(f'Text is - {source_text}')

    prompt = Prompt_template(inst, prompt, source_text, source_lang, target_lang)
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_ids = encoded.input_ids
    prompt_token_count = input_ids.shape[-1]

    generate_kwargs = dict(
        input_ids=input_ids,
        attention_mask=encoded.get("attention_mask"),
        max_length=max_length,
        repetition_penalty=rp,
    )
    if temperature > 0:
        generate_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        # The UI slider allows 0, which sampling rejects (temperature must be
        # strictly positive); decode greedily in that case.
        generate_kwargs.update(do_sample=False)

    outputs = model.generate(**generate_kwargs)

    # Drop the prompt by token count rather than by character count of the
    # decoded string: decoding is not guaranteed to reproduce the prompt text
    # exactly, which would shift a string-based slice.
    translation = tokenizer.decode(
        outputs[0][prompt_token_count:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    # Calculate BLEU score.
    # NOTE(review): no reference translation is passed into this function, so
    # the output is scored against itself; the value says nothing about
    # translation quality until a real reference is supplied.
    references = [translation.split()]
    bleu_score = calculate_bleu_score(translation, references)

    yield translation, bleu_score
# Custom stylesheet passed to gr.Blocks(css=CSS): styles the <h1> heading
# (centered, Arial/Helvetica) and hides the footer element.
# NOTE(review): the last `font-family` declaration sits outside any rule block,
# so it presumably has no effect — confirm which selector it was meant for.
CSS = """
h1 {
text-align: center;
display: block;
height: 10vh;
align-content: center;
font-family: Arial, Helvetica, sans-serif;
}
footer {
visibility: hidden;
}
font-family: Arial, Helvetica, sans-serif;
"""
# Short model attribution rendered with gr.Markdown at the end of the page.
LICENSE = "\nModel: LLaMAX2-7B-Alpaca\n"
# Language names offered in the source/target dropdowns. The selected name is
# inserted verbatim into the instruction text, so spelling matters.
# NOTE(review): 'Northern' looks truncated (possibly 'Northern Sotho') — confirm
# against the model's supported-language list before changing it.
LANG_LIST = [
    'Afrikaans', 'Amharic', 'Arabic', 'Armenian', 'Assamese', 'Asturian', 'Azerbaijani',
    'Belarusian', 'Bengali', 'Bosnian', 'Bulgarian', 'Burmese',
    'Catalan', 'Cebuano', 'Simplified Chinese', 'Traditional Chinese', 'Croatian', 'Czech',
    'Danish', 'Dutch', 'English', 'Estonian', 'Filipino', 'Finnish', 'French', 'Fulah',
    'Galician', 'Ganda', 'Georgian', 'German', 'Greek', 'Gujarati',
    'Hausa', 'Hebrew', 'Hindi', 'Hungarian',
    'Icelandic', 'Igbo', 'Indonesian', 'Irish', 'Italian',
    'Japanese', 'Javanese',
    'Kabuverdianu', 'Kamba', 'Kannada', 'Kazakh', 'Khmer', 'Korean', 'Kyrgyz',
    'Lao', 'Latvian', 'Lingala', 'Lithuanian', 'Luo', 'Luxembourgish',
    'Macedonian', 'Malay', 'Malayalam', 'Maltese', 'Maori', 'Marathi', 'Mongolian',
    'Nepali', 'Northern', 'Norwegian', 'Nyanja',
    'Occitan', 'Oriya', 'Oromo',
    'Pashto', 'Persian', 'Polish', 'Portuguese', 'Punjabi',
    'Romanian', 'Russian',
    'Serbian', 'Shona', 'Sindhi', 'Slovak', 'Slovenian', 'Somali', 'Sorani', 'Spanish', 'Swahili', 'Swedish',
    'Tajik', 'Tamil', 'Telugu', 'Thai', 'Turkish',
    'Ukrainian', 'Umbundu', 'Urdu', 'Uzbek',
    'Vietnamese', 'Welsh', 'Wolof', 'Xhosa', 'Yoruba', 'Zulu',
]
# NOTE(review): this Chatbot is created outside the Blocks context and is not
# referenced anywhere else in the visible code — confirm whether it is needed.
chatbot = gr.Chatbot(height=600)

# Page layout: a wide column with the text boxes next to a narrow column with
# the language pickers, generation sliders and advanced prompt options, then a
# row with the Submit/Clear buttons.
# NOTE(review): the nesting of the `with` blocks below is reconstructed from
# the widget order — confirm it matches the intended layout.
with gr.Blocks(theme="soft", css=CSS) as demo:
    gr.Markdown(TITLE)
    gr.Markdown(DESCRIPTION)
    with gr.Row():
        with gr.Column(scale=4):
            source_text = gr.Textbox(
                label="Văn bản gốc",
                value="Hello",
                lines=10,
            )
            output_text = gr.Textbox(
                label="Văn bản đã được dịch",
                lines=10,
                show_copy_button=True,
            )
            bleu_score_output = gr.Textbox(
                label="BLEU Score",
                lines=10,
                interactive=False,
            )
        with gr.Column(scale=1):
            source_lang = gr.Dropdown(
                label="Ngôn ngữ nguồn",
                value="English",
                choices=LANG_LIST,
            )
            target_lang = gr.Dropdown(
                label="Ngôn ngữ đích",
                value="Vietnamese",
                choices=LANG_LIST,
            )
            max_length = gr.Slider(
                label="Độ dài tối đa",
                minimum=512,
                maximum=8192,
                value=4000,
                step=8,
            )
            temperature = gr.Slider(
                label="Độ sáng tạo",
                minimum=0,
                maximum=1,
                value=0.3,
                step=0.1,
            )
            top_p = gr.Slider(
                label="top_p",
                minimum=0.0,
                maximum=1.0,
                step=0.1,
                value=1.0,
            )
            rp = gr.Slider(
                label="Repetition penalty",
                minimum=1.0,
                maximum=2.0,
                step=0.1,
                value=1.2,
            )
            with gr.Accordion("Tùy chọn nâng cao", open=False):
                inst = gr.Textbox(
                    label="Instruction",
                    value="Translate the following sentences from {src_language} to {trg_language}.",
                    lines=3,
                )
                prompt = gr.Textbox(
                    label="Prompt",
                    # Prompt 1 (alternative wording, kept for reference):
                    #   Below is an instruction that describes a task, paired with
                    #   an input that provides further context.
                    #   Write a response that appropriately completes the request.
                    #   ### Instruction:
                    #   {instruction}
                    #   ### Input:
                    #   {query}
                    #   ### Response:
                    # Prompt 2 (active default):
                    value="""Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that ensuring accuracy and maintaining the tone and style of the original text.
### Instruction:
{instruction}
### Input:
{query}
### Response:""",
                    lines=8,
                )
    with gr.Row():
        submit = gr.Button(value="Submit")
        # Clear also resets the BLEU box so a stale score is not left behind
        # next to an empty translation.
        clear = gr.ClearButton([source_text, output_text, bleu_score_output])
    gr.Markdown(LICENSE)

    # Automatic source-language detection is currently disabled:
    # source_text.change(lang_detector, source_text, source_lang)
    submit.click(
        fn=translate,
        inputs=[source_text, source_lang, target_lang, inst, prompt, max_length, temperature, top_p, rp],
        outputs=[output_text, bleu_score_output],
    )

if __name__ == "__main__":
    demo.launch()