"""Gradio demo that translates text with the LLaMAX3-8B-Alpaca model.

The app loads the model once at import time, builds an Alpaca-style prompt
from the UI fields, generates a translation on a GPU worker and shows it
together with a BLEU value.
"""

import os
import re

import torch
import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer
from polyglot.detect import Detector
from nltk.translate.bleu_score import sentence_bleu

HF_TOKEN = os.environ.get("HF_TOKEN", None)
MODEL = "LLaMAX/LLaMAX3-8B-Alpaca"
RELATIVE_MODEL = "LLaMAX/LLaMAX3-8B"

# NOTE(review): the heading markup was missing from the reviewed source (the
# literal was split across blank lines and did not parse). It is rebuilt as an
# <h1> because the stylesheet below styles `h1` - confirm the intended markup.
TITLE = "<h1>LLaMAX Translator</h1>"

# Loaded once at import time; device_map="auto" lets the library place the
# half-precision weights on the available device(s).
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL)


def lang_detector(text):
    """Return the language name polyglot detects for *text*.

    Inputs shorter than five characters yield the string
    ``"Input text too short"``. Any failure during detection is reported as
    an ``"ERROR:<message>"`` string instead of being raised, so the result
    can always be written into a UI field.
    """
    min_chars = 5
    if len(text) < min_chars:
        return "Input text too short"

    try:
        language = Detector(text).language
        # The textual form of the result contains "name: <Language>"; only
        # the first word after "name: " is extracted.
        return re.search(r"name: (\w+)", str(language)).group(1)
    except Exception as e:
        # Deliberately broad: detection is best-effort and must never crash
        # the UI callback it is meant for.
        return f"ERROR:{str(e)}"


def Prompt_template(inst, prompt, query, src_language, trg_language):
    """Build the full model prompt for one translation request.

    Args:
        inst: Instruction text with ``{src_language}`` / ``{trg_language}``
            placeholders.
        prompt: Prompt preamble. When it contains both ``{instruction}`` and
            ``{query}`` it is treated as a complete template and the
            placeholders are filled in. Otherwise it is used as a plain
            prefix and the Instruction/Input/Response sections are appended.
        query: The text to translate.
        src_language: Name of the source language.
        trg_language: Name of the target language.

    Returns:
        The prompt string ending in ``### Response:``, provided the template
        itself ends that way.
    """
    inst = inst.format(src_language=src_language, trg_language=trg_language)
    instruction = f"`{inst}`"

    if "{instruction}" in prompt and "{query}" in prompt:
        # Fill the template in a single pass so that braces inside the user's
        # text are never re-interpreted as placeholders. Previously the
        # template was sent to the model with the literal placeholders still
        # in it, followed by a second copy of the sections.
        values = {"instruction": instruction, "query": query}
        return re.sub(
            r"\{(instruction|query)\}",
            lambda match: values[match.group(1)],
            prompt,
        )

    return (
        f'{prompt}'
        f'### Instruction:\n{instruction}\n'
        f'### Input:\n{query}\n### Response:'
    )


# Unfinished
def chunk_text():
    """Placeholder for splitting long inputs; not implemented yet."""
    pass


def calculate_bleu_score(candidate: str, references: list):
    """Return the sentence-level BLEU score of *candidate*.

    Args:
        candidate: Generated text; it is tokenised by whitespace splitting.
        references: Reference translations, each already tokenised into a
            list of tokens.

    Returns:
        The value computed by ``nltk``'s ``sentence_bleu``.
    """
    candidate_tokens = candidate.split()
    return sentence_bleu(references, candidate_tokens)


@spaces.GPU(duration=60)
def translate(
    source_text: str,
    source_lang: str,
    target_lang: str,
    inst: str,
    prompt: str,
    max_length: int,
    temperature: float,
    top_p: float,
    rp: float,
):
    """Translate *source_text* and yield ``(translation, bleu_score)``.

    Args:
        source_text: Text to translate.
        source_lang: Source language name inserted into the instruction.
        target_lang: Target language name inserted into the instruction.
        inst: Instruction template (see ``Prompt_template``).
        prompt: Prompt preamble or template (see ``Prompt_template``).
        max_length: Passed to ``generate`` as ``max_length``, so it bounds
            prompt and generated tokens together.
        temperature: Sampling temperature; ``0`` selects greedy decoding.
        top_p: Nucleus sampling threshold, used only when sampling.
        rp: Repetition penalty.

    Yields:
        A single ``(translation, bleu_score)`` pair.
    """
    print(f'Text is - {source_text}')

    prompt = Prompt_template(inst, prompt, source_text, source_lang, target_lang)

    # Keep the attention mask together with the ids so `generate` does not
    # have to guess it.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_token_count = inputs["input_ids"].shape[-1]

    generate_kwargs = dict(
        max_length=max_length,
        repetition_penalty=rp,
    )
    if temperature > 0:
        generate_kwargs.update(
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
        )
    else:
        # The temperature slider allows 0, which is not a valid sampling
        # temperature; fall back to deterministic decoding instead of failing.
        generate_kwargs.update(do_sample=False)

    outputs = model.generate(**inputs, **generate_kwargs)

    # Decode only the newly generated tokens. Cutting the decoded string at
    # len(prompt) relies on decoding reproducing the prompt character for
    # character.
    translation = tokenizer.decode(
        outputs[0][prompt_token_count:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    # NOTE(review): no human reference translation is available in the UI, so
    # the translation is scored against itself. The value therefore says
    # nothing about quality; supply real references here to make it useful.
    references = [translation.split()]
    bleu_score = calculate_bleu_score(translation, references)

    yield translation, bleu_score


# NOTE(review): the final font-family declaration sits outside any rule; it is
# kept as found - confirm which selector it was meant for.
CSS = """
    h1 {
        text-align: center;
        display: block;
        height: 10vh;
        align-content: center;
        font-family: Arial, Helvetica, sans-serif;
    }
    footer {
        visibility: hidden;
    }
    font-family: Arial, Helvetica, sans-serif;
"""

LICENSE = """
Model: LLaMAX3-8B-Alpaca
"""

# Language names offered in the dropdowns; the selected name is inserted
# verbatim into the instruction sent to the model.
# NOTE(review): 'Northern' looks truncated (presumably "Northern Sotho") -
# confirm against the model's supported-language list.
LANG_LIST = [
    'Afrikaans', 'Amharic', 'Arabic', 'Armenian', 'Assamese', 'Asturian', 'Azerbaijani',
    'Belarusian', 'Bengali', 'Bosnian', 'Bulgarian', 'Burmese',
    'Catalan', 'Cebuano', 'Simplified Chinese', 'Traditional Chinese', 'Croatian', 'Czech',
    'Danish', 'Dutch', 'English', 'Estonian', 'Filipino', 'Finnish', 'French', 'Fulah',
    'Galician', 'Ganda', 'Georgian', 'German', 'Greek', 'Gujarati',
    'Hausa', 'Hebrew', 'Hindi', 'Hungarian',
    'Icelandic', 'Igbo', 'Indonesian', 'Irish', 'Italian',
    'Japanese', 'Javanese',
    'Kabuverdianu', 'Kamba', 'Kannada', 'Kazakh', 'Khmer', 'Korean', 'Kyrgyz',
    'Lao', 'Latvian', 'Lingala', 'Lithuanian', 'Luo', 'Luxembourgish',
    'Macedonian', 'Malay', 'Malayalam', 'Maltese', 'Maori', 'Marathi', 'Mongolian',
    'Nepali', 'Northern', 'Norwegian', 'Nyanja',
    'Occitan', 'Oriya', 'Oromo',
    'Pashto', 'Persian', 'Polish', 'Portuguese', 'Punjabi',
    'Romanian', 'Russian',
    'Serbian', 'Shona', 'Sindhi', 'Slovak', 'Slovenian', 'Somali', 'Sorani', 'Spanish', 'Swahili', 'Swedish',
    'Tajik', 'Tamil', 'Telugu', 'Thai', 'Turkish',
    'Ukrainian', 'Umbundu', 'Urdu', 'Uzbek',
    'Vietnamese', 'Welsh', 'Wolof', 'Xhosa', 'Yoruba', 'Zulu',
]

# NOTE(review): not referenced anywhere in this file; kept in case other code
# imports it.
chatbot = gr.Chatbot(height=600)

# NOTE(review): the nesting of the layout containers below was reconstructed
# from the statement order - confirm it matches the intended layout.
with gr.Blocks(theme="soft", css=CSS) as demo:
    gr.Markdown(TITLE)
    with gr.Row():
        with gr.Column(scale=4):
            source_text = gr.Textbox(
                label="Văn bản gốc",
                value=(
                    "LLaMAX is a language model with powerful multilingual capabilities without loss instruction-following capabilities. "
                    "LLaMAX supports translation between more than 100 languages, "
                    "surpassing the performance of similarly scaled LLMs."
                ),
                lines=10,
            )
            output_text = gr.Textbox(
                label="Văn bản đã được dịch",
                lines=10,
                show_copy_button=True,
            )
            # Read-only field that shows the BLEU value returned by translate().
            bleu_score_output = gr.Textbox(
                label="BLEU Score",
                lines=10,
                interactive=False,
            )

        with gr.Column(scale=1):
            source_lang = gr.Dropdown(
                label="Ngôn ngữ nguồn",
                value="English",
                choices=LANG_LIST,
            )
            target_lang = gr.Dropdown(
                label="Ngôn ngữ đích",
                value="Vietnamese",
                choices=LANG_LIST,
            )
            max_length = gr.Slider(
                label="Độ dài tối đa",
                minimum=512,
                maximum=8192,
                value=4000,
                step=8,
            )
            temperature = gr.Slider(
                label="Temperature",
                minimum=0,
                maximum=1,
                value=0.3,
                step=0.1,
            )
            top_p = gr.Slider(
                label="top_p",
                minimum=0.0,
                maximum=1.0,
                step=0.1,
                value=1.0,
            )
            rp = gr.Slider(
                label="Repetition penalty",
                minimum=1.0,
                maximum=2.0,
                step=0.1,
                value=1.2,
            )
            with gr.Accordion("Tùy chọn nâng cao", open=False):
                inst = gr.Textbox(
                    label="Instruction",
                    value="Translate the following sentences from {src_language} to {trg_language}.",
                    lines=3,
                )
                # The {instruction} and {query} placeholders are filled in by
                # Prompt_template().
                # NOTE(review): line breaks inside this template were
                # reconstructed - confirm against the intended prompt format.
                prompt = gr.Textbox(
                    label="Prompt",
                    value=(
                        "Below is an instruction that describes a task, paired with an input that provides further context. "
                        "Write a response that ensuring accuracy and maintaining the tone and style of the original text. \n"
                        "### Instruction:\n"
                        "{instruction}\n"
                        "### Input:\n"
                        "{query}\n"
                        "### Response:"
                    ),
                    lines=8,
                )

    with gr.Row():
        submit = gr.Button(value="Submit")
        # Also reset the BLEU field so a stale score is not left next to an
        # empty translation.
        clear = gr.ClearButton([source_text, output_text, bleu_score_output])
    gr.Markdown(LICENSE)

    # Automatic source-language detection is currently switched off. To
    # enable it, wire lang_detector to source_text.change with source_lang as
    # the output.
    submit.click(
        fn=translate,
        inputs=[source_text, source_lang, target_lang, inst, prompt, max_length, temperature, top_p, rp],
        outputs=[output_text, bleu_score_output],
    )

if __name__ == "__main__":
    demo.launch()