import torch import gradio as gr import spaces from transformers import AutoModelForCausalLM, AutoTokenizer import os import re from polyglot.detect import Detector from nltk.translate.bleu_score import sentence_bleu HF_TOKEN = os.environ.get("HF_TOKEN", None) MODEL = "LLaMAX/LLaMAX2-7B-Alpaca" RELATIVE_MODEL="LLaMAX/LLaMAX2-7B" TITLE = "

LLaMAX Translator

" DESCRIPTION = """
Do tài chính có hạn nên dự án đang chỉ dùng CPU để xử lý yêu cầu. Để xử lý với tốc độ nhanh hơn thông qua GPU, vui lòng truy cập vào notebook Kaggle sau LLaMAX3 Translator
""" model = AutoModelForCausalLM.from_pretrained( MODEL, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL) def lang_detector(text): min_chars = 5 if len(text) < min_chars: return "Input text too short" try: detector = Detector(text).language lang_info = str(detector) code = re.search(r"name: (\w+)", lang_info).group(1) return code except Exception as e: return f"ERROR:{str(e)}" def Prompt_template(inst, prompt, query, src_language, trg_language): inst = inst.format(src_language=src_language, trg_language=trg_language) instruction = f"`{inst}`" prompt = ( f'{prompt}' f'### Instruction:\n{instruction}\n' f'### Input:\n{query}\n### Response:' ) return prompt # Unfinished def chunk_text(): pass # Function to calculate BLEU score def calculate_bleu_score(candidate: str, references: list): candidate_tokens = candidate.split() bleu_score = sentence_bleu(references, candidate_tokens) return bleu_score @spaces.GPU(duration=60) def translate( source_text: str, source_lang: str, target_lang: str, inst: str, prompt: str, max_length: int, temperature: float, top_p: float, rp: float): print(f'Text is - {source_text}') prompt = Prompt_template(inst, prompt, source_text, source_lang, target_lang) input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device) generate_kwargs = dict( input_ids=input_ids, max_length=max_length, do_sample=True, temperature=temperature, top_p=top_p, repetition_penalty=rp, ) outputs = model.generate(**generate_kwargs) resp = tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=False) # Calculate BLEU score references = [resp[len(prompt):].split()] bleu_score = calculate_bleu_score(resp[len(prompt):], references) yield resp[len(prompt):], bleu_score CSS = """ h1 { text-align: center; display: block; height: 10vh; align-content: center; font-family: Arial, Helvetica, sans-serif; } footer { visibility: hidden; } font-family: Arial, Helvetica, sans-serif; """ LICENSE = """ Model: LLaMAX2-7B-Alpaca """ LANG_LIST = ['Akrikaans', 'Amharic', 'Arabic', 'Armenian', 'Assamese', 'Asturian', 'Azerbaijani', \ 'Belarusian', 'Bengali', 'Bosnian', 'Bulgarian', 'Burmese', \ 'Catalan', 'Cebuano', 'Simplified Chinese', 'Traditional Chinese', 'Croatian', 'Czech', \ 'Danish', 'Dutch', 'English', 'Estonian', 'Filipino', 'Finnish', 'French', 'Fulah', \ 'Galician', 'Ganda', 'Georgian', 'German', 'Greek', 'Gujarati', \ 'Hausa', 'Hebrew', 'Hindi', 'Hungarian', \ 'Icelandic', 'Igbo', 'Indonesian', 'Irish', 'Italian', \ 'Japanese', 'Javanese', \ 'Kabuverdianu', 'Kamba', 'Kannada', 'Kazakh', 'Khmer', 'Korean', 'Kyrgyz', \ 'Lao', 'Latvian', 'Lingala', 'Lithuanian', 'Luo', 'Luxembourgish', \ 'Macedonian', 'Malay', 'Malayalam', 'Maltese', 'Maori', 'Marathi', 'Mongolian', \ 'Nepali', 'Northern', 'Norwegian', 'Nyanja', \ 'Occitan', 'Oriya', 'Oromo', \ 'Pashto', 'Persian', 'Polish', 'Portuguese', 'Punjabi', \ 'Romanian', 'Russian', \ 'Serbian', 'Shona', 'Sindhi', 'Slovak', 'Slovenian', 'Somali', 'Sorani', 'Spanish', 'Swahili', 'Swedish', \ 'Tajik', 'Tamil', 'Telugu', 'Thai', 'Turkish', \ 'Ukrainian', 'Umbundu', 'Urdu', 'Uzbek', \ 'Vietnamese', 'Welsh', 'Wolof', 'Xhosa', 'Yoruba', 'Zulu'] chatbot = gr.Chatbot(height=600) with gr.Blocks(theme="soft", css=CSS) as demo: gr.Markdown(TITLE) gr.Markdown(DESCRIPTION) with gr.Row(): with gr.Column(scale=4): source_text = gr.Textbox( label="Văn bản gốc", value="Hello", lines=10, ) output_text = gr.Textbox( label="Văn bản đã được dịch", lines=10, show_copy_button=True, ) bleu_score_output = gr.Textbox( label="BLEU Score", lines=10, interactive=False, ) with gr.Column(scale=1): source_lang = gr.Dropdown( label="Ngôn ngữ nguồn", value="English", choices=LANG_LIST, ) target_lang = gr.Dropdown( label="Ngôn ngữ đích", value="Vietnamese", choices=LANG_LIST, ) max_length = gr.Slider( label="Độ dài tối đa", minimum=512, maximum=8192, value=4000, step=8, ) temperature = gr.Slider( label="Độ sáng tạo", minimum=0, maximum=1, value=0.3, step=0.1, ) top_p = gr.Slider( label="top_p", minimum=0.0, maximum=1.0, step=0.1, value=1.0, ) rp = gr.Slider( label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.1, value=1.2, ) with gr.Accordion("Tùy chọn nâng cao", open=False): inst = gr.Textbox( label="Instruction", value="Translate the following sentences from {src_language} to {trg_language}.", lines=3, ) prompt = gr.Textbox( label="Prompt", # Prompt 1 #value="""Below is an instruction that describes a task, paired with an input that provides further context. #Write a response that appropriately completes the request. ### Instruction: #{instruction} ### Input: #{query} ### Response:""",# # Prompt 2 value="""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that ensuring accuracy and maintaining the tone and style of the original text. ### Instruction: {instruction} ### Input: {query} ### Response:""", lines=8, ) with gr.Row(): submit = gr.Button(value="Submit") clear = gr.ClearButton([source_text, output_text]) gr.Markdown(LICENSE) #source_text.change(lang_detector, source_text, source_lang) submit.click(fn=translate, inputs=[source_text, source_lang, target_lang, inst, prompt, max_length, temperature, top_p, rp], outputs=[output_text, bleu_score_output]) if __name__ == "__main__": demo.launch()