Spaces:

naofunyannn
/

LLaMAX_Translator

Sleeping

App Files Files Community

LLaMAX_Translator / app.py

naofunyannn

Upload 5 files

cc69e73 verified about 2 months ago

raw

history blame

8.67 kB

	import torch
	import gradio as gr
	import spaces
	from transformers import AutoModelForCausalLM, AutoTokenizer
	import os
	import re
	from polyglot.detect import Detector
	from nltk.translate.bleu_score import sentence_bleu

	HF_TOKEN = os.environ.get("HF_TOKEN", None)
	MODEL = "LLaMAX/LLaMAX3-8B-Alpaca"
	RELATIVE_MODEL="LLaMAX/LLaMAX3-8B"

	TITLE = "<h1><center>LLaMAX Translator</center></h1>"


	model = AutoModelForCausalLM.from_pretrained(
	MODEL,
	torch_dtype=torch.float16,
	device_map="auto")
	tokenizer = AutoTokenizer.from_pretrained(MODEL)


	def lang_detector(text):
	min_chars = 5
	if len(text) < min_chars:
	return "Input text too short"
	try:
	detector = Detector(text).language
	lang_info = str(detector)
	code = re.search(r"name: (\w+)", lang_info).group(1)
	return code
	except Exception as e:
	return f"ERROR：{str(e)}"

	def Prompt_template(inst, prompt, query, src_language, trg_language):
	inst = inst.format(src_language=src_language, trg_language=trg_language)
	instruction = f"`{inst}`"
	prompt = (
	f'{prompt}'
	f'### Instruction:\n{instruction}\n'
	f'### Input:\n{query}\n### Response:'
	)
	return prompt

	# Unfinished
	def chunk_text():
	pass

	# Function to calculate BLEU score
	def calculate_bleu_score(candidate: str, references: list):
	candidate_tokens = candidate.split() # Tokenizing the candidate output
	bleu_score = sentence_bleu(references, candidate_tokens) # Calculating BLEU score
	return bleu_score

	@spaces.GPU(duration=60)
	def translate(
	source_text: str,
	source_lang: str,
	target_lang: str,
	inst: str,
	prompt: str,
	max_length: int,
	temperature: float,
	top_p: float,
	rp: float):

	print(f'Text is - {source_text}')

	prompt = Prompt_template(inst, prompt, source_text, source_lang, target_lang)
	input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

	generate_kwargs = dict(
	input_ids=input_ids,
	max_length=max_length,
	do_sample=True,
	temperature=temperature,
	top_p=top_p,
	repetition_penalty=rp,
	)

	outputs = model.generate(**generate_kwargs)

	resp = tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)

	#yield resp[len(prompt):]
	# Calculate BLEU score
	'''
	references = [
	'this is a dog'.split(),
	'it is dog'.split(),
	'dog it is'.split(),
	'a dog, it is'.split()
	]
	bleu_score = calculate_bleu_score(resp[len(prompt):], references) # Calculate BLEU score
	'''
	references = [resp[len(prompt):].split()] # Use the generated response as the reference
	bleu_score = calculate_bleu_score(resp[len(prompt):], references) # Calculate BLEU score

	yield resp[len(prompt):], bleu_score

	CSS = """
	h1 {
	text-align: center;
	display: block;
	height: 10vh;
	align-content: center;
	font-family: Arial, Helvetica, sans-serif;
	}
	footer {
	visibility: hidden;
	}
	font-family: Arial, Helvetica, sans-serif;
	"""

	LICENSE = """
	Model: <a href="https://huggingface.co/LLaMAX/LLaMAX3-8B-Alpaca">LLaMAX3-8B-Alpaca</a>
	"""

	LANG_LIST = ['Akrikaans', 'Amharic', 'Arabic', 'Armenian', 'Assamese', 'Asturian', 'Azerbaijani', \
	'Belarusian', 'Bengali', 'Bosnian', 'Bulgarian', 'Burmese', \
	'Catalan', 'Cebuano', 'Simplified Chinese', 'Traditional Chinese', 'Croatian', 'Czech', \
	'Danish', 'Dutch', 'English', 'Estonian', 'Filipino', 'Finnish', 'French', 'Fulah', \
	'Galician', 'Ganda', 'Georgian', 'German', 'Greek', 'Gujarati', \
	'Hausa', 'Hebrew', 'Hindi', 'Hungarian', \
	'Icelandic', 'Igbo', 'Indonesian', 'Irish', 'Italian', \
	'Japanese', 'Javanese', \
	'Kabuverdianu', 'Kamba', 'Kannada', 'Kazakh', 'Khmer', 'Korean', 'Kyrgyz', \
	'Lao', 'Latvian', 'Lingala', 'Lithuanian', 'Luo', 'Luxembourgish', \
	'Macedonian', 'Malay', 'Malayalam', 'Maltese', 'Maori', 'Marathi', 'Mongolian', \
	'Nepali', 'Northern', 'Norwegian', 'Nyanja', \
	'Occitan', 'Oriya', 'Oromo', \
	'Pashto', 'Persian', 'Polish', 'Portuguese', 'Punjabi', \
	'Romanian', 'Russian', \
	'Serbian', 'Shona', 'Sindhi', 'Slovak', 'Slovenian', 'Somali', 'Sorani', 'Spanish', 'Swahili', 'Swedish', \
	'Tajik', 'Tamil', 'Telugu', 'Thai', 'Turkish', \
	'Ukrainian', 'Umbundu', 'Urdu', 'Uzbek', \
	'Vietnamese', 'Welsh', 'Wolof', 'Xhosa', 'Yoruba', 'Zulu']

	chatbot = gr.Chatbot(height=600)

	with gr.Blocks(theme="soft", css=CSS) as demo:
	gr.Markdown(TITLE)
	with gr.Row():
	with gr.Column(scale=4):
	source_text = gr.Textbox(
	label="Văn bản gốc",
	value="LLaMAX is a language model with powerful multilingual capabilities without loss instruction-following capabilities. "+\
	"LLaMAX supports translation between more than 100 languages, "+\
	"surpassing the performance of similarly scaled LLMs.",
	lines=10,
	)
	output_text = gr.Textbox(
	label="Văn bản đã được dịch",
	lines=10,
	show_copy_button=True,
	)

	bleu_score_output = gr.Textbox( # New holder area for BLEU score
	label="BLEU Score",
	lines=10,
	interactive=False,
	)

	with gr.Column(scale=1):
	source_lang = gr.Dropdown(
	label="Ngôn ngữ nguồn",
	value="English",
	choices=LANG_LIST,
	)
	target_lang = gr.Dropdown(
	label="Ngôn ngữ đích",
	value="Vietnamese",
	choices=LANG_LIST,
	)
	max_length = gr.Slider(
	label="Độ dài tối đa",
	minimum=512,
	maximum=8192,
	value=4000,
	step=8,
	)
	temperature = gr.Slider(
	label="Temperature",
	minimum=0,
	maximum=1,
	value=0.3,
	step=0.1,
	)
	top_p = gr.Slider(
	label="top_p",
	minimum=0.0,
	maximum=1.0,
	step=0.1,
	value=1.0,
	)
	rp = gr.Slider(
	label="Repetition penalty",
	minimum=1.0,
	maximum=2.0,
	step=0.1,
	value=1.2,
	)
	with gr.Accordion("Tùy chọn nâng cao", open=False):
	inst = gr.Textbox(
	label="Instruction",
	value="Translate the following sentences from {src_language} to {trg_language}.",
	lines=3,
	)
	prompt = gr.Textbox(
	label="Prompt",
	# Prompt 1
	#value="""Below is an instruction that describes a task, paired with an input that provides further context.
	#Write a response that appropriately completes the request.
	### Instruction:
	#{instruction}
	### Input:
	#{query}
	### Response:""",#
	# Prompt 2
	value="""Below is an instruction that describes a task, paired with an input that provides further context.
	Write a response that ensuring accuracy and maintaining the tone and style of the original text.
	### Instruction:
	{instruction}
	### Input:
	{query}
	### Response:""",
	lines=8,
	)

	with gr.Row():
	submit = gr.Button(value="Submit")
	clear = gr.ClearButton([source_text, output_text])
	gr.Markdown(LICENSE)

	#source_text.change(lang_detector, source_text, source_lang)
	#submit.click(fn=translate, inputs=[source_text, source_lang, target_lang, inst, prompt, max_length, temperature, top_p, rp], outputs=[output_text])
	submit.click(fn=translate, inputs=[source_text, source_lang, target_lang, inst, prompt, max_length, temperature, top_p, rp], outputs=[output_text, bleu_score_output])

	if __name__ == "__main__":
	demo.launch()