Spaces:

StarFox7
/

Llama-2-ko-7B-chat-ggml

Runtime error

App Files Files Community

Llama-2-ko-7B-chat-ggml / app.py

StarFox7

Update app.py

e1e7312 about 1 year ago

raw

history blame contribute delete

9.7 kB

	"""Run codes."""
	# pylint: disable=line-too-long, broad-exception-caught, invalid-name, missing-function-docstring, too-many-instance-attributes, missing-class-docstring
	# ruff: noqa: E501
	import os
	import platform
	import random
	import time
	from dataclasses import asdict, dataclass
	from pathlib import Path

	# from types import SimpleNamespace
	import gradio as gr
	import psutil
	from about_time import about_time
	from ctransformers import AutoModelForCausalLM
	from dl_hf_model import dl_hf_model
	from loguru import logger

	# Model file(s) published in the source repository (kept for reference; the
	# download below is driven by `url`).
	filename_list = [
	    "Llama-2-ko-7B-chat-gguf-q4_0.bin",
	]

	# Location of the quantized model handed to dl_hf_model().
	url = "https://huggingface.co/StarFox7/Llama-2-ko-7B-chat-gguf/blob/main/Llama-2-ko-7B-chat-gguf-q4_0.bin"

	# Every question is wrapped in this template before it is sent to the model.
	prompt_template = "Q: {question}. A: "

	# Text that marks the end of an answer.
	stop_string = ["Q:", "\n"]

	# NOTE: the message below is stale -- bot() in this file does consult
	# stop_string while streaming.
	logger.debug(f"{stop_string=} not used")

	# Leave one physical core free, but never go below one worker.
	# psutil.cpu_count() can return None when the count cannot be determined;
	# treat that as a single core instead of failing on `None - 1`.
	_physical_cores = psutil.cpu_count(logical=False)
	cpu_count: int = max((_physical_cores or 1) - 1, 1)
	logger.debug(f"{cpu_count=}")

	# Module-level model handle; stays None until loading below succeeds.
	LLM = None

	# Fetch the model file.  Any failure is logged and ends the process with
	# exit status 1, since nothing below can work without the model.
	try:
	    model_loc, file_size = dl_hf_model(url)
	except Exception as exc_:
	    logger.error(exc_)
	    raise SystemExit(1) from exc_

	# Load the downloaded file as a llama-type model (thread count is left at
	# the library default).
	LLM = AutoModelForCausalLM.from_pretrained(model_loc, model_type="llama")

	# file_size is presumably reported in gigabytes, hence the "G" suffix.
	logger.info(f"done load llm {model_loc=} {file_size=}G")

	# Switch the process time zone to Asia/Seoul.  time.tzset() is not
	# available on every platform, so a failure only produces a warning.
	os.environ["TZ"] = "Asia/Seoul"
	try:
	    time.tzset()  # type: ignore # pylint: disable=no-member
	except Exception:
	    # Windows
	    logger.warning("Windows, cant run time.tzset()")

	# Disabled experiment kept as an inert string; the value is never read.
	_ = """
	ns = SimpleNamespace(
	response="",
	generator=(_ for _ in []),
	)
	# """

	@dataclass
	class GenerationConfig:
	    """Generation settings forwarded to the model as keyword arguments.

	    ``generate`` expands an instance with ``asdict`` and passes every field
	    to the model call, so each field name is also a keyword of that call.
	    No ``threads`` or ``stop`` field is defined, so neither is forwarded.
	    """

	    # Sampling controls.
	    temperature: float = 0.7
	    top_k: int = 50
	    top_p: float = 0.9
	    repetition_penalty: float = 1.0

	    # Upper bound on the number of generated tokens.
	    max_new_tokens: int = 1024

	    # Sampling seed.
	    seed: int = 42

	    # Callers in this file set reset=True to reset history (cache).
	    reset: bool = False

	    # Whether the model call should stream its output.
	    stream: bool = True


	def generate(
	    question: str,
	    llm=LLM,
	    config: "GenerationConfig | None" = None,
	):
	    """Run model inference for a single question.

	    The question is inserted into the module-level ``prompt_template`` and
	    the resulting prompt is passed to ``llm`` together with every field of
	    ``config`` as keyword arguments.

	    Args:
	        question: Raw user text to place into the prompt template.
	        llm: Callable model; defaults to the model loaded at import time.
	        config: Generation settings.  When omitted, a fresh
	            ``GenerationConfig()`` is created for this call, so no instance
	            is shared (and possibly mutated) between calls.

	    Returns:
	        Whatever ``llm`` returns; will return a Generator if streaming is
	        true.
	    """
	    if config is None:
	        config = GenerationConfig()

	    prompt = prompt_template.format(question=question)

	    return llm(prompt, **asdict(config))


	# Log the default generation settings once, when the module is loaded.
	logger.debug(f"{asdict(GenerationConfig())=}")


	def user(user_message, history):
	    """Append the user's message to the chat history as a pending turn.

	    Args:
	        user_message: Text taken from the message box.
	        history: Chat history as a list of ``[user, bot]`` pairs.  ``None``
	            is treated as an empty history (the Clear History handler in
	            this file emits ``None`` for the chatbot).  A list is extended
	            in place.

	    Returns:
	        ``(user_message, history)``: the message is returned unchanged so
	        the textbox keeps its content, and the new turn has ``None`` as its
	        bot part until a reply is produced.
	    """
	    if history is None:
	        history = []
	    history.append([user_message, None])
	    return user_message, history  # keep user_message


	def user1(user_message, history):
	    """Append the user's message to the chat history and clear the textbox.

	    Args:
	        user_message: Text taken from the message box.
	        history: Chat history as a list of ``[user, bot]`` pairs.  ``None``
	            is treated as an empty history (the Clear History handler in
	            this file emits ``None`` for the chatbot).  A list is extended
	            in place.

	    Returns:
	        ``("", history)``: the empty string clears the message box, and the
	        new turn has ``None`` as its bot part until a reply is produced.
	    """
	    if history is None:
	        history = []
	    history.append([user_message, None])
	    return "", history  # clear user_message


	def bot_(history):
	    """Stream a canned reply into the last turn of ``history``.

	    A random stock phrase is chosen, prefixed with the user's own message,
	    and revealed one character at a time (with a 20 ms pause per character),
	    yielding ``history`` after each step.  The final yield replaces the
	    typed-out text with the stock phrase alone.
	    """
	    user_message = history[-1][0]
	    resp = random.choice(["How are you?", "I love you", "I'm very hungry"])
	    bot_message = user_message + ": " + resp

	    # Start from an empty reply, then reveal a growing prefix of the message.
	    history[-1][1] = ""
	    for shown, _char in enumerate(bot_message, start=1):
	        history[-1][1] = bot_message[:shown]
	        time.sleep(0.02)
	        yield history

	    history[-1][1] = resp
	    yield history


	def bot(history):
	    """Stream the model's answer to the latest user message into ``history``.

	    The user's text is read from ``history[-1][0]``.  Tokens are produced by
	    ``generate`` with ``GenerationConfig(reset=True)``; after each token the
	    answer so far is written to ``history[-1][1]`` and ``history`` is
	    yielded so the UI can refresh.  Generation ends early once the answer
	    ends with one of the module-level ``stop_string`` entries, and the
	    matched stop text is removed from the answer.

	    Args:
	        history: Chat history as a list of ``[user, bot]`` pairs; mutated in
	            place.

	    Yields:
	        The same ``history`` list after every update.  While streaming, the
	        answer is prefixed with the time taken to receive the first token;
	        the final yield drops that prefix and appends a timing summary on a
	        new line.
	    """
	    user_message = history[-1][0]
	    response = []
	    answer = ""

	    logger.debug(f"{user_message=}")

	    with about_time() as atime:  # type: ignore
	        awaiting_first_token = True
	        prefix = ""
	        then = time.time()

	        logger.debug("about to generate")

	        config = GenerationConfig(reset=True)
	        for elm in generate(user_message, config=config):
	            if awaiting_first_token:
	                logger.debug("in the loop")
	                prefix = f"({time.time() - then:.2f}s) "
	                awaiting_first_token = False
	                print(prefix, end="", flush=True)
	                logger.debug(f"{prefix=}")
	            print(elm, end="", flush=True)

	            response.append(elm)
	            # "▁" (presumably the tokenizer's word-boundary marker) is shown
	            # as a plain space.
	            answer = "".join(response).replace("▁", " ")

	            # Compare the tail against each stop string at that string's own
	            # length.  The earlier check compared a fixed two-character tail,
	            # so the one-character "\n" entry could never match, and it
	            # dropped two list items rather than the matched text.  As
	            # before, very short answers (two characters or fewer) are never
	            # cut.
	            stop_hit = None
	            if len(answer) > 2:
	                stop_hit = next(
	                    (s for s in stop_string if s and answer.endswith(s)), None
	                )
	            if stop_hit is not None:
	                answer = answer[: -len(stop_hit)]
	                break

	            history[-1][1] = prefix + answer
	            yield history

	    # Guard the per-character rate against an empty answer, which would
	    # otherwise raise ZeroDivisionError.
	    _ = (
	        f"(time elapsed: {atime.duration_human}, "  # type: ignore
	        f"{atime.duration / max(len(answer), 1):.2f}s/char)"  # type: ignore
	    )

	    history[-1][1] = answer + f"\n{_}"
	    yield history


	def predict_api(prompt):
	    """Answer ``prompt`` in one shot for the API endpoint.

	    Runs ``generate`` with a fixed, non-streaming configuration.  Any
	    exception is logged and converted into the returned text instead of
	    propagating.

	    Args:
	        prompt: Text supplied by the API caller.

	    Returns:
	        The result of ``generate``, or the string ``exc=<repr>`` describing
	        the exception when generation fails.
	    """
	    logger.debug(f"{prompt=}")

	    settings = {
	        "temperature": 0.2,
	        "top_k": 10,
	        "top_p": 0.9,
	        "repetition_penalty": 1.0,
	        "max_new_tokens": 512,  # adjust as needed
	        "seed": 42,
	        "reset": True,  # reset history (cache)
	        "stream": False,
	    }

	    try:
	        response = generate(prompt, config=GenerationConfig(**settings))
	        logger.debug(f"api: {response=}")
	    except Exception as exc:
	        logger.error(exc)
	        response = f"{exc=}"

	    return response


	# Extra CSS passed to gr.Blocks(css=...).  The "xsmall" and "disclaimer"
	# classes are attached to components below through elem_classes; no
	# component in the visible code uses "importantButton".
	css = """
	.importantButton {
	background: linear-gradient(45deg, #7e0570,#5d1c99, #6e00ff) !important;
	border: none !important;
	}
	.importantButton:hover {
	background: linear-gradient(45deg, #ff00e0,#8500ff, #6e00ff) !important;
	border: none !important;
	}
	.disclaimer {font-variant-caps: all-small-caps; font-size: xx-small;}
	.xsmall {font-size: x-small;}
	"""

	# Sample prompts for the "Example Inputs" accordion; each inner list holds
	# the value for the single input component (the message box).
	examples_list = [
	["인생이란 뭘까요?"],
	]

	# Marks the start of UI construction in the log.
	logger.info("start block")

	# Build the chat UI: an info panel, the chat window, the message box with
	# Submit/Stop/Clear buttons, a hidden system-prompt row, example prompts, a
	# disclaimer, and a hidden panel that exposes predict_api as the "api"
	# endpoint.
	with gr.Blocks(
	    title=f"{Path(model_loc).name}",
	    theme=gr.themes.Soft(text_size="sm", spacing_size="sm"),
	    css=css,
	) as block:
	    with gr.Accordion("🎈 Info", open=False):
	        # The heading used to open with <h5> but close with </h4>; the
	        # closing tag now matches.
	        gr.Markdown(
	            f"""<h5><center>{Path(model_loc).name}</center></h5>
	Most examples are meant for another model.
	You probably should try to test
	some related prompts.""",
	            elem_classes="xsmall",
	        )

	    chatbot = gr.Chatbot(height=500)

	    with gr.Row():
	        with gr.Column(scale=5):
	            msg = gr.Textbox(
	                label="Chat Message Box",
	                placeholder="Ask me anything (press Shift+Enter or click Submit to send)",
	                show_label=False,
	                lines=6,
	                max_lines=30,
	                show_copy_button=True,
	            )
	        with gr.Column(scale=1, min_width=50):
	            with gr.Row():
	                submit = gr.Button("Submit", elem_classes="xsmall")
	                stop = gr.Button("Stop", visible=True)
	                clear = gr.Button("Clear History", visible=True)

	    # Hidden row: the system-prompt controls are created but not shown, and
	    # no handlers are attached to them in the visible code.
	    with gr.Row(visible=False):
	        with gr.Accordion("Advanced Options:", open=False):
	            with gr.Row():
	                with gr.Column(scale=2):
	                    system = gr.Textbox(
	                        label="System Prompt",
	                        value=prompt_template,
	                        show_label=False,
	                        container=False,
	                    )
	                with gr.Column():
	                    with gr.Row():
	                        change = gr.Button("Change System Prompt")
	                        reset = gr.Button("Reset System Prompt")

	    with gr.Accordion("Example Inputs", open=True):
	        examples = gr.Examples(
	            examples=examples_list,
	            inputs=[msg],
	            examples_per_page=40,
	        )

	    with gr.Accordion("Disclaimer", open=False):
	        _ = Path(model_loc).name
	        # Both fragments that mention the model name must be f-strings;
	        # the second one used to lack the prefix and showed a literal "{_}".
	        gr.Markdown(
	            f"Disclaimer: {_} can produce factually incorrect output, and should not be relied on to produce "
	            f"factually accurate information. {_} was trained on various public datasets; while great efforts "
	            "have been taken to clean the pretraining data, it is possible that this model could generate lewd, "
	            "biased, or otherwise offensive outputs.",
	            elem_classes=["disclaimer"],
	        )

	    # Submitting from the textbox records the turn (keeping the text in the
	    # box) and then streams the model's reply into the chat window.
	    msg_submit_event = msg.submit(
	        fn=user,
	        inputs=[msg, chatbot],
	        outputs=[msg, chatbot],
	        queue=True,
	        show_progress="full",
	    ).then(bot, chatbot, chatbot, queue=True)

	    # The Submit button does the same but clears the textbox.
	    submit_click_event = submit.click(
	        fn=user1,  # clear msg
	        inputs=[msg, chatbot],
	        outputs=[msg, chatbot],
	        queue=True,
	        show_progress="full",
	    ).then(bot, chatbot, chatbot, queue=True)

	    # Stop runs no function of its own; it only cancels the two event
	    # chains above.
	    stop.click(
	        fn=None,
	        inputs=None,
	        outputs=None,
	        cancels=[msg_submit_event, submit_click_event],
	        queue=False,
	    )

	    # Clear History empties the chat window by emitting None for it.
	    clear.click(lambda: None, None, chatbot, queue=False)

	    # Hidden widgets that exist only to register the "api" endpoint.
	    with gr.Accordion("For Chat/Translation API", open=False, visible=False):
	        input_text = gr.Text()
	        api_btn = gr.Button("Go", variant="primary")
	        out_text = gr.Text()

	    api_btn.click(
	        predict_api,
	        input_text,
	        out_text,
	        api_name="api",
	    )

	# Queue settings: a concurrency count of one, and a queue capped at five
	# entries (see max_size below).
	# NOTE(review): `concurrency_count` is presumably the Gradio 3.x form of
	# this setting -- confirm it is still accepted by the installed Gradio.
	concurrency_count = 1
	logger.info(f"{concurrency_count=}")

	# Enable the request queue, then start the server in debug mode.  This runs
	# unconditionally when the module is executed.
	block.queue(concurrency_count=concurrency_count, max_size=5).launch(debug=True)