# Hugging Face Space page banner (scrape residue): "Running on Zero"
import spaces | |
import gradio as gr | |
from huggingface_hub import hf_hub_download | |
from llama_cpp_cuda_tensorcores import Llama | |
# Hugging Face Hub repository and GGUF file name handed to hf_hub_download
# in the __main__ section.
REPO_ID = "MaziyarPanahi/Meta-Llama-3-70B-Instruct-GGUF"
MODEL_NAME = "Meta-Llama-3-70B-Instruct.Q3_K_L.gguf"
# Passed to the Llama constructor as n_ctx.
MAX_CONTEXT_LENGTH = 8192
# When True the model config uses n_gpu_layers=-1, otherwise n_gpu_layers=0.
CUDA = True
# Default system prompt; also the value of the "Assistant" entry in TASK_PROMPT.
SYSTEM_PROMPT = "You are a helpful, smart, kind, and efficient AI assistant. You always fulfill the user's requests to the best of your ability."
# Stop sequences passed to the model call in ChatLLM.response.
TOKEN_STOP = ["<|eot_id|>"]
# Prompt-template fragments concatenated by ChatLLM.apply_chat_template.
# The literal words SYSTEM_PROMPT / USER_PROMPT *inside* the strings are
# placeholders that are substituted with str.replace; they are unrelated to
# the Python constants of the same name.
# NOTE(review): Meta's published Llama 3 chat format has no newline after
# <|eot_id|>; confirm whether the trailing "\n" in these fragments is intended.
SYS_MSG = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nSYSTEM_PROMPT<|eot_id|>\n"
USER_PROMPT = (
    "<|start_header_id|>user<|end_header_id|>\n\nUSER_PROMPT<|eot_id|>\n"
)
# Header that opens an assistant turn; generation continues right after it.
ASSIS_PROMPT = "<|start_header_id|>assistant<|end_header_id|>\n\n"
# Appended after an assistant reply that is already complete (non-empty).
END_ASSIS_PREVIOUS_RESPONSE = "<|eot_id|>\n"
# Preset system messages. The keys are offered in the "Prompt selector"
# dropdown; choosing one copies its value into the "System Message" textbox.
TASK_PROMPT = {
    "Assistant": SYSTEM_PROMPT,
    "Translate": "You are an expert translator. Translate the following text into English.",
    "Summarization": "Summarizing information is my specialty. Let me know what you'd like summarized.",
    "Grammar correction": "Grammar is my forte! Feel free to share the text you'd like me to proofread and correct.",
    "Stable diffusion prompt generator": "You are a stable diffusion prompt generator. Break down the user's text and create a more elaborate prompt.",
    "Play Trivia": "Engage the user in a trivia game on various topics.",
    "Share Fun Facts": "Share interesting and fun facts on various topics.",
    "Explain code": "You are an expert programmer guiding someone through a piece of code step by step, explaining each line and its function in detail.",
    "Paraphrase Master": "You have the knack for transforming complex or verbose text into simpler, clearer language while retaining the original meaning and essence.",
    "Recommend Movies": "Recommend movies based on the user's preferences.",
    "Offer Motivational Quotes": "Offer motivational quotes to inspire the user.",
    "Recommend Books": "Recommend books based on the user's favorite genres or interests.",
    "Philosophical discussion": "Engage the user in a philosophical discussion",
    "Music recommendation": "Tune time! What kind of music are you in the mood for? I'll find the perfect song for you.",
    "Generate a Joke": "Generate a witty joke suitable for a stand-up comedy routine.",
    "Roleplay as a Detective": "Roleplay as a detective interrogating a suspect in a murder case.",
    "Act as a News Reporter": "Act as a news reporter covering breaking news about an alien invasion.",
    "Play as a Space Explorer": "Play as a space explorer encountering a new alien civilization.",
    "Be a Medieval Knight": "Imagine yourself as a medieval knight embarking on a quest to rescue a princess.",
    "Act as a Superhero": "Act as a superhero saving a city from a supervillain's evil plot.",
    "Play as a Pirate Captain": "Play as a pirate captain searching for buried treasure on a remote island.",
    "Be a Famous Celebrity": "Imagine yourself as a famous celebrity attending a glamorous red-carpet event.",
    "Design a New Invention": "Imagine you're an inventor tasked with designing a revolutionary new invention that will change the world.",
    "Act as a Time Traveler": "You've just discovered time travel! Describe your adventures as you journey through different eras.",
    "Play as a Magical Girl": "You are a magical girl with extraordinary powers, battling dark forces to protect your city and friends.",
    "Act as a Shonen Protagonist": "You are a determined and spirited shonen protagonist on a quest for strength, friendship, and victory.",
    "Roleplay as a Tsundere Character": "You are a tsundere character, initially cold and aloof but gradually warming up to others through unexpected acts of kindness.",
}
# Custom stylesheet passed to gr.Blocks: a full-size background image for the
# container. The image lives under ./assets/, which is listed in
# allowed_paths when the app is launched.
css = ".gradio-container {background-image: url('file=./assets/background.png'); background-size: cover; background-position: center; background-repeat: no-repeat;}"
class ChatLLM:
    """Chat wrapper around a ``Llama`` model that is loaded on first use.

    The instance keeps the constructor keyword arguments for ``Llama`` and
    only instantiates the model when the first response is requested.

    NOTE(review): the lazy load in ``response`` is not guarded by a lock;
    confirm that concurrent first requests cannot trigger two loads.
    """

    def __init__(self, config_model):
        """Remember the ``Llama`` keyword arguments without loading the model.

        Args:
            config_model: mapping unpacked into ``Llama(**config_model)``
                by ``load_cpp_model``.
        """
        self.llm = None
        self.config_model = config_model

    def load_cpp_model(self):
        """Instantiate ``Llama`` from the configuration stored on the instance."""
        # Read the instance attribute, not a module-level global that merely
        # happens to share the name: the class must work when it is
        # constructed outside the ``__main__`` script section.
        self.llm = Llama(**self.config_model)

    def apply_chat_template(
        self,
        history,
        system_message,
    ):
        """Render the system message and chat history as one prompt string.

        Args:
            history: list of ``[user_text, assistant_text]`` pairs, or a
                falsy value for "no turns yet".
            system_message: system prompt; surrounding whitespace is stripped.

        Returns:
            The prompt text. A turn whose assistant text is empty is left
            open (no end-of-turn fragment) so the model continues from the
            assistant header.
        """
        parts = [SYS_MSG.replace("SYSTEM_PROMPT", system_message.strip())]
        for msg in history or []:
            # A missing reply (None) is treated like an empty one.
            reply = msg[1] or ""
            parts.append(
                USER_PROMPT.replace("USER_PROMPT", msg[0]) + ASSIS_PROMPT + reply
            )
            if reply:
                parts.append(END_ASSIS_PREVIOUS_RESPONSE)
        messages = "".join(parts)
        print(messages)
        return messages

    def response(
        self,
        history,
        system_message,
        max_tokens,
        temperature,
        top_p,
        top_k,
        repeat_penalty,
    ):
        """Stream the assistant reply for the last turn in ``history``.

        Args:
            history: list of ``[user_text, assistant_text]`` pairs; the last
                pair is the turn being answered and is updated in place.
            system_message: system prompt for this request.
            max_tokens, temperature, top_p, top_k, repeat_penalty: sampling
                settings forwarded unchanged to the model call.

        Yields:
            ``(history, history)`` after every streamed chunk, one copy for
            each of the two outputs wired to this handler.
        """
        history = history or []
        if not history:
            # No pending turn: emit the empty state once instead of failing
            # on history[-1].
            yield history, history
            return

        # Build the prompt before clearing the reply slot so the template
        # sees the turn exactly as it was handed in.
        messages = self.apply_chat_template(history, system_message)
        history[-1][1] = ""

        if not self.llm:
            print("Loading model")
            self.load_cpp_model()

        for output in self.llm(
            messages,
            echo=False,
            stream=True,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repeat_penalty=repeat_penalty,
            stop=TOKEN_STOP,
        ):
            history[-1][1] += output["choices"][0]["text"]
            # stream the response
            yield history, history
def user(message, history):
    """Record a new user turn and blank the input box.

    The turn is stored as ``[message, ""]``; the empty second slot is the
    assistant reply still to be generated. A falsy ``history`` starts a new
    list, otherwise the given list is extended in place.

    Returns:
        ``("", history)``: the cleared textbox value and the updated history.
    """
    conversation = history if history else []
    pending_turn = [message, ""]
    conversation.append(pending_turn)
    return "", conversation
def clear_chat(chat_history_state, chat_message):
    """Return a fresh empty history and an empty message.

    Both arguments are ignored; they exist so the function can be wired to
    the same components as inputs and outputs.
    """
    return [], ""
def gui(llm_chat):
    """Assemble the Gradio Blocks interface for the chat demo.

    The page has a chat window on the left and, on the right, the message
    box, the Send / New chat / Stop buttons, a system-prompt editor with
    presets from TASK_PROMPT, and sliders for the sampling settings.

    Args:
        llm_chat: object whose ``response`` generator is registered as the
            streaming handler that runs after a message is sent.

    Returns:
        The constructed ``gr.Blocks`` app; launching is left to the caller.
    """
    with gr.Blocks(theme="NoCrypt/miku", css=css) as demo:
        gr.Markdown("# Llama 3 70B Instruct GGUF")
        gr.Markdown(
            f"""
            ### This demo utilizes the repository ID {REPO_ID} with the model {MODEL_NAME}, powered by the LLaMA.cpp backend.
            """
        )

        with gr.Row():
            # Left: conversation view.
            with gr.Column(scale=2):
                chat_window = gr.Chatbot(
                    label="Chat",
                    height=700,
                    avatar_images=(
                        "assets/avatar_user.jpeg",
                        "assets/avatar_llama.jpeg",
                    ),
                )

            # Right: input, buttons and settings.
            with gr.Column(scale=1):
                with gr.Row():
                    message_box = gr.Textbox(
                        label="Message",
                        placeholder="Ask me anything.",
                        lines=3,
                    )
                with gr.Row():
                    send_button = gr.Button(value="Send message", variant="primary")
                    new_chat_button = gr.Button(value="New chat", variant="primary")
                    stop_button = gr.Button(value="Stop", variant="secondary")

                with gr.Accordion("Contextual Prompt Editor"):
                    default_task = "Assistant"
                    task_dropdown = gr.Dropdown(
                        choices=TASK_PROMPT,
                        value=default_task,
                        label="Prompt selector",
                        visible=True,
                        interactive=True,
                    )
                    system_box = gr.Textbox(
                        value=TASK_PROMPT[default_task],
                        label="System Message",
                        placeholder="system prompt",
                        lines=4,
                    )

                    def task_selector(choice):
                        # Load the preset text for the chosen task.
                        preset = TASK_PROMPT[choice]
                        return gr.update(value=preset)

                    task_dropdown.change(
                        fn=task_selector,
                        inputs=[task_dropdown],
                        outputs=[system_box],
                    )

                with gr.Accordion("Advanced settings", open=False):
                    with gr.Column():
                        max_tokens_slider = gr.Slider(
                            minimum=20,
                            maximum=4096,
                            label="Max Tokens",
                            step=20,
                            value=400,
                        )
                        temperature_slider = gr.Slider(
                            minimum=0.2,
                            maximum=2.0,
                            label="Temperature",
                            step=0.1,
                            value=0.8,
                        )
                        top_p_slider = gr.Slider(
                            minimum=0.0,
                            maximum=1.0,
                            label="Top P",
                            step=0.05,
                            value=0.95,
                        )
                        top_k_slider = gr.Slider(
                            minimum=0,
                            maximum=100,
                            label="Top K",
                            step=1,
                            value=40,
                        )
                        repeat_penalty_slider = gr.Slider(
                            minimum=0.0,
                            maximum=2.0,
                            label="Repetition Penalty",
                            step=0.1,
                            value=1.1,
                        )

        # Conversation history as [user, assistant] pairs, kept per session.
        history_state = gr.State()

        # "New chat": reset the stored history and the message box, then
        # blank the chat window.
        reset_targets = [history_state, message_box]
        new_chat_button.click(
            fn=clear_chat,
            inputs=reset_targets,
            outputs=reset_targets,
            queue=False,
        )
        new_chat_button.click(
            fn=lambda: None,
            inputs=None,
            outputs=chat_window,
            queue=False,
        )

        # "Send message": first store the user turn, then stream the reply.
        message_and_history = [message_box, history_state]
        generation_inputs = [
            history_state,
            system_box,
            max_tokens_slider,
            temperature_slider,
            top_p_slider,
            top_k_slider,
            repeat_penalty_slider,
        ]
        user_turn_event = send_button.click(
            fn=user,
            inputs=message_and_history,
            outputs=message_and_history,
            queue=True,
        )
        generation_event = user_turn_event.then(
            fn=llm_chat.response,
            inputs=generation_inputs,
            outputs=[chat_window, history_state],
            queue=True,
        )

        # "Stop": cancel the send/generate chain.
        stop_button.click(
            fn=None,
            inputs=None,
            outputs=None,
            cancels=[generation_event],
            queue=False,
        )

    return demo
if __name__ == "__main__":
    # Resolve the local path of the GGUF file through the Hugging Face Hub.
    model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_NAME)

    # Keyword arguments for the Llama constructor.
    # n_gpu_layers is -1 when CUDA is enabled and 0 otherwise.
    gpu_layers = -1 if CUDA else 0
    config_model = dict(
        model_path=model_path,
        n_ctx=MAX_CONTEXT_LENGTH,
        n_gpu_layers=gpu_layers,
    )

    # Build the chat backend and the UI around it.
    llm_chat = ChatLLM(config_model)
    app = gui(llm_chat)
    app.queue(default_concurrency_limit=40)

    launch_options = {
        "max_threads": 40,
        "share": False,
        "show_error": True,
        "quiet": False,
        "debug": True,
        # Lets the app serve the avatar and background images.
        "allowed_paths": ["./assets/"],
    }
    app.launch(**launch_options)