import logging
import sys

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)
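# Assumed dependencies (not pinned in the original listing): gradio, huggingface_hub,
# llama-cpp-python, and llama-index<0.10; the llama_index.llms.LlamaCPP and
# llama_index.llms.llama_utils import paths above come from the pre-0.10 API.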
MODELS_PATH = "./models"

mistral_model_path = hf_hub_download(
    repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
    filename="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    resume_download=True,
    cache_dir=MODELS_PATH,
)
"""Step 3 : if you use GPU then make sure ( n_gpu_layers":1) at least 1, you can increase or decrease it based on your GPU performance""" | |
llm = LlamaCPP(
    # You can pass in the URL to a GGML/GGUF model to download it automatically
    # model_url=model_url,
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=mistral_model_path,
    temperature=0.1,
    max_new_tokens=256,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set to at least 1 to use the GPU; -1 offloads all layers to the GPU
    model_kwargs={"n_gpu_layers": -1},
    # transform inputs into Llama2 format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
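# Optional sanity check (not in the original app; assumes the model above loaded correctly):
# uncomment to run a one-off, non-streaming completion before the Gradio UI starts.
# print(llm.complete("Say hello in one short sentence.").text)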
def model_initialization(model):
    # reassign the module-level llm so predict() uses the newly selected model
    global llm
    if model != "":
        gr.Info("Model download and configuration has started, please wait...")
    MODELS_PATH = "./models"
    repo_id = ""
    filename = ""
    if model == "Llama-2-13B-chat":
        repo_id = "TheBloke/Llama-2-13B-chat-GGUF"
        filename = "llama-2-13b-chat.Q4_K_M.gguf"
    elif model == "Mistral-7B-Instruct-v0.2":
        repo_id = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
        filename = "mistral-7b-instruct-v0.2.Q4_K_M.gguf"
    elif model == "zephyr-7B-beta":
        repo_id = "TheBloke/zephyr-7B-beta-GGUF"
        filename = "zephyr-7b-beta.Q4_K_M.gguf"
    elif model == "vicuna-7B-v1.5":
        repo_id = "TheBloke/vicuna-7B-v1.5-GGUF"
        filename = "vicuna-7b-v1.5.Q4_K_M.gguf"
    elif model == "Falcon-7B-Instruct":
        repo_id = "TheBloke/Falcon-7B-Instruct-GGML"
        filename = "falcon-7b-instruct.ggccv1.q4_1.bin"
    elif model == "CodeLlama-7B":
        repo_id = "TheBloke/CodeLlama-7B-GGUF"
        filename = "codellama-7b.Q4_K_M.gguf"
    else:
        gr.Warning("Please select a model")
    model_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        resume_download=True,
        cache_dir=MODELS_PATH,
    )
    llm = LlamaCPP(
        # You can pass in the URL to a GGML/GGUF model to download it automatically
        # model_url=model_url,
        # optionally, you can set the path to a pre-downloaded model instead of model_url
        model_path=model_path,
        temperature=0.1,
        max_new_tokens=256,
        # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
        context_window=3900,
        # kwargs to pass to __call__()
        generate_kwargs={},
        # set to at least 1 to use the GPU; -1 offloads all layers to the GPU
        model_kwargs={"n_gpu_layers": -1},
        # transform inputs into Llama2 format
        messages_to_prompt=messages_to_prompt,
        completion_to_prompt=completion_to_prompt,
        verbose=True,
    )
    gr.Info("Model has been configured and is ready to chat")
    return "Model has been configured and is ready to chat; your current model is " + model
def predict(message, history):
    answer = []
    # stream_complete yields CompletionResponse chunks; .delta holds the newest token
    response = llm.stream_complete(message)
    for bot_response in response:
        token = bot_response.delta
        answer.append(token)
        # join without extra spaces: each delta already carries its own whitespace
        final_answer = "".join(answer)
        yield final_answer
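# Note: the history argument is not forwarded to the model, so each reply only sees the
# latest message; gr.ChatInterface streams the reply because predict() is a generator
# that yields the progressively longer final_answer.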
with gr.Blocks() as UI:
    models = gr.Dropdown(
        ["CodeLlama-7B", "Llama-2-13B-chat", "Falcon-7B-Instruct", "Mistral-7B-Instruct-v0.2",
         "zephyr-7B-beta", "vicuna-7B-v1.5"],
        value="Mistral-7B-Instruct-v0.2",
        label="Please select a model",
        info="default model is Mistral-7B-Instruct-v0.2",
    )
    textInfo = gr.Textbox(value="current model is Mistral-7B-Instruct-v0.2", label="Model Status")
    # Chatbot interface
    chatUI = gr.ChatInterface(
        predict,
        title="Open Source LLM ChatBot",
        description="Ask any question",
        theme="soft",
        examples=[
            "Hello",
            "Are you an LLM model?",
            "How can I fine-tune a pre-trained LLM model?",
            "How can I build a chatbot using a local open-source LLM?",
        ],
        cache_examples=False,
        submit_btn="Send Message",
        retry_btn=None,
        undo_btn="Delete Previous",
        clear_btn="Clear",
    )
    models.change(fn=model_initialization, inputs=[models], outputs=[textInfo])

if __name__ == "__main__":
    UI.launch(debug=True)  # launch app