Mahadih534 committed on
Commit
032d253
·
verified ·
1 Parent(s): 5c8fa79

uploaded again

Browse files
Files changed (1) hide show
  1. app.py +133 -0
app.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import hf_hub_download
2
+ import logging
3
+ import sys
4
+ import gradio as gr
5
+ from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
6
+ from llama_index.llms import LlamaCPP
7
+ from llama_index.llms.llama_utils import (
8
+ messages_to_prompt,
9
+ completion_to_prompt,
10
+ )
11
+
12
# Directory handed to hf_hub_download as its cache location.
MODELS_PATH = "./models"

# Download the default model's weights file from the Hugging Face Hub;
# hf_hub_download returns the local path of the file.
mistral_model_path = hf_hub_download(
    repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
    filename="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    resume_download=True,
    cache_dir=MODELS_PATH,
)

# Step 3: if you use a GPU, make sure "n_gpu_layers" is at least 1; raise or
# lower it based on your GPU performance.
# NOTE(review): the value used below is -1, presumably "offload every layer"
# in llama.cpp -- confirm against the llama-cpp-python docs.
_default_llm_settings = {
    "temperature": 0.1,
    "max_new_tokens": 256,
    # The original note cites Llama 2's 4096-token window; the value is kept
    # a little lower to leave some wiggle room.
    "context_window": 3900,
    # kwargs to pass to __call__()
    "generate_kwargs": {},
    # kwargs to pass to __init__()
    "model_kwargs": {"n_gpu_layers": -1},
    # transform inputs into Llama2 format
    "messages_to_prompt": messages_to_prompt,
    "completion_to_prompt": completion_to_prompt,
    "verbose": True,
}

# A model_url could be passed instead to let LlamaCPP download the weights
# itself; here the pre-downloaded file is used.
llm = LlamaCPP(model_path=mistral_model_path, **_default_llm_settings)
43
+
44
+
45
# Dropdown label -> (Hugging Face repo id, weights filename) for every model
# the UI can switch to.
_MODEL_SOURCES = {
    "Llama-2-13B-chat": (
        "TheBloke/Llama-2-13B-chat-GGUF",
        "llama-2-13b-chat.Q4_K_M.gguf",
    ),
    "Mistral-7B-Instruct-v0.2": (
        "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
        "mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    ),
    "zephyr-7B-beta": (
        "TheBloke/zephyr-7B-beta-GGUF",
        "zephyr-7b-beta.Q4_K_M.gguf",
    ),
    "vicuna-7B-v1.5": (
        "TheBloke/vicuna-7B-v1.5-GGUF",
        "vicuna-7b-v1.5.Q4_K_M.gguf",
    ),
    # NOTE(review): this is a GGML ".bin" file, unlike the GGUF files above;
    # verify that the installed llama.cpp build can still load it.
    "Falcon-7B-Instruct": (
        "TheBloke/Falcon-7B-Instruct-GGML",
        "falcon-7b-instruct.ggccv1.q4_1.bin",
    ),
    "CodeLlama-7B": (
        "TheBloke/CodeLlama-7B-GGUF",
        "codellama-7b.Q4_K_M.gguf",
    ),
}


def model_initialization(model):
    """Download the selected model and make it the active chat model.

    Args:
        model: Label chosen in the model dropdown; must be a key of
            ``_MODEL_SOURCES``.

    Returns:
        A status message for the "Model Status" textbox.
    """
    # Rebind the module-level ``llm`` that ``predict`` reads; without this
    # declaration the new model would only live in a local variable and the
    # chat would keep answering with the previous model.
    global llm

    source = _MODEL_SOURCES.get(model) if isinstance(model, str) else None
    if source is None:
        # Empty or unrecognised selection: do not attempt a download with an
        # empty repo id, just tell the user and keep the current model.
        gr.Warning("please select at least one model")
        return "please select at least one model"

    gr.Info("model downloading and configuration process has been started, please wait...")
    repo_id, filename = source

    model_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        resume_download=True,
        cache_dir=MODELS_PATH,
    )

    llm = LlamaCPP(
        # Path to the pre-downloaded weights (a model_url could be used
        # instead to let LlamaCPP download the file itself).
        model_path=model_path,
        temperature=0.1,
        max_new_tokens=256,
        # Kept below the 4096-token window cited for Llama 2 to leave some
        # wiggle room.
        context_window=3900,
        # kwargs to pass to __call__()
        generate_kwargs={},
        # kwargs to pass to __init__(); controls GPU layer offloading.
        model_kwargs={"n_gpu_layers": -1},
        # transform inputs into Llama2 format
        messages_to_prompt=messages_to_prompt,
        completion_to_prompt=completion_to_prompt,
        verbose=True,
    )

    gr.Info("model has been configured and ready to chat")
    return "model has been configured and ready to chat, your current model is " + model
99
+
100
def predict(message, history):
    """Stream the active model's reply to ``message`` for the chat UI.

    Args:
        message: The latest user message.
        history: Previous chat turns supplied by ``gr.ChatInterface``. It is
            not used: every message is completed without conversation
            context.

    Yields:
        The reply accumulated so far, extended by one streamed fragment per
        yield, so the chat window updates progressively.
    """
    partial_answer = ""
    for chunk in llm.stream_complete(message):
        # Each delta is a raw text fragment of the completion, so fragments
        # are concatenated as-is; joining them with spaces would split words
        # that the model emitted as several tokens. A missing delta is
        # treated as an empty fragment.
        partial_answer += chunk.delta or ""
        yield partial_answer
109
+
110
# Labels offered in the model dropdown. Each entry is its own string: a
# missing comma here would silently fuse two adjacent names into one choice.
_MODEL_CHOICES = [
    "CodeLlama-7B",
    "Llama-2-13B-chat",
    "Falcon-7B-Instruct",
    "Mistral-7B-Instruct-v0.2",
    "zephyr-7B-beta",
    "vicuna-7B-v1.5",
]

with gr.Blocks() as UI:
    # Single-select dropdown, so the initial value is one label (not a list);
    # it matches the model loaded at start-up.
    models = gr.Dropdown(
        _MODEL_CHOICES,
        value="Mistral-7B-Instruct-v0.2",
        label="please select at least one model",
        info="default model is Mistral-7B-Instruct-v0.2",
    )
    textInfo = gr.Textbox(
        value="current model is Mistral-7B-Instruct-v0.2",
        label="Model Status",
    )

    # Chatbot interface
    chatUI = gr.ChatInterface(
        predict,
        title="Open Source LLM ChatBot",
        description="Ask any question",
        theme="soft",
        examples=[
            "Hello",
            "are you LLM model?",
            "how can i finetune a pre-trained LLM model?",
            "How can i build a chatbot using local open-souce LLM ?",
        ],
        cache_examples=False,
        submit_btn="Send Message",
        retry_btn=None,
        undo_btn="Delete Previous",
        clear_btn="Clear",
    )

    # Reconfigure the model whenever the selection changes and show the
    # returned status message in the status textbox.
    models.change(fn=model_initialization, inputs=[models], outputs=[textInfo])

if __name__ == "__main__":
    UI.launch(debug=True)  # launch app