seanpedrickcase committed
Commit 232a079 · 1 Parent(s): 15476e8
Changed 'large' model to Phi 3 Mini gguf 128k. Added requirements file for cpu. Put prompts in separate file.

Files changed:
- app.py (+14 -14)
- chatfuncs/chatfuncs.py (+8 -60)
- chatfuncs/prompts.py (+67 -0)
- requirements_cpu.txt (+18 -0)
app.py
CHANGED
@@ -68,7 +68,7 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
     if torch_device is None:
         torch_device = chatf.torch_device
 
-    if model_type == "
+    if model_type == "Phi 3 Mini (larger, slow)":
         if torch_device == "cuda":
             gpu_config.update_gpu(gpu_layers)
             print("Loading with", gpu_config.n_gpu_layers, "model layers sent to GPU.")
@@ -84,8 +84,8 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
         try:
             model = Llama(
                 model_path=hf_hub_download(
-                    repo_id=os.environ.get("REPO_ID", "TheBloke/Mistral-7B-OpenOrca-GGUF"),
-                    filename=os.environ.get("MODEL_FILE", "mistral-7b-openorca.Q4_K_M.gguf"),
+                    repo_id=os.environ.get("REPO_ID", "QuantFactory/Phi-3-mini-128k-instruct-GGUF"),# "QuantFactory/Phi-3-mini-128k-instruct-GGUF"), # "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF-v2"), #"microsoft/Phi-3-mini-4k-instruct-gguf"),#"TheBloke/Mistral-7B-OpenOrca-GGUF"),
+                    filename=os.environ.get("MODEL_FILE", "Phi-3-mini-128k-instruct.Q4_K_M.gguf") #"Phi-3-mini-128k-instruct.Q4_K_M.gguf") #"Meta-Llama-3-8B-Instruct-v2.Q6_K.gguf") #"Phi-3-mini-4k-instruct-q4.gguf")#"mistral-7b-openorca.Q4_K_M.gguf"),
                 ),
                 **vars(gpu_config) # change n_gpu_layers if you have more or less VRAM
             )
@@ -95,8 +95,8 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
             print(e)
             model = Llama(
                 model_path=hf_hub_download(
-                    repo_id=os.environ.get("REPO_ID", "TheBloke/Mistral-7B-OpenOrca-GGUF"),
-                    filename=os.environ.get("MODEL_FILE", "mistral-7b-openorca.Q4_K_M.gguf"),
+                    repo_id=os.environ.get("REPO_ID", "QuantFactory/Phi-3-mini-128k-instruct-GGUF"), #"QuantFactory/Phi-3-mini-128k-instruct-GGUF"), #, "microsoft/Phi-3-mini-4k-instruct-gguf"),#"QuantFactory/Meta-Llama-3-8B-Instruct-GGUF-v2"), #"microsoft/Phi-3-mini-4k-instruct-gguf"),#"TheBloke/Mistral-7B-OpenOrca-GGUF"),
+                    filename=os.environ.get("MODEL_FILE", "Phi-3-mini-128k-instruct.Q4_K_M.gguf"), # "Phi-3-mini-128k-instruct.Q4_K_M.gguf") # , #"Meta-Llama-3-8B-Instruct-v2.Q6_K.gguf") #"Phi-3-mini-4k-instruct-q4.gguf"),#"mistral-7b-openorca.Q4_K_M.gguf"),
                 ),
                 **vars(cpu_config)
             )
@@ -113,14 +113,14 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
 
         if torch_device == "cuda":
             if "flan" in model_name:
-                model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto"
+                model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
             else:
-                model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto"
+                model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
         else:
             if "flan" in model_name:
-                model = AutoModelForSeq2SeqLM.from_pretrained(model_name
+                model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float16)
             else:
-                model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True
+                model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16)
 
         tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length = chatf.context_length)
 
@@ -138,7 +138,7 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
     return model_type, load_confirmation, model_type
 
 # Both models are loaded on app initialisation so that users don't have to wait for the models to be downloaded
-model_type = "
+model_type = "Phi 3 Mini (larger, slow)"
 load_model(model_type, chatf.gpu_layers, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)
 
 model_type = "Flan Alpaca (small, fast)"
@@ -180,7 +180,7 @@ with block:
 
     gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
 
-    gr.Markdown("Chat with PDF, web page or (new) csv/Excel documents. The default is a small model (Flan Alpaca), that can only answer specific questions that are answered in the text. It cannot give overall impressions of, or summarise the document. The alternative (
+    gr.Markdown("Chat with PDF, web page or (new) csv/Excel documents. The default is a small model (Flan Alpaca), that can only answer specific questions that are answered in the text. It cannot give overall impressions of, or summarise the document. The alternative (Phi 3 Mini (larger, slow)), can reason a little better, but is much slower (See Advanced tab).\n\nBy default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page, please select from the second tab. If switching topic, please click the 'Clear chat' button.\n\nCaution: This is a public app. Please ensure that the document you upload is not sensitive is any way as other users may see it! Also, please note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
 
     with gr.Row():
         current_source = gr.Textbox(label="Current data source(s)", value="Lambeth_2030-Our_Future_Our_Lambeth.pdf", scale = 10)
@@ -235,9 +235,9 @@ with block:
 
     with gr.Tab("Advanced features"):
         out_passages = gr.Slider(minimum=1, value = 2, maximum=10, step=1, label="Choose number of passages to retrieve from the document. Numbers greater than 2 may lead to increased hallucinations or input text being truncated.")
-        temp_slide = gr.Slider(minimum=0.1, value = 0.
+        temp_slide = gr.Slider(minimum=0.1, value = 0.5, maximum=1, step=0.1, label="Choose temperature setting for response generation.")
         with gr.Row():
-            model_choice = gr.Radio(label="Choose a chat model", value="Flan Alpaca (small, fast)", choices = ["Flan Alpaca (small, fast)", "
+            model_choice = gr.Radio(label="Choose a chat model", value="Flan Alpaca (small, fast)", choices = ["Flan Alpaca (small, fast)", "Phi 3 Mini (larger, slow)"])
            change_model_button = gr.Button(value="Load model", scale=0)
        with gr.Accordion("Choose number of model layers to send to GPU (WARNING: please don't modify unless you are sure you have a GPU).", open = False):
            gpu_layer_choice = gr.Slider(label="Choose number of model layers to send to GPU.", value=0, minimum=0, maximum=100, step = 1, visible=True)
@@ -246,7 +246,7 @@ with block:
 
 
    gr.HTML(
-        "<center>This app is based on the models Flan Alpaca and 
+        "<center>This app is based on the models Flan Alpaca and Phi 3 Mini. It powered by Gradio, Transformers, and Llama.cpp.</a></center>"
    )
 
    examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
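For context, a minimal standalone sketch of the load path these hunks set up: the REPO_ID and MODEL_FILE environment variables override the default Phi-3-mini-128k GGUF, the file is fetched from the Hugging Face Hub, and llama-cpp-python loads it. The n_ctx and n_gpu_layers values below are illustrative assumptions, not taken from the commit (the app passes its own gpu_config/cpu_config instead).

import os
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Same override mechanism as in load_model(): env vars first, Phi 3 Mini as the fallback.
repo_id = os.environ.get("REPO_ID", "QuantFactory/Phi-3-mini-128k-instruct-GGUF")
model_file = os.environ.get("MODEL_FILE", "Phi-3-mini-128k-instruct.Q4_K_M.gguf")

# Download (or reuse the cached copy of) the quantised model file.
model_path = hf_hub_download(repo_id=repo_id, filename=model_file)

# n_gpu_layers=0 keeps inference on the CPU; raise it if you have VRAM to spare.
llm = Llama(model_path=model_path, n_gpu_layers=0, n_ctx=4096)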
chatfuncs/chatfuncs.py
CHANGED
@@ -41,6 +41,8 @@ from gensim.similarities import SparseMatrixSimilarity
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 
+from chatfuncs.prompts import instruction_prompt_template_alpaca, instruction_prompt_mistral_orca, instruction_prompt_phi3, instruction_prompt_llama3
+
 import gradio as gr
 
 torch.cuda.empty_cache()
@@ -86,10 +88,10 @@ print("CPU threads:", threads)
 temperature: float = 0.1
 top_k: int = 3
 top_p: float = 1
-repetition_penalty: float = 1.
+repetition_penalty: float = 1.15
 flan_alpaca_repetition_penalty: float = 1.3
 last_n_tokens: int = 64
-max_new_tokens: int = 
+max_new_tokens: int = 1024
 seed: int = 42
 reset: bool = False
 stream: bool = True
@@ -192,66 +194,12 @@ def base_prompt_templates(model_type = "Flan Alpaca (small, fast)"):
         input_variables=["page_content"]
     )
 
-    # The main prompt:
-
-    instruction_prompt_template_alpaca_quote = """### Instruction:
-    Quote directly from the SOURCE below that best answers the QUESTION. Only quote full sentences in the correct order. If you cannot find an answer, start your response with "My best guess is: ".
-
-    CONTENT: {summaries}
-    QUESTION: {question}
-
-    Response:"""
-
-    instruction_prompt_template_alpaca = """### Instruction:
-    ### User:
-    Answer the QUESTION using information from the following CONTENT.
-    CONTENT: {summaries}
-    QUESTION: {question}
-
-    Response:"""
-
-
-    instruction_prompt_template_wizard_orca = """### HUMAN:
-    Answer the QUESTION below based on the CONTENT. Only refer to CONTENT that directly answers the question.
-    CONTENT - {summaries}
-    QUESTION - {question}
-    ### RESPONSE:
-    """
-
-
-    instruction_prompt_template_orca = """
-    ### System:
-    You are an AI assistant that follows instruction extremely well. Help as much as you can.
-    ### User:
-    Answer the QUESTION with a short response using information from the following CONTENT.
-    QUESTION: {question}
-    CONTENT: {summaries}
-
-    ### Response:"""
-
-    instruction_prompt_template_orca_quote = """
-    ### System:
-    You are an AI assistant that follows instruction extremely well. Help as much as you can.
-    ### User:
-    Quote text from the CONTENT to answer the QUESTION below.
-    QUESTION: {question}
-    CONTENT: {summaries}
-    ### Response:
-    """
-
-
-    instruction_prompt_mistral_orca = """<|im_start|>system\n
-    You are an AI assistant that follows instruction extremely well. Help as much as you can.
-    <|im_start|>user\n
-    Answer the QUESTION using information from the following CONTENT. Respond with short answers that directly answer the question.
-    CONTENT: {summaries}
-    QUESTION: {question}\n
-    Answer:<|im_end|>"""
+    # The main prompt:
 
     if model_type == "Flan Alpaca (small, fast)":
         INSTRUCTION_PROMPT=PromptTemplate(template=instruction_prompt_template_alpaca, input_variables=['question', 'summaries'])
-    elif model_type == "
-        INSTRUCTION_PROMPT=PromptTemplate(template=
+    elif model_type == "Phi 3 Mini (larger, slow)":
+        INSTRUCTION_PROMPT=PromptTemplate(template=instruction_prompt_phi3, input_variables=['question', 'summaries'])
 
     return INSTRUCTION_PROMPT, CONTENT_PROMPT
 
@@ -402,7 +350,7 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type,
         print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
         print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
 
-    elif model_type == "
+    elif model_type == "Phi 3 Mini (larger, slow)":
         #tokens = model.tokenize(full_prompt)
 
         gen_config = CtransGenGenerationConfig()
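A rough sketch of how the relocated templates are used after this change, assuming the app's environment (chatfuncs/prompts.py importable and LangChain installed): the string is wrapped in a PromptTemplate exactly as the hunk above does, then filled with the retrieved passages and the user question. The example values are placeholders.

from langchain.prompts import PromptTemplate

from chatfuncs.prompts import instruction_prompt_phi3

# Same wrapping as in base_prompt_templates() for the "Phi 3 Mini (larger, slow)" choice.
INSTRUCTION_PROMPT = PromptTemplate(
    template=instruction_prompt_phi3,
    input_variables=["question", "summaries"],
)

# Render the final prompt string that would be sent to the GGUF model.
print(INSTRUCTION_PROMPT.format(
    question="What is the borough plan's main goal?",
    summaries="Passage(s) retrieved from the loaded document.",
))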
chatfuncs/prompts.py
ADDED
@@ -0,0 +1,67 @@
+instruction_prompt_template_alpaca_quote = """### Instruction:
+Quote directly from the SOURCE below that best answers the QUESTION. Only quote full sentences in the correct order. If you cannot find an answer, start your response with "My best guess is: ".
+
+CONTENT: {summaries}
+QUESTION: {question}
+
+Response:"""
+
+instruction_prompt_template_alpaca = """### Instruction:
+### User:
+Answer the QUESTION using information from the following CONTENT.
+CONTENT: {summaries}
+QUESTION: {question}
+
+Response:"""
+
+
+instruction_prompt_template_wizard_orca = """### HUMAN:
+Answer the QUESTION below based on the CONTENT. Only refer to CONTENT that directly answers the question.
+CONTENT - {summaries}
+QUESTION - {question}
+### RESPONSE:
+"""
+
+
+instruction_prompt_template_orca = """
+### System:
+You are an AI assistant that follows instruction extremely well. Help as much as you can.
+### User:
+Answer the QUESTION with a short response using information from the following CONTENT.
+QUESTION: {question}
+CONTENT: {summaries}
+
+### Response:"""
+
+instruction_prompt_template_orca_quote = """
+### System:
+You are an AI assistant that follows instruction extremely well. Help as much as you can.
+### User:
+Quote text from the CONTENT to answer the QUESTION below.
+QUESTION: {question}
+CONTENT: {summaries}
+### Response:
+"""
+
+
+instruction_prompt_mistral_orca = """<|im_start|>system\n
+You are an AI assistant that follows instruction extremely well. Help as much as you can.
+<|im_start|>user\n
+Answer the QUESTION using information from the following CONTENT. Respond with short answers that directly answer the question.
+CONTENT: {summaries}
+QUESTION: {question}\n
+Answer:<|im_end|>"""
+
+instruction_prompt_phi3 = """<|user|>\n
+Answer the QUESTION using information from the following CONTENT. Respond with short answers that directly answer the question.\n
+CONTENT: {summaries}\n
+QUESTION: {question}\n
+Answer:<|end|>\n
+<|assistant|>"""
+
+instruction_prompt_llama3 = """<|start_header_id|>system<|end_header_id|>\n
+You are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n
+Answer the QUESTION using information from the following CONTENT. Respond with short answers that directly answer the question.\n
+CONTENT: {summaries}\n
+QUESTION: {question}\n
+Answer:<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"""
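A quick way to see what the two new chat formats expand to, using placeholder values (note that the \n escapes inside the triple-quoted strings add blank lines on top of the literal line breaks):

from chatfuncs.prompts import instruction_prompt_phi3, instruction_prompt_llama3

fields = dict(
    summaries="Example passage pulled from the loaded document.",
    question="What does the passage say?",
)

# Phi 3 uses <|user|> / <|end|> / <|assistant|> markers; Llama 3 uses header-id tokens.
print(instruction_prompt_phi3.format(**fields))
print(instruction_prompt_llama3.format(**fields))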
requirements_cpu.txt
ADDED
@@ -0,0 +1,18 @@
+langchain
+langchain-community
+beautifulsoup4
+pandas
+transformers==4.34.0
+llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+torch
+sentence_transformers==2.2.2
+faiss-cpu==1.7.4
+pypdf
+python-docx
+keybert
+span_marker
+gensim
+gradio==3.50.2
+gradio_client
+nltk
+scipy<1.13
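A small, optional sanity check for the CPU-only environment this file targets (package names only; exact versions are whatever pip resolves from the file, and llama-cpp-python comes from the extra index URL as a prebuilt CPU wheel):

import torch
import langchain, transformers, faiss, gradio  # noqa: F401
from llama_cpp import Llama  # noqa: F401

# On a CPU-only host this is expected to print False.
print("CUDA available:", torch.cuda.is_available())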