seanpedrickcase committed on
Commit
232a079
1 Parent(s): 15476e8

Changed 'large' model to Phi 3 Mini GGUF 128k. Added requirements file for CPU. Put prompts in a separate file.

Files changed (4)
  1. app.py +14 -14
  2. chatfuncs/chatfuncs.py +8 -60
  3. chatfuncs/prompts.py +67 -0
  4. requirements_cpu.txt +18 -0
app.py CHANGED
@@ -68,7 +68,7 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
     if torch_device is None:
        torch_device = chatf.torch_device

-    if model_type == "Mistral Open Orca (larger, slow)":
+    if model_type == "Phi 3 Mini (larger, slow)":
        if torch_device == "cuda":
            gpu_config.update_gpu(gpu_layers)
            print("Loading with", gpu_config.n_gpu_layers, "model layers sent to GPU.")
@@ -84,8 +84,8 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
        try:
            model = Llama(
                model_path=hf_hub_download(
-                    repo_id=os.environ.get("REPO_ID", "TheBloke/Mistral-7B-OpenOrca-GGUF"),
-                    filename=os.environ.get("MODEL_FILE", "mistral-7b-openorca.Q4_K_M.gguf"),
+                    repo_id=os.environ.get("REPO_ID", "QuantFactory/Phi-3-mini-128k-instruct-GGUF"), # alternatives tried: "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF-v2", "microsoft/Phi-3-mini-4k-instruct-gguf", "TheBloke/Mistral-7B-OpenOrca-GGUF"
+                    filename=os.environ.get("MODEL_FILE", "Phi-3-mini-128k-instruct.Q4_K_M.gguf") # alternatives tried: "Meta-Llama-3-8B-Instruct-v2.Q6_K.gguf", "Phi-3-mini-4k-instruct-q4.gguf", "mistral-7b-openorca.Q4_K_M.gguf"
                ),
                **vars(gpu_config) # change n_gpu_layers if you have more or less VRAM
            )
@@ -95,8 +95,8 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
            print(e)
            model = Llama(
                model_path=hf_hub_download(
-                    repo_id=os.environ.get("REPO_ID", "TheBloke/Mistral-7B-OpenOrca-GGUF"),
-                    filename=os.environ.get("MODEL_FILE", "mistral-7b-openorca.Q4_K_M.gguf"),
+                    repo_id=os.environ.get("REPO_ID", "QuantFactory/Phi-3-mini-128k-instruct-GGUF"), # alternatives tried: "microsoft/Phi-3-mini-4k-instruct-gguf", "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF-v2", "TheBloke/Mistral-7B-OpenOrca-GGUF"
+                    filename=os.environ.get("MODEL_FILE", "Phi-3-mini-128k-instruct.Q4_K_M.gguf"), # alternatives tried: "Meta-Llama-3-8B-Instruct-v2.Q6_K.gguf", "Phi-3-mini-4k-instruct-q4.gguf", "mistral-7b-openorca.Q4_K_M.gguf"
                ),
                **vars(cpu_config)
            )
@@ -113,14 +113,14 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d

        if torch_device == "cuda":
            if "flan" in model_name:
-                model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")#, torch_dtype=torch.float16)
+                model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
            else:
-                model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")#, torch_dtype=torch.float16)
+                model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
        else:
            if "flan" in model_name:
-                model = AutoModelForSeq2SeqLM.from_pretrained(model_name)#, torch_dtype=torch.float16)
+                model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float16)
            else:
-                model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)#, torch_dtype=torch.float16)
+                model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16)

        tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length = chatf.context_length)

@@ -138,7 +138,7 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
    return model_type, load_confirmation, model_type

 # Both models are loaded on app initialisation so that users don't have to wait for the models to be downloaded
-model_type = "Mistral Open Orca (larger, slow)"
+model_type = "Phi 3 Mini (larger, slow)"
 load_model(model_type, chatf.gpu_layers, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)

 model_type = "Flan Alpaca (small, fast)"
@@ -180,7 +180,7 @@ with block:

    gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")

-    gr.Markdown("Chat with PDF, web page or (new) csv/Excel documents. The default is a small model (Flan Alpaca), that can only answer specific questions that are answered in the text. It cannot give overall impressions of, or summarise the document. The alternative (Mistral Open Orca (larger, slow)), can reason a little better, but is much slower (See Advanced tab).\n\nBy default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page, please select from the second tab. If switching topic, please click the 'Clear chat' button.\n\nCaution: This is a public app. Please ensure that the document you upload is not sensitive is any way as other users may see it! Also, please note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
+    gr.Markdown("Chat with PDF, web page or (new) csv/Excel documents. The default is a small model (Flan Alpaca) that can only answer specific questions that are answered in the text. It cannot give overall impressions of, or summarise, the document. The alternative (Phi 3 Mini (larger, slow)) can reason a little better, but is much slower (see the Advanced tab).\n\nBy default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page, please select from the second tab. If switching topic, please click the 'Clear chat' button.\n\nCaution: This is a public app. Please ensure that the document you upload is not sensitive in any way, as other users may see it! Also, please note that LLM chatbots may give incomplete or incorrect information, so please use with care.")

    with gr.Row():
        current_source = gr.Textbox(label="Current data source(s)", value="Lambeth_2030-Our_Future_Our_Lambeth.pdf", scale = 10)
@@ -235,9 +235,9 @@ with block:

    with gr.Tab("Advanced features"):
        out_passages = gr.Slider(minimum=1, value = 2, maximum=10, step=1, label="Choose number of passages to retrieve from the document. Numbers greater than 2 may lead to increased hallucinations or input text being truncated.")
-        temp_slide = gr.Slider(minimum=0.1, value = 0.1, maximum=1, step=0.1, label="Choose temperature setting for response generation.")
+        temp_slide = gr.Slider(minimum=0.1, value = 0.5, maximum=1, step=0.1, label="Choose temperature setting for response generation.")
        with gr.Row():
-            model_choice = gr.Radio(label="Choose a chat model", value="Flan Alpaca (small, fast)", choices = ["Flan Alpaca (small, fast)", "Mistral Open Orca (larger, slow)"])
+            model_choice = gr.Radio(label="Choose a chat model", value="Flan Alpaca (small, fast)", choices = ["Flan Alpaca (small, fast)", "Phi 3 Mini (larger, slow)"])
            change_model_button = gr.Button(value="Load model", scale=0)
        with gr.Accordion("Choose number of model layers to send to GPU (WARNING: please don't modify unless you are sure you have a GPU).", open = False):
            gpu_layer_choice = gr.Slider(label="Choose number of model layers to send to GPU.", value=0, minimum=0, maximum=100, step = 1, visible=True)
@@ -246,7 +246,7 @@ with block:


    gr.HTML(
-        "<center>This app is based on the models Flan Alpaca and Mistral Open Orca. It powered by Gradio, Transformers, and Llama.cpp.</a></center>"
+        "<center>This app is based on the models Flan Alpaca and Phi 3 Mini. It is powered by Gradio, Transformers, and Llama.cpp.</center>"
    )

    examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
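
For reference, the new default GGUF model can be fetched and loaded outside the app with the same two calls the diff relies on. This is a minimal sketch only; the n_ctx and n_gpu_layers values are illustrative and stand in for the app's gpu_config/cpu_config objects:

    # Minimal sketch of the load path used above (values are illustrative, not the app's config).
    import os
    from huggingface_hub import hf_hub_download
    from llama_cpp import Llama

    model_path = hf_hub_download(
        repo_id=os.environ.get("REPO_ID", "QuantFactory/Phi-3-mini-128k-instruct-GGUF"),
        filename=os.environ.get("MODEL_FILE", "Phi-3-mini-128k-instruct.Q4_K_M.gguf"),
    )

    # CPU-only load; raise n_gpu_layers only if a GPU build of llama-cpp-python is installed.
    llm = Llama(model_path=model_path, n_ctx=4096, n_gpu_layers=0)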
chatfuncs/chatfuncs.py CHANGED
@@ -41,6 +41,8 @@ from gensim.similarities import SparseMatrixSimilarity
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download

+from chatfuncs.prompts import instruction_prompt_template_alpaca, instruction_prompt_mistral_orca, instruction_prompt_phi3, instruction_prompt_llama3
+
 import gradio as gr

 torch.cuda.empty_cache()
@@ -86,10 +88,10 @@ print("CPU threads:", threads)
    temperature: float = 0.1
    top_k: int = 3
    top_p: float = 1
-    repetition_penalty: float = 1.3
+    repetition_penalty: float = 1.15
    flan_alpaca_repetition_penalty: float = 1.3
    last_n_tokens: int = 64
-    max_new_tokens: int = 256
+    max_new_tokens: int = 1024
    seed: int = 42
    reset: bool = False
    stream: bool = True
@@ -192,66 +194,12 @@ def base_prompt_templates(model_type = "Flan Alpaca (small, fast)"):
        input_variables=["page_content"]
    )

-    # The main prompt:
-
-    instruction_prompt_template_alpaca_quote = """### Instruction:
-    Quote directly from the SOURCE below that best answers the QUESTION. Only quote full sentences in the correct order. If you cannot find an answer, start your response with "My best guess is: ".
-
-    CONTENT: {summaries}
-    QUESTION: {question}
-
-    Response:"""
-
-    instruction_prompt_template_alpaca = """### Instruction:
-    ### User:
-    Answer the QUESTION using information from the following CONTENT.
-    CONTENT: {summaries}
-    QUESTION: {question}
-
-    Response:"""
-
-
-    instruction_prompt_template_wizard_orca = """### HUMAN:
-    Answer the QUESTION below based on the CONTENT. Only refer to CONTENT that directly answers the question.
-    CONTENT - {summaries}
-    QUESTION - {question}
-    ### RESPONSE:
-    """
-
-
-    instruction_prompt_template_orca = """
-    ### System:
-    You are an AI assistant that follows instruction extremely well. Help as much as you can.
-    ### User:
-    Answer the QUESTION with a short response using information from the following CONTENT.
-    QUESTION: {question}
-    CONTENT: {summaries}
-
-    ### Response:"""
-
-    instruction_prompt_template_orca_quote = """
-    ### System:
-    You are an AI assistant that follows instruction extremely well. Help as much as you can.
-    ### User:
-    Quote text from the CONTENT to answer the QUESTION below.
-    QUESTION: {question}
-    CONTENT: {summaries}
-    ### Response:
-    """
-
-
-    instruction_prompt_mistral_orca = """<|im_start|>system\n
-    You are an AI assistant that follows instruction extremely well. Help as much as you can.
-    <|im_start|>user\n
-    Answer the QUESTION using information from the following CONTENT. Respond with short answers that directly answer the question.
-    CONTENT: {summaries}
-    QUESTION: {question}\n
-    Answer:<|im_end|>"""
+    # The main prompt:

    if model_type == "Flan Alpaca (small, fast)":
        INSTRUCTION_PROMPT=PromptTemplate(template=instruction_prompt_template_alpaca, input_variables=['question', 'summaries'])
-    elif model_type == "Mistral Open Orca (larger, slow)":
-        INSTRUCTION_PROMPT=PromptTemplate(template=instruction_prompt_mistral_orca, input_variables=['question', 'summaries'])
+    elif model_type == "Phi 3 Mini (larger, slow)":
+        INSTRUCTION_PROMPT=PromptTemplate(template=instruction_prompt_phi3, input_variables=['question', 'summaries'])

    return INSTRUCTION_PROMPT, CONTENT_PROMPT

@@ -402,7 +350,7 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type,
        print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
        print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')

-    elif model_type == "Mistral Open Orca (larger, slow)":
+    elif model_type == "Phi 3 Mini (larger, slow)":
        #tokens = model.tokenize(full_prompt)

        gen_config = CtransGenGenerationConfig()
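
The widened generation settings above (max_new_tokens 1024, repetition_penalty 1.15) feed the llama-cpp-python call made for the Phi 3 Mini branch. A rough sketch of how such settings could map onto a streaming llama-cpp-python call follows; it assumes llm is the Llama instance from the loading sketch earlier, and note that llama-cpp-python names the parameters max_tokens and repeat_penalty:

    # Rough sketch only: mapping the config values above onto a llama-cpp-python call.
    full_prompt = "<|user|>\nCONTENT: ...\nQUESTION: What is the plan's vision?\nAnswer:<|end|>\n<|assistant|>"

    stream = llm(
        full_prompt,
        max_tokens=1024,      # max_new_tokens in the config above
        temperature=0.1,
        top_k=3,
        top_p=1.0,
        repeat_penalty=1.15,  # llama-cpp-python's name for repetition_penalty
        stream=True,
    )
    for chunk in stream:
        # each streamed chunk carries a fragment of the generated text
        print(chunk["choices"][0]["text"], end="", flush=True)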
chatfuncs/prompts.py ADDED
@@ -0,0 +1,67 @@
+instruction_prompt_template_alpaca_quote = """### Instruction:
+Quote directly from the SOURCE below that best answers the QUESTION. Only quote full sentences in the correct order. If you cannot find an answer, start your response with "My best guess is: ".
+
+CONTENT: {summaries}
+QUESTION: {question}
+
+Response:"""
+
+instruction_prompt_template_alpaca = """### Instruction:
+### User:
+Answer the QUESTION using information from the following CONTENT.
+CONTENT: {summaries}
+QUESTION: {question}
+
+Response:"""
+
+
+instruction_prompt_template_wizard_orca = """### HUMAN:
+Answer the QUESTION below based on the CONTENT. Only refer to CONTENT that directly answers the question.
+CONTENT - {summaries}
+QUESTION - {question}
+### RESPONSE:
+"""
+
+
+instruction_prompt_template_orca = """
+### System:
+You are an AI assistant that follows instruction extremely well. Help as much as you can.
+### User:
+Answer the QUESTION with a short response using information from the following CONTENT.
+QUESTION: {question}
+CONTENT: {summaries}
+
+### Response:"""
+
+instruction_prompt_template_orca_quote = """
+### System:
+You are an AI assistant that follows instruction extremely well. Help as much as you can.
+### User:
+Quote text from the CONTENT to answer the QUESTION below.
+QUESTION: {question}
+CONTENT: {summaries}
+### Response:
+"""
+
+
+instruction_prompt_mistral_orca = """<|im_start|>system\n
+You are an AI assistant that follows instruction extremely well. Help as much as you can.
+<|im_start|>user\n
+Answer the QUESTION using information from the following CONTENT. Respond with short answers that directly answer the question.
+CONTENT: {summaries}
+QUESTION: {question}\n
+Answer:<|im_end|>"""
+
+instruction_prompt_phi3 = """<|user|>\n
+Answer the QUESTION using information from the following CONTENT. Respond with short answers that directly answer the question.\n
+CONTENT: {summaries}\n
+QUESTION: {question}\n
+Answer:<|end|>\n
+<|assistant|>"""
+
+instruction_prompt_llama3 = """<|start_header_id|>system<|end_header_id|>\n
+You are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n
+Answer the QUESTION using information from the following CONTENT. Respond with short answers that directly answer the question.\n
+CONTENT: {summaries}\n
+QUESTION: {question}\n
+Answer:<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"""
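
The Phi 3 template above is the one base_prompt_templates() now selects for the larger model. A small sketch of filling it directly (the summaries and question values are placeholders, not app data):

    # Hedged example: filling the Phi 3 instruction template defined above.
    from chatfuncs.prompts import instruction_prompt_phi3

    example = instruction_prompt_phi3.format(
        summaries="Lambeth 2030 sets out the borough's priorities to 2030.",
        question="What period does the plan cover?",
    )
    # Prints a <|user|> ... <|end|> <|assistant|> formatted prompt ready for the GGUF model.
    print(example)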
requirements_cpu.txt ADDED
@@ -0,0 +1,18 @@
+langchain
+langchain-community
+beautifulsoup4
+pandas
+transformers==4.34.0
+llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+torch
+sentence_transformers==2.2.2
+faiss-cpu==1.7.4
+pypdf
+python-docx
+keybert
+span_marker
+gensim
+gradio==3.50.2
+gradio_client
+nltk
+scipy<1.13
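
The new file targets CPU-only environments and would typically be consumed with `pip install -r requirements_cpu.txt`; the extra index URL on the llama-cpp-python line points pip at prebuilt CPU wheels published for that package, so no local compilation should be needed (exact behaviour depends on the pip version in use).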