Sean-Case committed on
Commit d5a8385
1 Parent(s): 9aef340

Stop generation button. Better model load. Trying one source that's longer.

Files changed (2)
  1. app.py +27 -10
  2. chatfuncs/chatfuncs.py +72 -22
app.py CHANGED
@@ -13,7 +13,6 @@ from langchain.vectorstores import FAISS
 import gradio as gr
 
 from transformers import AutoTokenizer
-from dataclasses import asdict, dataclass
 
 # Alternative model sources
 from ctransformers import AutoModelForCausalLM
@@ -83,7 +82,7 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
     if model_type == "Orca Mini":
 
         gpu_config.update_gpu(gpu_layers)
-        cpu_config.update_gpu(0)
+        cpu_config.update_gpu(gpu_layers)
 
         print("Loading with", cpu_config.gpu_layers, "model layers sent to GPU.")
 
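The one-line change above is part of the "better model load" in the commit message: previously the CPU config was always reset to 0 GPU layers, so the fallback path silently ignored the user's slider value. A minimal sketch of the config pattern this relies on; `update_gpu` is not defined anywhere in this diff, so its body below is an assumption based on how it is called:

    # Hypothetical sketch: the real CtransInitConfig_* classes live in
    # chatfuncs/chatfuncs.py, but update_gpu's definition is not shown here.
    class InitConfig:
        def __init__(self, gpu_layers=0):
            self.gpu_layers = gpu_layers

        def update_gpu(self, new_value):
            # Mutating the instance means a later **vars(config) sees the change.
            self.gpu_layers = new_value

    gpu_config = InitConfig()
    cpu_config = InitConfig()
    gpu_config.update_gpu(6)
    cpu_config.update_gpu(6)      # the fix: keep both configs on the same value
    print(cpu_config.gpu_layers)  # 6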
@@ -92,8 +91,13 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
 
         try:
             model = AutoModelForCausalLM.from_pretrained('juanjgit/orca_mini_3B-GGUF', model_type='llama', model_file='orca-mini-3b.q4_0.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
+            #model = AutoModelForCausalLM.from_pretrained('Aryanne/Sheared-LLaMA-1.3B-gguf', model_type='llama', model_file='q8_0-sheared-llama-1.3b.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
+            #model = AutoModelForCausalLM.from_pretrained('TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF', model_type='llama', model_file='tinyllama-1.1b-1t-openorca.Q8_0.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
         except:
             model = AutoModelForCausalLM.from_pretrained('juanjgit/orca_mini_3B-GGUF', model_type='llama', model_file='orca-mini-3b.q4_0.gguf', **vars(cpu_config)) #**asdict(CtransRunConfig_gpu())
+            #model = AutoModelForCausalLM.from_pretrained('Aryanne/Sheared-LLaMA-1.3B-gguf', model_type='llama', model_file='q8_0-sheared-llama-1.3b.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
+            #model = AutoModelForCausalLM.from_pretrained('TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF', model_type='llama', model_file='tinyllama-1.1b-1t-openorca.Q8_0.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
+
 
         tokenizer = []
 
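The try/except pair gives a GPU-first load with a CPU fallback, and `**vars(config)` is what turns the plain config object into keyword arguments for ctransformers. A condensed sketch of the same pattern (same model identifiers as the hunk; the loop and the narrowed `except Exception` are editorial, the committed code uses a bare `except:`):

    from ctransformers import AutoModelForCausalLM

    def load_gguf_model(gpu_config, cpu_config):
        # Try GPU settings first; fall back to the CPU config if that fails.
        last_err = None
        for config in (gpu_config, cpu_config):
            try:
                return AutoModelForCausalLM.from_pretrained(
                    'juanjgit/orca_mini_3B-GGUF',
                    model_type='llama',
                    model_file='orca-mini-3b.q4_0.gguf',
                    **vars(config))  # instance attributes become kwargs
            except Exception as err:
                last_err = err
        raise last_err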
@@ -126,8 +130,10 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
     chatf.tokenizer = tokenizer
     chatf.model_type = model_type
 
-    print("Finished loading model: ", model_type)
-    return model_type
+    load_confirmation = "Finished loading model: " + model_type
+
+    print(load_confirmation)
+    return model_type, load_confirmation
 
 # Both models are loaded on app initialisation so that users don't have to wait for the models to be downloaded
 model_type = "Orca Mini"
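`load_model` now returns a confirmation string as a second value so the UI can show load status; the "Advanced features" tab below routes it into a `load_text` textbox. A minimal two-output sketch of that wiring (the stub body is hypothetical, component names follow the diff):

    import gradio as gr

    def load_model_stub(model_type, gpu_layers):
        # Hypothetical stand-in for load_model: (state value, status message).
        return model_type, "Finished loading model: " + model_type

    with gr.Blocks() as demo:
        model_choice = gr.Radio(choices=["Flan Alpaca", "Orca Mini"], value="Flan Alpaca")
        gpu_layer_choice = gr.Slider(minimum=0, maximum=6, step=1, value=0)
        load_text = gr.Text(label="Load status")
        model_type_state = gr.State("Flan Alpaca")
        change_model_button = gr.Button("Load model")
        # One function, two outputs: the hidden state and the visible status box.
        change_model_button.click(load_model_stub,
                                  inputs=[model_choice, gpu_layer_choice],
                                  outputs=[model_type_state, load_text])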
@@ -173,7 +179,7 @@ with block:
 
     gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
 
-    gr.Markdown("Chat with PDF or web page documents. The default is a small model (Flan Alpaca), that can only answer specific questions that are answered in the text. It cannot give overall impressions of, or summarise the document. The alternative (Orca Mini), can reason a little better, but is much slower (See Advanced tab).\n\nBy default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page, please select from the second tab. If switching topic, please click the 'Clear chat' button.\n\nCaution: This is a public app. Likes and dislike responses will be saved to disk to improve the model. Please ensure that the document you upload is not sensitive in any way as other users may see it! Also, please note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
+    gr.Markdown("Chat with PDF or web page documents. The default is a small model (Flan Alpaca), that can only answer specific questions that are answered in the text. It cannot give overall impressions of, or summarise the document. The alternative (Orca Mini), can reason a little better, but is much slower (See Advanced tab).\n\nBy default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page, please select from the second tab. If switching topic, please click the 'Clear chat' button.\n\nCaution: This is a public app. Please ensure that the document you upload is not sensitive in any way as other users may see it! Also, please note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
 
     current_source = gr.Textbox(label="Current data source that is loaded into the app", value="Lambeth_2030-Our_Future_Our_Lambeth.pdf")
 
@@ -181,8 +187,8 @@ with block:
 
     with gr.Row():
         chat_height = 500
-        chatbot = gr.Chatbot(height=chat_height, avatar_images=('user.jfif', 'bot.jpg'),bubble_full_width = False)
-        sources = gr.HTML(value = "Source paragraphs where I looked for answers will appear here", height=chat_height)
+        chatbot = gr.Chatbot(height=chat_height, avatar_images=('user.jfif', 'bot.jpg'),bubble_full_width = False, scale = 1)
+        sources = gr.HTML(value = "Source paragraphs where I looked for answers will appear here", height=chat_height, scale = 2)
 
     with gr.Row():
         message = gr.Textbox(
@@ -191,7 +197,8 @@ with block:
         )
     with gr.Row():
         submit = gr.Button(value="Send message", variant="secondary", scale = 1)
-        clear = gr.Button(value="Clear chat", variant="secondary", scale=0)
+        clear = gr.Button(value="Clear chat", variant="secondary", scale=0)
+        stop = gr.Button(value="Stop generating", variant="secondary", scale=0)
 
     examples_set = gr.Radio(label="Examples for the Lambeth Borough Plan",
         #value = "What were the five pillars of the previous borough plan?",
@@ -220,7 +227,10 @@ with block:
 
     with gr.Tab("Advanced features"):
         model_choice = gr.Radio(label="Choose a chat model", value="Flan Alpaca", choices = ["Flan Alpaca", "Orca Mini"])
-        gpu_layer_choice = gr.Slider(label="Choose number of model layers to send to GPU (please don't change if you don't know what you're doing).", value=0, minimum=0, maximum=6, step = 1, scale = 0, visible=False)
+        with gr.Row():
+            gpu_layer_choice = gr.Slider(label="Choose number of model layers to send to GPU (WARNING: please don't modify unless you have a GPU).", value=0, minimum=0, maximum=6, step = 1, visible=True)
+            change_model_button = gr.Button(value="Load model", scale=0)
+        load_text = gr.Text(label="Load status")
 
     gr.HTML(
         "<center>This app is based on the models Flan Alpaca and Orca Mini. It is powered by Gradio, Transformers, Ctransformers, and Langchain.</center>"
@@ -228,7 +238,11 @@ with block:
 
     examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
 
-    model_choice.change(fn=load_model, inputs=[model_choice, gpu_layer_choice], outputs = [model_type_state])
+    change_model_button.click(fn=chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
+        then(fn=load_model, inputs=[model_choice, gpu_layer_choice], outputs = [model_type_state, load_text]).\
+        then(lambda: chatf.restore_interactivity(), None, [message], queue=False).\
+        then(chatf.clear_chat, inputs=[chat_history_state, sources, message, current_topic], outputs=[chat_history_state, sources, message, current_topic]).\
+        then(lambda: None, None, chatbot, queue=False)
 
     # Load in a pdf
     load_pdf_click = load_pdf.click(ing.parse_file, inputs=[in_pdf], outputs=[ingest_text, current_source]).\
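This is the other half of the "better model load": switching models is now an explicit button press rather than a `change` event, and the `.click().then()` chain runs its steps strictly in order (lock the input box, load, unlock, clear stale chat state). A small self-contained sketch of sequential chaining:

    import gradio as gr
    import time

    def slow_load():
        time.sleep(2)  # stands in for downloading/loading a model
        return "model loaded"

    with gr.Blocks() as demo:
        status = gr.Text(label="Load status")
        btn = gr.Button("Load model")
        # Each .then() starts only after the previous step finishes, so the UI
        # can be locked, the slow work done, and the UI restored, in that order.
        btn.click(lambda: "loading...", None, status).\
            then(slow_load, None, status).\
            then(lambda: "ready", None, status)

    demo.queue().launch()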
@@ -259,6 +273,9 @@ with block:
         then(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
         then(lambda: chatf.restore_interactivity(), None, [message], queue=False)
 
+    # Stop box
+    stop.click(fn=None, inputs=None, outputs=None, cancels=[response_click, response_enter])
+
     # Clear box
     clear.click(chatf.clear_chat, inputs=[chat_history_state, sources, message, current_topic], outputs=[chat_history_state, sources, message, current_topic])
     clear.click(lambda: None, None, chatbot, queue=False)
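The stop button works because `response_click` and `response_enter`, evidently the event handles returned when the send events are wired up earlier in the file, are passed to `cancels=`, which tells Gradio to abort those in-flight queued jobs. A minimal runnable sketch of the same mechanism; note that cancellation requires the events to run on the queue:

    import time
    import gradio as gr

    def stream_numbers():
        # A streaming generator standing in for token-by-token model output.
        for i in range(100):
            time.sleep(0.1)
            yield str(i)

    with gr.Blocks() as demo:
        out = gr.Textbox()
        start = gr.Button("Start")
        stop = gr.Button("Stop")
        run_event = start.click(stream_numbers, None, out)  # keep the handle
        # fn=None: the stop button does no work itself, it only cancels the job.
        stop.click(fn=None, inputs=None, outputs=None, cancels=[run_event])

    demo.queue().launch()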
chatfuncs/chatfuncs.py CHANGED
@@ -69,7 +69,7 @@ kw_model = pipeline("feature-extraction", model="sentence-transformers/all-MiniL
 
 if torch.cuda.is_available():
     torch_device = "cuda"
-    gpu_layers = 6
+    gpu_layers = 0
 else:
     torch_device = "cpu"
     gpu_layers = 0
@@ -82,25 +82,38 @@ print("CPU threads:", threads)
 temperature: float = 0.1
 top_k: int = 3
 top_p: float = 1
-repetition_penalty: float = 1.05
+repetition_penalty: float = 1.3
 flan_alpaca_repetition_penalty: float = 1.3
+tinyllama_repetition_penalty: float = 1.5
 last_n_tokens: int = 64
-max_new_tokens: int = 125
+max_new_tokens: int = 512
 seed: int = 42
 reset: bool = False
 stream: bool = True
 threads: int = threads
-batch_size:int = 1024
-context_length:int = 4096
+batch_size:int = 256
+context_length:int = 2048
 sample = True
 
 
 class CtransInitConfig_gpu:
-    def __init__(self, temperature=0.1, top_k=3, top_p=1, repetition_penalty=1.05, last_n_tokens=64, max_new_tokens=125, seed=42, reset=False, stream=True, threads=None, batch_size=1024, context_length=4096, gpu_layers=None):
+    def __init__(self, temperature=temperature,
+                 top_k=top_k,
+                 top_p=top_p,
+                 repetition_penalty=repetition_penalty,
+                 last_n_tokens=last_n_tokens,
+                 max_new_tokens=max_new_tokens,
+                 seed=seed,
+                 reset=reset,
+                 stream=stream,
+                 threads=threads,
+                 batch_size=batch_size,
+                 context_length=context_length,
+                 gpu_layers=gpu_layers):
         self.temperature = temperature
         self.top_k = top_k
         self.top_p = top_p
-        self.repetition_penalty = repetition_penalty
+        self.repetition_penalty = repetition_penalty# repetition_penalty
         self.last_n_tokens = last_n_tokens
         self.max_new_tokens = max_new_tokens
         self.seed = seed
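One subtlety of the new signature style: a default like `temperature=temperature` is evaluated once, when the `def` statement runs, so it snapshots the module-level setting at import time rather than tracking later changes. A small demonstration:

    temperature = 0.1

    class Config:
        # The default below is bound now, while the class body is defined.
        def __init__(self, temperature=temperature):
            self.temperature = temperature

    temperature = 0.9             # later reassignment of the module-level name...
    print(Config().temperature)   # ...prints 0.1: the default did not follow it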
@@ -124,17 +137,38 @@ gpu_config = CtransInitConfig_gpu()
 cpu_config = CtransInitConfig_cpu()
 
 
-@dataclass
+#@dataclass
+#class CtransGenGenerationConfig:
+#    top_k: int = top_k
+#    top_p: float = top_p
+#    temperature: float = temperature
+#    repetition_penalty: float = tinyllama_repetition_penalty
+#    last_n_tokens: int = last_n_tokens
+#    seed: int = seed
+#    batch_size:int = batch_size
+#    threads: int = threads
+#    reset: bool = True
+
 class CtransGenGenerationConfig:
-    top_k: int = top_k
-    top_p: float = top_p
-    temperature: float = temperature
-    repetition_penalty: float = repetition_penalty
-    last_n_tokens: int = last_n_tokens
-    seed: int = seed
-    batch_size:int = batch_size
-    threads: int = threads
-    reset: bool = True
+    def __init__(self, temperature=temperature,
+                 top_k=top_k,
+                 top_p=top_p,
+                 repetition_penalty=repetition_penalty,
+                 last_n_tokens=last_n_tokens,
+                 seed=seed,
+                 threads=threads,
+                 batch_size=batch_size,
+                 reset=True
+                 ):
+        self.temperature = temperature
+        self.top_k = top_k
+        self.top_p = top_p
+        self.repetition_penalty = repetition_penalty# repetition_penalty
+        self.last_n_tokens = last_n_tokens
+        self.seed = seed
+        self.threads = threads
+        self.batch_size = batch_size
+        self.reset = reset
 
 # Vectorstore funcs
 
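The replaced `@dataclass` produced generation kwargs via `dataclasses.asdict()`; the plain class produces them via `vars()` on the instance, which works because `__init__` now copies every field onto `self`. Both routes yield the same keyword dict, as this comparison shows:

    from dataclasses import dataclass, asdict

    @dataclass
    class GenConfigDataclass:
        top_k: int = 3
        temperature: float = 0.1

    class GenConfigPlain:
        def __init__(self, top_k=3, temperature=0.1):
            self.top_k = top_k
            self.temperature = temperature

    # Both produce the same dict, ready for **-unpacking into model.generate():
    print(asdict(GenConfigDataclass()))  # {'top_k': 3, 'temperature': 0.1}
    print(vars(GenConfigPlain()))        # {'top_k': 3, 'temperature': 0.1}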
@@ -199,6 +233,12 @@ def base_prompt_templates(model_type = "Flan Alpaca"):
 
 Response:"""
 
+    instruction_prompt_template_sheared_llama = """Answer the QUESTION using information from the following CONTENT.
+CONTENT: {summaries}
+QUESTION: {question}
+
+Answer:"""
+
     instruction_prompt_template_orca = """
 ### System:
 You are an AI assistant that follows instruction extremely well. Help as much as you can.
@@ -215,7 +255,15 @@ def base_prompt_templates(model_type = "Flan Alpaca"):
 Answer the QUESTION using information from the following CONTENT. Respond with short answers that directly answer the question.
 CONTENT: {summaries}
 QUESTION: {question}\n
-<|im_end|>"""
+Answer:<|im_end|>"""
+
+    instruction_prompt_tinyllama_orca = """<|im_start|>system\n
+You are an AI assistant that follows instruction extremely well. Help as much as you can.
+<|im_start|>user\n
+Answer the QUESTION using information from the following CONTENT. Only quote text that directly answers the question and nothing more. If you can't find an answer to the question, respond with "Sorry, I can't find an answer to that question.".
+CONTENT: {summaries}
+QUESTION: {question}\n
+Answer:<|im_end|>"""
 
     if model_type == "Flan Alpaca":
         INSTRUCTION_PROMPT=PromptTemplate(template=instruction_prompt_template_alpaca, input_variables=['question', 'summaries'])
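Each of these templates is later wrapped in a LangChain `PromptTemplate` with `question` and `summaries` as input variables, as the Flan Alpaca branch above shows. A quick illustration of how the placeholders get filled (shortened template; the `langchain.prompts` import path is assumed for the version in use):

    from langchain.prompts import PromptTemplate

    template = """CONTENT: {summaries}
    QUESTION: {question}
    Answer:"""
    prompt = PromptTemplate(template=template, input_variables=['question', 'summaries'])
    print(prompt.format(summaries="Lambeth 2030 sets out the borough's goals...",
                        question="What are the goals?"))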
@@ -233,12 +281,12 @@ def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content
     new_question_kworded = adapt_q_from_chat_history(question, chat_history, extracted_memory) # new_question_keywords,
 
 
-    docs_keep_as_doc, doc_df, docs_keep_out = hybrid_retrieval(new_question_kworded, vectorstore, embeddings, k_val = 5, out_passages = 2,
+    docs_keep_as_doc, doc_df, docs_keep_out = hybrid_retrieval(new_question_kworded, vectorstore, embeddings, k_val = 5, out_passages = 1,
                                                                vec_score_cut_off = 1, vec_weight = 1, bm25_weight = 1, svm_weight = 1)#,
                                                                #vectorstore=globals()["vectorstore"], embeddings=globals()["embeddings"])
 
     # Expand the found passages to the neighbouring context
-    docs_keep_as_doc, doc_df = get_expanded_passages(vectorstore, docs_keep_out, width=1)
+    docs_keep_as_doc, doc_df = get_expanded_passages(vectorstore, docs_keep_out, width=3)
 
     if docs_keep_as_doc == []:
         {"answer": "I'm sorry, I couldn't find a relevant answer to this question.", "sources":"I'm sorry, I couldn't find a relevant source for this question."}
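This hunk is the "trying one source that's longer" of the commit message: one retrieved passage (`out_passages = 1`) expanded further into its neighbouring chunks (`width=3`), instead of two passages each expanded by one chunk. With `context_length` now 2048 and `max_new_tokens` 512, a check along these lines can confirm the longer single source still fits the window (a sketch; `model.tokenize` as used for Orca Mini elsewhere in this file):

    context_length = 2048   # new ctransformers window size set above
    max_new_tokens = 512    # new generation budget set above

    def prompt_fits(model, full_prompt):
        # The prompt plus everything we intend to generate must fit the window.
        prompt_tokens = len(model.tokenize(full_prompt))
        return prompt_tokens + max_new_tokens <= context_length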
@@ -301,7 +349,7 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type):
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=sample,
-        repetition_penalty=flan_alpaca_repetition_penalty,
+        repetition_penalty=repetition_penalty,
         top_p=top_p,
         temperature=temperature,
         top_k=top_k
@@ -332,13 +380,15 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type):
     elif model_type == "Orca Mini":
         tokens = model.tokenize(full_prompt)
 
+        gen_config = CtransGenGenerationConfig()
+
         # Pull the generated text from the streamer, and update the model output.
         start = time.time()
         NUM_TOKENS=0
         print('-'*4+'Start Generation'+'-'*4)
 
         history[-1][1] = ""
-        for new_text in model.generate(tokens, **asdict(CtransGenGenerationConfig())): #CtransGen_generate(prompt=full_prompt)#, config=CtransGenGenerationConfig()): # #top_k=top_k, temperature=temperature, repetition_penalty=repetition_penalty,
+        for new_text in model.generate(tokens, **vars(gen_config)): #CtransGen_generate(prompt=full_prompt)#, config=CtransGenGenerationConfig()): # #top_k=top_k, temperature=temperature, repetition_penalty=repetition_penalty,
             if new_text == None: new_text = ""
             history[-1][1] += model.detokenize(new_text) #new_text
             NUM_TOKENS+=1
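The `start`/`NUM_TOKENS` bookkeeping around the loop suggests a tokens-per-second report after generation, though the hunk ends before it. A self-contained sketch of the same stream-and-time pattern around `model.generate` (the final report line is an assumption):

    import time

    def stream_answer(model, full_prompt, gen_config):
        # Stream tokens from a ctransformers model, detokenizing as they arrive.
        tokens = model.tokenize(full_prompt)
        start = time.time()
        num_tokens = 0
        answer = ""
        for new_token in model.generate(tokens, **vars(gen_config)):
            answer += model.detokenize(new_token)
            num_tokens += 1
        elapsed = time.time() - start
        print(f"{num_tokens} tokens in {elapsed:.1f}s "
              f"({num_tokens / elapsed:.2f} tokens/s)")  # assumed report format
        return answer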