AFischer1985 committed
Commit 87e8aba
1 Parent(s): a97d3f8
Update run.py

run.py CHANGED
@@ -2,7 +2,7 @@
 # Title: Gradio Interface to LLM-chatbot with dynamic RAG-funcionality and ChromaDB
 # Author: Andreas Fischer
 # Date: October 10th, 2024
-# Last update: October
+# Last update: October 26th, 2024
 ##########################################################################################
 
 import os
@@ -16,17 +16,18 @@ import ocrmypdf #convertPDF
 from pypdf import PdfReader #convertPDF
 import re #format_prompt
 import gradio as gr # multimodal_response
-from huggingface_hub import InferenceClient #multimodal_response
-
+from huggingface_hub import InferenceClient # multimodal_response
+import json # multimodal_response (on-prem)
+import requests # multimodal_response (on-prem)
 
 #---------------------------------------------------
 # Specify models for text generation and embeddings
 #---------------------------------------------------
 
 myModel="mistralai/Mixtral-8x7b-instruct-v0.1"
-#myModel="princeton-nlp/gemma-2-9b-it-SimPO"
-#myModel="google/gemma-2-2b-it"
 #myModel="meta-llama/Llama-3.1-8B-Instruct"
+#myModel="QuantFactory/gemma-2-9b-it-SimPO-GGUF"
+#myModel="bartowski/gemma-2-9b-it-GGUF"
 #mod=myModel
 #tok=AutoTokenizer.from_pretrained(mod) #,token="hf_...")
 #cha=[{"role":"system","content":"A"},{"role":"user","content":"B"},{"role":"assistant","content":"C"}]
@@ -34,6 +35,27 @@ myModel="mistralai/Mixtral-8x7b-instruct-v0.1"
 #res=tok.apply_chat_template(cha)
 #print(tok.decode(res))
 
+if("GGUF" in myModel): # start Llama-cpp-server for GGUF-models on premises:
+    #modelPath="/home/af/gguf/models/bartowski/gemma-2-9b-it-GGUF/gemma-2-9b-it-Q4_K_M.gguf"
+    modelPath="/home/af/gguf/models/QuantFactory/gemma-2-9b-it-SimPO-GGUF/gemma-2-9b-it-SimPO.Q4_K_M.gguf"
+    if(os.path.exists(modelPath)==False):
+        #url="https://huggingface.co/bartowski/gemma-2-9b-it-GGUF/resolve/main/gemma-2-9b-it-Q4_K_M.gguf?download=true"
+        url="https://huggingface.co/QuantFactory/gemma-2-9b-it-SimPO-GGUF/resolve/main/gemma-2-9b-it-SimPO.Q4_K_M.gguf?download=true"
+        response = requests.get(url)
+        with open("./model.gguf", mode="wb") as file:
+            file.write(response.content)
+        print("Model downloaded")
+        modelPath="./model.gguf"
+    print(modelPath)
+    import subprocess
+    command = ["python3", "-m", "llama_cpp.server", "--model", modelPath, "--host", "0.0.0.0", "--port", "2600", "--n_threads", "4", "--n_gpu_layers","42"] #20
+    subprocess.Popen(command)
+    print("Server ready!")
+
+    url="http://0.0.0.0:2600/v1/completions"
+    body={"prompt":"test","max_tokens":1000, "echo":"False","stream":"False"} #e.g. Mixtral-Instruct
+    test=requests.post(url, json=body, stream=False)
+
 jina = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-de', trust_remote_code=True, torch_dtype=torch.bfloat16)
 #jira.save_pretrained("jinaai_jina-embeddings-v2-base-de")
 device='cuda:0' if torch.cuda.is_available() else 'cpu'
@@ -84,8 +106,8 @@ def format_prompt0(message, history):
 
 def format_prompt(message, history=None, system=None, RAGAddon=None, system2=None, zeichenlimit=None,historylimit=4, removeHTML=False,
         startOfString="<s>", template0=" [INST] {system} [/INST] </s>",template1=" [INST] {message} [/INST]",template2=" {response}</s>"): # mistralai/Mixtral-8x7B-Instruct-v0.1
-        #startOfString="<bos>",template0="<start_of_turn>user\n{system}<end_of_turn>\n<start_of_turn>model\n<end_of_turn>\n",template1="<start_of_turn>user\n{message}<end_of_turn>\n<start_of_turn>model\n",template2="<end_of_turn>\n"): # google/gemma-2-2b-it
-        #startOfString="", template0="<|start_header_id|>system<|end_header_id|>\n\n{system}\n<|eot_id|>", template1="<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id
+        #startOfString="<bos>",template0="<start_of_turn>user\n{system}<end_of_turn>\n<start_of_turn>model\n<end_of_turn>\n",template1="<start_of_turn>user\n{message}<end_of_turn>\n<start_of_turn>model\n",template2="{response}<end_of_turn>\n"): # google/gemma-2-2b-it
+        #startOfString="<|begin_of_text|><", template0="<|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n{system}\n<|eot_id|>", template1="<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", template2="{response}</eot_id>"): # meta-llama/Llama-3.1-8B-Instruct
     if zeichenlimit is None: zeichenlimit=1000000000 # :-)
     prompt = ""
     if RAGAddon is not None:
@@ -256,17 +278,144 @@ def add_doc(path, session):
     print(len(x))
     if(len(x)==0):
         chunkSize=40000
-        for i in range(round(len(corpus)/chunkSize+0.5)):
+        for i in range(round(len(corpus)/chunkSize+0.5)): #0 is first batch, 3 is last (incomplete) batch given 133497 texts
            print("embed batch "+str(i)+" of "+str(round(len(corpus)/chunkSize+0.5)))
            ids=list(range(i*chunkSize,(i*chunkSize+chunkSize)))
            batch=corpus[i*chunkSize:(i*chunkSize+chunkSize)]
            textIDs=[str(id) for id in ids[0:len(batch)]]
-           ids=[str(id+len(x)+1) for id in ids[0:len(batch)]]
+           ids=[str(id+len(x)+1) for id in ids[0:len(batch)]] # id refers to chromadb-unique ID
            collection.add(documents=batch, ids=ids,
-               metadatas=[{"date": str("2024-10-10")} for b in batch])
+               metadatas=[{"date": str("2024-10-10")} for b in batch]) #"textID":textIDs, "id":ids,
            print("finished batch "+str(i)+" of "+str(round(len(corpus)/40000+0.5)))
         now = datetime.now()
         gr.Info(f"Indexing complete!")
-        print(now-then)
+        print(now-then) #zu viel GB für sentences (GPU), bzw. 0:00:10.375087 für chunks
     return(collection)
 
+
+#--------------------------------------------------------
+# Function for response to user queries and pot. addenda
+#--------------------------------------------------------
+
+def multimodal_response(message, history, dropdown, hfToken, request: gr.Request):
+    print("def multimodal response!")
+    if(hfToken.startswith("hf_")): # use HF-hub with custom token if token is provided
+        inferenceClient = InferenceClient(model=myModel, token=hfToken)
+    else:
+        inferenceClient = InferenceClient(myModel)
+    global databases
+    if request:
+        session=request.session_hash
+    else:
+        session="0"
+    length=str(len(history))
+    print(databases)
+    if(not databases[-1][1]==session):
+        databases.append((date.today(),session))
+        #print(databases)
+    query=message["text"]
+    if(len(message["files"])>0): # is there at least one file attached?
+        collection=add_doc(message["files"][0], session)
+    else: # otherwise, you still want to get the collection with the session-based db
+        collection=add_doc(message["text"], session)
+    client = chromadb.PersistentClient(path=dbPath)
+    print(str(client.list_collections()))
+    x=collection.get(include=[])["ids"]
+    ragQuery=[format_prompt(query, history, historylimit=2,
+        #startOfString="", template0="{system}\n",template1="USER: {message}\n\n",template2="ASSISTANT: {response}\n\n") if len(history)>0 else query] # embed simply-formated dialogue
+        startOfString="", template1="{message}\n\n",template2="") if len(history)>0 else query] # embed simple string of User-queries only
+    context=collection.query(query_texts=ragQuery, n_results=3)
+    #context=["<Kontext "+str(i)+"> "+str(c)+"</Kontext "+str(i)+">" for i,c in enumerate(context["documents"][0])]
+    context=["Kontext "+str(i+1)+": \""+re.sub("\"","'",str(c))+"\"" for i,c in enumerate(context["documents"][0])]
+    gr.Info("Kontext:\n"+str(context))
+    generate_kwargs = dict(
+        temperature=float(0.9),
+        max_new_tokens=5000,
+        top_p=0.95,
+        repetition_penalty=1.0,
+        do_sample=True,
+        seed=42,
+    )
+    system="Mit Blick auf das folgende Gespräch und den relevanten Kontext, antworte auf die aktuelle Frage des Nutzers. "+\
+        "Antworte ausschließlich auf Basis der Informationen im Kontext.\n\nKontext:\n\n"+\
+        str("\n\n".join(context))
+        #"Given the following conversation, relevant context, and a follow up question, "+\
+        #"reply with an answer to the current question the user is asking. "+\
+        #"Return only your response to the question given the above information "+\
+        #"following the users instructions as needed.\n\nContext:"+\
+    print(system)
+    #formatted_prompt = format_prompt0(system+"\n"+query, history)
+    formatted_prompt = format_prompt(query, history,system=system)
+    print(formatted_prompt)
+    output = ""
+    if(not "GGUF" in myModel):
+        try:
+            stream = inferenceClient.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
+            for response in stream:
+                output += response.token.text
+                yield output
+        except Exception as e:
+            output = "Für weitere Antworten von der KI gebe bitte einen gültigen HuggingFace-Token an."
+            if(len(context)>0):
+                output += "\nBis dahin helfen dir hoffentlich die folgenden Quellen weiter:"
+            yield output
+            print(str(e))
+    else:
+        try:
+            #generate_kwargs["prompt"]=formatted_prompt #
+            generate_kwargs={"prompt":formatted_prompt,"max_tokens":1000, "echo":"False","stream":"True"} #e.g. Mixtral-Instruct
+            url="http://0.0.0.0:2600/v1/completions"
+            response=""
+            buffer=""
+            print("URL: "+url)
+            print("User: "+str(message)+"\nAssistant: ")
+            for text in requests.post(url, json=generate_kwargs, stream=True): #-H 'accept: application/json' -H 'Content-Type: application/json'
+                if buffer is None: buffer=""
+                buffer=str("".join(buffer))
+                text=text.decode('utf-8')
+                if((text.startswith(": ping -")==False) & (len(text.strip("\n\r"))>0)): buffer=buffer+str(text)
+                buffer=buffer.split('"finish_reason": null}]}')
+                if(len(buffer)==1):
+                    buffer="".join(buffer)
+                    pass
+                if(len(buffer)==2):
+                    part=buffer[0]+'"finish_reason": null}]}'
+                    if(part.lstrip('\n\r').startswith("data: ")): part=part.lstrip('\n\r').replace("data: ", "")
+                    try:
+                        part = str(json.loads(part)["choices"][0]["text"])
+                        print(part, end="", flush=True)
+                        output += part
+                        buffer=""
+                    except Exception as e:
+                        print("Exception:"+str(e))
+                        pass
+                yield output
+        except Exception as e:
+            output = "Die KI antwortet gerade nicht."
+            if(len(context)>0):
+                output += "\nBis dahin helfen dir hoffentlich die folgenden Quellen weiter:"
+            yield output
+            print(str(e))
+    if(len(context)>0):
+        output=output+"\n\n<br><details open><summary><strong>Quellen</strong></summary><br><ul>"+ "".join(["<li>" + c + "</li>" for c in context])+"</ul></details>"
+        yield output
+
+#------------------------------
+# Launch Gradio-ChatInterface
+#------------------------------
+
+
+i=gr.ChatInterface(multimodal_response,
+    title="Frag dein PDF",
+    multimodal=True,
+    additional_inputs=[
+        gr.Dropdown(
+            info="Wähle eine Variante",
+            choices=["1","2","3"],
+            value="1",
+            label="Variante"),
+        gr.Textbox(
+            value="",
+            label="HF_token"),
+        ])
+i.launch() #allowed_paths=["."])
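For readers trying out the on-prem path added in this commit: the sketch below shows one way to consume the streaming /v1/completions endpoint that run.py starts via llama_cpp.server on port 2600. It is an illustration, not part of the commit, and it makes assumptions beyond the diff (the server is already running and reachable at http://0.0.0.0:2600, and the prompt and max_tokens values are placeholders). It reads the SSE stream with requests' iter_lines instead of splitting the raw buffer on '"finish_reason": null}]}' as run.py does.

# Minimal sketch (assumes the llama_cpp.server started in run.py is listening on port 2600).
import json
import requests

url = "http://0.0.0.0:2600/v1/completions"            # endpoint used in run.py
body = {"prompt": "Say hello", "max_tokens": 100, "stream": True}

with requests.post(url, json=body, stream=True) as r:
    for line in r.iter_lines():
        if not line:
            continue                                   # skip keep-alive blank lines
        line = line.decode("utf-8")
        if not line.startswith("data: "):
            continue                                   # ignore ": ping - ..." comment lines
        payload = line[len("data: "):]
        if payload.strip() == "[DONE]":                # OpenAI-style end marker, if the server sends one
            break
        chunk = json.loads(payload)
        print(chunk["choices"][0]["text"], end="", flush=True)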