AFischer1985 committed
Commit 87e8aba
1 Parent(s): a97d3f8

Update run.py

Files changed (1)
  1. run.py +160 -11
run.py CHANGED
@@ -2,7 +2,7 @@
  # Title: Gradio Interface to LLM-chatbot with dynamic RAG-functionality and ChromaDB
  # Author: Andreas Fischer
  # Date: October 10th, 2024
- # Last update: October 25th, 2024
+ # Last update: October 26th, 2024
  ##########################################################################################

  import os
@@ -16,17 +16,18 @@ import ocrmypdf #convertPDF
  from pypdf import PdfReader #convertPDF
  import re #format_prompt
  import gradio as gr # multimodal_response
- from huggingface_hub import InferenceClient #multimodal_response
-
+ from huggingface_hub import InferenceClient # multimodal_response
+ import json # multimodal_response (on-prem)
+ import requests # multimodal_response (on-prem)

  #---------------------------------------------------
  # Specify models for text generation and embeddings
  #---------------------------------------------------

  myModel="mistralai/Mixtral-8x7b-instruct-v0.1"
- #myModel="princeton-nlp/gemma-2-9b-it-SimPO"
- #myModel="google/gemma-2-2b-it"
  #myModel="meta-llama/Llama-3.1-8B-Instruct"
+ #myModel="QuantFactory/gemma-2-9b-it-SimPO-GGUF"
+ #myModel="bartowski/gemma-2-9b-it-GGUF"
  #mod=myModel
  #tok=AutoTokenizer.from_pretrained(mod) #,token="hf_...")
  #cha=[{"role":"system","content":"A"},{"role":"user","content":"B"},{"role":"assistant","content":"C"}]
@@ -34,6 +35,27 @@ myModel="mistralai/Mixtral-8x7b-instruct-v0.1"
  #res=tok.apply_chat_template(cha)
  #print(tok.decode(res))

+ if("GGUF" in myModel): # start Llama-cpp-server for GGUF-models on premises:
+     #modelPath="/home/af/gguf/models/bartowski/gemma-2-9b-it-GGUF/gemma-2-9b-it-Q4_K_M.gguf"
+     modelPath="/home/af/gguf/models/QuantFactory/gemma-2-9b-it-SimPO-GGUF/gemma-2-9b-it-SimPO.Q4_K_M.gguf"
+     if(os.path.exists(modelPath)==False):
+         #url="https://huggingface.co/bartowski/gemma-2-9b-it-GGUF/resolve/main/gemma-2-9b-it-Q4_K_M.gguf?download=true"
+         url="https://huggingface.co/QuantFactory/gemma-2-9b-it-SimPO-GGUF/resolve/main/gemma-2-9b-it-SimPO.Q4_K_M.gguf?download=true"
+         response = requests.get(url)
+         with open("./model.gguf", mode="wb") as file:
+             file.write(response.content)
+         print("Model downloaded")
+         modelPath="./model.gguf"
+     print(modelPath)
+     import subprocess
+     command = ["python3", "-m", "llama_cpp.server", "--model", modelPath, "--host", "0.0.0.0", "--port", "2600", "--n_threads", "4", "--n_gpu_layers","42"] #20
+     subprocess.Popen(command)
+     print("Server ready!")
+
+     url="http://0.0.0.0:2600/v1/completions"
+     body={"prompt":"test","max_tokens":1000, "echo":"False","stream":"False"} #e.g. Mixtral-Instruct
+     test=requests.post(url, json=body, stream=False)
+
  jina = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-de', trust_remote_code=True, torch_dtype=torch.bfloat16)
  #jira.save_pretrained("jinaai_jina-embeddings-v2-base-de")
  device='cuda:0' if torch.cuda.is_available() else 'cpu'
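Note that subprocess.Popen returns before the llama-cpp server has finished loading the model, so the "Server ready!" print and the test request above can race the actual start-up. A minimal readiness-polling sketch (a hypothetical helper, not part of this commit; it only assumes the server answers HTTP on port 2600 once it is up):

import time
import requests

def wait_for_llama_server(base_url="http://0.0.0.0:2600", timeout=300):
    # Poll until the server accepts HTTP connections; any status code counts as "up",
    # since the model may still be loading for a while after Popen returns.
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            requests.get(base_url, timeout=2)
            return True
        except requests.exceptions.RequestException:
            time.sleep(2)
    return False

With such a helper, the test POST to /v1/completions would only be sent once wait_for_llama_server() returns True.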
@@ -84,8 +106,8 @@ def format_prompt0(message, history):

  def format_prompt(message, history=None, system=None, RAGAddon=None, system2=None, zeichenlimit=None,historylimit=4, removeHTML=False,
        startOfString="<s>", template0=" [INST] {system} [/INST] </s>",template1=" [INST] {message} [/INST]",template2=" {response}</s>"): # mistralai/Mixtral-8x7B-Instruct-v0.1
-       #startOfString="<bos>",template0="<start_of_turn>user\n{system}<end_of_turn>\n<start_of_turn>model\n<end_of_turn>\n",template1="<start_of_turn>user\n{message}<end_of_turn>\n<start_of_turn>model\n",template2="<end_of_turn>\n"): # google/gemma-2-2b-it
-       #startOfString="", template0="<|start_header_id|>system<|end_header_id|>\n\n{system}\n<|eot_id|>", template1="<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|>", template2="<|start_header_id|>assistant<|end_header_id|>\n\n{response}</eot_id>"): # meta-llama/Llama-3.1-8B-Instruct?
+       #startOfString="<bos>",template0="<start_of_turn>user\n{system}<end_of_turn>\n<start_of_turn>model\n<end_of_turn>\n",template1="<start_of_turn>user\n{message}<end_of_turn>\n<start_of_turn>model\n",template2="{response}<end_of_turn>\n"): # google/gemma-2-2b-it
+       #startOfString="<|begin_of_text|><", template0="<|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n{system}\n<|eot_id|>", template1="<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", template2="{response}</eot_id>"): # meta-llama/Llama-3.1-8B-Instruct
      if zeichenlimit is None: zeichenlimit=1000000000 # :-)
      prompt = ""
      if RAGAddon is not None:
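For orientation, a minimal sketch of how the Mixtral-style startOfString/template0/template1/template2 pieces compose into a single prompt string (an illustrative stand-in assuming (user, assistant) history tuples; the commit's format_prompt additionally handles historylimit, zeichenlimit, RAG addenda and HTML stripping):

def compose_prompt(message, history=None, system=None,
                   startOfString="<s>",
                   template0=" [INST] {system} [/INST] </s>",
                   template1=" [INST] {message} [/INST]",
                   template2=" {response}</s>"):
    # Optional system turn, then past user/assistant turns, then the new user message.
    prompt = startOfString
    if system is not None:
        prompt += template0.format(system=system)
    for user, assistant in (history or []):
        prompt += template1.format(message=user)
        prompt += template2.format(response=assistant)
    prompt += template1.format(message=message)
    return prompt

print(compose_prompt("Wie geht's?", history=[("Hallo", "Hallo!")], system="Antworte knapp."))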
@@ -256,17 +278,144 @@ def add_doc(path, session):
      print(len(x))
      if(len(x)==0):
          chunkSize=40000
-         for i in range(round(len(corpus)/chunkSize+0.5)):
+         for i in range(round(len(corpus)/chunkSize+0.5)): #0 is first batch, 3 is last (incomplete) batch given 133497 texts
              print("embed batch "+str(i)+" of "+str(round(len(corpus)/chunkSize+0.5)))
              ids=list(range(i*chunkSize,(i*chunkSize+chunkSize)))
              batch=corpus[i*chunkSize:(i*chunkSize+chunkSize)]
              textIDs=[str(id) for id in ids[0:len(batch)]]
-             ids=[str(id+len(x)+1) for id in ids[0:len(batch)]]
+             ids=[str(id+len(x)+1) for id in ids[0:len(batch)]] # id refers to chromadb-unique ID
              collection.add(documents=batch, ids=ids,
-                 metadatas=[{"date": str("2024-10-10")} for b in batch])
+                 metadatas=[{"date": str("2024-10-10")} for b in batch]) #"textID":textIDs, "id":ids,
              print("finished batch "+str(i)+" of "+str(round(len(corpus)/40000+0.5)))
      now = datetime.now()
      gr.Info(f"Indexing complete!")
-     print(now-then)
+     print(now-then) # too much GPU memory for sentences; ca. 0:00:10.375087 for chunks
      return(collection)

+
+ #--------------------------------------------------------
+ # Function for response to user queries and pot. addenda
+ #--------------------------------------------------------
+
+ def multimodal_response(message, history, dropdown, hfToken, request: gr.Request):
+     print("def multimodal response!")
+     if(hfToken.startswith("hf_")): # use HF-hub with custom token if token is provided
+         inferenceClient = InferenceClient(model=myModel, token=hfToken)
+     else:
+         inferenceClient = InferenceClient(myModel)
+     global databases
+     if request:
+         session=request.session_hash
+     else:
+         session="0"
+     length=str(len(history))
+     print(databases)
+     if(not databases[-1][1]==session):
+         databases.append((date.today(),session))
+         #print(databases)
+     query=message["text"]
+     if(len(message["files"])>0): # is there at least one file attached?
+         collection=add_doc(message["files"][0], session)
+     else: # otherwise, you still want to get the collection with the session-based db
+         collection=add_doc(message["text"], session)
+     client = chromadb.PersistentClient(path=dbPath)
+     print(str(client.list_collections()))
+     x=collection.get(include=[])["ids"]
+     ragQuery=[format_prompt(query, history, historylimit=2,
+         #startOfString="", template0="{system}\n",template1="USER: {message}\n\n",template2="ASSISTANT: {response}\n\n") if len(history)>0 else query] # embed simply-formated dialogue
+         startOfString="", template1="{message}\n\n",template2="") if len(history)>0 else query] # embed simple string of User-queries only
+     context=collection.query(query_texts=ragQuery, n_results=3)
+     #context=["<Kontext "+str(i)+"> "+str(c)+"</Kontext "+str(i)+">" for i,c in enumerate(context["documents"][0])]
+     context=["Kontext "+str(i+1)+": \""+re.sub("\"","'",str(c))+"\"" for i,c in enumerate(context["documents"][0])]
+     gr.Info("Kontext:\n"+str(context))
+     generate_kwargs = dict(
+         temperature=float(0.9),
+         max_new_tokens=5000,
+         top_p=0.95,
+         repetition_penalty=1.0,
+         do_sample=True,
+         seed=42,
+     )
+     system="Mit Blick auf das folgende Gespräch und den relevanten Kontext, antworte auf die aktuelle Frage des Nutzers. "+\
+         "Antworte ausschließlich auf Basis der Informationen im Kontext.\n\nKontext:\n\n"+\
+         str("\n\n".join(context))
+         #"Given the following conversation, relevant context, and a follow up question, "+\
+         #"reply with an answer to the current question the user is asking. "+\
+         #"Return only your response to the question given the above information "+\
+         #"following the users instructions as needed.\n\nContext:"+\
+     print(system)
+     #formatted_prompt = format_prompt0(system+"\n"+query, history)
+     formatted_prompt = format_prompt(query, history,system=system)
+     print(formatted_prompt)
+     output = ""
+     if(not "GGUF" in myModel):
+         try:
+             stream = inferenceClient.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
+             for response in stream:
+                 output += response.token.text
+                 yield output
+         except Exception as e:
+             output = "Für weitere Antworten von der KI gebe bitte einen gültigen HuggingFace-Token an."
+             if(len(context)>0):
+                 output += "\nBis dahin helfen dir hoffentlich die folgenden Quellen weiter:"
+             yield output
+             print(str(e))
+     else:
+         try:
+             #generate_kwargs["prompt"]=formatted_prompt #
+             generate_kwargs={"prompt":formatted_prompt,"max_tokens":1000, "echo":"False","stream":"True"} #e.g. Mixtral-Instruct
+             url="http://0.0.0.0:2600/v1/completions"
+             response=""
+             buffer=""
+             print("URL: "+url)
+             print("User: "+str(message)+"\nAssistant: ")
+             for text in requests.post(url, json=generate_kwargs, stream=True): #-H 'accept: application/json' -H 'Content-Type: application/json'
+                 if buffer is None: buffer=""
+                 buffer=str("".join(buffer))
+                 text=text.decode('utf-8')
+                 if((text.startswith(": ping -")==False) & (len(text.strip("\n\r"))>0)): buffer=buffer+str(text)
+                 buffer=buffer.split('"finish_reason": null}]}')
+                 if(len(buffer)==1):
+                     buffer="".join(buffer)
+                     pass
+                 if(len(buffer)==2):
+                     part=buffer[0]+'"finish_reason": null}]}'
+                     if(part.lstrip('\n\r').startswith("data: ")): part=part.lstrip('\n\r').replace("data: ", "")
+                     try:
+                         part = str(json.loads(part)["choices"][0]["text"])
+                         print(part, end="", flush=True)
+                         output += part
+                         buffer=""
+                     except Exception as e:
+                         print("Exception:"+str(e))
+                         pass
+                 yield output
+         except Exception as e:
+             output = "Die KI antwortet gerade nicht."
+             if(len(context)>0):
+                 output += "\nBis dahin helfen dir hoffentlich die folgenden Quellen weiter:"
+             yield output
+             print(str(e))
+     if(len(context)>0):
+         output=output+"\n\n<br><details open><summary><strong>Quellen</strong></summary><br><ul>"+ "".join(["<li>" + c + "</li>" for c in context])+"</ul></details>"
+         yield output
+
+ #------------------------------
+ # Launch Gradio-ChatInterface
+ #------------------------------
+
+
+ i=gr.ChatInterface(multimodal_response,
+     title="Frag dein PDF",
+     multimodal=True,
+     additional_inputs=[
+         gr.Dropdown(
+             info="Wähle eine Variante",
+             choices=["1","2","3"],
+             value="1",
+             label="Variante"),
+         gr.Textbox(
+             value="",
+             label="HF_token"),
+     ])
+ i.launch() #allowed_paths=["."])
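The streaming branch of multimodal_response above assembles the server's reply by splitting the raw byte stream on '"finish_reason": null}]}'. An alternative sketch that reads the stream line by line with response.iter_lines() (not part of this commit; it assumes the server emits OpenAI-style "data: {...}" events and, per the OpenAI convention, a terminating "[DONE]" marker):

import json
import requests

def stream_completion(prompt, url="http://0.0.0.0:2600/v1/completions", max_tokens=1000):
    # Yield text fragments from the OpenAI-compatible completions endpoint, one SSE event per line.
    body = {"prompt": prompt, "max_tokens": max_tokens, "stream": True}
    with requests.post(url, json=body, stream=True) as response:
        for line in response.iter_lines():
            if not line:
                continue                      # skip blank SSE separator lines
            line = line.decode("utf-8")
            if line.startswith(": ping"):
                continue                      # server keep-alive comment
            if not line.startswith("data: "):
                continue
            payload = line[len("data: "):]
            if payload.strip() == "[DONE]":
                break                         # end-of-stream marker
            yield json.loads(payload)["choices"][0]["text"]

Inside multimodal_response, the fragments yielded here could be accumulated into output and yielded to Gradio exactly as in the committed loop.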