AFischer1985 committed
Commit 87e8aba
1 Parent(s): a97d3f8
Update run.py

run.py CHANGED
@@ -2,7 +2,7 @@
 # Title: Gradio Interface to LLM-chatbot with dynamic RAG-funcionality and ChromaDB
 # Author: Andreas Fischer
 # Date: October 10th, 2024
-# Last update: October
+# Last update: October 26th, 2024
 ##########################################################################################
 
 import os
@@ -16,17 +16,18 @@ import ocrmypdf #convertPDF
 from pypdf import PdfReader #convertPDF
 import re #format_prompt
 import gradio as gr # multimodal_response
-from huggingface_hub import InferenceClient #multimodal_response
-
+from huggingface_hub import InferenceClient # multimodal_response
+import json # multimodal_response (on-prem)
+import requests # multimodal_response (on-prem)
 
 #---------------------------------------------------
 # Specify models for text generation and embeddings
 #---------------------------------------------------
 
 myModel="mistralai/Mixtral-8x7b-instruct-v0.1"
-#myModel="princeton-nlp/gemma-2-9b-it-SimPO"
-#myModel="google/gemma-2-2b-it"
 #myModel="meta-llama/Llama-3.1-8B-Instruct"
+#myModel="QuantFactory/gemma-2-9b-it-SimPO-GGUF"
+#myModel="bartowski/gemma-2-9b-it-GGUF"
 #mod=myModel
 #tok=AutoTokenizer.from_pretrained(mod) #,token="hf_...")
 #cha=[{"role":"system","content":"A"},{"role":"user","content":"B"},{"role":"assistant","content":"C"}]
@@ -34,6 +35,27 @@ myModel="mistralai/Mixtral-8x7b-instruct-v0.1"
 #res=tok.apply_chat_template(cha)
 #print(tok.decode(res))
 
+if("GGUF" in myModel): # start Llama-cpp-server for GGUF-models on premises:
+    #modelPath="/home/af/gguf/models/bartowski/gemma-2-9b-it-GGUF/gemma-2-9b-it-Q4_K_M.gguf"
+    modelPath="/home/af/gguf/models/QuantFactory/gemma-2-9b-it-SimPO-GGUF/gemma-2-9b-it-SimPO.Q4_K_M.gguf"
+    if(os.path.exists(modelPath)==False):
+        #url="https://huggingface.co/bartowski/gemma-2-9b-it-GGUF/resolve/main/gemma-2-9b-it-Q4_K_M.gguf?download=true"
+        url="https://huggingface.co/QuantFactory/gemma-2-9b-it-SimPO-GGUF/resolve/main/gemma-2-9b-it-SimPO.Q4_K_M.gguf?download=true"
+        response = requests.get(url)
+        with open("./model.gguf", mode="wb") as file:
+            file.write(response.content)
+        print("Model downloaded")
+        modelPath="./model.gguf"
+    print(modelPath)
+    import subprocess
+    command = ["python3", "-m", "llama_cpp.server", "--model", modelPath, "--host", "0.0.0.0", "--port", "2600", "--n_threads", "4", "--n_gpu_layers","42"] #20
+    subprocess.Popen(command)
+    print("Server ready!")
+
+    url="http://0.0.0.0:2600/v1/completions"
+    body={"prompt":"test","max_tokens":1000, "echo":"False","stream":"False"} #e.g. Mixtral-Instruct
+    test=requests.post(url, json=body, stream=False)
+
 jina = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-de', trust_remote_code=True, torch_dtype=torch.bfloat16)
 #jira.save_pretrained("jinaai_jina-embeddings-v2-base-de")
 device='cuda:0' if torch.cuda.is_available() else 'cpu'
@@ -84,8 +106,8 @@ def format_prompt0(message, history):
 
 def format_prompt(message, history=None, system=None, RAGAddon=None, system2=None, zeichenlimit=None,historylimit=4, removeHTML=False,
         startOfString="<s>", template0=" [INST] {system} [/INST] </s>",template1=" [INST] {message} [/INST]",template2=" {response}</s>"): # mistralai/Mixtral-8x7B-Instruct-v0.1
-        #startOfString="<bos>",template0="<start_of_turn>user\n{system}<end_of_turn>\n<start_of_turn>model\n<end_of_turn>\n",template1="<start_of_turn>user\n{message}<end_of_turn>\n<start_of_turn>model\n",template2="<end_of_turn>\n"): # google/gemma-2-2b-it
-        #startOfString="", template0="<|start_header_id|>system<|end_header_id|>\n\n{system}\n<|eot_id|>", template1="<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id
+        #startOfString="<bos>",template0="<start_of_turn>user\n{system}<end_of_turn>\n<start_of_turn>model\n<end_of_turn>\n",template1="<start_of_turn>user\n{message}<end_of_turn>\n<start_of_turn>model\n",template2="{response}<end_of_turn>\n"): # google/gemma-2-2b-it
+        #startOfString="<|begin_of_text|><", template0="<|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n{system}\n<|eot_id|>", template1="<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", template2="{response}</eot_id>"): # meta-llama/Llama-3.1-8B-Instruct
     if zeichenlimit is None: zeichenlimit=1000000000 # :-)
     prompt = ""
     if RAGAddon is not None:
@@ -256,17 +278,144 @@ def add_doc(path, session):
     print(len(x))
     if(len(x)==0):
         chunkSize=40000
-        for i in range(round(len(corpus)/chunkSize+0.5)):
+        for i in range(round(len(corpus)/chunkSize+0.5)): #0 is first batch, 3 is last (incomplete) batch given 133497 texts
            print("embed batch "+str(i)+" of "+str(round(len(corpus)/chunkSize+0.5)))
            ids=list(range(i*chunkSize,(i*chunkSize+chunkSize)))
            batch=corpus[i*chunkSize:(i*chunkSize+chunkSize)]
            textIDs=[str(id) for id in ids[0:len(batch)]]
-           ids=[str(id+len(x)+1) for id in ids[0:len(batch)]]
+           ids=[str(id+len(x)+1) for id in ids[0:len(batch)]] # id refers to chromadb-unique ID
            collection.add(documents=batch, ids=ids,
-               metadatas=[{"date": str("2024-10-10")} for b in batch])
+               metadatas=[{"date": str("2024-10-10")} for b in batch]) #"textID":textIDs, "id":ids,
            print("finished batch "+str(i)+" of "+str(round(len(corpus)/40000+0.5)))
         now = datetime.now()
         gr.Info(f"Indexing complete!")
-        print(now-then)
+        print(now-then) #zu viel GB für sentences (GPU), bzw. 0:00:10.375087 für chunks
     return(collection)
 
+
+#--------------------------------------------------------
+# Function for response to user queries and pot. addenda
+#--------------------------------------------------------
+
+def multimodal_response(message, history, dropdown, hfToken, request: gr.Request):
+    print("def multimodal response!")
+    if(hfToken.startswith("hf_")): # use HF-hub with custom token if token is provided
+        inferenceClient = InferenceClient(model=myModel, token=hfToken)
+    else:
+        inferenceClient = InferenceClient(myModel)
+    global databases
+    if request:
+        session=request.session_hash
+    else:
+        session="0"
+    length=str(len(history))
+    print(databases)
+    if(not databases[-1][1]==session):
+        databases.append((date.today(),session))
+        #print(databases)
+    query=message["text"]
+    if(len(message["files"])>0): # is there at least one file attached?
+        collection=add_doc(message["files"][0], session)
+    else: # otherwise, you still want to get the collection with the session-based db
+        collection=add_doc(message["text"], session)
+    client = chromadb.PersistentClient(path=dbPath)
+    print(str(client.list_collections()))
+    x=collection.get(include=[])["ids"]
+    ragQuery=[format_prompt(query, history, historylimit=2,
+        #startOfString="", template0="{system}\n",template1="USER: {message}\n\n",template2="ASSISTANT: {response}\n\n") if len(history)>0 else query] # embed simply-formated dialogue
+        startOfString="", template1="{message}\n\n",template2="") if len(history)>0 else query] # embed simple string of User-queries only
+    context=collection.query(query_texts=ragQuery, n_results=3)
+    #context=["<Kontext "+str(i)+"> "+str(c)+"</Kontext "+str(i)+">" for i,c in enumerate(context["documents"][0])]
+    context=["Kontext "+str(i+1)+": \""+re.sub("\"","'",str(c))+"\"" for i,c in enumerate(context["documents"][0])]
+    gr.Info("Kontext:\n"+str(context))
+    generate_kwargs = dict(
+        temperature=float(0.9),
+        max_new_tokens=5000,
+        top_p=0.95,
+        repetition_penalty=1.0,
+        do_sample=True,
+        seed=42,
+    )
+    system="Mit Blick auf das folgende Gespräch und den relevanten Kontext, antworte auf die aktuelle Frage des Nutzers. "+\
+        "Antworte ausschließlich auf Basis der Informationen im Kontext.\n\nKontext:\n\n"+\
+        str("\n\n".join(context))
+        #"Given the following conversation, relevant context, and a follow up question, "+\
+        #"reply with an answer to the current question the user is asking. "+\
+        #"Return only your response to the question given the above information "+\
+        #"following the users instructions as needed.\n\nContext:"+\
+    print(system)
+    #formatted_prompt = format_prompt0(system+"\n"+query, history)
+    formatted_prompt = format_prompt(query, history,system=system)
+    print(formatted_prompt)
+    output = ""
+    if(not "GGUF" in myModel):
+        try:
+            stream = inferenceClient.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
+            for response in stream:
+                output += response.token.text
+                yield output
+        except Exception as e:
+            output = "Für weitere Antworten von der KI gebe bitte einen gültigen HuggingFace-Token an."
+            if(len(context)>0):
+                output += "\nBis dahin helfen dir hoffentlich die folgenden Quellen weiter:"
+            yield output
+            print(str(e))
+    else:
+        try:
+            #generate_kwargs["prompt"]=formatted_prompt #
+            generate_kwargs={"prompt":formatted_prompt,"max_tokens":1000, "echo":"False","stream":"True"} #e.g. Mixtral-Instruct
+            url="http://0.0.0.0:2600/v1/completions"
+            response=""
+            buffer=""
+            print("URL: "+url)
+            print("User: "+str(message)+"\nAssistant: ")
+            for text in requests.post(url, json=generate_kwargs, stream=True): #-H 'accept: application/json' -H 'Content-Type: application/json'
+                if buffer is None: buffer=""
+                buffer=str("".join(buffer))
+                text=text.decode('utf-8')
+                if((text.startswith(": ping -")==False) & (len(text.strip("\n\r"))>0)): buffer=buffer+str(text)
+                buffer=buffer.split('"finish_reason": null}]}')
+                if(len(buffer)==1):
+                    buffer="".join(buffer)
+                    pass
+                if(len(buffer)==2):
+                    part=buffer[0]+'"finish_reason": null}]}'
+                    if(part.lstrip('\n\r').startswith("data: ")): part=part.lstrip('\n\r').replace("data: ", "")
+                    try:
+                        part = str(json.loads(part)["choices"][0]["text"])
+                        print(part, end="", flush=True)
+                        output += part
+                        buffer=""
+                    except Exception as e:
+                        print("Exception:"+str(e))
+                        pass
+                yield output
+        except Exception as e:
+            output = "Die KI antwortet gerade nicht."
+            if(len(context)>0):
+                output += "\nBis dahin helfen dir hoffentlich die folgenden Quellen weiter:"
+            yield output
+            print(str(e))
+    if(len(context)>0):
+        output=output+"\n\n<br><details open><summary><strong>Quellen</strong></summary><br><ul>"+ "".join(["<li>" + c + "</li>" for c in context])+"</ul></details>"
+        yield output
+
+#------------------------------
+# Launch Gradio-ChatInterface
+#------------------------------
+
+
+i=gr.ChatInterface(multimodal_response,
+    title="Frag dein PDF",
+    multimodal=True,
+    additional_inputs=[
+        gr.Dropdown(
+            info="Wähle eine Variante",
+            choices=["1","2","3"],
+            value="1",
+            label="Variante"),
+        gr.Textbox(
+            value="",
+            label="HF_token"),
+        ])
+i.launch() #allowed_paths=["."])
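For readers trying out the on-prem path added in this commit: the sketch below shows one way to consume the streaming /v1/completions endpoint that run.py starts via llama_cpp.server on port 2600. It is an illustration, not part of the commit, and it makes assumptions beyond the diff (the server is already running and reachable at http://0.0.0.0:2600, and the prompt and max_tokens values are placeholders). It reads the SSE stream with requests' iter_lines instead of splitting the raw buffer on '"finish_reason": null}]}' as run.py does.

# Minimal sketch (assumes the llama_cpp.server started in run.py is listening on port 2600).
import json
import requests

url = "http://0.0.0.0:2600/v1/completions"            # endpoint used in run.py
body = {"prompt": "Say hello", "max_tokens": 100, "stream": True}

with requests.post(url, json=body, stream=True) as r:
    for line in r.iter_lines():
        if not line:
            continue                                   # skip keep-alive blank lines
        line = line.decode("utf-8")
        if not line.startswith("data: "):
            continue                                   # ignore ": ping - ..." comment lines
        payload = line[len("data: "):]
        if payload.strip() == "[DONE]":                # OpenAI-style end marker, if the server sends one
            break
        chunk = json.loads(payload)
        print(chunk["choices"][0]["text"], end="", flush=True)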