AFischer1985 committed on
Commit 47e0125
1 Parent(s): 923df80

Update run.py

Files changed (1)
  1. run.py +133 -80
run.py CHANGED
@@ -1,8 +1,8 @@
  #########################################################################################
- # Title: German AI-Interface to the Hugging Face Hub with advanced RAG
  # Author: Andreas Fischer
  # Date: January 31st, 2023
- # Last update: February 21st, 2024
  ##########################################################################################

  #https://github.com/abetlen/llama-cpp-python/issues/306
@@ -26,10 +26,13 @@ if(os.path.exists(filename)==True): os.remove(filename)
  #-----------
  import os
  import chromadb
- dbPath="/home/af/Schreibtisch/gradio/Chroma/db"
- if(os.path.exists(dbPath)==False): dbPath="/home/user/app/db"

  print(dbPath)
  #client = chromadb.Client()
  path=dbPath
  client = chromadb.PersistentClient(path=path)
@@ -40,7 +43,7 @@ from chromadb.utils import embedding_functions
  default_ef = embedding_functions.DefaultEmbeddingFunction()
  #sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="T-Systems-onsite/cross-en-de-roberta-sentence-transformer")
  #instructor_ef = embedding_functions.InstructorEmbeddingFunction(model_name="hkunlp/instructor-large", device="cuda")
- embeddingModel = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="T-Systems-onsite/cross-en-de-roberta-sentence-transformer") #, device="cuda")

  print(str(client.list_collections()))

@@ -143,37 +146,41 @@ x=[x["type2"] for x in rag0["metadatas"][0]]
  x.index("1c") if "1c" in x else len(x)+1


- # Get model
- #-----------
-
- import os
- import requests
-
- modelPath="/home/af/gguf/models/discolm_german_7b_v1.Q4_0.gguf"
- if(os.path.exists(modelPath)==False):
-   #url="https://huggingface.co/TheBloke/WizardLM-13B-V1.2-GGUF/resolve/main/wizardlm-13b-v1.2.Q4_0.gguf"
-   #url="https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/resolve/main/mixtral-8x7b-instruct-v0.1.Q4_0.gguf?download=true"
-   #url="https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_0.gguf?download=true"
-   url="https://huggingface.co/TheBloke/DiscoLM_German_7b_v1-GGUF/resolve/main/discolm_german_7b_v1.Q4_0.gguf?download=true"
-   response = requests.get(url)
-   with open("./model.gguf", mode="wb") as file:
-     file.write(response.content)
-   print("Model downloaded")
-   modelPath="./model.gguf"
-
- print(modelPath)
-
-
- # Llama-cpp-Server
- #------------------

- import subprocess
- n="20"
- if("mixtral-8x7b-instruct" in modelPath): n="0" # mixtral seems to cause problems here...

- command = ["python3", "-m", "llama_cpp.server", "--model", modelPath, "--host", "0.0.0.0", "--port", "2600", "--n_threads", "8", "--n_gpu_layers", n]
- subprocess.Popen(command)
- print("Server ready!")


  # Gradio-GUI
@@ -346,58 +353,104 @@ def response(message, history):
      historylimit=historylimit # number of past messages to consider for response to current message
    )
    print(prompt)
-   # url="https://afischer1985-wizardlm-13b-v1-2-q4-0-gguf.hf.space/v1/completions"
-   url="http://0.0.0.0:2600/v1/completions"
-   body={"prompt":prompt,"max_tokens":None, "echo":"False","stream":"True"} # e.g. Mixtral-Instruct
-   if("discolm_german_7b" in modelPath): body.update({"stop": ["<|im_end|>"]}) # fix stop-token of DiscoLM
-   response="" #+"("+myType+")\n"
-   buffer=""
-   print("URL: "+url)
-   print("User: "+message+"\nAI: ")
-   for text in requests.post(url, json=body, stream=True): #-H 'accept: application/json' -H 'Content-Type: application/json'
-     if buffer is None: buffer=""
-     buffer=str("".join(buffer))
-     # print("*** Raw String: "+str(text)+"\n***\n")
-     text=text.decode('utf-8')
-     if((text.startswith(": ping -")==False) & (len(text.strip("\n\r"))>0)): buffer=buffer+str(text)
-     # print("\n*** Buffer: "+str(buffer)+"\n***\n")
-     buffer=buffer.split('"finish_reason": null}]}')
-     if(len(buffer)==1):
-       buffer="".join(buffer)
-       pass
-     if(len(buffer)==2):
-       part=buffer[0]+'"finish_reason": null}]}'
-       if(part.lstrip('\n\r').startswith("data: ")): part=part.lstrip('\n\r').replace("data: ", "")
-       try:
-         part = str(json.loads(part)["choices"][0]["text"])
          print(part, end="", flush=True)
-         response=response+part
-         buffer="" # reset buffer
-       except Exception as e:
-         print("Exception:"+str(e))
          pass
-   yield response
-   if((myType=="1a")|(myType=="1b")): #add RAG-results to chat-output if appropriate
-     response=response+"\n\n<br><details><summary><strong>Sources</strong></summary><br><ul>"+ "".join(["<li>" + s + "</li>" for s in combination])+"</ul></details>"
-   yield response
-   history.append((message, response)) # add current dialog to history
-   # Store current state in DB if settings=="Permanent"
-   if (settings=="Permanent"):
-     x=collection.get(include=[])["ids"] # add current dialog to db
-     collection.add(
-       documents=[message,response],
-       metadatas=[
-         { "source": "ICH", "dialog": f"ICH: {message.strip()}\n DU: {response.strip()}", "type":"episode"},
-         { "source": "DU", "dialog": f"ICH: {message.strip()}\n DU: {response.strip()}", "type":"episode"}
-       ],
-       ids=[str(len(x)+1),str(len(x)+2)]
-     )
-   json.dump(history,open(filename,'w',encoding="utf-8"),ensure_ascii=False)

  gr.ChatInterface(
    response,
    chatbot=gr.Chatbot(value=[[None,"Herzlich willkommen! Ich bin ein KI-basiertes Assistenzsystem, das für jede Anfrage die am besten geeigneten KI-Tools empfiehlt.<br>Aktuell bin ich wenig mehr als eine Tech-Demo und kenne nur 7 KI-Modelle - also sei bitte nicht zu streng mit mir.<br>Was ist dein Anliegen?"]],render_markdown=True),
-   title="German AI-Interface to the Hugging Face Hub with advanced RAG",
    #additional_inputs=[gr.Dropdown(["Permanent","Temporär"],value="Temporär",label="Dialog sichern?")]
  ).queue().launch(share=True) #False, server_name="0.0.0.0", server_port=7864)
  print("Interface up and running!")
 
  #########################################################################################
+ # Title: German AI-Interface with advanced RAG
  # Author: Andreas Fischer
  # Date: January 31st, 2023
+ # Last update: February 22nd, 2024
  ##########################################################################################

  #https://github.com/abetlen/llama-cpp-python/issues/306
 
  #-----------
  import os
  import chromadb
+ dbPath = "/home/af/Schreibtisch/Code/gradio/Chroma/db"
+ onPrem = True if(os.path.exists(dbPath)) else False
+ if(onPrem==False): dbPath="/home/user/app/db"

+ onPrem=False
  print(dbPath)
+
  #client = chromadb.Client()
  path=dbPath
  client = chromadb.PersistentClient(path=path)
 
  default_ef = embedding_functions.DefaultEmbeddingFunction()
  #sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="T-Systems-onsite/cross-en-de-roberta-sentence-transformer")
  #instructor_ef = embedding_functions.InstructorEmbeddingFunction(model_name="hkunlp/instructor-large", device="cuda")
+ embeddingModel = embedding_functions.InstructorEmbeddingFunction(model_name="T-Systems-onsite/cross-en-de-roberta-sentence-transformer", device="cuda" if(onPrem) else "cpu")

  print(str(client.list_collections()))
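For context: an embedding function like embeddingModel above is handed to a Chroma collection, which then embeds documents and queries automatically. A minimal sketch of that wiring (collection name, documents and query text are illustrative and not part of this commit; shown with the SentenceTransformerEmbeddingFunction variant) could look like:

import chromadb
from chromadb.utils import embedding_functions

# Illustrative sketch, not part of run.py: wire an embedding function into a collection.
ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="T-Systems-onsite/cross-en-de-roberta-sentence-transformer"
)
client = chromadb.PersistentClient(path="./db")
collection = client.get_or_create_collection(name="demo", embedding_function=ef)
collection.add(documents=["First example document.", "Second example document."], ids=["1", "2"])
print(collection.query(query_texts=["example"], n_results=1))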
  x.index("1c") if "1c" in x else len(x)+1


+ # Model
+ #-------
+ #onPrem=False

+ if(onPrem==False):
+   modelPath="mistralai/Mixtral-8x7B-Instruct-v0.1"
+   from huggingface_hub import InferenceClient
+   import gradio as gr
+   client = InferenceClient(
+     modelPath
+     #"mistralai/Mixtral-8x7B-Instruct-v0.1"
+     #"mistralai/Mistral-7B-Instruct-v0.1"
+   )
+ else:
+   import os
+   import requests
+   import subprocess
+   modelPath="/home/af/gguf/models/discolm_german_7b_v1.Q4_0.gguf"
+   if(os.path.exists(modelPath)==False):
+     #url="https://huggingface.co/TheBloke/WizardLM-13B-V1.2-GGUF/resolve/main/wizardlm-13b-v1.2.Q4_0.gguf"
+     #url="https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/resolve/main/mixtral-8x7b-instruct-v0.1.Q4_0.gguf?download=true"
+     #url="https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_0.gguf?download=true"
+     url="https://huggingface.co/TheBloke/DiscoLM_German_7b_v1-GGUF/resolve/main/discolm_german_7b_v1.Q4_0.gguf?download=true"
+     response = requests.get(url)
+     with open("./model.gguf", mode="wb") as file:
+       file.write(response.content)
+     print("Model downloaded")
+     modelPath="./model.gguf"
+   print(modelPath)
+   n="20"
+   if("mixtral-8x7b-instruct" in modelPath): n="0" # mixtral seems to cause problems here...
+   command = ["python3", "-m", "llama_cpp.server", "--model", modelPath, "--host", "0.0.0.0", "--port", "2600", "--n_threads", "8", "--n_gpu_layers", n]
+   subprocess.Popen(command)
+   print("Server ready!")
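One caveat in this block: subprocess.Popen returns immediately, so "Server ready!" is printed before llama_cpp.server is actually listening on port 2600. A small readiness poll would make that explicit; this is a sketch only (wait_for_server is a hypothetical helper, not part of this commit, and it assumes the server's OpenAI-compatible /v1/models route):

import time
import requests

def wait_for_server(url="http://0.0.0.0:2600/v1/models", timeout=120):
    # Poll the llama_cpp.server endpoint until it answers or the timeout expires.
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            if requests.get(url, timeout=2).ok:
                return True
        except requests.exceptions.ConnectionError:
            pass
        time.sleep(1)
    return False

print("Server ready!" if wait_for_server() else "Server did not respond in time")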


  # Gradio-GUI
 
      historylimit=historylimit # number of past messages to consider for response to current message
    )
    print(prompt)
+
+   ## Request response from model
+   #------------------------------
+   print("AI running on prem!" if(onPrem) else "AI running HFHub!")
+   if(onPrem==False):
+     temperature=float(0.9)
+     max_new_tokens=500
+     top_p=0.95
+     repetition_penalty=1.0
+     if temperature < 1e-2: temperature = 1e-2
+     top_p = float(top_p)
+     generate_kwargs = dict(
+       temperature=temperature,
+       max_new_tokens=max_new_tokens,
+       top_p=top_p,
+       repetition_penalty=repetition_penalty,
+       do_sample=True,
+       seed=42,
+     )
+     stream = client.text_generation(prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
+     response = ""
+     print("User: "+message+"\nAI: ")
+     for text in stream:
+       part=text.token.text
        print(part, end="", flush=True)
+       response += part
+       yield response
+     if((myType=="1a")|(myType=="1b")): #add RAG-results to chat-output if appropriate
+       response=response+"\n\n<br><details><summary><strong>Sources</strong></summary><br><ul>"+ "".join(["<li>" + s + "</li>" for s in combination])+"</ul></details>"
+     yield response
+     history.append((message, response)) # add current dialog to history
+     # Store current state in DB if settings=="Permanent"
+     if (settings=="Permanent"):
+       x=collection.get(include=[])["ids"] # add current dialog to db
+       collection.add(
+         documents=[message,response],
+         metadatas=[
+           { "source": "ICH", "dialog": f"ICH: {message.strip()}\n DU: {response.strip()}", "type":"episode"},
+           { "source": "DU", "dialog": f"ICH: {message.strip()}\n DU: {response.strip()}", "type":"episode"}
+         ],
+         ids=[str(len(x)+1),str(len(x)+2)]
+       )
+     json.dump(history,open(filename,'w',encoding="utf-8"),ensure_ascii=False)
+
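As a standalone illustration of the branch above: with stream=True and details=True, InferenceClient.text_generation yields one object per generated token, and text.token.text is concatenated into the running response. A minimal sketch (the prompt string is illustrative, not taken from the commit):

from huggingface_hub import InferenceClient

client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
stream = client.text_generation(
    "[INST] Wer bist du? [/INST]",  # illustrative prompt
    max_new_tokens=100,
    temperature=0.9,
    top_p=0.95,
    do_sample=True,
    seed=42,
    stream=True,
    details=True,
    return_full_text=False,
)
response = ""
for chunk in stream:
    response += chunk.token.text  # each chunk carries a single generated token
print(response)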
+   if(onPrem==True):
+     # url="https://afischer1985-wizardlm-13b-v1-2-q4-0-gguf.hf.space/v1/completions"
+     url="http://0.0.0.0:2600/v1/completions"
+     body={"prompt":prompt,"max_tokens":None, "echo":"False","stream":"True"} # e.g. Mixtral-Instruct
+     if("discolm_german_7b" in modelPath): body.update({"stop": ["<|im_end|>"]}) # fix stop-token of DiscoLM
+     response="" #+"("+myType+")\n"
+     buffer=""
+     #print("URL: "+url)
+     print("User: "+message+"\nAI: ")
+     for text in requests.post(url, json=body, stream=True): #-H 'accept: application/json' -H 'Content-Type: application/json'
+       if buffer is None: buffer=""
+       buffer=str("".join(buffer))
+       # print("*** Raw String: "+str(text)+"\n***\n")
+       text=text.decode('utf-8')
+       if((text.startswith(": ping -")==False) & (len(text.strip("\n\r"))>0)): buffer=buffer+str(text)
+       # print("\n*** Buffer: "+str(buffer)+"\n***\n")
+       buffer=buffer.split('"finish_reason": null}]}')
+       if(len(buffer)==1):
+         buffer="".join(buffer)
          pass
+       if(len(buffer)==2):
+         part=buffer[0]+'"finish_reason": null}]}'
+         if(part.lstrip('\n\r').startswith("data: ")): part=part.lstrip('\n\r').replace("data: ", "")
+         try:
+           part = str(json.loads(part)["choices"][0]["text"])
+           print(part, end="", flush=True)
+           response=response+part
+           buffer="" # reset buffer
+         except Exception as e:
+           print("Exception:"+str(e))
+           pass
+     yield response
+     if((myType=="1a")|(myType=="1b")): #add RAG-results to chat-output if appropriate
+       response=response+"\n\n<br><details><summary><strong>Sources</strong></summary><br><ul>"+ "".join(["<li>" + s + "</li>" for s in combination])+"</ul></details>"
+     yield response
+     history.append((message, response)) # add current dialog to history
+     # Store current state in DB if settings=="Permanent"
+     if (settings=="Permanent"):
+       x=collection.get(include=[])["ids"] # add current dialog to db
+       collection.add(
+         documents=[message,response],
+         metadatas=[
+           { "source": "ICH", "dialog": f"ICH: {message.strip()}\n DU: {response.strip()}", "type":"episode"},
+           { "source": "DU", "dialog": f"ICH: {message.strip()}\n DU: {response.strip()}", "type":"episode"}
+         ],
+         ids=[str(len(x)+1),str(len(x)+2)]
+       )
+     json.dump(history,open(filename,'w',encoding="utf-8"),ensure_ascii=False)
+
 
450
  gr.ChatInterface(
451
  response,
452
  chatbot=gr.Chatbot(value=[[None,"Herzlich willkommen! Ich bin ein KI-basiertes Assistenzsystem, das für jede Anfrage die am besten geeigneten KI-Tools empfiehlt.<br>Aktuell bin ich wenig mehr als eine Tech-Demo und kenne nur 7 KI-Modelle - also sei bitte nicht zu streng mit mir.<br>Was ist dein Anliegen?"]],render_markdown=True),
453
+ title="German AI-Interface with advanced RAG",
454
  #additional_inputs=[gr.Dropdown(["Permanent","Temporär"],value="Temporär",label="Dialog sichern?")]
455
  ).queue().launch(share=True) #False, server_name="0.0.0.0", server_port=7864)
456
  print("Interface up and running!")
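Because response is a generator, gr.ChatInterface streams each yielded string to the Chatbot as a progressively growing reply. A minimal standalone version of that pattern (illustrative only, not part of run.py):

import time
import gradio as gr

def respond(message, history):
    # Each yield replaces the bot message shown so far, which produces a streaming effect.
    reply = ""
    for word in ["This", "is", "a", "streaming", "demo."]:
        reply += word + " "
        time.sleep(0.1)
        yield reply

gr.ChatInterface(respond).queue().launch()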