#########################################################################################
# Title: Gradio interface to an on-premises LLM chatbot
# Author: Andreas Fischer
# Date: October 15th, 2023
# Last update: January 21st, 2024
##########################################################################################
#https://github.com/abetlen/llama-cpp-python/issues/306
#sudo apt install libclblast-dev
#CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir -v
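# As an aside: for NVIDIA GPUs the same package can likely be built against cuBLAS
# instead of CLBlast (flag name as of early 2024; check the llama-cpp-python docs):
#CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir -v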
# Get model
#-----------
import os
import requests
##modelPath="/home/af/gguf/models/phi-2.Q4_0.gguf"
#modelPath="/home/af/gguf/models/openchat-3.5-0106.Q4_0.gguf"
modelPath="/home/af/gguf/models/SauerkrautLM-7b-HerO-q8_0.gguf"
#modelPath="/home/af/gguf/models/sauerkrautlm-una-solar-instruct.Q4_0.gguf"
#modelPath="/home/af/gguf/models/decilm-7b-uniform-gqa-q8_0.gguf"
#modelPath="/home/af/gguf/models/mixtral-8x7b-instruct-v0.1.Q4_0.gguf"
if not os.path.exists(modelPath):  # download a fallback model if none is present locally
    #url="https://huggingface.co/TheBloke/WizardLM-13B-V1.2-GGUF/resolve/main/wizardlm-13b-v1.2.Q4_0.gguf"
    #url="https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/resolve/main/mixtral-8x7b-instruct-v0.1.Q4_0.gguf?download=true"
    #url="https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_0.gguf?download=true"
    url="https://huggingface.co/bartowski/gemma-2-9b-it-GGUF/resolve/main/gemma-2-9b-it-Q4_K_M.gguf?download=true"
    response = requests.get(url)
    with open("./model.gguf", mode="wb") as file:
        file.write(response.content)  # holds the whole file in RAM; see the streamed variant below
    print("Model downloaded")
    modelPath="./model.gguf"
print(modelPath)
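# The download above buffers the entire GGUF file in memory before writing it out,
# which is problematic for multi-gigabyte models. A minimal streamed variant
# (a sketch; not called by this script, and the chunk size is an arbitrary choice):
def download_streamed(url, target):
    with requests.get(url, stream=True) as r:  # stream=True avoids buffering the body in RAM
        r.raise_for_status()
        with open(target, mode="wb") as f:
            for chunk in r.iter_content(chunk_size=1024*1024):  # write 1 MiB at a time
                f.write(chunk)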
# Llama-cpp-Server
#------------------
import subprocess
command = ["python3", "-m", "llama_cpp.server", "--model", modelPath, "--host", "0.0.0.0", "--port", "2600", "--n_threads", "4", "--n_gpu_layers","20"]
subprocess.Popen(command)  # start the OpenAI-compatible llama-cpp server in the background
print("Server started!")   # the model may still be loading; see the readiness check below
# Gradio-GUI
#------------
import gradio as gr
import requests
import json
def response(message, history):
    prompt=message
    system="Du bist ein KI-basiertes Assistenzsystem."  # "You are an AI-based assistance system."
    model="gemma-2"
    # Apply the prompt template matching the selected model
    # (see the lookup-table sketch after this function for a more compact variant):
    if("gemma-2" in model):
        prompt=f"<bos><start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"
    if("mixtral-8x7b-instruct" in model):
        prompt=f"[INST] {prompt} [/INST]"
    if("Mistral-7B-Instruct" in model):
        prompt=f"[INST] {prompt} [/INST]"
    if("openchat-3.5" in model):
        prompt=f"GPT4 Correct User: {system} {prompt}<|end_of_turn|>GPT4 Correct Assistant:"
    if("SauerkrautLM-7b-HerO" in model):
        prompt=f"<|im_start|>system\n{system}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
    if("WizardLM-13B-V1.2" in model):
        prompt=f"{system} USER: {prompt} ASSISTANT: "
    if("phi-2" in model):
        prompt=f"Instruct: {prompt}\nOutput:"
    print(prompt)
    #url="https://afischer1985-wizardlm-13b-v1-2-q4-0-gguf.hf.space/v1/completions"
    url="http://0.0.0.0:2600/v1/completions"
    body={"prompt": prompt, "max_tokens": 1000, "echo": False, "stream": True}  # e.g. Mixtral-Instruct
    response=""
    buffer=""
    print("URL: "+url)
    print("User: "+message+"\nAI: ")
    for text in requests.post(url, json=body, stream=True):  # iterates over raw byte chunks of the SSE stream  #-H 'accept: application/json' -H 'Content-Type: application/json'
        if buffer is None: buffer=""
        buffer=str("".join(buffer))  # re-join in case a failed parse below left a list behind
        #print("*** Raw String: "+str(text)+"\n***\n")
        text=text.decode('utf-8')
        if not text.startswith(": ping -") and len(text.strip("\n\r"))>0:
            buffer=buffer+str(text)  # accumulate everything except keep-alive pings
        #print("\n*** Buffer: "+str(buffer)+"\n***\n")
        buffer=buffer.split('"finish_reason": null}]}')  # split off one complete SSE data event
        if(len(buffer)==1):  # no complete event yet: keep accumulating
            buffer="".join(buffer)
            pass
        if(len(buffer)==2):  # one complete event: parse it, keep the remainder
            part=buffer[0]+'"finish_reason": null}]}'
            if(part.lstrip('\n\r').startswith("data: ")): part=part.lstrip('\n\r').replace("data: ", "")
            try:
                part = str(json.loads(part)["choices"][0]["text"])
                print(part, end="", flush=True)
                response=response+part
                buffer=buffer[1]  # keep the tail of the stream for the next event
            except Exception as e:
                print("Exception:"+str(e))
                pass
        yield response
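# The per-model if-chain in response() could be collapsed into a lookup table.
# A sketch (TEMPLATES and format_prompt are illustrative names, not used by this
# script; templates copied from the branches above):
TEMPLATES={
    "gemma-2": "<bos><start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n",
    "SauerkrautLM-7b-HerO": "<|im_start|>system\n{system}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n",
    "phi-2": "Instruct: {prompt}\nOutput:",
}
def format_prompt(model, prompt, system=""):
    for key, template in TEMPLATES.items():
        if key in model:
            return template.format(prompt=prompt, system=system)
    return prompt  # fall back to the raw prompt for unknown models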
gr.ChatInterface(response, chatbot=gr.Chatbot(render_markdown=True),title="AI-Interface").queue().launch(share=True) #False, server_name="0.0.0.0", server_port=7864)
print("Interface up and running!")