# mixtral_gguf/gradio_server.py
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import torch
print(f"Is CUDA available: {torch.cuda.is_available()}")
# True
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
# print(f"CMAKE_ARGS={os.environ['CMAKE_ARGS']}")
# print(f"FORCE_CMAKE={os.environ['FORCE_CMAKE']}")
print(f'Llama={Llama.__name__}')
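# Download the GGUF weights from the Hugging Face Hub into a local models/ directory.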
models_path = 'models/'
if not os.path.exists(models_path):
    os.makedirs(models_path)
downloaded_model_path = hf_hub_download(repo_id="miqudev/miqu-1-70b",
                                        filename="miqu-1-70b.q4_k_m.gguf",
                                        local_dir=models_path)
print(f'Downloaded path: {downloaded_model_path}')
print('Initializing model...')
llm = Llama(
    model_path=downloaded_model_path,
    n_ctx=4096,        # context window size
    n_threads=10,      # CPU threads for layers not offloaded to the GPU
    n_gpu_layers=100,  # offload (up to) all layers to the GPU
    n_batch=512,       # prompt-processing batch size
    # Note: temp, n_predict and n_keep from the original are llama.cpp runtime/sampling
    # options, not Llama() constructor arguments; temperature is set per call below.
)
print('Model loaded.')
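# Chat handler for gr.ChatInterface: receives the latest user message and the chat
# history, and returns the model's reply as a plain string.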
def mix_query(query, history):
    # Wrap the user message in the Mistral-style [INST] ... [/INST] prompt format;
    # the chat history is ignored, so each turn is answered independently.
    output = llm(
        f"[INST] {query} [/INST]",
        max_tokens=1024,
        temperature=0.7,  # generation-time temperature (moved from the constructor)
        stop=["</s>"],
        echo=False
    )
    reply = output['choices'][0]['text']
    print(reply)
    return reply
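# Minimal chat UI around mix_query; listens on all interfaces and creates a public share link.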
demo = gr.ChatInterface(fn=mix_query,
                        examples=["Explain the Fermi paradox"],
                        title="TARS",
                        theme="soft")
demo.launch(share=True, server_name='0.0.0.0')
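# Example launch on a GPU machine (assumes llama-cpp-python was built with GPU support,
# e.g. CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python;
# the exact CMake flag depends on the llama-cpp-python version):
#   python gradio_server.py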