# mixtral_gguf/gradio_server.py
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import torch
print(f"Is CUDA available: {torch.cuda.is_available()}")
# True
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
# print(f"CMAKE_ARGS={os.environ['CMAKE_ARGS']}")
# print(f"FORCE_CMAKE={os.environ['FORCE_CMAKE']}")
print(f'Llama={Llama.__name__}')
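# Download the GGUF weights from the Hugging Face Hub into a local models/ directory.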
models_path = 'models/'
if not os.path.exists(models_path):
    os.makedirs(models_path)
downloaded_model_path = hf_hub_download(repo_id="miqudev/miqu-1-70b",
                                        filename="miqu-1-70b.q4_k_m.gguf",
                                        local_dir=models_path)
print(f'Downloaded path: {downloaded_model_path}')
print('Initializing model...')
llm = Llama(
    model_path=downloaded_model_path,
    n_ctx=4096,        # context window size
    n_threads=10,      # CPU threads for layers not offloaded to the GPU
    n_gpu_layers=100,  # offload (up to) all layers to the GPU
    n_batch=512,       # prompt-processing batch size
    # Note: temp, n_predict and n_keep from the original are llama.cpp runtime/sampling
    # options, not Llama() constructor arguments; temperature is set per call below.
)
print('Model loaded.')
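# Chat handler for gr.ChatInterface: receives the latest user message and the chat
# history, and returns the model's reply as a plain string.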
def mix_query(query, history):
    # Wrap the user message in the Mistral-style [INST] ... [/INST] prompt format;
    # the chat history is ignored, so each turn is answered independently.
    output = llm(
        f"[INST] {query} [/INST]",
        max_tokens=1024,
        temperature=0.7,  # generation-time temperature (moved from the constructor)
        stop=["</s>"],
        echo=False
    )
    reply = output['choices'][0]['text']
    print(reply)
    return reply
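# Minimal chat UI around mix_query; listens on all interfaces and creates a public share link.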
demo = gr.ChatInterface(fn=mix_query,
                        examples=["Explain the Fermi paradox"],
                        title="TARS",
                        theme="soft")
demo.launch(share=True, server_name='0.0.0.0')
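# Example launch on a GPU machine (assumes llama-cpp-python was built with GPU support,
# e.g. CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python;
# the exact CMake flag depends on the llama-cpp-python version):
#   python gradio_server.py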