tdlvjin committed on
Commit 665afe9 · verified · 1 Parent(s): a7e01e3

Create gradio_server.py

Files changed (1)
  1. gradio_server.py +54 -0
gradio_server.py ADDED
@@ -0,0 +1,54 @@
+ 
+ import gradio as gr
+ from llama_cpp import Llama
+ from huggingface_hub import hf_hub_download
+ import os
+ 
+ import torch
+ print(f"Is CUDA available: {torch.cuda.is_available()}")
+ # Expected: True on a GPU machine
+ print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
+ # print(f"CMAKE_ARGS={os.environ['CMAKE_ARGS']}")
+ # print(f"FORCE_CMAKE={os.environ['FORCE_CMAKE']}")
+ print(f'Llama={Llama.__name__}')
+ 
+ # Local directory where the GGUF model file is stored
+ models_path = 'models/'
+ 
+ if not os.path.exists(models_path):
+     os.makedirs(models_path)
+ 
+ # Download the quantized model from the Hugging Face Hub
+ downloaded_model_path = hf_hub_download(repo_id="miqudev/miqu-1-70b",
+                                         filename="miqu-1-70b.q4_k_m.gguf", local_dir=models_path)
+ 
+ print(f'Downloaded path: {downloaded_model_path}')
+ 
+ print('Initializing model...')
+ llm = Llama(
+     model_path=downloaded_model_path,
+     n_ctx=4096,        # context window size
+     n_threads=10,      # CPU threads
+     n_gpu_layers=100,  # offload all layers to the GPU
+     temp=0.7,
+     n_batch=512,
+     n_predict=-1,
+     n_keep=0
+ )
+ print('Model loaded.')
+ 
+ def mix_query(query, history):
+     # Wrap the user message in the [INST] ... [/INST] instruction format
+     output = llm(
+         f"[INST] {query} [/INST]",
+         max_tokens=1024,
+         stop=["</s>"],
+         echo=False
+     )
+     print(output['choices'][0]['text'])
+     return output['choices'][0]['text']
+ 
+ demo = gr.ChatInterface(fn=mix_query,
+                         examples=["Explain the Fermi paradox"], title="TARS",
+                         theme="soft")
+ demo.launch(share=True, server_name='0.0.0.0')