tdlvjin committed on
Commit 665afe9 · verified · 1 Parent(s): a7e01e3

Create gradio_server.py

Files changed (1)
  1. gradio_server.py +54 -0
gradio_server.py ADDED
@@ -0,0 +1,54 @@
+ 
+ import gradio as gr
+ from llama_cpp import Llama
+ from huggingface_hub import hf_hub_download
+ import os
+ 
+ import torch
+ print(f"Is CUDA available: {torch.cuda.is_available()}")
+ # Expected: True on a GPU machine
+ print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
+ # print(f"CMAKE_ARGS={os.environ['CMAKE_ARGS']}")
+ # print(f"FORCE_CMAKE={os.environ['FORCE_CMAKE']}")
+ print(f'Llama={Llama.__name__}')
+ 
+ # Local directory where the GGUF model file is stored
+ models_path = 'models/'
+ 
+ if not os.path.exists(models_path):
+     os.makedirs(models_path)
+ 
+ # Download the quantized model from the Hugging Face Hub
+ downloaded_model_path = hf_hub_download(repo_id="miqudev/miqu-1-70b",
+                                         filename="miqu-1-70b.q4_k_m.gguf", local_dir=models_path)
+ 
+ print(f'Downloaded path: {downloaded_model_path}')
+ 
+ print('Initializing model...')
+ llm = Llama(
+     model_path=downloaded_model_path,
+     n_ctx=4096,        # context window size
+     n_threads=10,      # CPU threads
+     n_gpu_layers=100,  # offload all layers to the GPU
+     temp=0.7,
+     n_batch=512,
+     n_predict=-1,
+     n_keep=0
+ )
+ print('Model loaded.')
+ 
+ def mix_query(query, history):
+     # Wrap the user message in the [INST] ... [/INST] instruction format
+     output = llm(
+         f"[INST] {query} [/INST]",
+         max_tokens=1024,
+         stop=["</s>"],
+         echo=False
+     )
+     print(output['choices'][0]['text'])
+     return output['choices'][0]['text']
+ 
+ demo = gr.ChatInterface(fn=mix_query,
+                         examples=["Explain the Fermi paradox"], title="TARS",
+                         theme="soft")
+ demo.launch(share=True, server_name='0.0.0.0')