CaioXapelaum committed
Commit 7e8b5c9 · verified · 1 Parent(s): bdcfa4c

Update app.py

Files changed (1): app.py (+181 -28)
app.py CHANGED
@@ -1,48 +1,201 @@
-import os
-import gradio as gr
-from llama_cpp import Llama
 import spaces
+import subprocess
+from llama_cpp import Llama
+from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
+from llama_cpp_agent.providers import LlamaCppPythonProvider
+from llama_cpp_agent.chat_history import BasicChatHistory
+from llama_cpp_agent.chat_history.messages import Roles
+import gradio as gr
+from huggingface_hub import hf_hub_download
+import os
+import cv2
+
+huggingface_token = os.environ.get('HF_TOKEN')
 
-llm = None  # Global model variable
+# Download the Meta-Llama-3.1-8B-Instruct model
+hf_hub_download(
+    repo_id="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
+    filename="Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf",
+    local_dir="./models",
+    token=huggingface_token
+)
+
+hf_hub_download(
+    repo_id="bartowski/Mistral-Nemo-Instruct-2407-GGUF",
+    filename="Mistral-Nemo-Instruct-2407-Q5_K_M.gguf",
+    local_dir="./models",
+    token=huggingface_token
+)
+
+hf_hub_download(
+    repo_id="bartowski/gemma-2-2b-it-GGUF",
+    filename="gemma-2-2b-it-Q6_K_L.gguf",
+    local_dir="./models",
+    token=huggingface_token
+)
+
+hf_hub_download(
+    repo_id="bartowski/openchat-3.6-8b-20240522-GGUF",
+    filename="openchat-3.6-8b-20240522-Q6_K.gguf",
+    local_dir="./models",
+    token=huggingface_token
+)
+
+hf_hub_download(
+    repo_id="bartowski/Llama-3-Groq-8B-Tool-Use-GGUF",
+    filename="Llama-3-Groq-8B-Tool-Use-Q6_K.gguf",
+    local_dir="./models",
+    token=huggingface_token
+)
+
+hf_hub_download(
+    repo_id="bartowski/MiniCPM-V-2_6-GGUF",
+    filename="MiniCPM-V-2_6-Q6_K.gguf",
+    local_dir="./models",
+    token=huggingface_token
+)
+
+hf_hub_download(
+    repo_id="CaioXapelaum/Llama-3.1-Storm-8B-Q5_K_M-GGUF",
+    filename="llama-3.1-storm-8b-q5_k_m.gguf",
+    local_dir="./models",
+    token=huggingface_token
+)
+
+hf_hub_download(
+    repo_id="CaioXapelaum/Orca-2-7b-Patent-Instruct-Llama-2-Q5_K_M-GGUF",
+    filename="orca-2-7b-patent-instruct-llama-2-q5_k_m.gguf",
+    local_dir="./models",
+    token=huggingface_token
+)
+
+
+
+llm = None
+llm_model = None
+
+cv2.setNumThreads(1)
 
 @spaces.GPU()
-def chat(message, history):
+def respond(
+    message,
+    history: list[tuple[str, str]],
+    model,
+    system_message,
+    max_tokens,
+    temperature,
+    top_p,
+    top_k,
+    repeat_penalty,
+):
+    chat_template = MessagesFormatterType.GEMMA_2
+
     global llm
+    global llm_model
 
-    # Initialize Llama model if not already loaded
-    if llm is None:
+    # Load model only if it's not already loaded or if a new model is selected
+    if llm is None or llm_model != model:
         try:
-            llm = Llama.from_pretrained(
-                repo_id="bartowski/Meta-Llama-3.1-70B-Instruct-GGUF",
-                filename="Meta-Llama-3.1-70B-Instruct-Q4_K_S.gguf",
-                verbose=False  # Set to True if you need debug information
+            llm = Llama(
+                model_path=f"models/{model}",
+                flash_attn=True,
+                n_gpu_layers=81,  # Adjust based on available GPU resources
+                n_batch=1024,
+                n_ctx=8192,
             )
+            llm_model = model
         except Exception as e:
             return f"Error loading model: {str(e)}"
 
-    # Prepare the chat history
-    chat_history = [{"role": "user", "content": user_input} for user_input, _ in history]
-    chat_history.append({"role": "user", "content": message})
+    provider = LlamaCppPythonProvider(llm)
+
+    agent = LlamaCppAgent(
+        provider,
+        system_prompt=f"{system_message}",
+        predefined_messages_formatter_type=chat_template,
+        debug_output=True
+    )
+
+    settings = provider.get_provider_default_settings()
+    settings.temperature = temperature
+    settings.top_k = top_k
+    settings.top_p = top_p
+    settings.max_tokens = max_tokens
+    settings.repeat_penalty = repeat_penalty
+    settings.stream = True
+
+    messages = BasicChatHistory()
 
-    # Stream the response back as it's generated
+    # Add user and assistant messages to the history
+    for msn in history:
+        user = {'role': Roles.user, 'content': msn[0]}
+        assistant = {'role': Roles.assistant, 'content': msn[1]}
+        messages.add_message(user)
+        messages.add_message(assistant)
+
+    # Stream the response
     try:
-        response_stream = llm.create_chat_completion_stream(
-            messages=chat_history,
-            stream=True  # Ensure streaming is enabled
+        stream = agent.get_chat_response(
+            message,
+            llm_sampling_settings=settings,
+            chat_history=messages,
+            returns_streaming_generator=True,
+            print_output=False
         )
 
-        output = ""
-        for chunk in response_stream:
-            output += chunk['text']  # Collect and stream text chunks
-            yield output  # Stream each updated response chunk
-
+        outputs = ""
+        for output in stream:
+            outputs += output
+            yield outputs
     except Exception as e:
-        yield f"Error generating response: {str(e)}"
+        yield f"Error during response generation: {str(e)}"
 
-# Gradio Chat Interface setup
 demo = gr.ChatInterface(
-    fn=chat,  # Function to call when the user sends a message
-    title="Chat with Meta-Llama 3.1 70B",  # Title for the chat interface
+    fn=respond,
+    additional_inputs=[
+        gr.Dropdown([
+                'Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf',
+                'Mistral-Nemo-Instruct-2407-Q5_K_M.gguf',
+                'gemma-2-2b-it-Q6_K_L.gguf',
+                'openchat-3.6-8b-20240522-Q6_K.gguf',
+                'Llama-3-Groq-8B-Tool-Use-Q6_K.gguf',
+                'MiniCPM-V-2_6-Q6_K.gguf',
+                'llama-3.1-storm-8b-q5_k_m.gguf',
+                'orca-2-7b-patent-instruct-llama-2-q5_k_m.gguf'
+            ],
+            value="gemma-2-2b-it-Q6_K_L.gguf",
+            label="Model"
+        ),
+        gr.Textbox(value="You are a helpful assistant.", label="System message"),
+        gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(
+            minimum=0.1,
+            maximum=1.0,
+            value=0.95,
+            step=0.05,
+            label="Top-p",
+        ),
+        gr.Slider(
+            minimum=0,
+            maximum=100,
+            value=40,
+            step=1,
+            label="Top-k",
+        ),
+        gr.Slider(
+            minimum=0.0,
+            maximum=2.0,
+            value=1.1,
+            step=0.1,
+            label="Repetition penalty",
+        ),
+    ],
+    retry_btn="Retry",
+    undo_btn="Undo",
+    clear_btn="Clear",
+    submit_btn="Send",
+    title="Chat with lots of Models and LLMs using llama.cpp",
     chatbot=gr.Chatbot(
         scale=1,
         likeable=False,
@@ -51,4 +204,4 @@ demo = gr.ChatInterface(
     )
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
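
For reference, the load-once-then-stream pattern that the new respond() follows can be exercised outside Gradio. The sketch below is not part of the commit; it reuses the same calls that appear in the diff (hf_hub_download, Llama, LlamaCppPythonProvider, LlamaCppAgent, BasicChatHistory), but the chosen model, n_gpu_layers=0, the sampling values, and the prompt are illustrative assumptions, and it presumes llama-cpp-python, llama-cpp-agent, and huggingface_hub are installed.

# Minimal sketch (not part of this commit): download one GGUF file, load it once,
# and stream a single chat response with the same llama-cpp-agent calls app.py uses.
# Model choice, context size, sampling values, and the prompt are placeholders.
import os

from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory

# hf_hub_download returns the local path of the downloaded file; HF_TOKEN is only
# needed if the repository requires authentication.
model_path = hf_hub_download(
    repo_id="bartowski/gemma-2-2b-it-GGUF",
    filename="gemma-2-2b-it-Q6_K_L.gguf",
    local_dir="./models",
    token=os.environ.get("HF_TOKEN"),
)

# Load the model once and reuse it; n_gpu_layers=0 keeps the sketch CPU-only.
llm = Llama(model_path=model_path, n_ctx=8192, n_gpu_layers=0)

provider = LlamaCppPythonProvider(llm)
agent = LlamaCppAgent(
    provider,
    system_prompt="You are a helpful assistant.",
    predefined_messages_formatter_type=MessagesFormatterType.GEMMA_2,
)

# Sampling settings mirror the sliders exposed in the Gradio UI.
settings = provider.get_provider_default_settings()
settings.temperature = 0.7
settings.top_p = 0.95
settings.top_k = 40
settings.repeat_penalty = 1.1
settings.max_tokens = 256
settings.stream = True

# Stream tokens to stdout as they are generated, starting from an empty history.
history = BasicChatHistory()
for chunk in agent.get_chat_response(
    "Explain in one sentence what a GGUF file is.",
    llm_sampling_settings=settings,
    chat_history=history,
    returns_streaming_generator=True,
    print_output=False,
):
    print(chunk, end="", flush=True)
print()

Keeping the Llama instance in a module-level global, as the updated app.py does, avoids reloading the weights on every chat turn; the sketch gets the same effect by constructing it once at the top of the script.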