v5
modules/pmbl.py  CHANGED  (+41 -21)
@@ -72,7 +72,7 @@ class PMBL:
         except Exception as e:
             print(f"Error preparing model: {e}")
             # Fall back to using a smaller model that's more easily handled
-            return
+            return None
 
     def _load_history_sync(self):
         """Load chat history from local file"""
@@ -176,25 +176,16 @@ class PMBL:
         )
 
         try:
-
-
-
-
-
-
-
-
-
-
-            else:
-                # Use our merged Qwen model
-                print(f"Loading model from: {self.prepared_model_path}")
-                model = AutoModelForCausalLM.from_pretrained(
-                    model_file=self.prepared_model_path,
-                    model_type="llama",
-                    gpu_layers=50,
-                    context_length=n_ctx
-                )
+            print(f"Loading model from: {self.prepared_model_path}")
+
+            # The correct way to load a local model file with ctransformers
+            model = AutoModelForCausalLM.from_pretrained(
+                "TheBloke/Llama-2-7B-Chat-GGUF",  # This is just a placeholder, we'll use the local file
+                model_file=self.prepared_model_path,  # Specify the actual file to use
+                model_type="llama",
+                gpu_layers=50,
+                context_length=n_ctx
+            )
 
             # Generate response with streaming
             response = model(
@@ -213,7 +204,36 @@
 
         except Exception as e:
             print(f"Error generating response: {e}")
-
+
+            # Fall back to the smaller model
+            try:
+                fallback_model = AutoModelForCausalLM.from_pretrained(
+                    "TheBloke/Llama-2-7B-Chat-GGUF",
+                    model_type="llama",
+                    gpu_layers=50,
+                    context_length=n_ctx
+                )
+
+                fallback_response = fallback_model(
+                    system_prompt,
+                    max_new_tokens=1024,
+                    temperature=0.6,
+                    top_p=0.95,
+                    top_k=30,
+                    stop=["</s>", "\nUser:", "\nuser:", "\nSystem:", "\nsystem:"],
+                    stream=True
+                )
+
+                # First yield an error message
+                yield f"I encountered an error with the primary model, switching to backup: {str(e)}\n\n"
+
+                # Then yield the fallback model's response
+                for chunk in fallback_response:
+                    yield chunk
+
+            except Exception as fallback_error:
+                # If even the fallback fails, return a simple error message
+                yield f"I'm sorry, both models encountered errors. Original error: {str(e)}. Fallback error: {str(fallback_error)}. Please try again with a simpler query."
 
     def save_chat(self, prompt, response):
         """Save chat to history"""
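A note on the loading call in the second hunk: ctransformers' AutoModelForCausalLM.from_pretrained accepts either a Hub repo id or a local model file path as its first argument, with model_type, gpu_layers, and context_length passed as config options, and the returned model streams tokens when called with stream=True. A minimal, self-contained sketch of that usage; the path, prompt, and N_CTX value are illustrative stand-ins, not values taken from pmbl.py:

from ctransformers import AutoModelForCausalLM

# Illustrative stand-ins for self.prepared_model_path and n_ctx in pmbl.py.
LOCAL_GGUF = "models/prepared-model.gguf"
N_CTX = 4096

# Load the local GGUF file; model_type, gpu_layers and context_length mirror
# the options used in the commit (gpu_layers only has an effect with a
# CUDA/Metal build of ctransformers).
llm = AutoModelForCausalLM.from_pretrained(
    LOCAL_GGUF,
    model_type="llama",
    gpu_layers=50,
    context_length=N_CTX,
)

# Stream tokens, analogous to the response = model(...) call in the diff.
for token in llm("User: hello\nAssistant:", max_new_tokens=64, stream=True):
    print(token, end="", flush=True)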
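The third hunk turns the generation error handler into a two-step fallback: report the primary failure to the caller, stream from the smaller Hub-hosted model, and only give up if that also fails. The same shape can be sketched as a standalone generator; stream_with_fallback and load_primary are hypothetical names used only for illustration, and the pinned model_file is an assumption, since the commit does not say which quantization to pull from the fallback repo:

from ctransformers import AutoModelForCausalLM

FALLBACK_REPO = "TheBloke/Llama-2-7B-Chat-GGUF"   # same fallback repo as the commit
FALLBACK_FILE = "llama-2-7b-chat.Q4_K_M.gguf"     # assumed quantization; not named in the commit


def load_primary():
    # Hypothetical stand-in for however pmbl.py loads its prepared model;
    # raising here exercises the fallback path.
    raise RuntimeError("primary model unavailable")


def stream_with_fallback(prompt, n_ctx=4096):
    """Yield response chunks, switching to a smaller model if the primary fails."""
    try:
        model = load_primary()
        yield from model(prompt, max_new_tokens=1024, stream=True)
    except Exception as primary_error:
        # Surface the failure to the caller first, as the commit does.
        yield f"Primary model failed ({primary_error}); switching to backup.\n\n"
        try:
            fallback = AutoModelForCausalLM.from_pretrained(
                FALLBACK_REPO,
                model_file=FALLBACK_FILE,
                model_type="llama",
                gpu_layers=50,
                context_length=n_ctx,
            )
            yield from fallback(prompt, max_new_tokens=1024, stream=True)
        except Exception as fallback_error:
            yield (f"Both models failed. Primary error: {primary_error}. "
                   f"Fallback error: {fallback_error}.")


# A caller drains the generator chunk by chunk (this downloads the fallback
# model from the Hub on first run):
for chunk in stream_with_fallback("User: hello\nAssistant:"):
    print(chunk, end="", flush=True)

Yielding the error text instead of raising keeps the streaming contract intact, so whatever consumes the generator does not need a separate error path.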