Upload 3 files

- app.py: +1 -1
- llmdolphin.py: +195 -184

app.py CHANGED
@@ -221,7 +221,7 @@ with gr.Blocks(fill_width=True, elem_id="container", css=css, delete_cache=(60,
     ).success(
         fn=dolphin_respond_auto,
         inputs=[prompt, chatbot],
-        outputs=[chatbot, result],
+        outputs=[chatbot, result, prompt],
         queue=True,
         show_progress="full",
         show_api=False,
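dolphin_respond_auto now has a third output slot, so each value it yields must map positionally onto [chatbot, result, prompt]; the matching handler change in llmdolphin.py yields [(outputs, None)], gr.update(), gr.update(). A minimal, self-contained sketch of that wiring with a stand-in handler and UI (not the Space's actual app.py), assuming the tuples-style Chatbot history used here (Gradio 4.x):

import gradio as gr

def dolphin_respond_auto(prompt, chat_history):
    # Hypothetical stand-in for the real handler in llmdolphin.py.
    chat_history = (chat_history or []) + [(prompt, "")]
    for chunk in ("draft", "draft, refined"):
        chat_history[-1] = (prompt, chunk)
        # One yielded value per output component: chatbot, result, prompt.
        yield chat_history, gr.update(), gr.update()
    # The third slot is what lets the handler touch the prompt box (e.g. re-enable it).
    yield chat_history, gr.update(value=chat_history[-1][1]), gr.update(interactive=True)

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    chatbot = gr.Chatbot()
    result = gr.Textbox(label="Result")
    run = gr.Button("Run")
    run.click(lambda: None, None, None).success(   # placeholder first event in the chain
        fn=dolphin_respond_auto,
        inputs=[prompt, chatbot],
        outputs=[chatbot, result, prompt],  # prompt added as a third output, as in this hunk
        queue=True,
        show_progress="full",
        show_api=False,
    )

if __name__ == "__main__":
    demo.launch()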
llmdolphin.py CHANGED
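For orientation, the llama-cpp-agent construction sequence that the hunks below re-indent into try/except blocks, condensed into one function. This is an illustrative sketch assembled from calls visible in the diff; the import paths, the CHATML formatter choice, and the literal sampling values are assumptions, not the file's exact code.

from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles

def stream_reply(model_path: str, system_message: str, message: str,
                 history: list[tuple[str, str]]):
    # Load the GGUF model and wrap it in a provider the agent can drive.
    llm = Llama(model_path=model_path, flash_attn=True,
                n_gpu_layers=81, n_batch=1024, n_ctx=8192)
    provider = LlamaCppPythonProvider(llm)
    agent = LlamaCppAgent(
        provider,
        system_prompt=system_message,
        predefined_messages_formatter_type=MessagesFormatterType.CHATML,  # assumed formatter
        debug_output=False,
    )

    # Sampling knobs; the real handlers forward these from the UI sliders.
    settings = provider.get_provider_default_settings()
    settings.temperature = 0.7
    settings.max_tokens = 512
    settings.stream = True

    # Rebuild the chat history in the structure the agent expects.
    messages = BasicChatHistory()
    for user_msg, bot_msg in history:
        messages.add_message({'role': Roles.user, 'content': user_msg})
        messages.add_message({'role': Roles.assistant, 'content': bot_msg})

    # Stream text chunks back to the caller, as the Gradio handlers do.
    for chunk in agent.get_chat_response(message, llm_sampling_settings=settings,
                                         chat_history=messages,
                                         returns_streaming_generator=True,
                                         print_output=False):
        yield chunk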
@@ -976,19 +976,19 @@ def add_dolphin_models(query, format_name):
         if s and "" in s: s.remove("")
         if len(s) == 1:
             repo = s[0]
-            if not api.repo_exists(repo_id = repo): return gr.update(
+            if not api.repo_exists(repo_id = repo): return gr.update()
             files = api.list_repo_files(repo_id = repo)
             for file in files:
                 if str(file).endswith(".gguf"): add_models[filename] = [repo, format]
         elif len(s) >= 2:
             repo = s[0]
             filename = s[1]
-            if not api.repo_exists(repo_id = repo) or not api.file_exists(repo_id = repo, filename = filename): return gr.update(
+            if not api.repo_exists(repo_id = repo) or not api.file_exists(repo_id = repo, filename = filename): return gr.update()
             add_models[filename] = [repo, format]
-        else: return gr.update(
+        else: return gr.update()
     except Exception as e:
         print(e)
-        return gr.update(
+        return gr.update()
     llm_models = (llm_models | add_models).copy()
     update_llm_model_tupled_list()
     choices = get_dolphin_models()
@@ -1235,84 +1235,89 @@ def dolphin_respond(
     repeat_penalty: float = 1.1,
     progress=gr.Progress(track_tqdm=True),
 ):
-
+    try:
+        progress(0, desc="Processing...")
+
+        if override_llm_format:
+            chat_template = override_llm_format
+        else:
+            chat_template = llm_models[model][1]
+
+        llm = Llama(
+            model_path=str(Path(f"{llm_models_dir}/{model}")),
+            flash_attn=True,
+            n_gpu_layers=81, # 81
+            n_batch=1024,
+            n_ctx=8192, #8192
+        )
+        provider = LlamaCppPythonProvider(llm)
+
+        agent = LlamaCppAgent(
+            provider,
+            system_prompt=f"{system_message}",
+            predefined_messages_formatter_type=chat_template if not isinstance(chat_template, MessagesFormatter) else None,
+            custom_messages_formatter=chat_template if isinstance(chat_template, MessagesFormatter) else None,
+            debug_output=False
+        )
+
+        settings = provider.get_provider_default_settings()
+        settings.temperature = temperature
+        settings.top_k = top_k
+        settings.top_p = top_p
+        settings.max_tokens = max_tokens
+        settings.repeat_penalty = repeat_penalty
+        settings.stream = True
+
+        messages = BasicChatHistory()
+
+        for msn in history:
+            user = {
+                'role': Roles.user,
+                'content': msn[0]
+            }
+            assistant = {
+                'role': Roles.assistant,
+                'content': msn[1]
+            }
+            messages.add_message(user)
+            messages.add_message(assistant)
+
+        stream = agent.get_chat_response(
+            message,
+            llm_sampling_settings=settings,
+            chat_history=messages,
+            returns_streaming_generator=True,
+            print_output=False
+        )
+
+        progress(0.5, desc="Processing...")
 
-    if override_llm_format:
-        chat_template = override_llm_format
-    else:
-        chat_template = llm_models[model][1]
-
-    llm = Llama(
-        model_path=str(Path(f"{llm_models_dir}/{model}")),
-        flash_attn=True,
-        n_gpu_layers=81, # 81
-        n_batch=1024,
-        n_ctx=8192, #8192
-    )
-    provider = LlamaCppPythonProvider(llm)
-
-    agent = LlamaCppAgent(
-        provider,
-        system_prompt=f"{system_message}",
-        predefined_messages_formatter_type=chat_template if not isinstance(chat_template, MessagesFormatter) else None,
-        custom_messages_formatter=chat_template if isinstance(chat_template, MessagesFormatter) else None,
-        debug_output=False
-    )
-
-    settings = provider.get_provider_default_settings()
-    settings.temperature = temperature
-    settings.top_k = top_k
-    settings.top_p = top_p
-    settings.max_tokens = max_tokens
-    settings.repeat_penalty = repeat_penalty
-    settings.stream = True
-
-    messages = BasicChatHistory()
-
-    for msn in history:
-        user = {
-            'role': Roles.user,
-            'content': msn[0]
-        }
-        assistant = {
-            'role': Roles.assistant,
-            'content': msn[1]
-        }
-        messages.add_message(user)
-        messages.add_message(assistant)
-
-    stream = agent.get_chat_response(
-        message,
-        llm_sampling_settings=settings,
-        chat_history=messages,
-        returns_streaming_generator=True,
-        print_output=False
-    )
-
-    progress(0.5, desc="Processing...")
-
-    outputs = ""
-    for output in stream:
-        outputs += output
-        yield [(outputs, None)]
+        outputs = ""
+        for output in stream:
+            outputs += output
+            yield [(outputs, None)]
+    except Exception as e:
+        print(e)
+        yield [("", None)]
 
 
 def dolphin_parse(
     history: list[tuple[str, str]],
 ):
-    if dolphin_sysprompt_mode == "Chat with LLM" or not history or len(history) < 1:
-        return "", gr.update(visible=True), gr.update(visible=True)
     try:
+        if dolphin_sysprompt_mode == "Chat with LLM" or not history or len(history) < 1:
+            return "", gr.update(), gr.update()
         msg = history[-1][0]
         raw_prompt = get_raw_prompt(msg)
-
-
-
-
-
-
-
-
+        prompts = []
+        if dolphin_sysprompt_mode == "Japanese to Danbooru Dictionary" and is_japanese(raw_prompt):
+            prompts = list_uniq(jatags_to_danbooru_tags(to_list_ja(raw_prompt)) + ["nsfw", "explicit"])
+        else:
+            prompts = list_uniq(to_list(raw_prompt) + ["nsfw", "explicit"])
+        return ", ".join(prompts), gr.update(interactive=True), gr.update(interactive=True)
+    except Exception as e:
+        print(e)
+        return "", gr.update(), gr.update()
 
 
 @torch.inference_mode()
@@ -1329,87 +1334,92 @@ def dolphin_respond_auto(
     repeat_penalty: float = 1.1,
     progress=gr.Progress(track_tqdm=True),
 ):
-
-
+    try:
+        #if not is_japanese(message): return [(None, None)]
+        progress(0, desc="Processing...")
+
+        if override_llm_format:
+            chat_template = override_llm_format
+        else:
+            chat_template = llm_models[model][1]
+
+        llm = Llama(
+            model_path=str(Path(f"{llm_models_dir}/{model}")),
+            flash_attn=True,
+            n_gpu_layers=81, # 81
+            n_batch=1024,
+            n_ctx=8192, #8192
+        )
+        provider = LlamaCppPythonProvider(llm)
+
+        agent = LlamaCppAgent(
+            provider,
+            system_prompt=f"{system_message}",
+            predefined_messages_formatter_type=chat_template if not isinstance(chat_template, MessagesFormatter) else None,
+            custom_messages_formatter=chat_template if isinstance(chat_template, MessagesFormatter) else None,
+            debug_output=False
+        )
+
+        settings = provider.get_provider_default_settings()
+        settings.temperature = temperature
+        settings.top_k = top_k
+        settings.top_p = top_p
+        settings.max_tokens = max_tokens
+        settings.repeat_penalty = repeat_penalty
+        settings.stream = True
+
+        messages = BasicChatHistory()
+
+        for msn in history:
+            user = {
+                'role': Roles.user,
+                'content': msn[0]
+            }
+            assistant = {
+                'role': Roles.assistant,
+                'content': msn[1]
+            }
+            messages.add_message(user)
+            messages.add_message(assistant)
+
+        progress(0, desc="Translating...")
+        stream = agent.get_chat_response(
+            message,
+            llm_sampling_settings=settings,
+            chat_history=messages,
+            returns_streaming_generator=True,
+            print_output=False
+        )
 
-    if override_llm_format:
-        chat_template = override_llm_format
-    else:
-        chat_template = llm_models[model][1]
-
-    llm = Llama(
-        model_path=str(Path(f"{llm_models_dir}/{model}")),
-        flash_attn=True,
-        n_gpu_layers=81, # 81
-        n_batch=1024,
-        n_ctx=8192, #8192
-    )
-    provider = LlamaCppPythonProvider(llm)
-
-    agent = LlamaCppAgent(
-        provider,
-        system_prompt=f"{system_message}",
-        predefined_messages_formatter_type=chat_template if not isinstance(chat_template, MessagesFormatter) else None,
-        custom_messages_formatter=chat_template if isinstance(chat_template, MessagesFormatter) else None,
-        debug_output=False
-    )
-
-    settings = provider.get_provider_default_settings()
-    settings.temperature = temperature
-    settings.top_k = top_k
-    settings.top_p = top_p
-    settings.max_tokens = max_tokens
-    settings.repeat_penalty = repeat_penalty
-    settings.stream = True
-
-    messages = BasicChatHistory()
-
-    for msn in history:
-        user = {
-            'role': Roles.user,
-            'content': msn[0]
-        }
-        assistant = {
-            'role': Roles.assistant,
-            'content': msn[1]
-        }
-        messages.add_message(user)
-        messages.add_message(assistant)
-
-    progress(0, desc="Translating...")
-    stream = agent.get_chat_response(
-        message,
-        llm_sampling_settings=settings,
-        chat_history=messages,
-        returns_streaming_generator=True,
-        print_output=False
-    )
-
-    progress(0.5, desc="Processing...")
-
-    outputs = ""
-    for output in stream:
-        outputs += output
-        yield [(outputs, None)], gr.update()
+        progress(0.5, desc="Processing...")
+
+        outputs = ""
+        for output in stream:
+            outputs += output
+            yield [(outputs, None)], gr.update(), gr.update()
+    except Exception as e:
+        print(e)
+        yield [("", None)], gr.update(), gr.update()
 
 
 def dolphin_parse_simple(
     message: str,
     history: list[tuple[str, str]],
 ):
-    #if not is_japanese(message): return message
-    if dolphin_sysprompt_mode == "Chat with LLM" or not history or len(history) < 1: return message
     try:
+        #if not is_japanese(message): return message
+        if dolphin_sysprompt_mode == "Chat with LLM" or not history or len(history) < 1: return message
         msg = history[-1][0]
         raw_prompt = get_raw_prompt(msg)
-
+        prompts = []
+        if dolphin_sysprompt_mode == "Japanese to Danbooru Dictionary" and is_japanese(raw_prompt):
+            prompts = list_uniq(jatags_to_danbooru_tags(to_list_ja(raw_prompt)) + ["nsfw", "explicit", "rating_explicit"])
+        else:
+            prompts = list_uniq(to_list(raw_prompt) + ["nsfw", "explicit", "rating_explicit"])
+        return ", ".join(prompts)
+    except Exception as e:
+        print(e)
         return ""
-    prompts = []
-    if dolphin_sysprompt_mode == "Japanese to Danbooru Dictionary" and is_japanese(raw_prompt):
-        prompts = list_uniq(jatags_to_danbooru_tags(to_list_ja(raw_prompt)) + ["nsfw", "explicit", "rating_explicit"])
-    else:
-        prompts = list_uniq(to_list(raw_prompt) + ["nsfw", "explicit", "rating_explicit"])
-    return ", ".join(prompts)
 
 
 # https://huggingface.co/spaces/CaioXapelaum/GGUF-Playground
@@ -1430,47 +1440,47 @@ def respond_playground(
     top_k,
     repeat_penalty,
 ):
-    if override_llm_format:
-        chat_template = override_llm_format
-    else:
-        chat_template = llm_models[model][1]
-
-    llm = Llama(
-        model_path=str(Path(f"{llm_models_dir}/{model}")),
-        flash_attn=True,
-        n_gpu_layers=81, # 81
-        n_batch=1024,
-        n_ctx=8192, #8192
-    )
-    provider = LlamaCppPythonProvider(llm)
-
-    agent = LlamaCppAgent(
-        provider,
-        system_prompt=f"{system_message}",
-        predefined_messages_formatter_type=chat_template if not isinstance(chat_template, MessagesFormatter) else None,
-        custom_messages_formatter=chat_template if isinstance(chat_template, MessagesFormatter) else None,
-        debug_output=False
-    )
-
-    settings = provider.get_provider_default_settings()
-    settings.temperature = temperature
-    settings.top_k = top_k
-    settings.top_p = top_p
-    settings.max_tokens = max_tokens
-    settings.repeat_penalty = repeat_penalty
-    settings.stream = True
-
-    messages = BasicChatHistory()
-
-    # Add user and assistant messages to the history
-    for msn in history:
-        user = {'role': Roles.user, 'content': msn[0]}
-        assistant = {'role': Roles.assistant, 'content': msn[1]}
-        messages.add_message(user)
-        messages.add_message(assistant)
-
-    # Stream the response
     try:
+        if override_llm_format:
+            chat_template = override_llm_format
+        else:
+            chat_template = llm_models[model][1]
+
+        llm = Llama(
+            model_path=str(Path(f"{llm_models_dir}/{model}")),
+            flash_attn=True,
+            n_gpu_layers=81, # 81
+            n_batch=1024,
+            n_ctx=8192, #8192
+        )
+        provider = LlamaCppPythonProvider(llm)
+
+        agent = LlamaCppAgent(
+            provider,
+            system_prompt=f"{system_message}",
+            predefined_messages_formatter_type=chat_template if not isinstance(chat_template, MessagesFormatter) else None,
+            custom_messages_formatter=chat_template if isinstance(chat_template, MessagesFormatter) else None,
+            debug_output=False
+        )
+
+        settings = provider.get_provider_default_settings()
+        settings.temperature = temperature
+        settings.top_k = top_k
+        settings.top_p = top_p
+        settings.max_tokens = max_tokens
+        settings.repeat_penalty = repeat_penalty
+        settings.stream = True
+
+        messages = BasicChatHistory()
+
+        # Add user and assistant messages to the history
+        for msn in history:
+            user = {'role': Roles.user, 'content': msn[0]}
+            assistant = {'role': Roles.assistant, 'content': msn[1]}
+            messages.add_message(user)
+            messages.add_message(assistant)
+
+        # Stream the response
         stream = agent.get_chat_response(
             message,
             llm_sampling_settings=settings,
@@ -1484,4 +1494,5 @@ def respond_playground(
             outputs += output
             yield outputs
     except Exception as e:
-
+        print(e)
+        yield ""
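The recurring pattern across these hunks: each handler's body is wrapped in try/except so a failure is printed and a harmless fallback is yielded (or a no-op gr.update() returned) instead of propagating out of the Gradio queue. A minimal sketch of that shape, using hypothetical stand-in helpers rather than the file's real ones:

import gradio as gr

def stream_tokens(message: str):
    # Hypothetical stand-in for agent.get_chat_response(..., returns_streaming_generator=True).
    for token in message.split():
        yield token + " "

def respond(message: str, history: list[tuple[str, str]]):
    try:
        outputs = ""
        for output in stream_tokens(message):
            outputs += output
            yield [(outputs, None)]        # stream partial text into the Chatbot
    except Exception as e:
        print(e)                           # log the error...
        yield [("", None)]                 # ...and yield a harmless fallback instead of raising

def check_repo(repo_exists: bool):
    # Non-streaming variant: a failed validation returns gr.update(), i.e. "leave the
    # component unchanged", mirroring the add_dolphin_models changes above.
    if not repo_exists:
        return gr.update()
    return gr.update(choices=["model.gguf"], value="model.gguf")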