multi chatbot
Browse files
- app-old.py +78 -0
- app.py +76 -7
- local.ipynb → debug.ipynb +30 -58
- local-requirements.txt → finetuning-requirements.txt +0 -0
app-old.py
ADDED
@@ -0,0 +1,78 @@
+from llama_cpp import Llama
+import gradio as gr
+
+llm = Llama.from_pretrained(
+    repo_id="Robzy/Llama-3.2-1B-Instruct-Finetuned-q4_k_m",
+    filename="unsloth.Q4_K_M.gguf",
+)
+
+llm2 = Llama.from_pretrained(
+    repo_id="Robzy/Llama-3.2-1B-Instruct-Finetuned-16bit",
+    filename="unsloth.F16.gguf",
+)
+
+def predict(message, history):
+    messages = [{"role": "system", "content": "You are a helpful assistant."}]
+    for user_message, bot_message in history:
+        if user_message:
+            messages.append({"role": "user", "content": user_message})
+        if bot_message:
+            messages.append({"role": "assistant", "content": bot_message})
+    messages.append({"role": "user", "content": message})
+
+    response = ""
+    for chunk in llm.create_chat_completion(
+        stream=True,
+        messages=messages,
+    ):
+        part = chunk["choices"][0]["delta"].get("content", None)
+        if part:
+            response += part
+            yield response
+
+
+def predict2(message, history):
+    messages = [{"role": "system", "content": "You are a helpful assistant."}]
+    for user_message, bot_message in history:
+        if user_message:
+            messages.append({"role": "user", "content": user_message})
+        if bot_message:
+            messages.append({"role": "assistant", "content": bot_message})
+    messages.append({"role": "user", "content": message})
+
+    response = ""
+    for chunk in llm2.create_chat_completion(
+        stream=True,
+        messages=messages,
+    ):
+        part = chunk["choices"][0]["delta"].get("content", None)
+        if part:
+            response += part
+            yield response
+
+
+chat1 = gr.ChatInterface(predict, title="4-bit")
+chat2 = gr.ChatInterface(predict2, title="16-bit")
+chat3 = gr.ChatInterface(predict2, title="16-bit")
+
+def update_chat(value):
+    if value == "4-bit":
+        chat1.render(visible=True)
+        chat2.render(visible=False)
+        chat3.render(visible=False)
+    elif value == "16-bit":
+        chat1.render(visible=False)
+        chat2.render(visible=True)
+        chat3.render(visible=False)
+    else:
+        chat1.render(visible=False)
+        chat2.render(visible=False)
+        chat3.render(visible=True)
+
+with gr.Blocks() as demo:
+
+    gr.Markdown("# Quantized Llama Comparison for Code Generation")
+    dropdown = gr.Dropdown(["4-bit", "16-bit", "32-bit"], label="Choose model version", value="4-bit")
+    dropdown.change(fn=update_chat, inputs=dropdown, outputs=[chat1, chat2, chat3])
+
+demo.launch()
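Note on app-old.py (not part of the committed code): the dropdown callback update_chat calls chatX.render(visible=...), but render() is for placing a component in a layout, not for toggling visibility from an event handler, which is presumably why app.py below switches to tabs. A minimal sketch of how the dropdown approach could be kept, assuming a recent Gradio 4.x API; the echo placeholder function and the variable names here are illustrative, not from this repository:

import gradio as gr

def echo(message, history):
    return message  # placeholder for the llama.cpp streaming predict functions

labels = ["4-bit", "16-bit", "32-bit"]
chats = {label: gr.ChatInterface(echo, title=label) for label in labels}

def show_selected(choice):
    # Return one gr.update per wrapper column; only the selected chat stays visible.
    return [gr.update(visible=(label == choice)) for label in labels]

with gr.Blocks() as demo:
    dropdown = gr.Dropdown(labels, label="Choose model version", value="4-bit")
    columns = []
    for label in labels:
        with gr.Column(visible=(label == "4-bit")) as col:
            chats[label].render()
        columns.append(col)
    dropdown.change(fn=show_selected, inputs=dropdown, outputs=columns)

demo.launch()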
app.py
CHANGED
@@ -1,12 +1,19 @@
-from llama_cpp import Llama
 import gradio as gr
+from llama_cpp import Llama
 
+# Load models
 llm = Llama.from_pretrained(
-
-
+    repo_id="Robzy/Llama-3.2-1B-Instruct-Finetuned-q4_k_m",
+    filename="unsloth.Q4_K_M.gguf",
+)
+
+llm2 = Llama.from_pretrained(
+    repo_id="Robzy/Llama-3.2-1B-Instruct-Finetuned-16bit",
+    filename="unsloth.F16.gguf",
 )
 
-def predict(message, history):
+# Define prediction functions
+def predict(message, history, model):
     messages = [{"role": "system", "content": "You are a helpful assistant."}]
     for user_message, bot_message in history:
         if user_message:
@@ -25,7 +32,69 @@ def predict(message, history):
             response += part
             yield response
 
-demo = gr.ChatInterface(predict)
 
-
-
+def predict2(message, history, model):
+    messages = [{"role": "system", "content": "You are a helpful assistant."}]
+    for user_message, bot_message in history:
+        if user_message:
+            messages.append({"role": "user", "content": user_message})
+        if bot_message:
+            messages.append({"role": "assistant", "content": bot_message})
+    messages.append({"role": "user", "content": message})
+
+    response = ""
+    for chunk in llm2.create_chat_completion(
+        stream=True,
+        messages=messages,
+    ):
+        part = chunk["choices"][0]["delta"].get("content", None)
+        if part:
+            response += part
+            yield response
+
+def predict3(message, history, model):
+    messages = [{"role": "system", "content": "You are a helpful assistant."}]
+    for user_message, bot_message in history:
+        if user_message:
+            messages.append({"role": "user", "content": user_message})
+        if bot_message:
+            messages.append({"role": "assistant", "content": bot_message})
+    messages.append({"role": "user", "content": message})
+
+    response = ""
+    for chunk in llm2.create_chat_completion(
+        stream=True,
+        messages=messages,
+    ):
+        part = chunk["choices"][0]["delta"].get("content", None)
+        if part:
+            response += part
+            yield response
+
+
+
+# Define ChatInterfaces
+io1 = gr.ChatInterface(predict, title="4-bit")
+io2 = gr.ChatInterface(predict2, title="8-bit")  # Placeholder
+io3 = gr.ChatInterface(predict3, title="16-bit")
+io4 = gr.ChatInterface(predict2, title="32-bit")  # Placeholder
+
+# Dropdown and visibility mapping
+chat_interfaces = {"4-bit": io1, "8-bit": io2, "16-bit": io3, "32-bit": io4}
+
+# Define UI
+with gr.Blocks() as demo:
+    gr.Markdown("# Quantized Llama Comparison for Code Generation")
+
+    with gr.Tab("4-bit"):
+        io1.render()
+    with gr.Tab("8-bit"):
+        io2.render()
+    with gr.Tab("16-bit"):
+        io3.render()
+    with gr.Tab("32-bit"):
+        io4.render()
+
+
+
+demo.launch()
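Aside (not part of the commit): predict, predict2 and predict3 in the new app.py are identical except for which Llama instance they stream from, and predict3 currently also streams from llm2; the unused model parameter suggests the intent was to parameterize them, and since the ChatInterfaces are built without additional_inputs, Gradio will call these handlers with (message, history) only, so the extra required argument may raise a TypeError at chat time. A possible consolidation, sketched here as an assumption rather than the author's method, is a small factory that closes over the model:

def make_predict(model):
    # Build a streaming chat handler bound to one llama.cpp model instance.
    def predict(message, history):
        messages = [{"role": "system", "content": "You are a helpful assistant."}]
        for user_message, bot_message in history:
            if user_message:
                messages.append({"role": "user", "content": user_message})
            if bot_message:
                messages.append({"role": "assistant", "content": bot_message})
        messages.append({"role": "user", "content": message})

        response = ""
        for chunk in model.create_chat_completion(stream=True, messages=messages):
            part = chunk["choices"][0]["delta"].get("content", None)
            if part:
                response += part
                yield response
    return predict

# e.g. io1 = gr.ChatInterface(make_predict(llm), title="4-bit")
#      io3 = gr.ChatInterface(make_predict(llm2), title="16-bit")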
local.ipynb → debug.ipynb
RENAMED
@@ -2,40 +2,9 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
-     ]
-    }
-   ],
-   "source": [
-    "import transformers"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "c94c88bacc2c48cb8ce50e93d73e15eb",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "unsloth.Q4_K_M.gguf: 0%| | 0.00/808M [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
    {
     "name": "stderr",
     "output_type": "stream",
@@ -552,6 +521,8 @@
    ],
    "source": [
     "from llama_cpp import Llama\n",
+    "import gradio as gr\n",
+    "import time\n",
     "\n",
     "llm = Llama.from_pretrained(\n",
     "\trepo_id=\"Robzy/Llama-3.2-1B-Instruct-Finetuned-q4_k_m\",\n",
@@ -561,44 +532,45 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "
-      "llama_perf_context_print:
-      "llama_perf_context_print:
-      "llama_perf_context_print:
+      "Llama.generate: 35 prefix-match hit, remaining 1 prompt tokens to eval\n",
+      "llama_perf_context_print: load time = 406.81 ms\n",
+      "llama_perf_context_print: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second)\n",
+      "llama_perf_context_print: eval time = 0.00 ms / 31 runs ( 0.00 ms per token, inf tokens per second)\n",
+      "llama_perf_context_print: total time = 953.44 ms / 32 tokens\n"
      ]
     },
     {
-     "
-
-
-
-
-      " 'model': '/home/robert/.cache/huggingface/hub/models--Robzy--Llama-3.2-1B-Instruct-Finetuned-q4_k_m/snapshots/49dc2f37761bb04ce3513b70087676029ccd4f20/./unsloth.Q4_K_M.gguf',\n",
-      " 'choices': [{'index': 0,\n",
-      " 'message': {'role': 'assistant',\n",
-      " 'content': \"The tower is a prominent landmark in the capital of France, standing tall and proud in the heart of the city. It is a grandiose structure, with a sleek and modern design that reflects the country's rich history and architectural heritage. The tower is adorned with intricate details and ornate carvings, adding to its majestic appearance.\\n\\nThe tower is a marvel of engineering, with a sturdy foundation that allows it to stand tall for centuries. Its height is impressive, with a grand staircase that winds its way up to the top of the tower. The staircase is lined with elegant railings, providing a comfortable and safe path for visitors to ascend.\\n\\nThe tower is also home to a museum, showcasing a vast collection of art and artifacts from French history. The museum is a treasure trove of knowledge, with exhibits on everything from the Renaissance to the modern era. Visitors can explore the exhibits, learning about the country's rich cultural heritage.\\n\\nThe tower is a popular destination for tourists and locals alike, offering a unique and unforgettable experience. Visitors can take a guided tour of the tower, learning about its history and significance. The tower is also a popular spot for weddings and other special events, making it a beloved landmark in the city.\\n\\nOverall, the tower is a stunning and iconic landmark that reflects the best of French culture and architecture. Its grandeur and beauty make it a must-visit destination for anyone traveling to the capital of France.\"},\n",
-      " 'logprobs': None,\n",
-      " 'finish_reason': 'stop'}],\n",
-      " 'usage': {'prompt_tokens': 45, 'completion_tokens': 288, 'total_tokens': 333}}"
-     ]
-    },
-    "execution_count": 20,
-    "metadata": {},
-    "output_type": "execute_result"
    }
   ],
   "source": [
-   "
-   "
-   "
-   "
+   "t0 = time.time()\n",
+   "res = llm.create_chat_completion(messages = [{\"role\": \"system\", \"content\": \"You are a helpful assistant.\"}])\n",
+   "t1 = time.time()\n",
+   "\n",
+   "num_response_tokens = int(res['usage']['completion_tokens'])\n",
+   "tokens_per_second = num_response_tokens / (t1 - t0)\n",
+   "print(f\"Tokens per second: {tokens_per_second}\")"
   ]
  }
 ],
local-requirements.txt → finetuning-requirements.txt
RENAMED
File without changes