Robzy committed
Commit fdae8b1 · 1 Parent(s): 56659bb

multi chatbot

app-old.py ADDED
@@ -0,0 +1,78 @@
+ from llama_cpp import Llama
+ import gradio as gr
+
+ llm = Llama.from_pretrained(
+     repo_id="Robzy/Llama-3.2-1B-Instruct-Finetuned-q4_k_m",
+     filename="unsloth.Q4_K_M.gguf",
+ )
+
+ llm2 = Llama.from_pretrained(
+     repo_id="Robzy/Llama-3.2-1B-Instruct-Finetuned-16bit",
+     filename="unsloth.F16.gguf",
+ )
+
+ def predict(message, history):
+     messages = [{"role": "system", "content": "You are a helpful assistant."}]
+     for user_message, bot_message in history:
+         if user_message:
+             messages.append({"role": "user", "content": user_message})
+         if bot_message:
+             messages.append({"role": "assistant", "content": bot_message})
+     messages.append({"role": "user", "content": message})
+
+     response = ""
+     for chunk in llm.create_chat_completion(
+         stream=True,
+         messages=messages,
+     ):
+         part = chunk["choices"][0]["delta"].get("content", None)
+         if part:
+             response += part
+             yield response
+
+
+ def predict2(message, history):
+     messages = [{"role": "system", "content": "You are a helpful assistant."}]
+     for user_message, bot_message in history:
+         if user_message:
+             messages.append({"role": "user", "content": user_message})
+         if bot_message:
+             messages.append({"role": "assistant", "content": bot_message})
+     messages.append({"role": "user", "content": message})
+
+     response = ""
+     for chunk in llm2.create_chat_completion(
+         stream=True,
+         messages=messages,
+     ):
+         part = chunk["choices"][0]["delta"].get("content", None)
+         if part:
+             response += part
+             yield response
+
+
+ chat1 = gr.ChatInterface(predict, title="4-bit")
+ chat2 = gr.ChatInterface(predict2, title="16-bit")
+ chat3 = gr.ChatInterface(predict2, title="16-bit")
+
+ def update_chat(value):
+     if value == "4-bit":
+         chat1.render(visible=True)
+         chat2.render(visible=False)
+         chat3.render(visible=False)
+     elif value == "16-bit":
+         chat1.render(visible=False)
+         chat2.render(visible=True)
+         chat3.render(visible=False)
+     else:
+         chat1.render(visible=False)
+         chat2.render(visible=False)
+         chat3.render(visible=True)
+
+ with gr.Blocks() as demo:
+
+     gr.Markdown("# Quantized Llama Comparison for Code Generation")
+     dropdown = gr.Dropdown(["4-bit", "16-bit", "32-bit"], label="Choose model version", value="4-bit")
+     dropdown.change(fn=update_chat, inputs=dropdown, outputs=[chat1, chat2, chat3])
+
+ demo.launch()
app.py CHANGED
@@ -1,12 +1,19 @@
- from llama_cpp import Llama
  import gradio as gr
+ from llama_cpp import Llama

+ # Load models
  llm = Llama.from_pretrained(
-     repo_id="Robzy/Llama-3.2-1B-Instruct-Finetuned-q4_k_m",
-     filename="unsloth.Q4_K_M.gguf",
+     repo_id="Robzy/Llama-3.2-1B-Instruct-Finetuned-q4_k_m",
+     filename="unsloth.Q4_K_M.gguf",
+ )
+
+ llm2 = Llama.from_pretrained(
+     repo_id="Robzy/Llama-3.2-1B-Instruct-Finetuned-16bit",
+     filename="unsloth.F16.gguf",
  )

- def predict(message, history):
+ # Define prediction functions
+ def predict(message, history, model):
      messages = [{"role": "system", "content": "You are a helpful assistant."}]
      for user_message, bot_message in history:
          if user_message:
@@ -25,7 +32,69 @@ def predict(message, history):
              response += part
              yield response

- demo = gr.ChatInterface(predict)

- if __name__ == "__main__":
-     demo.launch()
+ def predict2(message, history, model):
+     messages = [{"role": "system", "content": "You are a helpful assistant."}]
+     for user_message, bot_message in history:
+         if user_message:
+             messages.append({"role": "user", "content": user_message})
+         if bot_message:
+             messages.append({"role": "assistant", "content": bot_message})
+     messages.append({"role": "user", "content": message})
+
+     response = ""
+     for chunk in llm2.create_chat_completion(
+         stream=True,
+         messages=messages,
+     ):
+         part = chunk["choices"][0]["delta"].get("content", None)
+         if part:
+             response += part
+             yield response
+
+ def predict3(message, history, model):
+     messages = [{"role": "system", "content": "You are a helpful assistant."}]
+     for user_message, bot_message in history:
+         if user_message:
+             messages.append({"role": "user", "content": user_message})
+         if bot_message:
+             messages.append({"role": "assistant", "content": bot_message})
+     messages.append({"role": "user", "content": message})
+
+     response = ""
+     for chunk in llm2.create_chat_completion(
+         stream=True,
+         messages=messages,
+     ):
+         part = chunk["choices"][0]["delta"].get("content", None)
+         if part:
+             response += part
+             yield response
+
+
+
+ # Define ChatInterfaces
+ io1 = gr.ChatInterface(predict, title="4-bit")
+ io2 = gr.ChatInterface(predict2, title="8-bit")  # Placeholder
+ io3 = gr.ChatInterface(predict3, title="16-bit")
+ io4 = gr.ChatInterface(predict2, title="32-bit")  # Placeholder
+
+ # Dropdown and visibility mapping
+ chat_interfaces = {"4-bit": io1, "8-bit": io2, "16-bit": io3, "32-bit": io4}
+
+ # Define UI
+ with gr.Blocks() as demo:
+     gr.Markdown("# Quantized Llama Comparison for Code Generation")
+
+     with gr.Tab("4-bit"):
+         io1.render()
+     with gr.Tab("8-bit"):
+         io2.render()
+     with gr.Tab("16-bit"):
+         io3.render()
+     with gr.Tab("32-bit"):
+         io4.render()
+
+
+
+ demo.launch()
local.ipynb → debug.ipynb RENAMED
@@ -2,40 +2,9 @@
  "cells": [
  {
  "cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
  "metadata": {},
  "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
- ]
- }
- ],
- "source": [
- "import transformers"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "c94c88bacc2c48cb8ce50e93d73e15eb",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "unsloth.Q4_K_M.gguf: 0%| | 0.00/808M [00:00<?, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
  {
  "name": "stderr",
  "output_type": "stream",
@@ -552,6 +521,8 @@
  ],
  "source": [
  "from llama_cpp import Llama\n",
+ "import gradio as gr\n",
+ "import time\n",
  "\n",
  "llm = Llama.from_pretrained(\n",
  "\trepo_id=\"Robzy/Llama-3.2-1B-Instruct-Finetuned-q4_k_m\",\n",
@@ -561,44 +532,45 @@
  },
  {
  "cell_type": "code",
- "execution_count": 20,
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import time"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
  "metadata": {},
  "outputs": [
  {
  "name": "stderr",
  "output_type": "stream",
  "text": [
- "llama_perf_context_print: load time = 414.62 ms\n",
- "llama_perf_context_print: prompt eval time = 0.00 ms / 45 tokens ( 0.00 ms per token, inf tokens per second)\n",
- "llama_perf_context_print: eval time = 0.00 ms / 288 runs ( 0.00 ms per token, inf tokens per second)\n",
- "llama_perf_context_print: total time = 8736.94 ms / 333 tokens\n"
+ "Llama.generate: 35 prefix-match hit, remaining 1 prompt tokens to eval\n",
+ "llama_perf_context_print: load time = 406.81 ms\n",
+ "llama_perf_context_print: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second)\n",
+ "llama_perf_context_print: eval time = 0.00 ms / 31 runs ( 0.00 ms per token, inf tokens per second)\n",
+ "llama_perf_context_print: total time = 953.44 ms / 32 tokens\n"
  ]
  },
  {
- "data": {
- "text/plain": [
- "{'id': 'chatcmpl-7b3f051b-3008-4c34-afb4-da527e07904c',\n",
- " 'object': 'chat.completion',\n",
- " 'created': 1733027296,\n",
- " 'model': '/home/robert/.cache/huggingface/hub/models--Robzy--Llama-3.2-1B-Instruct-Finetuned-q4_k_m/snapshots/49dc2f37761bb04ce3513b70087676029ccd4f20/./unsloth.Q4_K_M.gguf',\n",
- " 'choices': [{'index': 0,\n",
- " 'message': {'role': 'assistant',\n",
- " 'content': \"The tower is a prominent landmark in the capital of France, standing tall and proud in the heart of the city. It is a grandiose structure, with a sleek and modern design that reflects the country's rich history and architectural heritage. The tower is adorned with intricate details and ornate carvings, adding to its majestic appearance.\\n\\nThe tower is a marvel of engineering, with a sturdy foundation that allows it to stand tall for centuries. Its height is impressive, with a grand staircase that winds its way up to the top of the tower. The staircase is lined with elegant railings, providing a comfortable and safe path for visitors to ascend.\\n\\nThe tower is also home to a museum, showcasing a vast collection of art and artifacts from French history. The museum is a treasure trove of knowledge, with exhibits on everything from the Renaissance to the modern era. Visitors can explore the exhibits, learning about the country's rich cultural heritage.\\n\\nThe tower is a popular destination for tourists and locals alike, offering a unique and unforgettable experience. Visitors can take a guided tour of the tower, learning about its history and significance. The tower is also a popular spot for weddings and other special events, making it a beloved landmark in the city.\\n\\nOverall, the tower is a stunning and iconic landmark that reflects the best of French culture and architecture. Its grandeur and beauty make it a must-visit destination for anyone traveling to the capital of France.\"},\n",
- " 'logprobs': None,\n",
- " 'finish_reason': 'stop'}],\n",
- " 'usage': {'prompt_tokens': 45, 'completion_tokens': 288, 'total_tokens': 333}}"
- ]
- },
- "execution_count": 20,
- "metadata": {},
- "output_type": "execute_result"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Tokens per second: 31.380398024839145\n"
+ ]
  }
  ],
  "source": [
- "messages = [\n",
- " {\"role\": \"user\", \"content\": \"Describe a tall tower in the capital of France.\"},\n",
- "]\n",
- "llm.create_chat_completion(messages)"
+ "t0 = time.time()\n",
+ "res = llm.create_chat_completion(messages = [{\"role\": \"system\", \"content\": \"You are a helpful assistant.\"}])\n",
+ "t1 = time.time()\n",
+ "\n",
+ "num_response_tokens = int(res['usage']['completion_tokens'])\n",
+ "tokens_per_second = num_response_tokens / (t1 - t0)\n",
+ "print(f\"Tokens per second: {tokens_per_second}\")"
  ]
  }
  ],
local-requirements.txt → finetuning-requirements.txt RENAMED
File without changes