Robzy committed
Commit fdae8b1 · 1 Parent(s): 56659bb

multi chatbot

app-old.py ADDED
@@ -0,0 +1,78 @@
+ from llama_cpp import Llama
+ import gradio as gr
+
+ llm = Llama.from_pretrained(
+     repo_id="Robzy/Llama-3.2-1B-Instruct-Finetuned-q4_k_m",
+     filename="unsloth.Q4_K_M.gguf",
+ )
+
+ llm2 = Llama.from_pretrained(
+     repo_id="Robzy/Llama-3.2-1B-Instruct-Finetuned-16bit",
+     filename="unsloth.F16.gguf",
+ )
+
+ def predict(message, history):
+     messages = [{"role": "system", "content": "You are a helpful assistant."}]
+     for user_message, bot_message in history:
+         if user_message:
+             messages.append({"role": "user", "content": user_message})
+         if bot_message:
+             messages.append({"role": "assistant", "content": bot_message})
+     messages.append({"role": "user", "content": message})
+
+     response = ""
+     for chunk in llm.create_chat_completion(
+         stream=True,
+         messages=messages,
+     ):
+         part = chunk["choices"][0]["delta"].get("content", None)
+         if part:
+             response += part
+             yield response
+
+
+ def predict2(message, history):
+     messages = [{"role": "system", "content": "You are a helpful assistant."}]
+     for user_message, bot_message in history:
+         if user_message:
+             messages.append({"role": "user", "content": user_message})
+         if bot_message:
+             messages.append({"role": "assistant", "content": bot_message})
+     messages.append({"role": "user", "content": message})
+
+     response = ""
+     for chunk in llm2.create_chat_completion(
+         stream=True,
+         messages=messages,
+     ):
+         part = chunk["choices"][0]["delta"].get("content", None)
+         if part:
+             response += part
+             yield response
+
+
+ chat1 = gr.ChatInterface(predict, title="4-bit")
+ chat2 = gr.ChatInterface(predict2, title="16-bit")
+ chat3 = gr.ChatInterface(predict2, title="16-bit")
+
+ def update_chat(value):
+     if value == "4-bit":
+         chat1.render(visible=True)
+         chat2.render(visible=False)
+         chat3.render(visible=False)
+     elif value == "16-bit":
+         chat1.render(visible=False)
+         chat2.render(visible=True)
+         chat3.render(visible=False)
+     else:
+         chat1.render(visible=False)
+         chat2.render(visible=False)
+         chat3.render(visible=True)
+
+ with gr.Blocks() as demo:
+
+     gr.Markdown("# Quantized Llama Comparison for Code Generation")
+     dropdown = gr.Dropdown(["4-bit", "16-bit", "32-bit"], label="Choose model version", value="4-bit")
+     dropdown.change(fn=update_chat, inputs=dropdown, outputs=[chat1, chat2, chat3])
+
+ demo.launch()
app.py CHANGED
@@ -1,12 +1,19 @@
- from llama_cpp import Llama
  import gradio as gr
+ from llama_cpp import Llama

+ # Load models
  llm = Llama.from_pretrained(
-     repo_id="Robzy/Llama-3.2-1B-Instruct-Finetuned-q4_k_m",
-     filename="unsloth.Q4_K_M.gguf",
+     repo_id="Robzy/Llama-3.2-1B-Instruct-Finetuned-q4_k_m",
+     filename="unsloth.Q4_K_M.gguf",
+ )
+
+ llm2 = Llama.from_pretrained(
+     repo_id="Robzy/Llama-3.2-1B-Instruct-Finetuned-16bit",
+     filename="unsloth.F16.gguf",
  )

- def predict(message, history):
+ # Define prediction functions
+ def predict(message, history, model):
      messages = [{"role": "system", "content": "You are a helpful assistant."}]
      for user_message, bot_message in history:
          if user_message:
@@ -25,7 +32,69 @@ def predict(message, history):
              response += part
              yield response

- demo = gr.ChatInterface(predict)

- if __name__ == "__main__":
-     demo.launch()
+ def predict2(message, history, model):
+     messages = [{"role": "system", "content": "You are a helpful assistant."}]
+     for user_message, bot_message in history:
+         if user_message:
+             messages.append({"role": "user", "content": user_message})
+         if bot_message:
+             messages.append({"role": "assistant", "content": bot_message})
+     messages.append({"role": "user", "content": message})
+
+     response = ""
+     for chunk in llm2.create_chat_completion(
+         stream=True,
+         messages=messages,
+     ):
+         part = chunk["choices"][0]["delta"].get("content", None)
+         if part:
+             response += part
+             yield response
+
+ def predict3(message, history, model):
+     messages = [{"role": "system", "content": "You are a helpful assistant."}]
+     for user_message, bot_message in history:
+         if user_message:
+             messages.append({"role": "user", "content": user_message})
+         if bot_message:
+             messages.append({"role": "assistant", "content": bot_message})
+     messages.append({"role": "user", "content": message})
+
+     response = ""
+     for chunk in llm2.create_chat_completion(
+         stream=True,
+         messages=messages,
+     ):
+         part = chunk["choices"][0]["delta"].get("content", None)
+         if part:
+             response += part
+             yield response
+
+
+
+ # Define ChatInterfaces
+ io1 = gr.ChatInterface(predict, title="4-bit")
+ io2 = gr.ChatInterface(predict2, title="8-bit")  # Placeholder
+ io3 = gr.ChatInterface(predict3, title="16-bit")
+ io4 = gr.ChatInterface(predict2, title="32-bit")  # Placeholder
+
+ # Dropdown and visibility mapping
+ chat_interfaces = {"4-bit": io1, "8-bit": io2, "16-bit": io3, "32-bit": io4}
+
+ # Define UI
+ with gr.Blocks() as demo:
+     gr.Markdown("# Quantized Llama Comparison for Code Generation")
+
+     with gr.Tab("4-bit"):
+         io1.render()
+     with gr.Tab("8-bit"):
+         io2.render()
+     with gr.Tab("16-bit"):
+         io3.render()
+     with gr.Tab("32-bit"):
+         io4.render()
+
+
+
+ demo.launch()
local.ipynb → debug.ipynb RENAMED
@@ -2,40 +2,9 @@
  "cells": [
  {
  "cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
  "metadata": {},
  "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
- ]
- }
- ],
- "source": [
- "import transformers"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "c94c88bacc2c48cb8ce50e93d73e15eb",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "unsloth.Q4_K_M.gguf: 0%| | 0.00/808M [00:00<?, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
  {
  "name": "stderr",
  "output_type": "stream",
@@ -552,6 +521,8 @@
  ],
  "source": [
  "from llama_cpp import Llama\n",
+ "import gradio as gr\n",
+ "import time\n",
  "\n",
  "llm = Llama.from_pretrained(\n",
  "\trepo_id=\"Robzy/Llama-3.2-1B-Instruct-Finetuned-q4_k_m\",\n",
@@ -561,44 +532,45 @@
  },
  {
  "cell_type": "code",
- "execution_count": 20,
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import time"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
  "metadata": {},
  "outputs": [
  {
  "name": "stderr",
  "output_type": "stream",
  "text": [
- "llama_perf_context_print: load time = 414.62 ms\n",
- "llama_perf_context_print: prompt eval time = 0.00 ms / 45 tokens ( 0.00 ms per token, inf tokens per second)\n",
- "llama_perf_context_print: eval time = 0.00 ms / 288 runs ( 0.00 ms per token, inf tokens per second)\n",
- "llama_perf_context_print: total time = 8736.94 ms / 333 tokens\n"
+ "Llama.generate: 35 prefix-match hit, remaining 1 prompt tokens to eval\n",
+ "llama_perf_context_print: load time = 406.81 ms\n",
+ "llama_perf_context_print: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second)\n",
+ "llama_perf_context_print: eval time = 0.00 ms / 31 runs ( 0.00 ms per token, inf tokens per second)\n",
+ "llama_perf_context_print: total time = 953.44 ms / 32 tokens\n"
  ]
  },
  {
- "data": {
- "text/plain": [
- "{'id': 'chatcmpl-7b3f051b-3008-4c34-afb4-da527e07904c',\n",
- " 'object': 'chat.completion',\n",
- " 'created': 1733027296,\n",
- " 'model': '/home/robert/.cache/huggingface/hub/models--Robzy--Llama-3.2-1B-Instruct-Finetuned-q4_k_m/snapshots/49dc2f37761bb04ce3513b70087676029ccd4f20/./unsloth.Q4_K_M.gguf',\n",
- " 'choices': [{'index': 0,\n",
- " 'message': {'role': 'assistant',\n",
- " 'content': \"The tower is a prominent landmark in the capital of France, standing tall and proud in the heart of the city. It is a grandiose structure, with a sleek and modern design that reflects the country's rich history and architectural heritage. The tower is adorned with intricate details and ornate carvings, adding to its majestic appearance.\\n\\nThe tower is a marvel of engineering, with a sturdy foundation that allows it to stand tall for centuries. Its height is impressive, with a grand staircase that winds its way up to the top of the tower. The staircase is lined with elegant railings, providing a comfortable and safe path for visitors to ascend.\\n\\nThe tower is also home to a museum, showcasing a vast collection of art and artifacts from French history. The museum is a treasure trove of knowledge, with exhibits on everything from the Renaissance to the modern era. Visitors can explore the exhibits, learning about the country's rich cultural heritage.\\n\\nThe tower is a popular destination for tourists and locals alike, offering a unique and unforgettable experience. Visitors can take a guided tour of the tower, learning about its history and significance. The tower is also a popular spot for weddings and other special events, making it a beloved landmark in the city.\\n\\nOverall, the tower is a stunning and iconic landmark that reflects the best of French culture and architecture. Its grandeur and beauty make it a must-visit destination for anyone traveling to the capital of France.\"},\n",
- " 'logprobs': None,\n",
- " 'finish_reason': 'stop'}],\n",
- " 'usage': {'prompt_tokens': 45, 'completion_tokens': 288, 'total_tokens': 333}}"
- ]
- },
- "execution_count": 20,
- "metadata": {},
- "output_type": "execute_result"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Tokens per second: 31.380398024839145\n"
+ ]
  }
  ],
  "source": [
- "messages = [\n",
- " {\"role\": \"user\", \"content\": \"Describe a tall tower in the capital of France.\"},\n",
- "]\n",
- "llm.create_chat_completion(messages)"
+ "t0 = time.time()\n",
+ "res = llm.create_chat_completion(messages = [{\"role\": \"system\", \"content\": \"You are a helpful assistant.\"}])\n",
+ "t1 = time.time()\n",
+ "\n",
+ "num_response_tokens = int(res['usage']['completion_tokens'])\n",
+ "tokens_per_second = num_response_tokens / (t1 - t0)\n",
+ "print(f\"Tokens per second: {tokens_per_second}\")"
  ]
  }
  ],
local-requirements.txt → finetuning-requirements.txt RENAMED
File without changes