kwabs22 committed
Commit 4213f50 · 1 Parent(s): 9c1188f

Testing Stable LM 2 1.6B Zephyr - llama-cpp wheel issue

Files changed (3):
  1. Dockerfile +16 -5
  2. app.py +17 -2
  3. requirements.txt +0 -1
Dockerfile CHANGED
@@ -4,11 +4,22 @@ FROM python:3.8-slim
 # Set the working directory in the container
 WORKDIR /usr/src/app
 
-# Install wget
-RUN apt-get update && apt-get install -y wget
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    wget \
+    git \
+    build-essential
 
-# Download the model file
-RUN wget -O stablelm-2-zephyr-1_6b-Q4_0.gguf https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b/resolve/main/stablelm-2-zephyr-1_6b-Q4_0.gguf?download=true
+# Clone llama.cpp
+RUN git clone https://github.com/ggerganov/llama.cpp.git
+
+# Download the model
+RUN cd llama.cpp/models && \
+    wget -O stablelm-2-zephyr-1_6b-Q4_0.gguf https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b/resolve/main/stablelm-2-zephyr-1_6b-Q4_0.gguf?download=true
+
+# Build llama.cpp
+RUN cd llama.cpp && \
+    make -j
 
 # Copy the current directory contents into the container at /usr/src/app
 COPY . .
@@ -20,4 +31,4 @@ RUN pip install --no-cache-dir -r requirements.txt
 EXPOSE 7860
 
 # Run app.py when the container launches
-CMD ["python", "./app.py"]
+CMD ["python", "./app.py"]
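The rebuilt image now has to provide both the compiled binary and the downloaded weights at the paths app.py expects. A minimal startup sanity check, assuming the layout produced by the Dockerfile above (./llama.cpp/main and llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf), could look like this sketch; it is illustrative, not part of the commit:

    import os

    # Sketch only: fail fast if the artifacts the Dockerfile is expected to
    # produce are missing. Paths are taken from the diff above.
    REQUIRED_PATHS = [
        "./llama.cpp/main",
        "llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
    ]

    for path in REQUIRED_PATHS:
        if not os.path.exists(path):
            raise FileNotFoundError(f"Expected build artifact missing: {path}")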
app.py CHANGED
@@ -1,10 +1,12 @@
 import gradio as gr
-from llama_cpp import Llama
+#from llama_cpp import Llama
 import random
+import subprocess
 
 # Initialize model
-llm = Llama(model_path="/stablelm-2-zephyr-1_6b-Q4_0.gguf", n_gpu_layers=0, seed=random.randint(1, 2**31))
+#llm = Llama(model_path="/stablelm-2-zephyr-1_6b-Q4_0.gguf", n_gpu_layers=0, seed=random.randint(1, 2**31))
 
+"""
 def generate_response(user_message):
     encodeduserm = b"### Human: " + user_message.encode('utf-8') + b"\n### Assistant:"
     tokens = llm.tokenize(encodeduserm)
@@ -18,6 +20,19 @@ def generate_response(user_message):
         if count >= 500 or (token == llm.token_eos()):
             break
     return output.decode()
+"""
+
+def generate_response(user_message):
+    cmd = [
+        "./llama.cpp/main",  # Path to the executable
+        "-m", "llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
+        "-p", user_message,
+        "-n", "400",
+        "-e"
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    return result.stdout
+
 
 iface = gr.Interface(
     fn=generate_response,
requirements.txt CHANGED
@@ -1,2 +1 @@
-llama-cpp-python
 gradio
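One caveat with the subprocess approach in app.py: llama.cpp's main binary typically echoes the prompt before the completion, so result.stdout will usually begin with the user message. A minimal post-processing sketch, where strip_prompt is a hypothetical helper and not part of this commit:

    def strip_prompt(stdout: str, prompt: str) -> str:
        # Sketch: drop the echoed prompt, if present, from llama.cpp's output.
        out = stdout.strip()
        if out.startswith(prompt):
            out = out[len(prompt):]
        return out.lstrip()

    # Illustrative usage inside generate_response:
    #     return strip_prompt(result.stdout, user_message)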