kwabs22 committed
Commit 4213f50 · 1 Parent(s): 9c1188f

Testing Stable LM 2 1.6B Zephyr - llama-cpp wheel issue

Files changed (3):
  1. Dockerfile +16 -5
  2. app.py +17 -2
  3. requirements.txt +0 -1
Dockerfile CHANGED
@@ -4,11 +4,22 @@ FROM python:3.8-slim
 # Set the working directory in the container
 WORKDIR /usr/src/app
 
-# Install wget
-RUN apt-get update && apt-get install -y wget
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    wget \
+    git \
+    build-essential
 
-# Download the model file
-RUN wget -O stablelm-2-zephyr-1_6b-Q4_0.gguf https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b/resolve/main/stablelm-2-zephyr-1_6b-Q4_0.gguf?download=true
+# Clone llama.cpp
+RUN git clone https://github.com/ggerganov/llama.cpp.git
+
+# Download the model
+RUN cd llama.cpp/models && \
+    wget -O stablelm-2-zephyr-1_6b-Q4_0.gguf https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b/resolve/main/stablelm-2-zephyr-1_6b-Q4_0.gguf?download=true
+
+# Build llama.cpp
+RUN cd llama.cpp && \
+    make -j
 
 # Copy the current directory contents into the container at /usr/src/app
 COPY . .
@@ -20,4 +31,4 @@ RUN pip install --no-cache-dir -r requirements.txt
 EXPOSE 7860
 
 # Run app.py when the container launches
-CMD ["python", "./app.py"]
+CMD ["python", "./app.py"]
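The rebuilt image now has to provide both the compiled binary and the downloaded weights at the paths app.py expects. A minimal startup sanity check, assuming the layout produced by the Dockerfile above (./llama.cpp/main and llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf), could look like this sketch; it is illustrative, not part of the commit:

    import os

    # Sketch only: fail fast if the artifacts the Dockerfile is expected to
    # produce are missing. Paths are taken from the diff above.
    REQUIRED_PATHS = [
        "./llama.cpp/main",
        "llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
    ]

    for path in REQUIRED_PATHS:
        if not os.path.exists(path):
            raise FileNotFoundError(f"Expected build artifact missing: {path}")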
app.py CHANGED
@@ -1,10 +1,12 @@
 import gradio as gr
-from llama_cpp import Llama
+#from llama_cpp import Llama
 import random
+import subprocess
 
 # Initialize model
-llm = Llama(model_path="/stablelm-2-zephyr-1_6b-Q4_0.gguf", n_gpu_layers=0, seed=random.randint(1, 2**31))
+#llm = Llama(model_path="/stablelm-2-zephyr-1_6b-Q4_0.gguf", n_gpu_layers=0, seed=random.randint(1, 2**31))
 
+"""
 def generate_response(user_message):
     encodeduserm = b"### Human: " + user_message.encode('utf-8') + b"\n### Assistant:"
     tokens = llm.tokenize(encodeduserm)
@@ -18,6 +20,19 @@ def generate_response(user_message):
         if count >= 500 or (token == llm.token_eos()):
             break
     return output.decode()
+"""
+
+def generate_response(user_message):
+    cmd = [
+        "./llama.cpp/main",  # Path to the executable
+        "-m", "llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
+        "-p", user_message,
+        "-n", "400",
+        "-e"
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    return result.stdout
+
 
 iface = gr.Interface(
     fn=generate_response,
requirements.txt CHANGED
@@ -1,2 +1 @@
-llama-cpp-python
 gradio
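One caveat with the subprocess approach in app.py: llama.cpp's main binary typically echoes the prompt before the completion, so result.stdout will usually begin with the user message. A minimal post-processing sketch, where strip_prompt is a hypothetical helper and not part of this commit:

    def strip_prompt(stdout: str, prompt: str) -> str:
        # Sketch: drop the echoed prompt, if present, from llama.cpp's output.
        out = stdout.strip()
        if out.startswith(prompt):
            out = out[len(prompt):]
        return out.lstrip()

    # Illustrative usage inside generate_response:
    #     return strip_prompt(result.stdout, user_message)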