toaster61 committed on
Commit
e3396ba
1 Parent(s): 559ea97
Files changed (5)
  1. Dockerfile +18 -17
  2. __pycache__/app.cpython-311.pyc +0 -0
  3. app.py +15 -6
  4. run-docker.sh +5 -0
  5. system.prompt +1 -0
Dockerfile CHANGED
@@ -1,27 +1,28 @@
- FROM python:3.11.1-bullseye
+ # Loading the base image. I'm using Debian; you can use whatever you want.
+ FROM python:3.11.5-slim-bookworm

+ # Just to be sure everything will be fine.
  USER root

- ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
-
- RUN apt install g++ -y
- RUN git clone https://github.com/ggerganov/llama.cpp.git
- RUN cd llama.cpp
- RUN make
- RUN wget https://huggingface.co/OpenBuddy/openbuddy-ggml/resolve/main/openbuddy-openllama-3b-v10-q5_0.bin
-
- COPY . ./
-
- RUN chmod -R 777 ./
-
- WORKDIR ./
-
+ # Installing the gcc compiler and the main library.
+ RUN apt update && apt install gcc cmake build-essential -y
+ RUN CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-python==0.1.78
+
+ # Installing wget and downloading the model.
+ RUN apt install wget -y
+ RUN wget -O model.bin https://huggingface.co/OpenBuddy/openbuddy-ggml/resolve/main/openbuddy-openllama-3b-v10-q5_0.bin
+ # You can use other models! Visit https://huggingface.co/OpenBuddy/openbuddy-ggml and choose the model you like!
+ # Or you can comment out these two RUNs and include your own model named "model.bin" in the Space/repo/Docker image.
+
+ # Copying files into a folder and making it the working dir.
+ RUN mkdir app
+ COPY . /app
+ RUN chmod -R 777 /app
+ WORKDIR /app
+
+ # Updating pip and installing everything from requirements.
  RUN python3 -m pip install -U --no-cache-dir pip setuptools wheel
-
  RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt

- RUN mkdir -p /.cache/huggingface/hub
- RUN chown -R root:root /.cache/huggingface/hub
- RUN chmod -R 777 /.cache/huggingface/hub
-
+ # Now it's time to run the Quart app using uvicorn! (It's faster, trust me.)
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
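The "How to GPU?" note on the app's landing page (see app.py below) swaps the OpenBLAS flags for cuBLAS. As a minimal sketch, the command inside that RUN line would become something like the following, assuming the CUDA toolkit is actually available in the build environment (it is not in python:3.11.5-slim-bookworm):

    # Sketch: build llama-cpp-python against cuBLAS instead of OpenBLAS.
    # Assumes the CUDA toolkit is present in the image or build host.
    CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.1.78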
__pycache__/app.cpython-311.pyc ADDED
Binary file (2.8 kB).
app.py CHANGED
@@ -2,6 +2,7 @@ from quart import Quart, request
  from llama_cpp import Llama

  app = Quart(__name__)
+ llm = Llama(model_path="./model.bin")

  with open('system.prompt', 'r', encoding='utf-8') as f:
      prompt = f.read()
@@ -10,18 +11,26 @@ with open('system.prompt', 'r', encoding='utf-8') as f:
  async def echo():
      try:
          data = await request.get_json()
-         if data.get("max_tokens") != None and data.get("max_tokens") > 500: data['max_tokens'] = 500
+         maxTokens = data.get("max_tokens", 64)
          userPrompt = prompt + "\n\nUser: " + data['request'] + "\nAssistant: "
      except: return {"error": "Not enough data"}, 400
-     return {"output": output}
+     try:
+         output = llm(userPrompt, max_tokens=maxTokens, stop=["User:", "\n"], echo=False)
+         return {"output": output["choices"][0]["text"]}
+     except Exception as e:
+         print(e)
+         return {"error": "Server error"}, 500

  @app.get("/")
  async def get():
-     return '''<h1>Hello, world!</h1>
+     return '''<style>a:visited{color:black;}</style>
+     <h1>Hello, world!</h1>
      This is a showcase of how to make your own server with OpenBuddy's model.<br>
      I'm using the 3B model here just as an example. Also, it runs on CPU power only.<br>
      But you can use GPU power as well!<br>
-     <br>
      <h1>How to GPU?</h1>
-
-     '''
+     Change <code>CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"</code> in the Dockerfile to <code>CMAKE_ARGS="-DLLAMA_CUBLAS=on"</code>. You can also try <code>-DLLAMA_CLBLAST</code> or <code>-DLLAMA_METAL</code>.<br>
+     Powered by <a href="https://github.com/abetlen/llama-cpp-python">llama-cpp-python</a>, <a href="https://quart.palletsprojects.com/">Quart</a> and <a href="https://www.uvicorn.org/">Uvicorn</a>.<br>
+     <h1>How to test it on your own machine?</h1>
+     You can install Docker, build the image and run it. I made <code>run-docker.sh</code> for you. To stop the container, run <code>docker ps</code>, find the container's name and run <code>docker stop _dockerContainerName_</code>.<br>
+     Or you can follow the steps in the Dockerfile once and try it on your machine, not in Docker.<br>'''
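For reference, the echo handler above can be exercised with a request like the sketch below. The POST route's decorator sits above the visible hunk, so the /echo path here is only a placeholder assumption; check the actual decorator in app.py. The "request" and "max_tokens" fields match what the handler reads from the JSON body.

    # Placeholder path: substitute the real route from app.py's decorator.
    curl -X POST http://localhost:7860/echo \
      -H "Content-Type: application/json" \
      -d '{"request": "Hello!", "max_tokens": 64}'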
run-docker.sh ADDED
@@ -0,0 +1,5 @@
+ # This is an SH file for running the Dockerfile.
+ # Use it for tests. And install Docker before you run it!
+
+ docker build -t llama-server .
+ docker run -dp 0.0.0.0:7860:7860 llama-server
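As the landing page text notes, stopping the container afterwards comes down to roughly this (the container name differs per run):

    # Find the running container's name, then stop it.
    docker ps
    docker stop _dockerContainerName_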
system.prompt CHANGED
@@ -0,0 +1 @@
+ Prompt: Answer as concisely and to the point as possible.