Baraaqasem committed
Commit 651d019
1 parent: 6c0a273

Upload 711 files

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
.dockerignore ADDED
@@ -0,0 +1,17 @@
+ .idea
+ .github
+ .vscode
+ .devcontainer
+ models
+ examples/chatbot-ui/models
+ examples/rwkv/models
+ examples/**/models
+ Dockerfile*
+ __pycache__
+
+ # SonarQube
+ .scannerwork
+
+ # backend virtual environments
+ **/venv
+ backend/python/**/source
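The entries above keep local model weights, Python virtual environments, and editor metadata out of the Docker build context. A minimal sketch of the effect (the image tag here is arbitrary and only for illustration):

```bash
# Build from the repository root: everything matched by .dockerignore
# (models/, **/venv, .idea, ...) is excluded from the context sent to the daemon.
docker build -t local-ai -f Dockerfile .
```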
.editorconfig ADDED
@@ -0,0 +1,31 @@
+
+ root = true
+
+ [*]
+ indent_style = space
+ indent_size = 2
+ end_of_line = lf
+ charset = utf-8
+ trim_trailing_whitespace = true
+ insert_final_newline = true
+
+ [*.go]
+ indent_style = tab
+
+ [Makefile]
+ indent_style = tab
+
+ [*.proto]
+ indent_size = 2
+
+ [*.py]
+ indent_size = 4
+
+ [*.js]
+ indent_size = 2
+
+ [*.yaml]
+ indent_size = 2
+
+ [*.md]
+ trim_trailing_whitespace = false
.env ADDED
@@ -0,0 +1,97 @@
+ ## Set number of threads.
+ ## Note: prefer the number of physical cores. Overbooking the CPU degrades performance notably.
+ # LOCALAI_THREADS=14
+
+ ## Specify a different bind address (defaults to ":8080")
+ # LOCALAI_ADDRESS=127.0.0.1:8080
+
+ ## Default models context size
+ # LOCALAI_CONTEXT_SIZE=512
+ #
+ ## Define galleries.
+ ## Models to install will be visible in `/models/available`
+ # LOCALAI_GALLERIES=[{"name":"localai", "url":"github:mudler/LocalAI/gallery/index.yaml@master"}]
+
+ ## CORS settings
+ # LOCALAI_CORS=true
+ # LOCALAI_CORS_ALLOW_ORIGINS=*
+
+ ## Default path for models
+ #
+ # LOCALAI_MODELS_PATH=/models
+
+ ## Enable debug mode
+ # LOCALAI_LOG_LEVEL=debug
+
+ ## Disables COMPEL (Diffusers)
+ # COMPEL=0
+
+ ## Enable/Disable single backend (useful if only one GPU is available)
+ # LOCALAI_SINGLE_ACTIVE_BACKEND=true
+
+ ## Specify a build type. Available: cublas, openblas, clblas.
+ ## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
+ ## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
+ ## clBLAS: This is an open-source implementation of the BLAS library that uses OpenCL, a framework for writing programs that execute across heterogeneous platforms consisting of CPUs, GPUs, and other processors. clBLAS is designed to take advantage of the parallel computing power of GPUs but can also run on any hardware that supports OpenCL. This includes hardware from different vendors like Nvidia, AMD, and Intel.
+ # BUILD_TYPE=openblas
+
+ ## Uncomment and set to true to enable rebuilding from source
+ # REBUILD=true
+
+ ## Enable go tags, available: stablediffusion, tts
+ ## stablediffusion: image generation with stablediffusion
+ ## tts: enables text-to-speech with go-piper
+ ## (requires REBUILD=true)
+ #
+ # GO_TAGS=stablediffusion
+
+ ## Path where generated images are stored
+ # LOCALAI_IMAGE_PATH=/tmp/generated/images
+
+ ## Specify a default upload limit in MB (whisper)
+ # LOCALAI_UPLOAD_LIMIT=15
+
+ ## List of external GRPC backends (note: on the container image this variable is already set to use the extra backends available in extra/)
+ # LOCALAI_EXTERNAL_GRPC_BACKENDS=my-backend:127.0.0.1:9000,my-backend2:/usr/bin/backend.py
+
+ ### Advanced settings ###
+ ### These are not used by LocalAI itself, but by other components in the stack ###
+ ##
+ ### Preload libraries
+ # LD_PRELOAD=
+
+ ### Huggingface cache for models
+ # HUGGINGFACE_HUB_CACHE=/usr/local/huggingface
+
+ ### Python backends GRPC max workers
+ ### Default number of workers for GRPC Python backends.
+ ### This actually controls whether a backend can process multiple requests or not.
+ # PYTHON_GRPC_MAX_WORKERS=1
+
+ ### Define the number of parallel LLAMA.cpp workers (Defaults to 1)
+ # LLAMACPP_PARALLEL=1
+
+ ### Define a list of GRPC Servers for llama-cpp workers to distribute the load
+ # https://github.com/ggerganov/llama.cpp/pull/6829
+ # https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md
+ # LLAMACPP_GRPC_SERVERS=""
+
+ ### Enable to run parallel requests
+ # LOCALAI_PARALLEL_REQUESTS=true
+
+ # Enable to allow p2p mode
+ # LOCALAI_P2P=true
+
+ ### Watchdog settings
+ ###
+ # Enables the watchdog to kill backends that have been inactive for too long
+ # LOCALAI_WATCHDOG_IDLE=true
+ #
+ # Time in duration format (e.g. 1h30m) after which a backend is considered idle
+ # LOCALAI_WATCHDOG_IDLE_TIMEOUT=5m
+ #
+ # Enables the watchdog to kill backends that have been busy for too long
+ # LOCALAI_WATCHDOG_BUSY=true
+ #
+ # Time in duration format (e.g. 1h30m) after which a backend is considered busy
+ # LOCALAI_WATCHDOG_BUSY_TIMEOUT=5m
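LocalAI reads these variables at startup, so the file doubles as a reference for container deployments. A minimal sketch of passing a few of them to a container (the image tag and host model path are placeholders, not part of this commit):

```bash
# Start LocalAI with the whole .env file, overriding a couple of values inline.
docker run --env-file .env \
  -e LOCALAI_THREADS=4 \
  -e LOCALAI_LOG_LEVEL=debug \
  -v "$PWD/models:/models" \
  -p 8080:8080 \
  localai/localai:latest
```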
.gitattributes CHANGED
@@ -1,35 +1 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.sh text eol=lf
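The net effect of this change is that the Git LFS filters are dropped and shell scripts are always checked out with LF line endings. A quick way to verify the attribute locally (plain Git; `entrypoint.sh`, referenced by the Dockerfile below, is used here only as an example path):

```bash
# Show the text/eol attributes that now apply to shell scripts.
git check-attr text eol -- entrypoint.sh
```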
.gitignore ADDED
@@ -0,0 +1,59 @@
+ # go-llama build artifacts
+ /sources/
+ __pycache__/
+ *.a
+ get-sources
+ prepare-sources
+ /backend/cpp/llama/grpc-server
+ /backend/cpp/llama/llama.cpp
+ /backend/cpp/llama-*
+
+ *.log
+
+ go-ggml-transformers
+ go-gpt2
+ go-rwkv
+ whisper.cpp
+ /bloomz
+ go-bert
+
+ # LocalAI build binary
+ LocalAI
+ local-ai
+ # prevent above rules from omitting the helm chart
+ !charts/*
+ # prevent above rules from omitting the api/localai folder
+ !api/localai
+ !core/**/localai
+
+ # Ignore models
+ models/*
+ test-models/
+ test-dir/
+
+ release/
+
+ # just in case
+ .DS_Store
+ .idea
+
+ # Generated during build
+ backend-assets/*
+ !backend-assets/.keep
+ prepare
+ /ggml-metal.metal
+ docs/static/gallery.html
+
+ # Protobuf generated files
+ *.pb.go
+ *pb2.py
+ *pb2_grpc.py
+
+ # SonarQube
+ .scannerwork
+
+ # backend virtual environments
+ **/venv
+
+ # per-developer customization files for the development container
+ .devcontainer/customization/*
.gitmodules ADDED
@@ -0,0 +1,6 @@
+ [submodule "docs/themes/hugo-theme-relearn"]
+ path = docs/themes/hugo-theme-relearn
+ url = https://github.com/McShelby/hugo-theme-relearn.git
+ [submodule "docs/themes/lotusdocs"]
+ path = docs/themes/lotusdocs
+ url = https://github.com/colinwilson/lotusdocs
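These submodules only carry the Hugo themes for the documentation site. A minimal sketch of fetching them after cloning (standard Git, nothing project-specific):

```bash
# Pull the documentation themes declared in .gitmodules.
git submodule update --init docs/themes/hugo-theme-relearn docs/themes/lotusdocs
```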
.yamllint ADDED
@@ -0,0 +1,4 @@
+ extends: default
+
+ rules:
+ line-length: disable
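This configuration relaxes only the line-length rule; everything else follows yamllint's defaults. A hedged sketch of running it locally (assuming `yamllint` is installed; the target paths are just examples):

```bash
# yamllint picks up the .yamllint file in the repository root automatically.
yamllint .github/ gallery/
```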
CONTRIBUTING.md ADDED
@@ -0,0 +1,84 @@
+ # Contributing to LocalAI
+
+ Thank you for your interest in contributing to LocalAI! We appreciate your time and effort in helping to improve our project. Before you get started, please take a moment to review these guidelines.
+
+ ## Table of Contents
+
+ - [Getting Started](#getting-started)
+   - [Prerequisites](#prerequisites)
+   - [Setting up the Development Environment](#setting-up-the-development-environment)
+ - [Contributing](#contributing)
+   - [Submitting an Issue](#submitting-an-issue)
+   - [Creating a Pull Request (PR)](#creating-a-pull-request-pr)
+ - [Coding Guidelines](#coding-guidelines)
+ - [Testing](#testing)
+ - [Documentation](#documentation)
+ - [Community and Communication](#community-and-communication)
+
+ ## Getting Started
+
+ ### Prerequisites
+
+ - Golang [1.21]
+ - Git
+ - macOS/Linux
+
+ ### Setting up the Development Environment and running LocalAI locally
+
+ 1. Clone the repository: `git clone https://github.com/go-skynet/LocalAI.git`
+ 2. Navigate to the project directory: `cd LocalAI`
+ 3. Install the required dependencies (see https://localai.io/basics/build/#build-localai-locally)
+ 4. Build LocalAI: `make build`
+ 5. Run LocalAI: `./local-ai`
+
+ ## Contributing
+
+ We welcome contributions from everyone! To get started, follow these steps:
+
+ ### Submitting an Issue
+
+ If you find a bug, have a feature request, or encounter any issues, please check the [issue tracker](https://github.com/go-skynet/LocalAI/issues) to see if a similar issue has already been reported. If not, feel free to [create a new issue](https://github.com/go-skynet/LocalAI/issues/new) and provide as much detail as possible.
+
+ ### Creating a Pull Request (PR)
+
+ 1. Fork the repository.
+ 2. Create a new branch with a descriptive name: `git checkout -b [branch name]`
+ 3. Make your changes and commit them.
+ 4. Push the changes to your fork: `git push origin [branch name]`
+ 5. Create a new pull request from your branch to the main project's `main` or `master` branch.
+ 6. Provide a clear description of your changes in the pull request.
+ 7. Make any requested changes during the review process.
+ 8. Once your PR is approved, it will be merged into the main project.
+
+ ## Coding Guidelines
+
+ - No specific coding guidelines at the moment. Please make sure the code can be tested. Popular lint tools such as [`golangci-lint`](https://golangci-lint.run) can help you here.
+
+ ## Testing
+
+ `make test` cannot cover all models yet. Please be sure to add a test case for any new feature or changed code.
+
+ ### Running AIO tests
+
+ All-In-One images have a set of tests that automatically verify that most of the endpoints work correctly. A typical flow is:
+
+ ```bash
+ # Build the LocalAI docker image
+ make DOCKER_IMAGE=local-ai docker
+
+ # Build the corresponding AIO image
+ BASE_IMAGE=local-ai DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
+
+ # Run the AIO e2e tests
+ LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio make run-e2e-aio
+ ```
+
+ ## Documentation
+
+ Documentation contributions are welcome; please open a new PR or create a new issue. The documentation lives under `docs/`: https://github.com/mudler/LocalAI/tree/master/docs
+
+ ## Community and Communication
+
+ - You can reach out via the GitHub issue tracker.
+ - Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions)
+ - Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy)
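A compact sketch of the local setup flow described in the development-environment section above (the commands simply restate steps 1-5; install the build dependencies first, as linked there):

```bash
# Clone, build and run LocalAI locally.
git clone https://github.com/go-skynet/LocalAI.git
cd LocalAI
make build     # produces the ./local-ai binary
./local-ai     # serves the API, by default on :8080
```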
Dockerfile ADDED
@@ -0,0 +1,491 @@
1
+ ARG IMAGE_TYPE=extras
2
+ ARG BASE_IMAGE=ubuntu:22.04
3
+ ARG GRPC_BASE_IMAGE=${BASE_IMAGE}
4
+ ARG INTEL_BASE_IMAGE=${BASE_IMAGE}
5
+
6
+ # The requirements-core target is common to all images. Nothing should be placed in requirements-core unless every single build will use it.
7
+ FROM ${BASE_IMAGE} AS requirements-core
8
+
9
+ USER root
10
+
11
+ ARG GO_VERSION=1.22.6
12
+ ARG CMAKE_VERSION=3.26.4
13
+ ARG CMAKE_FROM_SOURCE=false
14
+ ARG TARGETARCH
15
+ ARG TARGETVARIANT
16
+
17
+ ENV DEBIAN_FRONTEND=noninteractive
18
+ ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
19
+
20
+
21
+ RUN apt-get update && \
22
+ apt-get install -y --no-install-recommends \
23
+ build-essential \
24
+ ccache \
25
+ ca-certificates \
26
+ curl libssl-dev \
27
+ git \
28
+ unzip upx-ucl && \
29
+ apt-get clean && \
30
+ rm -rf /var/lib/apt/lists/*
31
+
32
+ # Install CMake (the version in 22.04 is too old)
33
+ RUN <<EOT bash
34
+ if [ "${CMAKE_FROM_SOURCE}" = "true" ]; then
35
+ curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
36
+ else
37
+ apt-get update && \
38
+ apt-get install -y \
39
+ cmake && \
40
+ apt-get clean && \
41
+ rm -rf /var/lib/apt/lists/*
42
+ fi
43
+ EOT
44
+
45
+ # Install Go
46
+ RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
47
+ ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin
48
+
49
+ # Install grpc compilers
50
+ RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
51
+ go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
52
+
53
+ COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
54
+ RUN update-ca-certificates
55
+
56
+ RUN test -n "$TARGETARCH" \
57
+ || (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
58
+
59
+ # Use the variables in subsequent instructions
60
+ RUN echo "Target Architecture: $TARGETARCH"
61
+ RUN echo "Target Variant: $TARGETVARIANT"
62
+
63
+ # Cuda
64
+ ENV PATH=/usr/local/cuda/bin:${PATH}
65
+
66
+ # HipBLAS requirements
67
+ ENV PATH=/opt/rocm/bin:${PATH}
68
+
69
+ # OpenBLAS requirements and stable diffusion
70
+ RUN apt-get update && \
71
+ apt-get install -y --no-install-recommends \
72
+ libopenblas-dev \
73
+ libopencv-dev && \
74
+ apt-get clean && \
75
+ rm -rf /var/lib/apt/lists/*
76
+
77
+ # Set up OpenCV
78
+ RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
79
+
80
+ WORKDIR /build
81
+
82
+ ###################################
83
+ ###################################
84
+
85
+ # The requirements-extras target is for any builds with IMAGE_TYPE=extras. It should not be placed in this target unless every IMAGE_TYPE=extras build will use it
86
+ FROM requirements-core AS requirements-extras
87
+
88
+ RUN curl -LsSf https://astral.sh/uv/install.sh | sh
89
+ ENV PATH="/root/.cargo/bin:${PATH}"
90
+
91
+ RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
92
+ RUN apt-get update && \
93
+ apt-get install -y --no-install-recommends \
94
+ espeak-ng \
95
+ espeak \
96
+ python3-pip \
97
+ python-is-python3 \
98
+ python3-dev llvm \
99
+ python3-venv && \
100
+ apt-get clean && \
101
+ rm -rf /var/lib/apt/lists/* && \
102
+ pip install --upgrade pip
103
+
104
+ # Install grpcio-tools (the version in 22.04 is too old)
105
+ RUN pip install --user grpcio-tools
106
+
107
+ ###################################
108
+ ###################################
109
+
110
+ # The requirements-drivers target is for BUILD_TYPE specific items. If you need to install something specific to CUDA, or specific to ROCM, it goes here.
111
+ # This target will be built on top of requirements-core or requirements-extras as determined by the IMAGE_TYPE build-arg
112
+ FROM requirements-${IMAGE_TYPE} AS requirements-drivers
113
+
114
+ ARG BUILD_TYPE
115
+ ARG CUDA_MAJOR_VERSION=12
116
+ ARG CUDA_MINOR_VERSION=0
117
+
118
+ ENV BUILD_TYPE=${BUILD_TYPE}
119
+
120
+ # Vulkan requirements
121
+ RUN <<EOT bash
122
+ if [ "${BUILD_TYPE}" = "vulkan" ]; then
123
+ apt-get update && \
124
+ apt-get install -y --no-install-recommends \
125
+ software-properties-common pciutils wget gpg-agent && \
126
+ wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
127
+ wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
128
+ apt-get update && \
129
+ apt-get install -y \
130
+ vulkan-sdk && \
131
+ apt-get clean && \
132
+ rm -rf /var/lib/apt/lists/*
133
+ fi
134
+ EOT
135
+
136
+ # CuBLAS requirements
137
+ RUN <<EOT bash
138
+ if [ "${BUILD_TYPE}" = "cublas" ]; then
139
+ apt-get update && \
140
+ apt-get install -y --no-install-recommends \
141
+ software-properties-common pciutils
142
+ if [ "amd64" = "$TARGETARCH" ]; then
143
+ curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
144
+ fi
145
+ if [ "arm64" = "$TARGETARCH" ]; then
146
+ curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
147
+ fi
148
+ dpkg -i cuda-keyring_1.1-1_all.deb && \
149
+ rm -f cuda-keyring_1.1-1_all.deb && \
150
+ apt-get update && \
151
+ apt-get install -y --no-install-recommends \
152
+ cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
153
+ libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
154
+ libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
155
+ libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
156
+ libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
157
+ libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
158
+ apt-get clean && \
159
+ rm -rf /var/lib/apt/lists/*
160
+ fi
161
+ EOT
162
+
163
+ # If we are building with clblas support, we need the libraries for the builds
164
+ RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
165
+ apt-get update && \
166
+ apt-get install -y --no-install-recommends \
167
+ libclblast-dev && \
168
+ apt-get clean && \
169
+ rm -rf /var/lib/apt/lists/* \
170
+ ; fi
171
+
172
+ RUN if [ "${BUILD_TYPE}" = "hipblas" ]; then \
173
+ apt-get update && \
174
+ apt-get install -y --no-install-recommends \
175
+ hipblas-dev \
176
+ rocblas-dev && \
177
+ apt-get clean && \
178
+ rm -rf /var/lib/apt/lists/* && \
179
+ # I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
180
+ # to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
181
+ ldconfig \
182
+ ; fi
183
+
184
+ ###################################
185
+ ###################################
186
+
187
+ # Temporary workaround for Intel's repository to work correctly
188
+ # https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/APT-Repository-not-working-signatures-invalid/m-p/1599436/highlight/true#M36143
189
+ # This is a temporary workaround until Intel fixes their repository
190
+ FROM ${INTEL_BASE_IMAGE} AS intel
191
+ RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | \
192
+ gpg --yes --dearmor --output /usr/share/keyrings/intel-graphics.gpg
193
+ RUN echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" > /etc/apt/sources.list.d/intel-graphics.list
194
+
195
+ ###################################
196
+ ###################################
197
+
198
+ # The grpc target does one thing, it builds and installs GRPC. This is in its own layer so that it can be effectively cached by CI.
199
+ # You probably don't need to change anything here, and if you do, make sure that CI is adjusted so that the cache continues to work.
200
+ FROM ${GRPC_BASE_IMAGE} AS grpc
201
+
202
+ # This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
203
+ ARG GRPC_MAKEFLAGS="-j4 -Otarget"
204
+ ARG GRPC_VERSION=v1.65.0
205
+ ARG CMAKE_FROM_SOURCE=false
206
+ ARG CMAKE_VERSION=3.26.4
207
+
208
+ ENV MAKEFLAGS=${GRPC_MAKEFLAGS}
209
+
210
+ WORKDIR /build
211
+
212
+ RUN apt-get update && \
213
+ apt-get install -y --no-install-recommends \
214
+ ca-certificates \
215
+ build-essential curl libssl-dev \
216
+ git && \
217
+ apt-get clean && \
218
+ rm -rf /var/lib/apt/lists/*
219
+
220
+ # Install CMake (the version in 22.04 is too old)
221
+ RUN <<EOT bash
222
+ if [ "${CMAKE_FROM_SOURCE}" = "true" ]; then
223
+ curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
224
+ else
225
+ apt-get update && \
226
+ apt-get install -y \
227
+ cmake && \
228
+ apt-get clean && \
229
+ rm -rf /var/lib/apt/lists/*
230
+ fi
231
+ EOT
232
+
233
+ # We install GRPC to a different prefix here so that we can copy in only the build artifacts later
234
+ # saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
235
+ # and running make install in the target container
236
+ RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
237
+ mkdir -p /build/grpc/cmake/build && \
238
+ cd /build/grpc/cmake/build && \
239
+ sed -i "216i\ TESTONLY" "../../third_party/abseil-cpp/absl/container/CMakeLists.txt" && \
240
+ cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX:PATH=/opt/grpc ../.. && \
241
+ make && \
242
+ make install && \
243
+ rm -rf /build
244
+
245
+ ###################################
246
+ ###################################
247
+
248
+ # The builder-base target has the arguments, variables, and copies shared between full builder images and the uncompiled devcontainer
249
+
250
+ FROM requirements-drivers AS builder-base
251
+
252
+ ARG GO_TAGS="stablediffusion tts p2p"
253
+ ARG GRPC_BACKENDS
254
+ ARG MAKEFLAGS
255
+ ARG LD_FLAGS="-s -w"
256
+
257
+ ENV GRPC_BACKENDS=${GRPC_BACKENDS}
258
+ ENV GO_TAGS=${GO_TAGS}
259
+ ENV MAKEFLAGS=${MAKEFLAGS}
260
+ ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
261
+ ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
262
+ ENV NVIDIA_VISIBLE_DEVICES=all
263
+ ENV LD_FLAGS=${LD_FLAGS}
264
+
265
+ RUN echo "GO_TAGS: $GO_TAGS" && echo "TARGETARCH: $TARGETARCH"
266
+
267
+ WORKDIR /build
268
+
269
+
270
+ # We need protoc installed, and the version in 22.04 is too old. We will create one as part of installing the GRPC build below
271
+ # but that will also bring in a newer version of absl, which stablediffusion cannot compile with. This version of protoc is only
272
+ # here so that we can generate the grpc code for the stablediffusion build
273
+ RUN <<EOT bash
274
+ if [ "amd64" = "$TARGETARCH" ]; then
275
+ curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-x86_64.zip -o protoc.zip && \
276
+ unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
277
+ rm protoc.zip
278
+ fi
279
+ if [ "arm64" = "$TARGETARCH" ]; then
280
+ curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-aarch_64.zip -o protoc.zip && \
281
+ unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
282
+ rm protoc.zip
283
+ fi
284
+ EOT
285
+
286
+
287
+ ###################################
288
+ ###################################
289
+
290
+ # This first portion of builder holds the layers specifically used to build backend-assets/grpc/stablediffusion
291
+ # In most cases, builder is the image you should be using - however, this can save build time if one just needs to copy backend-assets/grpc/stablediffusion and nothing else.
292
+ FROM builder-base AS builder-sd
293
+
294
+ # stablediffusion does not tolerate a newer version of abseil, copy only over enough elements to build it
295
+ COPY Makefile .
296
+ COPY go.mod .
297
+ COPY go.sum .
298
+ COPY backend/backend.proto ./backend/backend.proto
299
+ COPY backend/go/image/stablediffusion ./backend/go/image/stablediffusion
300
+ COPY pkg/grpc ./pkg/grpc
301
+ COPY pkg/stablediffusion ./pkg/stablediffusion
302
+ RUN git init
303
+ RUN make sources/go-stable-diffusion
304
+ RUN touch prepare-sources
305
+
306
+ # Actually build the backend
307
+ RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make backend-assets/grpc/stablediffusion
308
+
309
+ ###################################
310
+ ###################################
311
+
312
+ # The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
313
+ # Adjustments to the build process should likely be made here.
314
+ FROM builder-sd AS builder
315
+
316
+ # Install the pre-built GRPC
317
+ COPY --from=grpc /opt/grpc /usr/local
318
+
319
+ # Rebuild with defaults backends
320
+ WORKDIR /build
321
+
322
+ COPY . .
323
+ COPY .git .
324
+
325
+ RUN make prepare
326
+
327
+ ## Build the binary
328
+ ## If it's CUDA or hipblas, we want to skip some of the llama-compat backends to save space
329
+ ## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
330
+ ## (both will use CUDA or hipblas for the actual computation)
331
+ RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
332
+ SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
333
+ else \
334
+ make build; \
335
+ fi
336
+
337
+ RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
338
+ mkdir -p /build/sources/go-piper/piper-phonemize/pi/lib/ && \
339
+ touch /build/sources/go-piper/piper-phonemize/pi/lib/keep \
340
+ ; fi
341
+
342
+ ###################################
343
+ ###################################
344
+
345
+ # The devcontainer target is not used on CI. It is a target for developers to use locally -
346
+ # rather than copying files it mounts them locally and leaves building to the developer
347
+
348
+ FROM builder-base AS devcontainer
349
+
350
+ ARG FFMPEG
351
+
352
+ COPY --from=grpc /opt/grpc /usr/local
353
+
354
+ COPY --from=builder-sd /build/backend-assets/grpc/stablediffusion /build/backend-assets/grpc/stablediffusion
355
+
356
+ COPY .devcontainer-scripts /.devcontainer-scripts
357
+
358
+ # Add FFmpeg
359
+ RUN if [ "${FFMPEG}" = "true" ]; then \
360
+ apt-get update && \
361
+ apt-get install -y --no-install-recommends \
362
+ ffmpeg && \
363
+ apt-get clean && \
364
+ rm -rf /var/lib/apt/lists/* \
365
+ ; fi
366
+
367
+ RUN apt-get update && \
368
+ apt-get install -y --no-install-recommends \
369
+ ssh less wget
370
+ # For the devcontainer, leave apt functional in case additional devtools are needed at runtime.
371
+
372
+ RUN go install github.com/go-delve/delve/cmd/dlv@latest
373
+
374
+ RUN go install github.com/mikefarah/yq/v4@latest
375
+
376
+ ###################################
377
+ ###################################
378
+
379
+ # This is the final target. The result of this target will be the image uploaded to the registry.
380
+ # If you cannot find a more suitable place for an addition, this layer is a suitable place for it.
381
+ FROM requirements-drivers
382
+
383
+ ARG FFMPEG
384
+ ARG BUILD_TYPE
385
+ ARG TARGETARCH
386
+ ARG IMAGE_TYPE=extras
387
+ ARG EXTRA_BACKENDS
388
+ ARG MAKEFLAGS
389
+
390
+ ENV BUILD_TYPE=${BUILD_TYPE}
391
+ ENV REBUILD=false
392
+ ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz
393
+ ENV MAKEFLAGS=${MAKEFLAGS}
394
+
395
+ ARG CUDA_MAJOR_VERSION=12
396
+ ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
397
+ ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
398
+ ENV NVIDIA_VISIBLE_DEVICES=all
399
+
400
+ # Add FFmpeg
401
+ RUN if [ "${FFMPEG}" = "true" ]; then \
402
+ apt-get update && \
403
+ apt-get install -y --no-install-recommends \
404
+ ffmpeg && \
405
+ apt-get clean && \
406
+ rm -rf /var/lib/apt/lists/* \
407
+ ; fi
408
+
409
+ WORKDIR /build
410
+
411
+ # we start fresh & re-copy all assets because `make build` does not clean up nicely after itself
412
+ # so when `entrypoint.sh` runs `make build` again (which it does by default), the build would fail
413
+ # see https://github.com/go-skynet/LocalAI/pull/658#discussion_r1241971626 and
414
+ # https://github.com/go-skynet/LocalAI/pull/434
415
+ COPY . .
416
+
417
+ COPY --from=builder /build/sources ./sources/
418
+ COPY --from=grpc /opt/grpc /usr/local
419
+
420
+ RUN make prepare-sources
421
+
422
+ # Copy the binary
423
+ COPY --from=builder /build/local-ai ./
424
+
425
+ # Copy shared libraries for piper
426
+ COPY --from=builder /build/sources/go-piper/piper-phonemize/pi/lib/* /usr/lib/
427
+
428
+ # do not let stablediffusion rebuild (requires an older version of absl)
429
+ COPY --from=builder-sd /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion
430
+
431
+ # Change the shell to bash so we can use [[ tests below
432
+ SHELL ["/bin/bash", "-c"]
433
+ # We try to strike a balance between individual layer size (as that affects total push time) and total image size
434
+ # Splitting the backends into more groups with fewer items results in a larger image, but a smaller size for the largest layer
435
+ # Splitting the backends into fewer groups with more items results in a smaller image, but a larger size for the largest layer
436
+
437
+ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
438
+ make -C backend/python/coqui \
439
+ ; fi && \
440
+ if [[ ( "${EXTRA_BACKENDS}" =~ "parler-tts" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
441
+ make -C backend/python/parler-tts \
442
+ ; fi && \
443
+ if [[ ( "${EXTRA_BACKENDS}" =~ "diffusers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
444
+ make -C backend/python/diffusers \
445
+ ; fi && \
446
+ if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
447
+ make -C backend/python/transformers-musicgen \
448
+ ; fi
449
+
450
+ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
451
+ make -C backend/python/vall-e-x \
452
+ ; fi && \
453
+ if [[ ( "${EXTRA_BACKENDS}" =~ "openvoice" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
454
+ make -C backend/python/openvoice \
455
+ ; fi && \
456
+ if [[ ( "${EXTRA_BACKENDS}" =~ "sentencetransformers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
457
+ make -C backend/python/sentencetransformers \
458
+ ; fi && \
459
+ if [[ ( "${EXTRA_BACKENDS}" =~ "exllama2" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
460
+ make -C backend/python/exllama2 \
461
+ ; fi && \
462
+ if [[ ( "${EXTRA_BACKENDS}" =~ "transformers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
463
+ make -C backend/python/transformers \
464
+ ; fi
465
+
466
+ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
467
+ make -C backend/python/vllm \
468
+ ; fi && \
469
+ if [[ ( "${EXTRA_BACKENDS}" =~ "autogptq" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
470
+ make -C backend/python/autogptq \
471
+ ; fi && \
472
+ if [[ ( "${EXTRA_BACKENDS}" =~ "bark" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
473
+ make -C backend/python/bark \
474
+ ; fi && \
475
+ if [[ ( "${EXTRA_BACKENDS}" =~ "rerankers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
476
+ make -C backend/python/rerankers \
477
+ ; fi && \
478
+ if [[ ( "${EXTRA_BACKENDS}" =~ "mamba" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
479
+ make -C backend/python/mamba \
480
+ ; fi
481
+
482
+ # Make sure the models directory exists
483
+ RUN mkdir -p /build/models
484
+
485
+ # Define the health check command
486
+ HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
487
+ CMD curl -f ${HEALTHCHECK_ENDPOINT} || exit 1
488
+
489
+ VOLUME /build/models
490
+ EXPOSE 8080
491
+ ENTRYPOINT [ "/build/entrypoint.sh" ]
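To tie the stages together, here is a hedged sketch of building a CUDA-enabled image with the build arguments this Dockerfile declares (the tag is arbitrary, the argument values mirror the defaults above, and BuildKit is assumed so that TARGETARCH is populated):

```bash
# Multi-stage build selecting the extras image type with cuBLAS acceleration.
docker build \
  --build-arg IMAGE_TYPE=extras \
  --build-arg BUILD_TYPE=cublas \
  --build-arg CUDA_MAJOR_VERSION=12 \
  --build-arg CUDA_MINOR_VERSION=0 \
  --build-arg FFMPEG=true \
  -t local-ai:cublas .
```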
Dockerfile.aio ADDED
@@ -0,0 +1,8 @@
+ ARG BASE_IMAGE=ubuntu:22.04
+
+ FROM ${BASE_IMAGE}
+
+ RUN apt-get update && apt-get install -y pciutils && apt-get clean
+
+ COPY aio/ /aio
+ ENTRYPOINT [ "/aio/entrypoint.sh" ]
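This image only layers the `aio/` directory and its entrypoint on top of an existing LocalAI image. A minimal sketch matching the AIO flow shown in CONTRIBUTING.md above (the image names are the same placeholders used there):

```bash
# Build the AIO image on top of a locally built LocalAI image.
docker build -f Dockerfile.aio --build-arg BASE_IMAGE=local-ai -t local-ai-aio:test .
```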
Earthfile ADDED
@@ -0,0 +1,5 @@
+ VERSION 0.7
+
+ build:
+ FROM DOCKERFILE -f Dockerfile .
+ SAVE ARTIFACT /usr/bin/local-ai AS LOCAL local-ai
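For completeness, a hedged sketch of driving this target with the Earthly CLI (assuming `earthly` is installed; the artifact path comes from the SAVE ARTIFACT line above):

```bash
# Builds the Dockerfile via Earthly and saves /usr/bin/local-ai locally as ./local-ai.
earthly +build
```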
Entitlements.plist ADDED
@@ -0,0 +1,10 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+ <plist version="1.0">
+ <dict>
+ <key>com.apple.security.network.client</key>
+ <true/>
+ <key>com.apple.security.network.server</key>
+ <true/>
+ </dict>
+ </plist>
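These entitlements are applied by the Makefile's `osx-signed` target later in this diff; a minimal sketch of the equivalent manual step on macOS (the signing identity is machine-specific):

```bash
# Sign the built binary with the network client/server entitlements above.
codesign --deep --force --sign "$OSX_SIGNING_IDENTITY" \
  --entitlements ./Entitlements.plist ./local-ai
```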
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023-2024 Ettore Di Giacinto (mudler@localai.io)
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
Makefile ADDED
@@ -0,0 +1,932 @@
1
+ GOCMD=go
2
+ GOTEST=$(GOCMD) test
3
+ GOVET=$(GOCMD) vet
4
+ BINARY_NAME=local-ai
5
+
6
+ DETECT_LIBS?=true
7
+
8
+ # llama.cpp versions
9
+ GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
10
+ GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
11
+ CPPLLAMA_VERSION?=d5a409e57fe8bd24fef597ab8a31110d390a6392
12
+
13
+ # go-rwkv version
14
+ RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
15
+ RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
16
+
17
+ # whisper.cpp version
18
+ WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
19
+ WHISPER_CPP_VERSION?=0377596b77a3602e36430320cbe45f8c305ef04a
20
+
21
+ # bert.cpp version
22
+ BERT_REPO?=https://github.com/go-skynet/go-bert.cpp
23
+ BERT_VERSION?=710044b124545415f555e4260d16b146c725a6e4
24
+
25
+ # go-piper version
26
+ PIPER_REPO?=https://github.com/mudler/go-piper
27
+ PIPER_VERSION?=e10ca041a885d4a8f3871d52924b47792d5e5aa0
28
+
29
+ # stablediffusion version
30
+ STABLEDIFFUSION_REPO?=https://github.com/mudler/go-stable-diffusion
31
+ STABLEDIFFUSION_VERSION?=4a3cd6aeae6f66ee57eae9a0075f8c58c3a6a38f
32
+
33
+ # tinydream version
34
+ TINYDREAM_REPO?=https://github.com/M0Rf30/go-tiny-dream
35
+ TINYDREAM_VERSION?=c04fa463ace9d9a6464313aa5f9cd0f953b6c057
36
+
37
+ export BUILD_TYPE?=
38
+ export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
39
+ export CMAKE_ARGS?=
40
+ export BACKEND_LIBS?=
41
+
42
+ CGO_LDFLAGS?=
43
+ CGO_LDFLAGS_WHISPER?=
44
+ CGO_LDFLAGS_WHISPER+=-lggml
45
+ CUDA_LIBPATH?=/usr/local/cuda/lib64/
46
+ GO_TAGS?=
47
+ BUILD_ID?=
48
+
49
+ TEST_DIR=/tmp/test
50
+
51
+ TEST_FLAKES?=5
52
+
53
+ RANDOM := $(shell bash -c 'echo $$RANDOM')
54
+
55
+ VERSION?=$(shell git describe --always --tags || echo "dev" )
56
+ # go tool nm ./local-ai | grep Commit
57
+ LD_FLAGS?=-s -w
58
+ override LD_FLAGS += -X "github.com/mudler/LocalAI/internal.Version=$(VERSION)"
59
+ override LD_FLAGS += -X "github.com/mudler/LocalAI/internal.Commit=$(shell git rev-parse HEAD)"
60
+
61
+ OPTIONAL_TARGETS?=
62
+
63
+ export OS := $(shell uname -s)
64
+ ARCH := $(shell uname -m)
65
+ GREEN := $(shell tput -Txterm setaf 2)
66
+ YELLOW := $(shell tput -Txterm setaf 3)
67
+ WHITE := $(shell tput -Txterm setaf 7)
68
+ CYAN := $(shell tput -Txterm setaf 6)
69
+ RESET := $(shell tput -Txterm sgr0)
70
+
71
+ UPX?=
72
+ # check if upx exists
73
+ ifeq (, $(shell which upx))
74
+ UPX=
75
+ else
76
+ UPX=$(shell which upx)
77
+ endif
78
+
79
+ # Default Docker bridge IP
80
+ E2E_BRIDGE_IP?=172.17.0.1
81
+
82
+ ifndef UNAME_S
83
+ UNAME_S := $(shell uname -s)
84
+ endif
85
+
86
+ ifeq ($(OS),Darwin)
87
+
88
+ ifeq ($(OSX_SIGNING_IDENTITY),)
89
+ OSX_SIGNING_IDENTITY := $(shell security find-identity -v -p codesigning | grep '"' | head -n 1 | sed -E 's/.*"(.*)"/\1/')
90
+ endif
91
+
92
+ # on OSX, if BUILD_TYPE is blank, we should default to use Metal
93
+ ifeq ($(BUILD_TYPE),)
94
+ BUILD_TYPE=metal
95
+ # disable metal if on Darwin and any other value is explicitly passed.
96
+ else ifneq ($(BUILD_TYPE),metal)
97
+ CMAKE_ARGS+=-DGGML_METAL=OFF
98
+ export GGML_NO_ACCELERATE=1
99
+ export GGML_NO_METAL=1
100
+ endif
101
+
102
+ ifeq ($(BUILD_TYPE),metal)
103
+ # -lcblas removed: it seems to always be listed as a duplicate flag.
104
+ CGO_LDFLAGS += -framework Accelerate
105
+ endif
106
+ else
107
+ CGO_LDFLAGS_WHISPER+=-lgomp
108
+ endif
109
+
110
+ ifeq ($(BUILD_TYPE),openblas)
111
+ CGO_LDFLAGS+=-lopenblas
112
+ export GGML_OPENBLAS=1
113
+ endif
114
+
115
+ ifeq ($(BUILD_TYPE),cublas)
116
+ CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
117
+ export GGML_CUDA=1
118
+ CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda -lcufft
119
+ endif
120
+
121
+ ifeq ($(BUILD_TYPE),vulkan)
122
+ CMAKE_ARGS+=-DGGML_VULKAN=1
123
+ endif
124
+
125
+ ifneq (,$(findstring sycl,$(BUILD_TYPE)))
126
+ export GGML_SYCL=1
127
+ endif
128
+
129
+ ifeq ($(BUILD_TYPE),sycl_f16)
130
+ export GGML_SYCL_F16=1
131
+ endif
132
+
133
+ ifeq ($(BUILD_TYPE),hipblas)
134
+ ROCM_HOME ?= /opt/rocm
135
+ ROCM_PATH ?= /opt/rocm
136
+ LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
137
+ export CXX=$(ROCM_HOME)/llvm/bin/clang++
138
+ export CC=$(ROCM_HOME)/llvm/bin/clang
139
+ # llama-ggml has no hipblas support, so override it here.
140
+ export STABLE_BUILD_TYPE=
141
+ export GGML_HIPBLAS=1
142
+ GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
143
+ AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
144
+ CMAKE_ARGS+=-DGGML_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
145
+ CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
146
+ endif
147
+
148
+ ifeq ($(BUILD_TYPE),metal)
149
+ CGO_LDFLAGS+=-framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
150
+ export GGML_METAL=1
151
+ endif
152
+
153
+ ifeq ($(BUILD_TYPE),clblas)
154
+ CGO_LDFLAGS+=-lOpenCL -lclblast
155
+ export GGML_OPENBLAS=1
156
+ endif
157
+
158
+ # glibc-static or glibc-devel-static required
159
+ ifeq ($(STATIC),true)
160
+ LD_FLAGS+=-linkmode external -extldflags -static
161
+ endif
162
+
163
+ ifeq ($(findstring stablediffusion,$(GO_TAGS)),stablediffusion)
164
+ # OPTIONAL_TARGETS+=go-stable-diffusion/libstablediffusion.a
165
+ OPTIONAL_GRPC+=backend-assets/grpc/stablediffusion
166
+ endif
167
+
168
+ ifeq ($(findstring tinydream,$(GO_TAGS)),tinydream)
169
+ # OPTIONAL_TARGETS+=go-tiny-dream/libtinydream.a
170
+ OPTIONAL_GRPC+=backend-assets/grpc/tinydream
171
+ endif
172
+
173
+ ifeq ($(findstring tts,$(GO_TAGS)),tts)
174
+ # OPTIONAL_TARGETS+=go-piper/libpiper_binding.a
175
+ # OPTIONAL_TARGETS+=backend-assets/espeak-ng-data
176
+ PIPER_CGO_CXXFLAGS+=-I$(CURDIR)/sources/go-piper/piper/src/cpp -I$(CURDIR)/sources/go-piper/piper/build/fi/include -I$(CURDIR)/sources/go-piper/piper/build/pi/include -I$(CURDIR)/sources/go-piper/piper/build/si/include
177
+ PIPER_CGO_LDFLAGS+=-L$(CURDIR)/sources/go-piper/piper/build/fi/lib -L$(CURDIR)/sources/go-piper/piper/build/pi/lib -L$(CURDIR)/sources/go-piper/piper/build/si/lib -lfmt -lspdlog -lucd
178
+ OPTIONAL_GRPC+=backend-assets/grpc/piper
179
+ endif
180
+
181
+ ALL_GRPC_BACKENDS=backend-assets/grpc/huggingface
182
+ ALL_GRPC_BACKENDS+=backend-assets/grpc/bert-embeddings
183
+ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
184
+ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
185
+ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
186
+ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
187
+ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
188
+ ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
189
+ ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv
190
+ ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
191
+ ALL_GRPC_BACKENDS+=backend-assets/grpc/local-store
192
+ ALL_GRPC_BACKENDS+=$(OPTIONAL_GRPC)
193
+ # Use filter-out to remove the specified backends
194
+ ALL_GRPC_BACKENDS := $(filter-out $(SKIP_GRPC_BACKEND),$(ALL_GRPC_BACKENDS))
195
+
196
+ GRPC_BACKENDS?=$(ALL_GRPC_BACKENDS) $(OPTIONAL_GRPC)
197
+ TEST_PATHS?=./api/... ./pkg/... ./core/...
198
+
199
+ # If empty, then we build all
200
+ ifeq ($(GRPC_BACKENDS),)
201
+ GRPC_BACKENDS=$(ALL_GRPC_BACKENDS)
202
+ endif
203
+
204
+ ifeq ($(BUILD_API_ONLY),true)
205
+ GRPC_BACKENDS=
206
+ endif
207
+
208
+ .PHONY: all test build vendor get-sources prepare-sources prepare
209
+
210
+ all: help
211
+
212
+ ## BERT embeddings
213
+ sources/go-bert.cpp:
214
+ mkdir -p sources/go-bert.cpp
215
+ cd sources/go-bert.cpp && \
216
+ git init && \
217
+ git remote add origin $(BERT_REPO) && \
218
+ git fetch origin && \
219
+ git checkout $(BERT_VERSION) && \
220
+ git submodule update --init --recursive --depth 1 --single-branch
221
+
222
+ sources/go-bert.cpp/libgobert.a: sources/go-bert.cpp
223
+ $(MAKE) -C sources/go-bert.cpp libgobert.a
224
+
225
+ ## go-llama.cpp
226
+ sources/go-llama.cpp:
227
+ mkdir -p sources/go-llama.cpp
228
+ cd sources/go-llama.cpp && \
229
+ git init && \
230
+ git remote add origin $(GOLLAMA_REPO) && \
231
+ git fetch origin && \
232
+ git checkout $(GOLLAMA_VERSION) && \
233
+ git submodule update --init --recursive --depth 1 --single-branch
234
+
235
+ sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
236
+ $(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
237
+
238
+ ## go-piper
239
+ sources/go-piper:
240
+ mkdir -p sources/go-piper
241
+ cd sources/go-piper && \
242
+ git init && \
243
+ git remote add origin $(PIPER_REPO) && \
244
+ git fetch origin && \
245
+ git checkout $(PIPER_VERSION) && \
246
+ git submodule update --init --recursive --depth 1 --single-branch
247
+
248
+ sources/go-piper/libpiper_binding.a: sources/go-piper
249
+ $(MAKE) -C sources/go-piper libpiper_binding.a example/main piper.o
250
+
251
+
252
+ ## RWKV
253
+ sources/go-rwkv.cpp:
254
+ mkdir -p sources/go-rwkv.cpp
255
+ cd sources/go-rwkv.cpp && \
256
+ git init && \
257
+ git remote add origin $(RWKV_REPO) && \
258
+ git fetch origin && \
259
+ git checkout $(RWKV_VERSION) && \
260
+ git submodule update --init --recursive --depth 1 --single-branch
261
+
262
+ sources/go-rwkv.cpp/librwkv.a: sources/go-rwkv.cpp
263
+ cd sources/go-rwkv.cpp && cd rwkv.cpp && cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF && cmake --build . && cp librwkv.a ..
264
+
265
+ ## stable diffusion
266
+ sources/go-stable-diffusion:
267
+ mkdir -p sources/go-stable-diffusion
268
+ cd sources/go-stable-diffusion && \
269
+ git init && \
270
+ git remote add origin $(STABLEDIFFUSION_REPO) && \
271
+ git fetch origin && \
272
+ git checkout $(STABLEDIFFUSION_VERSION) && \
273
+ git submodule update --init --recursive --depth 1 --single-branch
274
+
275
+ sources/go-stable-diffusion/libstablediffusion.a: sources/go-stable-diffusion
276
+ CPATH="$(CPATH):/usr/include/opencv4" $(MAKE) -C sources/go-stable-diffusion libstablediffusion.a
277
+
278
+ ## tiny-dream
279
+ sources/go-tiny-dream:
280
+ mkdir -p sources/go-tiny-dream
281
+ cd sources/go-tiny-dream && \
282
+ git init && \
283
+ git remote add origin $(TINYDREAM_REPO) && \
284
+ git fetch origin && \
285
+ git checkout $(TINYDREAM_VERSION) && \
286
+ git submodule update --init --recursive --depth 1 --single-branch
287
+
288
+ sources/go-tiny-dream/libtinydream.a: sources/go-tiny-dream
289
+ $(MAKE) -C sources/go-tiny-dream libtinydream.a
290
+
291
+ ## whisper
292
+ sources/whisper.cpp:
293
+ mkdir -p sources/whisper.cpp
294
+ cd sources/whisper.cpp && \
295
+ git init && \
296
+ git remote add origin $(WHISPER_REPO) && \
297
+ git fetch origin && \
298
+ git checkout $(WHISPER_CPP_VERSION) && \
299
+ git submodule update --init --recursive --depth 1 --single-branch
300
+
301
+ sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
302
+ cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
303
+
304
+ get-sources: sources/go-llama.cpp sources/go-piper sources/go-rwkv.cpp sources/whisper.cpp sources/go-bert.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp
305
+
306
+ replace:
307
+ $(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv.cpp
308
+ $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
309
+ $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
310
+ $(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert.cpp
311
+ $(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(CURDIR)/sources/go-tiny-dream
312
+ $(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
313
+ $(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
314
+ $(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
315
+
316
+ dropreplace:
317
+ $(GOCMD) mod edit -dropreplace github.com/donomii/go-rwkv.cpp
318
+ $(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
319
+ $(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
320
+ $(GOCMD) mod edit -dropreplace github.com/go-skynet/go-bert.cpp
321
+ $(GOCMD) mod edit -dropreplace github.com/M0Rf30/go-tiny-dream
322
+ $(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
323
+ $(GOCMD) mod edit -dropreplace github.com/mudler/go-stable-diffusion
324
+ $(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
325
+
326
+ prepare-sources: get-sources replace
327
+ $(GOCMD) mod download
328
+
329
+ ## GENERIC
330
+ rebuild: ## Rebuilds the project
331
+ $(GOCMD) clean -cache
332
+ $(MAKE) -C sources/go-llama.cpp clean
333
+ $(MAKE) -C sources/go-rwkv.cpp clean
334
+ $(MAKE) -C sources/whisper.cpp clean
335
+ $(MAKE) -C sources/go-stable-diffusion clean
336
+ $(MAKE) -C sources/go-bert.cpp clean
337
+ $(MAKE) -C sources/go-piper clean
338
+ $(MAKE) -C sources/go-tiny-dream clean
339
+ $(MAKE) build
340
+
341
+ prepare: prepare-sources $(OPTIONAL_TARGETS)
342
+
343
+ clean: ## Remove build related files
344
+ $(GOCMD) clean -cache
345
+ rm -f prepare
346
+ rm -rf ./sources
347
+ rm -rf $(BINARY_NAME)
348
+ rm -rf release/
349
+ rm -rf backend-assets/*
350
+ $(MAKE) -C backend/cpp/grpc clean
351
+ $(MAKE) -C backend/cpp/llama clean
352
+ rm -rf backend/cpp/llama-* || true
353
+ $(MAKE) dropreplace
354
+ $(MAKE) protogen-clean
355
+ rmdir pkg/grpc/proto || true
356
+
357
+ clean-tests:
358
+ rm -rf test-models
359
+ rm -rf test-dir
360
+ rm -rf core/http/backend-assets
361
+
362
+ clean-dc: clean
363
+ cp -r /build/backend-assets /workspace/backend-assets
364
+
365
+ ## Build:
366
+ build: prepare backend-assets grpcs ## Build the project
367
+ $(info ${GREEN}I local-ai build info:${RESET})
368
+ $(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
369
+ $(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET})
370
+ $(info ${GREEN}I LD_FLAGS: ${YELLOW}$(LD_FLAGS)${RESET})
371
+ $(info ${GREEN}I UPX: ${YELLOW}$(UPX)${RESET})
372
+ ifneq ($(BACKEND_LIBS),)
373
+ $(MAKE) backend-assets/lib
374
+ cp -f $(BACKEND_LIBS) backend-assets/lib/
375
+ endif
376
+ CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./
377
+
378
+ build-minimal:
379
+ BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=p2p $(MAKE) build
380
+
381
+ build-api:
382
+ BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=p2p $(MAKE) build
383
+
384
+ backend-assets/lib:
385
+ mkdir -p backend-assets/lib
386
+
387
+ dist:
388
+ $(MAKE) backend-assets/grpc/llama-cpp-avx2
389
+ ifeq ($(DETECT_LIBS),true)
390
+ scripts/prepare-libs.sh backend-assets/grpc/llama-cpp-avx2
391
+ endif
392
+ ifeq ($(OS),Darwin)
393
+ BUILD_TYPE=none $(MAKE) backend-assets/grpc/llama-cpp-fallback
394
+ else
395
+ $(MAKE) backend-assets/grpc/llama-cpp-cuda
396
+ $(MAKE) backend-assets/grpc/llama-cpp-hipblas
397
+ $(MAKE) backend-assets/grpc/llama-cpp-sycl_f16
398
+ $(MAKE) backend-assets/grpc/llama-cpp-sycl_f32
399
+ endif
400
+ GO_TAGS="tts p2p" $(MAKE) build
401
+ ifeq ($(DETECT_LIBS),true)
402
+ scripts/prepare-libs.sh backend-assets/grpc/piper
403
+ endif
404
+ GO_TAGS="tts p2p" STATIC=true $(MAKE) build
405
+ mkdir -p release
406
+ # if BUILD_ID is empty, then we don't append it to the binary name
407
+ ifeq ($(BUILD_ID),)
408
+ cp $(BINARY_NAME) release/$(BINARY_NAME)-$(OS)-$(ARCH)
409
+ shasum -a 256 release/$(BINARY_NAME)-$(OS)-$(ARCH) > release/$(BINARY_NAME)-$(OS)-$(ARCH).sha256
410
+ else
411
+ cp $(BINARY_NAME) release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-$(ARCH)
412
+ shasum -a 256 release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-$(ARCH) > release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-$(ARCH).sha256
413
+ endif
414
+
415
+ dist-cross-linux-arm64:
416
+ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_NATIVE=off" GRPC_BACKENDS="backend-assets/grpc/llama-cpp-fallback backend-assets/grpc/llama-cpp-grpc backend-assets/util/llama-cpp-rpc-server" GO_TAGS="p2p" \
417
+ STATIC=true $(MAKE) build
418
+ mkdir -p release
419
+ # if BUILD_ID is empty, then we don't append it to the binary name
420
+ ifeq ($(BUILD_ID),)
421
+ cp $(BINARY_NAME) release/$(BINARY_NAME)-$(OS)-arm64
422
+ shasum -a 256 release/$(BINARY_NAME)-$(OS)-arm64 > release/$(BINARY_NAME)-$(OS)-arm64.sha256
423
+ else
424
+ cp $(BINARY_NAME) release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-arm64
425
+ shasum -a 256 release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-arm64 > release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-arm64.sha256
426
+ endif
427
+
428
+ osx-signed: build
429
+ codesign --deep --force --sign "$(OSX_SIGNING_IDENTITY)" --entitlements "./Entitlements.plist" "./$(BINARY_NAME)"
430
+
431
+ ## Run
432
+ run: prepare ## run local-ai
433
+ CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) run ./
434
+
435
+ test-models/testmodel.ggml:
436
+ mkdir test-models
437
+ mkdir test-dir
438
+ wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
439
+ wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
440
+ wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
441
+ wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
442
+ wget -q https://huggingface.co/mudler/rwkv-4-raven-1.5B-ggml/resolve/main/RWKV-4-Raven-1B5-v11-Eng99%2525-Other1%2525-20230425-ctx4096_Q4_0.bin -O test-models/rwkv
443
+ wget -q https://raw.githubusercontent.com/saharNooby/rwkv.cpp/5eb8f09c146ea8124633ab041d9ea0b1f1db4459/rwkv/20B_tokenizer.json -O test-models/rwkv.tokenizer.json
444
+ cp tests/models_fixtures/* test-models
445
+
446
+ prepare-test: grpcs
447
+ cp -rf backend-assets core/http
448
+ cp tests/models_fixtures/* test-models
449
+
450
+ test: prepare test-models/testmodel.ggml grpcs
451
+ @echo 'Running tests'
452
+ export GO_TAGS="tts stablediffusion debug"
453
+ $(MAKE) prepare-test
454
+ HUGGINGFACE_GRPC=$(abspath ./)/backend/python/sentencetransformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
455
+ $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
456
+ $(MAKE) test-llama
457
+ $(MAKE) test-llama-gguf
458
+ $(MAKE) test-tts
459
+ $(MAKE) test-stablediffusion
460
+
461
+ prepare-e2e:
462
+ mkdir -p $(TEST_DIR)
463
+ cp -rfv $(abspath ./tests/e2e-fixtures)/gpu.yaml $(TEST_DIR)/gpu.yaml
464
+ test -e $(TEST_DIR)/ggllm-test-model.bin || wget -q https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct.Q2_K.gguf -O $(TEST_DIR)/ggllm-test-model.bin
465
+ docker build --build-arg GRPC_BACKENDS="$(GRPC_BACKENDS)" --build-arg IMAGE_TYPE=core --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg CUDA_MAJOR_VERSION=12 --build-arg CUDA_MINOR_VERSION=0 --build-arg FFMPEG=true -t localai-tests .
466
+
467
+ run-e2e-image:
468
+ ls -liah $(abspath ./tests/e2e-fixtures)
469
+ docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --gpus all --name e2e-tests-$(RANDOM) localai-tests
470
+
471
+ run-e2e-aio: protogen-go
472
+ @echo 'Running e2e AIO tests'
473
+ $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e-aio
474
+
475
+ test-e2e:
476
+ @echo 'Running e2e tests'
477
+ BUILD_TYPE=$(BUILD_TYPE) \
478
+ LOCALAI_API=http://$(E2E_BRIDGE_IP):5390/v1 \
479
+ $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e
480
+
481
+ teardown-e2e:
482
+ rm -rf $(TEST_DIR) || true
483
+ docker stop $$(docker ps -q --filter ancestor=localai-tests)
484
+
485
+ test-llama: prepare-test
486
+ TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
487
+ $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
488
+
489
+ test-llama-gguf: prepare-test
490
+ TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
491
+ $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
492
+
493
+ test-tts: prepare-test
494
+ TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
495
+ $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
496
+
497
+ test-stablediffusion: prepare-test
498
+ TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
499
+ $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
500
+
501
+ test-stores: backend-assets/grpc/local-store
502
+ mkdir -p tests/integration/backend-assets/grpc
503
+ cp -f backend-assets/grpc/local-store tests/integration/backend-assets/grpc/
504
+ $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts $(TEST_FLAKES) -v -r tests/integration
505
+
506
+ test-container:
507
+ docker build --target requirements -t local-ai-test-container .
508
+ docker run -ti --rm --entrypoint /bin/bash -v $(abspath ./):/build local-ai-test-container
509
+
510
+ ## Help:
511
+ help: ## Show this help.
512
+ @echo ''
513
+ @echo 'Usage:'
514
+ @echo ' ${YELLOW}make${RESET} ${GREEN}<target>${RESET}'
515
+ @echo ''
516
+ @echo 'Targets:'
517
+ @awk 'BEGIN {FS = ":.*?## "} { \
518
+ if (/^[a-zA-Z_-]+:.*?##.*$$/) {printf " ${YELLOW}%-20s${GREEN}%s${RESET}\n", $$1, $$2} \
519
+ else if (/^## .*$$/) {printf " ${CYAN}%s${RESET}\n", substr($$1,4)} \
520
+ }' $(MAKEFILE_LIST)
521
+
522
+ .PHONY: protogen
523
+ protogen: protogen-go protogen-python
524
+
525
+ .PHONY: protogen-clean
526
+ protogen-clean: protogen-go-clean protogen-python-clean
527
+
528
+ .PHONY: protogen-go
529
+ protogen-go:
530
+ mkdir -p pkg/grpc/proto
531
+ protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
532
+ backend/backend.proto
533
+
534
+ .PHONY: protogen-go-clean
535
+ protogen-go-clean:
536
+ $(RM) pkg/grpc/proto/backend.pb.go pkg/grpc/proto/backend_grpc.pb.go
537
+ $(RM) bin/*
538
+
539
+ .PHONY: protogen-python
540
+ protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen
541
+
542
+ .PHONY: protogen-python-clean
543
+ protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean
544
+
545
+ .PHONY: autogptq-protogen
546
+ autogptq-protogen:
547
+ $(MAKE) -C backend/python/autogptq protogen
548
+
549
+ .PHONY: autogptq-protogen-clean
550
+ autogptq-protogen-clean:
551
+ $(MAKE) -C backend/python/autogptq protogen-clean
552
+
553
+ .PHONY: bark-protogen
554
+ bark-protogen:
555
+ $(MAKE) -C backend/python/bark protogen
556
+
557
+ .PHONY: bark-protogen-clean
558
+ bark-protogen-clean:
559
+ $(MAKE) -C backend/python/bark protogen-clean
560
+
561
+ .PHONY: coqui-protogen
562
+ coqui-protogen:
563
+ $(MAKE) -C backend/python/coqui protogen
564
+
565
+ .PHONY: coqui-protogen-clean
566
+ coqui-protogen-clean:
567
+ $(MAKE) -C backend/python/coqui protogen-clean
568
+
569
+ .PHONY: diffusers-protogen
570
+ diffusers-protogen:
571
+ $(MAKE) -C backend/python/diffusers protogen
572
+
573
+ .PHONY: diffusers-protogen-clean
574
+ diffusers-protogen-clean:
575
+ $(MAKE) -C backend/python/diffusers protogen-clean
576
+
577
+ .PHONY: exllama2-protogen
578
+ exllama2-protogen:
579
+ $(MAKE) -C backend/python/exllama2 protogen
580
+
581
+ .PHONY: exllama2-protogen-clean
582
+ exllama2-protogen-clean:
583
+ $(MAKE) -C backend/python/exllama2 protogen-clean
584
+
585
+ .PHONY: mamba-protogen
586
+ mamba-protogen:
587
+ $(MAKE) -C backend/python/mamba protogen
588
+
589
+ .PHONY: mamba-protogen-clean
590
+ mamba-protogen-clean:
591
+ $(MAKE) -C backend/python/mamba protogen-clean
592
+
593
+ .PHONY: rerankers-protogen
594
+ rerankers-protogen:
595
+ $(MAKE) -C backend/python/rerankers protogen
596
+
597
+ .PHONY: rerankers-protogen-clean
598
+ rerankers-protogen-clean:
599
+ $(MAKE) -C backend/python/rerankers protogen-clean
600
+
601
+ .PHONY: sentencetransformers-protogen
602
+ sentencetransformers-protogen:
603
+ $(MAKE) -C backend/python/sentencetransformers protogen
604
+
605
+ .PHONY: sentencetransformers-protogen-clean
606
+ sentencetransformers-protogen-clean:
607
+ $(MAKE) -C backend/python/sentencetransformers protogen-clean
608
+
609
+ .PHONY: transformers-protogen
610
+ transformers-protogen:
611
+ $(MAKE) -C backend/python/transformers protogen
612
+
613
+ .PHONY: transformers-protogen-clean
614
+ transformers-protogen-clean:
615
+ $(MAKE) -C backend/python/transformers protogen-clean
616
+
617
+ .PHONY: parler-tts-protogen
618
+ parler-tts-protogen:
619
+ $(MAKE) -C backend/python/parler-tts protogen
620
+
621
+ .PHONY: parler-tts-protogen-clean
622
+ parler-tts-protogen-clean:
623
+ $(MAKE) -C backend/python/parler-tts protogen-clean
624
+
625
+ .PHONY: transformers-musicgen-protogen
626
+ transformers-musicgen-protogen:
627
+ $(MAKE) -C backend/python/transformers-musicgen protogen
628
+
629
+ .PHONY: transformers-musicgen-protogen-clean
630
+ transformers-musicgen-protogen-clean:
631
+ $(MAKE) -C backend/python/transformers-musicgen protogen-clean
632
+
633
+ .PHONY: vall-e-x-protogen
634
+ vall-e-x-protogen:
635
+ $(MAKE) -C backend/python/vall-e-x protogen
636
+
637
+ .PHONY: vall-e-x-protogen-clean
638
+ vall-e-x-protogen-clean:
639
+ $(MAKE) -C backend/python/vall-e-x protogen-clean
640
+
641
+ .PHONY: openvoice-protogen
642
+ openvoice-protogen:
643
+ $(MAKE) -C backend/python/openvoice protogen
644
+
645
+ .PHONY: openvoice-protogen-clean
646
+ openvoice-protogen-clean:
647
+ $(MAKE) -C backend/python/openvoice protogen-clean
648
+
649
+ .PHONY: vllm-protogen
650
+ vllm-protogen:
651
+ $(MAKE) -C backend/python/vllm protogen
652
+
653
+ .PHONY: vllm-protogen-clean
654
+ vllm-protogen-clean:
655
+ $(MAKE) -C backend/python/vllm protogen-clean
656
+
657
+ ## GRPC
658
+ # Note: it is duplicated in the Dockerfile
659
+ prepare-extra-conda-environments: protogen-python
660
+ $(MAKE) -C backend/python/autogptq
661
+ $(MAKE) -C backend/python/bark
662
+ $(MAKE) -C backend/python/coqui
663
+ $(MAKE) -C backend/python/diffusers
664
+ $(MAKE) -C backend/python/vllm
665
+ $(MAKE) -C backend/python/mamba
666
+ $(MAKE) -C backend/python/sentencetransformers
667
+ $(MAKE) -C backend/python/rerankers
668
+ $(MAKE) -C backend/python/transformers
669
+ $(MAKE) -C backend/python/transformers-musicgen
670
+ $(MAKE) -C backend/python/parler-tts
671
+ $(MAKE) -C backend/python/vall-e-x
672
+ $(MAKE) -C backend/python/openvoice
673
+ $(MAKE) -C backend/python/exllama2
674
+
675
+ prepare-test-extra: protogen-python
676
+ $(MAKE) -C backend/python/transformers
677
+ $(MAKE) -C backend/python/diffusers
678
+
679
+ test-extra: prepare-test-extra
680
+ $(MAKE) -C backend/python/transformers test
681
+ $(MAKE) -C backend/python/diffusers test
682
+
683
+ backend-assets:
684
+ mkdir -p backend-assets
685
+ ifeq ($(BUILD_API_ONLY),true)
686
+ touch backend-assets/keep
687
+ endif
688
+
689
+ backend-assets/espeak-ng-data: sources/go-piper sources/go-piper/libpiper_binding.a
690
+ mkdir -p backend-assets/espeak-ng-data
691
+ @cp -rf sources/go-piper/piper-phonemize/pi/share/espeak-ng-data/. backend-assets/espeak-ng-data
692
+
693
+ backend-assets/grpc: protogen-go replace
694
+ mkdir -p backend-assets/grpc
695
+
696
+ backend-assets/grpc/bert-embeddings: sources/go-bert.cpp sources/go-bert.cpp/libgobert.a backend-assets/grpc
697
+ CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert.cpp LIBRARY_PATH=$(CURDIR)/sources/go-bert.cpp \
698
+ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./backend/go/llm/bert/
699
+ ifneq ($(UPX),)
700
+ $(UPX) backend-assets/grpc/bert-embeddings
701
+ endif
702
+
703
+ backend-assets/grpc/huggingface: backend-assets/grpc
704
+ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/huggingface ./backend/go/llm/langchain/
705
+ ifneq ($(UPX),)
706
+ $(UPX) backend-assets/grpc/huggingface
707
+ endif
708
+
709
+ backend/cpp/llama/llama.cpp:
710
+ LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama llama.cpp
711
+
712
+ INSTALLED_PACKAGES=$(CURDIR)/backend/cpp/grpc/installed_packages
713
+ INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
714
+ ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
715
+ -DProtobuf_DIR=${INSTALLED_LIB_CMAKE}/protobuf \
716
+ -Dutf8_range_DIR=${INSTALLED_LIB_CMAKE}/utf8_range \
717
+ -DgRPC_DIR=${INSTALLED_LIB_CMAKE}/grpc \
718
+ -DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=${INSTALLED_PACKAGES}/include
719
+ build-llama-cpp-grpc-server:
720
+ # Conditionally build grpc for the llama backend to use if needed
721
+ ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
722
+ $(MAKE) -C backend/cpp/grpc build
723
+ _PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/protoc \
724
+ _GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin \
725
+ PATH="${INSTALLED_PACKAGES}/bin:${PATH}" \
726
+ CMAKE_ARGS="${CMAKE_ARGS} ${ADDED_CMAKE_ARGS}" \
727
+ LLAMA_VERSION=$(CPPLLAMA_VERSION) \
728
+ $(MAKE) -C backend/cpp/${VARIANT} grpc-server
729
+ else
730
+ echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
731
+ LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/${VARIANT} grpc-server
732
+ endif
733
+
734
+ # This target is for manually building a variant with auto-detected flags
735
+ backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/llama.cpp
736
+ cp -rf backend/cpp/llama backend/cpp/llama-cpp
737
+ $(MAKE) -C backend/cpp/llama-cpp purge
738
+ $(info ${GREEN}I llama-cpp build info:autodetected${RESET})
739
+ $(MAKE) VARIANT="llama-cpp" build-llama-cpp-grpc-server
740
+ cp -rfv backend/cpp/llama-cpp/grpc-server backend-assets/grpc/llama-cpp
741
+
742
+ backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc backend/cpp/llama/llama.cpp
743
+ cp -rf backend/cpp/llama backend/cpp/llama-avx2
744
+ $(MAKE) -C backend/cpp/llama-avx2 purge
745
+ $(info ${GREEN}I llama-cpp build info:avx2${RESET})
746
+ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
747
+ cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2
748
+
749
+ backend-assets/grpc/llama-cpp-avx: backend-assets/grpc backend/cpp/llama/llama.cpp
750
+ cp -rf backend/cpp/llama backend/cpp/llama-avx
751
+ $(MAKE) -C backend/cpp/llama-avx purge
752
+ $(info ${GREEN}I llama-cpp build info:avx${RESET})
753
+ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-avx" build-llama-cpp-grpc-server
754
+ cp -rfv backend/cpp/llama-avx/grpc-server backend-assets/grpc/llama-cpp-avx
755
+
756
+ backend-assets/grpc/llama-cpp-fallback: backend-assets/grpc backend/cpp/llama/llama.cpp
757
+ cp -rf backend/cpp/llama backend/cpp/llama-fallback
758
+ $(MAKE) -C backend/cpp/llama-fallback purge
759
+ $(info ${GREEN}I llama-cpp build info:fallback${RESET})
760
+ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-fallback" build-llama-cpp-grpc-server
761
+ cp -rfv backend/cpp/llama-fallback/grpc-server backend-assets/grpc/llama-cpp-fallback
762
+ # TODO: every binary should have its own folder instead, so each can ship a different metal implementation
763
+ ifeq ($(BUILD_TYPE),metal)
764
+ cp backend/cpp/llama-fallback/llama.cpp/build/bin/default.metallib backend-assets/grpc/
765
+ endif
766
+
767
+ backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc backend/cpp/llama/llama.cpp
768
+ cp -rf backend/cpp/llama backend/cpp/llama-cuda
769
+ $(MAKE) -C backend/cpp/llama-cuda purge
770
+ $(info ${GREEN}I llama-cpp build info:cuda${RESET})
771
+ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server
772
+ cp -rfv backend/cpp/llama-cuda/grpc-server backend-assets/grpc/llama-cpp-cuda
773
+
774
+ backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc backend/cpp/llama/llama.cpp
775
+ cp -rf backend/cpp/llama backend/cpp/llama-hipblas
776
+ $(MAKE) -C backend/cpp/llama-hipblas purge
777
+ $(info ${GREEN}I llama-cpp build info:hipblas${RESET})
778
+ BUILD_TYPE="hipblas" $(MAKE) VARIANT="llama-hipblas" build-llama-cpp-grpc-server
779
+ cp -rfv backend/cpp/llama-hipblas/grpc-server backend-assets/grpc/llama-cpp-hipblas
780
+
781
+ backend-assets/grpc/llama-cpp-sycl_f16: backend-assets/grpc backend/cpp/llama/llama.cpp
782
+ cp -rf backend/cpp/llama backend/cpp/llama-sycl_f16
783
+ $(MAKE) -C backend/cpp/llama-sycl_f16 purge
784
+ $(info ${GREEN}I llama-cpp build info:sycl_f16${RESET})
785
+ BUILD_TYPE="sycl_f16" $(MAKE) VARIANT="llama-sycl_f16" build-llama-cpp-grpc-server
786
+ cp -rfv backend/cpp/llama-sycl_f16/grpc-server backend-assets/grpc/llama-cpp-sycl_f16
787
+
788
+ backend-assets/grpc/llama-cpp-sycl_f32: backend-assets/grpc backend/cpp/llama/llama.cpp
789
+ cp -rf backend/cpp/llama backend/cpp/llama-sycl_f32
790
+ $(MAKE) -C backend/cpp/llama-sycl_f32 purge
791
+ $(info ${GREEN}I llama-cpp build info:sycl_f32${RESET})
792
+ BUILD_TYPE="sycl_f32" $(MAKE) VARIANT="llama-sycl_f32" build-llama-cpp-grpc-server
793
+ cp -rfv backend/cpp/llama-sycl_f32/grpc-server backend-assets/grpc/llama-cpp-sycl_f32
794
+
795
+ backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc backend/cpp/llama/llama.cpp
796
+ cp -rf backend/cpp/llama backend/cpp/llama-grpc
797
+ $(MAKE) -C backend/cpp/llama-grpc purge
798
+ $(info ${GREEN}I llama-cpp build info:grpc${RESET})
799
+ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server
800
+ cp -rfv backend/cpp/llama-grpc/grpc-server backend-assets/grpc/llama-cpp-grpc
801
+
802
+ backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
803
+ mkdir -p backend-assets/util/
804
+ cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
805
+
806
+ backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
807
+ CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
808
+ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
809
+ ifneq ($(UPX),)
810
+ $(UPX) backend-assets/grpc/llama-ggml
811
+ endif
812
+
813
+ backend-assets/grpc/piper: sources/go-piper sources/go-piper/libpiper_binding.a backend-assets/grpc backend-assets/espeak-ng-data
814
+ CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-piper \
815
+ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./backend/go/tts/
816
+ ifneq ($(UPX),)
817
+ $(UPX) backend-assets/grpc/piper
818
+ endif
819
+
820
+ backend-assets/grpc/rwkv: sources/go-rwkv.cpp sources/go-rwkv.cpp/librwkv.a backend-assets/grpc
821
+ CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv.cpp LIBRARY_PATH=$(CURDIR)/sources/go-rwkv.cpp \
822
+ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv
823
+ ifneq ($(UPX),)
824
+ $(UPX) backend-assets/grpc/rwkv
825
+ endif
826
+
827
+ backend-assets/grpc/stablediffusion: sources/go-stable-diffusion sources/go-stable-diffusion/libstablediffusion.a backend-assets/grpc
828
+ CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/go-stable-diffusion/:/usr/include/opencv4" LIBRARY_PATH=$(CURDIR)/sources/go-stable-diffusion/ \
829
+ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/stablediffusion
830
+ ifneq ($(UPX),)
831
+ $(UPX) backend-assets/grpc/stablediffusion
832
+ endif
833
+
834
+ backend-assets/grpc/tinydream: sources/go-tiny-dream sources/go-tiny-dream/libtinydream.a backend-assets/grpc
835
+ CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/go-tiny-dream \
836
+ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/tinydream ./backend/go/image/tinydream
837
+ ifneq ($(UPX),)
838
+ $(UPX) backend-assets/grpc/tinydream
839
+ endif
840
+
841
+ backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
842
+ CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="$(CURDIR)/sources/whisper.cpp/include:$(CURDIR)/sources/whisper.cpp/ggml/include" LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
843
+ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/whisper
844
+ ifneq ($(UPX),)
845
+ $(UPX) backend-assets/grpc/whisper
846
+ endif
847
+
848
+ backend-assets/grpc/local-store: backend-assets/grpc
849
+ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/local-store ./backend/go/stores/
850
+ ifneq ($(UPX),)
851
+ $(UPX) backend-assets/grpc/local-store
852
+ endif
853
+
854
+ grpcs: prepare $(GRPC_BACKENDS)
855
+
856
+ DOCKER_IMAGE?=local-ai
857
+ DOCKER_AIO_IMAGE?=local-ai-aio
858
+ IMAGE_TYPE?=core
859
+ BASE_IMAGE?=ubuntu:22.04
860
+
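+ # The image variables above use ?= and can be overridden at invocation time.
+ # Illustrative example (the image name is hypothetical): make docker DOCKER_IMAGE=my-local-ai BASE_IMAGE=ubuntu:22.04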
861
+ docker:
862
+ docker build \
863
+ --build-arg BASE_IMAGE=$(BASE_IMAGE) \
864
+ --build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
865
+ --build-arg GO_TAGS="$(GO_TAGS)" \
866
+ --build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
867
+ --build-arg BUILD_TYPE=$(BUILD_TYPE) \
868
+ -t $(DOCKER_IMAGE) .
869
+
870
+ docker-cuda11:
871
+ docker build \
872
+ --build-arg CUDA_MAJOR_VERSION=11 \
873
+ --build-arg CUDA_MINOR_VERSION=8 \
874
+ --build-arg BASE_IMAGE=$(BASE_IMAGE) \
875
+ --build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
876
+ --build-arg GO_TAGS="$(GO_TAGS)" \
877
+ --build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
878
+ --build-arg BUILD_TYPE=$(BUILD_TYPE) \
879
+ -t $(DOCKER_IMAGE)-cuda11 .
880
+
881
+ docker-aio:
882
+ @echo "Building AIO image with base $(BASE_IMAGE) as $(DOCKER_AIO_IMAGE)"
883
+ docker build \
884
+ --build-arg BASE_IMAGE=$(BASE_IMAGE) \
885
+ --build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
886
+ -t $(DOCKER_AIO_IMAGE) -f Dockerfile.aio .
887
+
888
+ docker-aio-all:
889
+ $(MAKE) docker-aio DOCKER_AIO_SIZE=cpu
890
+ $(MAKE) docker-aio DOCKER_AIO_SIZE=cpu
891
+
892
+ docker-image-intel:
893
+ docker build \
894
+ --build-arg BASE_IMAGE=intel/oneapi-basekit:2024.2.0-devel-ubuntu22.04 \
895
+ --build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
896
+ --build-arg GO_TAGS="none" \
897
+ --build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
898
+ --build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) .
899
+
900
+ docker-image-intel-xpu:
901
+ docker build \
902
+ --build-arg BASE_IMAGE=intel/oneapi-basekit:2024.2.0-devel-ubuntu22.04 \
903
+ --build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
904
+ --build-arg GO_TAGS="none" \
905
+ --build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
906
+ --build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) .
907
+
908
+ .PHONY: swagger
909
+ swagger:
910
+ swag init -g core/http/app.go --output swagger
911
+
912
+ .PHONY: gen-assets
913
+ gen-assets:
914
+ $(GOCMD) run core/dependencies_manager/manager.go embedded/webui_static.yaml core/http/static/assets
915
+
916
+ ## Documentation
917
+ docs/layouts/_default:
918
+ mkdir -p docs/layouts/_default
919
+
920
+ docs/static/gallery.html: docs/layouts/_default
921
+ $(GOCMD) run ./.github/ci/modelslist.go ./gallery/index.yaml > docs/static/gallery.html
922
+
923
+ docs/public: docs/layouts/_default docs/static/gallery.html
924
+ cd docs && hugo --minify
925
+
926
+ docs-clean:
927
+ rm -rf docs/public
928
+ rm -rf docs/static/gallery.html
929
+
930
+ .PHONY: docs
931
+ docs: docs/static/gallery.html
932
+ cd docs && hugo serve
README.md CHANGED
@@ -1,11 +1,241 @@
1
- ---
2
- title: Chat
3
- emoji: 🐠
4
- colorFrom: pink
5
- colorTo: purple
6
- sdk: docker
7
- pinned: false
8
- license: apache-2.0
9
- ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <h1 align="center">
2
+ <br>
3
+ <img height="300" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"> <br>
4
+ LocalAI
5
+ <br>
6
+ </h1>
7
+
8
+ <p align="center">
9
+ <a href="https://github.com/go-skynet/LocalAI/fork" target="blank">
10
+ <img src="https://img.shields.io/github/forks/go-skynet/LocalAI?style=for-the-badge" alt="LocalAI forks"/>
11
+ </a>
12
+ <a href="https://github.com/go-skynet/LocalAI/stargazers" target="blank">
13
+ <img src="https://img.shields.io/github/stars/go-skynet/LocalAI?style=for-the-badge" alt="LocalAI stars"/>
14
+ </a>
15
+ <a href="https://github.com/go-skynet/LocalAI/pulls" target="blank">
16
+ <img src="https://img.shields.io/github/issues-pr/go-skynet/LocalAI?style=for-the-badge" alt="LocalAI pull-requests"/>
17
+ </a>
18
+ <a href='https://github.com/go-skynet/LocalAI/releases'>
19
+ <img src='https://img.shields.io/github/release/go-skynet/LocalAI?&label=Latest&style=for-the-badge'>
20
+ </a>
21
+ </p>
22
+
23
+ <p align="center">
24
+ <a href="https://hub.docker.com/r/localai/localai" target="blank">
25
+ <img src="https://img.shields.io/badge/dockerhub-images-important.svg?logo=Docker" alt="LocalAI Docker hub"/>
26
+ </a>
27
+ <a href="https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest" target="blank">
28
+ <img src="https://img.shields.io/badge/quay.io-images-important.svg?" alt="LocalAI Quay.io"/>
29
+ </a>
30
+ </p>
31
+
32
+ <p align="center">
33
+ <a href="https://twitter.com/LocalAI_API" target="blank">
34
+ <img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
35
+ </a>
36
+ <a href="https://discord.gg/uJAeKSAGDy" target="blank">
37
+ <img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
38
+ </a>
39
+ </p>
40
+
41
+ > :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
42
+ >
43
+ > [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples)
44
+
45
+ [![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)
46
+
47
+ **LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API compatible with OpenAI (Elevenlabs, Anthropic...) API specifications for local AI inferencing. It allows you to run LLMs, generate images and audio (and more) locally or on-prem with consumer-grade hardware, supporting multiple model families. It does not require a GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
48
+
49
+ ![screen](https://github.com/mudler/LocalAI/assets/2420543/20b5ccd2-8393-44f0-aaf6-87a23806381e)
50
+
51
+ Run the installer script:
52
+
53
+ ```bash
54
+ curl https://localai.io/install.sh | sh
55
+ ```
56
+
57
+ Or run with docker:
58
+ ```bash
59
+ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
60
+ # Alternative images:
61
+ # - if you have an Nvidia GPU:
62
+ # docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-12
63
+ # - without preconfigured models
64
+ # docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
65
+ # - without preconfigured models for Nvidia GPUs
66
+ # docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
67
+ ```
68
+
69
+ To load models:
70
+
71
+ ```bash
72
+ # From the model gallery (see available models with `local-ai models list`, in the WebUI from the model tab, or by visiting https://models.localai.io)
73
+ local-ai run llama-3.2-1b-instruct:q4_k_m
74
+ # Start LocalAI with the phi-2 model directly from huggingface
75
+ local-ai run huggingface://TheBloke/phi-2-GGUF/phi-2.Q8_0.gguf
76
+ # Install and run a model from the Ollama OCI registry
77
+ local-ai run ollama://gemma:2b
78
+ # Run a model from a configuration file
79
+ local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
80
+ # Install and run a model from a standard OCI registry (e.g., Docker Hub)
81
+ local-ai run oci://localai/phi-2:latest
82
+ ```
83
+
84
+ [💻 Getting started](https://localai.io/basics/getting_started/index.html)
85
+
86
+ ## 📰 Latest project news
87
+
88
+ - Oct 2024: examples moved to [LocalAI-examples](https://github.com/mudler/LocalAI-examples)
89
+ - Aug 2024: 🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io)
90
+ - July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723
91
+ - June 2024: 🆕 You can now browse the model gallery without LocalAI! Check out https://models.localai.io
92
+ - June 2024: Support for models from OCI registries: https://github.com/mudler/LocalAI/pull/2628
93
+ - May 2024: 🔥🔥 Decentralized P2P llama.cpp: https://github.com/mudler/LocalAI/pull/2343 (peer2peer llama.cpp!) 👉 Docs https://localai.io/features/distribute/
94
+ - May 2024: 🔥🔥 Openvoice: https://github.com/mudler/LocalAI/pull/2334
95
+ - May 2024: 🆕 Function calls without grammars and mixed mode: https://github.com/mudler/LocalAI/pull/2328
96
+ - May 2024: 🔥🔥 Distributed inferencing: https://github.com/mudler/LocalAI/pull/2324
97
+ - May 2024: Chat, TTS, and Image generation in the WebUI: https://github.com/mudler/LocalAI/pull/2222
98
+ - April 2024: Reranker API: https://github.com/mudler/LocalAI/pull/2121
99
+
100
+ Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
101
+
102
+ ## 🔥🔥 Hot topics (looking for help):
103
+
104
+ - Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
105
+ - Realtime API https://github.com/mudler/LocalAI/issues/3714
106
+ - 🔥🔥 Distributed, P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
107
+ - WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
108
+ - Backends v2: https://github.com/mudler/LocalAI/issues/1126
109
+ - Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
110
+ - Assistant API: https://github.com/mudler/LocalAI/issues/1273
111
+ - Moderation endpoint: https://github.com/mudler/LocalAI/issues/999
112
+ - Vulkan: https://github.com/mudler/LocalAI/issues/1647
113
+ - Anthropic API: https://github.com/mudler/LocalAI/issues/1808
114
+
115
+ If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22
116
+
117
+ ## 🚀 [Features](https://localai.io/features/)
118
+
119
+ - 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
120
+ - 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
121
+ - 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
122
+ - 🎨 [Image generation with stable diffusion](https://localai.io/features/image-generation)
123
+ - 🔥 [OpenAI-alike tools API](https://localai.io/features/openai-functions/)
124
+ - 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
125
+ - ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
126
+ - 🖼️ [Download Models directly from Huggingface ](https://localai.io/models/)
127
+ - 🥽 [Vision API](https://localai.io/features/gpt-vision/)
128
+ - 📈 [Reranker API](https://localai.io/features/reranker/)
129
+ - 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
130
+ - 🌍 Integrated WebUI!
131
+
132
+ ## 💻 Usage
133
+
134
+ Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.
135
+
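+ As a quick smoke test once a model is installed, you can call the OpenAI-compatible chat completions endpoint directly. This is a minimal sketch: it assumes LocalAI is listening on port 8080 and that the model is exposed as `gpt-4` (the name used by the AIO configurations in this repository); adjust the model name to match one of your installed models.
+ 
+ ```bash
+ curl http://localhost:8080/v1/chat/completions \
+   -H "Content-Type: application/json" \
+   -d '{
+     "model": "gpt-4",
+     "messages": [{"role": "user", "content": "Say hello in one sentence."}],
+     "temperature": 0.7
+   }'
+ ```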
136
+ ### 🔗 Community and integrations
137
+
138
+ Build and deploy custom containers:
139
+ - https://github.com/sozercan/aikit
140
+
141
+ WebUIs:
142
+ - https://github.com/Jirubizu/localai-admin
143
+ - https://github.com/go-skynet/LocalAI-frontend
144
+ - QA-Pilot (an interactive chat project that leverages LocalAI LLMs for rapid understanding and navigation of GitHub code repositories) https://github.com/reid41/QA-Pilot
145
+
146
+ Model galleries:
147
+ - https://github.com/go-skynet/model-gallery
148
+
149
+ Other:
150
+ - Helm chart https://github.com/go-skynet/helm-charts
151
+ - VSCode extension https://github.com/badgooooor/localai-vscode-plugin
152
+ - Terminal utility https://github.com/djcopley/ShellOracle
153
+ - Local Smart assistant https://github.com/mudler/LocalAGI
154
+ - Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation / https://github.com/valentinfrlch/ha-gpt4vision
155
+ - Discord bot https://github.com/mudler/LocalAGI/tree/main/examples/discord
156
+ - Slack bot https://github.com/mudler/LocalAGI/tree/main/examples/slack
157
+ - Shell-Pilot (interact with LLMs using LocalAI models via pure shell scripts on your Linux or macOS system) https://github.com/reid41/shell-pilot
158
+ - Telegram bot https://github.com/mudler/LocalAI/tree/master/examples/telegram-bot
159
+ - Github Actions: https://github.com/marketplace/actions/start-localai
160
+ - Examples: https://github.com/mudler/LocalAI/tree/master/examples/
161
+
162
+
163
+ ### 🔗 Resources
164
+
165
+ - [LLM finetuning guide](https://localai.io/docs/advanced/fine-tuning/)
166
+ - [How to build locally](https://localai.io/basics/build/index.html)
167
+ - [How to install in Kubernetes](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes)
168
+ - [Projects integrating LocalAI](https://localai.io/docs/integrations/)
169
+ - [How tos section](https://io.midori-ai.xyz/howtos/) (curated by our community)
170
+
171
+ ## :book: 🎥 [Media, Blogs, Social](https://localai.io/basics/news/#media-blogs-social)
172
+
173
+ - [Run Visual studio code with LocalAI (SUSE)](https://www.suse.com/c/running-ai-locally/)
174
+ - 🆕 [Run LocalAI on Jetson Nano Devkit](https://mudler.pm/posts/local-ai-jetson-nano-devkit/)
175
+ - [Run LocalAI on AWS EKS with Pulumi](https://www.pulumi.com/blog/low-code-llm-apps-with-local-ai-flowise-and-pulumi/)
176
+ - [Run LocalAI on AWS](https://staleks.hashnode.dev/installing-localai-on-aws-ec2-instance)
177
+ - [Create a slackbot for teams and OSS projects that answer to documentation](https://mudler.pm/posts/smart-slackbot-for-teams/)
178
+ - [LocalAI meets k8sgpt](https://www.youtube.com/watch?v=PKrDNuJ_dfE)
179
+ - [Question Answering on Documents locally with LangChain, LocalAI, Chroma, and GPT4All](https://mudler.pm/posts/localai-question-answering/)
180
+ - [Tutorial to use k8sgpt with LocalAI](https://medium.com/@tyler_97636/k8sgpt-localai-unlock-kubernetes-superpowers-for-free-584790de9b65)
181
+
182
+ ## Citation
183
+
184
+ If you utilize this repository or its data in a downstream project, please consider citing it with:
185
+
186
+ ```
187
+ @misc{localai,
188
+ author = {Ettore Di Giacinto},
189
+ title = {LocalAI: The free, Open source OpenAI alternative},
190
+ year = {2023},
191
+ publisher = {GitHub},
192
+ journal = {GitHub repository},
193
+ howpublished = {\url{https://github.com/go-skynet/LocalAI}},
+ }
194
+ ```
195
+
196
+ ## ❤️ Sponsors
197
+
198
+ > Do you find LocalAI useful?
199
+
200
+ Support the project by becoming [a backer or sponsor](https://github.com/sponsors/mudler). Your logo will show up here with a link to your website.
201
+
202
+ A huge thank you to our generous sponsors who support this project by covering CI expenses, and to everyone on our [Sponsor list](https://github.com/sponsors/mudler):
203
+
204
+ <p align="center">
205
+ <a href="https://www.spectrocloud.com/" target="blank">
206
+ <img height="200" src="https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512">
207
+ </a>
208
+ <a href="https://www.premai.io/" target="blank">
209
+ <img height="200" src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>
210
+ </a>
211
+ </p>
212
+
213
+ ## 🌟 Star history
214
+
215
+ [![LocalAI Star history Chart](https://api.star-history.com/svg?repos=go-skynet/LocalAI&type=Date)](https://star-history.com/#go-skynet/LocalAI&Date)
216
+
217
+ ## 📖 License
218
+
219
+ LocalAI is a community-driven project created by [Ettore Di Giacinto](https://github.com/mudler/).
220
+
221
+ MIT - Author Ettore Di Giacinto <mudler@localai.io>
222
+
223
+ ## 🙇 Acknowledgements
224
+
225
+ LocalAI couldn't have been built without the help of great software already available from the community. Thank you!
226
+
227
+ - [llama.cpp](https://github.com/ggerganov/llama.cpp)
228
+ - https://github.com/tatsu-lab/stanford_alpaca
229
+ - https://github.com/cornelk/llama-go for the initial ideas
230
+ - https://github.com/antimatter15/alpaca.cpp
231
+ - https://github.com/EdVince/Stable-Diffusion-NCNN
232
+ - https://github.com/ggerganov/whisper.cpp
233
+ - https://github.com/saharNooby/rwkv.cpp
234
+ - https://github.com/rhasspy/piper
235
+
236
+ ## 🤗 Contributors
237
+
238
+ This is a community project; a special thanks to our contributors! 🤗
239
+ <a href="https://github.com/go-skynet/LocalAI/graphs/contributors">
240
+ <img src="https://contrib.rocks/image?repo=go-skynet/LocalAI" />
241
+ </a>
SECURITY.md ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Security Policy
2
+
3
+ ## Introduction
4
+
5
+ At LocalAI, we take the security of our software seriously. We understand the importance of protecting our community from vulnerabilities and are committed to ensuring the safety and security of our users.
6
+
7
+ ## Supported Versions
8
+
9
+ We provide support and updates for certain versions of our software. The following table outlines which versions are currently supported with security updates:
10
+
11
+ | Version | Supported |
12
+ | ------- | ------------------ |
13
+ | > 2.0 | :white_check_mark: |
14
+ | < 2.0 | :x: |
15
+
16
+ Please ensure that you are using a supported version to receive the latest security updates.
17
+
18
+ ## Reporting a Vulnerability
19
+
20
+ We encourage the responsible disclosure of any security vulnerabilities. If you believe you've found a security issue in our software, we kindly ask you to follow the steps below to report it to us:
21
+
22
+ 1. **Email Us:** Send an email to [security@localai.io](mailto:security@localai.io) with a detailed report. Please do not disclose the vulnerability publicly or to any third parties before it has been addressed by us.
23
+
24
+ 2. **Expect a Response:** We aim to acknowledge receipt of vulnerability reports within 48 hours. Our security team will review your report and work closely with you to understand the impact and ensure a thorough investigation.
25
+
26
+ 3. **Collaboration:** If the vulnerability is accepted, we will work with you and our community to address the issue promptly. We'll keep you informed throughout the resolution process and may request additional information or collaboration.
27
+
28
+ 4. **Disclosure:** Once the vulnerability has been resolved, we encourage a coordinated disclosure. We believe in transparency and will work with you to ensure that our community is informed in a responsible manner.
29
+
30
+ ## Use of Third-Party Platforms
31
+
32
+ As a Free and Open Source Software (FOSS) organization, we do not offer monetary bounties. However, researchers who wish to report vulnerabilities can also do so via [Huntr](https://huntr.dev/bounties), a platform that recognizes contributions to open source security.
33
+
34
+ ## Contact
35
+
36
+ For any security-related inquiries beyond vulnerability reporting, please contact us at [security@localai.io](mailto:security@localai.io).
37
+
38
+ ## Acknowledgments
39
+
40
+ We appreciate the efforts of those who contribute to the security of our project. Your responsible disclosure is invaluable to the safety and integrity of LocalAI.
41
+
42
+ Thank you for helping us keep LocalAI secure.
aio/cpu/README.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ ## AIO CPU size
2
+
3
+ Use this image for CPU-only setups.
4
+
5
+ Please use only C++ backends here, so the base image stays as small as possible (no CUDA, cuDNN, Python, etc.).
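+ 
+ A minimal way to try the CPU-only AIO image (same image tag as referenced in the main README; adjust ports and volumes to your setup):
+ 
+ ```bash
+ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
+ ```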
aio/cpu/embeddings.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: text-embedding-ada-002
2
+ backend: bert-embeddings
3
+ parameters:
4
+ model: huggingface://mudler/all-MiniLM-L6-v2/ggml-model-q4_0.bin
5
+
6
+ usage: |
7
+ You can test this model with curl like this:
8
+
9
+ curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
10
+ "input": "Your text string goes here",
11
+ "model": "text-embedding-ada-002"
12
+ }'
aio/cpu/image-gen.yaml ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: stablediffusion
2
+ backend: stablediffusion
3
+ parameters:
4
+ model: stablediffusion_assets
5
+
6
+ license: "BSD-3"
7
+ urls:
8
+ - https://github.com/EdVince/Stable-Diffusion-NCNN
9
+ - https://github.com/EdVince/Stable-Diffusion-NCNN/blob/main/LICENSE
10
+
11
+ description: |
12
+ Stable Diffusion in NCNN with c++, supported txt2img and img2img
13
+
14
+ download_files:
15
+ - filename: "stablediffusion_assets/AutoencoderKL-256-256-fp16-opt.param"
16
+ sha256: "18ca4b66685e21406bcf64c484b3b680b4949900415536d599cc876579c85c82"
17
+ uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-256-256-fp16-opt.param"
18
+ - filename: "stablediffusion_assets/AutoencoderKL-512-512-fp16-opt.param"
19
+ sha256: "cf45f63aacf3dbbab0f59ed92a6f2c14d9a1801314631cd3abe91e3c85639a20"
20
+ uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-512-512-fp16-opt.param"
21
+ - filename: "stablediffusion_assets/AutoencoderKL-base-fp16.param"
22
+ sha256: "0254a056dce61b0c27dc9ec1b78b53bcf55315c540f55f051eb841aa992701ba"
23
+ uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-base-fp16.param"
24
+ - filename: "stablediffusion_assets/AutoencoderKL-encoder-512-512-fp16.bin"
25
+ sha256: "ddcb79a9951b9f91e05e087739ed69da2c1c4ae30ba4168cce350b49d617c9fa"
26
+ uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-encoder-512-512-fp16.bin"
27
+ - filename: "stablediffusion_assets/AutoencoderKL-fp16.bin"
28
+ sha256: "f02e71f80e70252734724bbfaed5c4ddd3a8ed7e61bb2175ff5f53099f0e35dd"
29
+ uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-fp16.bin"
30
+ - filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.bin"
31
+ sha256: "1c9a12f4e1dd1b295a388045f7f28a2352a4d70c3dc96a542189a3dd7051fdd6"
32
+ uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/FrozenCLIPEmbedder-fp16.bin"
33
+ - filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.param"
34
+ sha256: "471afbe678dd1fd3fe764ef9c6eccaccb0a7d7e601f27b462aa926b20eb368c9"
35
+ uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/FrozenCLIPEmbedder-fp16.param"
36
+ - filename: "stablediffusion_assets/log_sigmas.bin"
37
+ sha256: "a2089f8aa4c61f9c200feaec541ab3f5c94233b28deb6d5e8bcd974fa79b68ac"
38
+ uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/raw/main/x86/linux/assets/log_sigmas.bin"
39
+ - filename: "stablediffusion_assets/UNetModel-256-256-MHA-fp16-opt.param"
40
+ sha256: "a58c380229f09491776df837b7aa7adffc0a87821dc4708b34535da2e36e3da1"
41
+ uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-256-256-MHA-fp16-opt.param"
42
+ - filename: "stablediffusion_assets/UNetModel-512-512-MHA-fp16-opt.param"
43
+ sha256: "f12034067062827bd7f43d1d21888d1f03905401acf6c6eea22be23c259636fa"
44
+ uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-512-512-MHA-fp16-opt.param"
45
+ - filename: "stablediffusion_assets/UNetModel-base-MHA-fp16.param"
46
+ sha256: "696f6975de49f4325b53ce32aff81861a6d6c07cd9ce3f0aae2cc405350af38d"
47
+ uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-base-MHA-fp16.param"
48
+ - filename: "stablediffusion_assets/UNetModel-MHA-fp16.bin"
49
+ sha256: "d618918d011bfc1f644c0f2a33bf84931bd53b28a98492b0a8ed6f3a818852c3"
50
+ uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/UNetModel-MHA-fp16.bin"
51
+ - filename: "stablediffusion_assets/vocab.txt"
52
+ sha256: "e30e57b6f1e47616982ef898d8922be24e535b4fa3d0110477b3a6f02ebbae7d"
53
+ uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/vocab.txt"
54
+
55
+ usage: |
56
+ curl http://localhost:8080/v1/images/generations \
57
+ -H "Content-Type: application/json" \
58
+ -d '{
59
+ "prompt": "<positive prompt>|<negative prompt>",
60
+ "step": 25,
61
+ "size": "512x512"
62
+ }'
aio/cpu/rerank.yaml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: jina-reranker-v1-base-en
2
+ backend: rerankers
3
+ parameters:
4
+ model: cross-encoder
5
+
6
+ usage: |
7
+ You can test this model with curl like this:
8
+
9
+ curl http://localhost:8080/v1/rerank \
10
+ -H "Content-Type: application/json" \
11
+ -d '{
12
+ "model": "jina-reranker-v1-base-en",
13
+ "query": "Organic skincare products for sensitive skin",
14
+ "documents": [
15
+ "Eco-friendly kitchenware for modern homes",
16
+ "Biodegradable cleaning supplies for eco-conscious consumers",
17
+ "Organic cotton baby clothes for sensitive skin",
18
+ "Natural organic skincare range for sensitive skin",
19
+ "Tech gadgets for smart homes: 2024 edition",
20
+ "Sustainable gardening tools and compost solutions",
21
+ "Sensitive skin-friendly facial cleansers and toners",
22
+ "Organic food wraps and storage solutions",
23
+ "All-natural pet food for dogs with allergies",
24
+ "Yoga mats made from recycled materials"
25
+ ],
26
+ "top_n": 3
27
+ }'
aio/cpu/speech-to-text.yaml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: whisper-1
2
+ backend: whisper
3
+ parameters:
4
+ model: ggml-whisper-base.bin
5
+
6
+ usage: |
7
+ ## example audio file
8
+ wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
9
+
10
+ ## Send the example audio file to the transcriptions endpoint
11
+ curl http://localhost:8080/v1/audio/transcriptions \
12
+ -H "Content-Type: multipart/form-data" \
13
+ -F file="@$PWD/gb1.ogg" -F model="whisper-1"
14
+
15
+ download_files:
16
+ - filename: "ggml-whisper-base.bin"
17
+ sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"
18
+ uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"
aio/cpu/text-to-speech.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: tts-1
2
+ download_files:
3
+ - filename: voice-en-us-amy-low.tar.gz
4
+ uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
5
+
6
+ parameters:
7
+ model: en-us-amy-low.onnx
8
+
9
+ usage: |
10
+ To test if this model works as expected, you can use the following curl command:
11
+
12
+ curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
13
+ "model":"voice-en-us-amy-low",
14
+ "input": "Hi, this is a test."
15
+ }'
aio/cpu/text-to-text.yaml ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: gpt-4
2
+ mmap: true
3
+ parameters:
4
+ model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
5
+ context_size: 8192
6
+
7
+ stopwords:
8
+ - "<|im_end|>"
9
+ - "<dummy32000>"
10
+ - "</tool_call>"
11
+ - "<|eot_id|>"
12
+ - "<|end_of_text|>"
13
+
14
+ function:
15
+ # disable injecting the "answer" tool
16
+ disable_no_action: true
17
+
18
+ grammar:
19
+ # This allows the grammar to also return messages
20
+ mixed_mode: true
21
+ # Suffix to add to the grammar
22
+ #prefix: '<tool_call>\n'
23
+ # Force parallel calls in the grammar
24
+ # parallel_calls: true
25
+
26
+ return_name_in_function_response: true
27
+ # Without grammar uncomment the lines below
28
+ # Warning: this is relying only on the capability of the
29
+ # LLM model to generate the correct function call.
30
+ json_regex_match:
31
+ - "(?s)<tool_call>(.*?)</tool_call>"
32
+ - "(?s)<tool_call>(.*?)"
33
+ replace_llm_results:
34
+ # Drop the scratchpad content from responses
35
+ - key: "(?s)<scratchpad>.*</scratchpad>"
36
+ value: ""
37
+ replace_function_results:
38
+ # Replace everything that is not JSON array or object
39
+ #
40
+ - key: '(?s)^[^{\[]*'
41
+ value: ""
42
+ - key: '(?s)[^}\]]*$'
43
+ value: ""
44
+ - key: "'([^']*?)'"
45
+ value: "_DQUOTE_${1}_DQUOTE_"
46
+ - key: '\\"'
47
+ value: "__TEMP_QUOTE__"
48
+ - key: "\'"
49
+ value: "'"
50
+ - key: "_DQUOTE_"
51
+ value: '"'
52
+ - key: "__TEMP_QUOTE__"
53
+ value: '"'
54
+ # Drop the scratchpad content from responses
55
+ - key: "(?s)<scratchpad>.*</scratchpad>"
56
+ value: ""
57
+
58
+ template:
59
+ chat: |
60
+ {{.Input -}}
61
+ <|im_start|>assistant
62
+ chat_message: |
63
+ <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
64
+ {{- if .FunctionCall }}
65
+ <tool_call>
66
+ {{- else if eq .RoleName "tool" }}
67
+ <tool_response>
68
+ {{- end }}
69
+ {{- if .Content}}
70
+ {{.Content }}
71
+ {{- end }}
72
+ {{- if .FunctionCall}}
73
+ {{toJson .FunctionCall}}
74
+ {{- end }}
75
+ {{- if .FunctionCall }}
76
+ </tool_call>
77
+ {{- else if eq .RoleName "tool" }}
78
+ </tool_response>
79
+ {{- end }}<|im_end|>
80
+ completion: |
81
+ {{.Input}}
82
+ function: |-
83
+ <|im_start|>system
84
+ You are a function calling AI model.
85
+ Here are the available tools:
86
+ <tools>
87
+ {{range .Functions}}
88
+ {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
89
+ {{end}}
90
+ </tools>
91
+ You should call the tools provided to you sequentially
92
+ Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
93
+ <scratchpad>
94
+ {step-by-step reasoning and plan in bullet points}
95
+ </scratchpad>
96
+ For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
97
+ <tool_call>
98
+ {"arguments": <args-dict>, "name": <function-name>}
99
+ </tool_call><|im_end|>
100
+ {{.Input -}}
101
+ <|im_start|>assistant
aio/cpu/vision.yaml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ backend: llama-cpp
2
+ context_size: 4096
3
+ f16: true
4
+ mmap: true
5
+ name: gpt-4o
6
+
7
+ roles:
8
+ user: "USER:"
9
+ assistant: "ASSISTANT:"
10
+ system: "SYSTEM:"
11
+
12
+ mmproj: bakllava-mmproj.gguf
13
+ parameters:
14
+ model: bakllava.gguf
15
+
16
+ template:
17
+ chat: |
18
+ A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
19
+ {{.Input}}
20
+ ASSISTANT:
21
+
22
+ download_files:
23
+ - filename: bakllava.gguf
24
+ uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
25
+ - filename: bakllava-mmproj.gguf
26
+ uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
27
+
28
+ usage: |
29
+ curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
30
+ "model": "gpt-4-vision-preview",
31
+ "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
aio/entrypoint.sh ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ echo "===> LocalAI All-in-One (AIO) container starting..."
4
+
5
+ GPU_ACCELERATION=false
6
+ GPU_VENDOR=""
7
+
8
+ function check_intel() {
9
+ if lspci | grep -E 'VGA|3D' | grep -iq intel; then
10
+ echo "Intel GPU detected"
11
+ if [ -d /opt/intel ]; then
12
+ GPU_ACCELERATION=true
13
+ GPU_VENDOR=intel
14
+ else
15
+ echo "Intel GPU detected, but Intel GPU drivers are not installed. GPU acceleration will not be available."
16
+ fi
17
+ fi
18
+ }
19
+
20
+ function check_nvidia_wsl() {
21
+ if lspci | grep -E 'VGA|3D' | grep -iq "Microsoft Corporation Device 008e"; then
22
+ # We assume this WSL2 card is NVIDIA, then check for nvidia-smi
23
+ # Make sure the container was run with `--gpus all` as the only required parameter
24
+ echo "NVIDIA GPU detected via WSL2"
25
+ # nvidia-smi should be installed in the container
26
+ if nvidia-smi; then
27
+ GPU_ACCELERATION=true
28
+ GPU_VENDOR=nvidia
29
+ else
30
+ echo "NVIDIA GPU detected via WSL2, but nvidia-smi is not installed. GPU acceleration will not be available."
31
+ fi
32
+ fi
33
+ }
34
+
35
+ function check_amd() {
36
+ if lspci | grep -E 'VGA|3D' | grep -iq amd; then
37
+ echo "AMD GPU detected"
38
+ # Check if ROCm is installed
39
+ if [ -d /opt/rocm ]; then
40
+ GPU_ACCELERATION=true
41
+ GPU_VENDOR=amd
42
+ else
43
+ echo "AMD GPU detected, but ROCm is not installed. GPU acceleration will not be available."
44
+ fi
45
+ fi
46
+ }
47
+
48
+ function check_nvidia() {
49
+ if lspci | grep -E 'VGA|3D' | grep -iq nvidia; then
50
+ echo "NVIDIA GPU detected"
51
+ # nvidia-smi should be installed in the container
52
+ if nvidia-smi; then
53
+ GPU_ACCELERATION=true
54
+ GPU_VENDOR=nvidia
55
+ else
56
+ echo "NVIDIA GPU detected, but nvidia-smi is not installed. GPU acceleration will not be available."
57
+ fi
58
+ fi
59
+ }
60
+
61
+ function check_metal() {
62
+ if system_profiler SPDisplaysDataType | grep -iq 'Metal'; then
63
+ echo "Apple Metal supported GPU detected"
64
+ GPU_ACCELERATION=true
65
+ GPU_VENDOR=apple
66
+ fi
67
+ }
68
+
69
+ function detect_gpu() {
70
+ case "$(uname -s)" in
71
+ Linux)
72
+ check_nvidia
73
+ check_amd
74
+ check_intel
75
+ check_nvidia_wsl
76
+ ;;
77
+ Darwin)
78
+ check_metal
79
+ ;;
80
+ esac
81
+ }
82
+
83
+ function detect_gpu_size() {
84
+ # Attempting to find GPU memory size for NVIDIA GPUs
85
+ if [ "$GPU_ACCELERATION" = true ] && [ "$GPU_VENDOR" = "nvidia" ]; then
86
+ echo "NVIDIA GPU detected. Attempting to find memory size..."
87
+ # Using head -n 1 to get the total memory of the 1st NVIDIA GPU detected.
88
+ # If handling multiple GPUs is required in the future, this is the place to do it
89
+ nvidia_sm=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits | head -n 1)
90
+ if [ ! -z "$nvidia_sm" ]; then
91
+ echo "Total GPU Memory: $nvidia_sm MiB"
92
+ # if bigger than 8GB, use 16GB
93
+ #if [ "$nvidia_sm" -gt 8192 ]; then
94
+ # GPU_SIZE=gpu-16g
95
+ #else
96
+ GPU_SIZE=gpu-8g
97
+ #fi
98
+ else
99
+ echo "Unable to determine NVIDIA GPU memory size. Falling back to CPU."
100
+ GPU_SIZE=gpu-8g
101
+ fi
102
+ elif [ "$GPU_ACCELERATION" = true ] && [ "$GPU_VENDOR" = "intel" ]; then
103
+ GPU_SIZE=intel
104
+ # Default to a generic GPU size until we implement GPU size detection for non-NVIDIA GPUs
105
+ elif [ "$GPU_ACCELERATION" = true ]; then
106
+ echo "Non-NVIDIA GPU detected. Specific GPU memory size detection is not implemented."
107
+ GPU_SIZE=gpu-8g
108
+
109
+ # default to cpu if GPU_SIZE is not set
110
+ else
111
+ echo "GPU acceleration is not enabled or supported. Defaulting to CPU."
112
+ GPU_SIZE=cpu
113
+ fi
114
+ }
115
+
116
+ function check_vars() {
117
+ if [ -z "$MODELS" ]; then
118
+ echo "MODELS environment variable is not set. Please set it to a comma-separated list of model YAML files to load."
119
+ exit 1
120
+ fi
121
+
122
+ if [ -z "$PROFILE" ]; then
123
+ echo "PROFILE environment variable is not set. Please set it to one of the following: cpu, gpu-8g, gpu-16g, intel, apple"
124
+ exit 1
125
+ fi
126
+ }
127
+
128
+ detect_gpu
129
+ detect_gpu_size
130
+
131
+ PROFILE="${PROFILE:-$GPU_SIZE}" # default to the detected GPU_SIZE (cpu when no usable GPU is found)
132
+ export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"
133
+
134
+ check_vars
135
+
136
+ echo "===> Starting LocalAI[$PROFILE] with the following models: $MODELS"
137
+
138
+ exec /build/entrypoint.sh "$@"
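The entrypoint above auto-detects a profile, but both variables it exports can be overridden when the container is started. A minimal sketch, not part of this changeset; the image tag is an assumption and depends on which AIO image you build or pull:

docker run -p 8080:8080 --gpus all \
  -e PROFILE=gpu-8g \
  -e MODELS=/aio/gpu-8g/text-to-text.yaml,/aio/gpu-8g/vision.yaml \
  localai/localai:latest-aio-gpu-nvidia-cuda-12

Leaving PROFILE and MODELS unset keeps the detection logic in charge and loads every YAML file of the detected profile.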
aio/gpu-8g/embeddings.yaml ADDED
@@ -0,0 +1,12 @@
1
+ name: text-embedding-ada-002
2
+ backend: sentencetransformers
3
+ parameters:
4
+ model: all-MiniLM-L6-v2
5
+
6
+ usage: |
7
+ You can test this model with curl like this:
8
+
9
+ curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
10
+ "input": "Your text string goes here",
11
+ "model": "text-embedding-ada-002"
12
+ }'
aio/gpu-8g/image-gen.yaml ADDED
@@ -0,0 +1,25 @@
1
+ name: stablediffusion
2
+ parameters:
3
+ model: DreamShaper_8_pruned.safetensors
4
+ backend: diffusers
5
+ step: 25
6
+ f16: true
7
+
8
+ diffusers:
9
+ pipeline_type: StableDiffusionPipeline
10
+ cuda: true
11
+ enable_parameters: "negative_prompt,num_inference_steps"
12
+ scheduler_type: "k_dpmpp_2m"
13
+
14
+ download_files:
15
+ - filename: DreamShaper_8_pruned.safetensors
16
+ uri: huggingface://Lykon/DreamShaper/DreamShaper_8_pruned.safetensors
17
+
18
+ usage: |
19
+ curl http://localhost:8080/v1/images/generations \
20
+ -H "Content-Type: application/json" \
21
+ -d '{
22
+ "prompt": "<positive prompt>|<negative prompt>",
23
+ "step": 25,
24
+ "size": "512x512"
25
+ }'
aio/gpu-8g/rerank.yaml ADDED
@@ -0,0 +1,27 @@
1
+ name: jina-reranker-v1-base-en
2
+ backend: rerankers
3
+ parameters:
4
+ model: cross-encoder
5
+
6
+ usage: |
7
+ You can test this model with curl like this:
8
+
9
+ curl http://localhost:8080/v1/rerank \
10
+ -H "Content-Type: application/json" \
11
+ -d '{
12
+ "model": "jina-reranker-v1-base-en",
13
+ "query": "Organic skincare products for sensitive skin",
14
+ "documents": [
15
+ "Eco-friendly kitchenware for modern homes",
16
+ "Biodegradable cleaning supplies for eco-conscious consumers",
17
+ "Organic cotton baby clothes for sensitive skin",
18
+ "Natural organic skincare range for sensitive skin",
19
+ "Tech gadgets for smart homes: 2024 edition",
20
+ "Sustainable gardening tools and compost solutions",
21
+ "Sensitive skin-friendly facial cleansers and toners",
22
+ "Organic food wraps and storage solutions",
23
+ "All-natural pet food for dogs with allergies",
24
+ "Yoga mats made from recycled materials"
25
+ ],
26
+ "top_n": 3
27
+ }'
aio/gpu-8g/speech-to-text.yaml ADDED
@@ -0,0 +1,18 @@
1
+ name: whisper-1
2
+ backend: whisper
3
+ parameters:
4
+ model: ggml-whisper-base.bin
5
+
6
+ usage: |
7
+ ## example audio file
8
+ wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
9
+
10
+ ## Send the example audio file to the transcriptions endpoint
11
+ curl http://localhost:8080/v1/audio/transcriptions \
12
+ -H "Content-Type: multipart/form-data" \
13
+ -F file="@$PWD/gb1.ogg" -F model="whisper-1"
14
+
15
+ download_files:
16
+ - filename: "ggml-whisper-base.bin"
17
+ sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"
18
+ uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"
aio/gpu-8g/text-to-speech.yaml ADDED
@@ -0,0 +1,15 @@
1
+ name: tts-1
2
+ download_files:
3
+ - filename: voice-en-us-amy-low.tar.gz
4
+ uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
5
+
6
+ parameters:
7
+ model: en-us-amy-low.onnx
8
+
9
+ usage: |
10
+ To test if this model works as expected, you can use the following curl command:
11
+
12
+ curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
13
+ "model":"tts-1",
14
+ "input": "Hi, this is a test."
15
+ }'
aio/gpu-8g/text-to-text.yaml ADDED
@@ -0,0 +1,101 @@
1
+ name: gpt-4
2
+ mmap: true
3
+ parameters:
4
+ model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
5
+ context_size: 8192
6
+
7
+ stopwords:
8
+ - "<|im_end|>"
9
+ - "<dummy32000>"
10
+ - "</tool_call>"
11
+ - "<|eot_id|>"
12
+ - "<|end_of_text|>"
13
+
14
+ function:
15
+ # disable injecting the "answer" tool
16
+ disable_no_action: true
17
+
18
+ grammar:
19
+ # This allows the grammar to also return messages
20
+ mixed_mode: true
21
+ # Suffix to add to the grammar
22
+ #prefix: '<tool_call>\n'
23
+ # Force parallel calls in the grammar
24
+ # parallel_calls: true
25
+
26
+ return_name_in_function_response: true
27
+ # Without grammar uncomment the lines below
28
+ # Warning: this is relying only on the capability of the
29
+ # LLM model to generate the correct function call.
30
+ json_regex_match:
31
+ - "(?s)<tool_call>(.*?)</tool_call>"
32
+ - "(?s)<tool_call>(.*?)"
33
+ replace_llm_results:
34
+ # Drop the scratchpad content from responses
35
+ - key: "(?s)<scratchpad>.*</scratchpad>"
36
+ value: ""
37
+ replace_function_results:
38
+ # Replace everything that is not JSON array or object
39
+ #
40
+ - key: '(?s)^[^{\[]*'
41
+ value: ""
42
+ - key: '(?s)[^}\]]*$'
43
+ value: ""
44
+ - key: "'([^']*?)'"
45
+ value: "_DQUOTE_${1}_DQUOTE_"
46
+ - key: '\\"'
47
+ value: "__TEMP_QUOTE__"
48
+ - key: "\'"
49
+ value: "'"
50
+ - key: "_DQUOTE_"
51
+ value: '"'
52
+ - key: "__TEMP_QUOTE__"
53
+ value: '"'
54
+ # Drop the scratchpad content from responses
55
+ - key: "(?s)<scratchpad>.*</scratchpad>"
56
+ value: ""
57
+
58
+ template:
59
+ chat: |
60
+ {{.Input -}}
61
+ <|im_start|>assistant
62
+ chat_message: |
63
+ <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
64
+ {{- if .FunctionCall }}
65
+ <tool_call>
66
+ {{- else if eq .RoleName "tool" }}
67
+ <tool_response>
68
+ {{- end }}
69
+ {{- if .Content}}
70
+ {{.Content }}
71
+ {{- end }}
72
+ {{- if .FunctionCall}}
73
+ {{toJson .FunctionCall}}
74
+ {{- end }}
75
+ {{- if .FunctionCall }}
76
+ </tool_call>
77
+ {{- else if eq .RoleName "tool" }}
78
+ </tool_response>
79
+ {{- end }}<|im_end|>
80
+ completion: |
81
+ {{.Input}}
82
+ function: |-
83
+ <|im_start|>system
84
+ You are a function calling AI model.
85
+ Here are the available tools:
86
+ <tools>
87
+ {{range .Functions}}
88
+ {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
89
+ {{end}}
90
+ </tools>
91
+ You should call the tools provided to you sequentially
92
+ Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
93
+ <scratchpad>
94
+ {step-by-step reasoning and plan in bullet points}
95
+ </scratchpad>
96
+ For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
97
+ <tool_call>
98
+ {"arguments": <args-dict>, "name": <function-name>}
99
+ </tool_call><|im_end|>
100
+ {{.Input -}}
101
+ <|im_start|>assistant
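Unlike the other files in this profile, the text-to-text config above ships without a usage snippet. A minimal sketch of exercising it through the same /v1/chat/completions endpoint used by the vision config (host and port assume the default LocalAI address):

curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
  "model": "gpt-4",
  "messages": [{"role": "user", "content": "Say hello in one short sentence."}],
  "temperature": 0.7
}'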
aio/gpu-8g/vision.yaml ADDED
@@ -0,0 +1,35 @@
1
+ backend: llama-cpp
2
+ context_size: 4096
3
+ f16: true
4
+ mmap: true
5
+ name: gpt-4o
6
+
7
+ roles:
8
+ user: "USER:"
9
+ assistant: "ASSISTANT:"
10
+ system: "SYSTEM:"
11
+
12
+ mmproj: llava-v1.6-7b-mmproj-f16.gguf
13
+ parameters:
14
+ model: llava-v1.6-mistral-7b.Q5_K_M.gguf
15
+ temperature: 0.2
16
+ top_k: 40
17
+ top_p: 0.95
18
+ seed: -1
19
+
20
+ template:
21
+ chat: |
22
+ A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
23
+ {{.Input}}
24
+ ASSISTANT:
25
+
26
+ download_files:
27
+ - filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
28
+ uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
29
+ - filename: llava-v1.6-7b-mmproj-f16.gguf
30
+ uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
31
+
32
+ usage: |
33
+ curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
34
+ "model": "gpt-4o",
35
+ "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}]}], "temperature": 0.9}'
aio/intel/embeddings.yaml ADDED
@@ -0,0 +1,12 @@
1
+ name: text-embedding-ada-002
2
+ backend: sentencetransformers
3
+ parameters:
4
+ model: all-MiniLM-L6-v2
5
+
6
+ usage: |
7
+ You can test this model with curl like this:
8
+
9
+ curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
10
+ "input": "Your text string goes here",
11
+ "model": "text-embedding-ada-002"
12
+ }'
aio/intel/image-gen.yaml ADDED
@@ -0,0 +1,20 @@
1
+ name: stablediffusion
2
+ parameters:
3
+ model: Lykon/dreamshaper-8
4
+ backend: diffusers
5
+ step: 25
6
+ f16: true
7
+ diffusers:
8
+ pipeline_type: StableDiffusionPipeline
9
+ cuda: true
10
+ enable_parameters: "negative_prompt,num_inference_steps"
11
+ scheduler_type: "k_dpmpp_2m"
12
+
13
+ usage: |
14
+ curl http://localhost:8080/v1/images/generations \
15
+ -H "Content-Type: application/json" \
16
+ -d '{
17
+ "prompt": "<positive prompt>|<negative prompt>",
18
+ "step": 25,
19
+ "size": "512x512"
20
+ }'
aio/intel/rerank.yaml ADDED
@@ -0,0 +1,27 @@
1
+ name: jina-reranker-v1-base-en
2
+ backend: rerankers
3
+ parameters:
4
+ model: cross-encoder
5
+
6
+ usage: |
7
+ You can test this model with curl like this:
8
+
9
+ curl http://localhost:8080/v1/rerank \
10
+ -H "Content-Type: application/json" \
11
+ -d '{
12
+ "model": "jina-reranker-v1-base-en",
13
+ "query": "Organic skincare products for sensitive skin",
14
+ "documents": [
15
+ "Eco-friendly kitchenware for modern homes",
16
+ "Biodegradable cleaning supplies for eco-conscious consumers",
17
+ "Organic cotton baby clothes for sensitive skin",
18
+ "Natural organic skincare range for sensitive skin",
19
+ "Tech gadgets for smart homes: 2024 edition",
20
+ "Sustainable gardening tools and compost solutions",
21
+ "Sensitive skin-friendly facial cleansers and toners",
22
+ "Organic food wraps and storage solutions",
23
+ "All-natural pet food for dogs with allergies",
24
+ "Yoga mats made from recycled materials"
25
+ ],
26
+ "top_n": 3
27
+ }'
aio/intel/speech-to-text.yaml ADDED
@@ -0,0 +1,18 @@
1
+ name: whisper-1
2
+ backend: whisper
3
+ parameters:
4
+ model: ggml-whisper-base.bin
5
+
6
+ usage: |
7
+ ## example audio file
8
+ wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
9
+
10
+ ## Send the example audio file to the transcriptions endpoint
11
+ curl http://localhost:8080/v1/audio/transcriptions \
12
+ -H "Content-Type: multipart/form-data" \
13
+ -F file="@$PWD/gb1.ogg" -F model="whisper-1"
14
+
15
+ download_files:
16
+ - filename: "ggml-whisper-base.bin"
17
+ sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"
18
+ uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"
aio/intel/text-to-speech.yaml ADDED
@@ -0,0 +1,15 @@
1
+ name: tts-1
2
+ download_files:
3
+ - filename: voice-en-us-amy-low.tar.gz
4
+ uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
5
+
6
+ parameters:
7
+ model: en-us-amy-low.onnx
8
+
9
+ usage: |
10
+ To test if this model works as expected, you can use the following curl command:
11
+
12
+ curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
13
+ "model":"tts-1",
14
+ "input": "Hi, this is a test."
15
+ }'
aio/intel/text-to-text.yaml ADDED
@@ -0,0 +1,103 @@
1
+ name: gpt-4
2
+ mmap: false
3
+ context_size: 8192
4
+
5
+ f16: false
6
+ parameters:
7
+ model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
8
+
9
+ stopwords:
10
+ - "<|im_end|>"
11
+ - "<dummy32000>"
12
+ - "</tool_call>"
13
+ - "<|eot_id|>"
14
+ - "<|end_of_text|>"
15
+
16
+ function:
17
+ # disable injecting the "answer" tool
18
+ disable_no_action: true
19
+
20
+ grammar:
21
+ # This allows the grammar to also return messages
22
+ mixed_mode: true
23
+ # Suffix to add to the grammar
24
+ #prefix: '<tool_call>\n'
25
+ # Force parallel calls in the grammar
26
+ # parallel_calls: true
27
+
28
+ return_name_in_function_response: true
29
+ # Without grammar uncomment the lines below
30
+ # Warning: this is relying only on the capability of the
31
+ # LLM model to generate the correct function call.
32
+ json_regex_match:
33
+ - "(?s)<tool_call>(.*?)</tool_call>"
34
+ - "(?s)<tool_call>(.*?)"
35
+ replace_llm_results:
36
+ # Drop the scratchpad content from responses
37
+ - key: "(?s)<scratchpad>.*</scratchpad>"
38
+ value: ""
39
+ replace_function_results:
40
+ # Replace everything that is not JSON array or object
41
+ #
42
+ - key: '(?s)^[^{\[]*'
43
+ value: ""
44
+ - key: '(?s)[^}\]]*$'
45
+ value: ""
46
+ - key: "'([^']*?)'"
47
+ value: "_DQUOTE_${1}_DQUOTE_"
48
+ - key: '\\"'
49
+ value: "__TEMP_QUOTE__"
50
+ - key: "\'"
51
+ value: "'"
52
+ - key: "_DQUOTE_"
53
+ value: '"'
54
+ - key: "__TEMP_QUOTE__"
55
+ value: '"'
56
+ # Drop the scratchpad content from responses
57
+ - key: "(?s)<scratchpad>.*</scratchpad>"
58
+ value: ""
59
+
60
+ template:
61
+ chat: |
62
+ {{.Input -}}
63
+ <|im_start|>assistant
64
+ chat_message: |
65
+ <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
66
+ {{- if .FunctionCall }}
67
+ <tool_call>
68
+ {{- else if eq .RoleName "tool" }}
69
+ <tool_response>
70
+ {{- end }}
71
+ {{- if .Content}}
72
+ {{.Content }}
73
+ {{- end }}
74
+ {{- if .FunctionCall}}
75
+ {{toJson .FunctionCall}}
76
+ {{- end }}
77
+ {{- if .FunctionCall }}
78
+ </tool_call>
79
+ {{- else if eq .RoleName "tool" }}
80
+ </tool_response>
81
+ {{- end }}<|im_end|>
82
+ completion: |
83
+ {{.Input}}
84
+ function: |-
85
+ <|im_start|>system
86
+ You are a function calling AI model.
87
+ Here are the available tools:
88
+ <tools>
89
+ {{range .Functions}}
90
+ {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
91
+ {{end}}
92
+ </tools>
93
+ You should call the tools provided to you sequentially
94
+ Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
95
+ <scratchpad>
96
+ {step-by-step reasoning and plan in bullet points}
97
+ </scratchpad>
98
+ For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
99
+ <tool_call>
100
+ {"arguments": <args-dict>, "name": <function-name>}
101
+ </tool_call><|im_end|>
102
+ {{.Input -}}
103
+ <|im_start|>assistant
aio/intel/vision.yaml ADDED
@@ -0,0 +1,35 @@
1
+ backend: llama-cpp
2
+ context_size: 4096
3
+ mmap: false
4
+ f16: false
5
+ name: gpt-4o
6
+
7
+ roles:
8
+ user: "USER:"
9
+ assistant: "ASSISTANT:"
10
+ system: "SYSTEM:"
11
+
12
+ mmproj: llava-v1.6-7b-mmproj-f16.gguf
13
+ parameters:
14
+ model: llava-v1.6-mistral-7b.Q5_K_M.gguf
15
+ temperature: 0.2
16
+ top_k: 40
17
+ top_p: 0.95
18
+ seed: -1
19
+
20
+ template:
21
+ chat: |
22
+ A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
23
+ {{.Input}}
24
+ ASSISTANT:
25
+
26
+ download_files:
27
+ - filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
28
+ uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
29
+ - filename: llava-v1.6-7b-mmproj-f16.gguf
30
+ uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
31
+
32
+ usage: |
33
+ curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
34
+ "model": "gpt-4o",
35
+ "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}]}], "temperature": 0.9}'
assets.go ADDED
@@ -0,0 +1,6 @@
1
+ package main
2
+
3
+ import "embed"
4
+
5
+ //go:embed backend-assets/*
6
+ var backendAssets embed.FS
backend/backend.proto ADDED
@@ -0,0 +1,331 @@
1
+ syntax = "proto3";
2
+
3
+ option go_package = "github.com/go-skynet/LocalAI/pkg/grpc/proto";
4
+ option java_multiple_files = true;
5
+ option java_package = "io.skynet.localai.backend";
6
+ option java_outer_classname = "LocalAIBackend";
7
+
8
+ package backend;
9
+
10
+ service Backend {
11
+ rpc Health(HealthMessage) returns (Reply) {}
12
+ rpc Predict(PredictOptions) returns (Reply) {}
13
+ rpc LoadModel(ModelOptions) returns (Result) {}
14
+ rpc PredictStream(PredictOptions) returns (stream Reply) {}
15
+ rpc Embedding(PredictOptions) returns (EmbeddingResult) {}
16
+ rpc GenerateImage(GenerateImageRequest) returns (Result) {}
17
+ rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
18
+ rpc TTS(TTSRequest) returns (Result) {}
19
+ rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
20
+ rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
21
+ rpc Status(HealthMessage) returns (StatusResponse) {}
22
+
23
+ rpc StoresSet(StoresSetOptions) returns (Result) {}
24
+ rpc StoresDelete(StoresDeleteOptions) returns (Result) {}
25
+ rpc StoresGet(StoresGetOptions) returns (StoresGetResult) {}
26
+ rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}
27
+
28
+ rpc Rerank(RerankRequest) returns (RerankResult) {}
29
+
30
+ rpc GetMetrics(MetricsRequest) returns (MetricsResponse);
31
+ }
32
+
33
+ // Define the empty request
34
+ message MetricsRequest {}
35
+
36
+ message MetricsResponse {
37
+ int32 slot_id = 1;
38
+ string prompt_json_for_slot = 2; // Stores the prompt as a JSON string.
39
+ float tokens_per_second = 3;
40
+ int32 tokens_generated = 4;
41
+ int32 prompt_tokens_processed = 5;
42
+ }
43
+
44
+ message RerankRequest {
45
+ string query = 1;
46
+ repeated string documents = 2;
47
+ int32 top_n = 3;
48
+ }
49
+
50
+ message RerankResult {
51
+ Usage usage = 1;
52
+ repeated DocumentResult results = 2;
53
+ }
54
+
55
+ message Usage {
56
+ int32 total_tokens = 1;
57
+ int32 prompt_tokens = 2;
58
+ }
59
+
60
+ message DocumentResult {
61
+ int32 index = 1;
62
+ string text = 2;
63
+ float relevance_score = 3;
64
+ }
65
+
66
+ message StoresKey {
67
+ repeated float Floats = 1;
68
+ }
69
+
70
+ message StoresValue {
71
+ bytes Bytes = 1;
72
+ }
73
+
74
+ message StoresSetOptions {
75
+ repeated StoresKey Keys = 1;
76
+ repeated StoresValue Values = 2;
77
+ }
78
+
79
+ message StoresDeleteOptions {
80
+ repeated StoresKey Keys = 1;
81
+ }
82
+
83
+ message StoresGetOptions {
84
+ repeated StoresKey Keys = 1;
85
+ }
86
+
87
+ message StoresGetResult {
88
+ repeated StoresKey Keys = 1;
89
+ repeated StoresValue Values = 2;
90
+ }
91
+
92
+ message StoresFindOptions {
93
+ StoresKey Key = 1;
94
+ int32 TopK = 2;
95
+ }
96
+
97
+ message StoresFindResult {
98
+ repeated StoresKey Keys = 1;
99
+ repeated StoresValue Values = 2;
100
+ repeated float Similarities = 3;
101
+ }
102
+
103
+ message HealthMessage {}
104
+
105
+ // The request message containing the prediction options.
106
+ message PredictOptions {
107
+ string Prompt = 1;
108
+ int32 Seed = 2;
109
+ int32 Threads = 3;
110
+ int32 Tokens = 4;
111
+ int32 TopK = 5;
112
+ int32 Repeat = 6;
113
+ int32 Batch = 7;
114
+ int32 NKeep = 8;
115
+ float Temperature = 9;
116
+ float Penalty = 10;
117
+ bool F16KV = 11;
118
+ bool DebugMode = 12;
119
+ repeated string StopPrompts = 13;
120
+ bool IgnoreEOS = 14;
121
+ float TailFreeSamplingZ = 15;
122
+ float TypicalP = 16;
123
+ float FrequencyPenalty = 17;
124
+ float PresencePenalty = 18;
125
+ int32 Mirostat = 19;
126
+ float MirostatETA = 20;
127
+ float MirostatTAU = 21;
128
+ bool PenalizeNL = 22;
129
+ string LogitBias = 23;
130
+ bool MLock = 25;
131
+ bool MMap = 26;
132
+ bool PromptCacheAll = 27;
133
+ bool PromptCacheRO = 28;
134
+ string Grammar = 29;
135
+ string MainGPU = 30;
136
+ string TensorSplit = 31;
137
+ float TopP = 32;
138
+ string PromptCachePath = 33;
139
+ bool Debug = 34;
140
+ repeated int32 EmbeddingTokens = 35;
141
+ string Embeddings = 36;
142
+ float RopeFreqBase = 37;
143
+ float RopeFreqScale = 38;
144
+ float NegativePromptScale = 39;
145
+ string NegativePrompt = 40;
146
+ int32 NDraft = 41;
147
+ repeated string Images = 42;
148
+ bool UseTokenizerTemplate = 43;
149
+ repeated Message Messages = 44;
150
+ repeated string Videos = 45;
151
+ repeated string Audios = 46;
152
+ string CorrelationId = 47;
153
+ }
154
+
155
+ // The response message containing the result
156
+ message Reply {
157
+ bytes message = 1;
158
+ int32 tokens = 2;
159
+ int32 prompt_tokens = 3;
160
+ }
161
+
162
+ message ModelOptions {
163
+ string Model = 1;
164
+ int32 ContextSize = 2;
165
+ int32 Seed = 3;
166
+ int32 NBatch = 4;
167
+ bool F16Memory = 5;
168
+ bool MLock = 6;
169
+ bool MMap = 7;
170
+ bool VocabOnly = 8;
171
+ bool LowVRAM = 9;
172
+ bool Embeddings = 10;
173
+ bool NUMA = 11;
174
+ int32 NGPULayers = 12;
175
+ string MainGPU = 13;
176
+ string TensorSplit = 14;
177
+ int32 Threads = 15;
178
+ string LibrarySearchPath = 16;
179
+ float RopeFreqBase = 17;
180
+ float RopeFreqScale = 18;
181
+ float RMSNormEps = 19;
182
+ int32 NGQA = 20;
183
+ string ModelFile = 21;
184
+
185
+ // AutoGPTQ
186
+ string Device = 22;
187
+ bool UseTriton = 23;
188
+ string ModelBaseName = 24;
189
+ bool UseFastTokenizer = 25;
190
+
191
+ // Diffusers
192
+ string PipelineType = 26;
193
+ string SchedulerType = 27;
194
+ bool CUDA = 28;
195
+ float CFGScale = 29;
196
+ bool IMG2IMG = 30;
197
+ string CLIPModel = 31;
198
+ string CLIPSubfolder = 32;
199
+ int32 CLIPSkip = 33;
200
+ string ControlNet = 48;
201
+
202
+ string Tokenizer = 34;
203
+
204
+ // LLM (llama.cpp)
205
+ string LoraBase = 35;
206
+ string LoraAdapter = 36;
207
+ float LoraScale = 42;
208
+
209
+ bool NoMulMatQ = 37;
210
+ string DraftModel = 39;
211
+
212
+ string AudioPath = 38;
213
+
214
+ // vllm
215
+ string Quantization = 40;
216
+ float GPUMemoryUtilization = 50;
217
+ bool TrustRemoteCode = 51;
218
+ bool EnforceEager = 52;
219
+ int32 SwapSpace = 53;
220
+ int32 MaxModelLen = 54;
221
+ int32 TensorParallelSize = 55;
222
+ string LoadFormat = 58;
223
+
224
+ string MMProj = 41;
225
+
226
+ string RopeScaling = 43;
227
+ float YarnExtFactor = 44;
228
+ float YarnAttnFactor = 45;
229
+ float YarnBetaFast = 46;
230
+ float YarnBetaSlow = 47;
231
+
232
+ string Type = 49;
233
+
234
+ bool FlashAttention = 56;
235
+ bool NoKVOffload = 57;
236
+
237
+ string ModelPath = 59;
238
+
239
+ repeated string LoraAdapters = 60;
240
+ repeated float LoraScales = 61;
241
+ }
242
+
243
+ message Result {
244
+ string message = 1;
245
+ bool success = 2;
246
+ }
247
+
248
+ message EmbeddingResult {
249
+ repeated float embeddings = 1;
250
+ }
251
+
252
+ message TranscriptRequest {
253
+ string dst = 2;
254
+ string language = 3;
255
+ uint32 threads = 4;
256
+ bool translate = 5;
257
+ }
258
+
259
+ message TranscriptResult {
260
+ repeated TranscriptSegment segments = 1;
261
+ string text = 2;
262
+ }
263
+
264
+ message TranscriptSegment {
265
+ int32 id = 1;
266
+ int64 start = 2;
267
+ int64 end = 3;
268
+ string text = 4;
269
+ repeated int32 tokens = 5;
270
+ }
271
+
272
+ message GenerateImageRequest {
273
+ int32 height = 1;
274
+ int32 width = 2;
275
+ int32 mode = 3;
276
+ int32 step = 4;
277
+ int32 seed = 5;
278
+ string positive_prompt = 6;
279
+ string negative_prompt = 7;
280
+ string dst = 8;
281
+ string src = 9;
282
+
283
+ // Diffusers
284
+ string EnableParameters = 10;
285
+ int32 CLIPSkip = 11;
286
+ }
287
+
288
+ message TTSRequest {
289
+ string text = 1;
290
+ string model = 2;
291
+ string dst = 3;
292
+ string voice = 4;
293
+ optional string language = 5;
294
+ }
295
+
296
+ message SoundGenerationRequest {
297
+ string text = 1;
298
+ string model = 2;
299
+ string dst = 3;
300
+ optional float duration = 4;
301
+ optional float temperature = 5;
302
+ optional bool sample = 6;
303
+ optional string src = 7;
304
+ optional int32 src_divisor = 8;
305
+ }
306
+
307
+ message TokenizationResponse {
308
+ int32 length = 1;
309
+ repeated int32 tokens = 2;
310
+ }
311
+
312
+ message MemoryUsageData {
313
+ uint64 total = 1;
314
+ map<string, uint64> breakdown = 2;
315
+ }
316
+
317
+ message StatusResponse {
318
+ enum State {
319
+ UNINITIALIZED = 0;
320
+ BUSY = 1;
321
+ READY = 2;
322
+ ERROR = -1;
323
+ }
324
+ State state = 1;
325
+ MemoryUsageData memory = 2;
326
+ }
327
+
328
+ message Message {
329
+ string role = 1;
330
+ string content = 2;
331
+ }
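The proto above is the contract every backend implements; for example, the Health RPC takes an empty HealthMessage and returns a Reply. A quick sketch of poking a running backend with grpcurl, assuming the backend listens on 127.0.0.1:50051 and has gRPC reflection enabled (both the address and the reflection setting are assumptions, chosen by whoever launches the backend):

grpcurl -plaintext -d '{}' 127.0.0.1:50051 backend.Backend/Health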
backend/cpp/grpc/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ installed_packages/
2
+ grpc_build/
3
+ grpc_repo/
backend/cpp/grpc/Makefile ADDED
@@ -0,0 +1,70 @@
1
+ # Basic platform detection
2
+ HOST_SYSTEM = $(shell uname | cut -f 1 -d_)
3
+ SYSTEM ?= $(HOST_SYSTEM)
4
+
5
+ TAG_LIB_GRPC?=v1.59.0
6
+ GIT_REPO_LIB_GRPC?=https://github.com/grpc/grpc.git
7
+ GIT_CLONE_DEPTH?=1
8
+
9
+ INSTALLED_PACKAGES=installed_packages
10
+ GRPC_REPO=grpc_repo
11
+ GRPC_BUILD=grpc_build
12
+
13
+ export CMAKE_ARGS?=
14
+ CMAKE_ARGS+=-DCMAKE_BUILD_TYPE=Release
15
+ CMAKE_ARGS+=-DgRPC_INSTALL=ON
16
+ CMAKE_ARGS+=-DEXECUTABLE_OUTPUT_PATH=../$(INSTALLED_PACKAGES)/grpc/bin
17
+ CMAKE_ARGS+=-DLIBRARY_OUTPUT_PATH=../$(INSTALLED_PACKAGES)/grpc/lib
18
+ CMAKE_ARGS+=-DgRPC_BUILD_TESTS=OFF
19
+ CMAKE_ARGS+=-DgRPC_BUILD_CSHARP_EXT=OFF
20
+ CMAKE_ARGS+=-DgRPC_BUILD_GRPC_CPP_PLUGIN=ON
21
+ CMAKE_ARGS+=-DgRPC_BUILD_GRPC_CSHARP_PLUGIN=OFF
22
+ CMAKE_ARGS+=-DgRPC_BUILD_GRPC_NODE_PLUGIN=OFF
23
+ CMAKE_ARGS+=-DgRPC_BUILD_GRPC_OBJECTIVE_C_PLUGIN=OFF
24
+ CMAKE_ARGS+=-DgRPC_BUILD_GRPC_PHP_PLUGIN=OFF
25
+ CMAKE_ARGS+=-DgRPC_BUILD_GRPC_PYTHON_PLUGIN=ON
26
+ CMAKE_ARGS+=-DgRPC_BUILD_GRPC_RUBY_PLUGIN=OFF
27
+ CMAKE_ARGS+=-Dprotobuf_WITH_ZLIB=ON
28
+ CMAKE_ARGS+=-DRE2_BUILD_TESTING=OFF
29
+ CMAKE_ARGS+=-DCMAKE_INSTALL_PREFIX=../$(INSTALLED_PACKAGES)
30
+
31
+ # Windows needs OPENSSL_NO_ASM set. This results in slower crypto performance, but the build fails otherwise.
32
+ # May be resolvable, but for now it's set. More info: https://stackoverflow.com/a/75240504/480673
33
+ ifeq ($(SYSTEM),MSYS)
34
+ CMAKE_ARGS+=-DOPENSSL_NO_ASM=ON
35
+ endif
36
+ ifeq ($(SYSTEM),MINGW64)
37
+ CMAKE_ARGS+=-DOPENSSL_NO_ASM=ON
38
+ endif
39
+ ifeq ($(SYSTEM),MINGW32)
40
+ CMAKE_ARGS+=-DOPENSSL_NO_ASM=ON
41
+ endif
42
+ ifeq ($(SYSTEM),CYGWIN)
43
+ CMAKE_ARGS+=-DOPENSSL_NO_ASM=ON
44
+ endif
45
+
46
+ $(INSTALLED_PACKAGES): grpc_build
47
+
48
+ $(GRPC_REPO):
49
+ mkdir -p $(GRPC_REPO)/grpc
50
+ cd $(GRPC_REPO)/grpc && \
51
+ git init && \
52
+ git remote add origin $(GIT_REPO_LIB_GRPC) && \
53
+ git fetch origin && \
54
+ git checkout $(TAG_LIB_GRPC) && \
55
+ git submodule update --init --recursive --depth 1 --single-branch
56
+
57
+ $(GRPC_BUILD): $(GRPC_REPO)
58
+ mkdir -p $(GRPC_BUILD)
59
+ cd $(GRPC_BUILD) && cmake $(CMAKE_ARGS) ../$(GRPC_REPO)/grpc && cmake --build . && cmake --build . --target install
60
+
61
+ build: $(INSTALLED_PACKAGES)
62
+
63
+ rebuild:
64
+ rm -rf grpc_build
65
+ $(MAKE) grpc_build
66
+
67
+ clean:
68
+ rm -rf grpc_build
69
+ rm -rf grpc_repo
70
+ rm -rf installed_packages
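The Makefile above builds a static gRPC toolchain into installed_packages/. A short sketch of driving it from the repository root (the -C path assumes the layout shown in this changeset):

make -C backend/cpp/grpc build     # clone, configure and install gRPC under installed_packages/
make -C backend/cpp/grpc clean     # drop grpc_repo/, grpc_build/ and installed_packages/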
backend/cpp/llama/CMakeLists.txt ADDED
@@ -0,0 +1,86 @@
1
+
2
+ ## XXX: In some versions of CMake clip wasn't being built before llama.
3
+ ## This is a hack for now, but it should be fixed in the future.
4
+ set(TARGET myclip)
5
+ add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
6
+ install(TARGETS ${TARGET} LIBRARY)
7
+ target_include_directories(myclip PUBLIC .)
8
+ target_include_directories(myclip PUBLIC ../..)
9
+ target_include_directories(myclip PUBLIC ../../common)
10
+ target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
11
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
12
+ if (NOT MSVC)
13
+ target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
14
+ endif()
15
+ # END CLIP hack
16
+
17
+
18
+ set(TARGET grpc-server)
19
+ set(CMAKE_CXX_STANDARD 17)
20
+ cmake_minimum_required(VERSION 3.15)
21
+ set(TARGET grpc-server)
22
+ set(_PROTOBUF_LIBPROTOBUF libprotobuf)
23
+ set(_REFLECTION grpc++_reflection)
24
+
25
+ if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
26
+ # Set correct Homebrew install folder for Apple Silicon and Intel Macs
27
+ if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "arm64")
28
+ set(HOMEBREW_DEFAULT_PREFIX "/opt/homebrew")
29
+ else()
30
+ set(HOMEBREW_DEFAULT_PREFIX "/usr/local")
31
+ endif()
32
+
33
+ link_directories("${HOMEBREW_DEFAULT_PREFIX}/lib")
34
+ include_directories("${HOMEBREW_DEFAULT_PREFIX}/include")
35
+ endif()
36
+
37
+ find_package(absl CONFIG REQUIRED)
38
+ find_package(Protobuf CONFIG REQUIRED)
39
+ find_package(gRPC CONFIG REQUIRED)
40
+
41
+ find_program(_PROTOBUF_PROTOC protoc)
42
+ set(_GRPC_GRPCPP grpc++)
43
+ find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
44
+
45
+ include_directories(${CMAKE_CURRENT_BINARY_DIR})
46
+ include_directories(${Protobuf_INCLUDE_DIRS})
47
+
48
+ message(STATUS "Using protobuf version ${Protobuf_VERSION} | Protobuf_INCLUDE_DIRS: ${Protobuf_INCLUDE_DIRS} | CMAKE_CURRENT_BINARY_DIR: ${CMAKE_CURRENT_BINARY_DIR}")
49
+
50
+ # Proto file
51
+ get_filename_component(hw_proto "../../../../../../backend/backend.proto" ABSOLUTE)
52
+ get_filename_component(hw_proto_path "${hw_proto}" PATH)
53
+
54
+ # Generated sources
55
+ set(hw_proto_srcs "${CMAKE_CURRENT_BINARY_DIR}/backend.pb.cc")
56
+ set(hw_proto_hdrs "${CMAKE_CURRENT_BINARY_DIR}/backend.pb.h")
57
+ set(hw_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/backend.grpc.pb.cc")
58
+ set(hw_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/backend.grpc.pb.h")
59
+
60
+ add_custom_command(
61
+ OUTPUT "${hw_proto_srcs}" "${hw_proto_hdrs}" "${hw_grpc_srcs}" "${hw_grpc_hdrs}"
62
+ COMMAND ${_PROTOBUF_PROTOC}
63
+ ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}"
64
+ --cpp_out "${CMAKE_CURRENT_BINARY_DIR}"
65
+ -I "${hw_proto_path}"
66
+ --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}"
67
+ "${hw_proto}"
68
+ DEPENDS "${hw_proto}")
69
+
70
+ # hw_grpc_proto
71
+ add_library(hw_grpc_proto
72
+ ${hw_grpc_srcs}
73
+ ${hw_grpc_hdrs}
74
+ ${hw_proto_srcs}
75
+ ${hw_proto_hdrs} )
76
+
77
+ add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp)
78
+ target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
79
+ absl::flags_parse
80
+ gRPC::${_REFLECTION}
81
+ gRPC::${_GRPC_GRPCPP}
82
+ protobuf::${_PROTOBUF_LIBPROTOBUF})
83
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
84
+ if(TARGET BUILD_INFO)
85
+ add_dependencies(${TARGET} BUILD_INFO)
86
+ endif()
backend/cpp/llama/Makefile ADDED
@@ -0,0 +1,82 @@
1
+
2
+ LLAMA_VERSION?=
3
+ LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
4
+
5
+ CMAKE_ARGS?=
6
+ BUILD_TYPE?=
7
+ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
8
+ TARGET?=--target grpc-server
9
+
10
+ # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
11
+ CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
12
+
13
+ # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
14
+ ifeq ($(BUILD_TYPE),cublas)
15
+ CMAKE_ARGS+=-DGGML_CUDA=ON
16
+ # If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
17
+ # to CMAKE_ARGS automatically
18
+ else ifeq ($(BUILD_TYPE),openblas)
19
+ CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
20
+ # If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
21
+ else ifeq ($(BUILD_TYPE),clblas)
22
+ CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
23
+ # If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
24
+ else ifeq ($(BUILD_TYPE),hipblas)
25
+ CMAKE_ARGS+=-DGGML_HIPBLAS=ON
26
+ # If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
27
+ # But if it's OSX without metal, disable it here
28
+ else ifeq ($(OS),Darwin)
29
+ ifneq ($(BUILD_TYPE),metal)
30
+ CMAKE_ARGS+=-DGGML_METAL=OFF
31
+ else
32
+ CMAKE_ARGS+=-DGGML_METAL=ON
33
+ # Until this is tested properly, we disable embedded metal file
34
+ # as we already embed it as part of the LocalAI assets
35
+ CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=OFF
36
+ TARGET+=--target ggml-metal
37
+ endif
38
+ endif
39
+
40
+ ifeq ($(BUILD_TYPE),sycl_f16)
41
+ CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
42
+ endif
43
+
44
+ ifeq ($(BUILD_TYPE),sycl_f32)
45
+ CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
46
+ endif
47
+
48
+ llama.cpp:
49
+ mkdir -p llama.cpp
50
+ cd llama.cpp && \
51
+ git init && \
52
+ git remote add origin $(LLAMA_REPO) && \
53
+ git fetch origin && \
54
+ git checkout -b build $(LLAMA_VERSION) && \
55
+ git submodule update --init --recursive --depth 1 --single-branch
56
+
57
+ llama.cpp/examples/grpc-server: llama.cpp
58
+ mkdir -p llama.cpp/examples/grpc-server
59
+ bash prepare.sh
60
+
61
+ rebuild:
62
+ bash prepare.sh
63
+ rm -rf grpc-server
64
+ $(MAKE) grpc-server
65
+
66
+ purge:
67
+ rm -rf llama.cpp/build
68
+ rm -rf llama.cpp/examples/grpc-server
69
+ rm -rf grpc-server
70
+
71
+ clean: purge
72
+ rm -rf llama.cpp
73
+
74
+ grpc-server: llama.cpp llama.cpp/examples/grpc-server
75
+ @echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
76
+ ifneq (,$(findstring sycl,$(BUILD_TYPE)))
77
+ +bash -c "source $(ONEAPI_VARS); \
78
+ cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)"
79
+ else
80
+ +cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)
81
+ endif
82
+ cp llama.cpp/build/bin/grpc-server .
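Since LLAMA_VERSION defaults to empty and the checkout step above pins llama.cpp to it, the grpc-server target needs a tag or commit to be supplied. A minimal sketch, with BUILD_TYPE picked from the options handled earlier in this Makefile (the llama.cpp ref is a placeholder, not a value taken from this changeset):

LLAMA_VERSION=<llama.cpp tag or commit> BUILD_TYPE=cublas make -C backend/cpp/llama grpc-server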
backend/cpp/llama/grpc-server.cpp ADDED
@@ -0,0 +1,2537 @@
1
+ // llama.cpp gRPC C++ backend server
2
+ //
3
+ // Ettore Di Giacinto <mudler@localai.io> and llama.cpp authors
4
+ //
5
+ // This is a gRPC server for llama.cpp compatible with the LocalAI proto
6
+ // Note: this is a re-adaptation of the original llama.cpp example/server.cpp for HTTP (https://github.com/ggerganov/llama.cpp/tree/master/examples/server),
7
+ // but modified to work with gRPC
8
+ //
9
+
10
+ #include <iostream>
11
+ #include <memory>
12
+ #include <string>
13
+ #include <getopt.h>
14
+ #include "clip.h"
15
+ #include "llava.h"
16
+ #include "log.h"
17
+ #include "stb_image.h"
18
+ #include "common.h"
19
+ #include "json.hpp"
20
+ #include "llama.h"
21
+ #include "backend.pb.h"
22
+ #include "backend.grpc.pb.h"
23
+ #include "utils.hpp"
24
+ #include "sampling.h"
25
+ // include std::regex
26
+ #include <cstddef>
27
+ #include <thread>
28
+ #include <mutex>
29
+ #include <chrono>
30
+ #include <regex>
31
+ #include <condition_variable>
32
+ #include <grpcpp/ext/proto_server_reflection_plugin.h>
33
+ #include <grpcpp/grpcpp.h>
34
+ #include <grpcpp/health_check_service_interface.h>
35
+ #include <atomic>
36
+ #include <signal.h>
37
+
38
+ using grpc::Server;
39
+ using grpc::ServerBuilder;
40
+ using grpc::ServerContext;
41
+ using grpc::Status;
42
+
43
+
44
+ using backend::HealthMessage;
45
+
46
+
47
+ ///// LLAMA.CPP server code below
48
+
49
+ using json = nlohmann::json;
50
+
51
+ struct server_params
52
+ {
53
+ std::string hostname = "127.0.0.1";
54
+ std::vector<std::string> api_keys;
55
+ std::string public_path = "examples/server/public";
56
+ std::string chat_template = "";
57
+ int32_t port = 8080;
58
+ int32_t read_timeout = 600;
59
+ int32_t write_timeout = 600;
60
+ bool slots_endpoint = true;
61
+ bool metrics_endpoint = false;
62
+ };
63
+
64
+ bool server_verbose = false;
65
+ bool server_log_json = true;
66
+
67
+ static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
68
+ {
69
+ size_t i;
70
+ for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
71
+ {
72
+ }
73
+ return i;
74
+ }
75
+
76
+ enum stop_type
77
+ {
78
+ STOP_FULL,
79
+ STOP_PARTIAL,
80
+ };
81
+
82
+ static bool ends_with(const std::string &str, const std::string &suffix)
83
+ {
84
+ return str.size() >= suffix.size() &&
85
+ 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
86
+ }
87
+
88
+ static size_t find_partial_stop_string(const std::string &stop,
89
+ const std::string &text)
90
+ {
91
+ if (!text.empty() && !stop.empty())
92
+ {
93
+ const char text_last_char = text.back();
94
+ for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
95
+ {
96
+ if (stop[char_index] == text_last_char)
97
+ {
98
+ const std::string current_partial = stop.substr(0, char_index + 1);
99
+ if (ends_with(text, current_partial))
100
+ {
101
+ return text.size() - char_index - 1;
102
+ }
103
+ }
104
+ }
105
+ }
106
+ return std::string::npos;
107
+ }
108
+
109
+ // TODO: reuse llama_detokenize
110
+ template <class Iter>
111
+ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
112
+ {
113
+ std::string ret;
114
+ for (; begin != end; ++begin)
115
+ {
116
+ ret += common_token_to_piece(ctx, *begin);
117
+ }
118
+ return ret;
119
+ }
120
+
121
+ // format incomplete utf-8 multibyte character for output
122
+ static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
123
+ {
124
+ std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
125
+ // if the size is 1 and first bit is 1, meaning it's a partial character
126
+ // (size > 1 meaning it's already a known token)
127
+ if (out.size() == 1 && (out[0] & 0x80) == 0x80)
128
+ {
129
+ std::stringstream ss;
130
+ ss << std::hex << (out[0] & 0xff);
131
+ std::string res(ss.str());
132
+ out = "byte: \\x" + res;
133
+ }
134
+ return out;
135
+ }
136
+
137
+ // convert a vector of completion_token_output to json
138
+ static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs)
139
+ {
140
+ json out = json::array();
141
+ for (const auto &prob : probs)
142
+ {
143
+ json probs_for_token = json::array();
144
+ for (const auto &p : prob.probs)
145
+ {
146
+ std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
147
+ probs_for_token.push_back(json
148
+ {
149
+ {"tok_str", tok_str},
150
+ {"prob", p.prob},
151
+ });
152
+ }
153
+ std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
154
+ out.push_back(json{
155
+ {"content", tok_str},
156
+ {"probs", probs_for_token},
157
+ });
158
+ }
159
+ return out;
160
+ }
161
+
162
+ struct llama_client_slot
163
+ {
164
+ int id;
165
+ int task_id = -1;
166
+
167
+ struct slot_params params;
168
+
169
+ slot_state state = IDLE;
170
+ slot_command command = NONE;
171
+
172
+ // used to determine the slot that has been used the longest
173
+ int64_t t_last_used = -1;
174
+
175
+ // generation props
176
+ int32_t n_ctx = 0; // context size per slot
177
+ int32_t n_past = 0;
178
+ int32_t n_decoded = 0;
179
+ int32_t n_remaining = -1;
180
+ int32_t i_batch = -1;
181
+ int32_t n_predict = -1;
182
+
183
+ int32_t num_prompt_tokens = 0;
184
+ int32_t num_prompt_tokens_processed = 0;
185
+
186
+ json prompt;
187
+ std::string generated_text;
188
+ llama_token sampled;
189
+ std::vector<llama_token> cache_tokens;
190
+ std::vector<completion_token_output> generated_token_probs;
191
+
192
+ bool infill = false;
193
+ bool embedding = false;
194
+ bool has_next_token = true;
195
+ bool truncated = false;
196
+ bool stopped_eos = false;
197
+ bool stopped_word = false;
198
+ bool stopped_limit = false;
199
+
200
+ bool oaicompat = false;
201
+ std::string oaicompat_model;
202
+
203
+ std::string stopping_word;
204
+
205
+ // sampling
206
+ struct common_sampler_params sparams;
207
+ common_sampler *ctx_sampling = nullptr;
208
+
209
+ int32_t ga_i = 0; // group-attention state
210
+ int32_t ga_n = 1; // group-attention factor
211
+ int32_t ga_w = 512; // group-attention width
212
+
213
+ int32_t n_past_se = 0; // self-extend
214
+
215
+ // multimodal
216
+ std::vector<slot_image> images;
217
+
218
+ // stats
219
+ size_t sent_count = 0;
220
+ size_t sent_token_probs_index = 0;
221
+
222
+ int64_t t_start_process_prompt;
223
+ int64_t t_start_genereration;
224
+
225
+ double t_prompt_processing; // ms
226
+ double t_token_generation; // ms
227
+
228
+ // multitasks
229
+ int multitask_id = -1;
230
+
231
+ void reset() {
232
+ num_prompt_tokens = 0;
233
+ generated_text = "";
234
+ truncated = false;
235
+ stopped_eos = false;
236
+ stopped_word = false;
237
+ stopped_limit = false;
238
+ stopping_word = "";
239
+ n_past = 0;
240
+ sent_count = 0;
241
+ sent_token_probs_index = 0;
242
+ infill = false;
243
+ ga_i = 0;
244
+ n_past_se = 0;
245
+
246
+ generated_token_probs.clear();
247
+
248
+ for (slot_image & img : images)
249
+ {
250
+ free(img.image_embedding);
251
+ if (img.img_data) {
252
+ clip_image_u8_free(img.img_data);
253
+ }
254
+ img.prefix_prompt = "";
255
+ }
256
+
257
+ images.clear();
258
+ }
259
+
260
+ bool has_budget(common_params &global_params) {
261
+ if (params.n_predict == -1 && global_params.n_predict == -1)
262
+ {
263
+ return true; // limitless
264
+ }
265
+
266
+ n_remaining = -1;
267
+
268
+ if (params.n_predict != -1)
269
+ {
270
+ n_remaining = params.n_predict - n_decoded;
271
+ }
272
+ else if (global_params.n_predict != -1)
273
+ {
274
+ n_remaining = global_params.n_predict - n_decoded;
275
+ }
276
+
277
+ return n_remaining > 0; // no budget
278
+ }
279
+
280
+ bool available() const {
281
+ return state == IDLE && command == NONE;
282
+ }
283
+
284
+ bool is_processing() const {
285
+ return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING;
286
+ }
287
+
288
+ void add_token_string(const completion_token_output &token) {
289
+ if (command == RELEASE)
290
+ {
291
+ return;
292
+ }
293
+ cache_tokens.push_back(token.tok);
294
+ generated_token_probs.push_back(token);
295
+ }
296
+
297
+ void release() {
298
+ if (state == PROCESSING)
299
+ {
300
+ t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
301
+ command = RELEASE;
302
+ }
303
+ }
304
+
305
+ json get_formated_timings() {
306
+ return json
307
+ {
308
+ {"prompt_n", num_prompt_tokens_processed},
309
+ {"prompt_ms", t_prompt_processing},
310
+ {"prompt_per_token_ms", t_prompt_processing / num_prompt_tokens_processed},
311
+ {"prompt_per_second", 1e3 / t_prompt_processing * num_prompt_tokens_processed},
312
+
313
+ {"predicted_n", n_decoded},
314
+ {"predicted_ms", t_token_generation},
315
+ {"predicted_per_token_ms", t_token_generation / n_decoded},
316
+ {"predicted_per_second", 1e3 / t_token_generation * n_decoded},
317
+ };
318
+ }
319
+
320
+ void print_timings() const {
321
+ char buffer[512];
322
+ double t_token = t_prompt_processing / num_prompt_tokens_processed;
323
+ double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed;
324
+ sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
325
+ t_prompt_processing, num_prompt_tokens_processed,
326
+ t_token, n_tokens_second);
327
+ LOG_INFO(buffer, {
328
+ {"slot_id", id},
329
+ {"task_id", task_id},
330
+ {"t_prompt_processing", t_prompt_processing},
331
+ {"num_prompt_tokens_processed", num_prompt_tokens_processed},
332
+ {"t_token", t_token},
333
+ {"n_tokens_second", n_tokens_second},
334
+ });
335
+
336
+ t_token = t_token_generation / n_decoded;
337
+ n_tokens_second = 1e3 / t_token_generation * n_decoded;
338
+ sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
339
+ t_token_generation, n_decoded,
340
+ t_token, n_tokens_second);
341
+ LOG_INFO(buffer, {
342
+ {"slot_id", id},
343
+ {"task_id", task_id},
344
+ {"t_token_generation", t_token_generation},
345
+ {"n_decoded", n_decoded},
346
+ {"t_token", t_token},
347
+ {"n_tokens_second", n_tokens_second},
348
+ });
349
+
350
+ sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
351
+ LOG_INFO(buffer, {
352
+ {"slot_id", id},
353
+ {"task_id", task_id},
354
+ {"t_prompt_processing", t_prompt_processing},
355
+ {"t_token_generation", t_token_generation},
356
+ {"t_total", t_prompt_processing + t_token_generation},
357
+ });
358
+ }
359
+ };
360
+
361
+ struct llama_metrics {
362
+ uint64_t n_prompt_tokens_processed_total = 0;
363
+ uint64_t n_tokens_predicted_total = 0;
364
+
365
+ uint64_t n_prompt_tokens_processed = 0;
366
+ uint64_t t_prompt_processing = 0;
367
+
368
+ uint64_t n_tokens_predicted = 0;
369
+ uint64_t t_tokens_generation = 0;
370
+
371
+
372
+ void on_prompt_eval(const llama_client_slot &slot) {
373
+ n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed;
374
+
375
+ n_prompt_tokens_processed += slot.num_prompt_tokens_processed;
376
+ t_prompt_processing += slot.t_prompt_processing;
377
+ }
378
+
379
+ void on_prediction(const llama_client_slot &slot) {
380
+ n_tokens_predicted_total += slot.n_decoded;
381
+
382
+ n_tokens_predicted += slot.n_decoded;
383
+ t_tokens_generation += slot.t_token_generation;
384
+ }
385
+
386
+ void reset_bucket() {
387
+ n_prompt_tokens_processed = 0;
388
+ t_prompt_processing = 0;
389
+ n_tokens_predicted = 0;
390
+ t_tokens_generation = 0;
391
+ }
392
+ };
393
+
394
+ struct llava_embd_batch {
395
+ std::vector<llama_pos> pos;
396
+ std::vector<int32_t> n_seq_id;
397
+ std::vector<llama_seq_id> seq_id_0;
398
+ std::vector<llama_seq_id *> seq_ids;
399
+ std::vector<int8_t> logits;
400
+ llama_batch batch;
401
+ llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
402
+ pos .resize(n_tokens);
403
+ n_seq_id.resize(n_tokens);
404
+ seq_ids .resize(n_tokens + 1);
405
+ logits .resize(n_tokens);
406
+ seq_id_0.resize(1);
407
+ seq_id_0[0] = seq_id;
408
+ seq_ids [n_tokens] = nullptr;
409
+ batch = {
410
+ /*n_tokens =*/ n_tokens,
411
+ /*tokens =*/ nullptr,
412
+ /*embd =*/ embd,
413
+ /*pos =*/ pos.data(),
414
+ /*n_seq_id =*/ n_seq_id.data(),
415
+ /*seq_id =*/ seq_ids.data(),
416
+ /*logits =*/ logits.data(),
417
+ };
418
+ for (int i = 0; i < n_tokens; i++) {
419
+ batch.pos [i] = pos_0 + i;
420
+ batch.n_seq_id[i] = 1;
421
+ batch.seq_id [i] = seq_id_0.data();
422
+ batch.logits [i] = false;
423
+ }
424
+ }
425
+ };
426
+
427
+ struct llama_server_context
428
+ {
429
+ llama_model *model = nullptr;
430
+ llama_context *ctx = nullptr;
431
+
432
+ clip_ctx *clp_ctx = nullptr;
433
+
434
+ common_params params;
435
+
436
+ llama_batch batch;
437
+
438
+ bool multimodal = false;
439
+ bool clean_kv_cache = true;
440
+ bool all_slots_are_idle = false;
441
+ bool add_bos_token = true;
442
+
443
+ int32_t n_ctx; // total context for all clients / slots
444
+
445
+ // system prompt
446
+ bool system_need_update = false;
447
+
448
+ std::string system_prompt;
449
+ std::vector<llama_token> system_tokens;
450
+
451
+ std::string name_user; // this should be the antiprompt
452
+ std::string name_assistant;
453
+
454
+ // slots / clients
455
+ std::vector<llama_client_slot> slots;
456
+ json default_generation_settings_for_props;
457
+
458
+ llama_server_queue queue_tasks;
459
+ llama_server_response queue_results;
460
+
461
+ llama_metrics metrics;
462
+
463
+ ~llama_server_context()
464
+ {
465
+ if (ctx)
466
+ {
467
+ llama_free(ctx);
468
+ ctx = nullptr;
469
+ }
470
+ if (model)
471
+ {
472
+ llama_free_model(model);
473
+ model = nullptr;
474
+ }
475
+ }
476
+
477
+ bool load_model(const common_params &params_)
478
+ {
479
+ params = params_;
480
+ if (!params.mmproj.empty()) {
481
+ multimodal = true;
482
+ LOG_INFO("Multi Modal Mode Enabled", {});
483
+ clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
484
+ if(clp_ctx == nullptr) {
485
+ LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
486
+ return false;
487
+ }
488
+
489
+ if (params.n_ctx < 2048) { // request larger context for the image embedding
490
+ params.n_ctx = 2048;
491
+ }
492
+ }
493
+
494
+ common_init_result common_init = common_init_from_params(params);
495
+ model = common_init.model;
496
+ ctx = common_init.context;
497
+ if (model == nullptr)
498
+ {
499
+ LOG_ERR("unable to load model: %s", params.model.c_str());
500
+ return false;
501
+ }
502
+
503
+ if (multimodal) {
504
+ const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
505
+ const int n_embd_llm = llama_n_embd(model);
506
+ if (n_embd_clip != n_embd_llm) {
507
+ LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
508
+ llama_free(ctx);
509
+ llama_free_model(model);
510
+ return false;
511
+ }
512
+ }
513
+
514
+ n_ctx = llama_n_ctx(ctx);
515
+
516
+ add_bos_token = llama_add_bos_token(model);
517
+
518
+ return true;
519
+ }
520
+
521
+ void validate_model_chat_template(server_params & sparams) {
522
+ llama_chat_message chat[] = {{"user", "test"}};
523
+ std::vector<char> buf(1);
524
+ int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
525
+ if (res < 0) {
526
+ LOG_ERR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", __func__);
527
+ sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exists in the template
528
+ }
529
+ }
530
+
531
+ llama_client_slot* get_active_slot() {
532
+ for (llama_client_slot& slot : slots) {
533
+ // Check if the slot is currently processing
534
+ if (slot.is_processing()) {
535
+ return &slot; // Return the active slot
536
+ }
537
+ }
538
+ return nullptr; // No active slot found
539
+ }
540
+
541
+ void initialize() {
542
+ // create slots
543
+ all_slots_are_idle = true;
544
+
545
+ const int32_t n_ctx_slot = n_ctx / params.n_parallel;
546
+
547
+ LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
548
+ for (int i = 0; i < params.n_parallel; i++)
549
+ {
550
+ llama_client_slot slot;
551
+
552
+ slot.id = i;
553
+ slot.n_ctx = n_ctx_slot;
554
+ slot.n_predict = params.n_predict;
555
+
556
+ LOG_INFO("new slot", {
557
+ {"slot_id", slot.id},
558
+ {"n_ctx_slot", slot.n_ctx}
559
+ });
560
+
561
+ const int ga_n = params.grp_attn_n;
562
+ const int ga_w = params.grp_attn_w;
563
+
564
+ if (ga_n != 1) {
565
+ GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
566
+ GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
567
+ //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
568
+ //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
569
+
570
+ LOG_INFO("slot self-extend", {
571
+ {"slot_id", slot.id},
572
+ {"ga_n", ga_n},
573
+ {"ga_w", ga_w}
574
+ });
575
+ }
576
+
577
+ slot.ga_i = 0;
578
+ slot.ga_n = ga_n;
579
+ slot.ga_w = ga_w;
580
+
581
+ slot.reset();
582
+
583
+ slots.push_back(slot);
584
+ }
585
+
586
+ default_generation_settings_for_props = get_formated_generation(slots.front());
587
+ default_generation_settings_for_props["seed"] = -1;
588
+
589
+ batch = llama_batch_init(n_ctx, 0, params.n_parallel);
590
+ }
591
+
592
+ std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
593
+ {
594
+ // TODO: currently, we tokenize using special tokens by default
595
+ // this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
596
+ // but it's better compared to completely ignoring ChatML and other chat templates
597
+ const bool TMP_FORCE_SPECIAL = true;
598
+
599
+ // If `add_bos` is true, we only add BOS when json_prompt is a string,
600
+ // or the first element of the json_prompt array is a string.
601
+ std::vector<llama_token> prompt_tokens;
602
+
603
+ if (json_prompt.is_array())
604
+ {
605
+ bool first = true;
606
+ for (const auto& p : json_prompt)
607
+ {
608
+ if (p.is_string())
609
+ {
610
+ auto s = p.template get<std::string>();
611
+ std::vector<llama_token> p;
612
+ if (first)
613
+ {
614
+ p = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
615
+ first = false;
616
+ }
617
+ else
618
+ {
619
+ p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
620
+ }
621
+ prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
622
+ }
623
+ else
624
+ {
625
+ if (first)
626
+ {
627
+ first = false;
628
+ }
629
+ prompt_tokens.push_back(p.template get<llama_token>());
630
+ }
631
+ }
632
+ }
633
+ else
634
+ {
635
+ auto s = json_prompt.template get<std::string>();
636
+ prompt_tokens = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
637
+ }
638
+
639
+ return prompt_tokens;
640
+ }
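+ // Illustrative example for tokenize() above: a mixed prompt array such as
+ //   ["Answer briefly: ", 12, 345, " please"]   (token ids chosen arbitrarily)
+ // is processed element by element - strings go through common_tokenize() and
+ // integers are appended verbatim as token ids; BOS is only considered for the
+ // first element.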
641
+
642
+ llama_client_slot* get_slot(int id) {
643
+ int64_t t_last = ggml_time_us();
644
+ llama_client_slot *last_used = nullptr;
645
+
646
+ for (llama_client_slot & slot : slots)
647
+ {
648
+ if (slot.id == id && slot.available())
649
+ {
650
+ return &slot;
651
+ }
652
+
653
+ if (slot.available() && slot.t_last_used < t_last)
654
+ {
655
+ last_used = &slot;
656
+ t_last = slot.t_last_used;
657
+ }
658
+ }
659
+
660
+ return last_used;
661
+ }
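+ // Note: get_slot() returns the requested slot only if it is available; otherwise
+ // it falls back to the least recently used available slot (or nullptr if none),
+ // which is also what happens when the caller passes slot_id = -1.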
662
+
663
+ bool launch_slot_with_data(llama_client_slot* &slot, json data) {
664
+ slot_params default_params;
665
+ common_sampler_params default_sparams;
666
+
667
+ slot->params.stream = json_value(data, "stream", false);
668
+ slot->params.cache_prompt = json_value(data, "cache_prompt", false);
669
+ slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
670
+ slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
671
+ slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
672
+ slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
673
+ slot->sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p);
674
+ slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
675
+ slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
676
+ slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
677
+ slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
678
+ slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
679
+ slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
680
+ slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
681
+ slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
682
+ slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
683
+ slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
684
+ slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
685
+ slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
686
+ slot->sparams.seed = json_value(data, "seed", default_sparams.seed);
687
+ slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
688
+ slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
689
+ slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
690
+
691
+ if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
692
+ // Might be better to reject the request with a 400 ?
693
+ LOG_WARNING("Max tokens to predict exceeds server configuration", {
694
+ {"params.n_predict", slot->params.n_predict},
695
+ {"slot.n_predict", slot->n_predict},
696
+ });
697
+ slot->params.n_predict = slot->n_predict;
698
+ }
699
+
700
+ // infill
701
+ if (data.count("input_prefix") != 0)
702
+ {
703
+ slot->params.input_prefix = data["input_prefix"];
704
+ }
705
+ else
706
+ {
707
+ slot->params.input_prefix = "";
708
+ }
709
+
710
+
711
+ if (data.count("input_suffix") != 0)
712
+ {
713
+ slot->params.input_suffix = data["input_suffix"];
714
+ }
715
+ else
716
+ {
717
+ slot->params.input_suffix = "";
718
+ }
719
+
720
+ if (data.count("prompt") != 0)
721
+ {
722
+ slot->prompt = data["prompt"];
723
+ }
724
+ else
725
+ {
726
+ slot->prompt = "";
727
+ }
728
+
729
+ if (json_value(data, "ignore_eos", false)) {
730
+ slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
731
+ }
732
+ /*
733
+ slot->sparams.penalty_prompt_tokens.clear();
734
+ slot->sparams.use_penalty_prompt_tokens = false;
735
+ const auto &penalty_prompt = data.find("penalty_prompt");
736
+ if (penalty_prompt != data.end())
737
+ {
738
+ if (penalty_prompt->is_string())
739
+ {
740
+ const auto penalty_prompt_string = penalty_prompt->get<std::string>();
741
+ auto penalty_tokens = llama_tokenize(model, penalty_prompt_string, false);
742
+ slot->sparams.penalty_prompt_tokens.swap(penalty_tokens);
743
+ if (slot->params.n_predict > 0)
744
+ {
745
+ slot->sparams.penalty_prompt_tokens.reserve(slot->sparams.penalty_prompt_tokens.size() + slot->params.n_predict);
746
+ }
747
+ slot->sparams.use_penalty_prompt_tokens = true;
748
+ }
749
+ else if (penalty_prompt->is_array())
750
+ {
751
+ const auto n_tokens = penalty_prompt->size();
752
+ slot->sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot->params.n_predict));
753
+ const int n_vocab = llama_n_vocab(model);
754
+ for (const auto &penalty_token : *penalty_prompt)
755
+ {
756
+ if (penalty_token.is_number_integer())
757
+ {
758
+ const auto tok = penalty_token.get<llama_token>();
759
+ if (tok >= 0 && tok < n_vocab)
760
+ {
761
+ slot->sparams.penalty_prompt_tokens.push_back(tok);
762
+ }
763
+ }
764
+ }
765
+ slot->sparams.use_penalty_prompt_tokens = true;
766
+ }
767
+ }
768
+ */
769
+
770
+ slot->sparams.logit_bias.clear();
771
+
772
+ const auto &logit_bias = data.find("logit_bias");
773
+ if (logit_bias != data.end() && logit_bias->is_array())
774
+ {
775
+ const int n_vocab = llama_n_vocab(model);
776
+ for (const auto &el : *logit_bias)
777
+ {
778
+ if (el.is_array() && el.size() == 2)
779
+ {
780
+ float bias;
781
+ if (el[1].is_number())
782
+ {
783
+ bias = el[1].get<float>();
784
+ }
785
+ else if (el[1].is_boolean() && !el[1].get<bool>())
786
+ {
787
+ bias = -INFINITY;
788
+ }
789
+ else
790
+ {
791
+ continue;
792
+ }
793
+
794
+ if (el[0].is_number_integer())
795
+ {
796
+ llama_token tok = el[0].get<llama_token>();
797
+ if (tok >= 0 && tok < n_vocab)
798
+ {
799
+ slot->sparams.logit_bias.push_back({tok, bias});
800
+ }
801
+ }
802
+ else if (el[0].is_string())
803
+ {
804
+ auto toks = common_tokenize(model, el[0].get<std::string>(), false);
805
+ for (auto tok : toks)
806
+ {
807
+ slot->sparams.logit_bias.push_back({tok, bias});
808
+ }
809
+ }
810
+ }
811
+ }
812
+ }
813
+
814
+ slot->params.antiprompt.clear();
815
+
816
+ const auto &stop = data.find("stop");
817
+ if (stop != data.end() && stop->is_array())
818
+ {
819
+ for (const auto &word : *stop)
820
+ {
821
+ if (!word.empty())
822
+ {
823
+ slot->params.antiprompt.push_back(word);
824
+ }
825
+ }
826
+ }
827
+
828
+ const auto & samplers = data.find("samplers");
829
+ if (samplers != data.end() && samplers->is_array()) {
830
+ std::vector<std::string> sampler_names;
831
+ for (const auto & name : *samplers) {
832
+ if (name.is_string()) {
833
+ sampler_names.emplace_back(name);
834
+ }
835
+ }
836
+ slot->sparams.samplers = common_sampler_types_from_names(sampler_names, false);
837
+ }
838
+ else
839
+ {
840
+ slot->sparams.samplers = default_sparams.samplers;
841
+ }
842
+
843
+
844
+ if (multimodal)
845
+ {
846
+ const auto &images_data = data.find("image_data");
847
+ if (images_data != data.end() && images_data->is_array())
848
+ {
849
+ for (const auto &img : *images_data)
850
+ {
851
+ const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
852
+
853
+ slot_image img_sl;
854
+ img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
855
+ img_sl.img_data = clip_image_u8_init();
856
+ if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
857
+ {
858
+ LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d",
859
+ __func__,
860
+ slot->id,
861
+ img_sl.id
862
+ );
863
+ return false;
864
+ }
865
+ LOG_VERBOSE("image loaded", {
866
+ {"slot_id", slot->id},
867
+ {"img_sl_id", img_sl.id}
868
+ });
869
+ img_sl.request_encode_image = true;
870
+ slot->images.push_back(img_sl);
871
+ }
872
+ // process prompt
873
+ // example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]
874
+ if (slot->images.size() > 0 && !slot->prompt.is_array())
875
+ {
876
+ std::string prompt = slot->prompt.get<std::string>();
877
+ size_t pos = 0, begin_prefix = 0;
878
+ std::string pattern = "[img-";
879
+ while ((pos = prompt.find(pattern, pos)) != std::string::npos) {
880
+ size_t end_prefix = pos;
881
+ pos += pattern.length();
882
+ size_t end_pos = prompt.find(']', pos);
883
+ if (end_pos != std::string::npos)
884
+ {
885
+ std::string image_id = prompt.substr(pos, end_pos - pos);
886
+ try
887
+ {
888
+ int img_id = std::stoi(image_id);
889
+ bool found = false;
890
+ for (slot_image &img : slot->images)
891
+ {
892
+ if (img.id == img_id) {
893
+ found = true;
894
+ img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix);
895
+ begin_prefix = end_pos + 1;
896
+ break;
897
+ }
898
+ }
899
+ if (!found) {
900
+ LOG("ERROR: Image with id: %i, not found.\n", img_id);
901
+ slot->images.clear();
902
+ return false;
903
+ }
904
+ } catch (const std::invalid_argument& e) {
905
+ LOG("Invalid image number id in prompt\n");
906
+ slot->images.clear();
907
+ return false;
908
+ }
909
+ }
910
+ }
911
+ slot->prompt = "";
912
+ slot->params.input_suffix = prompt.substr(begin_prefix);
913
+ slot->params.cache_prompt = false; // multimodal doesn't support prompt caching
914
+ }
915
+ }
916
+ }
917
+
918
+ if (slot->ctx_sampling != nullptr)
919
+ {
920
+ common_sampler_free(slot->ctx_sampling);
921
+ }
922
+ slot->ctx_sampling = common_sampler_init(model, slot->sparams);
923
+ //llama_set_rng_seed(ctx, slot->params.seed);
924
+ slot->command = LOAD_PROMPT;
925
+
926
+ all_slots_are_idle = false;
927
+
928
+ LOG_INFO("slot is processing task", {
929
+ {"slot_id", slot->id},
930
+ {"task_id", slot->task_id},
931
+ });
932
+
933
+ // LOG("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
934
+
935
+ return true;
936
+ }
937
+
938
+ void kv_cache_clear() {
939
+ // clear the entire KV cache
940
+ llama_kv_cache_clear(ctx);
941
+ clean_kv_cache = false;
942
+ }
943
+
944
+ void update_system_prompt() {
945
+ kv_cache_clear();
946
+ system_tokens.clear();
947
+
948
+ if (!system_prompt.empty()) {
949
+ system_tokens = common_tokenize(ctx, system_prompt, add_bos_token);
950
+
951
+ common_batch_clear(batch);
952
+
953
+ for (int i = 0; i < (int)system_tokens.size(); ++i)
954
+ {
955
+ common_batch_add(batch, system_tokens[i], i, { 0 }, false);
956
+ }
957
+
958
+ for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch)
959
+ {
960
+ const int32_t n_tokens = std::min(params.n_batch, (int32_t) (batch.n_tokens - i));
961
+ llama_batch batch_view = {
962
+ n_tokens,
963
+ batch.token + i,
964
+ nullptr,
965
+ batch.pos + i,
966
+ batch.n_seq_id + i,
967
+ batch.seq_id + i,
968
+ batch.logits + i,
969
+ };
970
+ if (llama_decode(ctx, batch_view) != 0)
971
+ {
972
+ LOG("%s: llama_decode() failed\n", __func__);
973
+ return;
974
+ }
975
+ }
976
+
977
+ // assign the system KV cache to all parallel sequences
978
+ for (int32_t i = 1; i < params.n_parallel; ++i)
979
+ {
980
+ llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
981
+ }
982
+ }
983
+
984
+ LOG("system prompt updated\n");
985
+ system_need_update = false;
986
+ }
987
+
988
+ void notify_system_prompt_changed() {
989
+ // release all slots
990
+ for (llama_client_slot &slot : slots)
991
+ {
992
+ slot.release();
993
+ }
994
+
995
+ system_need_update = true;
996
+ }
997
+
998
+ void process_system_prompt_data(const json &sys_props) {
999
+ system_prompt = sys_props.value("prompt", "");
1000
+ name_user = sys_props.value("anti_prompt", "");
1001
+ name_assistant = sys_props.value("assistant_name", "");
1002
+
1003
+
1004
+ notify_system_prompt_changed();
1005
+ }
1006
+
1007
+ static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
1008
+ const stop_type type, llama_client_slot &slot)
1009
+ {
1010
+ size_t stop_pos = std::string::npos;
1011
+
1012
+ for (const std::string &word : slot.params.antiprompt)
1013
+ {
1014
+ size_t pos;
1015
+ if (type == STOP_FULL)
1016
+ {
1017
+ const size_t tmp = word.size() + last_token_size;
1018
+ const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0;
1019
+ pos = text.find(word, from_pos);
1020
+ }
1021
+ else
1022
+ {
1023
+ pos = find_partial_stop_string(word, text);
1024
+ }
1025
+ if (pos != std::string::npos &&
1026
+ (stop_pos == std::string::npos || pos < stop_pos))
1027
+ {
1028
+ if (type == STOP_FULL)
1029
+ {
1030
+ slot.stopped_word = true;
1031
+ slot.stopping_word = word;
1032
+ slot.has_next_token = false;
1033
+ }
1034
+ stop_pos = pos;
1035
+ }
1036
+ }
1037
+
1038
+ return stop_pos;
1039
+ }
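+ // Illustrative example for find_stopping_strings(): with an antiprompt of "###",
+ // STOP_FULL only triggers on a complete "###" near the end of the text and marks
+ // the slot as stopped, while STOP_PARTIAL also matches a trailing "#" or "##" so
+ // that streaming can hold back a possibly incomplete stop word.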
1040
+
1041
+ bool process_token(completion_token_output &result, llama_client_slot &slot) {
1042
+ // remember which tokens were sampled - used for repetition penalties during sampling
1043
+ const std::string token_str = common_token_to_piece(ctx, result.tok);
1044
+ slot.sampled = result.tok;
1045
+
1046
+ // search stop word and delete it
1047
+ slot.generated_text += token_str;
1048
+ slot.has_next_token = true;
1049
+
1050
+ /*
1051
+ if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
1052
+ {
1053
+ // we can change penalty_prompt_tokens because it is always created from scratch each request
1054
+ slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
1055
+ }
1056
+ */
1057
+
1058
+ // check if there is incomplete UTF-8 character at the end
1059
+ bool incomplete = false;
1060
+ for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i)
1061
+ {
1062
+ unsigned char c = slot.generated_text[slot.generated_text.size() - i];
1063
+ if ((c & 0xC0) == 0x80)
1064
+ {
1065
+ // continuation byte: 10xxxxxx
1066
+ continue;
1067
+ }
1068
+ if ((c & 0xE0) == 0xC0)
1069
+ {
1070
+ // 2-byte character: 110xxxxx ...
1071
+ incomplete = i < 2;
1072
+ }
1073
+ else if ((c & 0xF0) == 0xE0)
1074
+ {
1075
+ // 3-byte character: 1110xxxx ...
1076
+ incomplete = i < 3;
1077
+ }
1078
+ else if ((c & 0xF8) == 0xF0)
1079
+ {
1080
+ // 4-byte character: 11110xxx ...
1081
+ incomplete = i < 4;
1082
+ }
1083
+ // else 1-byte character or invalid byte
1084
+ break;
1085
+ }
1086
+
1087
+ if (!incomplete)
1088
+ {
1089
+ size_t pos = std::min(slot.sent_count, slot.generated_text.size());
1090
+ const std::string str_test = slot.generated_text.substr(pos);
1091
+ bool is_stop_full = false;
1092
+ size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
1093
+ if (stop_pos != std::string::npos)
1094
+ {
1095
+ is_stop_full = true;
1096
+ slot.generated_text.erase(
1097
+ slot.generated_text.begin() + pos + stop_pos,
1098
+ slot.generated_text.end());
1099
+ pos = std::min(slot.sent_count, slot.generated_text.size());
1100
+ }
1101
+ else
1102
+ {
1103
+ is_stop_full = false;
1104
+ stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
1105
+ }
1106
+
1107
+ // check if there is any token to predict
1108
+ if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
1109
+ {
1110
+ // do not send the stop word in the response
1111
+ result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
1112
+ slot.sent_count += result.text_to_send.size();
1113
+ // add the token to slot queue and cache
1114
+ }
1115
+ slot.add_token_string(result);
1116
+ if (slot.params.stream)
1117
+ {
1118
+ send_partial_response(slot, result);
1119
+ }
1120
+ }
1121
+
1122
+ if (incomplete)
1123
+ {
1124
+ slot.has_next_token = true;
1125
+ }
1126
+
1127
+ // check the limits
1128
+ if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params))
1129
+ {
1130
+ slot.stopped_limit = true;
1131
+ slot.has_next_token = false;
1132
+ }
1133
+
1134
+ if (result.tok == llama_token_eos(model))
1135
+ {
1136
+ slot.stopped_eos = true;
1137
+ slot.has_next_token = false;
1138
+ LOG_VERBOSE("eos token found", {});
1139
+ }
1140
+
1141
+ LOG_VERBOSE("next token", {
1142
+ {"token", result.tok},
1143
+ {"token_text", tokens_to_output_formatted_string(ctx, result.tok)},
1144
+ {"has_next_token", slot.has_next_token},
1145
+ {"n_remain", slot.n_remaining},
1146
+ {"num_tokens_predicted", slot.n_decoded},
1147
+ {"stopped_eos", slot.stopped_eos},
1148
+ {"stopped_word", slot.stopped_word},
1149
+ {"stopped_limit", slot.stopped_limit},
1150
+ {"stopping_word", slot.stopping_word},
1151
+ });
1152
+
1153
+ return slot.has_next_token; // continue
1154
+ }
1155
+
1156
+ bool process_images(llama_client_slot &slot) const
1157
+ {
1158
+ for (slot_image &img : slot.images)
1159
+ {
1160
+ if (!img.request_encode_image)
1161
+ {
1162
+ continue;
1163
+ }
1164
+
1165
+ if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
1166
+ LOG("Error processing the given image");
1167
+ return false;
1168
+ }
1169
+
1170
+ img.request_encode_image = false;
1171
+ }
1172
+
1173
+ return slot.images.size() > 0;
1174
+ }
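+ // Note: process_images() returns true only when the slot actually has images;
+ // each pending image is encoded once via CLIP into img.image_embedding /
+ // img.image_tokens and then flagged as no longer needing encoding.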
1175
+
1176
+ void send_error(task_server& task, const std::string &error)
1177
+ {
1178
+ LOG("task %i - error: %s\n", task.id, error.c_str());
1179
+ task_result res;
1180
+ res.id = task.id;
1181
+ res.multitask_id = task.multitask_id;
1182
+ res.stop = false;
1183
+ res.error = true;
1184
+ res.result_json = { { "content", error } };
1185
+ queue_results.send(res);
1186
+ }
1187
+
1188
+ json get_formated_generation(llama_client_slot &slot)
1189
+ {
1190
+ std::vector<std::string> samplers;
1191
+ samplers.reserve(slot.sparams.samplers.size());
1192
+ for (const auto & sampler : slot.sparams.samplers)
1193
+ {
1194
+ samplers.emplace_back(common_sampler_type_to_str(sampler));
1195
+ }
1196
+
1197
+ return json {
1198
+ {"n_ctx", slot.n_ctx},
1199
+ {"n_predict", slot.n_predict},
1200
+ {"model", params.model_alias},
1201
+ {"seed", slot.params.seed},
1202
+ {"temperature", slot.sparams.temp},
1203
+ {"dynatemp_range", slot.sparams.dynatemp_range},
1204
+ {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
1205
+ {"top_k", slot.sparams.top_k},
1206
+ {"top_p", slot.sparams.top_p},
1207
+ {"min_p", slot.sparams.min_p},
1208
+ {"typical_p", slot.sparams.typ_p},
1209
+ {"repeat_last_n", slot.sparams.penalty_last_n},
1210
+ {"repeat_penalty", slot.sparams.penalty_repeat},
1211
+ {"presence_penalty", slot.sparams.penalty_present},
1212
+ {"frequency_penalty", slot.sparams.penalty_freq},
1213
+ {"mirostat", slot.sparams.mirostat},
1214
+ {"mirostat_tau", slot.sparams.mirostat_tau},
1215
+ {"mirostat_eta", slot.sparams.mirostat_eta},
1216
+ {"penalize_nl", slot.sparams.penalize_nl},
1217
+ {"stop", slot.params.antiprompt},
1218
+ {"n_predict", slot.params.n_predict},
1219
+ {"n_keep", params.n_keep},
1220
+ {"ignore_eos", slot.sparams.ignore_eos},
1221
+ {"stream", slot.params.stream},
1222
+ // {"logit_bias", slot.sparams.logit_bias},
1223
+ {"n_probs", slot.sparams.n_probs},
1224
+ {"min_keep", slot.sparams.min_keep},
1225
+ {"grammar", slot.sparams.grammar},
1226
+ {"samplers", samplers}
1227
+ };
1228
+ }
1229
+
1230
+ void send_partial_response(llama_client_slot &slot, completion_token_output tkn)
1231
+ {
1232
+ task_result res;
1233
+ res.id = slot.task_id;
1234
+ res.multitask_id = slot.multitask_id;
1235
+ res.error = false;
1236
+ res.stop = false;
1237
+
1238
+ res.result_json = json
1239
+ {
1240
+ {"content", tkn.text_to_send},
1241
+ {"stop", false},
1242
+ {"slot_id", slot.id},
1243
+ {"multimodal", multimodal}
1244
+ };
1245
+
1246
+ if (slot.sparams.n_probs > 0)
1247
+ {
1248
+ std::vector<completion_token_output> probs_output = {};
1249
+ const std::vector<llama_token> to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
1250
+ size_t probs_pos = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size());
1251
+ size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size());
1252
+ if (probs_pos < probs_stop_pos)
1253
+ {
1254
+ probs_output = std::vector<completion_token_output>(slot.generated_token_probs.begin() + probs_pos, slot.generated_token_probs.begin() + probs_stop_pos);
1255
+ }
1256
+ slot.sent_token_probs_index = probs_stop_pos;
1257
+ res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
1258
+ }
1259
+
1260
+ if (slot.oaicompat)
1261
+ {
1262
+ res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
1263
+ res.result_json["model"] = slot.oaicompat_model;
1264
+ }
1265
+
1266
+ queue_results.send(res);
1267
+ }
1268
+
1269
+ void send_final_response(llama_client_slot &slot)
1270
+ {
1271
+ task_result res;
1272
+ res.id = slot.task_id;
1273
+ res.multitask_id = slot.multitask_id;
1274
+ res.error = false;
1275
+ res.stop = true;
1276
+
1277
+ res.result_json = json
1278
+ {
1279
+ {"content", !slot.params.stream ? slot.generated_text : ""},
1280
+ {"slot_id", slot.id},
1281
+ {"stop", true},
1282
+ {"model", params.model_alias},
1283
+ {"tokens_predicted", slot.n_decoded},
1284
+ {"tokens_evaluated", slot.num_prompt_tokens},
1285
+ {"generation_settings", get_formated_generation(slot)},
1286
+ {"prompt", slot.prompt},
1287
+ {"truncated", slot.truncated},
1288
+ {"stopped_eos", slot.stopped_eos},
1289
+ {"stopped_word", slot.stopped_word},
1290
+ {"stopped_limit", slot.stopped_limit},
1291
+ {"stopping_word", slot.stopping_word},
1292
+ {"tokens_cached", slot.n_past},
1293
+ {"timings", slot.get_formated_timings()}
1294
+ };
1295
+
1296
+ if (slot.sparams.n_probs > 0)
1297
+ {
1298
+ std::vector<completion_token_output> probs = {};
1299
+ if (!slot.params.stream && slot.stopped_word)
1300
+ {
1301
+ const std::vector<llama_token> stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
1302
+ probs = std::vector<completion_token_output>(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - stop_word_toks.size());
1303
+ }
1304
+ else
1305
+ {
1306
+ probs = std::vector<completion_token_output>(
1307
+ slot.generated_token_probs.begin(),
1308
+ slot.generated_token_probs.end());
1309
+ }
1310
+ res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs);
1311
+ }
1312
+
1313
+ if (slot.oaicompat)
1314
+ {
1315
+ res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
1316
+ res.result_json["model"] = slot.oaicompat_model;
1317
+ }
1318
+
1319
+ queue_results.send(res);
1320
+ }
1321
+
1322
+ void send_embedding(llama_client_slot &slot)
1323
+ {
1324
+ task_result res;
1325
+ res.id = slot.task_id;
1326
+ res.multitask_id = slot.multitask_id;
1327
+ res.error = false;
1328
+ res.stop = true;
1329
+
1330
+ const int n_embd = llama_n_embd(model);
1331
+ if (!params.embedding)
1332
+ {
1333
+ LOG_WARNING("embedding disabled", {
1334
+ {"params.embedding", params.embedding},
1335
+ });
1336
+ res.result_json = json
1337
+ {
1338
+ {"embedding", std::vector<float>(n_embd, 0.0f)},
1339
+ };
1340
+ }
1341
+ else
1342
+ {
1343
+ const float *data = llama_get_embeddings(ctx);
1344
+ std::vector<float> embedding(data, data + n_embd);
1345
+ res.result_json = json
1346
+ {
1347
+ {"embedding", embedding },
1348
+ };
1349
+ }
1350
+ queue_results.send(res);
1351
+ }
1352
+
1353
+ void request_completion(int task_id, json data, bool infill, bool embedding, int multitask_id)
1354
+ {
1355
+ task_server task;
1356
+ task.id = task_id;
1357
+ task.target_id = 0;
1358
+ task.data = std::move(data);
1359
+ task.infill_mode = infill;
1360
+ task.embedding_mode = embedding;
1361
+ task.type = TASK_TYPE_COMPLETION;
1362
+ task.multitask_id = multitask_id;
1363
+
1364
+ // when a completion task's prompt array is not a singleton, we split it into multiple requests
1365
+ // otherwise, it's a single-prompt task and we queue it directly
1366
+ // if there are numbers in the prompt array, it will be treated as an array of tokens
1367
+ if (task.data.count("prompt") != 0 && task.data.at("prompt").size() > 1) {
1368
+ bool numbers = false;
1369
+ for (const auto& e : task.data.at("prompt")) {
1370
+ if (e.is_number()) {
1371
+ numbers = true;
1372
+ break;
1373
+ }
1374
+ }
1375
+
1376
+ // NOTE: split_multiprompt_task() does not handle a mix of strings and numbers,
1377
+ // it will completely stall the server. I don't know where this bug comes from.
1378
+ //
1379
+ // if there are numbers, it needs to be treated like a single prompt,
1380
+ // queue_tasks handles a mix of strings and numbers just fine.
1381
+ if (numbers) {
1382
+ queue_tasks.post(task);
1383
+ } else {
1384
+ split_multiprompt_task(task_id, task);
1385
+ }
1386
+ } else {
1387
+ queue_tasks.post(task);
1388
+ }
1389
+ }
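+ // Illustrative example: a request like {"prompt": ["prompt A", "prompt B"]} is
+ // split into two subtasks that share one multitask id, whereas
+ // {"prompt": "single prompt"} or a prompt array containing token ids is queued
+ // as a single task.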
1390
+
1391
+ // for multiple images processing
1392
+ bool ingest_images(llama_client_slot &slot, int n_batch)
1393
+ {
1394
+ int image_idx = 0;
1395
+
1396
+ while (image_idx < (int) slot.images.size())
1397
+ {
1398
+ slot_image &img = slot.images[image_idx];
1399
+
1400
+ // process prefix prompt
1401
+ for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
1402
+ {
1403
+ const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
1404
+ llama_batch batch_view = {
1405
+ n_tokens,
1406
+ batch.token + i,
1407
+ nullptr,
1408
+ batch.pos + i,
1409
+ batch.n_seq_id + i,
1410
+ batch.seq_id + i,
1411
+ batch.logits + i,
1412
+ };
1413
+ if (llama_decode(ctx, batch_view))
1414
+ {
1415
+ LOG("%s : failed to eval\n", __func__);
1416
+ return false;
1417
+ }
1418
+ }
1419
+
1420
+ // process image with llm
1421
+ for (int i = 0; i < img.image_tokens; i += n_batch)
1422
+ {
1423
+ int n_eval = img.image_tokens - i;
1424
+ if (n_eval > n_batch)
1425
+ {
1426
+ n_eval = n_batch;
1427
+ }
1428
+
1429
+ const int n_embd = llama_n_embd(model);
1430
+ float * embd = img.image_embedding + i * n_embd;
1431
+ llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, slot.n_past, 0);
1432
+ if (llama_decode(ctx, llava_batch.batch))
1433
+ {
1434
+ LOG("%s : failed to eval image\n", __func__);
1435
+ return false;
1436
+ }
1437
+ slot.n_past += n_eval;
1438
+ }
1439
+ image_idx++;
1440
+
1441
+ common_batch_clear(batch);
1442
+
1443
+ // append prefix of next image
1444
+ const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
1445
+ slot.params.input_suffix : // no more images, then process suffix prompt
1446
+ (json)(slot.images[image_idx].prefix_prompt);
1447
+
1448
+ std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // prefix of the next image, or the suffix prompt
1449
+ for (int i = 0; i < (int) append_tokens.size(); ++i)
1450
+ {
1451
+ common_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
1452
+ slot.n_past += 1;
1453
+ }
1454
+ }
1455
+
1456
+ return true;
1457
+ }
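+ // Note on ingest_images(): for each image it first decodes the tokens already
+ // queued in the batch, then feeds the image embeddings in n_batch-sized chunks
+ // through llava_embd_batch, and finally queues the next image's prefix (or the
+ // suffix prompt after the last image) for the regular decode loop.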
1458
+
1459
+ void request_cancel(int task_id)
1460
+ {
1461
+ task_server task;
1462
+ task.type = TASK_TYPE_CANCEL;
1463
+ task.target_id = task_id;
1464
+ queue_tasks.post(task);
1465
+ }
1466
+
1467
+ void split_multiprompt_task(int multitask_id, task_server& multiprompt_task)
1468
+ {
1469
+ int prompt_count = multiprompt_task.data.at("prompt").size();
1470
+ if (prompt_count <= 1) {
1471
+ send_error(multiprompt_task, "error while handling multiple prompts");
1472
+ return;
1473
+ }
1474
+
1475
+ // generate an ID for each subtask
1476
+ std::vector<int> subtask_ids(prompt_count);
1477
+ for (int i = 0; i < prompt_count; i++)
1478
+ {
1479
+ subtask_ids[i] = queue_tasks.get_new_id();
1480
+ }
1481
+
1482
+ // queue up the multitask so we can track its subtask progression
1483
+ queue_tasks.add_multitask(multitask_id, subtask_ids);
1484
+
1485
+ // add subtasks
1486
+ for (int i = 0; i < prompt_count; i++)
1487
+ {
1488
+ json subtask_data = multiprompt_task.data;
1489
+ subtask_data["prompt"] = subtask_data["prompt"][i];
1490
+
1491
+ // subtasks inherit everything else (infill mode, embedding mode, etc.)
1492
+ request_completion(subtask_ids[i], subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id);
1493
+ }
1494
+ }
1495
+
1496
+ void process_single_task(task_server& task)
1497
+ {
1498
+ switch (task.type)
1499
+ {
1500
+ case TASK_TYPE_COMPLETION: {
1501
+ llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
1502
+ if (slot == nullptr)
1503
+ {
1504
+ // if no slot is available, we defer this task for processing later
1505
+ LOG_VERBOSE("no slot is available", {{"task_id", task.id}});
1506
+ queue_tasks.defer(task);
1507
+ break;
1508
+ }
1509
+
1510
+ if (task.data.contains("system_prompt"))
1511
+ {
1512
+ if (!all_slots_are_idle) {
1513
+ send_error(task, "system prompt can only be updated when all slots are idle");
1514
+ break;
1515
+ }
1516
+ process_system_prompt_data(task.data["system_prompt"]);
1517
+
1518
+ // reset cache_tokens for all slots
1519
+ for (llama_client_slot &slot : slots)
1520
+ {
1521
+ slot.cache_tokens.clear();
1522
+ slot.n_past = 0;
1523
+ slot.n_past_se = 0;
1524
+ }
1525
+ }
1526
+
1527
+ slot->reset();
1528
+
1529
+ slot->infill = task.infill_mode;
1530
+ slot->embedding = task.embedding_mode;
1531
+ slot->task_id = task.id;
1532
+ slot->multitask_id = task.multitask_id;
1533
+
1534
+ if (!launch_slot_with_data(slot, task.data))
1535
+ {
1536
+ // send error result
1537
+ send_error(task, "internal_error");
1538
+ break;
1539
+ }
1540
+ } break;
1541
+ case TASK_TYPE_CANCEL: { // release slot linked with the task id
1542
+ for (auto & slot : slots)
1543
+ {
1544
+ if (slot.task_id == task.target_id)
1545
+ {
1546
+ slot.release();
1547
+ break;
1548
+ }
1549
+ }
1550
+ } break;
1551
+ case TASK_TYPE_NEXT_RESPONSE: {
1552
+ // do nothing
1553
+ } break;
1554
+ }
1555
+ }
1556
+
1557
+ void on_finish_multitask(task_multi& multitask)
1558
+ {
1559
+ // all subtasks done == multitask is done
1560
+ task_result result;
1561
+ result.id = multitask.id;
1562
+ result.stop = true;
1563
+ result.error = false;
1564
+
1565
+ // collect json results into one json result
1566
+ std::vector<json> result_jsons;
1567
+ for (auto& subres : multitask.results)
1568
+ {
1569
+ result_jsons.push_back(subres.result_json);
1570
+ result.error = result.error && subres.error;
1571
+ }
1572
+ result.result_json = json{ { "results", result_jsons } };
1573
+ queue_results.send(result);
1574
+ }
1575
+
1576
+ bool update_slots() {
1577
+ if (system_need_update)
1578
+ {
1579
+ LOG_INFO("updating system prompt", {});
1580
+ update_system_prompt();
1581
+ }
1582
+
1583
+ common_batch_clear(batch);
1584
+
1585
+ if (all_slots_are_idle)
1586
+ {
1587
+ if (system_prompt.empty() && clean_kv_cache)
1588
+ {
1589
+ LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {});
1590
+ kv_cache_clear();
1591
+ }
1592
+ return true;
1593
+ }
1594
+
1595
+ LOG_VERBOSE("posting NEXT_RESPONSE", {});
1596
+ task_server task;
1597
+ task.type = TASK_TYPE_NEXT_RESPONSE;
1598
+ task.target_id = -1;
1599
+ queue_tasks.post(task);
1600
+
1601
+ for (llama_client_slot &slot : slots)
1602
+ {
1603
+ if (slot.ga_n == 1)
1604
+ {
1605
+ if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
1606
+ {
1607
+ // START LOCALAI changes
1608
+ // Temporarily disable context-shifting as it can lead to infinite loops (issue: https://github.com/ggerganov/llama.cpp/issues/3969)
1609
+ // See: https://github.com/mudler/LocalAI/issues/1333
1610
+ // Context is exhausted, release the slot
1611
+ slot.release();
1612
+ send_final_response(slot);
1613
+ slot.cache_tokens.clear();
1614
+ slot.n_past = 0;
1615
+ slot.truncated = false;
1616
+ slot.has_next_token = true;
1617
+ LOG("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
1618
+
1619
+ continue;
1620
+ // END LOCALAI changes
1621
+ }
1622
+ }
1623
+ }
1624
+
1625
+ // decode any currently ongoing sequences
1626
+ LOG_VERBOSE("decoding ongoing sequences", {});
1627
+ for (auto & slot : slots)
1628
+ {
1629
+ // release the slot
1630
+ if (slot.command == RELEASE)
1631
+ {
1632
+ slot.state = IDLE;
1633
+ slot.command = NONE;
1634
+ slot.t_last_used = ggml_time_us();
1635
+
1636
+ LOG_INFO("slot released", {
1637
+ {"slot_id", slot.id},
1638
+ {"task_id", slot.task_id},
1639
+ {"n_ctx", n_ctx},
1640
+ {"n_past", slot.n_past},
1641
+ {"n_system_tokens", system_tokens.size()},
1642
+ {"n_cache_tokens", slot.cache_tokens.size()},
1643
+ {"truncated", slot.truncated}
1644
+ });
1645
+ queue_tasks.notify_slot_changed();
1646
+
1647
+ continue;
1648
+ }
1649
+
1650
+ if (slot.state == IDLE)
1651
+ {
1652
+ continue;
1653
+ }
1654
+
1655
+ slot.i_batch = batch.n_tokens;
1656
+
1657
+ const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
1658
+
1659
+ // TODO: we always have to take into account the "system_tokens"
1660
+ // this is not great and needs to be improved somehow
1661
+ common_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
1662
+ slot.n_past += 1;
1663
+ }
1664
+
1665
+ // process in chunks of params.n_batch
1666
+ int32_t n_batch = params.n_batch;
1667
+
1668
+ // assign workload to the slots
1669
+ if (params.cont_batching || batch.n_tokens == 0)
1670
+ {
1671
+ for (auto & slot : slots)
1672
+ {
1673
+ const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();
1674
+
1675
+ // empty prompt passed -> release the slot and send empty response
1676
+ // note: infill mode allows empty prompt
1677
+ if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt && !slot.infill)
1678
+ {
1679
+ slot.release();
1680
+ slot.print_timings();
1681
+ send_final_response(slot);
1682
+ continue;
1683
+ }
1684
+
1685
+ // need to process the prompt
1686
+ if (slot.state == IDLE && slot.command == LOAD_PROMPT)
1687
+ {
1688
+ slot.state = PROCESSING;
1689
+ slot.command = NONE;
1690
+ std::vector<llama_token> prompt_tokens;
1691
+ slot.t_start_process_prompt = ggml_time_us();
1692
+ slot.t_start_genereration = 0;
1693
+
1694
+ if (slot.infill)
1695
+ {
1696
+ bool suff_rm_leading_spc = true;
1697
+ if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1)
1698
+ {
1699
+ params.input_suffix.erase(0, 1);
1700
+ suff_rm_leading_spc = false;
1701
+ }
1702
+ auto prefix_tokens = tokenize(slot.params.input_prefix, false);
1703
+ auto suffix_tokens = tokenize(slot.params.input_suffix, false);
1704
+
1705
+ const int space_token = 29871; // TODO: this should not be hardcoded
1706
+ if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) {
1707
+ suffix_tokens.erase(suffix_tokens.begin());
1708
+ }
1709
+
1710
+ prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
1711
+ prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
1712
+ prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
1713
+ prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
1714
+ prefix_tokens.push_back(llama_token_middle(model));
1715
+ prompt_tokens = prefix_tokens;
1716
+ }
1717
+ else
1718
+ {
1719
+ prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
1720
+ }
1721
+
1722
+ slot.num_prompt_tokens = prompt_tokens.size();
1723
+
1724
+ if (slot.params.n_keep < 0)
1725
+ {
1726
+ slot.params.n_keep = slot.num_prompt_tokens;
1727
+ }
1728
+ slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
1729
+
1730
+ // if input prompt is too big, truncate it
1731
+ if (slot.num_prompt_tokens >= slot.n_ctx)
1732
+ {
1733
+ const int n_left = slot.n_ctx - slot.params.n_keep;
1734
+ const int n_block_size = n_left / 2;
1735
+ const int erased_blocks = (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
1736
+
1737
+ std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + slot.params.n_keep);
1738
+ new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
1739
+
1740
+ LOG_VERBOSE("input truncated", {
1741
+ {"n_ctx", slot.n_ctx},
1742
+ {"n_keep", slot.params.n_keep},
1743
+ {"n_left", n_left},
1744
+ {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
1745
+ });
1746
+ slot.truncated = true;
1747
+ prompt_tokens = new_tokens;
1748
+
1749
+ slot.num_prompt_tokens = prompt_tokens.size();
1750
+ GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
1751
+ }
1752
+
1753
+ if (!slot.params.cache_prompt)
1754
+ {
1755
+ common_sampler_reset(slot.ctx_sampling);
1756
+
1757
+ slot.n_past = 0;
1758
+ slot.n_past_se = 0;
1759
+ slot.ga_i = 0;
1760
+ slot.num_prompt_tokens_processed = slot.num_prompt_tokens;
1761
+ }
1762
+ else
1763
+ {
1764
+ // push the prompt into the sampling context (do not apply grammar)
1765
+ for (auto &token : prompt_tokens)
1766
+ {
1767
+ common_sampler_accept(slot.ctx_sampling, token, false);
1768
+ }
1769
+
1770
+ slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
1771
+
1772
+ // the last token of the cache is not in the KV cache until the next call to llama_decode
1773
+ // (it was sampled, pushed into the "cache_tokens", but not yet put in the context)
1774
+ if (slot.n_past > 0 && slot.n_past == (int32_t) slot.cache_tokens.size())
1775
+ {
1776
+ slot.n_past -= 1;
1777
+ }
1778
+
1779
+ slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;
1780
+
1781
+ if (slot.ga_n != 1)
1782
+ {
1783
+ int ga_i = 0;
1784
+ int32_t ga_n = slot.ga_n;
1785
+ int32_t ga_w = slot.ga_w;
1786
+ int32_t slot_npast = 0;
1787
+ for (int k = 0; k < slot.n_past; ++k)
1788
+ {
1789
+ while (slot_npast >= ga_i + ga_w) {
1790
+ const int bd = (ga_w/ga_n)*(ga_n - 1);
1791
+ slot_npast -= bd;
1792
+ ga_i += ga_w/ga_n;
1793
+ }
1794
+ slot_npast++;
1795
+ }
1796
+ slot.n_past_se = slot_npast;
1797
+ slot.ga_i = ga_i;
1798
+ }
1799
+
1800
+ LOG_INFO("slot progression", {
1801
+ { "slot_id", slot.id },
1802
+ { "task_id", slot.task_id },
1803
+ { "n_past", slot.n_past },
1804
+ { "num_prompt_tokens_processed", slot.num_prompt_tokens_processed }
1805
+ });
1806
+ }
1807
+
1808
+ slot.cache_tokens = prompt_tokens;
1809
+
1810
+ if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
1811
+ {
1812
+ // we have to evaluate at least 1 token to generate logits.
1813
+ LOG_INFO("we have to evaluate at least 1 token to generate logits", {
1814
+ { "slot_id", slot.id },
1815
+ { "task_id", slot.task_id }
1816
+ });
1817
+ slot.n_past--;
1818
+ if (slot.ga_i > 0)
1819
+ {
1820
+ slot.n_past_se--;
1821
+ }
1822
+ }
1823
+
1824
+ int p0 = (int) system_tokens.size() + slot.n_past;
1825
+ LOG_INFO("kv cache rm [p0, end)", {
1826
+ { "slot_id", slot.id },
1827
+ { "task_id", slot.task_id },
1828
+ { "p0", p0 }
1829
+ });
1830
+ llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
1831
+
1832
+ LOG_VERBOSE("prompt ingested", {
1833
+ {"n_past", slot.n_past},
1834
+ {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
1835
+ {"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())},
1836
+ });
1837
+
1838
+ const bool has_images = process_images(slot);
1839
+
1840
+ // process the prefix of first image
1841
+ std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
1842
+
1843
+ int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
1844
+
1845
+ int32_t ga_i = slot.ga_i;
1846
+ int32_t ga_n = slot.ga_n;
1847
+ int32_t ga_w = slot.ga_w;
1848
+
1849
+ for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
1850
+ {
1851
+ if (slot.ga_n != 1)
1852
+ {
1853
+ while (slot_npast >= ga_i + ga_w) {
1854
+ const int bd = (ga_w/ga_n)*(ga_n - 1);
1855
+ slot_npast -= bd;
1856
+ ga_i += ga_w/ga_n;
1857
+ }
1858
+ }
1859
+ common_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
1860
+ slot_npast++;
1861
+ }
1862
+
1863
+ if (has_images && !ingest_images(slot, n_batch))
1864
+ {
1865
+ LOG_ERR("%s: failed processing images Slot id : %d, Task id: %d",
1866
+ __func__,
1867
+ slot.id,
1868
+ slot.task_id
1869
+ );
1870
+ // FIXME @phymbert: to be properly tested
1871
+ // returning early without changing the slot state will block the slot forever
1872
+ // no one at the moment is checking the return value
1873
+ return false;
1874
+ }
1875
+
1876
+ // extract the logits only for the last token
1877
+ if (batch.n_tokens > 0)
1878
+ {
1879
+ batch.logits[batch.n_tokens - 1] = true;
1880
+ }
1881
+
1882
+ slot.n_decoded = 0;
1883
+ slot.i_batch = batch.n_tokens - 1;
1884
+ }
1885
+ }
1886
+ }
1887
+
1888
+ if (batch.n_tokens == 0)
1889
+ {
1890
+ all_slots_are_idle = true;
1891
+ return true;
1892
+ }
1893
+
1894
+ for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
1895
+ {
1896
+ const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
1897
+
1898
+ for (auto & slot : slots)
1899
+ {
1900
+ if (slot.ga_n != 1)
1901
+ {
1902
+ // context extension via Self-Extend
1903
+ while (slot.n_past_se >= slot.ga_i + slot.ga_w)
1904
+ {
1905
+ const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w;
1906
+ const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
1907
+ const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
1908
+
1909
+ LOG("\n");
1910
+ LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
1911
+ LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
1912
+ LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
1913
+
1914
+ llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
1915
+ llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
1916
+ llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
1917
+
1918
+ slot.n_past_se -= bd;
1919
+
1920
+ slot.ga_i += slot.ga_w / slot.ga_n;
1921
+
1922
+ LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
1923
+ }
1924
+ slot.n_past_se += n_tokens;
1925
+ }
1926
+ }
1927
+
1928
+ llama_batch batch_view =
1929
+ {
1930
+ n_tokens,
1931
+ batch.token + i,
1932
+ nullptr,
1933
+ batch.pos + i,
1934
+ batch.n_seq_id + i,
1935
+ batch.seq_id + i,
1936
+ batch.logits + i,
1937
+ };
1938
+
1939
+ const int ret = llama_decode(ctx, batch_view);
1940
+
1941
+ if (ret != 0)
1942
+ {
1943
+ if (n_batch == 1 || ret < 0)
1944
+ {
1945
+ // if you get here, it means the KV cache is full - try increasing it via the context size
1946
+ LOG("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
1947
+ return false;
1948
+ }
1949
+
1950
+ LOG("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
1951
+
1952
+ // retry with half the batch size to try to find a free slot in the KV cache
1953
+ n_batch /= 2;
1954
+ i -= n_batch;
1955
+ continue;
1956
+ }
1957
+
1958
+ for (auto & slot : slots)
1959
+ {
1960
+ if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens))
1961
+ {
1962
+ continue;
1963
+ }
1964
+
1965
+ // prompt evaluated for embedding
1966
+ if (slot.embedding)
1967
+ {
1968
+ send_embedding(slot);
1969
+ slot.release();
1970
+ slot.i_batch = -1;
1971
+ continue;
1972
+ }
1973
+
1974
+ completion_token_output result;
1975
+ const llama_token id = common_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
1976
+
1977
+ common_sampler_accept(slot.ctx_sampling, id, true);
1978
+
1979
+ slot.n_decoded += 1;
1980
+ if (slot.n_decoded == 1)
1981
+ {
1982
+ slot.t_start_genereration = ggml_time_us();
1983
+ slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3;
1984
+ metrics.on_prompt_eval(slot);
1985
+ }
1986
+
1987
+ result.tok = id;
1988
+ const auto * cur_p = common_sampler_get_candidates(slot.ctx_sampling);
1989
+
1990
+ for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
1991
+ result.probs.push_back({
1992
+ cur_p->data[i].id,
1993
+ i >= cur_p->size ? 0.0f : cur_p->data[i].p,
1994
+ });
1995
+ }
1996
+
1997
+ if (!process_token(result, slot))
1998
+ {
1999
+ slot.release();
2000
+ slot.print_timings();
2001
+ send_final_response(slot);
2002
+ metrics.on_prediction(slot);
2003
+ }
2004
+
2005
+ slot.i_batch = -1;
2006
+ }
2007
+ }
2008
+
2009
+ LOG_VERBOSE("slots updated", {});
2010
+ return true;
2011
+ }
2012
+
2013
+ void run_on_all_tasks_finished() {
2014
+ update_slots();
2015
+ }
2016
+ };
2017
+
2018
+ /* llama.cpp completion api semantics */
2019
+ static json format_partial_response(
2020
+ llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs
2021
+ ) {
2022
+ json res = json
2023
+ {
2024
+ {"content", content },
2025
+ {"stop", false},
2026
+ {"slot_id", slot->id },
2027
+ {"multimodal", llama.multimodal }
2028
+ };
2029
+
2030
+ if (slot->sparams.n_probs > 0)
2031
+ {
2032
+ res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
2033
+ }
2034
+
2035
+ return res;
2036
+ }
2037
+
2038
+ struct token_translator
2039
+ {
2040
+ llama_context * ctx;
2041
+ std::string operator()(llama_token tok) const { return common_token_to_piece(ctx, tok); }
2042
+ std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
2043
+ };
2044
+
2045
+ static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama, llama_client_slot *slot)
2046
+ {
2047
+ auto & gtps = slot->generated_token_probs;
2048
+ auto translator = token_translator{llama.ctx};
2049
+ auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); };
2050
+ const size_t len = std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen);
2051
+ if (slot->generated_text.capacity() < slot->generated_text.size() + len)
2052
+ {
2053
+ slot->generated_text.reserve(slot->generated_text.size() + len);
2054
+ }
2055
+ for (const completion_token_output & cto : gtps)
2056
+ {
2057
+ slot->generated_text += translator(cto);
2058
+ }
2059
+ }
2060
+
2061
+ std::function<void(int)> shutdown_handler;
2062
+ inline void signal_handler(int signal) { shutdown_handler(signal); }
2063
+
2064
+ /////////////////////////////////
2065
+ ////////////////////////////////
2066
+ //////// LOCALAI code starts below here
2067
+ /////////////////////////////////
2068
+ ////////////////////////////////
2069
+
2070
+ bool loaded_model; // TODO: add a mutex for this, but it only happens once, when loading the model
2071
+
2072
+ // The class has a llama instance that is shared across all RPCs
2073
+ llama_server_context llama;
2074
+
2075
+ static void start_llama_server() {
2076
+ // Wait for model to be loaded first
2077
+ while (!loaded_model) {
2078
+ std::this_thread::sleep_for(std::chrono::milliseconds(100));
2079
+ }
2080
+
2081
+ llama.queue_tasks.on_new_task(std::bind(
2082
+ &llama_server_context::process_single_task, &llama, std::placeholders::_1));
2083
+ llama.queue_tasks.on_finish_multitask(std::bind(
2084
+ &llama_server_context::on_finish_multitask, &llama, std::placeholders::_1));
2085
+ llama.queue_tasks.on_all_tasks_finished(std::bind(
2086
+ &llama_server_context::run_on_all_tasks_finished, &llama));
2087
+ llama.queue_results.on_multitask_update(std::bind(
2088
+ &llama_server_queue::update_multitask,
2089
+ &llama.queue_tasks,
2090
+ std::placeholders::_1,
2091
+ std::placeholders::_2,
2092
+ std::placeholders::_3
2093
+ ));
2094
+ llama.queue_tasks.start_loop();
2095
+ }
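+ // Note: start_llama_server() blocks in queue_tasks.start_loop(), so it is meant
+ // to run on its own thread once load_model() has completed elsewhere and set
+ // loaded_model (a sketch of the expected call order, not enforced here).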
2096
+
2097
+ json parse_options(bool streaming, const backend::PredictOptions* predict, llama_server_context &llama)
2098
+ {
2099
+
2100
+ // For reference, this is how slot fields are populated from the json data:
2101
+ // slot->params.stream = json_value(data, "stream", false);
2102
+ // slot->params.cache_prompt = json_value(data, "cache_prompt", false);
2103
+ // slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
2104
+ // slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
2105
+ // slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
2106
+ // slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
2107
+ // slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
2108
+ // slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
2109
+ // slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
2110
+ // slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
2111
+ // slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
2112
+ // slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
2113
+ // slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
2114
+ // slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
2115
+ // slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
2116
+ // slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
2117
+ // slot->params.seed = json_value(data, "seed", default_params.seed);
2118
+ // slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
2119
+ // slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
2120
+
2121
+ // Create now a json data from the prediction options instead
2122
+ //
2123
+ json data;
2124
+ data["stream"] = streaming;
2125
+ data["cache_prompt"] = predict->promptcacheall();
2126
+ data["n_predict"] = predict->tokens() == 0 ? -1 : predict->tokens();
2127
+ data["top_k"] = predict->topk();
2128
+ data["top_p"] = predict->topp();
2129
+ data["typical_p"] = predict->typicalp();
2130
+ data["temperature"] = predict->temperature();
2131
+ data["repeat_last_n"] = predict->repeat();
2132
+ data["repeat_penalty"] = predict->penalty();
2133
+ data["frequency_penalty"] = predict->frequencypenalty();
2134
+ data["presence_penalty"] = predict->presencepenalty();
2135
+ data["mirostat"] = predict->mirostat();
2136
+ data["mirostat_tau"] = predict->mirostattau();
2137
+ data["mirostat_eta"] = predict->mirostateta();
2138
+ data["penalize_nl"] = predict->penalizenl();
2139
+ data["n_keep"] = predict->nkeep();
2140
+ data["seed"] = predict->seed();
2141
+ data["grammar"] = predict->grammar();
2142
+ data["prompt"] = predict->prompt();
2143
+ data["ignore_eos"] = predict->ignoreeos();
2144
+ data["embeddings"] = predict->embeddings();
2145
+
2146
+ // Add the correlationid to json data
2147
+ data["correlation_id"] = predict->correlationid();
2148
+
2149
+ // for each image in the request, add the image data
2150
+ //
2151
+ for (int i = 0; i < predict->images_size(); i++) {
2152
+ data["image_data"].push_back(json
2153
+ {
2154
+ {"id", i},
2155
+ {"data", predict->images(i)},
2156
+ });
2157
+ }
2158
+
2159
+ data["stop"] = predict->stopprompts();
2160
+ // data["n_probs"] = predict->nprobs();
2161
+ //TODO: images,
2162
+
2163
+ return data;
2164
+ }
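+ // Illustrative example of the payload produced by parse_options() for a simple
+ // streaming request (actual values depend on the incoming PredictOptions):
+ //   {"stream": true, "n_predict": 128, "temperature": 0.7, "top_k": 40,
+ //    "top_p": 0.95, "prompt": "Hello", "stop": ["</s>"], "ignore_eos": false}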
2165
+
2166
+ // static void parse_options_completion(bool streaming,const backend::PredictOptions* predict, llama_server_context &llama)
2167
+ // {
2168
+ // // https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L673
2169
+ // gpt_params default_params;
2170
+
2171
+ // llama.stream = streaming;
2172
+ // llama.params.n_predict = predict->tokens() == 0 ? -1 : predict->tokens();
2173
+ // llama.params.sparams.top_k = predict->topk();
2174
+ // llama.params.sparams.top_p = predict->topp();
2175
+ // llama.params.sparams.typical_p = predict->typicalp();
2176
+ // llama.params.sparams.penalty_last_n = predict->repeat();
2177
+ // llama.params.sparams.temp = predict->temperature();
2178
+ // llama.params.sparams.penalty_repeat = predict->penalty();
2179
+ // llama.params.sparams.penalty_present = predict->presencepenalty();
2180
+ // llama.params.sparams.penalty_freq = predict->frequencypenalty();
2181
+ // llama.params.sparams.mirostat = predict->mirostat();
2182
+ // llama.params.sparams.mirostat_tau = predict->mirostattau();
2183
+ // llama.params.sparams.mirostat_eta = predict->mirostateta();
2184
+ // llama.params.sparams.penalize_nl = predict->penalizenl();
2185
+ // llama.params.n_keep = predict->nkeep();
2186
+ // llama.params.seed = predict->seed();
2187
+ // llama.params.sparams.grammar = predict->grammar();
2188
+ // // llama.params.n_probs = predict->
2189
+ // llama.params.prompt = predict->prompt();
2190
+
2191
+ // llama.params.sparams.logit_bias.clear();
2192
+
2193
+ // if (predict->ignoreeos())
2194
+ // {
2195
+ // llama.params.sparams.logit_bias[llama_token_eos(llama.model)] = -INFINITY;
2196
+ // }
2197
+
2198
+ // // const auto &logit_bias = body.find("logit_bias");
2199
+ // // if (logit_bias != body.end() && logit_bias->is_array())
2200
+ // // {
2201
+ // // const int n_vocab = llama_n_vocab(llama.model);
2202
+ // // for (const auto &el : *logit_bias)
2203
+ // // {
2204
+ // // if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
2205
+ // // {
2206
+ // // llama_token tok = el[0].get<llama_token>();
2207
+ // // if (tok >= 0 && tok < n_vocab)
2208
+ // // {
2209
+ // // if (el[1].is_number())
2210
+ // // {
2211
+ // // llama.params.logit_bias[tok] = el[1].get<float>();
2212
+ // // }
2213
+ // // else if (el[1].is_boolean() && !el[1].get<bool>())
2214
+ // // {
2215
+ // // llama.params.logit_bias[tok] = -INFINITY;
2216
+ // // }
2217
+ // // }
2218
+ // // }
2219
+ // // }
2220
+ // // }
2221
+
2222
+ // llama.params.antiprompt.clear();
2223
+ // for (const std::string& stopPrompt : predict->stopprompts()) {
2224
+ // if (!stopPrompt.empty())
2225
+ // {
2226
+ // llama.params.antiprompt.push_back(stopPrompt);
2227
+ // }
2228
+ // }
2229
+ // }
2230
+
2231
+ static void params_parse(const backend::ModelOptions* request,
2232
+ common_params & params) {
2233
+
2234
+ // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809
2235
+
2236
+ params.model = request->modelfile();
2237
+ if (!request->mmproj().empty()) {
2238
+ // get the directory of modelfile
2239
+ std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
2240
+ params.mmproj = model_dir + "/"+ request->mmproj();
2241
+ }
2242
+ // params.model_alias ??
2243
+ params.model_alias = request->modelfile();
2244
+ params.n_ctx = request->contextsize();
2245
+ //params.memory_f16 = request->f16memory();
2246
+ params.cpuparams.n_threads = request->threads();
2247
+ params.n_gpu_layers = request->ngpulayers();
2248
+ params.n_batch = request->nbatch();
2249
+ // Set params.n_parallel from the LLAMACPP_PARALLEL environment variable; defaults to 1
2250
+ //params.n_parallel = 1;
2251
+ const char *env_parallel = std::getenv("LLAMACPP_PARALLEL");
2252
+ if (env_parallel != NULL) {
2253
+ params.n_parallel = std::stoi(env_parallel);
2254
+ params.cont_batching = true;
2255
+ } else {
2256
+ params.n_parallel = 1;
2257
+ }
2258
+
2259
+ const char *llama_grpc_servers = std::getenv("LLAMACPP_GRPC_SERVERS");
2260
+ if (llama_grpc_servers != NULL) {
2261
+ params.rpc_servers = std::string(llama_grpc_servers);
2262
+ }
2263
+
2264
+ // TODO: Add yarn
2265
+
2266
+ if (!request->tensorsplit().empty()) {
2267
+ std::string arg_next = request->tensorsplit();
2268
+
2269
+ // split string by , and /
2270
+ const std::regex regex{ R"([,/]+)" };
2271
+ std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
2272
+ std::vector<std::string> split_arg{ it, {} };
2273
+
2274
+ GGML_ASSERT(split_arg.size() <= llama_max_devices());
2275
+
2276
+ for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) {
2277
+ if (i_device < split_arg.size()) {
2278
+ params.tensor_split[i_device] = std::stof(split_arg[i_device]);
2279
+ }
2280
+ else {
2281
+ params.tensor_split[i_device] = 0.0f;
2282
+ }
2283
+ }
2284
+ }
2285
+
2286
+ if (!request->maingpu().empty()) {
2287
+ params.main_gpu = std::stoi(request->maingpu());
2288
+ }
2289
+ if (!request->loraadapter().empty() && !request->lorabase().empty()) {
2290
+ float scale_factor = 1.0f;
2291
+ if (request->lorascale() != 0.0f) {
2292
+ scale_factor = request->lorascale();
2293
+ }
2294
+ // get the directory of modelfile
2295
+ std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
2296
+ params.lora_adapters.push_back({ model_dir + "/"+request->loraadapter(), scale_factor });
2297
+ }
2298
+ params.use_mlock = request->mlock();
2299
+ params.use_mmap = request->mmap();
2300
+ params.flash_attn = request->flashattention();
2301
+ params.no_kv_offload = request->nokvoffload();
2302
+
2303
+ params.embedding = request->embeddings();
2304
+
2305
+ if (request->ropescaling() == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
2306
+ else if (request->ropescaling() == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
2307
+ else { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
2308
+ if ( request->yarnextfactor() != 0.0f ) {
2309
+ params.yarn_ext_factor = request->yarnextfactor();
2310
+ }
2311
+ if ( request->yarnattnfactor() != 0.0f ) {
2312
+ params.yarn_attn_factor = request->yarnattnfactor();
2313
+ }
2314
+ if ( request->yarnbetafast() != 0.0f ) {
2315
+ params.yarn_beta_fast = request->yarnbetafast();
2316
+ }
2317
+ if ( request->yarnbetaslow() != 0.0f ) {
2318
+ params.yarn_beta_slow = request->yarnbetaslow();
2319
+ }
2320
+ if ( request->ropefreqbase() != 0.0f ) {
2321
+ params.rope_freq_base = request->ropefreqbase();
2322
+ }
2323
+ if ( request->ropefreqscale() != 0.0f ) {
2324
+ params.rope_freq_scale = request->ropefreqscale();
2325
+ }
2326
+ }
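As a side note, here is a small self-contained sketch of the tensor-split handling above: the option string is split on ',' and '/', and each token becomes the fraction assigned to one device. The device count and input string below are made up; the constant stands in for llama_max_devices().

// Minimal sketch of the tensor-split tokenization used in params_parse().
#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main() {
    const std::string arg_next = "30,70";  // hypothetical request->tensorsplit()
    const std::regex regex{R"([,/]+)"};    // split on ',' and '/'
    std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
    std::vector<std::string> split_arg{it, {}};

    const size_t max_devices = 2;          // stands in for llama_max_devices()
    for (size_t i_device = 0; i_device < max_devices; ++i_device) {
        const float share = i_device < split_arg.size() ? std::stof(split_arg[i_device]) : 0.0f;
        std::cout << "device " << i_device << " -> " << share << std::endl;
    }
    return 0;
}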
2327
+
2328
+
2329
+ // GRPC Server start
2330
+ class BackendServiceImpl final : public backend::Backend::Service {
2331
+ public:
2332
+ grpc::Status Health(ServerContext* context, const backend::HealthMessage* request, backend::Reply* reply) {
2333
+ // Implement Health RPC
2334
+ reply->set_message("OK");
2335
+ return Status::OK;
2336
+ }
2337
+
2338
+ grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
2339
+ // Implement LoadModel RPC
2340
+ common_params params;
2341
+ params_parse(request, params);
2342
+
2343
+ llama_backend_init();
2344
+ llama_numa_init(params.numa);
2345
+
2346
+ // load the model
2347
+ if (!llama.load_model(params))
2348
+ {
2349
+ result->set_message("Failed loading model");
2350
+ result->set_success(false);
2351
+ return Status::CANCELLED;
2352
+ }
2353
+ llama.initialize();
2354
+ result->set_message("Loading succeeded");
2355
+ result->set_success(true);
2356
+ loaded_model = true;
2357
+ return Status::OK;
2358
+ }
2359
+ grpc::Status PredictStream(grpc::ServerContext* context, const backend::PredictOptions* request, grpc::ServerWriter<backend::Reply>* writer) override {
2360
+ json data = parse_options(true, request, llama);
2361
+ const int task_id = llama.queue_tasks.get_new_id();
2362
+ llama.queue_results.add_waiting_task_id(task_id);
2363
+ llama.request_completion(task_id, data, false, false, -1);
2364
+ while (true)
2365
+ {
2366
+ task_result result = llama.queue_results.recv(task_id);
2367
+ if (!result.error) {
2368
+ const std::string str =
2369
+ "data: " +
2370
+ result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
2371
+ "\n\n";
2372
+ LOG_VERBOSE("data stream", {
2373
+ { "to_send", str }
2374
+ });
2375
+
2376
+ backend::Reply reply;
2377
+ // extract the generated text from the result JSON
2378
+ std::string completion_text = result.result_json.value("content", "");
2379
+
2380
+ reply.set_message(completion_text);
2381
+ int32_t tokens_predicted = result.result_json.value("tokens_predicted", 0);
2382
+ reply.set_tokens(tokens_predicted);
2383
+ int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
2384
+ reply.set_prompt_tokens(tokens_evaluated);
2385
+
2386
+ // Log Request Correlation Id
2387
+ LOG_VERBOSE("correlation:", {
2388
+ { "id", data["correlation_id"] }
2389
+ });
2390
+
2391
+ // Send the reply
2392
+ writer->Write(reply);
2393
+
2394
+ if (result.stop) {
2395
+ break;
2396
+ }
2397
+ } else {
2398
+ break;
2399
+ }
2400
+ }
2401
+
2402
+ return grpc::Status::OK;
2403
+ }
2404
+
2405
+
2406
+ grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) {
2407
+ json data = parse_options(false, request, llama);
2408
+ const int task_id = llama.queue_tasks.get_new_id();
2409
+ llama.queue_results.add_waiting_task_id(task_id);
2410
+ llama.request_completion(task_id, data, false, false, -1);
2411
+ std::string completion_text;
2412
+ task_result result = llama.queue_results.recv(task_id);
2413
+ if (!result.error && result.stop) {
2414
+
2415
+ // Log Request Correlation Id
2416
+ LOG_VERBOSE("correlation:", {
2417
+ { "id", data["correlation_id"] }
2418
+ });
2419
+
2420
+ completion_text = result.result_json.value("content", "");
2421
+ int32_t tokens_predicted = result.result_json.value("tokens_predicted", 0);
2422
+ int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
2423
+ reply->set_prompt_tokens(tokens_evaluated);
2424
+ reply->set_tokens(tokens_predicted);
2425
+ reply->set_message(completion_text);
2426
+ }
2427
+ else
2428
+ {
2429
+ return grpc::Status::OK;
2430
+ }
2431
+
2432
+ return grpc::Status::OK;
2433
+ }
2434
+
2435
+ /// https://github.com/ggerganov/llama.cpp/blob/aa2341298924ac89778252015efcb792f2df1e20/examples/server/server.cpp#L2969
2436
+ grpc::Status Embedding(ServerContext* context, const backend::PredictOptions* request, backend::EmbeddingResult* embeddingResult) {
2437
+ json data = parse_options(false, request, llama);
2438
+ const int task_id = llama.queue_tasks.get_new_id();
2439
+ llama.queue_results.add_waiting_task_id(task_id);
2440
+ llama.request_completion(task_id, { {"prompt", data["embeddings"]}, { "n_predict", 0}, {"image_data", ""} }, false, true, -1);
2441
+ // get the result
2442
+ task_result result = llama.queue_results.recv(task_id);
2443
+ //std::cout << "Embedding result JSON" << result.result_json.dump() << std::endl;
2444
+ llama.queue_results.remove_waiting_task_id(task_id);
2445
+ if (!result.error && result.stop) {
2446
+ std::vector<float> embeddings = result.result_json.value("embedding", std::vector<float>());
2447
+ // loop the vector and set the embeddings results
2448
+ for (int i = 0; i < embeddings.size(); i++) {
2449
+ embeddingResult->add_embeddings(embeddings[i]);
2450
+ }
2451
+ }
2452
+ else
2453
+ {
2454
+ return grpc::Status::OK;
2455
+ }
2456
+
2457
+ return grpc::Status::OK;
2458
+ }
2459
+
2460
+ grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
2461
+ llama_client_slot* active_slot = llama.get_active_slot();
2462
+
2463
+ if (active_slot != nullptr) {
2464
+ // Calculate the tokens per second using existing logic
2465
+ double tokens_per_second = 1e3 / active_slot->t_token_generation * active_slot->n_decoded;
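+ // worked example: n_decoded = 100 over t_token_generation = 2000 ms gives 1e3 / 2000 * 100 = 50 tokens/s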
2466
+
2467
+ // Populate the response with metrics
2468
+ response->set_slot_id(active_slot->id);
2469
+ response->set_prompt_json_for_slot(active_slot->prompt.dump());
2470
+ response->set_tokens_per_second(tokens_per_second);
2471
+ response->set_tokens_generated(active_slot->n_decoded);
2472
+ response->set_prompt_tokens_processed(active_slot->num_prompt_tokens_processed);
2473
+ } else {
2474
+ // Handle case when no active slot exists
2475
+ response->set_slot_id(0);
2476
+ response->set_prompt_json_for_slot("");
2477
+ response->set_tokens_per_second(0);
2478
+ response->set_tokens_generated(0);
2479
+ response->set_prompt_tokens_processed(0);
2480
+ }
2481
+
2482
+ return grpc::Status::OK;
2483
+ }
2484
+ };
2485
+
2486
+ void RunServer(const std::string& server_address) {
2487
+ BackendServiceImpl service;
2488
+
2489
+ ServerBuilder builder;
2490
+ builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
2491
+ builder.RegisterService(&service);
2492
+
2493
+ std::unique_ptr<Server> server(builder.BuildAndStart());
2494
+ std::cout << "Server listening on " << server_address << std::endl;
2495
+ server->Wait();
2496
+ }
2497
+
2498
+ int main(int argc, char** argv) {
2499
+ std::string server_address("localhost:50051");
2500
+
2501
+ // Define long and short options
2502
+ struct option long_options[] = {
2503
+ {"addr", required_argument, nullptr, 'a'},
2504
+ {nullptr, 0, nullptr, 0}
2505
+ };
2506
+
2507
+ // Parse command-line arguments
2508
+ int option;
2509
+ int option_index = 0;
2510
+ while ((option = getopt_long(argc, argv, "a:", long_options, &option_index)) != -1) {
2511
+ switch (option) {
2512
+ case 'a':
2513
+ server_address = optarg;
2514
+ break;
2515
+ default:
2516
+ std::cerr << "Usage: " << argv[0] << " [--addr=<address>] or [-a <address>]" << std::endl;
2517
+ return 1;
2518
+ }
2519
+ }
2520
+
2521
+ // run the gRPC server in a background thread; the llama server loop runs below on the main thread
2522
+ std::thread t([&]()
2523
+ {
2524
+ RunServer(server_address);
2525
+ return 0;
2526
+ });
2527
+
2528
+
2529
+ //);
2530
+ start_llama_server();
2531
+ std::cout << "stopping" << std::endl;
2532
+
2533
+ t.join();
2534
+
2535
+ llama_backend_free();
2536
+ return 0;
2537
+ }
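For completeness, a hedged client-side sketch (not part of this upload) of how a process could probe this backend: it assumes the stubs generated from backend.proto are available as backend.grpc.pb.h and that the server is listening on the default localhost:50051.

// Minimal gRPC client sketch calling the Health RPC exposed above.
#include <iostream>
#include <memory>

#include <grpcpp/grpcpp.h>
#include "backend.grpc.pb.h"   // generated from backend.proto (assumption)

int main() {
    auto channel = grpc::CreateChannel("localhost:50051", grpc::InsecureChannelCredentials());
    std::unique_ptr<backend::Backend::Stub> stub = backend::Backend::NewStub(channel);

    backend::HealthMessage request;
    backend::Reply reply;
    grpc::ClientContext context;

    grpc::Status status = stub->Health(&context, request, &reply);
    if (status.ok()) {
        std::cout << "health: " << reply.message() << std::endl;  // expected: "OK"
    } else {
        std::cerr << "RPC failed: " << status.error_message() << std::endl;
    }
    return 0;
}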
backend/cpp/llama/json.hpp ADDED
The diff for this file is too large to render. See raw diff
 
backend/cpp/llama/patches/01-llava.patch ADDED
@@ -0,0 +1,13 @@
+ diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
+ index 342042ff..224db9b5 100644
+ --- a/examples/llava/clip.cpp
+ +++ b/examples/llava/clip.cpp
+ @@ -2419,7 +2419,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
+ struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
+ int* patches_data = (int*)malloc(ggml_nbytes(patches));
+ for (int i = 0; i < num_patches; i++) {
+ - patches_data[i] = i + 1;
+ + patches_data[i] = i;
+ }
+ ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
+ free(patches_data);
backend/cpp/llama/prepare.sh ADDED
@@ -0,0 +1,27 @@
+ #!/bin/bash
+
+ ## Patches
+ ## Apply patches from the `patches` directory
+ for patch in $(ls patches); do
+ echo "Applying patch $patch"
+ patch -d llama.cpp/ -p1 < patches/$patch
+ done
+
+ cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
+ cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
+ cp -rfv json.hpp llama.cpp/examples/grpc-server/
+ cp -rfv utils.hpp llama.cpp/examples/grpc-server/
+
+ if grep -q "grpc-server" llama.cpp/examples/CMakeLists.txt; then
+ echo "grpc-server already added"
+ else
+ echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
+ fi
+
+ ## XXX: In some versions of CMake clip wasn't being built before llama.
+ ## This is a hack for now, but it should be fixed in the future.
+ cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
+ cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
+ echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
+ cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
+ cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
backend/cpp/llama/utils.hpp ADDED
@@ -0,0 +1,483 @@
1
+ // https://github.com/ggerganov/llama.cpp/blob/master/examples/server/utils.hpp
2
+
3
+ #pragma once
4
+
5
+ #include <string>
6
+ #include <vector>
7
+ #include <set>
8
+ #include <mutex>
9
+ #include <condition_variable>
10
+ #include <unordered_map>
11
+
12
+ #include "json.hpp"
13
+
14
+ #include "../llava/clip.h"
15
+
16
+ using json = nlohmann::json;
17
+
18
+ extern bool server_verbose;
19
+
20
+ #ifndef SERVER_VERBOSE
21
+ #define SERVER_VERBOSE 1
22
+ #endif
23
+
24
+ #if SERVER_VERBOSE != 1
25
+ #define LOG_VERBOSE(MSG, ...)
26
+ #else
27
+ #define LOG_VERBOSE(MSG, ...) \
28
+ do \
29
+ { \
30
+ if (server_verbose) \
31
+ { \
32
+ server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
33
+ } \
34
+ } while (0)
35
+ #endif
36
+
37
+ #define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
38
+ #define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
39
+ #define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
40
+
41
+ //
42
+ // parallel
43
+ //
44
+
45
+ enum server_state {
46
+ SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
47
+ SERVER_STATE_READY, // Server is ready and model is loaded
48
+ SERVER_STATE_ERROR // An error occurred, load_model failed
49
+ };
50
+
51
+ enum task_type {
52
+ TASK_TYPE_COMPLETION,
53
+ TASK_TYPE_CANCEL,
54
+ TASK_TYPE_NEXT_RESPONSE
55
+ };
56
+
57
+ struct task_server {
58
+ int id = -1; // to be filled by llama_server_queue
59
+ int target_id;
60
+ task_type type;
61
+ json data;
62
+ bool infill_mode = false;
63
+ bool embedding_mode = false;
64
+ int multitask_id = -1;
65
+ };
66
+
67
+ struct task_result {
68
+ int id;
69
+ int multitask_id = -1;
70
+ bool stop;
71
+ bool error;
72
+ json result_json;
73
+ };
74
+
75
+ struct task_multi {
76
+ int id;
77
+ std::set<int> subtasks_remaining{};
78
+ std::vector<task_result> results{};
79
+ };
80
+
81
+ // TODO: this can become a bool if no further states are needed
82
+ enum slot_state
83
+ {
84
+ IDLE,
85
+ PROCESSING,
86
+ };
87
+
88
+ enum slot_command
89
+ {
90
+ NONE,
91
+ LOAD_PROMPT,
92
+ RELEASE,
93
+ };
94
+
95
+ struct slot_params
96
+ {
97
+ bool stream = true;
98
+ bool cache_prompt = false; // remember the prompt so it does not have to be fully reprocessed
99
+
100
+ uint32_t seed = -1; // RNG seed
101
+ int32_t n_keep = 0; // number of tokens to keep from initial prompt
102
+ int32_t n_predict = -1; // new tokens to predict
103
+
104
+ std::vector<std::string> antiprompt;
105
+
106
+ json input_prefix;
107
+ json input_suffix;
108
+ };
109
+
110
+ struct slot_image
111
+ {
112
+ int32_t id;
113
+
114
+ bool request_encode_image = false;
115
+ float * image_embedding = nullptr;
116
+ int32_t image_tokens = 0;
117
+
118
+ clip_image_u8 * img_data;
119
+
120
+ std::string prefix_prompt; // prompt text that comes before this image
121
+ };
122
+
123
+ // completion token output with probabilities
124
+ struct completion_token_output
125
+ {
126
+ struct token_prob
127
+ {
128
+ llama_token tok;
129
+ float prob;
130
+ };
131
+
132
+ std::vector<token_prob> probs;
133
+ llama_token tok;
134
+ std::string text_to_send;
135
+ };
136
+
137
+ static inline void server_log(const char *level, const char *function, int line,
138
+ const char *message, const nlohmann::ordered_json &extra)
139
+ {
140
+ nlohmann::ordered_json log
141
+ {
142
+ {"timestamp", time(nullptr)},
143
+ {"level", level},
144
+ {"function", function},
145
+ {"line", line},
146
+ {"message", message},
147
+ };
148
+
149
+ if (!extra.empty())
150
+ {
151
+ log.merge_patch(extra);
152
+ }
153
+
154
+ const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
155
+ printf("%.*s\n", (int)str.size(), str.data());
156
+ fflush(stdout);
157
+ }
158
+
159
+ //
160
+ // server utils
161
+ //
162
+
163
+ template <typename T>
164
+ static T json_value(const json &body, const std::string &key, const T &default_value)
165
+ {
166
+ // Fallback null to default value
167
+ return body.contains(key) && !body.at(key).is_null()
168
+ ? body.value(key, default_value)
169
+ : default_value;
170
+ }
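A small usage sketch of json_value above (illustrative values only): both a missing key and an explicit null fall back to the supplied default, which is why partial request bodies are accepted. The helper is copied here under the hypothetical name json_value_demo so the snippet stands alone.

// Illustrative only: how the json_value() contract behaves for missing/null keys.
#include <iostream>
#include <string>
#include "json.hpp"

using json = nlohmann::json;

// inline copy of the helper for demonstration purposes
template <typename T>
static T json_value_demo(const json &body, const std::string &key, const T &default_value)
{
    return body.contains(key) && !body.at(key).is_null()
               ? body.value(key, default_value)
               : default_value;
}

int main() {
    json body = {{"top_k", 40}, {"grammar", nullptr}};
    std::cout << json_value_demo(body, "top_k", 20)                    << std::endl; // 40 (present)
    std::cout << json_value_demo(body, "top_p", 0.95)                  << std::endl; // 0.95 (missing)
    std::cout << json_value_demo(body, "grammar", std::string("root")) << std::endl; // "root" (null)
    return 0;
}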
171
+
172
+ inline std::string format_chatml(std::vector<json> messages)
173
+ {
174
+ std::ostringstream chatml_msgs;
175
+
176
+ for (auto it = messages.begin(); it != messages.end(); ++it) {
177
+ chatml_msgs << "<|im_start|>"
178
+ << json_value(*it, "role", std::string("user")) << '\n';
179
+ chatml_msgs << json_value(*it, "content", std::string(""))
180
+ << "<|im_end|>\n";
181
+ }
182
+
183
+ chatml_msgs << "<|im_start|>assistant" << '\n';
184
+
185
+ return chatml_msgs.str();
186
+ }
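For reference, the ChatML layout this helper produces for a hypothetical two-message conversation looks like the following (message contents are made up):

// Illustrative only: the string format_chatml() would build for
// [{"role":"system","content":"You are helpful."}, {"role":"user","content":"Hi"}].
#include <iostream>
#include <string>

int main() {
    const std::string expected =
        "<|im_start|>system\n"
        "You are helpful.<|im_end|>\n"
        "<|im_start|>user\n"
        "Hi<|im_end|>\n"
        "<|im_start|>assistant\n";
    std::cout << expected;
    return 0;
}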
187
+
188
+ //
189
+ // work queue utils
190
+ //
191
+
192
+ struct llama_server_queue {
193
+ int id = 0;
194
+ std::mutex mutex_tasks;
195
+ // queues
196
+ std::vector<task_server> queue_tasks;
197
+ std::vector<task_server> queue_tasks_deferred;
198
+ std::vector<task_multi> queue_multitasks;
199
+ std::condition_variable condition_tasks;
200
+ // callback functions
201
+ std::function<void(task_server&)> callback_new_task;
202
+ std::function<void(task_multi&)> callback_finish_multitask;
203
+ std::function<void(void)> callback_all_task_finished;
204
+
205
+ // Add a new task to the end of the queue
206
+ int post(task_server task) {
207
+ std::unique_lock<std::mutex> lock(mutex_tasks);
208
+ if (task.id == -1) {
209
+ task.id = id++;
210
+ }
211
+ queue_tasks.push_back(std::move(task));
212
+ condition_tasks.notify_one();
213
+ return task.id;
214
+ }
215
+
216
+ // Add a new task, but defer until one slot is available
217
+ void defer(task_server task) {
218
+ std::unique_lock<std::mutex> lock(mutex_tasks);
219
+ queue_tasks_deferred.push_back(std::move(task));
220
+ }
221
+
222
+ // Get the next id for creating a new task
223
+ int get_new_id() {
224
+ std::unique_lock<std::mutex> lock(mutex_tasks);
225
+ return id++;
226
+ }
227
+
228
+ // Register function to process a new task
229
+ void on_new_task(std::function<void(task_server&)> callback) {
230
+ callback_new_task = callback;
231
+ }
232
+
233
+ // Register function to process a multitask
234
+ void on_finish_multitask(std::function<void(task_multi&)> callback) {
235
+ callback_finish_multitask = callback;
236
+ }
237
+
238
+ // Register the function to be called when the batch of tasks is finished
239
+ void on_all_tasks_finished(std::function<void(void)> callback) {
240
+ callback_all_task_finished = callback;
241
+ }
242
+
243
+ // Call when the state of one slot is changed
244
+ void notify_slot_changed() {
245
+ // move deferred tasks back to main loop
246
+ std::unique_lock<std::mutex> lock(mutex_tasks);
247
+ for (auto & task : queue_tasks_deferred) {
248
+ queue_tasks.push_back(std::move(task));
249
+ }
250
+ queue_tasks_deferred.clear();
251
+ }
252
+
253
+ // Start the main loop. This call is blocking
254
+ [[noreturn]]
255
+ void start_loop() {
256
+ while (true) {
257
+ // new task arrived
258
+ LOG_VERBOSE("have new task", {});
259
+ {
260
+ while (true)
261
+ {
262
+ std::unique_lock<std::mutex> lock(mutex_tasks);
263
+ if (queue_tasks.empty()) {
264
+ lock.unlock();
265
+ break;
266
+ }
267
+ task_server task = queue_tasks.front();
268
+ queue_tasks.erase(queue_tasks.begin());
269
+ lock.unlock();
270
+ LOG_VERBOSE("callback_new_task", {});
271
+ callback_new_task(task);
272
+ }
273
+ LOG_VERBOSE("callback_all_task_finished", {});
274
+ // process and update all the multitasks
275
+ auto queue_iterator = queue_multitasks.begin();
276
+ while (queue_iterator != queue_multitasks.end())
277
+ {
278
+ if (queue_iterator->subtasks_remaining.empty())
279
+ {
280
+ // all subtasks done == multitask is done
281
+ task_multi current_multitask = *queue_iterator;
282
+ callback_finish_multitask(current_multitask);
283
+ // remove this multitask
284
+ queue_iterator = queue_multitasks.erase(queue_iterator);
285
+ }
286
+ else
287
+ {
288
+ ++queue_iterator;
289
+ }
290
+ }
291
+ // all tasks in the current loop are finished
292
+ callback_all_task_finished();
293
+ }
294
+ LOG_VERBOSE("wait for new task", {});
295
+ // wait for new task
296
+ {
297
+ std::unique_lock<std::mutex> lock(mutex_tasks);
298
+ if (queue_tasks.empty()) {
299
+ condition_tasks.wait(lock, [&]{
300
+ return !queue_tasks.empty();
301
+ });
302
+ }
303
+ }
304
+ }
305
+ }
306
+
307
+ //
308
+ // functions to manage multitasks
309
+ //
310
+
311
+ // add a multitask by specifying the ids of all its subtasks (a subtask is a task_server)
312
+ void add_multitask(int multitask_id, std::vector<int>& sub_ids)
313
+ {
314
+ std::lock_guard<std::mutex> lock(mutex_tasks);
315
+ task_multi multi;
316
+ multi.id = multitask_id;
317
+ std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
318
+ queue_multitasks.push_back(multi);
319
+ }
320
+
321
+ // update the remaining subtasks, appending results to the multitask
322
+ void update_multitask(int multitask_id, int subtask_id, task_result& result)
323
+ {
324
+ std::lock_guard<std::mutex> lock(mutex_tasks);
325
+ for (auto& multitask : queue_multitasks)
326
+ {
327
+ if (multitask.id == multitask_id)
328
+ {
329
+ multitask.subtasks_remaining.erase(subtask_id);
330
+ multitask.results.push_back(result);
331
+ }
332
+ }
333
+ }
334
+ };
335
+
336
+ struct llama_server_response {
337
+ typedef std::function<void(int, int, task_result&)> callback_multitask_t;
338
+ callback_multitask_t callback_update_multitask;
339
+ // for keeping track of all tasks waiting for the result
340
+ std::set<int> waiting_task_ids;
341
+ // the main result queue
342
+ std::vector<task_result> queue_results;
343
+ std::mutex mutex_results;
344
+ std::condition_variable condition_results;
345
+
346
+ void add_waiting_task_id(int task_id) {
347
+ std::unique_lock<std::mutex> lock(mutex_results);
348
+ waiting_task_ids.insert(task_id);
349
+ }
350
+
351
+ void remove_waiting_task_id(int task_id) {
352
+ std::unique_lock<std::mutex> lock(mutex_results);
353
+ waiting_task_ids.erase(task_id);
354
+ }
355
+
356
+ // This function blocks the thread until there is a response for this task_id
357
+ task_result recv(int task_id) {
358
+ while (true)
359
+ {
360
+ std::unique_lock<std::mutex> lock(mutex_results);
361
+ condition_results.wait(lock, [&]{
362
+ return !queue_results.empty();
363
+ });
364
+ LOG_VERBOSE("condition_results unblock", {});
365
+
366
+ for (int i = 0; i < (int) queue_results.size(); i++)
367
+ {
368
+ if (queue_results[i].id == task_id)
369
+ {
370
+ assert(queue_results[i].multitask_id == -1);
371
+ task_result res = queue_results[i];
372
+ queue_results.erase(queue_results.begin() + i);
373
+ return res;
374
+ }
375
+ }
376
+ }
377
+
378
+ // should never reach here
379
+ }
380
+
381
+ // Register the function to update multitask
382
+ void on_multitask_update(callback_multitask_t callback) {
383
+ callback_update_multitask = callback;
384
+ }
385
+
386
+ // Send a new result to a waiting task_id
387
+ void send(task_result result) {
388
+ std::unique_lock<std::mutex> lock(mutex_results);
389
+ LOG_VERBOSE("send new result", {});
390
+ for (auto& task_id : waiting_task_ids) {
391
+ // LOG_TEE("waiting task id %i \n", task_id);
392
+ // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
393
+ if (result.multitask_id == task_id)
394
+ {
395
+ LOG_VERBOSE("callback_update_multitask", {});
396
+ callback_update_multitask(task_id, result.id, result);
397
+ continue;
398
+ }
399
+
400
+ if (result.id == task_id)
401
+ {
402
+ LOG_VERBOSE("queue_results.push_back", {});
403
+ queue_results.push_back(result);
404
+ condition_results.notify_one();
405
+ return;
406
+ }
407
+ }
408
+ }
409
+ };
410
+
411
+ //
412
+ // base64 utils (TODO: move to common in the future)
413
+ //
414
+
415
+ static const std::string base64_chars =
416
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
417
+ "abcdefghijklmnopqrstuvwxyz"
418
+ "0123456789+/";
419
+
420
+ static inline bool is_base64(uint8_t c)
421
+ {
422
+ return (isalnum(c) || (c == '+') || (c == '/'));
423
+ }
424
+
425
+ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
426
+ {
427
+ int i = 0;
428
+ int j = 0;
429
+ int in_ = 0;
430
+
431
+ int in_len = encoded_string.size();
432
+
433
+ uint8_t char_array_4[4];
434
+ uint8_t char_array_3[3];
435
+
436
+ std::vector<uint8_t> ret;
437
+
438
+ while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
439
+ {
440
+ char_array_4[i++] = encoded_string[in_]; in_++;
441
+ if (i == 4)
442
+ {
443
+ for (i = 0; i <4; i++)
444
+ {
445
+ char_array_4[i] = base64_chars.find(char_array_4[i]);
446
+ }
447
+
448
+ char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
449
+ char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
450
+ char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
451
+
452
+ for (i = 0; (i < 3); i++)
453
+ {
454
+ ret.push_back(char_array_3[i]);
455
+ }
456
+ i = 0;
457
+ }
458
+ }
459
+
460
+ if (i)
461
+ {
462
+ for (j = i; j <4; j++)
463
+ {
464
+ char_array_4[j] = 0;
465
+ }
466
+
467
+ for (j = 0; j <4; j++)
468
+ {
469
+ char_array_4[j] = base64_chars.find(char_array_4[j]);
470
+ }
471
+
472
+ char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
473
+ char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
474
+ char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
475
+
476
+ for (j = 0; (j < i - 1); j++)
477
+ {
478
+ ret.push_back(char_array_3[j]);
479
+ }
480
+ }
481
+
482
+ return ret;
483
+ }
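To make the bit twiddling in base64_decode above concrete, here is a small self-contained example (not part of this upload) that decodes a single 4-symbol group with the same shifts; the input "TWFu" is a made-up sample that packs into the ASCII bytes 'M', 'a', 'n'.

// Worked example of the shifts used in base64_decode(): four 6-bit symbol
// values are packed into three output bytes.
#include <cstdint>
#include <iostream>
#include <string>

int main() {
    const std::string base64_chars =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz"
        "0123456789+/";

    const std::string group = "TWFu";           // encodes the ASCII bytes 'M', 'a', 'n'
    uint8_t idx[4];
    for (int i = 0; i < 4; i++) {
        idx[i] = base64_chars.find(group[i]);   // 6-bit value of each symbol
    }

    // Same packing as in base64_decode():
    uint8_t out[3];
    out[0] = ( idx[0]        << 2) + ((idx[1] & 0x30) >> 4);
    out[1] = ((idx[1] & 0xf) << 4) + ((idx[2] & 0x3c) >> 2);
    out[2] = ((idx[2] & 0x3) << 6) +   idx[3];

    std::cout << out[0] << out[1] << out[2] << std::endl;   // prints "Man"
    return 0;
}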