YZ-TAN committed
Commit 5a29263 (verified) · 1 parent: 7261750

Upload 2821 files

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the complete change set.
Files changed (50)
  1. .gitattributes +136 -0
  2. Dockerfile +7 -0
  3. Flask-llama.py +107 -0
  4. Flask-llama.pyproj +35 -0
  5. llama.cpp/.clang-format +161 -0
  6. llama.cpp/.clang-tidy +26 -0
  7. llama.cpp/.devops/cloud-v-pipeline +22 -0
  8. llama.cpp/.devops/cpu.Dockerfile +92 -0
  9. llama.cpp/.devops/cuda.Dockerfile +94 -0
  10. llama.cpp/.devops/intel.Dockerfile +91 -0
  11. llama.cpp/.devops/llama-cli-cann.Dockerfile +44 -0
  12. llama.cpp/.devops/llama-cpp-cuda.srpm.spec +83 -0
  13. llama.cpp/.devops/llama-cpp.srpm.spec +85 -0
  14. llama.cpp/.devops/musa.Dockerfile +108 -0
  15. llama.cpp/.devops/nix/apps.nix +21 -0
  16. llama.cpp/.devops/nix/devshells.nix +52 -0
  17. llama.cpp/.devops/nix/docker.nix +37 -0
  18. llama.cpp/.devops/nix/jetson-support.nix +39 -0
  19. llama.cpp/.devops/nix/nixpkgs-instances.nix +45 -0
  20. llama.cpp/.devops/nix/package-gguf-py.nix +36 -0
  21. llama.cpp/.devops/nix/package.nix +247 -0
  22. llama.cpp/.devops/nix/python-scripts.nix +66 -0
  23. llama.cpp/.devops/nix/scope.nix +41 -0
  24. llama.cpp/.devops/nix/sif.nix +27 -0
  25. llama.cpp/.devops/rocm.Dockerfile +113 -0
  26. llama.cpp/.devops/tools.sh +49 -0
  27. llama.cpp/.devops/vulkan.Dockerfile +89 -0
  28. llama.cpp/.dockerignore +20 -0
  29. llama.cpp/.ecrc +6 -0
  30. llama.cpp/.editorconfig +50 -0
  31. llama.cpp/.flake8 +17 -0
  32. llama.cpp/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +87 -0
  33. llama.cpp/.github/ISSUE_TEMPLATE/011-bug-results.yml +101 -0
  34. llama.cpp/.github/ISSUE_TEMPLATE/019-bug-misc.yml +91 -0
  35. llama.cpp/.github/ISSUE_TEMPLATE/020-enhancement.yml +51 -0
  36. llama.cpp/.github/ISSUE_TEMPLATE/030-research.yml +52 -0
  37. llama.cpp/.github/ISSUE_TEMPLATE/040-refactor.yml +28 -0
  38. llama.cpp/.github/ISSUE_TEMPLATE/config.yml +11 -0
  39. llama.cpp/.github/labeler.yml +86 -0
  40. llama.cpp/.github/pull_request_template.md +1 -0
  41. llama.cpp/.github/workflows/bench.yml.disabled +315 -0
  42. llama.cpp/.github/workflows/build.yml +1645 -0
  43. llama.cpp/.github/workflows/close-issue.yml +28 -0
  44. llama.cpp/.github/workflows/docker.yml +173 -0
  45. llama.cpp/.github/workflows/editorconfig.yml +29 -0
  46. llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  47. llama.cpp/.github/workflows/labeler.yml +17 -0
  48. llama.cpp/.github/workflows/python-check-requirements.yml +33 -0
  49. llama.cpp/.github/workflows/python-lint.yml +30 -0
  50. llama.cpp/.github/workflows/python-type-check.yml +40 -0
.gitattributes CHANGED
@@ -33,3 +33,139 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ llama.cpp/build/bin/Release/ggml-base.dll filter=lfs diff=lfs merge=lfs -text
37
+ llama.cpp/build/bin/Release/ggml-cpu.dll filter=lfs diff=lfs merge=lfs -text
38
+ llama.cpp/build/bin/Release/llama-batched-bench.exe filter=lfs diff=lfs merge=lfs -text
39
+ llama.cpp/build/bin/Release/llama-batched.exe filter=lfs diff=lfs merge=lfs -text
40
+ llama.cpp/build/bin/Release/llama-bench.exe filter=lfs diff=lfs merge=lfs -text
41
+ llama.cpp/build/bin/Release/llama-cli.exe filter=lfs diff=lfs merge=lfs -text
42
+ llama.cpp/build/bin/Release/llama-convert-llama2c-to-ggml.exe filter=lfs diff=lfs merge=lfs -text
43
+ llama.cpp/build/bin/Release/llama-cvector-generator.exe filter=lfs diff=lfs merge=lfs -text
44
+ llama.cpp/build/bin/Release/llama-embedding.exe filter=lfs diff=lfs merge=lfs -text
45
+ llama.cpp/build/bin/Release/llama-eval-callback.exe filter=lfs diff=lfs merge=lfs -text
46
+ llama.cpp/build/bin/Release/llama-export-lora.exe filter=lfs diff=lfs merge=lfs -text
47
+ llama.cpp/build/bin/Release/llama-gen-docs.exe filter=lfs diff=lfs merge=lfs -text
48
+ llama.cpp/build/bin/Release/llama-gritlm.exe filter=lfs diff=lfs merge=lfs -text
49
+ llama.cpp/build/bin/Release/llama-imatrix.exe filter=lfs diff=lfs merge=lfs -text
50
+ llama.cpp/build/bin/Release/llama-infill.exe filter=lfs diff=lfs merge=lfs -text
51
+ llama.cpp/build/bin/Release/llama-llava-cli.exe filter=lfs diff=lfs merge=lfs -text
52
+ llama.cpp/build/bin/Release/llama-lookahead.exe filter=lfs diff=lfs merge=lfs -text
53
+ llama.cpp/build/bin/Release/llama-lookup-create.exe filter=lfs diff=lfs merge=lfs -text
54
+ llama.cpp/build/bin/Release/llama-lookup-stats.exe filter=lfs diff=lfs merge=lfs -text
55
+ llama.cpp/build/bin/Release/llama-lookup.exe filter=lfs diff=lfs merge=lfs -text
56
+ llama.cpp/build/bin/Release/llama-minicpmv-cli.exe filter=lfs diff=lfs merge=lfs -text
57
+ llama.cpp/build/bin/Release/llama-parallel.exe filter=lfs diff=lfs merge=lfs -text
58
+ llama.cpp/build/bin/Release/llama-passkey.exe filter=lfs diff=lfs merge=lfs -text
59
+ llama.cpp/build/bin/Release/llama-perplexity.exe filter=lfs diff=lfs merge=lfs -text
60
+ llama.cpp/build/bin/Release/llama-quantize.exe filter=lfs diff=lfs merge=lfs -text
61
+ llama.cpp/build/bin/Release/llama-qwen2vl-cli.exe filter=lfs diff=lfs merge=lfs -text
62
+ llama.cpp/build/bin/Release/llama-retrieval.exe filter=lfs diff=lfs merge=lfs -text
63
+ llama.cpp/build/bin/Release/llama-run.exe filter=lfs diff=lfs merge=lfs -text
64
+ llama.cpp/build/bin/Release/llama-save-load-state.exe filter=lfs diff=lfs merge=lfs -text
65
+ llama.cpp/build/bin/Release/llama-server.exe filter=lfs diff=lfs merge=lfs -text
66
+ llama.cpp/build/bin/Release/llama-speculative-simple.exe filter=lfs diff=lfs merge=lfs -text
67
+ llama.cpp/build/bin/Release/llama-speculative.exe filter=lfs diff=lfs merge=lfs -text
68
+ llama.cpp/build/bin/Release/llama-tokenize.exe filter=lfs diff=lfs merge=lfs -text
69
+ llama.cpp/build/bin/Release/llama-tts.exe filter=lfs diff=lfs merge=lfs -text
70
+ llama.cpp/build/bin/Release/llama.dll filter=lfs diff=lfs merge=lfs -text
71
+ llama.cpp/build/bin/Release/llava_shared.dll filter=lfs diff=lfs merge=lfs -text
72
+ llama.cpp/build/bin/Release/test-arg-parser.exe filter=lfs diff=lfs merge=lfs -text
73
+ llama.cpp/build/bin/Release/test-backend-ops.exe filter=lfs diff=lfs merge=lfs -text
74
+ llama.cpp/build/bin/Release/test-chat-template.exe filter=lfs diff=lfs merge=lfs -text
75
+ llama.cpp/build/bin/Release/test-tokenizer-0.exe filter=lfs diff=lfs merge=lfs -text
76
+ llama.cpp/build/common/common.dir/Release/arg.obj filter=lfs diff=lfs merge=lfs -text
77
+ llama.cpp/build/common/common.dir/Release/chat.obj filter=lfs diff=lfs merge=lfs -text
78
+ llama.cpp/build/common/common.dir/Release/common.obj filter=lfs diff=lfs merge=lfs -text
79
+ llama.cpp/build/common/common.dir/Release/json-schema-to-grammar.obj filter=lfs diff=lfs merge=lfs -text
80
+ llama.cpp/build/common/common.dir/Release/ngram-cache.obj filter=lfs diff=lfs merge=lfs -text
81
+ llama.cpp/build/common/common.dir/Release/sampling.obj filter=lfs diff=lfs merge=lfs -text
82
+ llama.cpp/build/common/Release/common.lib filter=lfs diff=lfs merge=lfs -text
83
+ llama.cpp/build/examples/convert-llama2c-to-ggml/llama-convert-llama2c-to-ggml.dir/Release/convert-llama2c-to-ggml.obj filter=lfs diff=lfs merge=lfs -text
84
+ llama.cpp/build/examples/cvector-generator/llama-cvector-generator.dir/Release/cvector-generator.obj filter=lfs diff=lfs merge=lfs -text
85
+ llama.cpp/build/examples/embedding/llama-embedding.dir/Release/embedding.obj filter=lfs diff=lfs merge=lfs -text
86
+ llama.cpp/build/examples/eval-callback/llama-eval-callback.dir/Release/eval-callback.obj filter=lfs diff=lfs merge=lfs -text
87
+ llama.cpp/build/examples/export-lora/llama-export-lora.dir/Release/export-lora.obj filter=lfs diff=lfs merge=lfs -text
88
+ llama.cpp/build/examples/gen-docs/llama-gen-docs.dir/Release/gen-docs.obj filter=lfs diff=lfs merge=lfs -text
89
+ llama.cpp/build/examples/gguf-hash/llama-gguf-hash.dir/Release/gguf-hash.obj filter=lfs diff=lfs merge=lfs -text
90
+ llama.cpp/build/examples/gguf-split/llama-gguf-split.dir/Release/gguf-split.obj filter=lfs diff=lfs merge=lfs -text
91
+ llama.cpp/build/examples/gritlm/llama-gritlm.dir/Release/gritlm.obj filter=lfs diff=lfs merge=lfs -text
92
+ llama.cpp/build/examples/imatrix/llama-imatrix.dir/Release/imatrix.obj filter=lfs diff=lfs merge=lfs -text
93
+ llama.cpp/build/examples/infill/llama-infill.dir/Release/infill.obj filter=lfs diff=lfs merge=lfs -text
94
+ llama.cpp/build/examples/llama-bench/llama-bench.dir/Release/llama-bench.obj filter=lfs diff=lfs merge=lfs -text
95
+ llama.cpp/build/examples/llava/llama-llava-cli.dir/Release/llava-cli.obj filter=lfs diff=lfs merge=lfs -text
96
+ llama.cpp/build/examples/llava/llama-minicpmv-cli.dir/Release/minicpmv-cli.obj filter=lfs diff=lfs merge=lfs -text
97
+ llama.cpp/build/examples/llava/llama-qwen2vl-cli.dir/Release/qwen2vl-cli.obj filter=lfs diff=lfs merge=lfs -text
98
+ llama.cpp/build/examples/llava/llava.dir/Release/clip.obj filter=lfs diff=lfs merge=lfs -text
99
+ llama.cpp/build/examples/llava/llava.dir/Release/llava.lib filter=lfs diff=lfs merge=lfs -text
100
+ llama.cpp/build/examples/llava/Release/llava_static.lib filter=lfs diff=lfs merge=lfs -text
101
+ llama.cpp/build/examples/lookahead/llama-lookahead.dir/Release/lookahead.obj filter=lfs diff=lfs merge=lfs -text
102
+ llama.cpp/build/examples/lookup/llama-lookup-stats.dir/Release/lookup-stats.obj filter=lfs diff=lfs merge=lfs -text
103
+ llama.cpp/build/examples/lookup/llama-lookup.dir/Release/lookup.obj filter=lfs diff=lfs merge=lfs -text
104
+ llama.cpp/build/examples/main/llama-cli.dir/Release/main.obj filter=lfs diff=lfs merge=lfs -text
105
+ llama.cpp/build/examples/parallel/llama-parallel.dir/Release/parallel.obj filter=lfs diff=lfs merge=lfs -text
106
+ llama.cpp/build/examples/passkey/llama-passkey.dir/Release/passkey.obj filter=lfs diff=lfs merge=lfs -text
107
+ llama.cpp/build/examples/perplexity/llama-perplexity.dir/Release/perplexity.obj filter=lfs diff=lfs merge=lfs -text
108
+ llama.cpp/build/examples/quantize/llama-quantize.dir/Release/quantize.obj filter=lfs diff=lfs merge=lfs -text
109
+ llama.cpp/build/examples/retrieval/llama-retrieval.dir/Release/retrieval.obj filter=lfs diff=lfs merge=lfs -text
110
+ llama.cpp/build/examples/run/llama-run.dir/Release/run.obj filter=lfs diff=lfs merge=lfs -text
111
+ llama.cpp/build/examples/save-load-state/llama-save-load-state.dir/Release/save-load-state.obj filter=lfs diff=lfs merge=lfs -text
112
+ llama.cpp/build/examples/server/llama-server.dir/Release/server.obj filter=lfs diff=lfs merge=lfs -text
113
+ llama.cpp/build/examples/speculative/llama-speculative.dir/Release/speculative.obj filter=lfs diff=lfs merge=lfs -text
114
+ llama.cpp/build/examples/tokenize/llama-tokenize.dir/Release/tokenize.obj filter=lfs diff=lfs merge=lfs -text
115
+ llama.cpp/build/examples/tts/llama-tts.dir/Release/tts.obj filter=lfs diff=lfs merge=lfs -text
116
+ llama.cpp/build/ggml/src/ggml-base.dir/Release/ggml-backend.obj filter=lfs diff=lfs merge=lfs -text
117
+ llama.cpp/build/ggml/src/ggml-base.dir/Release/ggml-opt.obj filter=lfs diff=lfs merge=lfs -text
118
+ llama.cpp/build/ggml/src/ggml-base.dir/Release/ggml-quants.obj filter=lfs diff=lfs merge=lfs -text
119
+ llama.cpp/build/ggml/src/ggml-base.dir/Release/ggml.obj filter=lfs diff=lfs merge=lfs -text
120
+ llama.cpp/build/ggml/src/ggml-base.dir/Release/gguf.obj filter=lfs diff=lfs merge=lfs -text
121
+ llama.cpp/build/ggml/src/ggml-cpu.dir/Release/ggml-cpu-aarch64.obj filter=lfs diff=lfs merge=lfs -text
122
+ llama.cpp/build/ggml/src/ggml-cpu.dir/Release/ggml-cpu-quants.obj filter=lfs diff=lfs merge=lfs -text
123
+ llama.cpp/build/ggml/src/ggml-cpu.dir/Release/ggml-cpu/ggml-cpu.c.obj filter=lfs diff=lfs merge=lfs -text
124
+ llama.cpp/build/ggml/src/ggml-cpu.dir/Release/sgemm.obj filter=lfs diff=lfs merge=lfs -text
125
+ llama.cpp/build/ggml/src/ggml.dir/Release/ggml-backend-reg.obj filter=lfs diff=lfs merge=lfs -text
126
+ llama.cpp/build/ggml/src/Release/ggml-base.lib filter=lfs diff=lfs merge=lfs -text
127
+ llama.cpp/build/src/llama.dir/Release/llama-adapter.obj filter=lfs diff=lfs merge=lfs -text
128
+ llama.cpp/build/src/llama.dir/Release/llama-arch.obj filter=lfs diff=lfs merge=lfs -text
129
+ llama.cpp/build/src/llama.dir/Release/llama-batch.obj filter=lfs diff=lfs merge=lfs -text
130
+ llama.cpp/build/src/llama.dir/Release/llama-chat.obj filter=lfs diff=lfs merge=lfs -text
131
+ llama.cpp/build/src/llama.dir/Release/llama-context.obj filter=lfs diff=lfs merge=lfs -text
132
+ llama.cpp/build/src/llama.dir/Release/llama-grammar.obj filter=lfs diff=lfs merge=lfs -text
133
+ llama.cpp/build/src/llama.dir/Release/llama-impl.obj filter=lfs diff=lfs merge=lfs -text
134
+ llama.cpp/build/src/llama.dir/Release/llama-kv-cache.obj filter=lfs diff=lfs merge=lfs -text
135
+ llama.cpp/build/src/llama.dir/Release/llama-model-loader.obj filter=lfs diff=lfs merge=lfs -text
136
+ llama.cpp/build/src/llama.dir/Release/llama-model.obj filter=lfs diff=lfs merge=lfs -text
137
+ llama.cpp/build/src/llama.dir/Release/llama-quant.obj filter=lfs diff=lfs merge=lfs -text
138
+ llama.cpp/build/src/llama.dir/Release/llama-sampling.obj filter=lfs diff=lfs merge=lfs -text
139
+ llama.cpp/build/src/llama.dir/Release/llama-vocab.obj filter=lfs diff=lfs merge=lfs -text
140
+ llama.cpp/build/src/llama.dir/Release/llama.obj filter=lfs diff=lfs merge=lfs -text
141
+ llama.cpp/build/src/llama.dir/Release/unicode-data.obj filter=lfs diff=lfs merge=lfs -text
142
+ llama.cpp/build/src/llama.dir/Release/unicode.obj filter=lfs diff=lfs merge=lfs -text
143
+ llama.cpp/build/tests/test-arg-parser.dir/Release/test-arg-parser.obj filter=lfs diff=lfs merge=lfs -text
144
+ llama.cpp/build/tests/test-backend-ops.dir/Release/test-backend-ops.obj filter=lfs diff=lfs merge=lfs -text
145
+ llama.cpp/build/tests/test-chat-template.dir/Release/test-chat-template.obj filter=lfs diff=lfs merge=lfs -text
146
+ llama.cpp/build/tests/test-gguf.dir/Release/test-gguf.obj filter=lfs diff=lfs merge=lfs -text
147
+ llama.cpp/build/tests/test-quantize-perf.dir/Release/test-quantize-perf.obj filter=lfs diff=lfs merge=lfs -text
148
+ llama.cpp/build/tests/test-tokenizer-0.dir/Release/test-tokenizer-0.obj filter=lfs diff=lfs merge=lfs -text
149
+ llama.cpp/docs/development/llama-star/idea-arch.key filter=lfs diff=lfs merge=lfs -text
150
+ llama.cpp/examples/server/themes/buttons-top/buttons_top.png filter=lfs diff=lfs merge=lfs -text
151
+ llama.cpp/examples/server/themes/wild/llamapattern.png filter=lfs diff=lfs merge=lfs -text
152
+ llama.cpp/examples/server/themes/wild/wild.png filter=lfs diff=lfs merge=lfs -text
153
+ llama.cpp/media/llama0-banner.png filter=lfs diff=lfs merge=lfs -text
154
+ llama.cpp/media/llama0-logo.png filter=lfs diff=lfs merge=lfs -text
155
+ llama.cpp/media/matmul.png filter=lfs diff=lfs merge=lfs -text
156
+ llama.cpp/models/ggml-vocab-aquila.gguf filter=lfs diff=lfs merge=lfs -text
157
+ llama.cpp/models/ggml-vocab-baichuan.gguf filter=lfs diff=lfs merge=lfs -text
158
+ llama.cpp/models/ggml-vocab-bert-bge.gguf filter=lfs diff=lfs merge=lfs -text
159
+ llama.cpp/models/ggml-vocab-command-r.gguf filter=lfs diff=lfs merge=lfs -text
160
+ llama.cpp/models/ggml-vocab-deepseek-coder.gguf filter=lfs diff=lfs merge=lfs -text
161
+ llama.cpp/models/ggml-vocab-deepseek-llm.gguf filter=lfs diff=lfs merge=lfs -text
162
+ llama.cpp/models/ggml-vocab-falcon.gguf filter=lfs diff=lfs merge=lfs -text
163
+ llama.cpp/models/ggml-vocab-gpt-2.gguf filter=lfs diff=lfs merge=lfs -text
164
+ llama.cpp/models/ggml-vocab-gpt-neox.gguf filter=lfs diff=lfs merge=lfs -text
165
+ llama.cpp/models/ggml-vocab-llama-bpe.gguf filter=lfs diff=lfs merge=lfs -text
166
+ llama.cpp/models/ggml-vocab-llama-spm.gguf filter=lfs diff=lfs merge=lfs -text
167
+ llama.cpp/models/ggml-vocab-mpt.gguf filter=lfs diff=lfs merge=lfs -text
168
+ llama.cpp/models/ggml-vocab-phi-3.gguf filter=lfs diff=lfs merge=lfs -text
169
+ llama.cpp/models/ggml-vocab-qwen2.gguf filter=lfs diff=lfs merge=lfs -text
170
+ llama.cpp/models/ggml-vocab-refact.gguf filter=lfs diff=lfs merge=lfs -text
171
+ llama.cpp/models/ggml-vocab-starcoder.gguf filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,7 @@
1
+ FROM python:3.9
2
+ WORKDIR /app
3
+ COPY . /app
4
+ RUN pip install -r requirements.txt
5
+ RUN chmod +x start.sh
6
+ EXPOSE 8000
7
+ CMD ["bash", "start.sh"]
Flask-llama.py ADDED
@@ -0,0 +1,107 @@
1
+ from flask import Flask, request, jsonify
2
+ import requests
3
+ import json
4
+ from typing import List, Dict
5
+ import subprocess
6
+ import time
7
+ from werkzeug.middleware.dispatcher import DispatcherMiddleware
8
+ from werkzeug.wrappers import Response
9
+
10
+ app = Flask(__name__)
11
+
12
+ @app.after_request
13
+ def disable_gzip_compression(response: Response):
14
+ response.headers["Content-Encoding"] = "identity"
15
+ return response
16
+
17
+ # Function to start the LLaMA server
18
+ def start_llama_server():
19
+ # print("Starting llama server")
20
+ # llama_command = [
21
+ # 'llama-server.exe',
22
+ # '-m', './models/Qwen2.5-7B-Instruct-Q4_K_M.gguf',
23
+ # '-c', '2048'
24
+ # ]
25
+ # subprocess.Popen(llama_command)
26
+ # print("Done starting llama server")
27
+ print("Starting llama server")
28
+
29
+ llama_command = [
30
+ "./llama-server", # Adjust for Linux (Hugging Face Spaces)
31
+ "-m", "./models/Qwen2.5-7B-Instruct-Q4_K_M.gguf",
32
+ "-c", "2048"
33
+ ]
34
+
35
+ # Start the LLaMA server and redirect output to logs (use nohup for background execution)
36
+ subprocess.Popen(llama_command, stdout=open("llama.log", "w"), stderr=subprocess.STDOUT)
37
+
38
+ print("Done starting llama server")
39
+
40
+
41
+ # Initialize LLaMA server before starting Flask
42
+ start_llama_server()
43
+
44
+ # Wait for the LLaMA server to fully start up (if needed, adjust time)
45
+ time.sleep(10)
46
+
47
+ # Function to send request to the model server
48
+ def get_response(
49
+ server_url: str,
50
+ messages: List[Dict[str, str]],
51
+ temperature: float = 0.7,
52
+ top_p: float = 0.9,
53
+ max_tokens: int = 4096,
54
+ stream: bool = False, # Disable streaming for testing
55
+ ) -> str:
56
+ headers = {"Content-Type": "application/json"}
57
+ data = {
58
+ "messages": messages,
59
+ "temperature": temperature,
60
+ "top_p": top_p,
61
+ "max_tokens": max_tokens,
62
+ "stream": stream,
63
+ }
64
+
65
+ response = requests.post(f"{server_url}/v1/chat/completions", headers=headers, json=data)
66
+ response.raise_for_status()
67
+
68
+ result = response.json()
69
+ if "choices" in result and len(result["choices"]) > 0:
70
+ return result["choices"][0]["message"]["content"]
71
+ else:
72
+ return ""
73
+
74
+ @app.route("/", methods=["GET"])
75
+ def home():
76
+ response = jsonify({"status": "Flask server is running"})
77
+ response.headers["Content-Encoding"] = "identity" # Disable compression
78
+ return response
79
+
80
+ @app.route("/chat", methods=["POST"])
81
+ def chatbot():
82
+ data = request.json
83
+ user_message = data.get("message", "")
84
+
85
+ if not user_message:
86
+ return jsonify({"error": "No message provided"}), 400
87
+
88
+ # Request LLaMA model for summarization of the test message (hardcoded)
89
+ # test_message = "Summarize: the product is good in overall. just have some drawbacks. it heats up easily and the accuracy is inconsistent. If all these problems are solved, potential order might be 1000 units."
90
+
91
+ # server_url = "http://127.0.0.1:8080"
92
+ server_url = "http://0.0.0.0:8080"
93
+ messages = [{"role": "system", "content": "You are an assistant that helps to solve problems based on the user's input."},
94
+ {"role": "user", "content": user_message}] # Using the hardcoded test message here
95
+
96
+ response_text = get_response(server_url, messages)
97
+
98
+ # Print response in terminal (CMD)
99
+ print(f"Test message: {user_message}")
100
+ print(f"Assistant: {response_text}")
101
+
102
+ return jsonify({"response": response_text})
103
+
104
+ if __name__ == "__main__":
105
+ print("Flask server is running...")
106
+ # app.run(host="127.0.0.1", port=8000, debug=True)
107
+ app.run(host="0.0.0.0", port=7860, debug=True)
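
For reference, a minimal client for the /chat endpoint added above could look like the sketch below. It is illustrative only and not part of the commit; it assumes the Flask app is reachable on port 7860, the port passed to app.run (note the Dockerfile exposes 8000, so the published port may differ depending on how the container is actually run).

import requests

# Hypothetical base URL; adjust host/port to match the actual deployment.
resp = requests.post(
    "http://localhost:7860/chat",
    json={"message": "Summarize: the product works well overall but heats up quickly."},
    timeout=120,  # generation on CPU can be slow
)
resp.raise_for_status()
print(resp.json()["response"])  # the route returns {"response": <model output>}
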
Flask-llama.pyproj ADDED
@@ -0,0 +1,35 @@
1
+ <Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003" ToolsVersion="4.0">
2
+ <PropertyGroup>
3
+ <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
4
+ <SchemaVersion>2.0</SchemaVersion>
5
+ <ProjectGuid>94c76d7d-1ed7-45c4-885a-dc19dcf6fcb0</ProjectGuid>
6
+ <ProjectHome>.</ProjectHome>
7
+ <StartupFile>Flask_llama.py</StartupFile>
8
+ <SearchPath>
9
+ </SearchPath>
10
+ <WorkingDirectory>.</WorkingDirectory>
11
+ <OutputPath>.</OutputPath>
12
+ <Name>Flask-llama</Name>
13
+ <RootNamespace>Flask-llama</RootNamespace>
14
+ </PropertyGroup>
15
+ <PropertyGroup Condition=" '$(Configuration)' == 'Debug' ">
16
+ <DebugSymbols>true</DebugSymbols>
17
+ <EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
18
+ </PropertyGroup>
19
+ <PropertyGroup Condition=" '$(Configuration)' == 'Release' ">
20
+ <DebugSymbols>true</DebugSymbols>
21
+ <EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
22
+ </PropertyGroup>
23
+ <ItemGroup>
24
+ <Compile Include="Flask_llama.py" />
25
+ </ItemGroup>
26
+ <Import Project="$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets" />
27
+ <!-- Uncomment the CoreCompile target to enable the Build command in
28
+ Visual Studio and specify your pre- and post-build commands in
29
+ the BeforeBuild and AfterBuild targets below. -->
30
+ <!--<Target Name="CoreCompile" />-->
31
+ <Target Name="BeforeBuild">
32
+ </Target>
33
+ <Target Name="AfterBuild">
34
+ </Target>
35
+ </Project>
llama.cpp/.clang-format ADDED
@@ -0,0 +1,161 @@
1
+ ---
2
+ Language: Cpp
3
+ AlignAfterOpenBracket: Align
4
+ AlignArrayOfStructures: Left
5
+ AlignConsecutiveAssignments: AcrossComments
6
+ AlignConsecutiveBitFields: AcrossComments
7
+ AlignConsecutiveDeclarations: AcrossComments
8
+ AlignConsecutiveMacros: AcrossComments
9
+ # AlignConsecutiveShortCaseStatements: AcrossComments
10
+ AlignEscapedNewlines: Left # LeftWithLastLine
11
+ AlignOperands: Align
12
+ AlignTrailingComments:
13
+ Kind: Always
14
+ OverEmptyLines: 1
15
+ AllowAllArgumentsOnNextLine: true
16
+ AllowAllParametersOfDeclarationOnNextLine: false
17
+ # AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
18
+ AllowShortBlocksOnASingleLine: Never
19
+ AllowShortCaseLabelsOnASingleLine: false
20
+ AllowShortFunctionsOnASingleLine: Inline
21
+ AllowShortIfStatementsOnASingleLine: Never
22
+ AllowShortLambdasOnASingleLine: Inline
23
+ AllowShortLoopsOnASingleLine: false
24
+ AlwaysBreakBeforeMultilineStrings: true
25
+ BinPackArguments: true
26
+ BinPackParameters: true # OnePerLine
27
+ BitFieldColonSpacing: Both
28
+ BreakBeforeBraces: Custom # Attach
29
+ BraceWrapping:
30
+ AfterCaseLabel: true
31
+ AfterClass: false
32
+ AfterControlStatement: false
33
+ AfterEnum: false
34
+ AfterFunction: false
35
+ AfterNamespace: false
36
+ AfterObjCDeclaration: false
37
+ AfterStruct: false
38
+ AfterUnion: false
39
+ AfterExternBlock: false
40
+ BeforeCatch: false
41
+ BeforeElse: false
42
+ BeforeLambdaBody: false
43
+ BeforeWhile: false
44
+ IndentBraces: false
45
+ SplitEmptyFunction: false
46
+ SplitEmptyRecord: false
47
+ SplitEmptyNamespace: false
48
+ # BreakAdjacentStringLiterals: true
49
+ BreakAfterAttributes: Never
50
+ BreakBeforeBinaryOperators: None
51
+ BreakBeforeInlineASMColon: OnlyMultiline
52
+ BreakBeforeTernaryOperators: false
53
+ # BreakBinaryOperations: Never
54
+ BreakConstructorInitializers: AfterColon
55
+ # BreakFunctionDefinitionParameters: false
56
+ BreakInheritanceList: AfterComma
57
+ BreakStringLiterals: true
58
+ # BreakTemplateDeclarations: Yes
59
+ ColumnLimit: 120
60
+ CommentPragmas: '^ IWYU pragma:'
61
+ CompactNamespaces: false
62
+ ConstructorInitializerIndentWidth: 4
63
+ ContinuationIndentWidth: 4
64
+ Cpp11BracedListStyle: false
65
+ DerivePointerAlignment: false
66
+ DisableFormat: false
67
+ EmptyLineBeforeAccessModifier: Leave
68
+ EmptyLineAfterAccessModifier: Never
69
+ ExperimentalAutoDetectBinPacking: false
70
+ FixNamespaceComments: true
71
+ IncludeBlocks: Regroup
72
+ IncludeCategories:
73
+ - Regex: '^<.*\.h>'
74
+ Priority: 1
75
+ SortPriority: 0
76
+ - Regex: '^<.*'
77
+ Priority: 2
78
+ SortPriority: 0
79
+ - Regex: '.*'
80
+ Priority: 3
81
+ SortPriority: 0
82
+ IncludeIsMainRegex: '([-_](test|unittest))?$'
83
+ IncludeIsMainSourceRegex: ''
84
+ IndentAccessModifiers: false
85
+ IndentCaseBlocks: true
86
+ IndentCaseLabels: true
87
+ IndentExternBlock: NoIndent
88
+ IndentGotoLabels: false
89
+ IndentPPDirectives: AfterHash
90
+ IndentWidth: 4
91
+ IndentWrappedFunctionNames: false
92
+ InsertBraces: true # NOTE: may lead to incorrect formatting
93
+ InsertNewlineAtEOF: true
94
+ JavaScriptQuotes: Leave
95
+ JavaScriptWrapImports: true
96
+ KeepEmptyLinesAtTheStartOfBlocks: false
97
+ LambdaBodyIndentation: Signature
98
+ LineEnding: LF
99
+ MacroBlockBegin: ''
100
+ MacroBlockEnd: ''
101
+ MaxEmptyLinesToKeep: 1
102
+ NamespaceIndentation: None
103
+ ObjCBinPackProtocolList: Auto
104
+ ObjCBlockIndentWidth: 4
105
+ ObjCSpaceAfterProperty: true
106
+ ObjCSpaceBeforeProtocolList: true
107
+ PPIndentWidth: -1
108
+ PackConstructorInitializers: CurrentLine
109
+ PenaltyBreakAssignment: 2
110
+ PenaltyBreakBeforeFirstCallParameter: 1
111
+ PenaltyBreakComment: 300
112
+ PenaltyBreakFirstLessLess: 120
113
+ PenaltyBreakString: 1000
114
+ PenaltyBreakTemplateDeclaration: 10
115
+ PenaltyExcessCharacter: 1000000
116
+ PenaltyReturnTypeOnItsOwnLine: 200
117
+ PointerAlignment: Middle
118
+ QualifierAlignment: Left
119
+ #QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict']
120
+ RawStringFormats:
121
+ - Language: Cpp
122
+ Delimiters:
123
+ - cc
124
+ - CC
125
+ - cpp
126
+ - Cpp
127
+ - CPP
128
+ - 'c++'
129
+ - 'C++'
130
+ CanonicalDelimiter: ''
131
+ ReferenceAlignment: Middle
132
+ ReflowComments: false # IndentOnly
133
+ SeparateDefinitionBlocks: Always
134
+ SortIncludes: CaseInsensitive
135
+ SortUsingDeclarations: LexicographicNumeric
136
+ SpaceAfterCStyleCast: true
137
+ SpaceAfterLogicalNot: false
138
+ SpaceAfterTemplateKeyword: true
139
+ SpaceBeforeAssignmentOperators: true
140
+ SpaceBeforeCpp11BracedList: false
141
+ SpaceBeforeCtorInitializerColon: true
142
+ SpaceBeforeInheritanceColon: true
143
+ SpaceBeforeParens: ControlStatements
144
+ SpaceBeforeRangeBasedForLoopColon: true
145
+ SpaceInEmptyBlock: false
146
+ SpaceInEmptyParentheses: false
147
+ SpacesBeforeTrailingComments: 2
148
+ SpacesInAngles: Never
149
+ SpacesInContainerLiterals: true
150
+ SpacesInLineCommentPrefix:
151
+ Minimum: 1
152
+ Maximum: -1
153
+ SpacesInParentheses: false
154
+ SpacesInSquareBrackets: false
155
+ SpaceBeforeSquareBrackets: false
156
+ Standard: c++17
157
+ TabWidth: 4
158
+ UseTab: Never
159
+ WhitespaceSensitiveMacros: ['STRINGIZE']
160
+ ...
161
+
llama.cpp/.clang-tidy ADDED
@@ -0,0 +1,26 @@
1
+ ---
2
+ Checks: >
3
+ bugprone-*,
4
+ -bugprone-easily-swappable-parameters,
5
+ -bugprone-implicit-widening-of-multiplication-result,
6
+ -bugprone-misplaced-widening-cast,
7
+ -bugprone-narrowing-conversions,
8
+ readability-*,
9
+ -readability-avoid-unconditional-preprocessor-if,
10
+ -readability-function-cognitive-complexity,
11
+ -readability-identifier-length,
12
+ -readability-implicit-bool-conversion,
13
+ -readability-magic-numbers,
14
+ -readability-uppercase-literal-suffix,
15
+ -readability-simplify-boolean-expr,
16
+ clang-analyzer-*,
17
+ -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
18
+ performance-*,
19
+ portability-*,
20
+ -portability-simd-intrinsics,
21
+ misc-*,
22
+ -misc-const-correctness,
23
+ -misc-non-private-member-variables-in-classes,
24
+ -misc-no-recursion,
25
+ -misc-use-anonymous-namespace,
26
+ FormatStyle: none
llama.cpp/.devops/cloud-v-pipeline ADDED
@@ -0,0 +1,22 @@
1
+ node('x86_runner1'){ // Running on x86 runner containing latest vector qemu, latest vector gcc and all the necessary libraries
2
+ stage('Cleanup'){
3
+ cleanWs() // Cleaning previous CI build in workspace
4
+ }
5
+ stage('checkout repo'){
6
+ retry(5){ // Retry if the cloning fails due to some reason
7
+ checkout scm // Clone the repo on Runner
8
+ }
9
+ }
10
+ stage('Compiling llama.cpp'){
11
+ sh'''#!/bin/bash
12
+ make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V
13
+ '''
14
+ }
15
+ stage('Running llama.cpp'){
16
+ sh'''#!/bin/bash
17
+ module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
18
+ qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
19
+ cat llama_log.txt # Printing results
20
+ '''
21
+ }
22
+ }
llama.cpp/.devops/cpu.Dockerfile ADDED
@@ -0,0 +1,92 @@
1
+ ARG UBUNTU_VERSION=22.04
2
+
3
+ FROM ubuntu:$UBUNTU_VERSION AS build
4
+
5
+ ARG TARGETARCH
6
+
7
+ ARG GGML_CPU_ARM_ARCH=armv8-a
8
+
9
+ RUN apt-get update && \
10
+ apt-get install -y build-essential git cmake libcurl4-openssl-dev
11
+
12
+ WORKDIR /app
13
+
14
+ COPY . .
15
+
16
+ RUN if [ "$TARGETARCH" = "amd64" ]; then \
17
+ cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
18
+ elif [ "$TARGETARCH" = "arm64" ]; then \
19
+ cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
20
+ else \
21
+ echo "Unsupported architecture"; \
22
+ exit 1; \
23
+ fi && \
24
+ cmake --build build -j $(nproc)
25
+
26
+ RUN mkdir -p /app/lib && \
27
+ find build -name "*.so" -exec cp {} /app/lib \;
28
+
29
+ RUN mkdir -p /app/full \
30
+ && cp build/bin/* /app/full \
31
+ && cp *.py /app/full \
32
+ && cp -r gguf-py /app/full \
33
+ && cp -r requirements /app/full \
34
+ && cp requirements.txt /app/full \
35
+ && cp .devops/tools.sh /app/full/tools.sh
36
+
37
+ ## Base image
38
+ FROM ubuntu:$UBUNTU_VERSION AS base
39
+
40
+ RUN apt-get update \
41
+ && apt-get install -y libgomp1 curl\
42
+ && apt autoremove -y \
43
+ && apt clean -y \
44
+ && rm -rf /tmp/* /var/tmp/* \
45
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
46
+ && find /var/cache -type f -delete
47
+
48
+ COPY --from=build /app/lib/ /app
49
+
50
+ ### Full
51
+ FROM base AS full
52
+
53
+ COPY --from=build /app/full /app
54
+
55
+ WORKDIR /app
56
+
57
+ RUN apt-get update \
58
+ && apt-get install -y \
59
+ git \
60
+ python3 \
61
+ python3-pip \
62
+ && pip install --upgrade pip setuptools wheel \
63
+ && pip install -r requirements.txt \
64
+ && apt autoremove -y \
65
+ && apt clean -y \
66
+ && rm -rf /tmp/* /var/tmp/* \
67
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
68
+ && find /var/cache -type f -delete
69
+
70
+ ENTRYPOINT ["/app/tools.sh"]
71
+
72
+ ### Light, CLI only
73
+ FROM base AS light
74
+
75
+ COPY --from=build /app/full/llama-cli /app
76
+
77
+ WORKDIR /app
78
+
79
+ ENTRYPOINT [ "/app/llama-cli" ]
80
+
81
+ ### Server, Server only
82
+ FROM base AS server
83
+
84
+ ENV LLAMA_ARG_HOST=0.0.0.0
85
+
86
+ COPY --from=build /app/full/llama-server /app
87
+
88
+ WORKDIR /app
89
+
90
+ HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
91
+
92
+ ENTRYPOINT [ "/app/llama-server" ]
llama.cpp/.devops/cuda.Dockerfile ADDED
@@ -0,0 +1,94 @@
1
+ ARG UBUNTU_VERSION=22.04
2
+ # This needs to generally match the container host's environment.
3
+ ARG CUDA_VERSION=12.6.0
4
+ # Target the CUDA build image
5
+ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
6
+
7
+ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
8
+
9
+ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
10
+
11
+ # CUDA architecture to build for (defaults to all supported archs)
12
+ ARG CUDA_DOCKER_ARCH=default
13
+
14
+ RUN apt-get update && \
15
+ apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
16
+
17
+ WORKDIR /app
18
+
19
+ COPY . .
20
+
21
+ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
22
+ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
23
+ fi && \
24
+ cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
25
+ cmake --build build --config Release -j$(nproc)
26
+
27
+ RUN mkdir -p /app/lib && \
28
+ find build -name "*.so" -exec cp {} /app/lib \;
29
+
30
+ RUN mkdir -p /app/full \
31
+ && cp build/bin/* /app/full \
32
+ && cp *.py /app/full \
33
+ && cp -r gguf-py /app/full \
34
+ && cp -r requirements /app/full \
35
+ && cp requirements.txt /app/full \
36
+ && cp .devops/tools.sh /app/full/tools.sh
37
+
38
+ ## Base image
39
+ FROM ${BASE_CUDA_RUN_CONTAINER} AS base
40
+
41
+ RUN apt-get update \
42
+ && apt-get install -y libgomp1 curl\
43
+ && apt autoremove -y \
44
+ && apt clean -y \
45
+ && rm -rf /tmp/* /var/tmp/* \
46
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
47
+ && find /var/cache -type f -delete
48
+
49
+ COPY --from=build /app/lib/ /app
50
+
51
+ ### Full
52
+ FROM base AS full
53
+
54
+ COPY --from=build /app/full /app
55
+
56
+ WORKDIR /app
57
+
58
+ RUN apt-get update \
59
+ && apt-get install -y \
60
+ git \
61
+ python3 \
62
+ python3-pip \
63
+ && pip install --upgrade pip setuptools wheel \
64
+ && pip install -r requirements.txt \
65
+ && apt autoremove -y \
66
+ && apt clean -y \
67
+ && rm -rf /tmp/* /var/tmp/* \
68
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
69
+ && find /var/cache -type f -delete
70
+
71
+
72
+ ENTRYPOINT ["/app/tools.sh"]
73
+
74
+ ### Light, CLI only
75
+ FROM base AS light
76
+
77
+ COPY --from=build /app/full/llama-cli /app
78
+
79
+ WORKDIR /app
80
+
81
+ ENTRYPOINT [ "/app/llama-cli" ]
82
+
83
+ ### Server, Server only
84
+ FROM base AS server
85
+
86
+ ENV LLAMA_ARG_HOST=0.0.0.0
87
+
88
+ COPY --from=build /app/full/llama-server /app
89
+
90
+ WORKDIR /app
91
+
92
+ HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
93
+
94
+ ENTRYPOINT [ "/app/llama-server" ]
llama.cpp/.devops/intel.Dockerfile ADDED
@@ -0,0 +1,91 @@
1
+ ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
2
+
3
+ ## Build Image
4
+
5
+ FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
6
+
7
+ ARG GGML_SYCL_F16=OFF
8
+ RUN apt-get update && \
9
+ apt-get install -y git libcurl4-openssl-dev
10
+
11
+ WORKDIR /app
12
+
13
+ COPY . .
14
+
15
+ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
16
+ echo "GGML_SYCL_F16 is set" \
17
+ && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
18
+ fi && \
19
+ echo "Building with dynamic libs" && \
20
+ cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
21
+ cmake --build build --config Release -j$(nproc)
22
+
23
+ RUN mkdir -p /app/lib && \
24
+ find build -name "*.so" -exec cp {} /app/lib \;
25
+
26
+ RUN mkdir -p /app/full \
27
+ && cp build/bin/* /app/full \
28
+ && cp *.py /app/full \
29
+ && cp -r gguf-py /app/full \
30
+ && cp -r requirements /app/full \
31
+ && cp requirements.txt /app/full \
32
+ && cp .devops/tools.sh /app/full/tools.sh
33
+
34
+ FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base
35
+
36
+ RUN apt-get update \
37
+ && apt-get install -y libgomp1 curl\
38
+ && apt autoremove -y \
39
+ && apt clean -y \
40
+ && rm -rf /tmp/* /var/tmp/* \
41
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
42
+ && find /var/cache -type f -delete
43
+
44
+ ### Full
45
+ FROM base AS full
46
+
47
+ COPY --from=build /app/lib/ /app
48
+ COPY --from=build /app/full /app
49
+
50
+ WORKDIR /app
51
+
52
+ RUN apt-get update \
53
+ && apt-get install -y \
54
+ git \
55
+ python3 \
56
+ python3-pip \
57
+ && pip install --upgrade pip setuptools wheel \
58
+ && pip install -r requirements.txt \
59
+ && apt autoremove -y \
60
+ && apt clean -y \
61
+ && rm -rf /tmp/* /var/tmp/* \
62
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
63
+ && find /var/cache -type f -delete
64
+
65
+
66
+ ENTRYPOINT ["/app/tools.sh"]
67
+
68
+ ### Light, CLI only
69
+ FROM base AS light
70
+
71
+ COPY --from=build /app/lib/ /app
72
+ COPY --from=build /app/full/llama-cli /app
73
+
74
+ WORKDIR /app
75
+
76
+ ENTRYPOINT [ "/app/llama-cli" ]
77
+
78
+ ### Server, Server only
79
+ FROM base AS server
80
+
81
+ ENV LLAMA_ARG_HOST=0.0.0.0
82
+
83
+ COPY --from=build /app/lib/ /app
84
+ COPY --from=build /app/full/llama-server /app
85
+
86
+ WORKDIR /app
87
+
88
+ HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
89
+
90
+ ENTRYPOINT [ "/app/llama-server" ]
91
+
llama.cpp/.devops/llama-cli-cann.Dockerfile ADDED
@@ -0,0 +1,44 @@
1
+ ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
2
+
3
+ FROM ascendai/cann:$ASCEND_VERSION AS build
4
+
5
+ WORKDIR /app
6
+
7
+ COPY . .
8
+
9
+ RUN yum install -y gcc g++ cmake make
10
+ ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
11
+ ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
12
+ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
13
+ ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
14
+ ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
15
+ ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
16
+ ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
17
+ ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
18
+ ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
19
+
20
+ # find libascend_hal.so, because the drive hasn`t been mounted.
21
+ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
22
+
23
+ RUN echo "Building with static libs" && \
24
+ source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
25
+ cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
26
+ cmake --build build --config Release --target llama-cli
27
+
28
+ # TODO: use image with NNRT
29
+ FROM ascendai/cann:$ASCEND_VERSION AS runtime
30
+ COPY --from=build /app/build/bin/llama-cli /llama-cli
31
+
32
+ ENV LC_ALL=C.utf8
33
+
34
+ ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
35
+ ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
36
+ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
37
+ ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
38
+ ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
39
+ ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
40
+ ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
41
+ ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
42
+ ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
43
+
44
+ ENTRYPOINT ["/llama-cli" ]
llama.cpp/.devops/llama-cpp-cuda.srpm.spec ADDED
@@ -0,0 +1,83 @@
1
+ # SRPM for building from source and packaging an RPM for RPM-based distros.
2
+ # https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
3
+ # Built and maintained by John Boero - boeroboy@gmail.com
4
+ # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
5
+
6
+ # Notes for llama.cpp:
7
+ # 1. Tags are currently based on hash - which will not sort asciibetically.
8
+ # We need to declare standard versioning if people want to sort latest releases.
9
+ # 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
10
+ # 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
11
+ # Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
12
+ # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
13
+ # It is up to the user to install the correct vendor-specific support.
14
+
15
+ Name: llama.cpp-cuda
16
+ Version: %( date "+%%Y%%m%%d" )
17
+ Release: 1%{?dist}
18
+ Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
19
+ License: MIT
20
+ Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
21
+ BuildRequires: coreutils make gcc-c++ git cuda-toolkit
22
+ Requires: cuda-toolkit
23
+ URL: https://github.com/ggerganov/llama.cpp
24
+
25
+ %define debug_package %{nil}
26
+ %define source_date_epoch_from_changelog 0
27
+
28
+ %description
29
+ CPU inference for Meta's Lllama2 models using default options.
30
+
31
+ %prep
32
+ %setup -n llama.cpp-master
33
+
34
+ %build
35
+ make -j GGML_CUDA=1
36
+
37
+ %install
38
+ mkdir -p %{buildroot}%{_bindir}/
39
+ cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
40
+ cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
41
+ cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
42
+
43
+ mkdir -p %{buildroot}/usr/lib/systemd/system
44
+ %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
45
+ [Unit]
46
+ Description=Llama.cpp server, CPU only (no GPU support in this build).
47
+ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
48
+
49
+ [Service]
50
+ Type=simple
51
+ EnvironmentFile=/etc/sysconfig/llama
52
+ ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
53
+ ExecReload=/bin/kill -s HUP $MAINPID
54
+ Restart=never
55
+
56
+ [Install]
57
+ WantedBy=default.target
58
+ EOF
59
+
60
+ mkdir -p %{buildroot}/etc/sysconfig
61
+ %{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
62
+ LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
63
+ EOF
64
+
65
+ %clean
66
+ rm -rf %{buildroot}
67
+ rm -rf %{_builddir}/*
68
+
69
+ %files
70
+ %{_bindir}/llama-cuda-cli
71
+ %{_bindir}/llama-cuda-server
72
+ %{_bindir}/llama-cuda-simple
73
+ /usr/lib/systemd/system/llamacuda.service
74
+ %config /etc/sysconfig/llama
75
+
76
+ %pre
77
+
78
+ %post
79
+
80
+ %preun
81
+ %postun
82
+
83
+ %changelog
llama.cpp/.devops/llama-cpp.srpm.spec ADDED
@@ -0,0 +1,85 @@
1
+ # SRPM for building from source and packaging an RPM for RPM-based distros.
2
+ # https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
3
+ # Built and maintained by John Boero - boeroboy@gmail.com
4
+ # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
5
+
6
+ # Notes for llama.cpp:
7
+ # 1. Tags are currently based on hash - which will not sort asciibetically.
8
+ # We need to declare standard versioning if people want to sort latest releases.
9
+ # In the meantime, YYYYMMDD format will be used.
10
+ # 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
11
+ # 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
12
+ # Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
13
+ # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
14
+ # It is up to the user to install the correct vendor-specific support.
15
+
16
+ Name: llama.cpp
17
+ Version: %( date "+%%Y%%m%%d" )
18
+ Release: 1%{?dist}
19
+ Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
20
+ License: MIT
21
+ Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
22
+ BuildRequires: coreutils make gcc-c++ git libstdc++-devel
23
+ Requires: libstdc++
24
+ URL: https://github.com/ggerganov/llama.cpp
25
+
26
+ %define debug_package %{nil}
27
+ %define source_date_epoch_from_changelog 0
28
+
29
+ %description
30
+ CPU inference for Meta's Lllama2 models using default options.
31
+ Models are not included in this package and must be downloaded separately.
32
+
33
+ %prep
34
+ %setup -n llama.cpp-master
35
+
36
+ %build
37
+ make -j
38
+
39
+ %install
40
+ mkdir -p %{buildroot}%{_bindir}/
41
+ cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
42
+ cp -p llama-server %{buildroot}%{_bindir}/llama-server
43
+ cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
44
+
45
+ mkdir -p %{buildroot}/usr/lib/systemd/system
46
+ %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
47
+ [Unit]
48
+ Description=Llama.cpp server, CPU only (no GPU support in this build).
49
+ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
50
+
51
+ [Service]
52
+ Type=simple
53
+ EnvironmentFile=/etc/sysconfig/llama
54
+ ExecStart=/usr/bin/llama-server $LLAMA_ARGS
55
+ ExecReload=/bin/kill -s HUP $MAINPID
56
+ Restart=never
57
+
58
+ [Install]
59
+ WantedBy=default.target
60
+ EOF
61
+
62
+ mkdir -p %{buildroot}/etc/sysconfig
63
+ %{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
64
+ LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
65
+ EOF
66
+
67
+ %clean
68
+ rm -rf %{buildroot}
69
+ rm -rf %{_builddir}/*
70
+
71
+ %files
72
+ %{_bindir}/llama-cli
73
+ %{_bindir}/llama-server
74
+ %{_bindir}/llama-simple
75
+ /usr/lib/systemd/system/llama.service
76
+ %config /etc/sysconfig/llama
77
+
78
+ %pre
79
+
80
+ %post
81
+
82
+ %preun
83
+ %postun
84
+
85
+ %changelog
llama.cpp/.devops/musa.Dockerfile ADDED
@@ -0,0 +1,108 @@
1
+ ARG UBUNTU_VERSION=22.04
2
+ # This needs to generally match the container host's environment.
3
+ ARG MUSA_VERSION=rc3.1.0
4
+ # Target the MUSA build image
5
+ ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
6
+
7
+ ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
8
+
9
+ FROM ${BASE_MUSA_DEV_CONTAINER} AS build
10
+
11
+ # MUSA architecture to build for (defaults to all supported archs)
12
+ ARG MUSA_DOCKER_ARCH=default
13
+
14
+ RUN apt-get update && \
15
+ apt-get install -y \
16
+ build-essential \
17
+ cmake \
18
+ python3 \
19
+ python3-pip \
20
+ git \
21
+ libcurl4-openssl-dev \
22
+ libgomp1
23
+
24
+ COPY requirements.txt requirements.txt
25
+ COPY requirements requirements
26
+
27
+ RUN pip install --upgrade pip setuptools wheel \
28
+ && pip install -r requirements.txt
29
+
30
+ WORKDIR /app
31
+
32
+ COPY . .
33
+
34
+ # Use the default MUSA archs if not specified
35
+ RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
36
+ export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
37
+ fi && \
38
+ cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
39
+ cmake --build build --config Release -j$(nproc)
40
+
41
+ RUN mkdir -p /app/lib && \
42
+ find build -name "*.so" -exec cp {} /app/lib \;
43
+
44
+ RUN mkdir -p /app/full \
45
+ && cp build/bin/* /app/full \
46
+ && cp *.py /app/full \
47
+ && cp -r gguf-py /app/full \
48
+ && cp -r requirements /app/full \
49
+ && cp requirements.txt /app/full \
50
+ && cp .devops/tools.sh /app/full/tools.sh
51
+
52
+ ## Base image
53
+ FROM ${BASE_MUSA_RUN_CONTAINER} AS base
54
+
55
+ RUN apt-get update \
56
+ && apt-get install -y libgomp1 curl\
57
+ && apt autoremove -y \
58
+ && apt clean -y \
59
+ && rm -rf /tmp/* /var/tmp/* \
60
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
61
+ && find /var/cache -type f -delete
62
+
63
+ COPY --from=build /app/lib/ /app
64
+
65
+ ### Full
66
+ FROM base AS full
67
+
68
+ COPY --from=build /app/full /app
69
+
70
+ WORKDIR /app
71
+
72
+ RUN apt-get update \
73
+ && apt-get install -y \
74
+ git \
75
+ python3 \
76
+ python3-pip \
77
+ && pip install --upgrade pip setuptools wheel \
78
+ && pip install -r requirements.txt \
79
+ && apt autoremove -y \
80
+ && apt clean -y \
81
+ && rm -rf /tmp/* /var/tmp/* \
82
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
83
+ && find /var/cache -type f -delete
84
+
85
+
86
+ ENTRYPOINT ["/app/tools.sh"]
87
+
88
+ ### Light, CLI only
89
+ FROM base AS light
90
+
91
+ COPY --from=build /app/full/llama-cli /app
92
+
93
+ WORKDIR /app
94
+
95
+ ENTRYPOINT [ "/app/llama-cli" ]
96
+
97
+ ### Server, Server only
98
+ FROM base AS server
99
+
100
+ ENV LLAMA_ARG_HOST=0.0.0.0
101
+
102
+ COPY --from=build /app/full/llama-server /app
103
+
104
+ WORKDIR /app
105
+
106
+ HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
107
+
108
+ ENTRYPOINT [ "/app/llama-server" ]
llama.cpp/.devops/nix/apps.nix ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ perSystem =
3
+ { config, lib, ... }:
4
+ {
5
+ apps =
6
+ let
7
+ inherit (config.packages) default;
8
+ binaries = [
9
+ "llama-cli"
10
+ "llama-embedding"
11
+ "llama-server"
12
+ "llama-quantize"
13
+ ];
14
+ mkApp = name: {
15
+ type = "app";
16
+ program = "${default}/bin/${name}";
17
+ };
18
+ in
19
+ lib.genAttrs binaries mkApp;
20
+ };
21
+ }
llama.cpp/.devops/nix/devshells.nix ADDED
@@ -0,0 +1,52 @@
1
+ { inputs, ... }:
2
+
3
+ {
4
+ perSystem =
5
+ {
6
+ config,
7
+ lib,
8
+ system,
9
+ ...
10
+ }:
11
+ {
12
+ devShells =
13
+ let
14
+ pkgs = import inputs.nixpkgs { inherit system; };
15
+ stdenv = pkgs.stdenv;
16
+ scripts = config.packages.python-scripts;
17
+ in
18
+ lib.pipe (config.packages) [
19
+ (lib.concatMapAttrs (
20
+ name: package: {
21
+ ${name} = pkgs.mkShell {
22
+ name = "${name}";
23
+ inputsFrom = [ package ];
24
+ shellHook = ''
25
+ echo "Entering ${name} devShell"
26
+ '';
27
+ };
28
+ "${name}-extra" =
29
+ if (name == "python-scripts") then
30
+ null
31
+ else
32
+ pkgs.mkShell {
33
+ name = "${name}-extra";
34
+ inputsFrom = [
35
+ package
36
+ scripts
37
+ ];
38
+ # Extra packages that *may* be used by some scripts
39
+ packages = [
40
+ pkgs.python3Packages.tiktoken
41
+ ];
42
+ shellHook = ''
43
+ echo "Entering ${name} devShell"
44
+ addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib stdenv.cc.cc}/lib"
45
+ '';
46
+ };
47
+ }
48
+ ))
49
+ (lib.filterAttrs (name: value: value != null))
50
+ ];
51
+ };
52
+ }
llama.cpp/.devops/nix/docker.nix ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ lib,
3
+ dockerTools,
4
+ buildEnv,
5
+ llama-cpp,
6
+ interactive ? true,
7
+ coreutils,
8
+ }:
9
+
10
+ # A tar that can be fed into `docker load`:
11
+ #
12
+ # $ nix build .#llamaPackages.docker
13
+ # $ docker load < result
14
+
15
+ # For details and variations cf.
16
+ # - https://nixos.org/manual/nixpkgs/unstable/#ssec-pkgs-dockerTools-buildLayeredImage
17
+ # - https://discourse.nixos.org/t/a-faster-dockertools-buildimage-prototype/16922
18
+ # - https://nixery.dev/
19
+
20
+ # Approximate (compressed) sizes, at the time of writing, are:
21
+ #
22
+ # .#llamaPackages.docker: 125M;
23
+ # .#llamaPackagesCuda.docker: 537M;
24
+ # .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M.
25
+
26
+ dockerTools.buildLayeredImage {
27
+ name = llama-cpp.pname;
28
+ tag = "latest";
29
+
30
+ contents =
31
+ [ llama-cpp ]
32
+ ++ lib.optionals interactive [
33
+ coreutils
34
+ dockerTools.binSh
35
+ dockerTools.caCertificates
36
+ ];
37
+ }
llama.cpp/.devops/nix/jetson-support.nix ADDED
@@ -0,0 +1,39 @@
1
+ { inputs, ... }:
2
+ {
3
+ perSystem =
4
+ {
5
+ config,
6
+ system,
7
+ lib,
8
+ pkgsCuda,
9
+ ...
10
+ }:
11
+ {
12
+ legacyPackages =
13
+ let
14
+ caps.llamaPackagesXavier = "7.2";
15
+ caps.llamaPackagesOrin = "8.7";
16
+ caps.llamaPackagesTX2 = "6.2";
17
+ caps.llamaPackagesNano = "5.3";
18
+
19
+ pkgsFor =
20
+ cap:
21
+ import inputs.nixpkgs {
22
+ inherit system;
23
+ config = {
24
+ cudaSupport = true;
25
+ cudaCapabilities = [ cap ];
26
+ cudaEnableForwardCompat = false;
27
+ inherit (pkgsCuda.config) allowUnfreePredicate;
28
+ };
29
+ };
30
+ in
31
+ builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps;
32
+
33
+ packages = lib.optionalAttrs (system == "aarch64-linux") {
34
+ jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp;
35
+ jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp;
36
+ jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp;
37
+ };
38
+ };
39
+ }
llama.cpp/.devops/nix/nixpkgs-instances.nix ADDED
@@ -0,0 +1,45 @@
1
+ { inputs, ... }:
2
+ {
3
+ # The _module.args definitions are passed on to modules as arguments. E.g.
4
+ # the module `{ pkgs ... }: { /* config */ }` implicitly uses
5
+ # `_module.args.pkgs` (defined in this case by flake-parts).
6
+ perSystem =
7
+ { system, ... }:
8
+ {
9
+ _module.args = {
10
+ # Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
11
+ # again, the below creates several nixpkgs instances which the
12
+ # flake-centric CLI will be forced to evaluate e.g. on `nix flake show`.
13
+ #
14
+ # This is currently "slow" and "expensive", on a certain scale.
15
+ # This also isn't "right" in that this hinders dependency injection at
16
+ # the level of flake inputs. This might get removed in the foreseeable
17
+ # future.
18
+ #
19
+ # Note that you can use these expressions without Nix
20
+ # (`pkgs.callPackage ./devops/nix/scope.nix { }` is the entry point).
21
+
22
+ pkgsCuda = import inputs.nixpkgs {
23
+ inherit system;
24
+ # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,
25
+ # and ucx are built with CUDA support)
26
+ config.cudaSupport = true;
27
+ config.allowUnfreePredicate =
28
+ p:
29
+ builtins.all (
30
+ license:
31
+ license.free
32
+ || builtins.elem license.shortName [
33
+ "CUDA EULA"
34
+ "cuDNN EULA"
35
+ ]
36
+ ) (p.meta.licenses or [ p.meta.license ]);
37
+ };
38
+ # Ensure dependencies use ROCm consistently
39
+ pkgsRocm = import inputs.nixpkgs {
40
+ inherit system;
41
+ config.rocmSupport = true;
42
+ };
43
+ };
44
+ };
45
+ }
llama.cpp/.devops/nix/package-gguf-py.nix ADDED
@@ -0,0 +1,36 @@
+ {
+ lib,
+ llamaVersion,
+ numpy,
+ tqdm,
+ sentencepiece,
+ pyyaml,
+ poetry-core,
+ buildPythonPackage,
+ pytestCheckHook,
+ }:
+
+ buildPythonPackage {
+ pname = "gguf";
+ version = llamaVersion;
+ pyproject = true;
+ nativeBuildInputs = [ poetry-core ];
+ propagatedBuildInputs = [
+ numpy
+ tqdm
+ sentencepiece
+ pyyaml
+ ];
+ src = lib.cleanSource ../../gguf-py;
+ pythonImportsCheck = [
+ "numpy"
+ "gguf"
+ ];
+ nativeCheckInputs = [ pytestCheckHook ];
+ doCheck = true;
+ meta = with lib; {
+ description = "Python package for writing binary files in the GGUF format";
+ license = licenses.mit;
+ maintainers = [ maintainers.ditsuke ];
+ };
+ }
llama.cpp/.devops/nix/package.nix ADDED
@@ -0,0 +1,247 @@
1
+ {
2
+ lib,
3
+ glibc,
4
+ config,
5
+ stdenv,
6
+ runCommand,
7
+ cmake,
8
+ ninja,
9
+ pkg-config,
10
+ git,
11
+ mpi,
12
+ blas,
13
+ cudaPackages,
14
+ autoAddDriverRunpath,
15
+ darwin,
16
+ rocmPackages,
17
+ vulkan-headers,
18
+ vulkan-loader,
19
+ curl,
20
+ shaderc,
21
+ useBlas ?
22
+ builtins.all (x: !x) [
23
+ useCuda
24
+ useMetalKit
25
+ useRocm
26
+ useVulkan
27
+ ]
28
+ && blas.meta.available,
29
+ useCuda ? config.cudaSupport,
30
+ useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
31
+ # Increases the runtime closure size by ~700M
32
+ useMpi ? false,
33
+ useRocm ? config.rocmSupport,
34
+ rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
35
+ enableCurl ? true,
36
+ useVulkan ? false,
37
+ llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
38
+
39
+ # It's necessary to consistently use backendStdenv when building with CUDA support,
40
+ # otherwise we get libstdc++ errors downstream.
41
+ effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
42
+ enableStatic ? effectiveStdenv.hostPlatform.isStatic,
43
+ precompileMetalShaders ? false,
44
+ }:
45
+
46
+ let
47
+ inherit (lib)
48
+ cmakeBool
49
+ cmakeFeature
50
+ optionals
51
+ strings
52
+ ;
53
+
54
+ stdenv = throw "Use effectiveStdenv instead";
55
+
56
+ suffices =
57
+ lib.optionals useBlas [ "BLAS" ]
58
+ ++ lib.optionals useCuda [ "CUDA" ]
59
+ ++ lib.optionals useMetalKit [ "MetalKit" ]
60
+ ++ lib.optionals useMpi [ "MPI" ]
61
+ ++ lib.optionals useRocm [ "ROCm" ]
62
+ ++ lib.optionals useVulkan [ "Vulkan" ];
63
+
64
+ pnameSuffix =
65
+ strings.optionalString (suffices != [ ])
66
+ "-${strings.concatMapStringsSep "-" strings.toLower suffices}";
67
+ descriptionSuffix = strings.optionalString (
68
+ suffices != [ ]
69
+ ) ", accelerated with ${strings.concatStringsSep ", " suffices}";
70
+
71
+ xcrunHost = runCommand "xcrunHost" { } ''
72
+ mkdir -p $out/bin
73
+ ln -s /usr/bin/xcrun $out/bin
74
+ '';
75
+
76
+ # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
77
+ # separately
78
+ darwinBuildInputs =
79
+ with darwin.apple_sdk.frameworks;
80
+ [
81
+ Accelerate
82
+ CoreVideo
83
+ CoreGraphics
84
+ ]
85
+ ++ optionals useMetalKit [ MetalKit ];
86
+
87
+ cudaBuildInputs = with cudaPackages; [
88
+ cuda_cudart
89
+ cuda_cccl # <nv/target>
90
+ libcublas
91
+ ];
92
+
93
+ rocmBuildInputs = with rocmPackages; [
94
+ clr
95
+ hipblas
96
+ rocblas
97
+ ];
98
+
99
+ vulkanBuildInputs = [
100
+ vulkan-headers
101
+ vulkan-loader
102
+ shaderc
103
+ ];
104
+ in
105
+
106
+ effectiveStdenv.mkDerivation (finalAttrs: {
107
+ pname = "llama-cpp${pnameSuffix}";
108
+ version = llamaVersion;
109
+
110
+ # Note: none of the files discarded here are visible in the sandbox or
111
+ # affect the output hash. This also means they can be modified without
112
+ # triggering a rebuild.
113
+ src = lib.cleanSourceWith {
114
+ filter =
115
+ name: type:
116
+ let
117
+ noneOf = builtins.all (x: !x);
118
+ baseName = baseNameOf name;
119
+ in
120
+ noneOf [
121
+ (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
122
+ (lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths
123
+ (lib.hasPrefix "." baseName) # Skip hidden files and directories
124
+ (baseName == "flake.lock")
125
+ ];
126
+ src = lib.cleanSource ../../.;
127
+ };
128
+
129
+ postPatch = ''
130
+ substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
131
+ --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
132
+ substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
133
+ --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
134
+ '';
135
+
136
+ # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
137
+ # `default.metallib` may be compiled with Metal compiler from XCode
138
+ # and we need to escape sandbox on MacOS to access Metal compiler.
139
+ # `xcrun` is used find the path of the Metal compiler, which is varible
140
+ # and not on $PATH
141
+ # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
142
+ __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
143
+
144
+ nativeBuildInputs =
145
+ [
146
+ cmake
147
+ ninja
148
+ pkg-config
149
+ git
150
+ ]
151
+ ++ optionals useCuda [
152
+ cudaPackages.cuda_nvcc
153
+
154
+ autoAddDriverRunpath
155
+ ]
156
+ ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ glibc.static ]
157
+ ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ xcrunHost ];
158
+
159
+ buildInputs =
160
+ optionals effectiveStdenv.isDarwin darwinBuildInputs
161
+ ++ optionals useCuda cudaBuildInputs
162
+ ++ optionals useMpi [ mpi ]
163
+ ++ optionals useRocm rocmBuildInputs
164
+ ++ optionals useBlas [ blas ]
165
+ ++ optionals useVulkan vulkanBuildInputs
166
+ ++ optionals enableCurl [ curl ];
167
+
168
+ cmakeFlags =
169
+ [
170
+ (cmakeBool "LLAMA_BUILD_SERVER" true)
171
+ (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
172
+ (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
173
+ (cmakeBool "LLAMA_CURL" enableCurl)
174
+ (cmakeBool "GGML_NATIVE" false)
175
+ (cmakeBool "GGML_BLAS" useBlas)
176
+ (cmakeBool "GGML_CUDA" useCuda)
177
+ (cmakeBool "GGML_HIP" useRocm)
178
+ (cmakeBool "GGML_METAL" useMetalKit)
179
+ (cmakeBool "GGML_VULKAN" useVulkan)
180
+ (cmakeBool "GGML_STATIC" enableStatic)
181
+ ]
182
+ ++ optionals useCuda [
183
+ (
184
+ with cudaPackages.flags;
185
+ cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
186
+ builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
187
+ )
188
+ )
189
+ ]
190
+ ++ optionals useRocm [
191
+ (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
192
+ (cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets)
193
+ ]
194
+ ++ optionals useMetalKit [
195
+ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
196
+ (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
197
+ ];
198
+
199
+ # Environment variables needed for ROCm
200
+ env = optionals useRocm {
201
+ ROCM_PATH = "${rocmPackages.clr}";
202
+ HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
203
+ };
204
+
205
+ # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
206
+ # if they haven't been added yet.
207
+ postInstall = ''
208
+ mkdir -p $out/include
209
+ cp $src/include/llama.h $out/include/
210
+ '';
211
+
212
+ meta = {
213
+ # Configurations we don't want even the CI to evaluate. Results in the
214
+ # "unsupported platform" messages. This is mostly a no-op, because
215
+ # cudaPackages would've refused to evaluate anyway.
216
+ badPlatforms = optionals useCuda lib.platforms.darwin;
217
+
218
+ # Configurations that are known to result in build failures. Can be
219
+ # overridden by importing Nixpkgs with `allowBroken = true`.
220
+ broken = (useMetalKit && !effectiveStdenv.isDarwin);
221
+
222
+ description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
223
+ homepage = "https://github.com/ggerganov/llama.cpp/";
224
+ license = lib.licenses.mit;
225
+
226
+ # Accommodates `nix run` and `lib.getExe`
227
+ mainProgram = "llama-cli";
228
+
229
+ # These people might respond, on the best effort basis, if you ping them
230
+ # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
231
+ # Consider adding yourself to this list if you want to ensure this flake
232
+ # stays maintained and you're willing to invest your time. Do not add
233
+ # other people without their consent. Consider removing people after
234
+ # they've been unreachable for long periods of time.
235
+
236
+ # Note that lib.maintainers is defined in Nixpkgs, but you may just add
237
+ # an attrset following the same format as in
238
+ # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
239
+ maintainers = with lib.maintainers; [
240
+ philiptaron
241
+ SomeoneSerge
242
+ ];
243
+
244
+ # Extend `badPlatforms` instead
245
+ platforms = lib.platforms.all;
246
+ };
247
+ })
llama.cpp/.devops/nix/python-scripts.nix ADDED
@@ -0,0 +1,66 @@
1
+ {
2
+ lib,
3
+ stdenv,
4
+ buildPythonPackage,
5
+ poetry-core,
6
+ mkShell,
7
+ python3Packages,
8
+ gguf-py,
9
+ }@inputs:
10
+
11
+ let
12
+ llama-python-deps = with python3Packages; [
13
+ numpy
14
+ sentencepiece
15
+ transformers
16
+ protobuf
17
+ torchWithoutCuda
18
+ gguf-py
19
+ tqdm
20
+
21
+ # for scripts/compare-llama-bench.py
22
+ gitpython
23
+ tabulate
24
+
25
+ # for examples/pydantic-models-to-grammar-examples.py
26
+ docstring-parser
27
+ pydantic
28
+
29
+ ];
30
+
31
+ llama-python-test-deps = with python3Packages; [
32
+ # Server bench
33
+ matplotlib
34
+
35
+ # server tests
36
+ openai
37
+ pytest
38
+ prometheus-client
39
+ ];
40
+ in
41
+
42
+ buildPythonPackage ({
43
+ pname = "llama-scripts";
44
+ version = "0.0.0";
45
+ pyproject = true;
46
+
47
+ # NOTE: The files filtered out here are not visible in the build sandbox, neither
48
+ # do they affect the output hash. They can be modified without triggering a rebuild.
49
+ src = lib.cleanSourceWith {
50
+ filter =
51
+ name: type:
52
+ let
53
+ any = builtins.any (x: x);
54
+ baseName = builtins.baseNameOf name;
55
+ in
56
+ any [
57
+ (lib.hasSuffix ".py" name)
58
+ (baseName == "README.md")
59
+ (baseName == "pyproject.toml")
60
+ ];
61
+ src = lib.cleanSource ../../.;
62
+ };
63
+ nativeBuildInputs = [ poetry-core ];
64
+ nativeCheckInputs = llama-python-test-deps;
65
+ dependencies = llama-python-deps;
66
+ })
llama.cpp/.devops/nix/scope.nix ADDED
@@ -0,0 +1,41 @@
+ {
+ lib,
+ newScope,
+ python3,
+ llamaVersion ? "0.0.0",
+ }:
+
+ let
+ pythonPackages = python3.pkgs;
+ buildPythonPackage = pythonPackages.buildPythonPackage;
+ numpy = pythonPackages.numpy;
+ tqdm = pythonPackages.tqdm;
+ sentencepiece = pythonPackages.sentencepiece;
+ pyyaml = pythonPackages.pyyaml;
+ poetry-core = pythonPackages.poetry-core;
+ pytestCheckHook = pythonPackages.pytestCheckHook;
+ in
+
+ # We're using `makeScope` instead of just writing out an attrset
+ # because it allows users to apply overlays later using `overrideScope'`.
+ # Cf. https://noogle.dev/f/lib/makeScope
+
+ lib.makeScope newScope (self: {
+ inherit llamaVersion;
+ gguf-py = self.callPackage ./package-gguf-py.nix {
+ inherit
+ buildPythonPackage
+ numpy
+ tqdm
+ sentencepiece
+ poetry-core
+ pyyaml
+ pytestCheckHook
+ ;
+ };
+ python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
+ llama-cpp = self.callPackage ./package.nix { };
+ docker = self.callPackage ./docker.nix { };
+ docker-min = self.callPackage ./docker.nix { interactive = false; };
+ sif = self.callPackage ./sif.nix { };
+ })
llama.cpp/.devops/nix/sif.nix ADDED
@@ -0,0 +1,27 @@
+ {
+ lib,
+ singularity-tools,
+ llama-cpp,
+ bashInteractive,
+ interactive ? false,
+ }:
+
+ let
+ optionalInt = cond: x: if cond then x else 0;
+ in
+ singularity-tools.buildImage rec {
+ inherit (llama-cpp) name;
+ contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ];
+
+ # These are excessive (but safe) for most variants. Building singularity
+ # images requires superuser privileges, so we build them inside a VM in a
+ # writable image of pre-determined size.
+ #
+ # ROCm is currently affected by https://github.com/NixOS/nixpkgs/issues/276846
+ #
+ # Expected image sizes:
+ # - cpu/blas: 150M,
+ # - cuda, all gencodes: 560M,
+ diskSize = 4096 + optionalInt llama-cpp.useRocm 16384;
+ memSize = diskSize;
+ }
llama.cpp/.devops/rocm.Dockerfile ADDED
@@ -0,0 +1,113 @@
1
+ ARG UBUNTU_VERSION=24.04
2
+
3
+ # This needs to generally match the container host's environment.
4
+ ARG ROCM_VERSION=6.3
5
+ ARG AMDGPU_VERSION=6.3
6
+
7
+ # Target the ROCm build image
8
+ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
9
+
10
+ ### Build image
11
+ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
12
+
13
+ # Unless otherwise specified, we make a fat build.
14
+ # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
15
+ # This is mostly tied to rocBLAS supported archs.
16
+ # gfx803, gfx900, gfx1032, gfx1101, gfx1102: not officially supported
17
+ # gfx906 is deprecated
18
+ # check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html
19
+
20
+ #ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
21
+ ARG ROCM_DOCKER_ARCH=gfx1100
22
+
23
+ # Set ROCm GPU architectures
24
+ ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
25
+ # Enable ROCm
26
+ # ENV CC=/opt/rocm/llvm/bin/clang
27
+ # ENV CXX=/opt/rocm/llvm/bin/clang++
28
+
29
+ RUN apt-get update \
30
+ && apt-get install -y \
31
+ build-essential \
32
+ cmake \
33
+ git \
34
+ libcurl4-openssl-dev \
35
+ curl \
36
+ libgomp1
37
+
38
+ WORKDIR /app
39
+
40
+ COPY . .
41
+
42
+ RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
43
+ cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
44
+ && cmake --build build --config Release -j$(nproc)
45
+
46
+ RUN mkdir -p /app/lib \
47
+ && find build -name "*.so" -exec cp {} /app/lib \;
48
+
49
+ RUN mkdir -p /app/full \
50
+ && cp build/bin/* /app/full \
51
+ && cp *.py /app/full \
52
+ && cp -r gguf-py /app/full \
53
+ && cp -r requirements /app/full \
54
+ && cp requirements.txt /app/full \
55
+ && cp .devops/tools.sh /app/full/tools.sh
56
+
57
+ ## Base image
58
+ FROM ${BASE_ROCM_DEV_CONTAINER} AS base
59
+
60
+ RUN apt-get update \
61
+ && apt-get install -y libgomp1 curl\
62
+ && apt autoremove -y \
63
+ && apt clean -y \
64
+ && rm -rf /tmp/* /var/tmp/* \
65
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
66
+ && find /var/cache -type f -delete
67
+
68
+ COPY --from=build /app/lib/ /app
69
+
70
+ ### Full
71
+ FROM base AS full
72
+
73
+ COPY --from=build /app/full /app
74
+
75
+ WORKDIR /app
76
+
77
+ RUN apt-get update \
78
+ && apt-get install -y \
79
+ git \
80
+ python3-pip \
81
+ python3 \
82
+ python3-wheel\
83
+ && pip install --break-system-packages --upgrade setuptools \
84
+ && pip install --break-system-packages -r requirements.txt \
85
+ && apt autoremove -y \
86
+ && apt clean -y \
87
+ && rm -rf /tmp/* /var/tmp/* \
88
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
89
+ && find /var/cache -type f -delete
90
+
91
+ ENTRYPOINT ["/app/tools.sh"]
92
+
93
+ ### Light, CLI only
94
+ FROM base AS light
95
+
96
+ COPY --from=build /app/full/llama-cli /app
97
+
98
+ WORKDIR /app
99
+
100
+ ENTRYPOINT [ "/app/llama-cli" ]
101
+
102
+ ### Server, Server only
103
+ FROM base AS server
104
+
105
+ ENV LLAMA_ARG_HOST=0.0.0.0
106
+
107
+ COPY --from=build /app/full/llama-server /app
108
+
109
+ WORKDIR /app
110
+
111
+ HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
112
+
113
+ ENTRYPOINT [ "/app/llama-server" ]
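A minimal sketch of building and running the `server` stage of this Dockerfile follows; the image tag and model path are placeholders, and the `--device` flags are the usual ones for exposing an AMD GPU to a container (not something the Dockerfile itself dictates):

```sh
# Build only the server stage; the tag name is arbitrary.
docker build -f .devops/rocm.Dockerfile --target server -t llama-rocm-server .

# Run it with the host's ROCm devices passed through. LLAMA_ARG_HOST is already
# set to 0.0.0.0 in the image, so only the port mapping is needed.
docker run --rm -it \
    --device /dev/kfd --device /dev/dri \
    -v /path/to/models:/models -p 8080:8080 \
    llama-rocm-server -m /models/model.gguf
```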
llama.cpp/.devops/tools.sh ADDED
@@ -0,0 +1,49 @@
+ #!/bin/bash
+ set -e
+
+ # Read the first argument into a variable
+ arg1="$1"
+
+ # Shift the arguments to remove the first one
+ shift
+
+ if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
+ exec python3 ./convert_hf_to_gguf.py "$@"
+ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
+ exec ./llama-quantize "$@"
+ elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
+ exec ./llama-cli "$@"
+ elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
+ exec ./llama-bench "$@"
+ elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
+ exec ./llama-perplexity "$@"
+ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
+ echo "Converting PTH to GGML..."
+ for i in $(ls $1/$2/ggml-model-f16.bin*); do
+ if [ -f "${i/f16/q4_0}" ]; then
+ echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
+ else
+ echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
+ exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
+ fi
+ done
+ elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
+ exec ./llama-server "$@"
+ else
+ echo "Unknown command: $arg1"
+ echo "Available commands: "
+ echo " --run (-r): Run a model previously converted into ggml"
+ echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
+ echo " --bench (-b): Benchmark the performance of the inference for various parameters."
+ echo " ex: -m model.gguf"
+ echo " --perplexity (-p): Measure the perplexity of a model over a given text."
+ echo " ex: -m model.gguf -f file.txt"
+ echo " --convert (-c): Convert a llama model into ggml"
+ echo " ex: --outtype f16 \"/models/7B/\" "
+ echo " --quantize (-q): Optimize with quantization process ggml"
+ echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
+ echo " --all-in-one (-a): Execute --convert & --quantize"
+ echo " ex: \"/models/\" 7B"
+ echo " --server (-s): Run a model on the server"
+ echo " ex: -m /models/7B/ggml-model-q4_0.bin -c 2048 -ngl 43 -mg 1 --port 8080"
+ fi
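Since this script is the ENTRYPOINT of the `full` image variants built by the Dockerfiles in this directory, its subcommands are normally reached through `docker run`. A hedged example, with image name and paths as placeholders:

```sh
# Convert a Hugging Face checkout to GGUF via convert_hf_to_gguf.py (--convert).
docker run --rm -v /path/to/models:/models llama-full \
    --convert --outtype f16 /models/My-Model/

# Quantize the converted model with llama-quantize (--quantize).
docker run --rm -v /path/to/models:/models llama-full \
    --quantize /models/My-Model/ggml-model-f16.gguf /models/My-Model/ggml-model-q4_0.gguf q4_0
```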
llama.cpp/.devops/vulkan.Dockerfile ADDED
@@ -0,0 +1,89 @@
1
+ ARG UBUNTU_VERSION=24.04
2
+
3
+ FROM ubuntu:$UBUNTU_VERSION AS build
4
+
5
+ # Install build tools
6
+ RUN apt update && apt install -y git build-essential cmake wget
7
+
8
+ # Install Vulkan SDK and cURL
9
+ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
10
+ wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
11
+ apt update -y && \
12
+ apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
13
+
14
+ # Build it
15
+ WORKDIR /app
16
+
17
+ COPY . .
18
+
19
+ RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
20
+ cmake --build build --config Release -j$(nproc)
21
+
22
+ RUN mkdir -p /app/lib && \
23
+ find build -name "*.so" -exec cp {} /app/lib \;
24
+
25
+ RUN mkdir -p /app/full \
26
+ && cp build/bin/* /app/full \
27
+ && cp *.py /app/full \
28
+ && cp -r gguf-py /app/full \
29
+ && cp -r requirements /app/full \
30
+ && cp requirements.txt /app/full \
31
+ && cp .devops/tools.sh /app/full/tools.sh
32
+
33
+ ## Base image
34
+ FROM ubuntu:$UBUNTU_VERSION AS base
35
+
36
+ RUN apt-get update \
37
+ && apt-get install -y libgomp1 curl libvulkan-dev \
38
+ && apt autoremove -y \
39
+ && apt clean -y \
40
+ && rm -rf /tmp/* /var/tmp/* \
41
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
42
+ && find /var/cache -type f -delete
43
+
44
+ COPY --from=build /app/lib/ /app
45
+
46
+ ### Full
47
+ FROM base AS full
48
+
49
+ COPY --from=build /app/full /app
50
+
51
+ WORKDIR /app
52
+
53
+ RUN apt-get update \
54
+ && apt-get install -y \
55
+ git \
56
+ python3 \
57
+ python3-pip \
58
+ python3-wheel \
59
+ && pip install --break-system-packages --upgrade setuptools \
60
+ && pip install --break-system-packages -r requirements.txt \
61
+ && apt autoremove -y \
62
+ && apt clean -y \
63
+ && rm -rf /tmp/* /var/tmp/* \
64
+ && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
65
+ && find /var/cache -type f -delete
66
+
67
+ ENTRYPOINT ["/app/tools.sh"]
68
+
69
+ ### Light, CLI only
70
+ FROM base AS light
71
+
72
+ COPY --from=build /app/full/llama-cli /app
73
+
74
+ WORKDIR /app
75
+
76
+ ENTRYPOINT [ "/app/llama-cli" ]
77
+
78
+ ### Server, Server only
79
+ FROM base AS server
80
+
81
+ ENV LLAMA_ARG_HOST=0.0.0.0
82
+
83
+ COPY --from=build /app/full/llama-server /app
84
+
85
+ WORKDIR /app
86
+
87
+ HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
88
+
89
+ ENTRYPOINT [ "/app/llama-server" ]
llama.cpp/.dockerignore ADDED
@@ -0,0 +1,20 @@
+ *.o
+ *.a
+ .cache/
+ # Do not ignore .git directory, otherwise the reported build number will always be 0
+ .github/
+ .gitignore
+ .vs/
+ .vscode/
+ .DS_Store
+
+ build*/
+
+ models/*
+
+ /llama-cli
+ /llama-quantize
+
+ arm_neon.h
+ compile_commands.json
+ Dockerfile
llama.cpp/.ecrc ADDED
@@ -0,0 +1,6 @@
+ {
+ "Exclude": ["^\\.gitmodules$", "stb_image\\.h"],
+ "Disable": {
+ "IndentSize": true
+ }
+ }
llama.cpp/.editorconfig ADDED
@@ -0,0 +1,50 @@
1
+ # https://EditorConfig.org
2
+
3
+ # Top-most EditorConfig file
4
+ root = true
5
+
6
+ # Unix-style newlines with a newline ending every file, utf-8 charset
7
+ [*]
8
+ end_of_line = lf
9
+ insert_final_newline = true
10
+ trim_trailing_whitespace = true
11
+ charset = utf-8
12
+ indent_style = space
13
+ indent_size = 4
14
+
15
+ [Makefile]
16
+ indent_style = tab
17
+
18
+ [scripts/*.mk]
19
+ indent_style = tab
20
+
21
+ [prompts/*.txt]
22
+ insert_final_newline = unset
23
+
24
+ [examples/server/public/*]
25
+ indent_size = 2
26
+
27
+ [examples/server/public/deps_*]
28
+ trim_trailing_whitespace = unset
29
+ indent_style = unset
30
+ indent_size = unset
31
+
32
+ [examples/server/deps_*]
33
+ trim_trailing_whitespace = unset
34
+ indent_style = unset
35
+ indent_size = unset
36
+
37
+ [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
38
+ indent_style = tab
39
+
40
+ [examples/cvector-generator/*.txt]
41
+ trim_trailing_whitespace = unset
42
+ insert_final_newline = unset
43
+
44
+ [models/templates/*.jinja]
45
+ indent_style = unset
46
+ indent_size = unset
47
+ end_of_line = unset
48
+ charset = unset
49
+ trim_trailing_whitespace = unset
50
+ insert_final_newline = unset
llama.cpp/.flake8 ADDED
@@ -0,0 +1,17 @@
+ [flake8]
+ max-line-length = 125
+ ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
+ exclude =
+ # Do not traverse examples
+ examples,
+ # Do not include package initializers
+ __init__.py,
+ # No need to traverse our git directory
+ .git,
+ # There's no value in checking cache directories
+ __pycache__,
+ # No need to include the build path
+ build,
+ # This contains builds that we don't want to check
+ dist # This is generated with `python build .` for package releases
+ # max-complexity = 10
llama.cpp/.github/ISSUE_TEMPLATE/010-bug-compilation.yml ADDED
@@ -0,0 +1,87 @@
1
+ name: Bug (compilation)
2
+ description: Something goes wrong when trying to compile llama.cpp.
3
+ title: "Compile bug: "
4
+ labels: ["bug-unconfirmed", "compilation"]
5
+ body:
6
+ - type: markdown
7
+ attributes:
8
+ value: >
9
+ Thanks for taking the time to fill out this bug report!
10
+ This issue template is intended for bug reports where the compilation of llama.cpp fails.
11
+ Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`.
12
+ If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
13
+ by clearing `~/.cache/ccache` (on Linux).
14
+ - type: textarea
15
+ id: commit
16
+ attributes:
17
+ label: Git commit
18
+ description: Which commit are you trying to compile?
19
+ placeholder: |
20
+ $git rev-parse HEAD
21
+ 84a07a17b1b08cf2b9747c633a2372782848a27f
22
+ validations:
23
+ required: true
24
+ - type: dropdown
25
+ id: operating-system
26
+ attributes:
27
+ label: Operating systems
28
+ description: Which operating systems do you know to be affected?
29
+ multiple: true
30
+ options:
31
+ - Linux
32
+ - Mac
33
+ - Windows
34
+ - BSD
35
+ - Other? (Please let us know in description)
36
+ validations:
37
+ required: true
38
+ - type: dropdown
39
+ id: backends
40
+ attributes:
41
+ label: GGML backends
42
+ description: Which GGML backends do you know to be affected?
43
+ options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
44
+ multiple: true
45
+ validations:
46
+ required: true
47
+ - type: textarea
48
+ id: info
49
+ attributes:
50
+ label: Problem description & steps to reproduce
51
+ description: >
52
+ Please give us a summary of the problem and tell us how to reproduce it.
53
+ If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us.
54
+ placeholder: >
55
+ I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY.
56
+ Here are the exact commands that I used: ...
57
+ validations:
58
+ required: true
59
+ - type: textarea
60
+ id: first_bad_commit
61
+ attributes:
62
+ label: First Bad Commit
63
+ description: >
64
+ If the bug was not present on an earlier version: when did it start appearing?
65
+ If possible, please do a git bisect and identify the exact commit that introduced the bug.
66
+ validations:
67
+ required: false
68
+ - type: textarea
69
+ id: command
70
+ attributes:
71
+ label: Compile command
72
+ description: >
73
+ Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`.
74
+ This will be automatically formatted into code, so no need for backticks.
75
+ render: shell
76
+ validations:
77
+ required: true
78
+ - type: textarea
79
+ id: logs
80
+ attributes:
81
+ label: Relevant log output
82
+ description: >
83
+ Please copy and paste any relevant log output, including any generated text.
84
+ This will be automatically formatted into code, so no need for backticks.
85
+ render: shell
86
+ validations:
87
+ required: true
llama.cpp/.github/ISSUE_TEMPLATE/011-bug-results.yml ADDED
@@ -0,0 +1,101 @@
1
+ name: Bug (model use)
2
+ description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module).
3
+ title: "Eval bug: "
4
+ labels: ["bug-unconfirmed", "model evaluation"]
5
+ body:
6
+ - type: markdown
7
+ attributes:
8
+ value: >
9
+ Thanks for taking the time to fill out this bug report!
10
+ This issue template is intended for bug reports where the model evaluation results
11
+ (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
12
+ If you encountered the issue while using an external UI (e.g. ollama),
13
+ please reproduce your issue using one of the examples/binaries in this repository.
14
+ The `llama-cli` binary can be used for simple and reproducible model inference.
15
+ - type: textarea
16
+ id: version
17
+ attributes:
18
+ label: Name and Version
19
+ description: Which version of our software are you running? (use `--version` to get a version string)
20
+ placeholder: |
21
+ $./llama-cli --version
22
+ version: 2999 (42b4109e)
23
+ built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
24
+ validations:
25
+ required: true
26
+ - type: dropdown
27
+ id: operating-system
28
+ attributes:
29
+ label: Operating systems
30
+ description: Which operating systems do you know to be affected?
31
+ multiple: true
32
+ options:
33
+ - Linux
34
+ - Mac
35
+ - Windows
36
+ - BSD
37
+ - Other? (Please let us know in description)
38
+ validations:
39
+ required: true
40
+ - type: dropdown
41
+ id: backends
42
+ attributes:
43
+ label: GGML backends
44
+ description: Which GGML backends do you know to be affected?
45
+ options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
46
+ multiple: true
47
+ validations:
48
+ required: true
49
+ - type: textarea
50
+ id: hardware
51
+ attributes:
52
+ label: Hardware
53
+ description: Which CPUs/GPUs are you using?
54
+ placeholder: >
55
+ e.g. Ryzen 5950X + 2x RTX 4090
56
+ validations:
57
+ required: true
58
+ - type: textarea
59
+ id: model
60
+ attributes:
61
+ label: Models
62
+ description: >
63
+ Which model(s) at which quantization were you using when encountering the bug?
64
+ If you downloaded a GGUF file off of Huggingface, please provide a link.
65
+ placeholder: >
66
+ e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M
67
+ validations:
68
+ required: false
69
+ - type: textarea
70
+ id: info
71
+ attributes:
72
+ label: Problem description & steps to reproduce
73
+ description: >
74
+ Please give us a summary of the problem and tell us how to reproduce it.
75
+ If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
76
+ that information would be very much appreciated by us.
77
+ placeholder: >
78
+ e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
79
+ When I use -ngl 0 it works correctly.
80
+ Here are the exact commands that I used: ...
81
+ validations:
82
+ required: true
83
+ - type: textarea
84
+ id: first_bad_commit
85
+ attributes:
86
+ label: First Bad Commit
87
+ description: >
88
+ If the bug was not present on an earlier version: when did it start appearing?
89
+ If possible, please do a git bisect and identify the exact commit that introduced the bug.
90
+ validations:
91
+ required: false
92
+ - type: textarea
93
+ id: logs
94
+ attributes:
95
+ label: Relevant log output
96
+ description: >
97
+ Please copy and paste any relevant log output, including the command that you entered and any generated text.
98
+ This will be automatically formatted into code, so no need for backticks.
99
+ render: shell
100
+ validations:
101
+ required: true
llama.cpp/.github/ISSUE_TEMPLATE/019-bug-misc.yml ADDED
@@ -0,0 +1,91 @@
1
+ name: Bug (misc.)
2
+ description: Something is not working the way it should (and it's not covered by any of the above cases).
3
+ title: "Misc. bug: "
4
+ labels: ["bug-unconfirmed"]
5
+ body:
6
+ - type: markdown
7
+ attributes:
8
+ value: >
9
+ Thanks for taking the time to fill out this bug report!
10
+ This issue template is intended for miscellaneous bugs that don't fit into any other category.
11
+ If you encountered the issue while using an external UI (e.g. ollama),
12
+ please reproduce your issue using one of the examples/binaries in this repository.
13
+ - type: textarea
14
+ id: version
15
+ attributes:
16
+ label: Name and Version
17
+ description: Which version of our software is affected? (You can use `--version` to get a version string.)
18
+ placeholder: |
19
+ $./llama-cli --version
20
+ version: 2999 (42b4109e)
21
+ built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
22
+ validations:
23
+ required: true
24
+ - type: dropdown
25
+ id: operating-system
26
+ attributes:
27
+ label: Operating systems
28
+ description: Which operating systems do you know to be affected?
29
+ multiple: true
30
+ options:
31
+ - Linux
32
+ - Mac
33
+ - Windows
34
+ - BSD
35
+ - Other? (Please let us know in description)
36
+ validations:
37
+ required: false
38
+ - type: dropdown
39
+ id: module
40
+ attributes:
41
+ label: Which llama.cpp modules do you know to be affected?
42
+ multiple: true
43
+ options:
44
+ - Documentation/Github
45
+ - libllama (core library)
46
+ - llama-cli
47
+ - llama-server
48
+ - llama-bench
49
+ - llama-quantize
50
+ - Python/Bash scripts
51
+ - Test code
52
+ - Other (Please specify in the next section)
53
+ validations:
54
+ required: false
55
+ - type: textarea
56
+ id: command
57
+ attributes:
58
+ label: Command line
59
+ description: >
60
+ Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc.
61
+ This will be automatically formatted into code, so no need for backticks.
62
+ render: shell
63
+ validations:
64
+ required: false
65
+ - type: textarea
66
+ id: info
67
+ attributes:
68
+ label: Problem description & steps to reproduce
69
+ description: >
70
+ Please give us a summary of the problem and tell us how to reproduce it (if applicable).
71
+ validations:
72
+ required: true
73
+ - type: textarea
74
+ id: first_bad_commit
75
+ attributes:
76
+ label: First Bad Commit
77
+ description: >
78
+ If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing?
79
+ If possible, please do a git bisect and identify the exact commit that introduced the bug.
80
+ validations:
81
+ required: false
82
+ - type: textarea
83
+ id: logs
84
+ attributes:
85
+ label: Relevant log output
86
+ description: >
87
+ If applicable, please copy and paste any relevant log output, including any generated text.
88
+ This will be automatically formatted into code, so no need for backticks.
89
+ render: shell
90
+ validations:
91
+ required: false
llama.cpp/.github/ISSUE_TEMPLATE/020-enhancement.yml ADDED
@@ -0,0 +1,51 @@
1
+ name: Enhancement
2
+ description: Used to request enhancements for llama.cpp.
3
+ title: "Feature Request: "
4
+ labels: ["enhancement"]
5
+ body:
6
+ - type: markdown
7
+ attributes:
8
+ value: |
9
+ [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)
10
+
11
+ - type: checkboxes
12
+ id: prerequisites
13
+ attributes:
14
+ label: Prerequisites
15
+ description: Please confirm the following before submitting your enhancement request.
16
+ options:
17
+ - label: I am running the latest code. Mention the version if possible as well.
18
+ required: true
19
+ - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
20
+ required: true
21
+ - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
22
+ required: true
23
+ - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
24
+ required: true
25
+
26
+ - type: textarea
27
+ id: feature-description
28
+ attributes:
29
+ label: Feature Description
30
+ description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
31
+ placeholder: Detailed description of the enhancement
32
+ validations:
33
+ required: true
34
+
35
+ - type: textarea
36
+ id: motivation
37
+ attributes:
38
+ label: Motivation
39
+ description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
40
+ placeholder: Explanation of why this feature is needed and its benefits
41
+ validations:
42
+ required: true
43
+
44
+ - type: textarea
45
+ id: possible-implementation
46
+ attributes:
47
+ label: Possible Implementation
48
+ description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
49
+ placeholder: Detailed description of potential implementation
50
+ validations:
51
+ required: false
llama.cpp/.github/ISSUE_TEMPLATE/030-research.yml ADDED
@@ -0,0 +1,52 @@
1
+ name: Research
2
+ description: Track new technical research area.
3
+ title: "Research: "
4
+ labels: ["research 🔬"]
5
+ body:
6
+ - type: markdown
7
+ attributes:
8
+ value: |
9
+ Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
10
+
11
+ - type: checkboxes
12
+ id: research-stage
13
+ attributes:
14
+ label: Research Stage
15
+ description: Track general state of this research ticket
16
+ options:
17
+ - label: Background Research (Let's try to avoid reinventing the wheel)
18
+ - label: Hypothesis Formed (How do you think this will work, and what effect will it have?)
19
+ - label: Strategy / Implementation Forming
20
+ - label: Analysis of results
21
+ - label: Debrief / Documentation (So people in the future can learn from us)
22
+
23
+ - type: textarea
24
+ id: background
25
+ attributes:
26
+ label: Previous existing literature and research
27
+ description: What's the current state of the art and what's the motivation for this research?
28
+
29
+ - type: textarea
30
+ id: hypothesis
31
+ attributes:
32
+ label: Hypothesis
33
+ description: How do you think this will work, and what effect will it have?
34
+
35
+ - type: textarea
36
+ id: implementation
37
+ attributes:
38
+ label: Implementation
39
+ description: Got an approach? e.g. a PR ready to go?
40
+
41
+ - type: textarea
42
+ id: analysis
43
+ attributes:
44
+ label: Analysis
45
+ description: How does the proposed implementation behave?
46
+
47
+ - type: textarea
48
+ id: logs
49
+ attributes:
50
+ label: Relevant log output
51
+ description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
52
+ render: shell
llama.cpp/.github/ISSUE_TEMPLATE/040-refactor.yml ADDED
@@ -0,0 +1,28 @@
1
+ name: Refactor (Maintainers)
2
+ description: Used to track refactoring opportunities.
3
+ title: "Refactor: "
4
+ labels: ["refactor"]
5
+ body:
6
+ - type: markdown
7
+ attributes:
8
+ value: |
9
+ Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
10
+ Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
11
+
12
+ - type: textarea
13
+ id: background-description
14
+ attributes:
15
+ label: Background Description
16
+ description: Please provide a detailed written description of the pain points you are trying to solve.
17
+ placeholder: Detailed description behind your motivation to request refactor
18
+ validations:
19
+ required: true
20
+
21
+ - type: textarea
22
+ id: possible-approaches
23
+ attributes:
24
+ label: Possible Refactor Approaches
25
+ description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list.
26
+ placeholder: Your idea of possible refactoring opportunity/approaches
27
+ validations:
28
+ required: false
llama.cpp/.github/ISSUE_TEMPLATE/config.yml ADDED
@@ -0,0 +1,11 @@
+ blank_issues_enabled: true
+ contact_links:
+ - name: Got an idea?
+ url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
+ about: Pop it there. It may then become an enhancement ticket.
+ - name: Got a question?
+ url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
+ about: Ask a question there!
+ - name: Want to contribute?
+ url: https://github.com/ggerganov/llama.cpp/wiki/contribute
+ about: Head to the contribution guide page of the wiki for areas you can help with
llama.cpp/.github/labeler.yml ADDED
@@ -0,0 +1,86 @@
1
+ # https://github.com/actions/labeler
2
+ Kompute:
3
+ - changed-files:
4
+ - any-glob-to-any-file:
5
+ - ggml/include/ggml-kompute.h
6
+ - ggml/src/ggml-kompute/**
7
+ - README-kompute.md
8
+ Apple Metal:
9
+ - changed-files:
10
+ - any-glob-to-any-file:
11
+ - ggml/include/ggml-metal.h
12
+ - ggml/src/ggml-metal/**
13
+ - README-metal.md
14
+ SYCL:
15
+ - changed-files:
16
+ - any-glob-to-any-file:
17
+ - ggml/include/ggml-sycl.h
18
+ - ggml/src/ggml-sycl/**
19
+ - docs/backend/SYCL.md
20
+ - examples/sycl/**
21
+ Nvidia GPU:
22
+ - changed-files:
23
+ - any-glob-to-any-file:
24
+ - ggml/include/ggml-cuda.h
25
+ - ggml/src/ggml-cuda/**
26
+ Vulkan:
27
+ - changed-files:
28
+ - any-glob-to-any-file:
29
+ - ggml/include/ggml-vulkan.h
30
+ - ggml/src/ggml-vulkan/**
31
+ documentation:
32
+ - changed-files:
33
+ - any-glob-to-any-file:
34
+ - docs/**
35
+ - media/**
36
+ testing:
37
+ - changed-files:
38
+ - any-glob-to-any-file:
39
+ - tests/**
40
+ build:
41
+ - changed-files:
42
+ - any-glob-to-any-file:
43
+ - cmake/**
44
+ - CMakeLists.txt
45
+ - CMakePresets.json
46
+ examples:
47
+ - changed-files:
48
+ - any-glob-to-any-file: examples/**
49
+ devops:
50
+ - changed-files:
51
+ - any-glob-to-any-file:
52
+ - .devops/**
53
+ - .github/**
54
+ - ci/**
55
+ python:
56
+ - changed-files:
57
+ - any-glob-to-any-file:
58
+ - "**/*.py"
59
+ - requirements/**
60
+ - gguf-py/**
61
+ - .flake8
62
+ script:
63
+ - changed-files:
64
+ - any-glob-to-any-file:
65
+ - scripts/**
66
+ android:
67
+ - changed-files:
68
+ - any-glob-to-any-file:
69
+ - examples/llama.android/**
70
+ server:
71
+ - changed-files:
72
+ - any-glob-to-any-file:
73
+ - examples/server/**
74
+ ggml:
75
+ - changed-files:
76
+ - any-glob-to-any-file:
77
+ - ggml/**
78
+ nix:
79
+ - changed-files:
80
+ - any-glob-to-any-file:
81
+ - "**/*.nix"
82
+ - .github/workflows/nix-*.yml
83
+ - .devops/nix/nixpkgs-instances.nix
84
+ embedding:
85
+ - changed-files:
86
+ - any-glob-to-any-file: examples/embedding/
llama.cpp/.github/pull_request_template.md ADDED
@@ -0,0 +1 @@
+ *Make sure to read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
llama.cpp/.github/workflows/bench.yml.disabled ADDED
@@ -0,0 +1,315 @@
1
+ # TODO: there have been some issues with the workflow, so disabling for now
2
+ # https://github.com/ggerganov/llama.cpp/issues/7893
3
+ #
4
+ # Benchmark
5
+ name: Benchmark
6
+
7
+ on:
8
+ workflow_dispatch:
9
+ inputs:
10
+ gpu-series:
11
+ description: 'Azure GPU series to run with'
12
+ required: true
13
+ type: choice
14
+ options:
15
+ - Standard_NC4as_T4_v3
16
+ - Standard_NC24ads_A100_v4
17
+ - Standard_NC80adis_H100_v5
18
+ sha:
19
+ description: 'Commit SHA1 to build'
20
+ required: false
21
+ type: string
22
+ duration:
23
+ description: 'Duration of the bench'
24
+ type: string
25
+ default: 10m
26
+
27
+ push:
28
+ branches:
29
+ - master
30
+ paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
31
+ pull_request_target:
32
+ types: [opened, synchronize, reopened]
33
+ paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
34
+ schedule:
35
+ - cron: '04 2 * * *'
36
+
37
+ concurrency:
38
+ group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
39
+ cancel-in-progress: true
40
+
41
+ jobs:
42
+ bench-server-baseline:
43
+ runs-on: Standard_NC4as_T4_v3
44
+ env:
45
+ RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME: could not find a way to avoid duplicating it
46
+ N_USERS: 8
47
+ DURATION: 10m
48
+
49
+ strategy:
50
+ matrix:
51
+ model: [phi-2]
52
+ ftype: [q4_0, q8_0, f16]
53
+ include:
54
+ - model: phi-2
55
+ ftype: q4_0
56
+ pr_comment_enabled: "true"
57
+
58
+ if: |
59
+ inputs.gpu-series == 'Standard_NC4as_T4_v3'
60
+ || (
61
+ github.event_name == 'schedule'
62
+ && github.ref_name == 'master'
63
+ && github.repository_owner == 'ggerganov'
64
+ )
65
+ || github.event_name == 'pull_request_target'
66
+ || (
67
+ github.event_name == 'push'
68
+ && github.event.ref == 'refs/heads/master'
69
+ && github.repository_owner == 'ggerganov'
70
+ )
71
+ steps:
72
+ - name: Clone
73
+ id: checkout
74
+ uses: actions/checkout@v4
75
+ with:
76
+ fetch-depth: 0
77
+ ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
78
+
79
+ - name: Install python env
80
+ id: pipenv
81
+ run: |
82
+ cd examples/server/bench
83
+ python3 -m venv venv
84
+ source venv/bin/activate
85
+ pip install -r requirements.txt
86
+
87
+ - name: Prometheus
88
+ id: install_prometheus
89
+ run: |
90
+ wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
91
+ tar xzf prometheus*.tar.gz --strip-components=1
92
+ ./prometheus --config.file=examples/server/bench/prometheus.yml &
93
+ while ! nc -z localhost 9090; do
94
+ sleep 0.1
95
+ done
96
+
97
+ - name: Set up Go
98
+ uses: actions/setup-go@v5
99
+ with:
100
+ go-version: '1.21'
101
+
102
+ - name: Install k6 and xk6-sse
103
+ id: k6_installation
104
+ run: |
105
+ cd examples/server/bench
106
+ go install go.k6.io/xk6/cmd/xk6@latest
107
+ xk6 build master \
108
+ --with github.com/phymbert/xk6-sse
109
+
110
+ - name: Build
111
+ id: cmake_build
112
+ run: |
113
+ set -eux
114
+ cmake -B build \
115
+ -DGGML_NATIVE=OFF \
116
+ -DLLAMA_BUILD_SERVER=ON \
117
+ -DLLAMA_CURL=ON \
118
+ -DLLAMA_CUBLAS=ON \
119
+ -DCUDAToolkit_ROOT=/usr/local/cuda \
120
+ -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
121
+ -DCMAKE_CUDA_ARCHITECTURES=75 \
122
+ -DLLAMA_FATAL_WARNINGS=OFF \
123
+ -DLLAMA_ALL_WARNINGS=OFF \
124
+ -DCMAKE_BUILD_TYPE=Release;
125
+ cmake --build build --config Release -j $(nproc) --target llama-server
126
+
127
+ - name: Download the dataset
128
+ id: download_dataset
129
+ run: |
130
+ cd examples/server/bench
131
+ wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
132
+
133
+ - name: Server bench
134
+ id: server_bench
135
+ env:
136
+ HEAD_REF: ${{ github.head_ref || github.ref_name }}
137
+ run: |
138
+ set -eux
139
+
140
+ cd examples/server/bench
141
+ source venv/bin/activate
142
+ python bench.py \
143
+ --runner-label ${{ env.RUNNER_LABEL }} \
144
+ --name ${{ github.job }} \
145
+ --branch $HEAD_REF \
146
+ --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
147
+ --scenario script.js \
148
+ --duration ${{ github.event.inputs.duration || env.DURATION }} \
149
+ --hf-repo ggml-org/models \
150
+ --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
151
+ --model-path-prefix /models \
152
+ --parallel ${{ env.N_USERS }} \
153
+ -ngl 33 \
154
+ --batch-size 2048 \
155
+ --ubatch-size 256 \
156
+ --ctx-size 16384 \
157
+ --n-prompts 1000 \
158
+ --max-prompt-tokens 1024 \
159
+ --max-tokens 2048
160
+
161
+ cat results.github.env >> $GITHUB_ENV
162
+
163
+ # Remove dataset as we do not want it in the artefact
164
+ rm ShareGPT_V3_unfiltered_cleaned_split.json
165
+
166
+ - uses: actions/upload-artifact@v4
167
+ with:
168
+ name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
169
+ compression-level: 9
170
+ path: |
171
+ examples/server/bench/*.jpg
172
+ examples/server/bench/*.json
173
+ examples/server/bench/*.log
174
+
175
+ - name: Commit status
176
+ uses: Sibz/github-status-action@v1
177
+ with:
178
+ authToken: ${{secrets.GITHUB_TOKEN}}
179
+ sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
180
+ context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
181
+ description: |
182
+ ${{ env.BENCH_RESULTS }}
183
+ state: 'success'
184
+
185
+ - name: Upload benchmark images
186
+ uses: devicons/public-upload-to-imgur@v2.2.2
187
+ continue-on-error: true # Important as it looks unstable: 503
188
+ id: imgur_step
189
+ with:
190
+ client_id: ${{secrets.IMGUR_CLIENT_ID}}
191
+ path: |
192
+ examples/server/bench/prompt_tokens_seconds.jpg
193
+ examples/server/bench/predicted_tokens_seconds.jpg
194
+ examples/server/bench/kv_cache_usage_ratio.jpg
195
+ examples/server/bench/requests_processing.jpg
196
+
197
+ - name: Extract mermaid
198
+ id: set_mermaid
199
+ run: |
200
+ set -eux
201
+
202
+ cd examples/server/bench
203
+ PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
204
+ echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
205
+ echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
206
+ echo "EOF" >> $GITHUB_ENV
207
+
208
+ PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
209
+ echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
210
+ echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
211
+ echo "EOF" >> $GITHUB_ENV
212
+
213
+ KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
214
+ echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
215
+ echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
216
+ echo "EOF" >> $GITHUB_ENV
217
+
218
+ REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
219
+ echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
220
+ echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
221
+ echo "EOF" >> $GITHUB_ENV
222
+
223
+ - name: Extract image url
224
+ id: extract_image_url
225
+ continue-on-error: true
226
+ run: |
227
+ set -eux
228
+
229
+ echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
230
+ echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
231
+ echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
232
+ echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV
233
+
234
+ - name: Comment PR
235
+ uses: mshick/add-pr-comment@v2
236
+ id: comment_pr
237
+ if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
238
+ with:
239
+ message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
240
+ message: |
241
+ <p align="center">
242
+
243
+ 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
244
+
245
+ </p>
246
+
247
+ <details>
248
+
249
+ <summary>Expand details for performance related PR only</summary>
250
+
251
+ - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
252
+ - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
253
+ - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
254
+ - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
255
+ - ${{ env.BENCH_GRAPH_XLABEL }}
256
+
257
+
258
+ <p align="center">
259
+
260
+ <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
261
+
262
+ <details>
263
+
264
+ <summary>More</summary>
265
+
266
+ ```mermaid
267
+ ${{ env.PROMPT_TOKENS_SECONDS }}
268
+ ```
269
+
270
+ </details>
271
+
272
+ <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>
273
+
274
+ <details>
275
+ <summary>More</summary>
276
+
277
+ ```mermaid
278
+ ${{ env.PREDICTED_TOKENS_SECONDS }}
279
+ ```
280
+
281
+ </details>
282
+
283
+ </p>
284
+
285
+ <details>
286
+
287
+ <summary>Details</summary>
288
+
289
+ <p align="center">
290
+
291
+ <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
292
+
293
+ <details>
294
+ <summary>More</summary>
295
+
296
+ ```mermaid
297
+ ${{ env.KV_CACHE_USAGE_RATIO }}
298
+ ```
299
+
300
+ </details>
301
+
302
+ <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>
303
+
304
+ <details>
305
+ <summary>More</summary>
306
+
307
+ ```mermaid
308
+ ${{ env.REQUESTS_PROCESSING }}
309
+ ```
310
+
311
+ </details>
312
+
313
+ </p>
314
+ </details>
315
+ </details>
llama.cpp/.github/workflows/build.yml ADDED
@@ -0,0 +1,1645 @@
1
+ name: CI
2
+
3
+ on:
4
+ workflow_dispatch: # allows manual triggering
5
+ inputs:
6
+ create_release:
7
+ description: 'Create new release'
8
+ required: true
9
+ type: boolean
10
+ push:
11
+ branches:
12
+ - master
13
+ paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
14
+ pull_request:
15
+ types: [opened, synchronize, reopened]
16
+ paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
17
+
18
+ concurrency:
19
+ group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
20
+ cancel-in-progress: true
21
+
22
+ # Fine-grained permissions
23
+ # https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
24
+ permissions:
25
+ contents: write # for creating release
26
+
27
+ env:
28
+ BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
29
+ GGML_NLOOP: 3
30
+ GGML_N_THREADS: 1
31
+ LLAMA_LOG_COLORS: 1
32
+ LLAMA_LOG_PREFIX: 1
33
+ LLAMA_LOG_TIMESTAMPS: 1
34
+
35
+ jobs:
36
+ macOS-latest-cmake-arm64:
37
+ runs-on: macos-14
38
+
39
+ steps:
40
+ - name: Clone
41
+ id: checkout
42
+ uses: actions/checkout@v4
43
+ with:
44
+ fetch-depth: 0
45
+
46
+ - name: ccache
47
+ uses: hendrikmuhs/ccache-action@v1.2.16
48
+ with:
49
+ key: macOS-latest-cmake-arm64
50
+ evict-old-files: 1d
51
+
52
+ - name: Dependencies
53
+ id: depends
54
+ continue-on-error: true
55
+ run: |
56
+ brew update
57
+
58
+ - name: Build
59
+ id: cmake_build
60
+ run: |
61
+ sysctl -a
62
+ cmake -B build \
63
+ -DCMAKE_BUILD_RPATH="@loader_path" \
64
+ -DLLAMA_FATAL_WARNINGS=ON \
65
+ -DLLAMA_CURL=ON \
66
+ -DGGML_METAL_USE_BF16=ON \
67
+ -DGGML_METAL_EMBED_LIBRARY=ON \
68
+ -DGGML_RPC=ON
69
+ cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
70
+
71
+ - name: Test
72
+ id: cmake_test
73
+ run: |
74
+ cd build
75
+ ctest -L 'main|curl' --verbose --timeout 900
76
+
77
+ - name: Determine tag name
78
+ id: tag
79
+ shell: bash
80
+ run: |
81
+ BUILD_NUMBER="$(git rev-list --count HEAD)"
82
+ SHORT_HASH="$(git rev-parse --short=7 HEAD)"
83
+ if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
84
+ echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
85
+ else
86
+ SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
87
+ echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
88
+ fi
89
+
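The "Determine tag name" step above (repeated verbatim in several jobs later in this file) derives a release/artifact tag from the commit count and the short hash. As a standalone sketch of the same logic — runnable outside Actions, with `BRANCH_NAME` supplied by hand instead of `github.head_ref`/`ref_name`:

```bash
#!/usr/bin/env bash
# Standalone sketch of the workflow's tag-naming scheme.
set -euo pipefail

BRANCH_NAME=${BRANCH_NAME:-master}
BUILD_NUMBER=$(git rev-list --count HEAD)   # commit count acts as a build number
SHORT_HASH=$(git rev-parse --short=7 HEAD)

if [[ "$BRANCH_NAME" == "master" ]]; then
    echo "b${BUILD_NUMBER}"                          # e.g. b4567
else
    SAFE_NAME=${BRANCH_NAME//\//-}                   # '/' is not safe in artifact names
    echo "${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}"
fi
```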
90
+ - name: Pack artifacts
91
+ id: pack_artifacts
92
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
93
+ run: |
94
+ cp LICENSE ./build/bin/
95
+ cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
96
+ zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
97
+
98
+ - name: Upload artifacts
99
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
100
+ uses: actions/upload-artifact@v4
101
+ with:
102
+ path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
103
+ name: llama-bin-macos-arm64.zip
104
+
105
+ macOS-latest-cmake-x64:
106
+ runs-on: macos-13
107
+
108
+ steps:
109
+ - name: Clone
110
+ id: checkout
111
+ uses: actions/checkout@v4
112
+ with:
113
+ fetch-depth: 0
114
+
115
+ - name: ccache
116
+ uses: hendrikmuhs/ccache-action@v1.2.16
117
+ with:
118
+ key: macOS-latest-cmake-x64
119
+ evict-old-files: 1d
120
+
121
+ - name: Dependencies
122
+ id: depends
123
+ continue-on-error: true
124
+ run: |
125
+ brew update
126
+
127
+ - name: Build
128
+ id: cmake_build
129
+ run: |
130
+ sysctl -a
131
+ # Metal is disabled due to intermittent failures with Github runners not having a GPU:
132
+ # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
133
+ cmake -B build \
134
+ -DCMAKE_BUILD_RPATH="@loader_path" \
135
+ -DLLAMA_FATAL_WARNINGS=ON \
136
+ -DLLAMA_CURL=ON \
137
+ -DGGML_METAL=OFF \
138
+ -DGGML_RPC=ON
139
+ cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
140
+
141
+ - name: Test
142
+ id: cmake_test
143
+ run: |
144
+ cd build
145
+ ctest -L main --verbose --timeout 900
146
+
147
+ - name: Determine tag name
148
+ id: tag
149
+ shell: bash
150
+ run: |
151
+ BUILD_NUMBER="$(git rev-list --count HEAD)"
152
+ SHORT_HASH="$(git rev-parse --short=7 HEAD)"
153
+ if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
154
+ echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
155
+ else
156
+ SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
157
+ echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
158
+ fi
159
+
160
+ - name: Pack artifacts
161
+ id: pack_artifacts
162
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
163
+ run: |
164
+ cp LICENSE ./build/bin/
165
+ cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
166
+ zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
167
+
168
+ - name: Upload artifacts
169
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
170
+ uses: actions/upload-artifact@v4
171
+ with:
172
+ path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
173
+ name: llama-bin-macos-x64.zip
174
+
175
+ ubuntu-cpu-cmake:
176
+ runs-on: ubuntu-22.04
177
+
178
+ steps:
179
+ - name: Clone
180
+ id: checkout
181
+ uses: actions/checkout@v4
182
+ with:
183
+ fetch-depth: 0
184
+
185
+ - name: ccache
186
+ uses: hendrikmuhs/ccache-action@v1.2.16
187
+ with:
188
+ key: ubuntu-cpu-cmake
189
+ evict-old-files: 1d
190
+
191
+ - name: Dependencies
192
+ id: depends
193
+ run: |
194
+ sudo apt-get update
195
+ sudo apt-get install build-essential libcurl4-openssl-dev
196
+
197
+ - name: Build
198
+ id: cmake_build
199
+ run: |
200
+ cmake -B build \
201
+ -DLLAMA_FATAL_WARNINGS=ON \
202
+ -DLLAMA_CURL=ON \
203
+ -DGGML_RPC=ON
204
+ cmake --build build --config Release -j $(nproc)
205
+
206
+ - name: Test
207
+ id: cmake_test
208
+ run: |
209
+ cd build
210
+ ctest -L 'main|curl' --verbose --timeout 900
211
+
212
+ - name: Test llama2c conversion
213
+ id: llama2c_test
214
+ run: |
215
+ cd build
216
+ echo "Fetch tokenizer"
217
+ wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
218
+ echo "Fetch llama2c model"
219
+ wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
220
+ ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
221
+ ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
222
+
223
+ - name: Determine tag name
224
+ id: tag
225
+ shell: bash
226
+ run: |
227
+ BUILD_NUMBER="$(git rev-list --count HEAD)"
228
+ SHORT_HASH="$(git rev-parse --short=7 HEAD)"
229
+ if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
230
+ echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
231
+ else
232
+ SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
233
+ echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
234
+ fi
235
+
236
+ - name: Pack artifacts
237
+ id: pack_artifacts
238
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
239
+ run: |
240
+ cp LICENSE ./build/bin/
241
+ cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
242
+ zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
243
+
244
+ - name: Upload artifacts
245
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
246
+ uses: actions/upload-artifact@v4
247
+ with:
248
+ path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
249
+ name: llama-bin-ubuntu-x64.zip
250
+
251
+ ubuntu-latest-cmake-sanitizer:
252
+ runs-on: ubuntu-latest
253
+
254
+ continue-on-error: true
255
+
256
+ strategy:
257
+ matrix:
258
+ sanitizer: [ADDRESS, THREAD, UNDEFINED]
259
+ build_type: [Debug]
260
+
261
+ steps:
262
+ - name: Clone
263
+ id: checkout
264
+ uses: actions/checkout@v4
265
+
266
+ - name: ccache
267
+ uses: hendrikmuhs/ccache-action@v1.2.16
268
+ with:
269
+ key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }}
270
+ evict-old-files: 1d
271
+
272
+ - name: Dependencies
273
+ id: depends
274
+ run: |
275
+ sudo apt-get update
276
+ sudo apt-get install build-essential
277
+
278
+ - name: Build
279
+ id: cmake_build
280
+ if: ${{ matrix.sanitizer != 'THREAD' }}
281
+ run: |
282
+ cmake -B build \
283
+ -DLLAMA_FATAL_WARNINGS=ON \
284
+ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
285
+ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
286
+ cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
287
+
288
+ - name: Build (no OpenMP)
289
+ id: cmake_build_no_openmp
290
+ if: ${{ matrix.sanitizer == 'THREAD' }}
291
+ run: |
292
+ cmake -B build \
293
+ -DLLAMA_FATAL_WARNINGS=ON \
294
+ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
295
+ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
296
+ -DGGML_OPENMP=OFF
297
+ cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
298
+
299
+ - name: Test
300
+ id: cmake_test
301
+ run: |
302
+ cd build
303
+ ctest -L main --verbose --timeout 900
304
+
305
+ ubuntu-latest-llguidance:
306
+ runs-on: ubuntu-latest
307
+
308
+ steps:
309
+ - name: Clone
310
+ id: checkout
311
+ uses: actions/checkout@v4
312
+
313
+ - name: Dependencies
314
+ id: depends
315
+ run: |
316
+ sudo apt-get update
317
+ sudo apt-get install build-essential
318
+
319
+ - name: Build
320
+ id: cmake_build
321
+ run: |
322
+ mkdir build
323
+ cd build
324
+ cmake .. \
325
+ -DLLAMA_FATAL_WARNINGS=ON \
326
+ -DLLAMA_LLGUIDANCE=ON
327
+ cmake --build . --config Release -j $(nproc)
328
+
329
+ - name: Test
330
+ id: cmake_test
331
+ run: |
332
+ cd build
333
+ ctest -L main --verbose --timeout 900
334
+
335
+ ubuntu-latest-cmake-rpc:
336
+ runs-on: ubuntu-latest
337
+
338
+ continue-on-error: true
339
+
340
+ steps:
341
+ - name: Clone
342
+ id: checkout
343
+ uses: actions/checkout@v4
344
+
345
+ - name: ccache
346
+ uses: hendrikmuhs/ccache-action@v1.2.16
347
+ with:
348
+ key: ubuntu-latest-cmake-rpc
349
+ evict-old-files: 1d
350
+
351
+ - name: Dependencies
352
+ id: depends
353
+ run: |
354
+ sudo apt-get update
355
+ sudo apt-get install build-essential
356
+
357
+ - name: Build
358
+ id: cmake_build
359
+ run: |
360
+ cmake -B build \
361
+ -DGGML_RPC=ON
362
+ cmake --build build --config Release -j $(nproc)
363
+
364
+ - name: Test
365
+ id: cmake_test
366
+ run: |
367
+ cd build
368
+ ctest -L main --verbose
369
+
370
+ ubuntu-22-cmake-vulkan:
371
+ runs-on: ubuntu-22.04
372
+
373
+ steps:
374
+ - name: Clone
375
+ id: checkout
376
+ uses: actions/checkout@v4
377
+
378
+ - name: ccache
379
+ uses: hendrikmuhs/ccache-action@v1.2.16
380
+ with:
381
+ key: ubuntu-22-cmake-vulkan
382
+ evict-old-files: 1d
383
+
384
+ - name: Dependencies
385
+ id: depends
386
+ run: |
387
+ wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
388
+ sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
389
+ sudo apt-get update -y
390
+ sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk
391
+
392
+ - name: Build
393
+ id: cmake_build
394
+ run: |
395
+ cmake -B build \
396
+ -DGGML_VULKAN=ON
397
+ cmake --build build --config Release -j $(nproc)
398
+
399
+ - name: Test
400
+ id: cmake_test
401
+ run: |
402
+ cd build
403
+ # This is using llvmpipe and runs slower than other backends
404
+ ctest -L main --verbose --timeout 1800
405
+
406
+ ubuntu-22-cmake-hip:
407
+ runs-on: ubuntu-22.04
408
+ container: rocm/dev-ubuntu-22.04:6.0.2
409
+
410
+ steps:
411
+ - name: Clone
412
+ id: checkout
413
+ uses: actions/checkout@v4
414
+
415
+ - name: Dependencies
416
+ id: depends
417
+ run: |
418
+ sudo apt-get update
419
+ sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev
420
+
421
+ - name: ccache
422
+ uses: hendrikmuhs/ccache-action@v1.2.16
423
+ with:
424
+ key: ubuntu-22-cmake-hip
425
+ evict-old-files: 1d
426
+
427
+ - name: Build with native CMake HIP support
428
+ id: cmake_build
429
+ run: |
430
+ cmake -B build -S . \
431
+ -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
432
+ -DGGML_HIP=ON
433
+ cmake --build build --config Release -j $(nproc)
434
+
435
+ - name: Build with legacy HIP support
436
+ id: cmake_build_legacy_hip
437
+ run: |
438
+ cmake -B build2 -S . \
439
+ -DCMAKE_C_COMPILER=hipcc \
440
+ -DCMAKE_CXX_COMPILER=hipcc \
441
+ -DGGML_HIP=ON
442
+ cmake --build build2 --config Release -j $(nproc)
443
+
444
+ ubuntu-22-cmake-musa:
445
+ runs-on: ubuntu-22.04
446
+ container: mthreads/musa:rc3.1.0-devel-ubuntu22.04
447
+
448
+ steps:
449
+ - name: Clone
450
+ id: checkout
451
+ uses: actions/checkout@v4
452
+
453
+ - name: Dependencies
454
+ id: depends
455
+ run: |
456
+ apt-get update
457
+ apt-get install -y build-essential git cmake libcurl4-openssl-dev
458
+
459
+ - name: ccache
460
+ uses: hendrikmuhs/ccache-action@v1.2.16
461
+ with:
462
+ key: ubuntu-22-cmake-musa
463
+ evict-old-files: 1d
464
+
465
+ - name: Build with native CMake MUSA support
466
+ id: cmake_build
467
+ run: |
468
+ cmake -B build -S . \
469
+ -DGGML_MUSA=ON
470
+ cmake --build build --config Release -j $(nproc)
471
+
472
+ ubuntu-22-cmake-sycl:
473
+ runs-on: ubuntu-22.04
474
+
475
+ continue-on-error: true
476
+
477
+ steps:
478
+ - uses: actions/checkout@v4
479
+
480
+ - name: add oneAPI to apt
481
+ shell: bash
482
+ run: |
483
+ cd /tmp
484
+ wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
485
+ sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
486
+ rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
487
+ sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
488
+
489
+ - name: install oneAPI dpcpp compiler
490
+ shell: bash
491
+ run: |
492
+ sudo apt update
493
+ sudo apt install intel-oneapi-compiler-dpcpp-cpp
494
+
495
+ - name: install oneAPI MKL library
496
+ shell: bash
497
+ run: |
498
+ sudo apt install intel-oneapi-mkl-devel
499
+
500
+ - name: Clone
501
+ id: checkout
502
+ uses: actions/checkout@v4
503
+
504
+ - name: ccache
505
+ uses: hendrikmuhs/ccache-action@v1.2.16
506
+ with:
507
+ key: ubuntu-22-cmake-sycl
508
+ evict-old-files: 1d
509
+
510
+ - name: Build
511
+ id: cmake_build
512
+ run: |
513
+ source /opt/intel/oneapi/setvars.sh
514
+ cmake -B build \
515
+ -DGGML_SYCL=ON \
516
+ -DCMAKE_C_COMPILER=icx \
517
+ -DCMAKE_CXX_COMPILER=icpx
518
+ cmake --build build --config Release -j $(nproc)
519
+
520
+ ubuntu-22-cmake-sycl-fp16:
521
+ runs-on: ubuntu-22.04
522
+
523
+ continue-on-error: true
524
+
525
+ steps:
526
+ - uses: actions/checkout@v4
527
+
528
+ - name: add oneAPI to apt
529
+ shell: bash
530
+ run: |
531
+ cd /tmp
532
+ wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
533
+ sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
534
+ rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
535
+ sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
536
+
537
+ - name: install oneAPI dpcpp compiler
538
+ shell: bash
539
+ run: |
540
+ sudo apt update
541
+ sudo apt install intel-oneapi-compiler-dpcpp-cpp
542
+
543
+ - name: install oneAPI MKL library
544
+ shell: bash
545
+ run: |
546
+ sudo apt install intel-oneapi-mkl-devel
547
+
548
+ - name: Clone
549
+ id: checkout
550
+ uses: actions/checkout@v4
551
+
552
+ - name: ccache
553
+ uses: hendrikmuhs/ccache-action@v1.2.16
554
+ with:
555
+ key: ubuntu-22-cmake-sycl-fp16
556
+ evict-old-files: 1d
557
+
558
+ - name: Build
559
+ id: cmake_build
560
+ run: |
561
+ source /opt/intel/oneapi/setvars.sh
562
+ cmake -B build \
563
+ -DGGML_SYCL=ON \
564
+ -DCMAKE_C_COMPILER=icx \
565
+ -DCMAKE_CXX_COMPILER=icpx \
566
+ -DGGML_SYCL_F16=ON
567
+ cmake --build build --config Release -j $(nproc)
568
+
569
+ macOS-latest-cmake-ios:
570
+ runs-on: macos-latest
571
+
572
+ steps:
573
+ - name: Clone
574
+ id: checkout
575
+ uses: actions/checkout@v4
576
+
577
+ - name: ccache
578
+ uses: hendrikmuhs/ccache-action@v1.2.16
579
+ with:
580
+ key: macOS-latest-cmake-ios
581
+ evict-old-files: 1d
582
+
583
+ - name: Dependencies
584
+ id: depends
585
+ continue-on-error: true
586
+ run: |
587
+ brew update
588
+
589
+ - name: Build
590
+ id: cmake_build
591
+ run: |
592
+ sysctl -a
593
+ cmake -B build -G Xcode \
594
+ -DGGML_METAL_USE_BF16=ON \
595
+ -DGGML_METAL_EMBED_LIBRARY=ON \
596
+ -DLLAMA_BUILD_EXAMPLES=OFF \
597
+ -DLLAMA_BUILD_TESTS=OFF \
598
+ -DLLAMA_BUILD_SERVER=OFF \
599
+ -DCMAKE_SYSTEM_NAME=iOS \
600
+ -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
601
+ -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
602
+ cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
603
+
604
+ macOS-latest-cmake-tvos:
605
+ runs-on: macos-latest
606
+
607
+ steps:
608
+ - name: Clone
609
+ id: checkout
610
+ uses: actions/checkout@v4
611
+
612
+ - name: ccache
613
+ uses: hendrikmuhs/ccache-action@v1.2.16
614
+ with:
615
+ key: macOS-latest-cmake-tvos
616
+ evict-old-files: 1d
617
+
618
+ - name: Dependencies
619
+ id: depends
620
+ continue-on-error: true
621
+ run: |
622
+ brew update
623
+
624
+ - name: Build
625
+ id: cmake_build
626
+ run: |
627
+ sysctl -a
628
+ cmake -B build -G Xcode \
629
+ -DGGML_METAL_USE_BF16=ON \
630
+ -DGGML_METAL_EMBED_LIBRARY=ON \
631
+ -DLLAMA_BUILD_EXAMPLES=OFF \
632
+ -DLLAMA_BUILD_TESTS=OFF \
633
+ -DLLAMA_BUILD_SERVER=OFF \
634
+ -DCMAKE_SYSTEM_NAME=tvOS \
635
+ -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
636
+ -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
637
+ cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
638
+
639
+ macOS-latest-swift:
640
+ runs-on: macos-latest
641
+
642
+ strategy:
643
+ matrix:
644
+ destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
645
+
646
+ steps:
647
+ - name: Clone
648
+ id: checkout
649
+ uses: actions/checkout@v4
650
+
651
+ - name: ccache
652
+ uses: hendrikmuhs/ccache-action@v1.2.16
653
+ with:
654
+ key: macOS-latest-swift
655
+ evict-old-files: 1d
656
+
657
+ - name: Dependencies
658
+ id: depends
659
+ continue-on-error: true
660
+ run: |
661
+ brew update
662
+
663
+ - name: Build llama.cpp with CMake
664
+ id: cmake_build
665
+ run: |
666
+ sysctl -a
667
+ cmake -B build -G Xcode \
668
+ -DGGML_METAL_USE_BF16=ON \
669
+ -DGGML_METAL_EMBED_LIBRARY=ON \
670
+ -DLLAMA_BUILD_EXAMPLES=OFF \
671
+ -DLLAMA_BUILD_TESTS=OFF \
672
+ -DLLAMA_BUILD_SERVER=OFF \
673
+ -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
674
+ cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
675
+ sudo cmake --install build --config Release
676
+
677
+ - name: xcodebuild for swift package
678
+ id: xcodebuild
679
+ run: |
680
+ xcodebuild -scheme llama-Package -destination "${{ matrix.destination }}"
681
+
682
+ windows-msys2:
683
+ runs-on: windows-latest
684
+
685
+ strategy:
686
+ fail-fast: false
687
+ matrix:
688
+ include:
689
+ - { sys: UCRT64, env: ucrt-x86_64, build: Release }
690
+ - { sys: CLANG64, env: clang-x86_64, build: Release }
691
+
692
+ steps:
693
+ - name: Clone
694
+ uses: actions/checkout@v4
695
+
696
+ - name: ccache
697
+ uses: hendrikmuhs/ccache-action@v1.2.16
698
+ with:
699
+ key: windows-msys2
700
+ variant: sccache
701
+ evict-old-files: 1d
702
+
703
+ - name: Setup ${{ matrix.sys }}
704
+ uses: msys2/setup-msys2@v2
705
+ with:
706
+ update: true
707
+ msystem: ${{matrix.sys}}
708
+ install: >-
709
+ base-devel
710
+ git
711
+ mingw-w64-${{matrix.env}}-toolchain
712
+ mingw-w64-${{matrix.env}}-cmake
713
+ mingw-w64-${{matrix.env}}-openblas
714
+
715
+ - name: Build using CMake
716
+ shell: msys2 {0}
717
+ run: |
718
+ cmake -B build
719
+ cmake --build build --config ${{ matrix.build }} -j $(nproc)
720
+
721
+ - name: Clean after building using CMake
722
+ shell: msys2 {0}
723
+ run: |
724
+ rm -rf build
725
+
726
+ - name: Build using CMake w/ OpenBLAS
727
+ shell: msys2 {0}
728
+ run: |
729
+ cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
730
+ cmake --build build --config ${{ matrix.build }} -j $(nproc)
731
+
732
+ windows-latest-cmake:
733
+ runs-on: windows-latest
734
+
735
+ env:
736
+ OPENBLAS_VERSION: 0.3.23
737
+ SDE_VERSION: 9.33.0-2024-01-07
738
+ VULKAN_VERSION: 1.3.261.1
739
+
740
+ strategy:
741
+ matrix:
742
+ include:
743
+ - build: 'noavx-x64'
744
+ defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
745
+ - build: 'avx2-x64'
746
+ defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON'
747
+ - build: 'avx-x64'
748
+ defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF'
749
+ - build: 'avx512-x64'
750
+ defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON'
751
+ - build: 'openblas-x64'
752
+ defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
753
+ - build: 'kompute-x64'
754
+ defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
755
+ - build: 'vulkan-x64'
756
+ defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON'
757
+ - build: 'llvm-arm64'
758
+ defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
759
+ - build: 'msvc-arm64'
760
+ defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
761
+ - build: 'llvm-arm64-opencl-adreno'
762
+ defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
763
+
764
+ steps:
765
+ - name: Clone
766
+ id: checkout
767
+ uses: actions/checkout@v4
768
+ with:
769
+ fetch-depth: 0
770
+
771
+ - name: ccache
772
+ uses: hendrikmuhs/ccache-action@v1.2.16
773
+ with:
774
+ key: windows-latest-cmake-${{ matrix.build }}
775
+ variant: sccache
776
+ evict-old-files: 1d
777
+
778
+ - name: Clone Kompute submodule
779
+ id: clone_kompute
780
+ if: ${{ matrix.build == 'kompute-x64' }}
781
+ run: |
782
+ git submodule update --init ggml/src/ggml-kompute/kompute
783
+
784
+ - name: Download OpenBLAS
785
+ id: get_openblas
786
+ if: ${{ matrix.build == 'openblas-x64' }}
787
+ run: |
788
+ curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
789
+ curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
790
+ mkdir $env:RUNNER_TEMP/openblas
791
+ tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
792
+ $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
793
+ $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
794
+ $lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
795
+ & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
796
+
797
+ - name: Install Vulkan SDK
798
+ id: get_vulkan
799
+ if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
800
+ run: |
801
+ curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
802
+ & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
803
+ Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
804
+ Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
805
+
806
+ - name: Install Ninja
807
+ id: install_ninja
808
+ run: |
809
+ choco install ninja
810
+
811
+ - name: Install OpenCL Headers and Libs
812
+ id: install_opencl
813
+ if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }}
814
+ run: |
815
+ git clone https://github.com/KhronosGroup/OpenCL-Headers
816
+ cd OpenCL-Headers
817
+ cmake -B build `
818
+ -DBUILD_TESTING=OFF `
819
+ -DOPENCL_HEADERS_BUILD_TESTING=OFF `
820
+ -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
821
+ -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
822
+ cmake --build build --target install
823
+ git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
824
+ cd OpenCL-ICD-Loader
825
+ cmake -B build-arm64-release `
826
+ -A arm64 `
827
+ -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
828
+ -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
829
+ cmake --build build-arm64-release --target install --config release
830
+
831
+ - name: Build
832
+ id: cmake_build
833
+ run: |
834
+ cmake -S . -B build ${{ matrix.defines }}
835
+ cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
836
+
837
+ - name: Add libopenblas.dll
838
+ id: add_libopenblas_dll
839
+ if: ${{ matrix.build == 'openblas-x64' }}
840
+ run: |
841
+ cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
842
+ cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
843
+
844
+ - name: Check AVX512F support
845
+ id: check_avx512f
846
+ if: ${{ matrix.build == 'avx512-x64' }}
847
+ continue-on-error: true
848
+ run: |
849
+ cd build
850
+ $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
851
+ $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
852
+ $cl = $(join-path $msvc 'bin\Hostx64\x64\cl.exe')
853
+ echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c
854
+ & $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
855
+ .\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"
856
+
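The AVX512F probe above compiles a tiny CPUID check with MSVC (CPUID leaf 7, EBX bit 16 — the value 65536 — indicates AVX-512 Foundation) and records the result in `HAS_AVX512F` so the test steps below can decide between native execution and Intel SDE emulation. For reference only, a sketch of an equivalent runtime check on a Linux host; this is not part of the workflow:

```bash
#!/usr/bin/env bash
# Illustrative analogue of the MSVC CPUID probe: the Linux kernel exposes the
# same CPUID bit as the "avx512f" flag in /proc/cpuinfo.
if grep -qw avx512f /proc/cpuinfo; then
    echo "HAS_AVX512F=1"
else
    echo "HAS_AVX512F=0"
fi
```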
857
+ - name: Test
858
+ id: cmake_test
859
+ # not all machines have native AVX-512
860
+ if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
861
+ run: |
862
+ cd build
863
+ ctest -L main -C Release --verbose --timeout 900
864
+
865
+ - name: Test (Intel SDE)
866
+ id: cmake_test_sde
867
+ if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
868
+ run: |
869
+ curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
870
+ # for some weird reason windows tar doesn't like sde tar.xz
871
+ 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
872
+ 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
873
+ $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
874
+ cd build
875
+ $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
876
+ & $sde -future -- ctest -L main -C Release --verbose --timeout 900
877
+
878
+ - name: Determine tag name
879
+ id: tag
880
+ shell: bash
881
+ run: |
882
+ BUILD_NUMBER="$(git rev-list --count HEAD)"
883
+ SHORT_HASH="$(git rev-parse --short=7 HEAD)"
884
+ if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
885
+ echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
886
+ else
887
+ SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
888
+ echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
889
+ fi
890
+
891
+ - name: Pack artifacts
892
+ id: pack_artifacts
893
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
894
+ run: |
895
+ Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
896
+ Copy-Item .\examples\run\linenoise.cpp\LICENSE .\build\bin\Release\linenoise.cpp.txt
897
+ 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
898
+
899
+ - name: Upload artifacts
900
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
901
+ uses: actions/upload-artifact@v4
902
+ with:
903
+ path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
904
+ name: llama-bin-win-${{ matrix.build }}.zip
905
+
906
+ ubuntu-latest-cmake-cuda:
907
+ runs-on: ubuntu-latest
908
+ container: nvidia/cuda:12.6.2-devel-ubuntu24.04
909
+
910
+ steps:
911
+ - name: Clone
912
+ id: checkout
913
+ uses: actions/checkout@v4
914
+ with:
915
+ fetch-depth: 0
916
+
917
+ - name: Install dependencies
918
+ env:
919
+ DEBIAN_FRONTEND: noninteractive
920
+ run: |
921
+ apt update
922
+ apt install -y cmake build-essential ninja-build libgomp1 git
923
+
924
+ - name: ccache
925
+ uses: hendrikmuhs/ccache-action@v1.2.16
926
+ with:
927
+ key: ubuntu-latest-cmake-cuda
928
+ evict-old-files: 1d
929
+
930
+ - name: Build with CMake
931
+ run: |
932
+ cmake -S . -B build -G Ninja \
933
+ -DCMAKE_BUILD_TYPE=Release \
934
+ -DCMAKE_CUDA_ARCHITECTURES=89-real \
935
+ -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
936
+ -DLLAMA_FATAL_WARNINGS=ON \
937
+ -DGGML_NATIVE=OFF \
938
+ -DGGML_CUDA=ON
939
+ cmake --build build
940
+
941
+ windows-2019-cmake-cuda:
942
+ runs-on: windows-2019
943
+
944
+ strategy:
945
+ matrix:
946
+ cuda: ['12.4', '11.7']
947
+ build: ['cuda']
948
+
949
+ steps:
950
+ - name: Clone
951
+ id: checkout
952
+ uses: actions/checkout@v4
953
+ with:
954
+ fetch-depth: 0
955
+
956
+ - name: Install ccache
957
+ uses: hendrikmuhs/ccache-action@v1.2.16
958
+ with:
959
+ key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }}
960
+ variant: sccache
961
+ evict-old-files: 1d
962
+
963
+ - name: Install Cuda Toolkit 11.7
964
+ if: ${{ matrix.cuda == '11.7' }}
965
+ run: |
966
+ mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
967
+ choco install unzip -y
968
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
969
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
970
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
971
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
972
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
973
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
974
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
975
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
976
+ unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
977
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
978
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
979
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
980
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
981
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
982
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
983
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
984
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
985
+ echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
986
+ echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
987
+ echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
988
+ echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
989
+
990
+ - name: Install Cuda Toolkit 12.4
991
+ if: ${{ matrix.cuda == '12.4' }}
992
+ run: |
993
+ mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
994
+ choco install unzip -y
995
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
996
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
997
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
998
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
999
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
1000
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
1001
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
1002
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
1003
+ curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
1004
+ unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
1005
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
1006
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
1007
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
1008
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
1009
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
1010
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
1011
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
1012
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
1013
+ xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
1014
+ echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
1015
+ echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
1016
+ echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
1017
+ echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
1018
+
1019
+ - name: Install Ninja
1020
+ id: install_ninja
1021
+ run: |
1022
+ choco install ninja
1023
+
1024
+ - name: Build
1025
+ id: cmake_build
1026
+ shell: cmd
1027
+ run: |
1028
+ call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
1029
+ cmake -S . -B build -G "Ninja Multi-Config" ^
1030
+ -DLLAMA_BUILD_SERVER=ON ^
1031
+ -DGGML_NATIVE=OFF ^
1032
+ -DGGML_CUDA=ON ^
1033
+ -DGGML_RPC=ON
1034
+ set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
1035
+ cmake --build build --config Release -j %NINJA_JOBS% -t ggml
1036
+ cmake --build build --config Release
1037
+
1038
+ - name: Determine tag name
1039
+ id: tag
1040
+ shell: bash
1041
+ run: |
1042
+ BUILD_NUMBER="$(git rev-list --count HEAD)"
1043
+ SHORT_HASH="$(git rev-parse --short=7 HEAD)"
1044
+ if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
1045
+ echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
1046
+ else
1047
+ SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
1048
+ echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
1049
+ fi
1050
+
1051
+ - name: Pack artifacts
1052
+ id: pack_artifacts
1053
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
1054
+ run: |
1055
+ 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
1056
+
1057
+ - name: Upload artifacts
1058
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
1059
+ uses: actions/upload-artifact@v4
1060
+ with:
1061
+ path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
1062
+ name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
1063
+
1064
+ - name: Copy and pack Cuda runtime
1065
+ if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
1066
+ run: |
1067
+ echo "Cuda install location: ${{ env.CUDA_PATH }}"
1068
+ $dst='.\build\bin\cudart\'
1069
+ robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
1070
+ robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
1071
+ 7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
1072
+
1073
+ - name: Upload Cuda runtime
1074
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
1075
+ uses: actions/upload-artifact@v4
1076
+ with:
1077
+ path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
1078
+ name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
1079
+
1080
+ windows-latest-cmake-sycl:
1081
+ runs-on: windows-latest
1082
+
1083
+ defaults:
1084
+ run:
1085
+ shell: bash
1086
+
1087
+ env:
1088
+ WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
1089
+ WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
1090
+ ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
1091
+ steps:
1092
+ - name: Clone
1093
+ id: checkout
1094
+ uses: actions/checkout@v4
1095
+ with:
1096
+ fetch-depth: 0
1097
+
1098
+ - name: ccache
1099
+ uses: hendrikmuhs/ccache-action@v1.2.16
1100
+ with:
1101
+ key: windows-latest-cmake-sycl
1102
+ variant: sccache
1103
+ evict-old-files: 1d
1104
+
1105
+ - name: Install
1106
+ run: |
1107
+ scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
1108
+
1109
+ - name: Build
1110
+ id: cmake_build
1111
+ run: examples/sycl/win-build-sycl.bat
1112
+
1113
+ - name: Determine tag name
1114
+ id: tag
1115
+ shell: bash
1116
+ run: |
1117
+ BUILD_NUMBER="$(git rev-list --count HEAD)"
1118
+ SHORT_HASH="$(git rev-parse --short=7 HEAD)"
1119
+ if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
1120
+ echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
1121
+ else
1122
+ SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
1123
+ echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
1124
+ fi
1125
+
1126
+ - name: Build the release package
1127
+ id: pack_artifacts
1128
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
1129
+ run: |
1130
+ echo "cp oneAPI runtime dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
1131
+
1132
+ cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
1133
+ cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
1134
+ cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
1135
+
1136
+ cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
1137
+ cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
1138
+ cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
1139
+ cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
1140
+
1141
+ cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
1142
+ cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
1143
+ cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
1144
+ cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
1145
+
1146
+ cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
1147
+ cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
1148
+
1149
+ echo "cp oneAPI runtime dll files to ./build/bin done"
1150
+ 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
1151
+
1152
+ - name: Upload the release package
1153
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
1154
+ uses: actions/upload-artifact@v4
1155
+ with:
1156
+ path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
1157
+ name: llama-bin-win-sycl-x64.zip
1158
+
1159
+ windows-latest-cmake-hip:
1160
+ if: ${{ github.event.inputs.create_release != 'true' }}
1161
+ runs-on: windows-latest
1162
+
1163
+ steps:
1164
+ - name: Clone
1165
+ id: checkout
1166
+ uses: actions/checkout@v4
1167
+
1168
+ - name: Install
1169
+ id: depends
1170
+ run: |
1171
+ $ErrorActionPreference = "Stop"
1172
+ write-host "Downloading AMD HIP SDK Installer"
1173
+ Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
1174
+ write-host "Installing AMD HIP SDK"
1175
+ Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
1176
+ write-host "Completed AMD HIP SDK installation"
1177
+
1178
+ - name: Verify ROCm
1179
+ id: verify
1180
+ run: |
1181
+ & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
1182
+
1183
+ - name: Install ccache
1184
+ uses: hendrikmuhs/ccache-action@v1.2.16
1185
+ with:
1186
+ key: ${{ github.job }}
1187
+ evict-old-files: 1d
1188
+
1189
+ - name: Build
1190
+ id: cmake_build
1191
+ run: |
1192
+ $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
1193
+ $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
1194
+ cmake -G "Unix Makefiles" -B build -S . `
1195
+ -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
1196
+ -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
1197
+ -DCMAKE_BUILD_TYPE=Release `
1198
+ -DGGML_HIP=ON `
1199
+ -DGGML_RPC=ON
1200
+ cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
1201
+
1202
+ windows-latest-cmake-hip-release:
1203
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
1204
+ runs-on: windows-latest
1205
+
1206
+ strategy:
1207
+ matrix:
1208
+ gpu_target: [gfx1100, gfx1101, gfx1030]
1209
+
1210
+ steps:
1211
+ - name: Clone
1212
+ id: checkout
1213
+ uses: actions/checkout@v4
1214
+ with:
1215
+ fetch-depth: 0
1216
+
1217
+ - name: ccache
1218
+ uses: hendrikmuhs/ccache-action@v1.2.16
1219
+ with:
1220
+ key: windows-latest-cmake-hip-release
1221
+ evict-old-files: 1d
1222
+
1223
+ - name: Install
1224
+ id: depends
1225
+ run: |
1226
+ $ErrorActionPreference = "Stop"
1227
+ write-host "Downloading AMD HIP SDK Installer"
1228
+ Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
1229
+ write-host "Installing AMD HIP SDK"
1230
+ Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
1231
+ write-host "Completed AMD HIP SDK installation"
1232
+
1233
+ - name: Verify ROCm
1234
+ id: verify
1235
+ run: |
1236
+ & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
1237
+
1238
+ - name: Build
1239
+ id: cmake_build
1240
+ run: |
1241
+ $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
1242
+ $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
1243
+ cmake -G "Unix Makefiles" -B build -S . `
1244
+ -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
1245
+ -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
1246
+ -DCMAKE_BUILD_TYPE=Release `
1247
+ -DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
1248
+ -DGGML_HIP=ON `
1249
+ -DGGML_RPC=ON
1250
+ cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
1251
+ md "build\bin\rocblas\library\"
1252
+ cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
1253
+ cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
1254
+ cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
1255
+
1256
+ - name: Determine tag name
1257
+ id: tag
1258
+ shell: bash
1259
+ run: |
1260
+ BUILD_NUMBER="$(git rev-list --count HEAD)"
1261
+ SHORT_HASH="$(git rev-parse --short=7 HEAD)"
1262
+ if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
1263
+ echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
1264
+ else
1265
+ SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
1266
+ echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
1267
+ fi
1268
+
1269
+ - name: Pack artifacts
1270
+ id: pack_artifacts
1271
+ run: |
1272
+ 7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
1273
+
1274
+ - name: Upload artifacts
1275
+ uses: actions/upload-artifact@v4
1276
+ with:
1277
+ path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
1278
+ name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
1279
+
1280
+ ios-xcode-build:
1281
+ runs-on: macos-latest
1282
+
1283
+ steps:
1284
+ - name: Checkout code
1285
+ uses: actions/checkout@v4
1286
+
1287
+ - name: Build
1288
+ id: cmake_build
1289
+ run: |
1290
+ sysctl -a
1291
+ cmake -B build -G Xcode \
1292
+ -DGGML_METAL_USE_BF16=ON \
1293
+ -DGGML_METAL_EMBED_LIBRARY=ON \
1294
+ -DLLAMA_BUILD_EXAMPLES=OFF \
1295
+ -DLLAMA_BUILD_TESTS=OFF \
1296
+ -DLLAMA_BUILD_SERVER=OFF \
1297
+ -DCMAKE_SYSTEM_NAME=iOS \
1298
+ -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
1299
+ -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
1300
+ cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
1301
+ sudo cmake --install build --config Release
1302
+
1303
+ - name: xcodebuild for swift package
1304
+ id: xcodebuild
1305
+ run: |
1306
+ xcodebuild -scheme llama-Package -destination 'generic/platform=iOS'
1307
+
1308
+ - name: Build Xcode project
1309
+ run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
1310
+
1311
+ android-build:
1312
+ runs-on: ubuntu-latest
1313
+
1314
+ steps:
1315
+ - name: Clone
1316
+ uses: actions/checkout@v4
1317
+
1318
+ - name: ccache
1319
+ uses: hendrikmuhs/ccache-action@v1.2.16
1320
+ with:
1321
+ key: android-build
1322
+ evict-old-files: 1d
1323
+
1324
+ - name: Set up JDK
1325
+ uses: actions/setup-java@v3
1326
+ with:
1327
+ java-version: 17
1328
+ distribution: zulu
1329
+
1330
+ - name: Setup Android SDK
1331
+ uses: android-actions/setup-android@v3
1332
+ with:
1333
+ log-accepted-android-sdk-licenses: false
1334
+
1335
+ - name: Build
1336
+ run: |
1337
+ cd examples/llama.android
1338
+
1339
+ ./gradlew build --no-daemon
1340
+
1341
+ release:
1342
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
1343
+
1344
+ runs-on: ubuntu-latest
1345
+
1346
+ needs:
1347
+ - ubuntu-cpu-cmake
1348
+ - windows-latest-cmake
1349
+ - windows-2019-cmake-cuda
1350
+ - windows-latest-cmake-hip-release
1351
+ - macOS-latest-cmake-arm64
1352
+ - macOS-latest-cmake-x64
1353
+
1354
+ steps:
1355
+ - name: Clone
1356
+ id: checkout
1357
+ uses: actions/checkout@v4
1358
+ with:
1359
+ fetch-depth: 0
1360
+
1361
+ - name: ccache
1362
+ uses: hendrikmuhs/ccache-action@v1.2.16
1363
+ with:
1364
+ key: release
1365
+ evict-old-files: 1d
1366
+
1367
+ - name: Determine tag name
1368
+ id: tag
1369
+ shell: bash
1370
+ run: |
1371
+ BUILD_NUMBER="$(git rev-list --count HEAD)"
1372
+ SHORT_HASH="$(git rev-parse --short=7 HEAD)"
1373
+ if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
1374
+ echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
1375
+ else
1376
+ SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
1377
+ echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
1378
+ fi
1379
+
1380
+ - name: Download artifacts
1381
+ id: download-artifact
1382
+ uses: actions/download-artifact@v4
1383
+ with:
1384
+ path: ./artifact
1385
+
1386
+ - name: Move artifacts
1387
+ id: move_artifacts
1388
+ run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
1389
+
1390
+ - name: Create release
1391
+ id: create_release
1392
+ uses: ggml-org/action-create-release@v1
1393
+ env:
1394
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
1395
+ with:
1396
+ tag_name: ${{ steps.tag.outputs.name }}
1397
+
1398
+ - name: Upload release
1399
+ id: upload_release
1400
+ uses: actions/github-script@v3
1401
+ with:
1402
+ github-token: ${{secrets.GITHUB_TOKEN}}
1403
+ script: |
1404
+ const path = require('path');
1405
+ const fs = require('fs');
1406
+ const release_id = '${{ steps.create_release.outputs.id }}';
1407
+ for (let file of await fs.readdirSync('./artifact/release')) {
1408
+ if (path.extname(file) === '.zip') {
1409
+ console.log('uploadReleaseAsset', file);
1410
+ await github.repos.uploadReleaseAsset({
1411
+ owner: context.repo.owner,
1412
+ repo: context.repo.repo,
1413
+ release_id: release_id,
1414
+ name: file,
1415
+ data: await fs.readFileSync(`./artifact/release/${file}`)
1416
+ });
1417
+ }
1418
+ }
1419
+
1420
+ # ubuntu-latest-gcc:
1421
+ # runs-on: ubuntu-latest
1422
+ #
1423
+ # strategy:
1424
+ # matrix:
1425
+ # build: [Debug, Release]
1426
+ #
1427
+ # steps:
1428
+ # - name: Clone
1429
+ # uses: actions/checkout@v4
1430
+ #
1431
+ # - name: Dependencies
1432
+ # run: |
1433
+ # sudo apt-get update
1434
+ # sudo apt-get install build-essential
1435
+ # sudo apt-get install cmake
1436
+ #
1437
+ # - name: Configure
1438
+ # run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
1439
+ #
1440
+ # - name: Build
1441
+ # run: |
1442
+ # make
1443
+ #
1444
+ # ubuntu-latest-clang:
1445
+ # runs-on: ubuntu-latest
1446
+ #
1447
+ # strategy:
1448
+ # matrix:
1449
+ # build: [Debug, Release]
1450
+ #
1451
+ # steps:
1452
+ # - name: Clone
1453
+ # uses: actions/checkout@v4
1454
+ #
1455
+ # - name: Dependencies
1456
+ # run: |
1457
+ # sudo apt-get update
1458
+ # sudo apt-get install build-essential
1459
+ # sudo apt-get install cmake
1460
+ #
1461
+ # - name: Configure
1462
+ # run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
1463
+ #
1464
+ # - name: Build
1465
+ # run: |
1466
+ # make
1467
+ #
1468
+ # ubuntu-latest-gcc-sanitized:
1469
+ # runs-on: ubuntu-latest
1470
+ #
1471
+ # strategy:
1472
+ # matrix:
1473
+ # sanitizer: [ADDRESS, THREAD, UNDEFINED]
1474
+ #
1475
+ # steps:
1476
+ # - name: Clone
1477
+ # uses: actions/checkout@v4
1478
+ #
1479
+ # - name: Dependencies
1480
+ # run: |
1481
+ # sudo apt-get update
1482
+ # sudo apt-get install build-essential
1483
+ # sudo apt-get install cmake
1484
+ #
1485
+ # - name: Configure
1486
+ # run: cmake . -DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON
1487
+ #
1488
+ # - name: Build
1489
+ # run: |
1490
+ # make
1491
+ #
1492
+ # windows:
1493
+ # runs-on: windows-latest
1494
+ #
1495
+ # strategy:
1496
+ # matrix:
1497
+ # build: [Release]
1498
+ # arch: [Win32, x64]
1499
+ # include:
1500
+ # - arch: Win32
1501
+ # s2arc: x86
1502
+ # - arch: x64
1503
+ # s2arc: x64
1504
+ #
1505
+ # steps:
1506
+ # - name: Clone
1507
+ # uses: actions/checkout@v4
1508
+ #
1509
+ # - name: Add msbuild to PATH
1510
+ # uses: microsoft/setup-msbuild@v1
1511
+ #
1512
+ # - name: Configure
1513
+ # run: >
1514
+ # cmake -S . -B ./build -A ${{ matrix.arch }}
1515
+ # -DCMAKE_BUILD_TYPE=${{ matrix.build }}
1516
+ #
1517
+ # - name: Build
1518
+ # run: |
1519
+ # cd ./build
1520
+ # msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
1521
+ #
1522
+ # - name: Upload binaries
1523
+ # uses: actions/upload-artifact@v4
1524
+ # with:
1525
+ # name: llama-bin-${{ matrix.arch }}
1526
+ # path: build/bin/${{ matrix.build }}
1527
+ #
1528
+ # windows-blas:
1529
+ # runs-on: windows-latest
1530
+ #
1531
+ # strategy:
1532
+ # matrix:
1533
+ # build: [Release]
1534
+ # arch: [Win32, x64]
1535
+ # blas: [ON]
1536
+ # include:
1537
+ # - arch: Win32
1538
+ # obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip
1539
+ # s2arc: x86
1540
+ # - arch: x64
1541
+ # obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip
1542
+ # s2arc: x64
1543
+ #
1544
+ # steps:
1545
+ # - name: Clone
1546
+ # uses: actions/checkout@v4
1547
+ #
1548
+ # - name: Add msbuild to PATH
1549
+ # uses: microsoft/setup-msbuild@v1
1550
+ #
1551
+ # - name: Fetch OpenBLAS
1552
+ # if: matrix.blas == 'ON'
1553
+ # run: |
1554
+ # C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }}
1555
+ # 7z x blas.zip -oblas -y
1556
+ # copy blas/include/cblas.h .
1557
+ # copy blas/include/openblas_config.h .
1558
+ # echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV
1559
+ #
1560
+ # - name: Configure
1561
+ # run: >
1562
+ # cmake -S . -B ./build -A ${{ matrix.arch }}
1563
+ # -DCMAKE_BUILD_TYPE=${{ matrix.build }}
1564
+ # -DLLAMA_SUPPORT_OPENBLAS=${{ matrix.blas }}
1565
+ # -DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
1566
+ #
1567
+ # - name: Build
1568
+ # run: |
1569
+ # cd ./build
1570
+ # msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
1571
+ #
1572
+ # - name: Copy libopenblas.dll
1573
+ # if: matrix.blas == 'ON'
1574
+ # run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }}
1575
+ #
1576
+ # - name: Upload binaries
1577
+ # if: matrix.blas == 'ON'
1578
+ # uses: actions/upload-artifact@v4
1579
+ # with:
1580
+ # name: llama-blas-bin-${{ matrix.arch }}
1581
+ # path: build/bin/${{ matrix.build }}
1582
+ #
1583
+ # emscripten:
1584
+ # runs-on: ubuntu-latest
1585
+ #
1586
+ # strategy:
1587
+ # matrix:
1588
+ # build: [Release]
1589
+ #
1590
+ # steps:
1591
+ # - name: Clone
1592
+ # uses: actions/checkout@v4
1593
+ #
1594
+ # - name: Dependencies
1595
+ # run: |
1596
+ # wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz
1597
+ # tar -xvf master.tar.gz
1598
+ # emsdk-master/emsdk update
1599
+ # emsdk-master/emsdk install latest
1600
+ # emsdk-master/emsdk activate latest
1601
+ #
1602
+ # - name: Configure
1603
+ # run: echo "tmp"
1604
+ #
1605
+ # - name: Build
1606
+ # run: |
1607
+ # pushd emsdk-master
1608
+ # source ./emsdk_env.sh
1609
+ # popd
1610
+ # emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
1611
+ # make
1612
+
1613
+ openEuler-latest-cmake-cann:
1614
+ if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
1615
+ defaults:
1616
+ run:
1617
+ shell: bash -el {0}
1618
+ runs-on: ubuntu-24.04-arm
1619
+ strategy:
1620
+ matrix:
1621
+ cann:
1622
+ - '8.0.rc3.beta1-910b-openeuler22.03-py3.10'
1623
+ device:
1624
+ - 'ascend910b3'
1625
+ build:
1626
+ - 'Release'
1627
+ container: ascendai/cann:${{ matrix.cann }}
1628
+ steps:
1629
+ - name: Checkout
1630
+ uses: actions/checkout@v4
1631
+
1632
+ - name: Dependencies
1633
+ run: |
1634
+ yum update -y
1635
+ yum install -y git gcc gcc-c++ make cmake
1636
+
1637
+ - name: Build
1638
+ run: |
1639
+ export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
1640
+
1641
+ cmake -S . -B build \
1642
+ -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
1643
+ -DGGML_CANN=on \
1644
+ -DSOC_TYPE=${{ matrix.device }}
1645
+ cmake --build build -j $(nproc)
llama.cpp/.github/workflows/close-issue.yml ADDED
@@ -0,0 +1,28 @@
+name: Close inactive issues
+on:
+  schedule:
+    - cron: "42 0 * * *"
+
+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+  issues: write
+
+jobs:
+  close-issues:
+    runs-on: ubuntu-latest
+    permissions:
+      issues: write
+      pull-requests: write
+    steps:
+      - uses: actions/stale@v5
+        with:
+          exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap"
+          days-before-issue-stale: 30
+          days-before-issue-close: 14
+          stale-issue-label: "stale"
+          close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
+          days-before-pr-stale: -1
+          days-before-pr-close: -1
+          operations-per-run: 10000
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
llama.cpp/.github/workflows/docker.yml ADDED
@@ -0,0 +1,173 @@
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+# GitHub recommends pinning actions to a commit SHA.
+# To get a newer version, you will need to update the SHA.
+# You can also reference a tag or branch, but the action may change without warning.
+
+name: Publish Docker image
+
+on:
+  workflow_dispatch: # allows manual triggering
+  schedule:
+    # Rebuild daily rather than on every push because it is expensive
+    - cron: '12 4 * * *'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+  packages: write
+
+jobs:
+  push_to_registry:
+    name: Push Docker image to Docker Hub
+
+    runs-on: ubuntu-22.04
+    env:
+      COMMIT_SHA: ${{ github.sha }}
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          # Multi-stage build
+          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
+          #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true }
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0 # preserve git history, so we can determine the build number
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
+          REPO_NAME="${{ github.event.repository.name }}"
+
+          # determine tag name postfix (build number, commit hash)
+          if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
+            TAG_POSTFIX="-b${BUILD_NUMBER}"
+          else
+            SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
+            TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
+          fi
+          # list all tags possible
+          if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
+            TYPE=""
+          else
+            TYPE="-${{ matrix.config.tag }}"
+          fi
+          PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
+          FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
+          LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
+          SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
+          echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
+          echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
+          echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
+          echo "full_output_tags=$FULLTAGS" # print out for debugging
+          echo "light_output_tags=$LIGHTTAGS" # print out for debugging
+          echo "server_output_tags=$SERVERTAGS" # print out for debugging
+        env:
+          GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
+
+      - name: Free Disk Space (Ubuntu)
+        if: ${{ matrix.config.free_disk_space == true }}
+        uses: ggml-org/free-disk-space@v1.3.1
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: false
+
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: true
+          swap-storage: true
+
+      - name: Build and push Full Docker image (tagged + versioned)
+        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          push: true
+          platforms: ${{ matrix.config.platforms }}
+          # tag list is generated from step above
+          tags: ${{ steps.tag.outputs.full_output_tags }}
+          file: ${{ matrix.config.dockerfile }}
+          target: full
+          provenance: false
+          # using github experimental cache
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          # return to this if the experimental github cache is having issues
+          #cache-to: type=local,dest=/tmp/.buildx-cache
+          #cache-from: type=local,src=/tmp/.buildx-cache
+
+      - name: Build and push Light Docker image (tagged + versioned)
+        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          push: true
+          platforms: ${{ matrix.config.platforms }}
+          # tag list is generated from step above
+          tags: ${{ steps.tag.outputs.light_output_tags }}
+          file: ${{ matrix.config.dockerfile }}
+          target: light
+          provenance: false
+          # using github experimental cache
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          # return to this if the experimental github cache is having issues
+          #cache-to: type=local,dest=/tmp/.buildx-cache
+          #cache-from: type=local,src=/tmp/.buildx-cache
+
+      - name: Build and push Server Docker image (tagged + versioned)
+        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          push: true
+          platforms: ${{ matrix.config.platforms }}
+          # tag list is generated from step above
+          tags: ${{ steps.tag.outputs.server_output_tags }}
+          file: ${{ matrix.config.dockerfile }}
+          target: server
+          provenance: false
+          # using github experimental cache
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          # return to this if the experimental github cache is having issues
+          #cache-to: type=local,dest=/tmp/.buildx-cache
+          #cache-from: type=local,src=/tmp/.buildx-cache
llama.cpp/.github/workflows/editorconfig.yml ADDED
@@ -0,0 +1,29 @@
+name: EditorConfig Checker
+
+on:
+  workflow_dispatch: # allows manual triggering
+    inputs:
+      create_release:
+        description: 'Create new release'
+        required: true
+        type: boolean
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  editorconfig:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: editorconfig-checker/action-editorconfig-checker@v2
+        with:
+          version: v3.0.3
+      - run: editorconfig-checker
llama.cpp/.github/workflows/gguf-publish.yml ADDED
@@ -0,0 +1,44 @@
+# This workflow will upload a Python Package using Twine when a GGUF release is created
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+# See `gguf-py/README.md` for how to make a release.
+
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+name: Upload Python Package
+
+on:
+  workflow_dispatch:
+  push:
+    # Pattern matched against refs/tags
+    tags:
+      - 'gguf-v*' # Push events to every version tag
+
+
+jobs:
+  deploy:
+
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.9.x'
+      - name: Install dependencies
+        run: |
+          cd gguf-py
+          python -m pip install poetry
+          poetry install
+
+      - name: Build package
+        run: cd gguf-py && poetry build
+      - name: Publish package
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          password: ${{ secrets.PYPI_API_TOKEN }}
+          packages-dir: gguf-py/dist
@@ -0,0 +1,17 @@
 
+name: "Pull Request Labeler"
+on:
+  - pull_request_target
+
+jobs:
+  labeler:
+    permissions:
+      contents: read
+      pull-requests: write
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          repository: "ggerganov/llama.cpp"
+      - uses: actions/labeler@v5
+        with:
+          configuration-path: '.github/labeler.yml'
llama.cpp/.github/workflows/python-check-requirements.yml ADDED
@@ -0,0 +1,33 @@
+name: Python check requirements.txt
+
+on:
+  push:
+    paths:
+      - '.github/workflows/python-check-requirements.yml'
+      - 'scripts/check-requirements.sh'
+      - 'convert*.py'
+      - '**/requirements*.txt'
+  pull_request:
+    paths:
+      - '.github/workflows/python-check-requirements.yml'
+      - 'scripts/check-requirements.sh'
+      - 'convert*.py'
+      - '**/requirements*.txt'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  python-check-requirements:
+    runs-on: ubuntu-latest
+    name: check-requirements
+    steps:
+      - name: Check out source repository
+        uses: actions/checkout@v4
+      - name: Set up Python environment
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Run check-requirements.sh script
+        run: bash scripts/check-requirements.sh
llama.cpp/.github/workflows/python-lint.yml ADDED
@@ -0,0 +1,30 @@
+name: flake8 Lint
+
+on:
+  push:
+    branches:
+      - master
+    paths: ['.github/workflows/python-lint.yml', '**/*.py']
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: ['.github/workflows/python-lint.yml', '**/*.py']
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  flake8-lint:
+    runs-on: ubuntu-latest
+    name: Lint
+    steps:
+      - name: Check out source repository
+        uses: actions/checkout@v4
+      - name: Set up Python environment
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: flake8 Lint
+        uses: py-actions/flake8@v2
+        with:
+          plugins: "flake8-no-print"
llama.cpp/.github/workflows/python-type-check.yml ADDED
@@ -0,0 +1,40 @@
+name: Python Type-Check
+
+on:
+  push:
+    paths:
+      - '.github/workflows/python-type-check.yml'
+      - 'pyrightconfig.json'
+      - '**.py'
+      - '**/requirements*.txt'
+  pull_request:
+    paths:
+      - '.github/workflows/python-type-check.yml'
+      - 'pyrightconfig.json'
+      - '**.py'
+      - '**/requirements*.txt'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  python-type-check:
+    runs-on: ubuntu-latest
+    name: pyright type-check
+    steps:
+      - name: Check out source repository
+        uses: actions/checkout@v4
+      - name: Set up Python environment
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install Python dependencies
+        # TODO: use a venv
+        run: pip install -r requirements/requirements-all.txt
+      - name: Type-check with Pyright
+        uses: jakebailey/pyright-action@v2
+        with:
+          version: 1.1.382
+          level: warning
+          warnings: true