svjack committed on
Commit 9fcf2b6
1 Parent(s): 92ab350

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See the raw diff for the rest.
Files changed (50)
  1. .gitattributes +2 -0
  2. stable-diffusion.cpp/.dockerignore +6 -0
  3. stable-diffusion.cpp/.github/workflows/build.yml +201 -0
  4. stable-diffusion.cpp/.gitignore +5 -0
  5. stable-diffusion.cpp/.gitmodules +3 -0
  6. stable-diffusion.cpp/CMakeLists.txt +45 -0
  7. stable-diffusion.cpp/Dockerfile +17 -0
  8. stable-diffusion.cpp/LICENSE +21 -0
  9. stable-diffusion.cpp/README.md +198 -0
  10. stable-diffusion.cpp/assets/a lovely cat.png +0 -0
  11. stable-diffusion.cpp/assets/f16.png +0 -0
  12. stable-diffusion.cpp/assets/f32.png +0 -0
  13. stable-diffusion.cpp/assets/img2img_output.png +0 -0
  14. stable-diffusion.cpp/assets/q4_0.png +0 -0
  15. stable-diffusion.cpp/assets/q4_1.png +0 -0
  16. stable-diffusion.cpp/assets/q5_0.png +0 -0
  17. stable-diffusion.cpp/assets/q5_1.png +0 -0
  18. stable-diffusion.cpp/assets/q8_0.png +0 -0
  19. stable-diffusion.cpp/examples/CMakeLists.txt +8 -0
  20. stable-diffusion.cpp/examples/main.cpp +473 -0
  21. stable-diffusion.cpp/examples/stb_image.h +0 -0
  22. stable-diffusion.cpp/examples/stb_image_write.h +1741 -0
  23. stable-diffusion.cpp/ggml/.editorconfig +19 -0
  24. stable-diffusion.cpp/ggml/.github/workflows/ci.yml +137 -0
  25. stable-diffusion.cpp/ggml/.gitignore +37 -0
  26. stable-diffusion.cpp/ggml/CMakeLists.txt +197 -0
  27. stable-diffusion.cpp/ggml/LICENSE +21 -0
  28. stable-diffusion.cpp/ggml/README.md +140 -0
  29. stable-diffusion.cpp/ggml/build.zig +158 -0
  30. stable-diffusion.cpp/ggml/ci/run.sh +334 -0
  31. stable-diffusion.cpp/ggml/cmake/BuildTypes.cmake +54 -0
  32. stable-diffusion.cpp/ggml/cmake/GitVars.cmake +22 -0
  33. stable-diffusion.cpp/ggml/examples/CMakeLists.txt +30 -0
  34. stable-diffusion.cpp/ggml/examples/common-ggml.cpp +246 -0
  35. stable-diffusion.cpp/ggml/examples/common-ggml.h +18 -0
  36. stable-diffusion.cpp/ggml/examples/common.cpp +817 -0
  37. stable-diffusion.cpp/ggml/examples/common.h +179 -0
  38. stable-diffusion.cpp/ggml/examples/dolly-v2/CMakeLists.txt +13 -0
  39. stable-diffusion.cpp/ggml/examples/dolly-v2/README.md +187 -0
  40. stable-diffusion.cpp/ggml/examples/dolly-v2/convert-h5-to-ggml.py +116 -0
  41. stable-diffusion.cpp/ggml/examples/dolly-v2/main.cpp +969 -0
  42. stable-diffusion.cpp/ggml/examples/dolly-v2/quantize.cpp +178 -0
  43. stable-diffusion.cpp/ggml/examples/dr_wav.h +0 -0
  44. stable-diffusion.cpp/ggml/examples/gpt-2/CMakeLists.txt +36 -0
  45. stable-diffusion.cpp/ggml/examples/gpt-2/README.md +225 -0
  46. stable-diffusion.cpp/ggml/examples/gpt-2/convert-cerebras-to-ggml.py +183 -0
  47. stable-diffusion.cpp/ggml/examples/gpt-2/convert-ckpt-to-ggml.py +159 -0
  48. stable-diffusion.cpp/ggml/examples/gpt-2/convert-h5-to-ggml.py +195 -0
  49. stable-diffusion.cpp/ggml/examples/gpt-2/download-ggml-model.sh +69 -0
  50. stable-diffusion.cpp/ggml/examples/gpt-2/download-model.sh +48 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ stable-diffusion.cpp/ggml/examples/mnist/models/mnist/mnist_model.state_dict filter=lfs diff=lfs merge=lfs -text
+ stable-diffusion.cpp/ggml/examples/mnist/models/mnist/t10k-images.idx3-ubyte filter=lfs diff=lfs merge=lfs -text
stable-diffusion.cpp/.dockerignore ADDED
@@ -0,0 +1,6 @@
+ build*/
+ test/
+
+ .cache/
+ *.swp
+ models/
stable-diffusion.cpp/.github/workflows/build.yml ADDED
@@ -0,0 +1,201 @@
+ name: CI
+
+ on:
+   workflow_dispatch: # allows manual triggering
+     inputs:
+       create_release:
+         description: 'Create new release'
+         required: true
+         type: boolean
+   push:
+     branches:
+       - master
+       - ci
+     paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
+   pull_request:
+     types: [opened, synchronize, reopened]
+     paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
+
+ env:
+   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+
+ jobs:
+   ubuntu-latest-cmake:
+     runs-on: ubuntu-latest
+
+     steps:
+       - name: Clone
+         id: checkout
+         uses: actions/checkout@v3
+         with:
+           submodules: recursive
+
+
+       - name: Dependencies
+         id: depends
+         run: |
+           sudo apt-get update
+           sudo apt-get install build-essential
+
+       - name: Build
+         id: cmake_build
+         run: |
+           mkdir build
+           cd build
+           cmake ..
+           cmake --build . --config Release
+
+       #- name: Test
+       #  id: cmake_test
+       #  run: |
+       #    cd build
+       #    ctest --verbose --timeout 900
+
+   macOS-latest-cmake:
+     runs-on: macos-latest
+
+     steps:
+       - name: Clone
+         id: checkout
+         uses: actions/checkout@v3
+         with:
+           submodules: recursive
+
+       - name: Dependencies
+         id: depends
+         continue-on-error: true
+         run: |
+           brew update
+
+       - name: Build
+         id: cmake_build
+         run: |
+           sysctl -a
+           mkdir build
+           cd build
+           cmake ..
+           cmake --build . --config Release
+
+       #- name: Test
+       #  id: cmake_test
+       #  run: |
+       #    cd build
+       #    ctest --verbose --timeout 900
+
+   windows-latest-cmake:
+     runs-on: windows-latest
+
+     strategy:
+       matrix:
+         include:
+           - build: 'noavx'
+             defines: '-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
+           - build: 'avx2'
+             defines: '-DGGML_AVX2=ON'
+           - build: 'avx'
+             defines: '-DGGML_AVX2=OFF'
+           - build: 'avx512'
+             defines: '-DGGML_AVX512=ON'
+
+     steps:
+       - name: Clone
+         id: checkout
+         uses: actions/checkout@v3
+         with:
+           submodules: recursive
+
+       - name: Build
+         id: cmake_build
+         run: |
+           mkdir build
+           cd build
+           cmake .. ${{ matrix.defines }}
+           cmake --build . --config Release
+
+       - name: Check AVX512F support
+         id: check_avx512f
+         if: ${{ matrix.build == 'avx512' }}
+         continue-on-error: true
+         run: |
+           cd build
+           $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
+           $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
+           $cl = $(join-path $msvc 'bin\Hostx64\x64\cl.exe')
+           echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c
+           & $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
+           .\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"
+
+       #- name: Test
+       #  id: cmake_test
+       #  run: |
+       #    cd build
+       #    ctest -C Release --verbose --timeout 900
+
+       - name: Get commit hash
+         id: commit
+         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+         uses: pr-mpt/actions-commit-hash@v2
+
+       - name: Pack artifacts
+         id: pack_artifacts
+         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+         run: |
+           Copy-Item ggml/LICENSE .\build\bin\Release\ggml.txt
+           Copy-Item LICENSE .\build\bin\Release\stable-diffusion.cpp.txt
+           7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
+
+       - name: Upload artifacts
+         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+         uses: actions/upload-artifact@v3
+         with:
+           path: |
+             sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
+
+   release:
+     if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+
+     runs-on: ubuntu-latest
+
+     needs:
+       - ubuntu-latest-cmake
+       - macOS-latest-cmake
+       - windows-latest-cmake
+
+     steps:
+       - name: Download artifacts
+         id: download-artifact
+         uses: actions/download-artifact@v3
+
+       - name: Get commit hash
+         id: commit
+         uses: pr-mpt/actions-commit-hash@v2
+
+       - name: Create release
+         id: create_release
+         uses: anzz1/action-create-release@v1
+         env:
+           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+         with:
+           tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
+
+       - name: Upload release
+         id: upload_release
+         uses: actions/github-script@v3
+         with:
+           github-token: ${{ secrets.GITHUB_TOKEN }}
+           script: |
+             const path = require('path');
+             const fs = require('fs');
+             const release_id = '${{ steps.create_release.outputs.id }}';
+             for (let file of await fs.readdirSync('./artifact')) {
+               if (path.extname(file) === '.zip') {
+                 console.log('uploadReleaseAsset', file);
+                 await github.repos.uploadReleaseAsset({
+                   owner: context.repo.owner,
+                   repo: context.repo.repo,
+                   release_id: release_id,
+                   name: file,
+                   data: await fs.readFileSync(`./artifact/${file}`)
+                 });
+               }
+             }
stable-diffusion.cpp/.gitignore ADDED
@@ -0,0 +1,5 @@
+ build*/
+ test/
+
+ .cache/
+ *.swp
stable-diffusion.cpp/.gitmodules ADDED
@@ -0,0 +1,3 @@
+ [submodule "ggml"]
+ 	path = ggml
+ 	url = https://github.com/leejet/ggml.git
stable-diffusion.cpp/CMakeLists.txt ADDED
@@ -0,0 +1,45 @@
+ cmake_minimum_required(VERSION 3.12)
+ project("stable-diffusion")
+
+ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
+     set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+     set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
+ endif()
+
+ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+
+ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+     set(SD_STANDALONE ON)
+ else()
+     set(SD_STANDALONE OFF)
+ endif()
+
+ #
+ # Option list
+ #
+
+ # general
+ #option(SD_BUILD_TESTS "sd: build tests" ${SD_STANDALONE})
+ option(SD_BUILD_EXAMPLES "sd: build examples" ${SD_STANDALONE})
+ option(BUILD_SHARED_LIBS "sd: build shared libs" OFF)
+ #option(SD_BUILD_SERVER "sd: build server example" ON)
+
+
+ # deps
+ add_subdirectory(ggml)
+
+ set(SD_LIB stable-diffusion)
+
+ add_library(${SD_LIB} stable-diffusion.h stable-diffusion.cpp)
+ target_link_libraries(${SD_LIB} PUBLIC ggml)
+ target_include_directories(${SD_LIB} PUBLIC .)
+ target_compile_features(${SD_LIB} PUBLIC cxx_std_11)
+
+
+ if (SD_BUILD_EXAMPLES)
+     add_subdirectory(examples)
+ endif()
+
stable-diffusion.cpp/Dockerfile ADDED
@@ -0,0 +1,17 @@
+ ARG UBUNTU_VERSION=22.04
+
+ FROM ubuntu:$UBUNTU_VERSION as build
+
+ RUN apt-get update && apt-get install -y build-essential git cmake
+
+ WORKDIR /sd.cpp
+
+ COPY . .
+
+ RUN mkdir build && cd build && cmake .. && cmake --build . --config Release
+
+ FROM ubuntu:$UBUNTU_VERSION as runtime
+
+ COPY --from=build /sd.cpp/build/bin/sd /sd
+
+ ENTRYPOINT [ "/sd" ]
stable-diffusion.cpp/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 leejet
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
stable-diffusion.cpp/README.md ADDED
@@ -0,0 +1,198 @@
+ <p align="center">
+   <img src="./assets/a%20lovely%20cat.png" width="256x">
+ </p>
+
+ # stable-diffusion.cpp
+
+ Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in pure C/C++
+
+ ## Features
+
+ - Plain C/C++ implementation based on [ggml](https://github.com/ggerganov/ggml), working in the same way as [llama.cpp](https://github.com/ggerganov/llama.cpp)
+ - 16-bit and 32-bit float support
+ - 4-bit, 5-bit and 8-bit integer quantization support
+ - Accelerated memory-efficient CPU inference
+     - Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image
+ - AVX, AVX2 and AVX512 support for x86 architectures
+ - SD1.x and SD2.x support
+ - Original `txt2img` and `img2img` modes
+ - Negative prompt
+ - [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now)
+ - Sampling methods
+     - `Euler A`
+     - `Euler`
+     - `Heun`
+     - `DPM2`
+     - `DPM++ 2M`
+     - [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457)
+     - `DPM++ 2S a`
+ - Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`)
+ - Embeds generation parameters into the PNG output as a webui-compatible text string
+ - Supported platforms
+     - Linux
+     - macOS
+     - Windows
+     - Android (via Termux)
+
+ ### TODO
+
+ - [ ] More sampling methods
+ - [ ] GPU support
+ - [ ] Make inference faster
+     - The current implementation of ggml_conv_2d is slow and has high memory usage
+ - [ ] Continue to reduce memory usage (quantize the weights of ggml_conv_2d)
+ - [ ] LoRA support
+ - [ ] k-quants support
+
+ ## Usage
+
+ ### Get the Code
+
+ ```
+ git clone --recursive https://github.com/leejet/stable-diffusion.cpp
+ cd stable-diffusion.cpp
+ ```
+
+ - If you have already cloned the repository, you can use the following commands to update it to the latest code.
+
+ ```
+ cd stable-diffusion.cpp
+ git pull origin master
+ git submodule init
+ git submodule update
+ ```
+
+ ### Convert weights
+
+ - Download the original weights (.ckpt or .safetensors). For example:
+     - Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
+     - Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
+     - Stable Diffusion v2.1 from https://huggingface.co/stabilityai/stable-diffusion-2-1
+
+ ```shell
+ curl -L -O https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt
+ # curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
+ # curl -L -O https://huggingface.co/stabilityai/stable-diffusion-2-1/blob/main/v2-1_768-nonema-pruned.safetensors
+ ```
+
+ - Convert the weights to the ggml model format
+
+ ```shell
+ cd models
+ pip install -r requirements.txt
+ python convert.py [path to weights] --out_type [output precision]
+ # For example, python convert.py sd-v1-4.ckpt --out_type f16
+ ```
+
+ ### Quantization
+
+ You can specify the output model format using the `--out_type` parameter:
+
+ - `f16` for 16-bit floating-point
+ - `f32` for 32-bit floating-point
+ - `q8_0` for 8-bit integer quantization
+ - `q5_0` or `q5_1` for 5-bit integer quantization
+ - `q4_0` or `q4_1` for 4-bit integer quantization
+
+ ### Build
+
+ #### Build from scratch
+
+ ```shell
+ mkdir build
+ cd build
+ cmake ..
+ cmake --build . --config Release
+ ```
+
+ ##### Using OpenBLAS
+
+ ```
+ cmake .. -DGGML_OPENBLAS=ON
+ cmake --build . --config Release
+ ```
+
+ ### Run
+
+ ```
+ usage: ./bin/sd [arguments]
+
+ arguments:
+   -h, --help                         show this help message and exit
+   -M, --mode [txt2img or img2img]    generation mode (default: txt2img)
+   -t, --threads N                    number of threads to use during computation (default: -1).
+                                      If threads <= 0, then threads will be set to the number of CPU physical cores
+   -m, --model [MODEL]                path to model
+   -i, --init-img [IMAGE]             path to the input image, required by img2img
+   -o, --output OUTPUT                path to write result image to (default: .\output.png)
+   -p, --prompt [PROMPT]              the prompt to render
+   -n, --negative-prompt PROMPT       the negative prompt (default: "")
+   --cfg-scale SCALE                  unconditional guidance scale: (default: 7.0)
+   --strength STRENGTH                strength for noising/unnoising (default: 0.75)
+                                      1.0 corresponds to full destruction of information in init image
+   -H, --height H                     image height, in pixel space (default: 512)
+   -W, --width W                      image width, in pixel space (default: 512)
+   --sampling-method {euler, euler_a, heun, dpm++2m, dpm++2mv2}
+                                      sampling method (default: "euler_a")
+   --steps STEPS                      number of sample steps (default: 20)
+   --rng {std_default, cuda}          RNG (default: cuda)
+   -s SEED, --seed SEED               RNG seed (default: 42, use random seed for < 0)
+   -v, --verbose                      print extra info
+ ```
+
+ #### txt2img example
+
+ ```
+ ./bin/sd -m ../models/sd-v1-4-ggml-model-f16.bin -p "a lovely cat"
+ ```
+
+ Using formats of different precisions will yield results of varying quality.
+
+ | f32  | f16  | q8_0 | q5_0 | q5_1 | q4_0 | q4_1 |
+ | ---- | ---- | ---- | ---- | ---- | ---- | ---- |
+ | ![](./assets/f32.png) | ![](./assets/f16.png) | ![](./assets/q8_0.png) | ![](./assets/q5_0.png) | ![](./assets/q5_1.png) | ![](./assets/q4_0.png) | ![](./assets/q4_1.png) |
+
+ #### img2img example
+
+ - `./output.png` is the image generated from the above txt2img pipeline
+
+ ```
+ ./bin/sd --mode img2img -m ../models/sd-v1-4-ggml-model-f16.bin -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
+ ```
+
+ <p align="center">
+   <img src="./assets/img2img_output.png" width="256x">
+ </p>
+
+ ### Docker
+
+ #### Building using Docker
+
+ ```shell
+ docker build -t sd .
+ ```
+
+ #### Run
+
+ ```shell
+ docker run -v /path/to/models:/models -v /path/to/output/:/output sd [args...]
+ # For example
+ # docker run -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4-ggml-model-f16.bin -p "a lovely cat" -v -o /output/output.png
+ ```
+
+ ## Memory/Disk Requirements
+
+ | precision | f32  | f16  | q8_0 | q5_0 | q5_1 | q4_0 | q4_1 |
+ | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- |
+ | **Disk** | 2.7G | 2.0G | 1.7G | 1.6G | 1.6G | 1.5G | 1.5G |
+ | **Memory** (txt2img - 512 x 512) | ~2.8G | ~2.3G | ~2.1G | ~2.0G | ~2.0G | ~2.0G | ~2.0G |
+
+
+ ## References
+
+ - [ggml](https://github.com/ggerganov/ggml)
+ - [stable-diffusion](https://github.com/CompVis/stable-diffusion)
+ - [stable-diffusion-stability-ai](https://github.com/Stability-AI/stablediffusion)
+ - [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui)
+ - [k-diffusion](https://github.com/crowsonkb/k-diffusion)
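Beyond the `sd` command line shown in the README, the library can be driven directly through the `StableDiffusion` class, as `examples/main.cpp` (further down in this diff) does. The sketch below mirrors that call path; note that the name of the third constructor argument is an assumption (main.cpp passes a bare `true`), and everything else follows the usage visible in main.cpp.

```cpp
// Minimal txt2img sketch against this repo's library API, mirroring examples/main.cpp.
// Assumes stable-diffusion.h declares StableDiffusion, EULER_A, DEFAULT and CUDA_RNG
// as used in main.cpp; the /*free_params_immediately=*/ label is a guess.
#include <cstdint>
#include <cstdio>
#include <vector>
#include "stable-diffusion.h"

int main() {
    StableDiffusion sd(/*n_threads=*/4, /*vae_decode_only=*/true,
                       /*free_params_immediately=*/true, CUDA_RNG);
    if (!sd.load_from_file("models/sd-v1-4-ggml-model-f16.bin", DEFAULT)) {
        return 1;  // model failed to load
    }
    // 512x512, CFG 7.0, Euler A, 20 steps, fixed seed 42 -> raw RGB888 bytes
    std::vector<uint8_t> img = sd.txt2img("a lovely cat", /*negative_prompt=*/"",
                                          7.0f, 512, 512, EULER_A, 20, 42);
    if (img.empty()) {
        std::fprintf(stderr, "generate failed\n");
        return 1;
    }
    // img holds 512*512*3 bytes; main.cpp hands this to stbi_write_png().
    return 0;
}
```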
stable-diffusion.cpp/assets/a lovely cat.png ADDED
stable-diffusion.cpp/assets/f16.png ADDED
stable-diffusion.cpp/assets/f32.png ADDED
stable-diffusion.cpp/assets/img2img_output.png ADDED
stable-diffusion.cpp/assets/q4_0.png ADDED
stable-diffusion.cpp/assets/q4_1.png ADDED
stable-diffusion.cpp/assets/q5_0.png ADDED
stable-diffusion.cpp/assets/q5_1.png ADDED
stable-diffusion.cpp/assets/q8_0.png ADDED
stable-diffusion.cpp/examples/CMakeLists.txt ADDED
@@ -0,0 +1,8 @@
+ # TODO: move into its own subdirectory
+ # TODO: make stb libs a target (maybe common)
+ set(SD_TARGET sd)
+
+ add_executable(${SD_TARGET} main.cpp stb_image.h stb_image_write.h)
+ install(TARGETS ${SD_TARGET} RUNTIME)
+ target_link_libraries(${SD_TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
+ target_compile_features(${SD_TARGET} PUBLIC cxx_std_11)
stable-diffusion.cpp/examples/main.cpp ADDED
@@ -0,0 +1,473 @@
+ #include <stdio.h>
+ #include <ctime>
+ #include <fstream>
+ #include <iostream>
+ #include <random>
+ #include <string>
+ #include <thread>
+ #include <unordered_set>
+
+ #include "stable-diffusion.h"
+
+ #define STB_IMAGE_IMPLEMENTATION
+ #include "stb_image.h"
+
+ #define STB_IMAGE_WRITE_IMPLEMENTATION
+ #define STB_IMAGE_WRITE_STATIC
+ #include "stb_image_write.h"
+
+ #if defined(__APPLE__) && defined(__MACH__)
+ #include <sys/sysctl.h>
+ #include <sys/types.h>
+ #endif
+
+ #if !defined(_WIN32)
+ #include <sys/ioctl.h>
+ #include <unistd.h>
+ #endif
+
+ #define TXT2IMG "txt2img"
+ #define IMG2IMG "img2img"
+
+ // get_num_physical_cores is copied from
+ // https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
+ // LICENSE: https://github.com/ggerganov/llama.cpp/blob/master/LICENSE
+ int32_t get_num_physical_cores() {
+ #ifdef __linux__
+     // enumerate the set of thread siblings, num entries is num cores
+     std::unordered_set<std::string> siblings;
+     for (uint32_t cpu = 0; cpu < UINT32_MAX; ++cpu) {
+         std::ifstream thread_siblings("/sys/devices/system/cpu" + std::to_string(cpu) + "/topology/thread_siblings");
+         if (!thread_siblings.is_open()) {
+             break;  // no more cpus
+         }
+         std::string line;
+         if (std::getline(thread_siblings, line)) {
+             siblings.insert(line);
+         }
+     }
+     if (siblings.size() > 0) {
+         return static_cast<int32_t>(siblings.size());
+     }
+ #elif defined(__APPLE__) && defined(__MACH__)
+     int32_t num_physical_cores;
+     size_t len = sizeof(num_physical_cores);
+     int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
+     if (result == 0) {
+         return num_physical_cores;
+     }
+     result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
+     if (result == 0) {
+         return num_physical_cores;
+     }
+ #elif defined(_WIN32)
+     // TODO: Implement
+ #endif
+     unsigned int n_threads = std::thread::hardware_concurrency();
+     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
+ }
+
+ const char* rng_type_to_str[] = {
+     "std_default",
+     "cuda",
+ };
+
+ // Names of the sampler methods, same order as enum SampleMethod in stable-diffusion.h
+ const char* sample_method_str[] = {
+     "euler_a",
+     "euler",
+     "heun",
+     "dpm2",
+     "dpm++2s_a",
+     "dpm++2m",
+     "dpm++2mv2"};
+
+ // Names of the sigma schedule overrides, same order as Schedule in stable-diffusion.h
+ const char* schedule_str[] = {
+     "default",
+     "discrete",
+     "karras"};
+
+ struct Option {
+     int n_threads = -1;
+     std::string mode = TXT2IMG;
+     std::string model_path;
+     std::string output_path = "output.png";
+     std::string init_img;
+     std::string prompt;
+     std::string negative_prompt;
+     float cfg_scale = 7.0f;
+     int w = 512;
+     int h = 512;
+     SampleMethod sample_method = EULER_A;
+     Schedule schedule = DEFAULT;
+     int sample_steps = 20;
+     float strength = 0.75f;
+     RNGType rng_type = CUDA_RNG;
+     int64_t seed = 42;
+     bool verbose = false;
+
+     void print() {
+         printf("Option: \n");
+         printf("    n_threads: %d\n", n_threads);
+         printf("    mode: %s\n", mode.c_str());
+         printf("    model_path: %s\n", model_path.c_str());
+         printf("    output_path: %s\n", output_path.c_str());
+         printf("    init_img: %s\n", init_img.c_str());
+         printf("    prompt: %s\n", prompt.c_str());
+         printf("    negative_prompt: %s\n", negative_prompt.c_str());
+         printf("    cfg_scale: %.2f\n", cfg_scale);
+         printf("    width: %d\n", w);
+         printf("    height: %d\n", h);
+         printf("    sample_method: %s\n", sample_method_str[sample_method]);
+         printf("    schedule: %s\n", schedule_str[schedule]);
+         printf("    sample_steps: %d\n", sample_steps);
+         printf("    strength: %.2f\n", strength);
+         printf("    rng: %s\n", rng_type_to_str[rng_type]);
+         printf("    seed: %ld\n", seed);
+     }
+ };
+
+ void print_usage(int argc, const char* argv[]) {
+     printf("usage: %s [arguments]\n", argv[0]);
+     printf("\n");
+     printf("arguments:\n");
+     printf("  -h, --help                         show this help message and exit\n");
+     printf("  -M, --mode [txt2img or img2img]    generation mode (default: txt2img)\n");
+     printf("  -t, --threads N                    number of threads to use during computation (default: -1).\n");
+     printf("                                     If threads <= 0, then threads will be set to the number of CPU physical cores\n");
+     printf("  -m, --model [MODEL]                path to model\n");
+     printf("  -i, --init-img [IMAGE]             path to the input image, required by img2img\n");
+     printf("  -o, --output OUTPUT                path to write result image to (default: .\\output.png)\n");
+     printf("  -p, --prompt [PROMPT]              the prompt to render\n");
+     printf("  -n, --negative-prompt PROMPT       the negative prompt (default: \"\")\n");
+     printf("  --cfg-scale SCALE                  unconditional guidance scale: (default: 7.0)\n");
+     printf("  --strength STRENGTH                strength for noising/unnoising (default: 0.75)\n");
+     printf("                                     1.0 corresponds to full destruction of information in init image\n");
+     printf("  -H, --height H                     image height, in pixel space (default: 512)\n");
+     printf("  -W, --width W                      image width, in pixel space (default: 512)\n");
+     printf("  --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2}\n");
+     printf("                                     sampling method (default: \"euler_a\")\n");
+     printf("  --steps STEPS                      number of sample steps (default: 20)\n");
+     printf("  --rng {std_default, cuda}          RNG (default: cuda)\n");
+     printf("  -s SEED, --seed SEED               RNG seed (default: 42, use random seed for < 0)\n");
+     printf("  --schedule {discrete, karras}      Denoiser sigma schedule (default: discrete)\n");
+     printf("  -v, --verbose                      print extra info\n");
+ }
+
+ void parse_args(int argc, const char* argv[], Option* opt) {
+     bool invalid_arg = false;
+
+     for (int i = 1; i < argc; i++) {
+         std::string arg = argv[i];
+
+         if (arg == "-t" || arg == "--threads") {
+             if (++i >= argc) {
+                 invalid_arg = true;
+                 break;
+             }
+             opt->n_threads = std::stoi(argv[i]);
+         } else if (arg == "-M" || arg == "--mode") {
+             if (++i >= argc) {
+                 invalid_arg = true;
+                 break;
+             }
+             opt->mode = argv[i];
+
+         } else if (arg == "-m" || arg == "--model") {
+             if (++i >= argc) {
+                 invalid_arg = true;
+                 break;
+             }
+             opt->model_path = argv[i];
+         } else if (arg == "-i" || arg == "--init-img") {
+             if (++i >= argc) {
+                 invalid_arg = true;
+                 break;
+             }
+             opt->init_img = argv[i];
+         } else if (arg == "-o" || arg == "--output") {
+             if (++i >= argc) {
+                 invalid_arg = true;
+                 break;
+             }
+             opt->output_path = argv[i];
+         } else if (arg == "-p" || arg == "--prompt") {
+             if (++i >= argc) {
+                 invalid_arg = true;
+                 break;
+             }
+             opt->prompt = argv[i];
+         } else if (arg == "-n" || arg == "--negative-prompt") {
+             if (++i >= argc) {
+                 invalid_arg = true;
+                 break;
+             }
+             opt->negative_prompt = argv[i];
+         } else if (arg == "--cfg-scale") {
+             if (++i >= argc) {
+                 invalid_arg = true;
+                 break;
+             }
+             opt->cfg_scale = std::stof(argv[i]);
+         } else if (arg == "--strength") {
+             if (++i >= argc) {
+                 invalid_arg = true;
+                 break;
+             }
+             opt->strength = std::stof(argv[i]);
+         } else if (arg == "-H" || arg == "--height") {
+             if (++i >= argc) {
+                 invalid_arg = true;
+                 break;
+             }
+             opt->h = std::stoi(argv[i]);
+         } else if (arg == "-W" || arg == "--width") {
+             if (++i >= argc) {
+                 invalid_arg = true;
+                 break;
+             }
+             opt->w = std::stoi(argv[i]);
+         } else if (arg == "--steps") {
+             if (++i >= argc) {
+                 invalid_arg = true;
+                 break;
+             }
+             opt->sample_steps = std::stoi(argv[i]);
+         } else if (arg == "--rng") {
+             if (++i >= argc) {
+                 invalid_arg = true;
+                 break;
+             }
+             std::string rng_type_str = argv[i];
+             if (rng_type_str == "std_default") {
+                 opt->rng_type = STD_DEFAULT_RNG;
+             } else if (rng_type_str == "cuda") {
+                 opt->rng_type = CUDA_RNG;
+             } else {
+                 invalid_arg = true;
+                 break;
+             }
+         } else if (arg == "--schedule") {
+             if (++i >= argc) {
+                 invalid_arg = true;
+                 break;
+             }
+             const char* schedule_selected = argv[i];
+             int schedule_found = -1;
+             for (int d = 0; d < N_SCHEDULES; d++) {
+                 if (!strcmp(schedule_selected, schedule_str[d])) {
+                     schedule_found = d;
+                 }
+             }
+             if (schedule_found == -1) {
+                 invalid_arg = true;
+                 break;
+             }
+             opt->schedule = (Schedule)schedule_found;
+         } else if (arg == "-s" || arg == "--seed") {
+             if (++i >= argc) {
+                 invalid_arg = true;
+                 break;
+             }
+             opt->seed = std::stoll(argv[i]);
+         } else if (arg == "--sampling-method") {
+             if (++i >= argc) {
+                 invalid_arg = true;
+                 break;
+             }
+             const char* sample_method_selected = argv[i];
+             int sample_method_found = -1;
+             for (int m = 0; m < N_SAMPLE_METHODS; m++) {
+                 if (!strcmp(sample_method_selected, sample_method_str[m])) {
+                     sample_method_found = m;
+                 }
+             }
+             if (sample_method_found == -1) {
+                 invalid_arg = true;
+                 break;
+             }
+             opt->sample_method = (SampleMethod)sample_method_found;
+         } else if (arg == "-h" || arg == "--help") {
+             print_usage(argc, argv);
+             exit(0);
+         } else if (arg == "-v" || arg == "--verbose") {
+             opt->verbose = true;
+         } else {
+             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+             print_usage(argc, argv);
+             exit(1);
+         }
+         if (invalid_arg) {
+             fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+             print_usage(argc, argv);
+             exit(1);
+         }
+     }
+
+     if (opt->n_threads <= 0) {
+         opt->n_threads = get_num_physical_cores();
+     }
+
+     if (opt->mode != TXT2IMG && opt->mode != IMG2IMG) {
+         fprintf(stderr, "error: invalid mode %s, must be one of ['%s', '%s']\n",
+                 opt->mode.c_str(), TXT2IMG, IMG2IMG);
+         exit(1);
+     }
+
+     if (opt->prompt.length() == 0) {
+         fprintf(stderr, "error: the following arguments are required: prompt\n");
+         print_usage(argc, argv);
+         exit(1);
+     }
+
+     if (opt->model_path.length() == 0) {
+         fprintf(stderr, "error: the following arguments are required: model_path\n");
+         print_usage(argc, argv);
+         exit(1);
+     }
+
+     if (opt->mode == IMG2IMG && opt->init_img.length() == 0) {
+         fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n");
+         print_usage(argc, argv);
+         exit(1);
+     }
+
+     if (opt->output_path.length() == 0) {
+         fprintf(stderr, "error: the following arguments are required: output_path\n");
+         print_usage(argc, argv);
+         exit(1);
+     }
+
+     if (opt->w <= 0 || opt->w % 64 != 0) {
+         fprintf(stderr, "error: the width must be a multiple of 64\n");
+         exit(1);
+     }
+
+     if (opt->h <= 0 || opt->h % 64 != 0) {
+         fprintf(stderr, "error: the height must be a multiple of 64\n");
+         exit(1);
+     }
+
+     if (opt->sample_steps <= 0) {
+         fprintf(stderr, "error: the sample_steps must be greater than 0\n");
+         exit(1);
+     }
+
+     if (opt->strength < 0.f || opt->strength > 1.f) {
+         fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n");
+         exit(1);
+     }
+
+     if (opt->seed < 0) {
+         srand((int)time(NULL));
+         opt->seed = rand();
+     }
+ }
+
+ std::string basename(const std::string& path) {
+     size_t pos = path.find_last_of('/');
+     if (pos != std::string::npos) {
+         return path.substr(pos + 1);
+     }
+     pos = path.find_last_of('\\');
+     if (pos != std::string::npos) {
+         return path.substr(pos + 1);
+     }
+     return path;
+ }
+
+ int main(int argc, const char* argv[]) {
+     Option opt;
+     parse_args(argc, argv, &opt);
+
+     if (opt.verbose) {
+         opt.print();
+         printf("%s", sd_get_system_info().c_str());
+         set_sd_log_level(SDLogLevel::DEBUG);
+     }
+
+     bool vae_decode_only = true;
+     std::vector<uint8_t> init_img;
+     if (opt.mode == IMG2IMG) {
+         vae_decode_only = false;
+
+         int c = 0;
+         unsigned char* img_data = stbi_load(opt.init_img.c_str(), &opt.w, &opt.h, &c, 3);
+         if (img_data == NULL) {
+             fprintf(stderr, "failed to load image from '%s'\n", opt.init_img.c_str());
+             return 1;
+         }
+         if (c != 3) {
+             fprintf(stderr, "the input image must be a 3-channel RGB image, but got %d channels\n", c);
+             free(img_data);
+             return 1;
+         }
+         if (opt.w <= 0 || opt.w % 64 != 0) {
+             fprintf(stderr, "error: the width of the image must be a multiple of 64\n");
+             free(img_data);
+             return 1;
+         }
+         if (opt.h <= 0 || opt.h % 64 != 0) {
+             fprintf(stderr, "error: the height of the image must be a multiple of 64\n");
+             free(img_data);
+             return 1;
+         }
+         init_img.assign(img_data, img_data + (opt.w * opt.h * c));
+     }
+
+     StableDiffusion sd(opt.n_threads, vae_decode_only, true, opt.rng_type);
+     if (!sd.load_from_file(opt.model_path, opt.schedule)) {
+         return 1;
+     }
+
+     std::vector<uint8_t> img;
+     if (opt.mode == TXT2IMG) {
+         img = sd.txt2img(opt.prompt,
+                          opt.negative_prompt,
+                          opt.cfg_scale,
+                          opt.w,
+                          opt.h,
+                          opt.sample_method,
+                          opt.sample_steps,
+                          opt.seed);
+     } else {
+         img = sd.img2img(init_img,
+                          opt.prompt,
+                          opt.negative_prompt,
+                          opt.cfg_scale,
+                          opt.w,
+                          opt.h,
+                          opt.sample_method,
+                          opt.sample_steps,
+                          opt.strength,
+                          opt.seed);
+     }
+
+     if (img.size() == 0) {
+         fprintf(stderr, "generate failed\n");
+         return 1;
+     }
+
+     std::string parameter_string = opt.prompt + "\n";
+     if (opt.negative_prompt.size() != 0) {
+         parameter_string += "Negative prompt: " + opt.negative_prompt + "\n";
+     }
+     parameter_string += "Steps: " + std::to_string(opt.sample_steps) + ", ";
+     parameter_string += "CFG scale: " + std::to_string(opt.cfg_scale) + ", ";
+     parameter_string += "Seed: " + std::to_string(opt.seed) + ", ";
+     parameter_string += "Size: " + std::to_string(opt.w) + "x" + std::to_string(opt.h) + ", ";
+     parameter_string += "Model: " + basename(opt.model_path) + ", ";
+     parameter_string += "RNG: " + std::string(rng_type_to_str[opt.rng_type]) + ", ";
+     parameter_string += "Sampler: " + std::string(sample_method_str[opt.sample_method]);
+     if (opt.schedule == KARRAS) {
+         parameter_string += " karras";
+     }
+     parameter_string += ", ";
+     parameter_string += "Version: stable-diffusion.cpp";
+
+     stbi_write_png(opt.output_path.c_str(), opt.w, opt.h, 3, img.data(), 0, parameter_string.c_str());
+     printf("save result image to '%s'\n", opt.output_path.c_str());
+
+     return 0;
+ }
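Note the seventh argument to `stbi_write_png()` above: `parameter_string` is handed to this repo's locally patched stb_image_write.h (its declaration, further down in this diff, gains a `const char* parameters = NULL` parameter that upstream stb does not have), which is how the webui-compatible generation parameters end up inside the PNG. A small standalone sketch of the same call, assuming only that patched header from `examples/` is on the include path:

```cpp
// Sketch: writing a PNG with an embedded webui-style parameter string via the
// patched stbi_write_png() from this repo. Upstream stb_image_write.h has no
// such trailing argument; this relies on the examples/ copy in this commit.
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"
#include <string>
#include <vector>

int main() {
    const int w = 64, h = 64;
    std::vector<unsigned char> rgb(w * h * 3, 200);  // flat gray test image
    std::string params = "a lovely cat\nSteps: 20, CFG scale: 7.0, Seed: 42, "
                         "Size: 64x64, Version: stable-diffusion.cpp";
    // stride 0 means tightly packed rows (w * comp bytes), as main.cpp uses.
    return stbi_write_png("out.png", w, h, 3, rgb.data(), 0, params.c_str()) ? 0 : 1;
}
```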
stable-diffusion.cpp/examples/stb_image.h ADDED
The diff for this file is too large to render. See raw diff
 
stable-diffusion.cpp/examples/stb_image_write.h ADDED
@@ -0,0 +1,1741 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* stb_image_write - v1.16 - public domain - http://nothings.org/stb
2
+ writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015
3
+ no warranty implied; use at your own risk
4
+
5
+ Before #including,
6
+
7
+ #define STB_IMAGE_WRITE_IMPLEMENTATION
8
+
9
+ in the file that you want to have the implementation.
10
+
11
+ Will probably not work correctly with strict-aliasing optimizations.
12
+
13
+ ABOUT:
14
+
15
+ This header file is a library for writing images to C stdio or a callback.
16
+
17
+ The PNG output is not optimal; it is 20-50% larger than the file
18
+ written by a decent optimizing implementation; though providing a custom
19
+ zlib compress function (see STBIW_ZLIB_COMPRESS) can mitigate that.
20
+ This library is designed for source code compactness and simplicity,
21
+ not optimal image file size or run-time performance.
22
+
23
+ BUILDING:
24
+
25
+ You can #define STBIW_ASSERT(x) before the #include to avoid using assert.h.
26
+ You can #define STBIW_MALLOC(), STBIW_REALLOC(), and STBIW_FREE() to replace
27
+ malloc,realloc,free.
28
+ You can #define STBIW_MEMMOVE() to replace memmove()
29
+ You can #define STBIW_ZLIB_COMPRESS to use a custom zlib-style compress function
30
+ for PNG compression (instead of the builtin one), it must have the following signature:
31
+ unsigned char * my_compress(unsigned char *data, int data_len, int *out_len, int quality);
32
+ The returned data will be freed with STBIW_FREE() (free() by default),
33
+ so it must be heap allocated with STBIW_MALLOC() (malloc() by default),
34
+
35
+ UNICODE:
36
+
37
+ If compiling for Windows and you wish to use Unicode filenames, compile
38
+ with
39
+ #define STBIW_WINDOWS_UTF8
40
+ and pass utf8-encoded filenames. Call stbiw_convert_wchar_to_utf8 to convert
41
+ Windows wchar_t filenames to utf8.
42
+
43
+ USAGE:
44
+
45
+ There are five functions, one for each image file format:
46
+
47
+ int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes);
48
+ int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
49
+ int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
50
+ int stbi_write_jpg(char const *filename, int w, int h, int comp, const void *data, int quality);
51
+ int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
52
+
53
+ void stbi_flip_vertically_on_write(int flag); // flag is non-zero to flip data vertically
54
+
55
+ There are also five equivalent functions that use an arbitrary write function. You are
56
+ expected to open/close your file-equivalent before and after calling these:
57
+
58
+ int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data, int stride_in_bytes);
59
+ int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data);
60
+ int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data);
61
+ int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
62
+ int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality);
63
+
64
+ where the callback is:
65
+ void stbi_write_func(void *context, void *data, int size);
66
+
67
+ You can configure it with these global variables:
68
+ int stbi_write_tga_with_rle; // defaults to true; set to 0 to disable RLE
69
+ int stbi_write_png_compression_level; // defaults to 8; set to higher for more compression
70
+ int stbi_write_force_png_filter; // defaults to -1; set to 0..5 to force a filter mode
71
+
72
+
73
+ You can define STBI_WRITE_NO_STDIO to disable the file variant of these
74
+ functions, so the library will not use stdio.h at all. However, this will
75
+ also disable HDR writing, because it requires stdio for formatted output.
76
+
77
+ Each function returns 0 on failure and non-0 on success.
78
+
79
+ The functions create an image file defined by the parameters. The image
80
+ is a rectangle of pixels stored from left-to-right, top-to-bottom.
81
+ Each pixel contains 'comp' channels of data stored interleaved with 8-bits
82
+ per channel, in the following order: 1=Y, 2=YA, 3=RGB, 4=RGBA. (Y is
83
+ monochrome color.) The rectangle is 'w' pixels wide and 'h' pixels tall.
84
+ The *data pointer points to the first byte of the top-left-most pixel.
85
+ For PNG, "stride_in_bytes" is the distance in bytes from the first byte of
86
+ a row of pixels to the first byte of the next row of pixels.
87
+
88
+ PNG creates output files with the same number of components as the input.
89
+ The BMP format expands Y to RGB in the file format and does not
90
+ output alpha.
91
+
92
+ PNG supports writing rectangles of data even when the bytes storing rows of
93
+ data are not consecutive in memory (e.g. sub-rectangles of a larger image),
94
+ by supplying the stride between the beginning of adjacent rows. The other
95
+ formats do not. (Thus you cannot write a native-format BMP through the BMP
96
+ writer, both because it is in BGR order and because it may have padding
97
+ at the end of the line.)
98
+
99
+ PNG allows you to set the deflate compression level by setting the global
100
+ variable 'stbi_write_png_compression_level' (it defaults to 8).
101
+
102
+ HDR expects linear float data. Since the format is always 32-bit rgb(e)
103
+ data, alpha (if provided) is discarded, and for monochrome data it is
104
+ replicated across all three channels.
105
+
106
+ TGA supports RLE or non-RLE compressed data. To use non-RLE-compressed
107
+ data, set the global variable 'stbi_write_tga_with_rle' to 0.
108
+
109
+ JPEG does ignore alpha channels in input data; quality is between 1 and 100.
110
+ Higher quality looks better but results in a bigger image.
111
+ JPEG baseline (no JPEG progressive).
112
+
113
+ CREDITS:
114
+
115
+
116
+ Sean Barrett - PNG/BMP/TGA
117
+ Baldur Karlsson - HDR
118
+ Jean-Sebastien Guay - TGA monochrome
119
+ Tim Kelsey - misc enhancements
120
+ Alan Hickman - TGA RLE
121
+ Emmanuel Julien - initial file IO callback implementation
122
+ Jon Olick - original jo_jpeg.cpp code
123
+ Daniel Gibson - integrate JPEG, allow external zlib
124
+ Aarni Koskela - allow choosing PNG filter
125
+
126
+ bugfixes:
127
+ github:Chribba
128
+ Guillaume Chereau
129
+ github:jry2
130
+ github:romigrou
131
+ Sergio Gonzalez
132
+ Jonas Karlsson
133
+ Filip Wasil
134
+ Thatcher Ulrich
135
+ github:poppolopoppo
136
+ Patrick Boettcher
137
+ github:xeekworx
138
+ Cap Petschulat
139
+ Simon Rodriguez
140
+ Ivan Tikhonov
141
+ github:ignotion
142
+ Adam Schackart
143
+ Andrew Kensler
144
+
145
+ LICENSE
146
+
147
+ See end of file for license information.
148
+
149
+ */
150
+
151
+ #ifndef INCLUDE_STB_IMAGE_WRITE_H
152
+ #define INCLUDE_STB_IMAGE_WRITE_H
153
+
154
+ #include <stdlib.h>
155
+
156
+ // if STB_IMAGE_WRITE_STATIC causes problems, try defining STBIWDEF to 'inline' or 'static inline'
157
+ #ifndef STBIWDEF
158
+ #ifdef STB_IMAGE_WRITE_STATIC
159
+ #define STBIWDEF static
160
+ #else
161
+ #ifdef __cplusplus
162
+ #define STBIWDEF extern "C"
163
+ #else
164
+ #define STBIWDEF extern
165
+ #endif
166
+ #endif
167
+ #endif
168
+
169
+ #ifndef STB_IMAGE_WRITE_STATIC // C++ forbids static forward declarations
170
+ STBIWDEF int stbi_write_tga_with_rle;
171
+ STBIWDEF int stbi_write_png_compression_level;
172
+ STBIWDEF int stbi_write_force_png_filter;
173
+ #endif
174
+
175
+ #ifndef STBI_WRITE_NO_STDIO
176
+ STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes, const char* parameters = NULL);
177
+ STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
178
+ STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
179
+ STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
180
+ STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality);
181
+
182
+ #ifdef STBIW_WINDOWS_UTF8
183
+ STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
184
+ #endif
185
+ #endif
186
+
187
+ typedef void stbi_write_func(void *context, void *data, int size);
188
+
189
+ STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data, int stride_in_bytes);
190
+ STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data);
191
+ STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data);
192
+ STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
193
+ STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality);
194
+
195
+ STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
196
+
197
+ #endif//INCLUDE_STB_IMAGE_WRITE_H
198
+
199
+ #ifdef STB_IMAGE_WRITE_IMPLEMENTATION
200
+
201
+ #ifdef _WIN32
202
+ #ifndef _CRT_SECURE_NO_WARNINGS
203
+ #define _CRT_SECURE_NO_WARNINGS
204
+ #endif
205
+ #ifndef _CRT_NONSTDC_NO_DEPRECATE
206
+ #define _CRT_NONSTDC_NO_DEPRECATE
207
+ #endif
208
+ #endif
209
+
210
+ #ifndef STBI_WRITE_NO_STDIO
211
+ #include <stdio.h>
212
+ #endif // STBI_WRITE_NO_STDIO
213
+
214
+ #include <stdarg.h>
215
+ #include <stdlib.h>
216
+ #include <string.h>
217
+ #include <math.h>
218
+
219
+ #if defined(STBIW_MALLOC) && defined(STBIW_FREE) && (defined(STBIW_REALLOC) || defined(STBIW_REALLOC_SIZED))
220
+ // ok
221
+ #elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) && !defined(STBIW_REALLOC) && !defined(STBIW_REALLOC_SIZED)
222
+ // ok
223
+ #else
224
+ #error "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC (or STBIW_REALLOC_SIZED)."
225
+ #endif
226
+
227
+ #ifndef STBIW_MALLOC
228
+ #define STBIW_MALLOC(sz) malloc(sz)
229
+ #define STBIW_REALLOC(p,newsz) realloc(p,newsz)
230
+ #define STBIW_FREE(p) free(p)
231
+ #endif
232
+
233
+ #ifndef STBIW_REALLOC_SIZED
234
+ #define STBIW_REALLOC_SIZED(p,oldsz,newsz) STBIW_REALLOC(p,newsz)
235
+ #endif
236
+
237
+
238
+ #ifndef STBIW_MEMMOVE
239
+ #define STBIW_MEMMOVE(a,b,sz) memmove(a,b,sz)
240
+ #endif
241
+
242
+
243
+ #ifndef STBIW_ASSERT
244
+ #include <assert.h>
245
+ #define STBIW_ASSERT(x) assert(x)
246
+ #endif
247
+
248
+ #define STBIW_UCHAR(x) (unsigned char) ((x) & 0xff)
249
+
250
+ #ifdef STB_IMAGE_WRITE_STATIC
251
+ static int stbi_write_png_compression_level = 8;
252
+ static int stbi_write_tga_with_rle = 1;
253
+ static int stbi_write_force_png_filter = -1;
254
+ #else
255
+ int stbi_write_png_compression_level = 8;
256
+ int stbi_write_tga_with_rle = 1;
257
+ int stbi_write_force_png_filter = -1;
258
+ #endif
259
+
260
+ static int stbi__flip_vertically_on_write = 0;
261
+
262
+ STBIWDEF void stbi_flip_vertically_on_write(int flag)
263
+ {
264
+ stbi__flip_vertically_on_write = flag;
265
+ }
266
+
267
+ typedef struct
268
+ {
269
+ stbi_write_func *func;
270
+ void *context;
271
+ unsigned char buffer[64];
272
+ int buf_used;
273
+ } stbi__write_context;
274
+
275
+ // initialize a callback-based context
276
+ static void stbi__start_write_callbacks(stbi__write_context *s, stbi_write_func *c, void *context)
277
+ {
278
+ s->func = c;
279
+ s->context = context;
280
+ }
281
+
282
+ #ifndef STBI_WRITE_NO_STDIO
283
+
284
+ static void stbi__stdio_write(void *context, void *data, int size)
285
+ {
286
+ fwrite(data,1,size,(FILE*) context);
287
+ }
288
+
289
+ #if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
290
+ #ifdef __cplusplus
291
+ #define STBIW_EXTERN extern "C"
292
+ #else
293
+ #define STBIW_EXTERN extern
294
+ #endif
295
+ STBIW_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide);
296
+ STBIW_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default);
297
+
298
+ STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input)
299
+ {
300
+ return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
301
+ }
302
+ #endif
303
+
304
+ static FILE *stbiw__fopen(char const *filename, char const *mode)
305
+ {
306
+ FILE *f;
307
+ #if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
308
+ wchar_t wMode[64];
309
+ wchar_t wFilename[1024];
310
+ if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
311
+ return 0;
312
+
313
+ if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
314
+ return 0;
315
+
316
+ #if defined(_MSC_VER) && _MSC_VER >= 1400
317
+ if (0 != _wfopen_s(&f, wFilename, wMode))
318
+ f = 0;
319
+ #else
320
+ f = _wfopen(wFilename, wMode);
321
+ #endif
322
+
323
+ #elif defined(_MSC_VER) && _MSC_VER >= 1400
324
+ if (0 != fopen_s(&f, filename, mode))
325
+ f=0;
326
+ #else
327
+ f = fopen(filename, mode);
328
+ #endif
329
+ return f;
330
+ }
331
+
332
+ static int stbi__start_write_file(stbi__write_context *s, const char *filename)
333
+ {
334
+ FILE *f = stbiw__fopen(filename, "wb");
335
+ stbi__start_write_callbacks(s, stbi__stdio_write, (void *) f);
336
+ return f != NULL;
337
+ }
338
+
339
+ static void stbi__end_write_file(stbi__write_context *s)
340
+ {
341
+ fclose((FILE *)s->context);
342
+ }
343
+
344
+ #endif // !STBI_WRITE_NO_STDIO
345
+
346
+ typedef unsigned int stbiw_uint32;
347
+ typedef int stb_image_write_test[sizeof(stbiw_uint32)==4 ? 1 : -1];
348
+
349
+ static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v)
350
+ {
351
+ while (*fmt) {
352
+ switch (*fmt++) {
353
+ case ' ': break;
354
+ case '1': { unsigned char x = STBIW_UCHAR(va_arg(v, int));
355
+ s->func(s->context,&x,1);
356
+ break; }
357
+ case '2': { int x = va_arg(v,int);
358
+ unsigned char b[2];
359
+ b[0] = STBIW_UCHAR(x);
360
+ b[1] = STBIW_UCHAR(x>>8);
361
+ s->func(s->context,b,2);
362
+ break; }
363
+ case '4': { stbiw_uint32 x = va_arg(v,int);
364
+ unsigned char b[4];
365
+ b[0]=STBIW_UCHAR(x);
366
+ b[1]=STBIW_UCHAR(x>>8);
367
+ b[2]=STBIW_UCHAR(x>>16);
368
+ b[3]=STBIW_UCHAR(x>>24);
369
+ s->func(s->context,b,4);
370
+ break; }
371
+ default:
372
+ STBIW_ASSERT(0);
373
+ return;
374
+ }
375
+ }
376
+ }
377
+
378
+ static void stbiw__writef(stbi__write_context *s, const char *fmt, ...)
379
+ {
380
+ va_list v;
381
+ va_start(v, fmt);
382
+ stbiw__writefv(s, fmt, v);
383
+ va_end(v);
384
+ }
385
+
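+ // Note on the format strings handled above: '1' emits one byte, '2' a
+ // 2-byte little-endian value, '4' a 4-byte little-endian value; spaces are
+ // ignored. Illustrative example (hypothetical call, not from this file):
+ // stbiw__writef(s, "11 4", 'B', 'M', filesize) emits the two BMP magic
+ // bytes followed by a 32-bit little-endian file size.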
386
+ static void stbiw__write_flush(stbi__write_context *s)
387
+ {
388
+ if (s->buf_used) {
389
+ s->func(s->context, &s->buffer, s->buf_used);
390
+ s->buf_used = 0;
391
+ }
392
+ }
393
+
394
+ static void stbiw__putc(stbi__write_context *s, unsigned char c)
395
+ {
396
+ s->func(s->context, &c, 1);
397
+ }
398
+
399
+ static void stbiw__write1(stbi__write_context *s, unsigned char a)
400
+ {
401
+ if ((size_t)s->buf_used + 1 > sizeof(s->buffer))
402
+ stbiw__write_flush(s);
403
+ s->buffer[s->buf_used++] = a;
404
+ }
405
+
406
+ static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c)
407
+ {
408
+ int n;
409
+ if ((size_t)s->buf_used + 3 > sizeof(s->buffer))
410
+ stbiw__write_flush(s);
411
+ n = s->buf_used;
412
+ s->buf_used = n+3;
413
+ s->buffer[n+0] = a;
414
+ s->buffer[n+1] = b;
415
+ s->buffer[n+2] = c;
416
+ }
417
+
418
+ static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, int write_alpha, int expand_mono, unsigned char *d)
419
+ {
420
+ unsigned char bg[3] = { 255, 0, 255}, px[3];
421
+ int k;
422
+
423
+ if (write_alpha < 0)
424
+ stbiw__write1(s, d[comp - 1]);
425
+
426
+ switch (comp) {
427
+ case 2: // 2 channels = mono + alpha; alpha is written separately, so same as 1-channel case
428
+ case 1:
429
+ if (expand_mono)
430
+ stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp
431
+ else
432
+ stbiw__write1(s, d[0]); // monochrome TGA
433
+ break;
434
+ case 4:
435
+ if (!write_alpha) {
436
+ // composite against pink background
437
+ for (k = 0; k < 3; ++k)
438
+ px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255;
439
+ stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]);
440
+ break;
441
+ }
442
+ /* FALLTHROUGH */
443
+ case 3:
444
+ stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]);
445
+ break;
446
+ }
447
+ if (write_alpha > 0)
448
+ stbiw__write1(s, d[comp - 1]);
449
+ }
450
+
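+ // Channel-order note for stbiw__write_pixel above: rgb_dir == 1 writes RGB,
+ // rgb_dir == -1 writes BGR (the BMP and TGA writers below pass -1);
+ // write_alpha < 0 puts the alpha byte before the color bytes, > 0 after it,
+ // and 0 omits it.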
451
+ static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono)
452
+ {
453
+ stbiw_uint32 zero = 0;
454
+ int i,j, j_end;
455
+
456
+ if (y <= 0)
457
+ return;
458
+
459
+ if (stbi__flip_vertically_on_write)
460
+ vdir *= -1;
461
+
462
+ if (vdir < 0) {
463
+ j_end = -1; j = y-1;
464
+ } else {
465
+ j_end = y; j = 0;
466
+ }
467
+
468
+ for (; j != j_end; j += vdir) {
469
+ for (i=0; i < x; ++i) {
470
+ unsigned char *d = (unsigned char *) data + (j*x+i)*comp;
471
+ stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d);
472
+ }
473
+ stbiw__write_flush(s);
474
+ s->func(s->context, &zero, scanline_pad);
475
+ }
476
+ }
477
+
478
+ static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono, void *data, int alpha, int pad, const char *fmt, ...)
479
+ {
480
+ if (y < 0 || x < 0) {
481
+ return 0;
482
+ } else {
483
+ va_list v;
484
+ va_start(v, fmt);
485
+ stbiw__writefv(s, fmt, v);
486
+ va_end(v);
487
+ stbiw__write_pixels(s,rgb_dir,vdir,x,y,comp,data,alpha,pad, expand_mono);
488
+ return 1;
489
+ }
490
+ }
491
+
492
+ static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data)
493
+ {
494
+ if (comp != 4) {
495
+ // write RGB bitmap
496
+ int pad = (-x*3) & 3;
497
+ return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad,
498
+ "11 4 22 4" "4 44 22 444444",
499
+ 'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40, // file header
500
+ 40, x,y, 1,24, 0,0,0,0,0,0); // bitmap header
501
+ } else {
502
+ // RGBA bitmaps need a v4 header
503
+ // use BI_BITFIELDS mode with 32bpp and alpha mask
504
+ // (straight BI_RGB with alpha mask doesn't work in most readers)
505
+ return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *)data,1,0,
506
+ "11 4 22 4" "4 44 22 444444 4444 4 444 444 444 444",
507
+ 'B', 'M', 14+108+x*y*4, 0, 0, 14+108, // file header
508
+ 108, x,y, 1,32, 3,0,0,0,0,0, 0xff0000,0xff00,0xff,0xff000000u, 0, 0,0,0, 0,0,0, 0,0,0, 0,0,0); // bitmap V4 header
509
+ }
510
+ }
511
+
512
+ STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
513
+ {
514
+ stbi__write_context s = { 0 };
515
+ stbi__start_write_callbacks(&s, func, context);
516
+ return stbi_write_bmp_core(&s, x, y, comp, data);
517
+ }
518
+
519
+ #ifndef STBI_WRITE_NO_STDIO
520
+ STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data)
521
+ {
522
+ stbi__write_context s = { 0 };
523
+ if (stbi__start_write_file(&s,filename)) {
524
+ int r = stbi_write_bmp_core(&s, x, y, comp, data);
525
+ stbi__end_write_file(&s);
526
+ return r;
527
+ } else
528
+ return 0;
529
+ }
530
+ #endif //!STBI_WRITE_NO_STDIO
531
+
532
+ static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, void *data)
533
+ {
534
+ int has_alpha = (comp == 2 || comp == 4);
535
+ int colorbytes = has_alpha ? comp-1 : comp;
536
+ int format = colorbytes < 2 ? 3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3
537
+
538
+ if (y < 0 || x < 0)
539
+ return 0;
540
+
541
+ if (!stbi_write_tga_with_rle) {
542
+ return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void *) data, has_alpha, 0,
543
+ "111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8);
544
+ } else {
545
+ int i,j,k;
546
+ int jend, jdir;
547
+
548
+ stbiw__writef(s, "111 221 2222 11", 0,0,format+8, 0,0,0, 0,0,x,y, (colorbytes + has_alpha) * 8, has_alpha * 8);
549
+
550
+ if (stbi__flip_vertically_on_write) {
551
+ j = 0;
552
+ jend = y;
553
+ jdir = 1;
554
+ } else {
555
+ j = y-1;
556
+ jend = -1;
557
+ jdir = -1;
558
+ }
559
+ for (; j != jend; j += jdir) {
560
+ unsigned char *row = (unsigned char *) data + j * x * comp;
561
+ int len;
562
+
563
+ for (i = 0; i < x; i += len) {
564
+ unsigned char *begin = row + i * comp;
565
+ int diff = 1;
566
+ len = 1;
567
+
568
+ if (i < x - 1) {
569
+ ++len;
570
+ diff = memcmp(begin, row + (i + 1) * comp, comp);
571
+ if (diff) {
572
+ const unsigned char *prev = begin;
573
+ for (k = i + 2; k < x && len < 128; ++k) {
574
+ if (memcmp(prev, row + k * comp, comp)) {
575
+ prev += comp;
576
+ ++len;
577
+ } else {
578
+ --len;
579
+ break;
580
+ }
581
+ }
582
+ } else {
583
+ for (k = i + 2; k < x && len < 128; ++k) {
584
+ if (!memcmp(begin, row + k * comp, comp)) {
585
+ ++len;
586
+ } else {
587
+ break;
588
+ }
589
+ }
590
+ }
591
+ }
592
+
593
+ if (diff) {
594
+ unsigned char header = STBIW_UCHAR(len - 1);
595
+ stbiw__write1(s, header);
596
+ for (k = 0; k < len; ++k) {
597
+ stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp);
598
+ }
599
+ } else {
600
+ unsigned char header = STBIW_UCHAR(len - 129);
601
+ stbiw__write1(s, header);
602
+ stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin);
603
+ }
604
+ }
605
+ }
606
+ stbiw__write_flush(s);
607
+ }
608
+ return 1;
609
+ }
610
+
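+ // TGA RLE packet note: a header byte with the high bit clear introduces
+ // header+1 raw pixels; with the high bit set, a run of (header & 0x7F) + 1
+ // copies of one pixel. STBIW_UCHAR(len - 129) above wraps to 0x80 | (len - 1),
+ // i.e. a run packet of length len.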
611
+ STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
612
+ {
613
+ stbi__write_context s = { 0 };
614
+ stbi__start_write_callbacks(&s, func, context);
615
+ return stbi_write_tga_core(&s, x, y, comp, (void *) data);
616
+ }
617
+
618
+ #ifndef STBI_WRITE_NO_STDIO
619
+ STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data)
620
+ {
621
+ stbi__write_context s = { 0 };
622
+ if (stbi__start_write_file(&s,filename)) {
623
+ int r = stbi_write_tga_core(&s, x, y, comp, (void *) data);
624
+ stbi__end_write_file(&s);
625
+ return r;
626
+ } else
627
+ return 0;
628
+ }
629
+ #endif
630
+
631
+ // *************************************************************************************************
632
+ // Radiance RGBE HDR writer
633
+ // by Baldur Karlsson
634
+
635
+ #define stbiw__max(a, b) ((a) > (b) ? (a) : (b))
636
+
637
+ #ifndef STBI_WRITE_NO_STDIO
638
+
639
+ static void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
640
+ {
641
+ int exponent;
642
+ float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2]));
643
+
644
+ if (maxcomp < 1e-32f) {
645
+ rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0;
646
+ } else {
647
+ float normalize = (float) frexp(maxcomp, &exponent) * 256.0f/maxcomp;
648
+
649
+ rgbe[0] = (unsigned char)(linear[0] * normalize);
650
+ rgbe[1] = (unsigned char)(linear[1] * normalize);
651
+ rgbe[2] = (unsigned char)(linear[2] * normalize);
652
+ rgbe[3] = (unsigned char)(exponent + 128);
653
+ }
654
+ }
655
+
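+ // RGBE packs an HDR pixel as three 8-bit mantissas sharing one exponent
+ // byte biased by 128. Worked example: linear (0.5, 0.25, 0.125) has
+ // maxcomp 0.5 = 0.5 * 2^0, so normalize = 256 and the stored bytes are
+ // (128, 64, 32, 128).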
656
+ static void stbiw__write_run_data(stbi__write_context *s, int length, unsigned char databyte)
657
+ {
658
+ unsigned char lengthbyte = STBIW_UCHAR(length+128);
659
+ STBIW_ASSERT(length+128 <= 255);
660
+ s->func(s->context, &lengthbyte, 1);
661
+ s->func(s->context, &databyte, 1);
662
+ }
663
+
664
+ static void stbiw__write_dump_data(stbi__write_context *s, int length, unsigned char *data)
665
+ {
666
+ unsigned char lengthbyte = STBIW_UCHAR(length);
667
+ STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code
668
+ s->func(s->context, &lengthbyte, 1);
669
+ s->func(s->context, data, length);
670
+ }
671
+
672
+ static void stbiw__write_hdr_scanline(stbi__write_context *s, int width, int ncomp, unsigned char *scratch, float *scanline)
673
+ {
674
+ unsigned char scanlineheader[4] = { 2, 2, 0, 0 };
675
+ unsigned char rgbe[4];
676
+ float linear[3];
677
+ int x;
678
+
679
+ scanlineheader[2] = (width&0xff00)>>8;
680
+ scanlineheader[3] = (width&0x00ff);
681
+
682
+ /* skip RLE for images too small or large */
683
+ if (width < 8 || width >= 32768) {
684
+ for (x=0; x < width; x++) {
685
+ switch (ncomp) {
686
+ case 4: /* fallthrough */
687
+ case 3: linear[2] = scanline[x*ncomp + 2];
688
+ linear[1] = scanline[x*ncomp + 1];
689
+ linear[0] = scanline[x*ncomp + 0];
690
+ break;
691
+ default:
692
+ linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
693
+ break;
694
+ }
695
+ stbiw__linear_to_rgbe(rgbe, linear);
696
+ s->func(s->context, rgbe, 4);
697
+ }
698
+ } else {
699
+ int c,r;
700
+ /* encode into scratch buffer */
701
+ for (x=0; x < width; x++) {
702
+ switch(ncomp) {
703
+ case 4: /* fallthrough */
704
+ case 3: linear[2] = scanline[x*ncomp + 2];
705
+ linear[1] = scanline[x*ncomp + 1];
706
+ linear[0] = scanline[x*ncomp + 0];
707
+ break;
708
+ default:
709
+ linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
710
+ break;
711
+ }
712
+ stbiw__linear_to_rgbe(rgbe, linear);
713
+ scratch[x + width*0] = rgbe[0];
714
+ scratch[x + width*1] = rgbe[1];
715
+ scratch[x + width*2] = rgbe[2];
716
+ scratch[x + width*3] = rgbe[3];
717
+ }
718
+
719
+ s->func(s->context, scanlineheader, 4);
720
+
721
+ /* RLE each component separately */
722
+ for (c=0; c < 4; c++) {
723
+ unsigned char *comp = &scratch[width*c];
724
+
725
+ x = 0;
726
+ while (x < width) {
727
+ // find first run
728
+ r = x;
729
+ while (r+2 < width) {
730
+ if (comp[r] == comp[r+1] && comp[r] == comp[r+2])
731
+ break;
732
+ ++r;
733
+ }
734
+ if (r+2 >= width)
735
+ r = width;
736
+ // dump up to first run
737
+ while (x < r) {
738
+ int len = r-x;
739
+ if (len > 128) len = 128;
740
+ stbiw__write_dump_data(s, len, &comp[x]);
741
+ x += len;
742
+ }
743
+ // if there's a run, output it
744
+ if (r+2 < width) { // same test as what we break out of in search loop, so only true if we break'd
745
+ // find next byte after run
746
+ while (r < width && comp[r] == comp[x])
747
+ ++r;
748
+ // output run up to r
749
+ while (x < r) {
750
+ int len = r-x;
751
+ if (len > 127) len = 127;
752
+ stbiw__write_run_data(s, len, comp[x]);
753
+ x += len;
754
+ }
755
+ }
756
+ }
757
+ }
758
+ }
759
+ }
760
+
761
+ static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, float *data)
762
+ {
763
+ if (y <= 0 || x <= 0 || data == NULL)
764
+ return 0;
765
+ else {
766
+ // Each component is stored separately. Allocate scratch space for full output scanline.
767
+ unsigned char *scratch = (unsigned char *) STBIW_MALLOC(x*4);
768
+ int i, len;
769
+ char buffer[128];
770
+ char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n";
771
+ s->func(s->context, header, sizeof(header)-1);
772
+
773
+ #ifdef __STDC_LIB_EXT1__
774
+ len = sprintf_s(buffer, sizeof(buffer), "EXPOSURE= 1.0000000000000\n\n-Y %d +X %d\n", y, x);
775
+ #else
776
+ len = sprintf(buffer, "EXPOSURE= 1.0000000000000\n\n-Y %d +X %d\n", y, x);
777
+ #endif
778
+ s->func(s->context, buffer, len);
779
+
780
+ for(i=0; i < y; i++)
781
+ stbiw__write_hdr_scanline(s, x, comp, scratch, data + comp*x*(stbi__flip_vertically_on_write ? y-1-i : i));
782
+ STBIW_FREE(scratch);
783
+ return 1;
784
+ }
785
+ }
786
+
787
+ STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data)
788
+ {
789
+ stbi__write_context s = { 0 };
790
+ stbi__start_write_callbacks(&s, func, context);
791
+ return stbi_write_hdr_core(&s, x, y, comp, (float *) data);
792
+ }
793
+
794
+ STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data)
795
+ {
796
+ stbi__write_context s = { 0 };
797
+ if (stbi__start_write_file(&s,filename)) {
798
+ int r = stbi_write_hdr_core(&s, x, y, comp, (float *) data);
799
+ stbi__end_write_file(&s);
800
+ return r;
801
+ } else
802
+ return 0;
803
+ }
804
+ #endif // STBI_WRITE_NO_STDIO
805
+
806
+
807
+ //////////////////////////////////////////////////////////////////////////////
808
+ //
809
+ // PNG writer
810
+ //
811
+
812
+ #ifndef STBIW_ZLIB_COMPRESS
813
+ // stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size()
814
+ #define stbiw__sbraw(a) ((int *) (void *) (a) - 2)
815
+ #define stbiw__sbm(a) stbiw__sbraw(a)[0]
816
+ #define stbiw__sbn(a) stbiw__sbraw(a)[1]
817
+
818
+ #define stbiw__sbneedgrow(a,n) ((a)==0 || stbiw__sbn(a)+n >= stbiw__sbm(a))
819
+ #define stbiw__sbmaybegrow(a,n) (stbiw__sbneedgrow(a,(n)) ? stbiw__sbgrow(a,n) : 0)
820
+ #define stbiw__sbgrow(a,n) stbiw__sbgrowf((void **) &(a), (n), sizeof(*(a)))
821
+
822
+ #define stbiw__sbpush(a, v) (stbiw__sbmaybegrow(a,1), (a)[stbiw__sbn(a)++] = (v))
823
+ #define stbiw__sbcount(a) ((a) ? stbiw__sbn(a) : 0)
824
+ #define stbiw__sbfree(a) ((a) ? STBIW_FREE(stbiw__sbraw(a)),0 : 0)
825
+
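+ // Layout note: the stretchy buffer keeps two ints (capacity m, count n)
+ // immediately before the returned element pointer, so stbiw__sbraw(a) is
+ // (int *) a - 2; stbiw__sbgrowf below roughly doubles capacity on growth.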
826
+ static void *stbiw__sbgrowf(void **arr, int increment, int itemsize)
827
+ {
828
+ int m = *arr ? 2*stbiw__sbm(*arr)+increment : increment+1;
829
+ void *p = STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? (stbiw__sbm(*arr)*itemsize + sizeof(int)*2) : 0, itemsize * m + sizeof(int)*2);
830
+ STBIW_ASSERT(p);
831
+ if (p) {
832
+ if (!*arr) ((int *) p)[1] = 0;
833
+ *arr = (void *) ((int *) p + 2);
834
+ stbiw__sbm(*arr) = m;
835
+ }
836
+ return *arr;
837
+ }
838
+
839
+ static unsigned char *stbiw__zlib_flushf(unsigned char *data, unsigned int *bitbuffer, int *bitcount)
840
+ {
841
+ while (*bitcount >= 8) {
842
+ stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer));
843
+ *bitbuffer >>= 8;
844
+ *bitcount -= 8;
845
+ }
846
+ return data;
847
+ }
848
+
849
+ static int stbiw__zlib_bitrev(int code, int codebits)
850
+ {
851
+ int res=0;
852
+ while (codebits--) {
853
+ res = (res << 1) | (code & 1);
854
+ code >>= 1;
855
+ }
856
+ return res;
857
+ }
858
+
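+ // DEFLATE packs Huffman codes most-significant-bit first, while this writer
+ // accumulates bits LSB-first, hence the bit reversal above; for example,
+ // stbiw__zlib_bitrev(0x30, 8) == 0x0C.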
859
+ static unsigned int stbiw__zlib_countm(unsigned char *a, unsigned char *b, int limit)
860
+ {
861
+ int i;
862
+ for (i=0; i < limit && i < 258; ++i)
863
+ if (a[i] != b[i]) break;
864
+ return i;
865
+ }
866
+
867
+ static unsigned int stbiw__zhash(unsigned char *data)
868
+ {
869
+ stbiw_uint32 hash = data[0] + (data[1] << 8) + (data[2] << 16);
870
+ hash ^= hash << 3;
871
+ hash += hash >> 5;
872
+ hash ^= hash << 4;
873
+ hash += hash >> 17;
874
+ hash ^= hash << 25;
875
+ hash += hash >> 6;
876
+ return hash;
877
+ }
878
+
879
+ #define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount))
880
+ #define stbiw__zlib_add(code,codebits) \
881
+ (bitbuf |= (code) << bitcount, bitcount += (codebits), stbiw__zlib_flush())
882
+ #define stbiw__zlib_huffa(b,c) stbiw__zlib_add(stbiw__zlib_bitrev(b,c),c)
883
+ // default huffman tables
884
+ #define stbiw__zlib_huff1(n) stbiw__zlib_huffa(0x30 + (n), 8)
885
+ #define stbiw__zlib_huff2(n) stbiw__zlib_huffa(0x190 + (n)-144, 9)
886
+ #define stbiw__zlib_huff3(n) stbiw__zlib_huffa(0 + (n)-256,7)
887
+ #define stbiw__zlib_huff4(n) stbiw__zlib_huffa(0xc0 + (n)-280,8)
888
+ #define stbiw__zlib_huff(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : (n) <= 255 ? stbiw__zlib_huff2(n) : (n) <= 279 ? stbiw__zlib_huff3(n) : stbiw__zlib_huff4(n))
889
+ #define stbiw__zlib_huffb(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : stbiw__zlib_huff2(n))
890
+
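+ // These four cases mirror the fixed Huffman table of RFC 1951 sec. 3.2.6:
+ // literals 0-143 get 8-bit codes from base 0x30, 144-255 get 9-bit codes
+ // from base 0x190, length symbols 256-279 get 7-bit codes from base 0,
+ // and 280-287 get 8-bit codes from base 0xC0.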
891
+ #define stbiw__ZHASH 16384
892
+
893
+ #endif // STBIW_ZLIB_COMPRESS
894
+
895
+ STBIWDEF unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_len, int quality)
896
+ {
897
+ #ifdef STBIW_ZLIB_COMPRESS
898
+ // user provided a zlib compress implementation, use that
899
+ return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality);
900
+ #else // use builtin
901
+ static unsigned short lengthc[] = { 3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,67,83,99,115,131,163,195,227,258, 259 };
902
+ static unsigned char lengtheb[]= { 0,0,0,0,0,0,0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0 };
903
+ static unsigned short distc[] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577, 32768 };
904
+ static unsigned char disteb[] = { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 };
905
+ unsigned int bitbuf=0;
906
+ int i,j, bitcount=0;
907
+ unsigned char *out = NULL;
908
+ unsigned char ***hash_table = (unsigned char***) STBIW_MALLOC(stbiw__ZHASH * sizeof(unsigned char**));
909
+ if (hash_table == NULL)
910
+ return NULL;
911
+ if (quality < 5) quality = 5;
912
+
913
+ stbiw__sbpush(out, 0x78); // DEFLATE 32K window
914
+ stbiw__sbpush(out, 0x5e); // FLEVEL = 1
915
+ stbiw__zlib_add(1,1); // BFINAL = 1
916
+ stbiw__zlib_add(1,2); // BTYPE = 1 -- fixed huffman
917
+
918
+ for (i=0; i < stbiw__ZHASH; ++i)
919
+ hash_table[i] = NULL;
920
+
921
+ i=0;
922
+ while (i < data_len-3) {
923
+ // hash next 3 bytes of data to be compressed
924
+ int h = stbiw__zhash(data+i)&(stbiw__ZHASH-1), best=3;
925
+ unsigned char *bestloc = 0;
926
+ unsigned char **hlist = hash_table[h];
927
+ int n = stbiw__sbcount(hlist);
928
+ for (j=0; j < n; ++j) {
929
+ if (hlist[j]-data > i-32768) { // if entry lies within window
930
+ int d = stbiw__zlib_countm(hlist[j], data+i, data_len-i);
931
+ if (d >= best) { best=d; bestloc=hlist[j]; }
932
+ }
933
+ }
934
+ // when hash table entry is too long, delete half the entries
935
+ if (hash_table[h] && stbiw__sbn(hash_table[h]) == 2*quality) {
936
+ STBIW_MEMMOVE(hash_table[h], hash_table[h]+quality, sizeof(hash_table[h][0])*quality);
937
+ stbiw__sbn(hash_table[h]) = quality;
938
+ }
939
+ stbiw__sbpush(hash_table[h],data+i);
940
+
941
+ if (bestloc) {
942
+ // "lazy matching" - check match at *next* byte, and if it's better, do cur byte as literal
943
+ h = stbiw__zhash(data+i+1)&(stbiw__ZHASH-1);
944
+ hlist = hash_table[h];
945
+ n = stbiw__sbcount(hlist);
946
+ for (j=0; j < n; ++j) {
947
+ if (hlist[j]-data > i-32767) {
948
+ int e = stbiw__zlib_countm(hlist[j], data+i+1, data_len-i-1);
949
+ if (e > best) { // if next match is better, bail on current match
950
+ bestloc = NULL;
951
+ break;
952
+ }
953
+ }
954
+ }
955
+ }
956
+
957
+ if (bestloc) {
958
+ int d = (int) (data+i - bestloc); // distance back
959
+ STBIW_ASSERT(d <= 32767 && best <= 258);
960
+ for (j=0; best > lengthc[j+1]-1; ++j);
961
+ stbiw__zlib_huff(j+257);
962
+ if (lengtheb[j]) stbiw__zlib_add(best - lengthc[j], lengtheb[j]);
963
+ for (j=0; d > distc[j+1]-1; ++j);
964
+ stbiw__zlib_add(stbiw__zlib_bitrev(j,5),5);
965
+ if (disteb[j]) stbiw__zlib_add(d - distc[j], disteb[j]);
966
+ i += best;
967
+ } else {
968
+ stbiw__zlib_huffb(data[i]);
969
+ ++i;
970
+ }
971
+ }
972
+ // write out final bytes
973
+ for (;i < data_len; ++i)
974
+ stbiw__zlib_huffb(data[i]);
975
+ stbiw__zlib_huff(256); // end of block
976
+ // pad with 0 bits to byte boundary
977
+ while (bitcount)
978
+ stbiw__zlib_add(0,1);
979
+
980
+ for (i=0; i < stbiw__ZHASH; ++i)
981
+ (void) stbiw__sbfree(hash_table[i]);
982
+ STBIW_FREE(hash_table);
983
+
984
+ // store uncompressed instead if compression was worse
985
+ if (stbiw__sbn(out) > data_len + 2 + ((data_len+32766)/32767)*5) {
986
+ stbiw__sbn(out) = 2; // truncate to DEFLATE 32K window and FLEVEL = 1
987
+ for (j = 0; j < data_len;) {
988
+ int blocklen = data_len - j;
989
+ if (blocklen > 32767) blocklen = 32767;
990
+ stbiw__sbpush(out, data_len - j == blocklen); // BFINAL = ?, BTYPE = 0 -- no compression
991
+ stbiw__sbpush(out, STBIW_UCHAR(blocklen)); // LEN
992
+ stbiw__sbpush(out, STBIW_UCHAR(blocklen >> 8));
993
+ stbiw__sbpush(out, STBIW_UCHAR(~blocklen)); // NLEN
994
+ stbiw__sbpush(out, STBIW_UCHAR(~blocklen >> 8));
995
+ memcpy(out+stbiw__sbn(out), data+j, blocklen);
996
+ stbiw__sbn(out) += blocklen;
997
+ j += blocklen;
998
+ }
999
+ }
1000
+
1001
+ {
1002
+ // compute adler32 on input
1003
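+ // (5552 is the largest block size for which the running sums cannot
+ // overflow an unsigned 32-bit accumulator before the % 65521 reduction)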
+ unsigned int s1=1, s2=0;
1004
+ int blocklen = (int) (data_len % 5552);
1005
+ j=0;
1006
+ while (j < data_len) {
1007
+ for (i=0; i < blocklen; ++i) { s1 += data[j+i]; s2 += s1; }
1008
+ s1 %= 65521; s2 %= 65521;
1009
+ j += blocklen;
1010
+ blocklen = 5552;
1011
+ }
1012
+ stbiw__sbpush(out, STBIW_UCHAR(s2 >> 8));
1013
+ stbiw__sbpush(out, STBIW_UCHAR(s2));
1014
+ stbiw__sbpush(out, STBIW_UCHAR(s1 >> 8));
1015
+ stbiw__sbpush(out, STBIW_UCHAR(s1));
1016
+ }
1017
+ *out_len = stbiw__sbn(out);
1018
+ // make returned pointer freeable
1019
+ STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len);
1020
+ return (unsigned char *) stbiw__sbraw(out);
1021
+ #endif // STBIW_ZLIB_COMPRESS
1022
+ }
1023
+
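+ // Illustrative usage sketch for stbi_zlib_compress (buf and buf_len are
+ // placeholders; 8 matches stb's default PNG compression level). The
+ // returned buffer is made freeable by the code above:
+ //   int zlen;
+ //   unsigned char *z = stbi_zlib_compress(buf, buf_len, &zlen, 8);
+ //   if (z) { /* consume z[0..zlen) */ STBIW_FREE(z); }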
1024
+ static unsigned int stbiw__crc32(unsigned char *buffer, int len)
1025
+ {
1026
+ #ifdef STBIW_CRC32
1027
+ return STBIW_CRC32(buffer, len);
1028
+ #else
1029
+ static unsigned int crc_table[256] =
1030
+ {
1031
+ 0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3,
1032
+ 0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91,
1033
+ 0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
1034
+ 0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5,
1035
+ 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B,
1036
+ 0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
1037
+ 0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
1038
+ 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D,
1039
+ 0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
1040
+ 0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01,
1041
+ 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457,
1042
+ 0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
1043
+ 0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB,
1044
+ 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
1045
+ 0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
1046
+ 0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD,
1047
+ 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683,
1048
+ 0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
1049
+ 0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7,
1050
+ 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5,
1051
+ 0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
1052
+ 0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79,
1053
+ 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F,
1054
+ 0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
1055
+ 0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713,
1056
+ 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21,
1057
+ 0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
1058
+ 0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
1059
+ 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB,
1060
+ 0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
1061
+ 0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF,
1062
+ 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
1063
+ };
1064
+
1065
+ unsigned int crc = ~0u;
1066
+ int i;
1067
+ for (i=0; i < len; ++i)
1068
+ crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)];
1069
+ return ~crc;
1070
+ #endif
1071
+ }
1072
+
1073
+ #define stbiw__wpng4(o,a,b,c,d) ((o)[0]=STBIW_UCHAR(a),(o)[1]=STBIW_UCHAR(b),(o)[2]=STBIW_UCHAR(c),(o)[3]=STBIW_UCHAR(d),(o)+=4)
1074
+ #define stbiw__wp32(data,v) stbiw__wpng4(data, (v)>>24,(v)>>16,(v)>>8,(v));
1075
+ #define stbiw__wptag(data,s) stbiw__wpng4(data, s[0],s[1],s[2],s[3])
1076
+
1077
+ static void stbiw__wpcrc(unsigned char **data, int len)
1078
+ {
1079
+ unsigned int crc = stbiw__crc32(*data - len - 4, len+4);
1080
+ stbiw__wp32(*data, crc);
1081
+ }
1082
+
1083
+ static unsigned char stbiw__paeth(int a, int b, int c)
1084
+ {
1085
+ int p = a + b - c, pa = abs(p-a), pb = abs(p-b), pc = abs(p-c);
1086
+ if (pa <= pb && pa <= pc) return STBIW_UCHAR(a);
1087
+ if (pb <= pc) return STBIW_UCHAR(b);
1088
+ return STBIW_UCHAR(c);
1089
+ }
1090
+
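+ // The Paeth predictor picks whichever of left (a), above (b), or upper-left
+ // (c) is closest to p = a + b - c. Worked example: a=50, b=60, c=40 gives
+ // p=70, pa=20, pb=10, pc=30, so b (60) is chosen.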
1091
+ // @OPTIMIZE: provide an option that always forces left-predict or paeth predict
1092
+ static void stbiw__encode_png_line(unsigned char *pixels, int stride_bytes, int width, int height, int y, int n, int filter_type, signed char *line_buffer)
1093
+ {
1094
+ static int mapping[] = { 0,1,2,3,4 };
1095
+ static int firstmap[] = { 0,1,0,5,6 };
1096
+ int *mymap = (y != 0) ? mapping : firstmap;
1097
+ int i;
1098
+ int type = mymap[filter_type];
1099
+ unsigned char *z = pixels + stride_bytes * (stbi__flip_vertically_on_write ? height-1-y : y);
1100
+ int signed_stride = stbi__flip_vertically_on_write ? -stride_bytes : stride_bytes;
1101
+
1102
+ if (type==0) {
1103
+ memcpy(line_buffer, z, width*n);
1104
+ return;
1105
+ }
1106
+
1107
+ // first loop isn't optimized since it's just one pixel
1108
+ for (i = 0; i < n; ++i) {
1109
+ switch (type) {
1110
+ case 1: line_buffer[i] = z[i]; break;
1111
+ case 2: line_buffer[i] = z[i] - z[i-signed_stride]; break;
1112
+ case 3: line_buffer[i] = z[i] - (z[i-signed_stride]>>1); break;
1113
+ case 4: line_buffer[i] = (signed char) (z[i] - stbiw__paeth(0,z[i-signed_stride],0)); break;
1114
+ case 5: line_buffer[i] = z[i]; break;
1115
+ case 6: line_buffer[i] = z[i]; break;
1116
+ }
1117
+ }
1118
+ switch (type) {
1119
+ case 1: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-n]; break;
1120
+ case 2: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-signed_stride]; break;
1121
+ case 3: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - ((z[i-n] + z[i-signed_stride])>>1); break;
1122
+ case 4: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], z[i-signed_stride], z[i-signed_stride-n]); break;
1123
+ case 5: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - (z[i-n]>>1); break;
1124
+ case 6: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], 0,0); break;
1125
+ }
1126
+ }
1127
+
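+ // Filter-type note: types 0-4 are the standard PNG filters (none, sub, up,
+ // average, Paeth); internal types 5 and 6 are the first-row variants of
+ // average and Paeth used when the prior row does not exist and is treated
+ // as zero (see firstmap above).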
1128
+ STBIWDEF unsigned char *stbi_write_png_to_mem(const unsigned char *pixels, int stride_bytes, int x, int y, int n, int *out_len, const char* parameters)
1129
+ {
1130
+ int force_filter = stbi_write_force_png_filter;
1131
+ int param_length = 0;
1132
+ int ctype[5] = { -1, 0, 4, 2, 6 };
1133
+ unsigned char sig[8] = { 137,80,78,71,13,10,26,10 };
1134
+ unsigned char *out,*o, *filt, *zlib;
1135
+ signed char *line_buffer;
1136
+ int j,zlen;
1137
+
1138
+ if (stride_bytes == 0)
1139
+ stride_bytes = x * n;
1140
+
1141
+ if (force_filter >= 5) {
1142
+ force_filter = -1;
1143
+ }
1144
+
1145
+ filt = (unsigned char *) STBIW_MALLOC((x*n+1) * y); if (!filt) return 0;
1146
+ line_buffer = (signed char *) STBIW_MALLOC(x * n); if (!line_buffer) { STBIW_FREE(filt); return 0; }
1147
+ for (j=0; j < y; ++j) {
1148
+ int filter_type;
1149
+ if (force_filter > -1) {
1150
+ filter_type = force_filter;
1151
+ stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, force_filter, line_buffer);
1152
+ } else { // Estimate the best filter by running through all of them:
1153
+ int best_filter = 0, best_filter_val = 0x7fffffff, est, i;
1154
+ for (filter_type = 0; filter_type < 5; filter_type++) {
1155
+ stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, filter_type, line_buffer);
1156
+
1157
+ // Estimate the entropy of the line using this filter; the less, the better.
1158
+ est = 0;
1159
+ for (i = 0; i < x*n; ++i) {
1160
+ est += abs((signed char) line_buffer[i]);
1161
+ }
1162
+ if (est < best_filter_val) {
1163
+ best_filter_val = est;
1164
+ best_filter = filter_type;
1165
+ }
1166
+ }
1167
+ if (filter_type != best_filter) { // If the last iteration already got us the best filter, don't redo it
1168
+ stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, best_filter, line_buffer);
1169
+ filter_type = best_filter;
1170
+ }
1171
+ }
1172
+ // when we get here, filter_type contains the filter type, and line_buffer contains the data
1173
+ filt[j*(x*n+1)] = (unsigned char) filter_type;
1174
+ STBIW_MEMMOVE(filt+j*(x*n+1)+1, line_buffer, x*n);
1175
+ }
1176
+ STBIW_FREE(line_buffer);
1177
+ zlib = stbi_zlib_compress(filt, y*( x*n+1), &zlen, stbi_write_png_compression_level);
1178
+ STBIW_FREE(filt);
1179
+ if (!zlib) return 0;
1180
+
1181
+ if(parameters != NULL) {
1182
+ param_length = strlen(parameters);
1183
+ param_length += strlen("parameters") + 1; // For the name and the null-byte
1184
+ }
1185
+
1186
+ // each tag requires 12 bytes of overhead
1187
+ out = (unsigned char *) STBIW_MALLOC(8 + 12+13 + 12+zlen + 12 + ((parameters)?(param_length+12):0));
1188
+ if (!out) { STBIW_FREE(zlib); return 0; } // don't leak the zlib buffer on allocation failure
1189
+ *out_len = 8 + 12+13 + 12+zlen + 12 + ((parameters)?(param_length+12):0);
1190
+
1191
+ o=out;
1192
+ STBIW_MEMMOVE(o,sig,8); o+= 8;
1193
+ stbiw__wp32(o, 13); // header length
1194
+ stbiw__wptag(o, "IHDR");
1195
+ stbiw__wp32(o, x);
1196
+ stbiw__wp32(o, y);
1197
+ *o++ = 8;
1198
+ *o++ = STBIW_UCHAR(ctype[n]);
1199
+ *o++ = 0;
1200
+ *o++ = 0;
1201
+ *o++ = 0;
1202
+ stbiw__wpcrc(&o,13);
1203
+
1204
+ if(parameters != NULL) {
1205
+ stbiw__wp32(o, param_length);
1206
+ stbiw__wptag(o, "tEXt");
1207
+ STBIW_MEMMOVE(o, "parameters", strlen("parameters"));
1208
+ o+=strlen("parameters");
1209
+ *o++ = 0; // Null byte separator
1210
+ STBIW_MEMMOVE(o, parameters, strlen(parameters));
1211
+ o+=strlen(parameters);
1212
+ stbiw__wpcrc(&o, param_length);
1213
+ }
1214
+
1215
+ stbiw__wp32(o, zlen);
1216
+ stbiw__wptag(o, "IDAT");
1217
+ STBIW_MEMMOVE(o, zlib, zlen);
1218
+ o += zlen;
1219
+ STBIW_FREE(zlib);
1220
+ stbiw__wpcrc(&o, zlen);
1221
+
1222
+ stbiw__wp32(o,0);
1223
+ stbiw__wptag(o, "IEND");
1224
+ stbiw__wpcrc(&o,0);
1225
+
1226
+ STBIW_ASSERT(o == out + *out_len);
1227
+
1228
+ return out;
1229
+ }
1230
+
1231
+ #ifndef STBI_WRITE_NO_STDIO
1232
+ STBIWDEF int stbi_write_png(char const *filename, int x, int y, int comp, const void *data, int stride_bytes, const char* parameters)
1233
+ {
1234
+ FILE *f;
1235
+ int len;
1236
+ unsigned char *png = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &len, parameters);
1237
+ if (png == NULL) return 0;
1238
+
1239
+ f = stbiw__fopen(filename, "wb");
1240
+ if (!f) { STBIW_FREE(png); return 0; }
1241
+ fwrite(png, 1, len, f);
1242
+ fclose(f);
1243
+ STBIW_FREE(png);
1244
+ return 1;
1245
+ }
1246
+ #endif
1247
+
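+ // Illustrative usage of the tEXt-embedding extension above (the final
+ // `parameters` argument is specific to this patched copy, not upstream
+ // stb_image_write; w, h, and pixels are placeholders for a tightly packed
+ // RGB buffer):
+ //   stbi_write_png("out.png", w, h, 3, pixels, w * 3, "a lovely cat");
+ // This stores the string in a tEXt chunk under the keyword "parameters".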
1248
+ STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int stride_bytes)
1249
+ {
1250
+ int len;
1251
+ unsigned char *png = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &len, NULL);
1252
+ if (png == NULL) return 0;
1253
+ func(context, png, len);
1254
+ STBIW_FREE(png);
1255
+ return 1;
1256
+ }
1257
+
1258
+
1259
+ /* ***************************************************************************
1260
+ *
1261
+ * JPEG writer
1262
+ *
1263
+ * This is based on Jon Olick's jo_jpeg.cpp:
1264
+ * public domain Simple, Minimalistic JPEG writer - http://www.jonolick.com/code.html
1265
+ */
1266
+
1267
+ static const unsigned char stbiw__jpg_ZigZag[] = { 0,1,5,6,14,15,27,28,2,4,7,13,16,26,29,42,3,8,12,17,25,30,41,43,9,11,18,
1268
+ 24,31,40,44,53,10,19,23,32,39,45,52,54,20,22,33,38,46,51,55,60,21,34,37,47,50,56,59,61,35,36,48,49,57,58,62,63 };
1269
+
1270
+ static void stbiw__jpg_writeBits(stbi__write_context *s, int *bitBufP, int *bitCntP, const unsigned short *bs) {
1271
+ int bitBuf = *bitBufP, bitCnt = *bitCntP;
1272
+ bitCnt += bs[1];
1273
+ bitBuf |= bs[0] << (24 - bitCnt);
1274
+ while(bitCnt >= 8) {
1275
+ unsigned char c = (bitBuf >> 16) & 255;
1276
+ stbiw__putc(s, c);
1277
+ if(c == 255) {
1278
+ stbiw__putc(s, 0);
1279
+ }
1280
+ bitBuf <<= 8;
1281
+ bitCnt -= 8;
1282
+ }
1283
+ *bitBufP = bitBuf;
1284
+ *bitCntP = bitCnt;
1285
+ }
1286
+
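+ // Byte-stuffing note: whenever an entropy-coded byte equals 0xFF, a 0x00
+ // byte is appended (as above), since JPEG reserves 0xFF as a marker prefix.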
1287
+ static void stbiw__jpg_DCT(float *d0p, float *d1p, float *d2p, float *d3p, float *d4p, float *d5p, float *d6p, float *d7p) {
1288
+ float d0 = *d0p, d1 = *d1p, d2 = *d2p, d3 = *d3p, d4 = *d4p, d5 = *d5p, d6 = *d6p, d7 = *d7p;
1289
+ float z1, z2, z3, z4, z5, z11, z13;
1290
+
1291
+ float tmp0 = d0 + d7;
1292
+ float tmp7 = d0 - d7;
1293
+ float tmp1 = d1 + d6;
1294
+ float tmp6 = d1 - d6;
1295
+ float tmp2 = d2 + d5;
1296
+ float tmp5 = d2 - d5;
1297
+ float tmp3 = d3 + d4;
1298
+ float tmp4 = d3 - d4;
1299
+
1300
+ // Even part
1301
+ float tmp10 = tmp0 + tmp3; // phase 2
1302
+ float tmp13 = tmp0 - tmp3;
1303
+ float tmp11 = tmp1 + tmp2;
1304
+ float tmp12 = tmp1 - tmp2;
1305
+
1306
+ d0 = tmp10 + tmp11; // phase 3
1307
+ d4 = tmp10 - tmp11;
1308
+
1309
+ z1 = (tmp12 + tmp13) * 0.707106781f; // c4
1310
+ d2 = tmp13 + z1; // phase 5
1311
+ d6 = tmp13 - z1;
1312
+
1313
+ // Odd part
1314
+ tmp10 = tmp4 + tmp5; // phase 2
1315
+ tmp11 = tmp5 + tmp6;
1316
+ tmp12 = tmp6 + tmp7;
1317
+
1318
+ // The rotator is modified from fig 4-8 to avoid extra negations.
1319
+ z5 = (tmp10 - tmp12) * 0.382683433f; // c6
1320
+ z2 = tmp10 * 0.541196100f + z5; // c2-c6
1321
+ z4 = tmp12 * 1.306562965f + z5; // c2+c6
1322
+ z3 = tmp11 * 0.707106781f; // c4
1323
+
1324
+ z11 = tmp7 + z3; // phase 5
1325
+ z13 = tmp7 - z3;
1326
+
1327
+ *d5p = z13 + z2; // phase 6
1328
+ *d3p = z13 - z2;
1329
+ *d1p = z11 + z4;
1330
+ *d7p = z11 - z4;
1331
+
1332
+ *d0p = d0; *d2p = d2; *d4p = d4; *d6p = d6;
1333
+ }
1334
+
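+ // The function above is the scaled AAN (Arai-Agui-Nakajima) 8-point DCT,
+ // as in IJG's jfdctflt.c; the per-coefficient scale factors are folded
+ // into fdtbl via the aasf[] table in stbi_write_jpg_core.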
1335
+ static void stbiw__jpg_calcBits(int val, unsigned short bits[2]) {
1336
+ int tmp1 = val < 0 ? -val : val;
1337
+ val = val < 0 ? val-1 : val;
1338
+ bits[1] = 1;
1339
+ while(tmp1 >>= 1) {
1340
+ ++bits[1];
1341
+ }
1342
+ bits[0] = val & ((1<<bits[1])-1);
1343
+ }
1344
+
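+ // stbiw__jpg_calcBits computes the JPEG magnitude category: bits[1] is the
+ // bit length of |val| and bits[0] holds the value bits, with negatives
+ // stored one's-complemented. E.g. val = 3 -> (0b11, 2); val = -3 -> (0b00, 2).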
1345
+ static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf, int *bitCnt, float *CDU, int du_stride, float *fdtbl, int DC, const unsigned short HTDC[256][2], const unsigned short HTAC[256][2]) {
1346
+ const unsigned short EOB[2] = { HTAC[0x00][0], HTAC[0x00][1] };
1347
+ const unsigned short M16zeroes[2] = { HTAC[0xF0][0], HTAC[0xF0][1] };
1348
+ int dataOff, i, j, n, diff, end0pos, x, y;
1349
+ int DU[64];
1350
+
1351
+ // DCT rows
1352
+ for(dataOff=0, n=du_stride*8; dataOff<n; dataOff+=du_stride) {
1353
+ stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+1], &CDU[dataOff+2], &CDU[dataOff+3], &CDU[dataOff+4], &CDU[dataOff+5], &CDU[dataOff+6], &CDU[dataOff+7]);
1354
+ }
1355
+ // DCT columns
1356
+ for(dataOff=0; dataOff<8; ++dataOff) {
1357
+ stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+du_stride], &CDU[dataOff+du_stride*2], &CDU[dataOff+du_stride*3], &CDU[dataOff+du_stride*4],
1358
+ &CDU[dataOff+du_stride*5], &CDU[dataOff+du_stride*6], &CDU[dataOff+du_stride*7]);
1359
+ }
1360
+ // Quantize/descale/zigzag the coefficients
1361
+ for(y = 0, j=0; y < 8; ++y) {
1362
+ for(x = 0; x < 8; ++x,++j) {
1363
+ float v;
1364
+ i = y*du_stride+x;
1365
+ v = CDU[i]*fdtbl[j];
1366
+ // DU[stbiw__jpg_ZigZag[j]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f));
1367
+ // ceilf() and floorf() are C99, not C89, but I /think/ they're not needed here anyway?
1368
+ DU[stbiw__jpg_ZigZag[j]] = (int)(v < 0 ? v - 0.5f : v + 0.5f);
1369
+ }
1370
+ }
1371
+
1372
+ // Encode DC
1373
+ diff = DU[0] - DC;
1374
+ if (diff == 0) {
1375
+ stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[0]);
1376
+ } else {
1377
+ unsigned short bits[2];
1378
+ stbiw__jpg_calcBits(diff, bits);
1379
+ stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[bits[1]]);
1380
+ stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
1381
+ }
1382
+ // Encode ACs
1383
+ end0pos = 63;
1384
+ for(; (end0pos>0)&&(DU[end0pos]==0); --end0pos) {
1385
+ }
1386
+ // end0pos = first element in reverse order !=0
1387
+ if(end0pos == 0) {
1388
+ stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
1389
+ return DU[0];
1390
+ }
1391
+ for(i = 1; i <= end0pos; ++i) {
1392
+ int startpos = i;
1393
+ int nrzeroes;
1394
+ unsigned short bits[2];
1395
+ for (; DU[i]==0 && i<=end0pos; ++i) {
1396
+ }
1397
+ nrzeroes = i-startpos;
1398
+ if ( nrzeroes >= 16 ) {
1399
+ int lng = nrzeroes>>4;
1400
+ int nrmarker;
1401
+ for (nrmarker=1; nrmarker <= lng; ++nrmarker)
1402
+ stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes);
1403
+ nrzeroes &= 15;
1404
+ }
1405
+ stbiw__jpg_calcBits(DU[i], bits);
1406
+ stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes<<4)+bits[1]]);
1407
+ stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
1408
+ }
1409
+ if(end0pos != 63) {
1410
+ stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
1411
+ }
1412
+ return DU[0];
1413
+ }
1414
+
1415
+ static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality) {
1416
+ // Constants that don't pollute global namespace
1417
+ static const unsigned char std_dc_luminance_nrcodes[] = {0,0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0};
1418
+ static const unsigned char std_dc_luminance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
1419
+ static const unsigned char std_ac_luminance_nrcodes[] = {0,0,2,1,3,3,2,4,3,5,5,4,4,0,0,1,0x7d};
1420
+ static const unsigned char std_ac_luminance_values[] = {
1421
+ 0x01,0x02,0x03,0x00,0x04,0x11,0x05,0x12,0x21,0x31,0x41,0x06,0x13,0x51,0x61,0x07,0x22,0x71,0x14,0x32,0x81,0x91,0xa1,0x08,
1422
+ 0x23,0x42,0xb1,0xc1,0x15,0x52,0xd1,0xf0,0x24,0x33,0x62,0x72,0x82,0x09,0x0a,0x16,0x17,0x18,0x19,0x1a,0x25,0x26,0x27,0x28,
1423
+ 0x29,0x2a,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,0x59,
1424
+ 0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x83,0x84,0x85,0x86,0x87,0x88,0x89,
1425
+ 0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,0xb5,0xb6,
1426
+ 0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xe1,0xe2,
1427
+ 0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
1428
+ };
1429
+ static const unsigned char std_dc_chrominance_nrcodes[] = {0,0,3,1,1,1,1,1,1,1,1,1,0,0,0,0,0};
1430
+ static const unsigned char std_dc_chrominance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
1431
+ static const unsigned char std_ac_chrominance_nrcodes[] = {0,0,2,1,2,4,4,3,4,7,5,4,4,0,1,2,0x77};
1432
+ static const unsigned char std_ac_chrominance_values[] = {
1433
+ 0x00,0x01,0x02,0x03,0x11,0x04,0x05,0x21,0x31,0x06,0x12,0x41,0x51,0x07,0x61,0x71,0x13,0x22,0x32,0x81,0x08,0x14,0x42,0x91,
1434
+ 0xa1,0xb1,0xc1,0x09,0x23,0x33,0x52,0xf0,0x15,0x62,0x72,0xd1,0x0a,0x16,0x24,0x34,0xe1,0x25,0xf1,0x17,0x18,0x19,0x1a,0x26,
1435
+ 0x27,0x28,0x29,0x2a,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,
1436
+ 0x59,0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x82,0x83,0x84,0x85,0x86,0x87,
1437
+ 0x88,0x89,0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,
1438
+ 0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,
1439
+ 0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
1440
+ };
1441
+ // Huffman tables
1442
+ static const unsigned short YDC_HT[256][2] = { {0,2},{2,3},{3,3},{4,3},{5,3},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9}};
1443
+ static const unsigned short UVDC_HT[256][2] = { {0,2},{1,2},{2,2},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9},{1022,10},{2046,11}};
1444
+ static const unsigned short YAC_HT[256][2] = {
1445
+ {10,4},{0,2},{1,2},{4,3},{11,4},{26,5},{120,7},{248,8},{1014,10},{65410,16},{65411,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1446
+ {12,4},{27,5},{121,7},{502,9},{2038,11},{65412,16},{65413,16},{65414,16},{65415,16},{65416,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1447
+ {28,5},{249,8},{1015,10},{4084,12},{65417,16},{65418,16},{65419,16},{65420,16},{65421,16},{65422,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1448
+ {58,6},{503,9},{4085,12},{65423,16},{65424,16},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1449
+ {59,6},{1016,10},{65430,16},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1450
+ {122,7},{2039,11},{65438,16},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1451
+ {123,7},{4086,12},{65446,16},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1452
+ {250,8},{4087,12},{65454,16},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1453
+ {504,9},{32704,15},{65462,16},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1454
+ {505,9},{65470,16},{65471,16},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1455
+ {506,9},{65479,16},{65480,16},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1456
+ {1017,10},{65488,16},{65489,16},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1457
+ {1018,10},{65497,16},{65498,16},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1458
+ {2040,11},{65506,16},{65507,16},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1459
+ {65515,16},{65516,16},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{0,0},{0,0},{0,0},{0,0},{0,0},
1460
+ {2041,11},{65525,16},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
1461
+ };
1462
+ static const unsigned short UVAC_HT[256][2] = {
1463
+ {0,2},{1,2},{4,3},{10,4},{24,5},{25,5},{56,6},{120,7},{500,9},{1014,10},{4084,12},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1464
+ {11,4},{57,6},{246,8},{501,9},{2038,11},{4085,12},{65416,16},{65417,16},{65418,16},{65419,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1465
+ {26,5},{247,8},{1015,10},{4086,12},{32706,15},{65420,16},{65421,16},{65422,16},{65423,16},{65424,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1466
+ {27,5},{248,8},{1016,10},{4087,12},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{65430,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1467
+ {58,6},{502,9},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{65438,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1468
+ {59,6},{1017,10},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{65446,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1469
+ {121,7},{2039,11},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{65454,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1470
+ {122,7},{2040,11},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{65462,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1471
+ {249,8},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{65470,16},{65471,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1472
+ {503,9},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{65479,16},{65480,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1473
+ {504,9},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{65488,16},{65489,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1474
+ {505,9},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{65497,16},{65498,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1475
+ {506,9},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{65506,16},{65507,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1476
+ {2041,11},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{65515,16},{65516,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
1477
+ {16352,14},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{65525,16},{0,0},{0,0},{0,0},{0,0},{0,0},
1478
+ {1018,10},{32707,15},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
1479
+ };
1480
+ static const int YQT[] = {16,11,10,16,24,40,51,61,12,12,14,19,26,58,60,55,14,13,16,24,40,57,69,56,14,17,22,29,51,87,80,62,18,22,
1481
+ 37,56,68,109,103,77,24,35,55,64,81,104,113,92,49,64,78,87,103,121,120,101,72,92,95,98,112,100,103,99};
1482
+ static const int UVQT[] = {17,18,24,47,99,99,99,99,18,21,26,66,99,99,99,99,24,26,56,99,99,99,99,99,47,66,99,99,99,99,99,99,
1483
+ 99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99};
1484
+ static const float aasf[] = { 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f,
1485
+ 1.0f * 2.828427125f, 0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f };
1486
+
1487
+ int row, col, i, k, subsample;
1488
+ float fdtbl_Y[64], fdtbl_UV[64];
1489
+ unsigned char YTable[64], UVTable[64];
1490
+
1491
+ if(!data || !width || !height || comp > 4 || comp < 1) {
1492
+ return 0;
1493
+ }
1494
+
1495
+ quality = quality ? quality : 90;
1496
+ subsample = quality <= 90 ? 1 : 0;
1497
+ quality = quality < 1 ? 1 : quality > 100 ? 100 : quality;
1498
+ quality = quality < 50 ? 5000 / quality : 200 - quality * 2;
1499
+
1500
+ for(i = 0; i < 64; ++i) {
1501
+ int uvti, yti = (YQT[i]*quality+50)/100;
1502
+ YTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (yti < 1 ? 1 : yti > 255 ? 255 : yti);
1503
+ uvti = (UVQT[i]*quality+50)/100;
1504
+ UVTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (uvti < 1 ? 1 : uvti > 255 ? 255 : uvti);
1505
+ }
1506
+
1507
+ for(row = 0, k = 0; row < 8; ++row) {
1508
+ for(col = 0; col < 8; ++col, ++k) {
1509
+ fdtbl_Y[k] = 1 / (YTable [stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
1510
+ fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
1511
+ }
1512
+ }
1513
+
1514
+ // Write Headers
1515
+ {
1516
+ static const unsigned char head0[] = { 0xFF,0xD8,0xFF,0xE0,0,0x10,'J','F','I','F',0,1,1,0,0,1,0,1,0,0,0xFF,0xDB,0,0x84,0 };
1517
+ static const unsigned char head2[] = { 0xFF,0xDA,0,0xC,3,1,0,2,0x11,3,0x11,0,0x3F,0 };
1518
+ const unsigned char head1[] = { 0xFF,0xC0,0,0x11,8,(unsigned char)(height>>8),STBIW_UCHAR(height),(unsigned char)(width>>8),STBIW_UCHAR(width),
1519
+ 3,1,(unsigned char)(subsample?0x22:0x11),0,2,0x11,1,3,0x11,1,0xFF,0xC4,0x01,0xA2,0 };
1520
+ s->func(s->context, (void*)head0, sizeof(head0));
1521
+ s->func(s->context, (void*)YTable, sizeof(YTable));
1522
+ stbiw__putc(s, 1);
1523
+ s->func(s->context, UVTable, sizeof(UVTable));
1524
+ s->func(s->context, (void*)head1, sizeof(head1));
1525
+ s->func(s->context, (void*)(std_dc_luminance_nrcodes+1), sizeof(std_dc_luminance_nrcodes)-1);
1526
+ s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values));
1527
+ stbiw__putc(s, 0x10); // HTYACinfo
1528
+ s->func(s->context, (void*)(std_ac_luminance_nrcodes+1), sizeof(std_ac_luminance_nrcodes)-1);
1529
+ s->func(s->context, (void*)std_ac_luminance_values, sizeof(std_ac_luminance_values));
1530
+ stbiw__putc(s, 1); // HTUDCinfo
1531
+ s->func(s->context, (void*)(std_dc_chrominance_nrcodes+1), sizeof(std_dc_chrominance_nrcodes)-1);
1532
+ s->func(s->context, (void*)std_dc_chrominance_values, sizeof(std_dc_chrominance_values));
1533
+ stbiw__putc(s, 0x11); // HTUACinfo
1534
+ s->func(s->context, (void*)(std_ac_chrominance_nrcodes+1), sizeof(std_ac_chrominance_nrcodes)-1);
1535
+ s->func(s->context, (void*)std_ac_chrominance_values, sizeof(std_ac_chrominance_values));
1536
+ s->func(s->context, (void*)head2, sizeof(head2));
1537
+ }
1538
+
1539
+ // Encode 8x8 macroblocks
1540
+ {
1541
+ static const unsigned short fillBits[] = {0x7F, 7};
1542
+ int DCY=0, DCU=0, DCV=0;
1543
+ int bitBuf=0, bitCnt=0;
1544
+ // comp == 2 is grey+alpha (alpha is ignored)
1545
+ int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0;
1546
+ const unsigned char *dataR = (const unsigned char *)data;
1547
+ const unsigned char *dataG = dataR + ofsG;
1548
+ const unsigned char *dataB = dataR + ofsB;
1549
+ int x, y, pos;
1550
+ if(subsample) {
1551
+ for(y = 0; y < height; y += 16) {
1552
+ for(x = 0; x < width; x += 16) {
1553
+ float Y[256], U[256], V[256];
1554
+ for(row = y, pos = 0; row < y+16; ++row) {
1555
+ // row >= height => use last input row
1556
+ int clamped_row = (row < height) ? row : height - 1;
1557
+ int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp;
1558
+ for(col = x; col < x+16; ++col, ++pos) {
1559
+ // if col >= width => use pixel from last input column
1560
+ int p = base_p + ((col < width) ? col : (width-1))*comp;
1561
+ float r = dataR[p], g = dataG[p], b = dataB[p];
1562
+ Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128;
1563
+ U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b;
1564
+ V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b;
1565
+ }
1566
+ }
1567
+ DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+0, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
1568
+ DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+8, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
1569
+ DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+128, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
1570
+ DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+136, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
1571
+
1572
+ // subsample U,V
1573
+ {
1574
+ float subU[64], subV[64];
1575
+ int yy, xx;
1576
+ for(yy = 0, pos = 0; yy < 8; ++yy) {
1577
+ for(xx = 0; xx < 8; ++xx, ++pos) {
1578
+ int j = yy*32+xx*2;
1579
+ subU[pos] = (U[j+0] + U[j+1] + U[j+16] + U[j+17]) * 0.25f;
1580
+ subV[pos] = (V[j+0] + V[j+1] + V[j+16] + V[j+17]) * 0.25f;
1581
+ }
1582
+ }
1583
+ DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subU, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
1584
+ DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subV, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
1585
+ }
1586
+ }
1587
+ }
1588
+ } else {
1589
+ for(y = 0; y < height; y += 8) {
1590
+ for(x = 0; x < width; x += 8) {
1591
+ float Y[64], U[64], V[64];
1592
+ for(row = y, pos = 0; row < y+8; ++row) {
1593
+ // row >= height => use last input row
1594
+ int clamped_row = (row < height) ? row : height - 1;
1595
+ int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp;
1596
+ for(col = x; col < x+8; ++col, ++pos) {
1597
+ // if col >= width => use pixel from last input column
1598
+ int p = base_p + ((col < width) ? col : (width-1))*comp;
1599
+ float r = dataR[p], g = dataG[p], b = dataB[p];
1600
+ Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128;
1601
+                   U[pos] = -0.16874f*r - 0.33126f*g + 0.50000f*b;
+                   V[pos] = +0.50000f*r - 0.41869f*g - 0.08131f*b;
+                }
+             }
+
+             DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y, 8, fdtbl_Y, DCY, YDC_HT, YAC_HT);
+             DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, U, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
+             DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, V, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
+          }
+       }
+    }
+
+    // Do the bit alignment of the EOI marker
+    stbiw__jpg_writeBits(s, &bitBuf, &bitCnt, fillBits);
+ }
+
+ // EOI
+ stbiw__putc(s, 0xFF);
+ stbiw__putc(s, 0xD9);
+
+ return 1;
+ }
+
+ STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality)
+ {
+    stbi__write_context s = { 0 };
+    stbi__start_write_callbacks(&s, func, context);
+    return stbi_write_jpg_core(&s, x, y, comp, (void *) data, quality);
+ }
+
+
+ #ifndef STBI_WRITE_NO_STDIO
+ STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality)
+ {
+    stbi__write_context s = { 0 };
+    if (stbi__start_write_file(&s,filename)) {
+       int r = stbi_write_jpg_core(&s, x, y, comp, data, quality);
+       stbi__end_write_file(&s);
+       return r;
+    } else
+       return 0;
+ }
+ #endif
+
+ #endif // STB_IMAGE_WRITE_IMPLEMENTATION
+
+ /* Revision history
+       1.16  (2021-07-11)
+              make Deflate code emit uncompressed blocks when it would otherwise expand
+              support writing BMPs with alpha channel
+       1.15  (2020-07-13) unknown
+       1.14  (2020-02-02) updated JPEG writer to downsample chroma channels
+       1.13
+       1.12
+       1.11  (2019-08-11)
+
+       1.10  (2019-02-07)
+              support utf8 filenames in Windows; fix warnings and platform ifdefs
+       1.09  (2018-02-11)
+              fix typo in zlib quality API, improve STB_I_W_STATIC in C++
+       1.08  (2018-01-29)
+              add stbi__flip_vertically_on_write, external zlib, zlib quality, choose PNG filter
+       1.07  (2017-07-24)
+              doc fix
+       1.06  (2017-07-23)
+              writing JPEG (using Jon Olick's code)
+       1.05  ???
+       1.04  (2017-03-03)
+              monochrome BMP expansion
+       1.03  ???
+       1.02  (2016-04-02)
+              avoid allocating large structures on the stack
+       1.01  (2016-01-16)
+              STBIW_REALLOC_SIZED: support allocators with no realloc support
+              avoid race-condition in crc initialization
+              minor compile issues
+       1.00  (2015-09-14)
+              installable file IO function
+       0.99  (2015-09-13)
+              warning fixes; TGA rle support
+       0.98  (2015-04-08)
+              added STBIW_MALLOC, STBIW_ASSERT etc
+       0.97  (2015-01-18)
+              fixed HDR asserts, rewrote HDR rle logic
+       0.96  (2015-01-17)
+              add HDR output
+              fix monochrome BMP
+       0.95  (2014-08-17)
+              add monochrome TGA output
+       0.94  (2014-05-31)
+              rename private functions to avoid conflicts with stb_image.h
+       0.93  (2014-05-27)
+              warning fixes
+       0.92  (2010-08-01)
+              casts to unsigned char to fix warnings
+       0.91  (2010-07-17)
+              first public release
+       0.90  first internal release
+ */
+
+ /*
+ ------------------------------------------------------------------------------
+ This software is available under 2 licenses -- choose whichever you prefer.
+ ------------------------------------------------------------------------------
+ ALTERNATIVE A - MIT License
+ Copyright (c) 2017 Sean Barrett
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
+ this software and associated documentation files (the "Software"), to deal in
+ the Software without restriction, including without limitation the rights to
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ of the Software, and to permit persons to whom the Software is furnished to do
+ so, subject to the following conditions:
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ ------------------------------------------------------------------------------
+ ALTERNATIVE B - Public Domain (www.unlicense.org)
+ This is free and unencumbered software released into the public domain.
+ Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+ software, either in source code form or as a compiled binary, for any purpose,
+ commercial or non-commercial, and by any means.
+ In jurisdictions that recognize copyright laws, the author or authors of this
+ software dedicate any and all copyright interest in the software to the public
+ domain. We make this dedication for the benefit of the public at large and to
+ the detriment of our heirs and successors. We intend this dedication to be an
+ overt act of relinquishment in perpetuity of all present and future rights to
+ this software under copyright law.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ ------------------------------------------------------------------------------
+ */
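Since the vendored header ends at the public JPEG entry points, here is a minimal, hedged usage sketch of `stbi_write_jpg` as declared above; the output file name, pixel values, and quality setting are illustrative placeholders, not part of the vendored file:

```c
/* Minimal sketch: write a 2x2 RGB image with the JPEG writer above.
   "out.jpg" is a placeholder path; quality 90 is an arbitrary choice (1-100). */
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"

int main(void) {
    unsigned char pixels[2 * 2 * 3] = {
        255, 0, 0,     0, 255, 0,    /* top row: red, green   */
        0, 0, 255,     255, 255, 255 /* bottom row: blue, white */
    };
    /* stbi_write_jpg returns non-zero on success, 0 on failure */
    return stbi_write_jpg("out.jpg", 2, 2, 3, pixels, 90) ? 0 : 1;
}
```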
stable-diffusion.cpp/ggml/.editorconfig ADDED
@@ -0,0 +1,19 @@
+ # https://EditorConfig.org
+
+ # Top-most EditorConfig file
+ root = true
+
+ # Unix-style newlines with a newline ending every file, utf-8 charset
+ [*]
+ end_of_line = lf
+ insert_final_newline = true
+ trim_trailing_whitespace = true
+ charset = utf-8
+ indent_style = space
+ indent_size = 4
+
+ [Makefile]
+ indent_style = tab
+
+ [prompts/*.txt]
+ insert_final_newline = unset
stable-diffusion.cpp/ggml/.github/workflows/ci.yml ADDED
@@ -0,0 +1,137 @@
+ name: CI
+
+ on:
+   push:
+     branches: [ master ]
+   pull_request:
+     branches: [ master ]
+
+ jobs:
+   test-ubuntu-opencl:
+     runs-on: ubuntu-latest
+     env:
+       GGML_NLOOP: 3
+       GGML_NITER: 1
+       GGML_N_THREADS: 2
+
+     steps:
+     - uses: actions/checkout@v3
+
+     - name: Dependencies
+       run: |
+         wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
+         echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
+         sudo apt-get update
+         sudo apt-get install -y --no-install-recommends llvm intel-oneapi-runtime-opencl intel-oneapi-runtime-compilers libclblast-dev
+     - name: Create Build Environment
+       run: mkdir build
+
+     - name: Configure CMake
+       working-directory: ./build
+       run: cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DGGML_TEST_COVERAGE=ON -DGGML_CLBLAST=ON ..
+
+     - name: Build
+       working-directory: ./build
+       run: make
+
+     - name: Test
+       working-directory: ./build
+       run: ctest --verbose --timeout 900
+
+     - name: Test Coverage
+       working-directory: ./build
+       run: |
+         llvm-profdata merge -sparse tests/*.profraw -o ggml.profdata
+         llvm-cov report ./bin/test-grad0 -instr-profile=ggml.profdata
+         llvm-cov report ./bin/test-opt -instr-profile=ggml.profdata
+   test-macos-metal:
+     runs-on: macos-13
+     env:
+       GGML_NLOOP: 3
+       GGML_NITER: 1
+       GGML_N_THREADS: 2
+
+     steps:
+     - uses: actions/checkout@v3
+
+     - name: Create Build Environment
+       run: mkdir build
+
+     - name: Configure CMake
+       working-directory: ./build
+       run: cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DGGML_TEST_COVERAGE=ON -DGGML_METAL=ON ..
+
+     - name: Build
+       working-directory: ./build
+       run: make
+
+     - name: Test
+       working-directory: ./build
+       run: ctest --verbose --timeout 900
+
+     - name: Test Coverage
+       working-directory: ./build
+       run: |
+         xcrun llvm-profdata merge -sparse tests/*.profraw -o ggml.profdata
+         xcrun llvm-cov report ./bin/test-grad0 -instr-profile=ggml.profdata
+         xcrun llvm-cov report ./bin/test-opt -instr-profile=ggml.profdata
+
+   build:
+
+     strategy:
+       matrix:
+         os: [ubuntu-latest, macos-latest]
+
+     runs-on: ${{ matrix.os }}
+
+     env:
+       GGML_NLOOP: 3
+       GGML_NITER: 1
+
+     steps:
+     - uses: actions/checkout@v3
+
+     - name: Dependencies for Ubuntu
+       if: matrix.os == 'ubuntu-latest'
+       run: |
+         sudo apt-get update
+         sudo apt-get install llvm
+
+     - name: Set GGML_N_THREADS for Ubuntu
+       run: echo "GGML_N_THREADS=2" >> $GITHUB_ENV
+       if: matrix.os == 'ubuntu-latest'
+
+     - name: Set GGML_N_THREADS for MacOS
+       run: echo "GGML_N_THREADS=2" >> $GITHUB_ENV
+       if: matrix.os == 'macos-latest'
+
+     - name: Create Build Environment
+       run: mkdir build
+
+     - name: Configure CMake
+       working-directory: ./build
+       run: cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DGGML_TEST_COVERAGE=ON ..
+
+     - name: Build
+       working-directory: ./build
+       run: make
+
+     - name: Test
+       working-directory: ./build
+       run: ctest --verbose --timeout 900
+
+     - name: Test Coverage for Ubuntu
+       if: matrix.os == 'ubuntu-latest'
+       working-directory: ./build
+       run: |
+         llvm-profdata merge -sparse tests/*.profraw -o ggml.profdata
+         llvm-cov report ./bin/test-grad0 -instr-profile=ggml.profdata
+         llvm-cov report ./bin/test-opt -instr-profile=ggml.profdata
+
+     - name: Test Coverage for MacOS
+       if: matrix.os == 'macos-latest'
+       working-directory: ./build
+       run: |
+         xcrun llvm-profdata merge -sparse tests/*.profraw -o ggml.profdata
+         xcrun llvm-cov report ./bin/test-grad0 -instr-profile=ggml.profdata
+         xcrun llvm-cov report ./bin/test-opt -instr-profile=ggml.profdata
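For reference, the coverage steps in this workflow can be reproduced locally with the same commands, assuming a Clang build configured with `-DGGML_TEST_COVERAGE=ON` exactly as the workflow does:

```bash
# assumes: cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DGGML_TEST_COVERAGE=ON ..
cd build
make && ctest --verbose --timeout 900          # the tests emit tests/*.profraw
llvm-profdata merge -sparse tests/*.profraw -o ggml.profdata
llvm-cov report ./bin/test-grad0 -instr-profile=ggml.profdata
llvm-cov report ./bin/test-opt   -instr-profile=ggml.profdata
```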
stable-diffusion.cpp/ggml/.gitignore ADDED
@@ -0,0 +1,37 @@
+ build/
+ build-debug/
+ build-release/
+ build-sanitize-addr/
+ build-sanitize-thread/
+ build-cov/
+ build-ci-debug/
+ build-ci-release/
+ build-cublas/
+ out/
+ tmp/
+ models/
+ models-mnt
+
+ compile_commands.json
+ CMakeSettings.json
+ .vs/
+ .vscode/
+ .clangd
+
+ .exrc
+ .cache
+ .DS_Store
+ .stablelm
+ .gpt-2
+
+ src/arm_neon.h
+ tests/arm_neon.h
+
+ zig-out/
+ zig-cache/
+
+ *.dot
+
+ *.sw?
+
+ __pycache__/
stable-diffusion.cpp/ggml/CMakeLists.txt ADDED
@@ -0,0 +1,197 @@
+ cmake_minimum_required (VERSION 3.3)
+ project(ggml VERSION 0.1.0)
+
+ set(CMAKE_EXPORT_COMPILE_COMMANDS "on")
+ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+ set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
+
+ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+     set(GGML_STANDALONE ON)
+     include(cmake/GitVars.cmake)
+     include(cmake/BuildTypes.cmake)
+ else()
+     set(GGML_STANDALONE OFF)
+ endif()
+
+ if (EMSCRIPTEN)
+     set(BUILD_SHARED_LIBS_DEFAULT OFF)
+ else()
+     if (MINGW)
+         set(BUILD_SHARED_LIBS_DEFAULT OFF)
+     else()
+         set(BUILD_SHARED_LIBS_DEFAULT ON)
+     endif()
+ endif()
+
+ # options
+
+ option(BUILD_SHARED_LIBS "ggml: build shared libs" ${BUILD_SHARED_LIBS_DEFAULT})
+
+ option(GGML_ALL_WARNINGS           "ggml: enable all compiler warnings" ON)
+ option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF)
+
+ option(GGML_SANITIZE_THREAD    "ggml: enable thread sanitizer" OFF)
+ option(GGML_SANITIZE_ADDRESS   "ggml: enable address sanitizer" OFF)
+ option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
+
+ option(GGML_BUILD_TESTS    "ggml: build tests" ${GGML_STANDALONE})
+ option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
+
+ option(GGML_TEST_COVERAGE "ggml: enable test coverage" OFF)
+
+ option(GGML_PERF          "ggml: enable perf timings" OFF)
+ option(GGML_NO_ACCELERATE "ggml: disable Accelerate framework" OFF)
+ option(GGML_OPENBLAS      "ggml: use OpenBLAS" OFF)
+ option(GGML_CLBLAST       "ggml: use clBLAST" OFF)
+ option(GGML_CUBLAS        "ggml: use cuBLAS" OFF)
+ option(GGML_METAL         "ggml: use Metal" OFF)
+
+ # sanitizers
+
+ if (GGML_SANITIZE_THREAD)
+     set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fsanitize=thread")
+     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread")
+ endif()
+
+ if (GGML_SANITIZE_ADDRESS)
+     set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fsanitize=address -fno-omit-frame-pointer")
+     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
+ endif()
+
+ if (GGML_SANITIZE_UNDEFINED)
+     set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fsanitize=undefined")
+     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined")
+ endif()
+
+ # instruction set specific
+ option(GGML_AVX         "ggml: enable AVX" ON)
+ option(GGML_AVX2        "ggml: enable AVX2" ON)
+ option(GGML_AVX512      "ggml: enable AVX512" OFF)
+ option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
+ option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
+ option(GGML_FMA         "ggml: enable FMA" ON)
+ # in MSVC F16C is implied with AVX2/AVX512
+ if (NOT MSVC)
+     option(GGML_F16C "ggml: enable F16C" ON)
+ endif()
+
+ #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math")
+ #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
+ #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=native")
+
+ # warning flags
+
+ if (GGML_ALL_WARNINGS)
+     if (NOT MSVC)
+         set(c_flags   -Wall -Wpedantic -Wformat=2 -Wno-unused -Wstrict-prototypes)
+         set(cxx_flags -Wall -Wpedantic -Wformat=2)
+     else()
+         # todo : windows
+     endif()
+
+     add_compile_options(
+         "$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
+         "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
+     )
+ endif()
+
+ if (NOT MSVC)
+     add_compile_options(
+         "$<$<COMPILE_LANGUAGE:C>:-Werror=vla>"
+         "$<$<COMPILE_LANGUAGE:CXX>:-Werror=vla>"
+         "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler;-Werror=vla>"
+     )
+ endif()
+
+ #
+ # POSIX conformance
+ #
+
+ # clock_gettime came in POSIX.1b (1993)
+ # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
+ # posix_memalign came in POSIX.1-2001 / SUSv3
+ # M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
+ add_compile_definitions(_XOPEN_SOURCE=600)
+
+ # Somehow in OpenBSD whenever POSIX conformance is specified
+ # some string functions rely on locale_t availability,
+ # which was introduced in POSIX.1-2008, forcing us to go higher
+ if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+     remove_definitions(-D_XOPEN_SOURCE=600)
+     add_compile_definitions(_XOPEN_SOURCE=700)
+ endif()
+
+ # Data types, macros and functions related to controlling CPU affinity
+ # are available on Linux through GNU extensions in libc
+ if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+     add_compile_definitions(_GNU_SOURCE)
+ endif()
+
+ # RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
+ # and on macOS its availability depends on enabling Darwin extensions
+ # similarly on DragonFly, enabling BSD extensions is necessary
+ if (CMAKE_SYSTEM_NAME MATCHES "Darwin")
+     add_compile_definitions(_DARWIN_C_SOURCE)
+ endif()
+ if (CMAKE_SYSTEM_NAME MATCHES "DragonFly")
+     add_compile_definitions(_DARWIN_C_SOURCE)
+ endif()
+
+ # alloca is a non-standard interface that is not visible on BSDs when
+ # POSIX conformance is specified, but not all of them provide a clean way
+ # to enable it in such cases
+ if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
+     add_compile_definitions(__BSD_VISIBLE)
+ endif()
+ if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
+     add_compile_definitions(_NETBSD_SOURCE)
+ endif()
+ if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+     add_compile_definitions(_BSD_SOURCE)
+ endif()
+
+ if (WHISPER_PERF)
+     set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
+ endif()
+
+ # dependencies
+
+ set(CMAKE_C_STANDARD 11)
+ set(CMAKE_CXX_STANDARD 11)
+
+ find_package(Threads REQUIRED)
+
+ # main
+
+ if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+     set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+     set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "RelWithDebInfo")
+ endif ()
+
+ if (GGML_BUILD_TESTS)
+     if (GGML_TEST_COVERAGE)
+         if (CMAKE_C_COMPILER_ID MATCHES "Clang")
+             set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fprofile-instr-generate -fcoverage-mapping")
+             set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fprofile-instr-generate -fcoverage-mapping")
+         else()
+             message(WARNING "Test coverage is only supported for Clang")
+         endif()
+     endif()
+ endif()
+
+ add_subdirectory(src)
+
+ if (GGML_BUILD_TESTS)
+     enable_testing()
+     add_subdirectory(tests)
+ endif ()
+
+ if (GGML_BUILD_EXAMPLES)
+     add_subdirectory(examples)
+ endif ()
+
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in
+                ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
+                @ONLY)
+ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
+         DESTINATION share/pkgconfig)
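As a quick illustration of the options this file defines, a standalone configure can toggle any of the `GGML_*` switches on the command line; the particular combination below is only an example, not a recommended configuration:

```bash
mkdir build && cd build
cmake -DGGML_SANITIZE_ADDRESS=ON -DGGML_BUILD_TESTS=ON -DGGML_AVX2=OFF ..
make -j
```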
stable-diffusion.cpp/ggml/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2022 Georgi Gerganov
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
stable-diffusion.cpp/ggml/README.md ADDED
@@ -0,0 +1,140 @@
+ # ggml
+
+ [Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205)
+
+ Tensor library for machine learning
+
+ ***Note that this project is under active development. \
+ Some of the development is currently happening in the [llama.cpp](https://github.com/ggerganov/llama.cpp) and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) repos***
+
+ ## Features
+
+ - Written in C
+ - 16-bit float support
+ - Integer quantization support (4-bit, 5-bit, 8-bit, etc.)
+ - Automatic differentiation
+ - ADAM and L-BFGS optimizers
+ - Optimized for Apple Silicon
+ - On x86 architectures utilizes AVX / AVX2 intrinsics
+ - On ppc64 architectures utilizes VSX intrinsics
+ - No third-party dependencies
+ - Zero memory allocations during runtime
+
+ ## Updates
+
+ - [X] Example of GPT-2 inference [examples/gpt-2](https://github.com/ggerganov/ggml/tree/master/examples/gpt-2)
+ - [X] Example of GPT-J inference [examples/gpt-j](https://github.com/ggerganov/ggml/tree/master/examples/gpt-j)
+ - [X] Example of Whisper inference [examples/whisper](https://github.com/ggerganov/ggml/tree/master/examples/whisper)
+ - [X] Support 4-bit integer quantization https://github.com/ggerganov/ggml/pull/27
+ - [X] Example of Cerebras-GPT inference [examples/gpt-2](https://github.com/ggerganov/ggml/tree/master/examples/gpt-2)
+ - [ ] Example of FLAN-T5 inference https://github.com/ggerganov/ggml/pull/12
+ - [X] Example of LLaMA inference [ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp)
+ - [X] Example of LLaMA training [ggerganov/llama.cpp/examples/baby-llama](https://github.com/ggerganov/llama.cpp/tree/master/examples/baby-llama)
+ - [X] Example of Falcon inference [cmp-nct/ggllm.cpp](https://github.com/cmp-nct/ggllm.cpp)
+ - [X] Example of BLOOM inference [NouamaneTazi/bloomz.cpp](https://github.com/NouamaneTazi/bloomz.cpp)
+ - [X] Example of RWKV inference [saharNooby/rwkv.cpp](https://github.com/saharNooby/rwkv.cpp)
+ - [X] Example of SAM inference [examples/sam](https://github.com/ggerganov/ggml/tree/master/examples/sam)
+ - [X] Idea for GPU support: https://github.com/ggerganov/llama.cpp/discussions/915
+ - [X] Example of StableLM (GPT-NeoX) inference [examples/gpt-neox](https://github.com/ggerganov/ggml/tree/master/examples/gpt-neox)
+ - [X] Example of BERT inference [skeskinen/bert.cpp](https://github.com/skeskinen/bert.cpp)
+ - [X] Example of 💫 StarCoder inference [examples/starcoder](https://github.com/ggerganov/ggml/tree/master/examples/starcoder)
+ - [X] Example of MPT inference [examples/mpt](https://github.com/ggerganov/ggml/tree/master/examples/mpt)
+ - [X] Example of Replit inference [examples/replit](https://github.com/ggerganov/ggml/tree/master/examples/replit)
+ - [X] Example of BioGPT inference [PABannier/biogpt.cpp](https://github.com/PABannier/biogpt.cpp)
+ - [X] Example of Encodec inference [PABannier/encodec.cpp](https://github.com/PABannier/encodec.cpp)
+ - [X] Example of CLIP inference [monatis/clip.cpp](https://github.com/monatis/clip.cpp)
+ - [X] Example of MiniGPT4 inference [Maknee/minigpt4.cpp](https://github.com/Maknee/minigpt4.cpp)
+ - [X] Example of ChatGLM inference [li-plus/chatglm.cpp](https://github.com/li-plus/chatglm.cpp)
+ - [X] Example of Stable Diffusion inference [leejet/stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp)
+ - [X] Example of Qwen inference [QwenLM/qwen.cpp](https://github.com/QwenLM/qwen.cpp)
+
+ ## Whisper inference (example)
+
+ With ggml you can efficiently run [Whisper](examples/whisper) inference on the CPU.
+
+ Memory requirements:
+
+ | Model  | Disk   | Mem     |
+ | ---    | ---    | ---     |
+ | tiny   | 75 MB  | ~280 MB |
+ | base   | 142 MB | ~430 MB |
+ | small  | 466 MB | ~1.0 GB |
+ | medium | 1.5 GB | ~2.6 GB |
+ | large  | 2.9 GB | ~4.7 GB |
+
+ ## GPT inference (example)
+
+ With ggml you can efficiently run [GPT-2](examples/gpt-2) and [GPT-J](examples/gpt-j) inference on the CPU.
+
+ Here is how to run the example programs:
+
+ ```bash
+ # Build ggml + examples
+ git clone https://github.com/ggerganov/ggml
+ cd ggml
+ mkdir build && cd build
+ cmake ..
+ make -j4 gpt-2 gpt-j
+
+ # Run the GPT-2 small 117M model
+ ../examples/gpt-2/download-ggml-model.sh 117M
+ ./bin/gpt-2 -m models/gpt-2-117M/ggml-model.bin -p "This is an example"
+
+ # Run the GPT-J 6B model (requires 12GB disk space and 16GB CPU RAM)
+ ../examples/gpt-j/download-ggml-model.sh 6B
+ ./bin/gpt-j -m models/gpt-j-6B/ggml-model.bin -p "This is an example"
+
+ # Install Python dependencies
+ python3 -m pip install -r ../requirements.txt
+
+ # Run the Cerebras-GPT 111M model
+ # Download from: https://huggingface.co/cerebras
+ python3 ../examples/gpt-2/convert-cerebras-to-ggml.py /path/to/Cerebras-GPT-111M/
+ ./bin/gpt-2 -m /path/to/Cerebras-GPT-111M/ggml-model-f16.bin -p "This is an example"
+ ```
+
+ The inference speeds that I get for the different models on my 32GB MacBook M1 Pro are as follows:
+
+ | Model | Size  | Time / Token |
+ | ---   | ---   | ---    |
+ | GPT-2 | 117M  | 5 ms   |
+ | GPT-2 | 345M  | 12 ms  |
+ | GPT-2 | 774M  | 23 ms  |
+ | GPT-2 | 1558M | 42 ms  |
+ | ---   | ---   | ---    |
+ | GPT-J | 6B    | 125 ms |
+
+ For more information, check out the corresponding programs in the [examples](examples) folder.
+
+ ## Using Metal (only with GPT-2)
+
+ For GPT-2 models, offloading to GPU is possible. Note that it will not improve inference performance but will reduce power consumption and free up the CPU for other tasks.
+
+ To enable GPU offloading on macOS:
+
+ ```bash
+ cmake -DGGML_METAL=ON -DBUILD_SHARED_LIBS=Off ..
+
+ # offloading is enabled via -ngl (number of GPU layers)
+ ./bin/gpt-2 -t 4 -ngl 100 -m models/gpt-2-117M/ggml-model.bin -p "This is an example"
+ ```
+
+ ## Using cuBLAS
+
+ ```bash
+ # fix the path to point to your CUDA compiler
+ cmake -DGGML_CUBLAS=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.1/bin/nvcc ..
+ ```
+
+ ## Using clBLAST
+
+ ```bash
+ cmake -DGGML_CLBLAST=ON ..
+ ```
+
+ ## Resources
+
+ - [GGML - Large Language Models for Everyone](https://github.com/rustformers/llm/blob/main/crates/ggml/README.md): a description of the GGML format provided by the maintainers of the `llm` Rust crate, which provides Rust bindings for GGML
+ - [marella/ctransformers](https://github.com/marella/ctransformers): Python bindings for GGML models.
+ - [go-skynet/go-ggml-transformers.cpp](https://github.com/go-skynet/go-ggml-transformers.cpp): Golang bindings for GGML models
+ - [smspillaz/ggml-gobject](https://github.com/smspillaz/ggml-gobject): GObject-introspectable wrapper for use of GGML on the GNOME platform.
stable-diffusion.cpp/ggml/build.zig ADDED
@@ -0,0 +1,158 @@
+ const std = @import("std");
+ const builtin = @import("builtin");
+
+ // Zig Version: 0.11.0
+ // Zig Build Command: zig build
+ // Zig Run Command: zig build -h
+ //     zig build run_dolly-v2
+ //     zig build run_gpt-2
+ //     zig build run_gpt-j
+ //     zig build run_gpt-neox
+ //     zig build run_mnist
+ //     zig build run_mpt
+ //     zig build run_replit
+ //     zig build run_starcoder
+ //     zig build run_test-grad0
+ //     zig build run_test-mul-mat0
+ //     zig build run_test-mul-mat2
+ //     zig build run_test-opt
+ //     zig build run_test-vec1
+ //     zig build run_test0
+ //     zig build run_test1
+ //     zig build run_test2
+ //     zig build run_test3
+ //     zig build run_zig_test0
+ //     zig build run_zig_test1
+ //     zig build run_zig_test2
+ //     zig build run_zig_test3
+ pub fn build(b: *std.build.Builder) void {
+     const target = b.standardTargetOptions(.{});
+     const optimize = b.standardOptimizeOption(.{});
+     const lib = b.addStaticLibrary(.{
+         .name = "ggml",
+         .target = target,
+         .optimize = optimize,
+     });
+     lib.addIncludePath(.{ .path = "./include" });
+     lib.addIncludePath(.{ .path = "./include/ggml" });
+     lib.addCSourceFiles(&.{
+         "src/ggml.c",
+     }, &.{"-std=c11"});
+     lib.linkLibC();
+     lib.linkLibCpp();
+     b.installArtifact(lib);
+
+     // examples
+     const examples = .{
+         "dolly-v2",
+         "gpt-2",
+         "gpt-j",
+         "gpt-neox",
+         "mnist",
+         "mpt",
+         "replit",
+         "starcoder",
+         // "whisper",
+     };
+     inline for (examples) |name| {
+         const exe = b.addExecutable(.{
+             .name = name,
+             .target = target,
+             .optimize = optimize,
+         });
+         exe.addIncludePath(.{ .path = "./include" });
+         exe.addIncludePath(.{ .path = "./include/ggml" });
+         exe.addIncludePath(.{ .path = "./examples" });
+         // exe.addIncludePath("./examples/whisper");
+         exe.addCSourceFiles(&.{
+             std.fmt.comptimePrint("examples/{s}/main.cpp", .{name}),
+             "examples/common.cpp",
+             "examples/common-ggml.cpp",
+             // "examples/whisper/whisper.cpp",
+         }, &.{"-std=c++11"});
+         exe.linkLibrary(lib);
+         b.installArtifact(exe);
+         const run_cmd = b.addRunArtifact(exe);
+         run_cmd.step.dependOn(b.getInstallStep());
+         if (b.args) |args| run_cmd.addArgs(args);
+         const run_step = b.step("run_" ++ name, "Run examples");
+         run_step.dependOn(&run_cmd.step);
+     }
+
+     // tests
+     const tests = if (builtin.target.cpu.arch == .x86_64) .{
+         // "test-blas0",
+         // "test-grad0",
+         "test-mul-mat0",
+         // "test-mul-mat1",
+         "test-mul-mat2",
+         // "test-opt",
+         // "test-svd0",
+         // "test-vec0",
+         "test-vec1",
+         // "test-vec2",
+         "test0",
+         "test1",
+         "test2",
+         "test3",
+     } else .{
+         // "test-blas0",
+         // "test-grad0",
+         "test-mul-mat0",
+         // "test-mul-mat1",
+         "test-mul-mat2",
+         // "test-opt",
+         // "test-svd0",
+         // "test-vec0",
+         // "test-vec1",
+         // "test-vec2",
+         "test0",
+         "test1",
+         "test2",
+         "test3",
+     };
+     inline for (tests) |name| {
+         const exe = b.addExecutable(.{
+             .name = name,
+             .target = target,
+             .optimize = optimize,
+         });
+         exe.addIncludePath(.{ .path = "./include" });
+         exe.addIncludePath(.{ .path = "./include/ggml" });
+         exe.addCSourceFiles(&.{
+             std.fmt.comptimePrint("tests/{s}.c", .{name}),
+         }, &.{"-std=c11"});
+         exe.linkLibrary(lib);
+         b.installArtifact(exe);
+         const run_cmd = b.addRunArtifact(exe);
+         run_cmd.step.dependOn(b.getInstallStep());
+         if (b.args) |args| run_cmd.addArgs(args);
+         const run_step = b.step("run_" ++ name, "Run tests");
+         run_step.dependOn(&run_cmd.step);
+     }
+
+     // zig_tests
+     const zig_tests = .{
+         "test0",
+         "test1",
+         "test2",
+         "test3",
+     };
+     inline for (zig_tests) |name| {
+         const exe = b.addExecutable(.{
+             .name = name,
+             .root_source_file = .{ .path = std.fmt.comptimePrint("tests/{s}.zig", .{name}) },
+             .target = target,
+             .optimize = optimize,
+         });
+         exe.addIncludePath(.{ .path = "./include" });
+         exe.addIncludePath(.{ .path = "./include/ggml" });
+         exe.linkLibrary(lib);
+         b.installArtifact(exe);
+         const run_cmd = b.addRunArtifact(exe);
+         run_cmd.step.dependOn(b.getInstallStep());
+         if (b.args) |args| run_cmd.addArgs(args);
+         const run_step = b.step("run_zig_" ++ name, "Run zig_tests");
+         run_step.dependOn(&run_cmd.step);
+     }
+ }
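One detail worth noting: because every run step forwards `b.args` to its executable, program arguments can be passed through after `--` in the usual Zig fashion; the model path below is a placeholder:

```bash
zig build run_gpt-2 -- -m models/gpt-2-117M/ggml-model.bin -p "This is an example"
```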
stable-diffusion.cpp/ggml/ci/run.sh ADDED
@@ -0,0 +1,334 @@
+ #!/bin/bash
+ #
+ # sample usage:
+ #
+ # mkdir tmp
+ #
+ # # CPU-only build
+ # bash ./ci/run.sh ./tmp/results ./tmp/mnt
+ #
+ # # with CUDA support
+ # GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+ #
+
+ if [ -z "$2" ]; then
+     echo "usage: $0 <output-dir> <mnt-dir>"
+     exit 1
+ fi
+
+ mkdir -p "$1"
+ mkdir -p "$2"
+
+ OUT=$(realpath "$1")
+ MNT=$(realpath "$2")
+
+ rm -v $OUT/*.log
+ rm -v $OUT/*.exit
+ rm -v $OUT/*.md
+
+ sd=`dirname $0`
+ cd $sd/../
+ SRC=`pwd`
+
+ ## helpers
+
+ # download a file if it does not exist or if it is outdated
+ function gg_wget {
+     local out=$1
+     local url=$2
+
+     local cwd=`pwd`
+
+     mkdir -p $out
+     cd $out
+
+     # should not re-download if file is the same
+     wget -nv -N $url
+
+     cd $cwd
+ }
+
+ function gg_printf {
+     printf -- "$@" >> $OUT/README.md
+ }
+
+ function gg_run {
+     ci=$1
+
+     set -o pipefail
+     set -x
+
+     gg_run_$ci | tee $OUT/$ci.log
+     cur=$?
+     echo "$cur" > $OUT/$ci.exit
+
+     set +x
+     set +o pipefail
+
+     gg_sum_$ci
+
+     ret=$((ret | cur))
+ }
+
+ ## ci
+
+ # ctest_debug
+
+ function gg_run_ctest_debug {
+     cd ${SRC}
+
+     rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
+
+     set -e
+
+     (time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+     (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+     (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+
+     set +e
+ }
+
+ function gg_sum_ctest_debug {
+     gg_printf '### %s\n\n' "${ci}"
+
+     gg_printf 'Runs ctest in debug mode\n'
+     gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+     gg_printf '```\n'
+     gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
+     gg_printf '```\n'
+     gg_printf '\n'
+ }
+
+ # ctest_release
+
+ function gg_run_ctest_release {
+     cd ${SRC}
+
+     rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+     set -e
+
+     (time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+     (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+     if [ -z $GG_BUILD_LOW_PERF ]; then
+         (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+     else
+         (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+     fi
+
+     set +e
+ }
+
+ function gg_sum_ctest_release {
+     gg_printf '### %s\n\n' "${ci}"
+
+     gg_printf 'Runs ctest in release mode\n'
+     gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+     gg_printf '```\n'
+     gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
+     gg_printf '```\n'
+ }
+
+ # gpt_2
+
+ function gg_run_gpt_2 {
+     cd ${SRC}
+
+     gg_wget models-mnt/gpt-2 https://huggingface.co/ggerganov/ggml/resolve/main/ggml-model-gpt-2-117M.bin
+
+     cd build-ci-release
+
+     set -e
+
+     model="../models-mnt/gpt-2/ggml-model-gpt-2-117M.bin"
+     prompts="../examples/prompts/gpt-2.txt"
+
+     (time ./bin/gpt-2 --model ${model} -s 1234 -n 64 -tt ${prompts} ) 2>&1 | tee -a $OUT/${ci}-tg.log
+     (time ./bin/gpt-2 --model ${model} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
+
+     (time ./bin/gpt-2-batched --model ${model} -s 1234 -n 64 -np 8 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
+
+     set +e
+ }
+
+ function gg_sum_gpt_2 {
+     gg_printf '### %s\n\n' "${ci}"
+
+     gg_printf 'Runs short GPT-2 text generation\n'
+     gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+     gg_printf '```\n'
+     gg_printf '%s\n' "$(cat $OUT/${ci}-tg.log)"
+     gg_printf '```\n'
+ }
+
+ # mnist
+
+ function gg_run_mnist {
+     cd ${SRC}
+
+     cd build-ci-release
+
+     set -e
+
+     mkdir -p models/mnist
+     python3 ../examples/mnist/convert-h5-to-ggml.py ../examples/mnist/models/mnist/mnist_model.state_dict
+
+     model_f32="./models/mnist/ggml-model-f32.bin"
+     samples="../examples/mnist/models/mnist/t10k-images.idx3-ubyte"
+
+     # first command runs and exports "mnist.ggml", the second command runs the exported model
+
+     (time ./bin/mnist     ${model_f32} ${samples} ) 2>&1 | tee -a $OUT/${ci}-mnist.log
+     (time ./bin/mnist-cpu ./mnist.ggml ${samples} ) 2>&1 | tee -a $OUT/${ci}-mnist.log
+
+     set +e
+ }
+
+ function gg_sum_mnist {
+     gg_printf '### %s\n\n' "${ci}"
+
+     gg_printf 'MNIST\n'
+     gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+     gg_printf '```\n'
+     gg_printf '%s\n' "$(cat $OUT/${ci}-mnist.log)"
+     gg_printf '```\n'
+ }
+
+ # whisper
+
+ function gg_run_whisper {
+     cd ${SRC}
+
+     gg_wget models-mnt/whisper/ https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin
+     gg_wget models-mnt/whisper/ https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav
+
+     cd build-ci-release
+
+     set -e
+
+     path_models="../models-mnt/whisper/"
+     model_f16="${path_models}/ggml-base.en.bin"
+     audio_0="${path_models}/jfk.wav"
+
+     (time ./bin/whisper -m ${model_f16} -f ${audio_0} ) 2>&1 | tee -a $OUT/${ci}-main.log
+
+     grep -q "And so my fellow Americans" $OUT/${ci}-main.log
+
+     set +e
+ }
+
+ function gg_sum_whisper {
+     gg_printf '### %s\n\n' "${ci}"
+
+     gg_printf 'Runs short Whisper transcription\n'
+     gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+     gg_printf '```\n'
+     gg_printf '%s\n' "$(cat $OUT/${ci}-main.log)"
+     gg_printf '```\n'
+ }
+
+ # sam
+
+ function gg_run_sam {
+     cd ${SRC}
+
+     gg_wget models-mnt/sam/ https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth
+     gg_wget models-mnt/sam/ https://raw.githubusercontent.com/YavorGIvanov/sam.cpp/ceafb7467bff7ec98e0c4f952e58a9eb8fd0238b/img.jpg
+
+     cd build-ci-release
+
+     set -e
+
+     path_models="../models-mnt/sam/"
+     model_f16="${path_models}/ggml-model-f16.bin"
+     img_0="${path_models}/img.jpg"
+
+     python3 ../examples/sam/convert-pth-to-ggml.py ${path_models}/sam_vit_b_01ec64.pth ${path_models}/ 1
+
+     (time ./bin/sam -m ${model_f16} -i ${img_0} ) 2>&1 | tee -a $OUT/${ci}-main.log
+
+     grep -q "bbox (371, 436), (144, 168)" $OUT/${ci}-main.log
+
+     set +e
+ }
+
+ function gg_sum_sam {
+     gg_printf '### %s\n\n' "${ci}"
+
+     gg_printf 'Run SAM\n'
+     gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+     gg_printf '```\n'
+     gg_printf '%s\n' "$(cat $OUT/${ci}-main.log)"
+     gg_printf '```\n'
+ }
+
+ # mpt
+
+ function gg_run_mpt {
+     cd ${SRC}
+
+     gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/config.json
+     gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/tokenizer.json
+     gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/tokenizer_config.json
+     gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/pytorch_model.bin.index.json
+     gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/configuration_mpt.py
+     gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/resolve/main/pytorch_model-00001-of-00002.bin
+     gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/resolve/main/pytorch_model-00002-of-00002.bin
+
+     cd build-ci-release
+
+     set -e
+
+     path_models="../models-mnt/mpt/7B"
+     model_f16="${path_models}/ggml-model-f16.bin"
+     model_q4_0="${path_models}/ggml-model-q4_0.bin"
+
+     python3 ../examples/mpt/convert-h5-to-ggml.py ${path_models} 1
+     ./bin/mpt-quantize ${model_f16} ${model_q4_0} q4_0
+
+     (time ./bin/mpt --model ${model_f16}  -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
+     (time ./bin/mpt --model ${model_q4_0} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
+
+     set +e
+ }
+
+ function gg_sum_mpt {
+     gg_printf '### %s\n\n' "${ci}"
+
+     gg_printf 'Runs short MPT text generation\n'
+     gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+     gg_printf '```\n'
+     gg_printf '%s\n' "$(cat $OUT/${ci}-tg.log)"
+     gg_printf '```\n'
+ }
+
+ ## main
+
+ if [ -z $GG_BUILD_LOW_PERF ]; then
+     rm -rf ${SRC}/models-mnt
+
+     mnt_models=${MNT}/models
+     mkdir -p ${mnt_models}
+     ln -sfn ${mnt_models} ${SRC}/models-mnt
+ fi
+
+ python3 -m pip install -r ${SRC}/requirements.txt
+
+ ret=0
+
+ test $ret -eq 0 && gg_run ctest_debug
+ test $ret -eq 0 && gg_run ctest_release
+ test $ret -eq 0 && gg_run gpt_2
+ test $ret -eq 0 && gg_run mnist
+ test $ret -eq 0 && gg_run whisper
+ test $ret -eq 0 && gg_run sam
+
+ if [ -z $GG_BUILD_LOW_PERF ]; then
+     if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 16 ]; then
+         test $ret -eq 0 && gg_run mpt
+     fi
+ fi
+
+ exit $ret
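The harness above is extensible by convention: each CI step is a `gg_run_<name>` body paired with a `gg_sum_<name>` summary, chained through `gg_run`. A hedged sketch of what adding a new step would look like; every name below is illustrative, not an existing binary or step:

```bash
function gg_run_mytest {
    cd ${SRC}/build-ci-release
    set -e
    (time ./bin/mytest) 2>&1 | tee -a $OUT/${ci}-main.log   # hypothetical test binary
    set +e
}

function gg_sum_mytest {
    gg_printf '### %s\n\n' "${ci}"
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
}

test $ret -eq 0 && gg_run mytest   # chained before the final "exit $ret"
```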
stable-diffusion.cpp/ggml/cmake/BuildTypes.cmake ADDED
@@ -0,0 +1,54 @@
+ # Add new build types
+
+ # ReleaseGG - Release with enabled asserts
+
+ SET(CMAKE_CXX_FLAGS_RELEASEGG
+     "-O3"
+     CACHE STRING "Flags used by the c++ compiler during release builds with enabled asserts."
+     FORCE )
+ SET(CMAKE_C_FLAGS_RELEASEGG
+     "-O3"
+     CACHE STRING "Flags used by the compiler during release builds with enabled asserts."
+     FORCE )
+ SET(CMAKE_EXE_LINKER_FLAGS_RELEASEGG
+     ""
+     CACHE STRING "Flags used for linking binaries during release builds with enabled asserts."
+     FORCE )
+ SET(CMAKE_SHARED_LINKER_FLAGS_RELEASEGG
+     ""
+     CACHE STRING "Flags used by the shared libraries linker during release builds with enabled asserts."
+     FORCE )
+ MARK_AS_ADVANCED(
+     CMAKE_CXX_FLAGS_RELEASEGG
+     CMAKE_C_FLAGS_RELEASEGG
+     CMAKE_EXE_LINKER_FLAGS_RELEASEGG
+     CMAKE_SHARED_LINKER_FLAGS_RELEASEGG )
+
+ # RelWithDebInfoGG - RelWithDebInfo with enabled asserts
+
+ SET(CMAKE_CXX_FLAGS_RELWITHDEBINFOGG
+     "-O2 -g"
+     CACHE STRING "Flags used by the c++ compiler during release builds with debug symbols and enabled asserts."
+     FORCE )
+ SET(CMAKE_C_FLAGS_RELWITHDEBINFOGG
+     "-O2 -g"
+     CACHE STRING "Flags used by the compiler during release builds with debug symbols and enabled asserts."
+     FORCE )
+ SET(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFOGG
+     ""
+     CACHE STRING "Flags used for linking binaries during release builds with debug symbols and enabled asserts."
+     FORCE )
+ SET(CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFOGG
+     ""
+     CACHE STRING "Flags used by the shared libraries linker during release builds with debug symbols and enabled asserts."
+     FORCE )
+ MARK_AS_ADVANCED(
+     CMAKE_CXX_FLAGS_RELWITHDEBINFOGG
+     CMAKE_C_FLAGS_RELWITHDEBINFOGG
+     CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFOGG
+     CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFOGG )
+
+ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
+     set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+     set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo" "ReleaseGG" "RelWithDebInfoGG")
+ endif()
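Selecting one of these custom configurations works like any stock CMake build type, for example:

```bash
cmake -DCMAKE_BUILD_TYPE=RelWithDebInfoGG ..
make
```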
stable-diffusion.cpp/ggml/cmake/GitVars.cmake ADDED
@@ -0,0 +1,22 @@
+ find_package(Git)
+
+ # the commit's SHA1
+ execute_process(COMMAND
+     "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
+     WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+     OUTPUT_VARIABLE GIT_SHA1
+     ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+ # the date of the commit
+ execute_process(COMMAND
+     "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
+     WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+     OUTPUT_VARIABLE GIT_DATE
+     ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+ # the subject of the commit
+ execute_process(COMMAND
+     "${GIT_EXECUTABLE}" log -1 --format=%s
+     WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+     OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
+     ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
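This module only populates `GIT_SHA1`, `GIT_DATE`, and `GIT_COMMIT_SUBJECT`; consuming them is left to the including project. A minimal sketch of one way a consumer might bake them into a build (the `BUILD_*` macro names are made up for illustration):

```cmake
# after include(cmake/GitVars.cmake) has populated the variables
add_compile_definitions(
    BUILD_GIT_SHA1="${GIT_SHA1}"
    BUILD_GIT_DATE="${GIT_DATE}")
```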
stable-diffusion.cpp/ggml/examples/CMakeLists.txt ADDED
@@ -0,0 +1,30 @@
+ if (GGML_ALL_WARNINGS)
+     if (NOT MSVC)
+         set(cxx_flags
+             # TODO(marella): Add other warnings.
+             -Wpedantic
+             -Wunused-variable
+             -Wno-unused-function
+             -Wno-multichar
+         )
+         add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>")
+     endif()
+ endif()
+
+ add_library(common STATIC common.cpp)
+ target_include_directories(common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+
+ add_library(common-ggml STATIC common-ggml.cpp)
+ target_link_libraries(common-ggml PRIVATE ggml)
+ target_include_directories(common-ggml PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+
+ add_subdirectory(gpt-2)
+ add_subdirectory(gpt-j)
+ add_subdirectory(whisper)
+ add_subdirectory(mnist)
+ add_subdirectory(gpt-neox)
+ add_subdirectory(dolly-v2)
+ add_subdirectory(replit)
+ add_subdirectory(mpt)
+ add_subdirectory(starcoder)
+ add_subdirectory(sam)
stable-diffusion.cpp/ggml/examples/common-ggml.cpp ADDED
@@ -0,0 +1,246 @@
+ #include "common-ggml.h"
+
+ #include <regex>
+ #include <map>
+
+ static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
+     {"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
+     {"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
+     {"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
+     {"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
+     {"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
+ };
+
+ void ggml_print_ftypes(FILE * fp) {
+     for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) {
+         fprintf(fp, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
+     }
+ }
+
+ enum ggml_ftype ggml_parse_ftype(const char * str) {
+     enum ggml_ftype ftype;
+     if (str[0] == 'q') {
+         const auto it = GGML_FTYPE_MAP.find(str);
+         if (it == GGML_FTYPE_MAP.end()) {
+             fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
+             return GGML_FTYPE_UNKNOWN;
+         }
+         ftype = it->second;
+     } else {
+         ftype = (enum ggml_ftype) atoi(str);
+     }
+
+     return ftype;
+ }
+
+ bool ggml_common_quantize_0(
+         std::ifstream & finp,
+         std::ofstream & fout,
+         const ggml_ftype ftype,
+         const std::vector<std::string> & to_quant,
+         const std::vector<std::string> & to_skip) {
+
+     ggml_type qtype = GGML_TYPE_F32;
+
+     switch (ftype) {
+         case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
+         case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
+         case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
+         case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
+         case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
+         case GGML_FTYPE_UNKNOWN:
+         case GGML_FTYPE_ALL_F32:
+         case GGML_FTYPE_MOSTLY_F16:
+         case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
+         case GGML_FTYPE_MOSTLY_Q2_K:
+         case GGML_FTYPE_MOSTLY_Q3_K:
+         case GGML_FTYPE_MOSTLY_Q4_K:
+         case GGML_FTYPE_MOSTLY_Q5_K:
+         case GGML_FTYPE_MOSTLY_Q6_K:
+             {
+                 fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
+                 return false;
+             }
+     };
+
+     if (!ggml_is_quantized(qtype)) {
+         fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));
+         return false;
+     }
+
+     size_t total_size_org = 0;
+     size_t total_size_new = 0;
+
+     std::vector<float> work;
+
+     std::vector<uint8_t>     data_u8;
+     std::vector<ggml_fp16_t> data_f16;
+     std::vector<float>       data_f32;
+
+     std::vector<int64_t> hist_all(1 << 4, 0);
+
+     while (true) {
+         int32_t n_dims;
+         int32_t length;
+         int32_t ttype;
+
+         finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+         finp.read(reinterpret_cast<char *>(&length), sizeof(length));
+         finp.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
+
+         if (finp.eof()) {
+             break;
+         }
+
+         int32_t nelements = 1;
+         int32_t ne[4] = { 1, 1, 1, 1 };
+         for (int i = 0; i < n_dims; ++i) {
+             finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+             nelements *= ne[i];
+         }
+
+         std::string name(length, 0);
+         finp.read (&name[0], length);
+
+         printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype));
+
+         bool quantize = false;
+
+         // check if we should quantize this tensor
+         for (const auto & s : to_quant) {
+             if (std::regex_match(name, std::regex(s))) {
+                 quantize = true;
+                 break;
+             }
+         }
+
+         // check if we should skip this tensor
+         for (const auto & s : to_skip) {
+             if (std::regex_match(name, std::regex(s))) {
+                 quantize = false;
+                 break;
+             }
+         }
+
+         // quantize only 2D tensors
+         quantize &= (n_dims == 2);
+
+         if (quantize) {
+             if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
+                 fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
+                 return false;
+             }
+
+             if (ttype == GGML_TYPE_F16) {
+                 data_f16.resize(nelements);
+                 finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
+                 data_f32.resize(nelements);
+                 for (int i = 0; i < nelements; ++i) {
+                     data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
+                 }
+             } else {
+                 data_f32.resize(nelements);
+                 finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
+             }
+
+             ttype = qtype;
+         } else {
+             const int bpe = (ttype == 0) ? sizeof(float) : sizeof(uint16_t);
+
+             data_u8.resize(nelements*bpe);
+             finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
+         }
+
+         fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+         fout.write(reinterpret_cast<char *>(&length), sizeof(length));
+         fout.write(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
+         for (int i = 0; i < n_dims; ++i) {
+             fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+         }
+         fout.write(&name[0], length);
+
+         if (quantize) {
+             work.resize(nelements); // for quantization
+
+             size_t cur_size = 0;
+             std::vector<int64_t> hist_cur(1 << 4, 0);
+
+             switch ((ggml_type) ttype) {
+                 case GGML_TYPE_Q4_0:
+                     {
+                         cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                     } break;
+                 case GGML_TYPE_Q4_1:
+                     {
+                         cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                     } break;
+                 case GGML_TYPE_Q5_0:
+                     {
+                         cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                     } break;
+                 case GGML_TYPE_Q5_1:
+                     {
+                         cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                     } break;
+                 case GGML_TYPE_Q8_0:
+                     {
+                         cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                     } break;
+                 case GGML_TYPE_F32:
+                 case GGML_TYPE_F16:
+                 case GGML_TYPE_I8:
+                 case GGML_TYPE_I16:
+                 case GGML_TYPE_I32:
+                 case GGML_TYPE_Q8_1:
+                 case GGML_TYPE_Q2_K:
+                 case GGML_TYPE_Q3_K:
+                 case GGML_TYPE_Q4_K:
+                 case GGML_TYPE_Q5_K:
+                 case GGML_TYPE_Q6_K:
+                 case GGML_TYPE_Q8_K:
+                 case GGML_TYPE_COUNT:
+                     {
+                         fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
+                         return false;
+                     }
+             }
+
+             fout.write(reinterpret_cast<char *>(work.data()), cur_size);
+             total_size_new += cur_size;
+
+             printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
+             for (int i = 0; i < (int) hist_cur.size(); ++i) {
+                 hist_all[i] += hist_cur[i];
+             }
+
+             for (int i = 0; i < (int) hist_cur.size(); ++i) {
+                 printf("%5.3f ", hist_cur[i] / (float)nelements);
+             }
+             printf("\n");
+         } else {
+             printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
+             fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
+             total_size_new += data_u8.size();
+         }
+
+         total_size_org += nelements * sizeof(float);
+     }
+
+     printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
+     printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
+
+     {
+         int64_t sum_all = 0;
+         for (int i = 0; i < (int) hist_all.size(); ++i) {
+             sum_all += hist_all[i];
+         }
+
+         printf("%s: hist: ", __func__);
+         for (int i = 0; i < (int) hist_all.size(); ++i) {
+             printf("%5.3f ", hist_all[i] / (float)sum_all);
+         }
+         printf("\n");
+     }
+
+     return true;
+ }
stable-diffusion.cpp/ggml/examples/common-ggml.h ADDED
@@ -0,0 +1,18 @@
+ #pragma once
+
+ #include "ggml.h"
+
+ #include <fstream>
+ #include <vector>
+ #include <string>
+
+ enum ggml_ftype ggml_parse_ftype(const char * str);
+
+ void ggml_print_ftypes(FILE * fp = stderr);
+
+ bool ggml_common_quantize_0(
+         std::ifstream & finp,
+         std::ofstream & fout,
+         const ggml_ftype ftype,
+         const std::vector<std::string> & to_quant,
+         const std::vector<std::string> & to_skip);
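For orientation, a minimal sketch of how a quantize tool might drive `ggml_common_quantize_0` as declared above; the file names and regex patterns are placeholders, and note that the real quantize programs in the examples also copy a model header before the tensor stream, which this function does not handle:

```cpp
// Hypothetical driver: quantize matching 2D tensors of an f16 model to q4_0.
#include "common-ggml.h"
#include <fstream>

int main() {
    std::ifstream fin("ggml-model-f16.bin",  std::ios::binary); // placeholder input
    std::ofstream fout("ggml-model-q4_0.bin", std::ios::binary); // placeholder output

    const ggml_ftype ftype = ggml_parse_ftype("q4_0"); // same strings as GGML_FTYPE_MAP

    // to_quant / to_skip are regex lists matched against tensor names
    const bool ok = ggml_common_quantize_0(fin, fout, ftype,
                                           {".*weight"}, // quantize these
                                           {});          // skip none
    return ok ? 0 : 1;
}
```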
stable-diffusion.cpp/ggml/examples/common.cpp ADDED
@@ -0,0 +1,817 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #define _USE_MATH_DEFINES // for M_PI
2
+
3
+ #include "common.h"
4
+
5
+ // third-party utilities
6
+ // use your favorite implementations
7
+ #define DR_WAV_IMPLEMENTATION
8
+ #include "dr_wav.h"
9
+
10
+ #include <cmath>
11
+ #include <cstring>
12
+ #include <fstream>
13
+ #include <regex>
14
+ #include <locale>
15
+ #include <codecvt>
16
+ #include <sstream>
17
+
18
+ #if defined(_MSC_VER)
19
+ #pragma warning(disable: 4244 4267) // possible loss of data
20
+ #endif
21
+
22
+ // Function to check if the next argument exists
23
+ std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) {
24
+ if (i + 1 < argc && argv[i + 1][0] != '-') {
25
+ return argv[++i];
26
+ } else {
27
+ fprintf(stderr, "error: %s requires one argument.\n", flag.c_str());
28
+ gpt_print_usage(argc, argv, params);
29
+ exit(0);
30
+ }
31
+ }
32
+
33
+ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
34
+ for (int i = 1; i < argc; i++) {
35
+ std::string arg = argv[i];
36
+
37
+ if (arg == "-s" || arg == "--seed") {
38
+ params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params));
39
+ } else if (arg == "-t" || arg == "--threads") {
40
+ params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params));
41
+ } else if (arg == "-p" || arg == "--prompt") {
42
+ params.prompt = get_next_arg(i, argc, argv, arg, params);
43
+ } else if (arg == "-n" || arg == "--n_predict") {
44
+ params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params));
45
+ } else if (arg == "-np" || arg == "--n_parallel") {
46
+ params.n_parallel = std::stoi(get_next_arg(i, argc, argv, arg, params));
47
+ } else if (arg == "--top_k") {
48
+ params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params));
49
+ } else if (arg == "--top_p") {
50
+ params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params));
51
+ } else if (arg == "--temp") {
52
+ params.temp = std::stof(get_next_arg(i, argc, argv, arg, params));
53
+ } else if (arg == "--repeat-last-n") {
54
+ params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params));
55
+ } else if (arg == "--repeat-penalty") {
56
+ params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params));
57
+ } else if (arg == "-b" || arg == "--batch_size") {
58
+ params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params));
59
+ } else if (arg == "-c" || arg == "--context") {
60
+ params.n_ctx= std::stoi(get_next_arg(i, argc, argv, arg, params));
61
+ } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
62
+ params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params));
63
+ } else if (arg == "--ignore-eos") {
64
+ params.ignore_eos = true;
65
+ } else if (arg == "-m" || arg == "--model") {
66
+ params.model = get_next_arg(i, argc, argv, arg, params);
67
+ } else if (arg == "-i" || arg == "--interactive") {
68
+ params.interactive = true;
69
+ } else if (arg == "-ip" || arg == "--interactive-port") {
70
+ params.interactive = true;
71
+ params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params));
72
+ } else if (arg == "-h" || arg == "--help") {
73
+ gpt_print_usage(argc, argv, params);
74
+ exit(0);
75
+ } else if (arg == "-f" || arg == "--file") {
76
+ get_next_arg(i, argc, argv, arg, params);
77
+ std::ifstream file(argv[i]);
78
+ if (!file) {
79
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
80
+ break;
81
+ }
82
+ std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
83
+ if (!params.prompt.empty() && params.prompt.back() == '\n') {
84
+ params.prompt.pop_back();
85
+ }
86
+ } else if (arg == "-tt" || arg == "--token_test") {
87
+ params.token_test = get_next_arg(i, argc, argv, arg, params);
88
+ }
89
+ else {
90
+ fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
91
+ gpt_print_usage(argc, argv, params);
92
+ exit(1);
93
+ }
94
+ }
95
+
96
+ return true;
97
+ }
98
+
99
+ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
100
+ fprintf(stderr, "usage: %s [options]\n", argv[0]);
101
+ fprintf(stderr, "\n");
102
+ fprintf(stderr, "options:\n");
103
+ fprintf(stderr, " -h, --help show this help message and exit\n");
104
+ fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
105
+ fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
106
+ fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
107
+ fprintf(stderr, " prompt to start generation with (default: random)\n");
108
+ fprintf(stderr, " -f FNAME, --file FNAME\n");
109
+ fprintf(stderr, " load prompt from a file\n");
110
+ fprintf(stderr, " -tt TOKEN_TEST, --token_test TOKEN_TEST\n");
111
+ fprintf(stderr, " test tokenization\n");
112
+ fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
113
+ fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
114
+ fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
115
+ fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp);
116
+ fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n);
117
+ fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty);
118
+ fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
119
+ fprintf(stderr, " -c N, --context N context / KV cache size (default: %d)\n", params.n_ctx);
120
+ fprintf(stderr, " --ignore-eos ignore EOS token during generation\n");
121
+ fprintf(stderr, " -ngl N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers);
122
+ fprintf(stderr, " -m FNAME, --model FNAME\n");
123
+ fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
124
+ fprintf(stderr, "\n");
125
+ }
126
+
127
+ std::string gpt_random_prompt(std::mt19937 & rng) {
128
+ const int r = rng() % 10;
129
+ switch (r) {
130
+ case 0: return "So";
131
+ case 1: return "Once upon a time";
132
+ case 2: return "When";
133
+ case 3: return "The";
134
+ case 4: return "After";
135
+ case 5: return "If";
136
+ case 6: return "import";
137
+ case 7: return "He";
138
+ case 8: return "She";
139
+ case 9: return "They";
140
+ default: return "To";
141
+ }
142
+
143
+ return "The";
144
+ }
145
+
146
+ std::string trim(const std::string & s) {
147
+ std::regex e("^\\s+|\\s+$");
148
+ return std::regex_replace(s, e, "");
149
+ }
150
+
151
+ std::string replace(const std::string & s, const std::string & from, const std::string & to) {
152
+ std::string result = s;
153
+ size_t pos = 0;
154
+ while ((pos = result.find(from, pos)) != std::string::npos) {
155
+ result.replace(pos, from.length(), to);
156
+ pos += to.length();
157
+ }
158
+ return result;
159
+ }
160
+
161
+ void gpt_vocab::add_special_token(const std::string & token) {
162
+ special_tokens.push_back(token);
163
+ }
164
+
165
+ std::map<std::string, int32_t> json_parse(const std::string & fname) {
166
+ std::map<std::string, int32_t> result;
167
+
168
+ // read file into string
169
+ std::string json;
170
+ {
171
+ std::ifstream ifs(fname);
172
+ if (!ifs) {
173
+ fprintf(stderr, "Failed to open %s\n", fname.c_str());
174
+ exit(1);
175
+ }
176
+
177
+ json = std::string((std::istreambuf_iterator<char>(ifs)),
178
+ (std::istreambuf_iterator<char>()));
179
+ }
180
+
181
+ if (json.empty() || json[0] != '{') {
182
+ return result;
183
+ }
184
+
185
+ // parse json
186
+ {
187
+ bool has_key = false;
188
+ bool in_token = false;
189
+
190
+ std::string str_key = "";
191
+ std::string str_val = "";
192
+
193
+ int n = json.size();
194
+ for (int i = 1; i < n; ++i) {
195
+ if (!in_token) {
196
+ if (json[i] == ' ') continue;
197
+ if (json[i] == '"') {
198
+ in_token = true;
199
+ continue;
200
+ }
201
+ } else {
202
+ if (json[i] == '\\' && i+1 < n) {
203
+ if (has_key == false) {
204
+ str_key += json[i];
205
+ } else {
206
+ str_val += json[i];
207
+ }
208
+ ++i;
209
+ } else if (json[i] == '"') {
210
+ if (has_key == false) {
211
+ has_key = true;
212
+ ++i;
213
+ while (json[i] == ' ') ++i;
214
+ ++i; // :
215
+ while (json[i] == ' ') ++i;
216
+ if (json[i] != '\"') {
217
+ while (json[i] != ',' && json[i] != '}') {
218
+ str_val += json[i++];
219
+ }
220
+ has_key = false;
221
+ } else {
222
+ in_token = true;
223
+ continue;
224
+ }
225
+ } else {
226
+ has_key = false;
227
+ }
228
+
229
+ str_key = ::replace(str_key, "\\u0120", " " ); // \u0120 -> space
230
+ str_key = ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
231
+ str_key = ::replace(str_key, "\\\"", "\""); // \\\" -> "
232
+
233
+ try {
234
+ result[str_key] = std::stoi(str_val);
235
+ } catch (...) {
236
+ //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
237
+
238
+ }
239
+ str_key = "";
240
+ str_val = "";
241
+ in_token = false;
242
+ continue;
243
+ }
244
+ if (has_key == false) {
245
+ str_key += json[i];
246
+ } else {
247
+ str_val += json[i];
248
+ }
249
+ }
250
+ }
251
+ }
252
+
253
+ return result;
254
+ }
255
+
256
+ std::string convert_to_utf8(const std::wstring & input) {
257
+ std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
258
+ return converter.to_bytes(input);
259
+ }
260
+
261
+
262
+ std::wstring convert_to_wstring(const std::string & input) {
263
+ std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
264
+ return converter.from_bytes(input);
265
+ }
266
+
267
+ void gpt_split_words(std::string str, std::vector<std::string>& words) {
268
+ const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
269
+ const std::regex re(pattern);
270
+ std::smatch m;
271
+
272
+ while (std::regex_search(str, m, re)) {
273
+ for (auto x : m) {
274
+ words.push_back(x);
275
+ }
276
+ str = m.suffix();
277
+ }
278
+ }
279
+
280
+ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
281
+ std::vector<std::string> words;
282
+
283
+ // first split the text into words
284
+ {
285
+ std::string str = text;
286
+
287
+ // Generate the subpattern from the special_tokens vector if it's not empty
288
+ if (!vocab.special_tokens.empty()) {
289
+ const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])");
290
+ std::string special_tokens_subpattern;
291
+ for (const auto & token : vocab.special_tokens) {
292
+ if (!special_tokens_subpattern.empty()) {
293
+ special_tokens_subpattern += "|";
294
+ }
295
+ special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)");
296
+ }
297
+
298
+ std::regex re(special_tokens_subpattern);
299
+ std::smatch m;
300
+ // Split the text by special tokens.
301
+ while (std::regex_search(str, m, re)) {
302
+ // Split the substrings in-between special tokens into words.
303
+ gpt_split_words(m.prefix(), words);
304
+ // Add matched special tokens as words.
305
+ for (auto x : m) {
306
+ words.push_back(x);
307
+ }
308
+ str = m.suffix();
309
+ }
310
+ // Remaining text without special tokens will be handled below.
311
+ }
312
+
313
+ gpt_split_words(str, words);
314
+ }
315
+
316
+ // find the longest token that forms each word in words:
317
+ std::vector<gpt_vocab::id> tokens;
318
+ for (const auto & word : words) {
319
+ for (int i = 0; i < (int) word.size(); ){
320
+ for (int j = word.size() - 1; j >= i; j--){
321
+ auto cand = word.substr(i, j-i+1);
322
+ auto it = vocab.token_to_id.find(cand);
323
+ if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab
324
+ tokens.push_back(it->second);
325
+ i = j + 1;
326
+ break;
327
+ }
328
+ else if (j == i){ // word.substr(i, 1) has no matching
329
+ fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
330
+ i++;
331
+ }
332
+ }
333
+ }
334
+ }
335
+
336
+ return tokens;
337
+ }
338
+
339
+ std::vector<gpt_vocab::id> parse_tokens_from_string(const std::string& input, char delimiter) {
340
+ std::vector<gpt_vocab::id> output;
341
+ std::stringstream ss(input);
342
+ std::string token;
343
+
344
+ while (std::getline(ss, token, delimiter)) {
345
+ output.push_back(std::stoi(token));
346
+ }
347
+
348
+ return output;
349
+ }
350
+
351
+ std::map<std::string, std::vector<gpt_vocab::id>> extract_tests_from_file(const std::string & fpath_test){
352
+ if (fpath_test.empty()){
353
+ fprintf(stderr, "%s : No test file found.\n", __func__);
354
+ return std::map<std::string, std::vector<gpt_vocab::id>>();
355
+ }
356
+
357
+ std::map<std::string, std::vector<gpt_vocab::id>> tests;
358
+
359
+ auto fin = std::ifstream(fpath_test, std::ios_base::in);
360
+ const char * delimiter = " => ";
361
+ const char del_tok = ',';
362
+ std::string line;
363
+ while (std::getline(fin, line)) {
364
+ size_t delimiterPos = line.find(delimiter);
365
+ if (delimiterPos != std::string::npos) {
366
+ std::string text = line.substr(0, delimiterPos);
367
+ std::string s_tokens = line.substr(delimiterPos + std::strlen(delimiter));
368
+ tests[text] = parse_tokens_from_string(s_tokens, del_tok);
369
+ }
370
+ }
371
+ return tests;
372
+ }
373
+
374
+ void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test){
375
+ std::map<std::string, std::vector<gpt_vocab::id>> tests = extract_tests_from_file(fpath_test);
376
+
377
+ size_t n_fails = 0;
378
+
379
+ for (const auto & test : tests) {
380
+ std::vector<gpt_vocab::id> tokens = gpt_tokenize(vocab, test.first);
381
+
382
+ if (tokens != test.second){
383
+ n_fails++;
384
+
385
+ // print out failure cases
386
+ fprintf(stderr, "%s : failed test: '%s'\n", __func__, test.first.c_str());
387
+ fprintf(stderr, "%s : tokens in hf: ", __func__);
388
+ for (const auto & t : test.second) {
389
+ fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t);
390
+ }
391
+ fprintf(stderr, "\n");
392
+ fprintf(stderr, "%s : tokens in ggml: ", __func__);
393
+ for (const auto & t : tokens) {
394
+ fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t);
395
+ }
396
+ fprintf(stderr, "\n");
397
+ }
398
+ }
399
+
400
+ fprintf(stderr, "%s : %zu tests failed out of %zu tests.\n", __func__, n_fails, tests.size());
401
+ }
402
+
403
+ bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
404
+ printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
405
+
406
+ vocab.token_to_id = ::json_parse(fname);
407
+
408
+ for (const auto & kv : vocab.token_to_id) {
409
+ vocab.id_to_token[kv.second] = kv.first;
410
+ }
411
+
412
+ printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());
413
+
414
+ // print the vocabulary
415
+ //for (auto kv : vocab.token_to_id) {
416
+ // printf("'%s' -> %d\n", kv.first.data(), kv.second);
417
+ //}
418
+
419
+ return true;
420
+ }
421
+
422
+ gpt_vocab::id gpt_sample_top_k_top_p(
423
+ const gpt_vocab & vocab,
424
+ const float * logits,
425
+ int top_k,
426
+ double top_p,
427
+ double temp,
428
+ std::mt19937 & rng) {
429
+ int n_logits = vocab.id_to_token.size();
430
+
431
+ std::vector<std::pair<double, gpt_vocab::id>> logits_id;
432
+ logits_id.reserve(n_logits);
433
+
434
+ {
435
+ const double scale = 1.0/temp;
436
+ for (int i = 0; i < n_logits; ++i) {
437
+ logits_id.push_back(std::make_pair(logits[i]*scale, i));
438
+ }
439
+ }
440
+
441
+ // find the top K tokens
442
+ std::partial_sort(
443
+ logits_id.begin(),
444
+ logits_id.begin() + top_k, logits_id.end(),
445
+ [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
446
+ return a.first > b.first;
447
+ });
448
+
449
+ logits_id.resize(top_k);
450
+
451
+ double maxl = -INFINITY;
452
+ for (const auto & kv : logits_id) {
453
+ maxl = std::max(maxl, kv.first);
454
+ }
455
+
456
+ // compute probs for the top K tokens
457
+ std::vector<double> probs;
458
+ probs.reserve(logits_id.size());
459
+
460
+ double sum = 0.0;
461
+ for (const auto & kv : logits_id) {
462
+ double p = exp(kv.first - maxl);
463
+ probs.push_back(p);
464
+ sum += p;
465
+ }
466
+
467
+ // normalize the probs
468
+ for (auto & p : probs) {
469
+ p /= sum;
470
+ }
471
+
472
+ if (top_p < 1.0f) {
473
+ double cumsum = 0.0f;
474
+ for (int i = 0; i < top_k; i++) {
475
+ cumsum += probs[i];
476
+ if (cumsum >= top_p) {
477
+ top_k = i + 1;
478
+ probs.resize(top_k);
479
+ logits_id.resize(top_k);
480
+ break;
481
+ }
482
+ }
483
+
484
+ cumsum = 1.0/cumsum;
485
+ for (int i = 0; i < (int) probs.size(); i++) {
486
+ probs[i] *= cumsum;
487
+ }
488
+ }
489
+
490
+ //printf("\n");
491
+ //for (int i = 0; i < (int) probs.size(); i++) {
492
+ // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
493
+ //}
494
+ //exit(0);
495
+
496
+ std::discrete_distribution<> dist(probs.begin(), probs.end());
497
+ int idx = dist(rng);
498
+
499
+ return logits_id[idx].second;
500
+ }
501
+
502
+ gpt_vocab::id gpt_sample_top_k_top_p_repeat(
503
+ const gpt_vocab & vocab,
504
+ const float * logits,
505
+ const int32_t * last_n_tokens_data,
506
+ size_t last_n_tokens_data_size,
507
+ int top_k,
508
+ double top_p,
509
+ double temp,
510
+ int repeat_last_n,
511
+ float repeat_penalty,
512
+ std::mt19937 & rng) {
513
+
514
+ int n_logits = vocab.id_to_token.size();
515
+
516
+ const auto * plogits = logits;
517
+
518
+ const auto last_n_tokens = std::vector<int32_t>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_data_size);
519
+
520
+ if (temp <= 0) {
521
+ // select the token with the highest logit directly
522
+ float max_logit = plogits[0];
523
+ gpt_vocab::id max_id = 0;
524
+
525
+ for (int i = 1; i < n_logits; ++i) {
526
+ if (plogits[i] > max_logit) {
527
+ max_logit = plogits[i];
528
+ max_id = i;
529
+ }
530
+ }
531
+ return max_id;
532
+ }
533
+
534
+
535
+ std::vector<std::pair<double, gpt_vocab::id>> logits_id;
536
+ logits_id.reserve(n_logits);
537
+
538
+ {
539
+ const float scale = 1.0f/temp;
540
+ for (int i = 0; i < n_logits; ++i) {
541
+ // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
542
+ // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
543
+ if (repeat_last_n > 0 && std::find(last_n_tokens.end()-repeat_last_n, last_n_tokens.end(), i) != last_n_tokens.end()) {
544
+ // if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability
545
+ if (plogits[i] < 0.0f) {
546
+ logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
547
+ } else {
548
+ logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
549
+ }
550
+ } else {
551
+ logits_id.push_back(std::make_pair(plogits[i]*scale, i));
552
+ }
553
+ }
554
+ }
555
+
556
+ // find the top K tokens
557
+ std::partial_sort(
558
+ logits_id.begin(),
559
+ logits_id.begin() + top_k, logits_id.end(),
560
+ [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
561
+ return a.first > b.first;
562
+ });
563
+
564
+ logits_id.resize(top_k);
565
+
566
+ double maxl = -INFINITY;
567
+ for (const auto & kv : logits_id) {
568
+ maxl = std::max(maxl, kv.first);
569
+ }
570
+
571
+ // compute probs for the top K tokens
572
+ std::vector<double> probs;
573
+ probs.reserve(logits_id.size());
574
+
575
+ double sum = 0.0;
576
+ for (const auto & kv : logits_id) {
577
+ double p = exp(kv.first - maxl);
578
+ probs.push_back(p);
579
+ sum += p;
580
+ }
581
+
582
+ // normalize the probs
583
+ for (auto & p : probs) {
584
+ p /= sum;
585
+ }
586
+
587
+ if (top_p < 1.0f) {
588
+ double cumsum = 0.0f;
589
+ for (int i = 0; i < top_k; i++) {
590
+ cumsum += probs[i];
591
+ if (cumsum >= top_p) {
592
+ top_k = i + 1;
593
+ probs.resize(top_k);
594
+ logits_id.resize(top_k);
595
+ break;
596
+ }
597
+ }
598
+
599
+ cumsum = 1.0/cumsum;
600
+ for (int i = 0; i < (int) probs.size(); i++) {
601
+ probs[i] *= cumsum;
602
+ }
603
+ }
604
+
605
+ // printf("\n");
606
+ // for (int i = 0; i < (int) probs.size(); i++) {
607
+ // for (int i = 0; i < 10; i++) {
608
+ // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
609
+ // }
610
+
611
+ std::discrete_distribution<> dist(probs.begin(), probs.end());
612
+ int idx = dist(rng);
613
+
614
+ return logits_id[idx].second;
615
+
616
+ }
617
+
618
+ bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
619
+ drwav wav;
620
+ std::vector<uint8_t> wav_data; // used for pipe input from stdin
621
+
622
+ if (fname == "-") {
623
+ {
624
+ uint8_t buf[1024];
625
+ while (true)
626
+ {
627
+ const size_t n = fread(buf, 1, sizeof(buf), stdin);
628
+ if (n == 0) {
629
+ break;
630
+ }
631
+ wav_data.insert(wav_data.end(), buf, buf + n);
632
+ }
633
+ }
634
+
635
+ if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
636
+ fprintf(stderr, "error: failed to open WAV file from stdin\n");
637
+ return false;
638
+ }
639
+
640
+ fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
641
+ }
642
+ else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
643
+ fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
644
+ return false;
645
+ }
646
+
647
+ if (wav.channels != 1 && wav.channels != 2) {
648
+ fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
649
+ return false;
650
+ }
651
+
652
+ if (stereo && wav.channels != 2) {
653
+ fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
654
+ return false;
655
+ }
656
+
657
+ if (wav.sampleRate != COMMON_SAMPLE_RATE) {
658
+ fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
659
+ return false;
660
+ }
661
+
662
+ if (wav.bitsPerSample != 16) {
663
+ fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
664
+ return false;
665
+ }
666
+
667
+ const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
668
+
669
+ std::vector<int16_t> pcm16;
670
+ pcm16.resize(n*wav.channels);
671
+ drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
672
+ drwav_uninit(&wav);
673
+
674
+ // convert to mono, float
675
+ pcmf32.resize(n);
676
+ if (wav.channels == 1) {
677
+ for (uint64_t i = 0; i < n; i++) {
678
+ pcmf32[i] = float(pcm16[i])/32768.0f;
679
+ }
680
+ } else {
681
+ for (uint64_t i = 0; i < n; i++) {
682
+ pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
683
+ }
684
+ }
685
+
686
+ if (stereo) {
687
+ // convert to stereo, float
688
+ pcmf32s.resize(2);
689
+
690
+ pcmf32s[0].resize(n);
691
+ pcmf32s[1].resize(n);
692
+ for (uint64_t i = 0; i < n; i++) {
693
+ pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
694
+ pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
695
+ }
696
+ }
697
+
698
+ return true;
699
+ }
700
+
701
+ void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
702
+ const float rc = 1.0f / (2.0f * M_PI * cutoff);
703
+ const float dt = 1.0f / sample_rate;
704
+ const float alpha = dt / (rc + dt);
705
+
706
+ float y = data[0];
707
+
708
+ for (size_t i = 1; i < data.size(); i++) {
709
+ y = alpha * (y + data[i] - data[i - 1]);
710
+ data[i] = y;
711
+ }
712
+ }
713
+
714
+ bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
715
+ const int n_samples = pcmf32.size();
716
+ const int n_samples_last = (sample_rate * last_ms) / 1000;
717
+
718
+ if (n_samples_last >= n_samples) {
719
+ // not enough samples - assume no speech
720
+ return false;
721
+ }
722
+
723
+ if (freq_thold > 0.0f) {
724
+ high_pass_filter(pcmf32, freq_thold, sample_rate);
725
+ }
726
+
727
+ float energy_all = 0.0f;
728
+ float energy_last = 0.0f;
729
+
730
+ for (int i = 0; i < n_samples; i++) {
731
+ energy_all += fabsf(pcmf32[i]);
732
+
733
+ if (i >= n_samples - n_samples_last) {
734
+ energy_last += fabsf(pcmf32[i]);
735
+ }
736
+ }
737
+
738
+ energy_all /= n_samples;
739
+ energy_last /= n_samples_last;
740
+
741
+ if (verbose) {
742
+ fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
743
+ }
744
+
745
+ if (energy_last > vad_thold*energy_all) {
746
+ return false;
747
+ }
748
+
749
+ return true;
750
+ }
751
+
752
+ float similarity(const std::string & s0, const std::string & s1) {
753
+ const size_t len0 = s0.size() + 1;
754
+ const size_t len1 = s1.size() + 1;
755
+
756
+ std::vector<int> col(len1, 0);
757
+ std::vector<int> prevCol(len1, 0);
758
+
759
+ for (size_t i = 0; i < len1; i++) {
760
+ prevCol[i] = i;
761
+ }
762
+
763
+ for (size_t i = 0; i < len0; i++) {
764
+ col[0] = i;
765
+ for (size_t j = 1; j < len1; j++) {
766
+ col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (i > 0 && s0[i - 1] == s1[j - 1] ? 0 : 1));
767
+ }
768
+ col.swap(prevCol);
769
+ }
770
+
771
+ const float dist = prevCol[len1 - 1];
772
+
773
+ return 1.0f - (dist / std::max(s0.size(), s1.size()));
774
+ }
775
+
776
+ bool sam_params_parse(int argc, char ** argv, sam_params & params) {
777
+ for (int i = 1; i < argc; i++) {
778
+ std::string arg = argv[i];
779
+
780
+ if (arg == "-s" || arg == "--seed") {
781
+ params.seed = std::stoi(argv[++i]);
782
+ } else if (arg == "-t" || arg == "--threads") {
783
+ params.n_threads = std::stoi(argv[++i]);
784
+ } else if (arg == "-m" || arg == "--model") {
785
+ params.model = argv[++i];
786
+ } else if (arg == "-i" || arg == "--inp") {
787
+ params.fname_inp = argv[++i];
788
+ } else if (arg == "-o" || arg == "--out") {
789
+ params.fname_out = argv[++i];
790
+ } else if (arg == "-h" || arg == "--help") {
791
+ sam_print_usage(argc, argv, params);
792
+ exit(0);
793
+ } else {
794
+ fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
795
+ sam_print_usage(argc, argv, params);
796
+ exit(1);
797
+ }
798
+ }
799
+
800
+ return true;
801
+ }
802
+
803
+ void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) {
804
+ fprintf(stderr, "usage: %s [options]\n", argv[0]);
805
+ fprintf(stderr, "\n");
806
+ fprintf(stderr, "options:\n");
807
+ fprintf(stderr, " -h, --help show this help message and exit\n");
808
+ fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
809
+ fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
810
+ fprintf(stderr, " -m FNAME, --model FNAME\n");
811
+ fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
812
+ fprintf(stderr, " -i FNAME, --inp FNAME\n");
813
+ fprintf(stderr, " input file (default: %s)\n", params.fname_inp.c_str());
814
+ fprintf(stderr, " -o FNAME, --out FNAME\n");
815
+ fprintf(stderr, " output file (default: %s)\n", params.fname_out.c_str());
816
+ fprintf(stderr, "\n");
817
+ }
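The helpers above are meant to be combined into a small generation loop by the example programs. Below is a minimal usage sketch; it is not part of the upload, and `eval_model` is a hypothetical stand-in for a model-specific eval function such as `dollyv2_eval`:

```cpp
// Minimal usage sketch for the common helpers (illustrative only).
// eval_model() is a hypothetical placeholder that returns next-token
// logits for the current token sequence.
#include "common.h"

#include <cstdio>
#include <random>
#include <vector>

std::vector<float> eval_model(const std::vector<gpt_vocab::id> & tokens); // hypothetical

int main() {
    gpt_vocab vocab;
    if (!gpt_vocab_init("models/gpt-2-117M/encoder.json", vocab)) { // example path
        return 1;
    }

    std::mt19937 rng(42);
    std::vector<gpt_vocab::id> embd = gpt_tokenize(vocab, "Once upon a time");

    for (int i = 0; i < 16; ++i) {
        std::vector<float> logits = eval_model(embd);
        const gpt_vocab::id id = gpt_sample_top_k_top_p(
            vocab, logits.data(), /*top_k=*/40, /*top_p=*/0.9, /*temp=*/0.9, rng);
        printf("%s", vocab.id_to_token[id].c_str());
        embd.push_back(id);
    }

    return 0;
}
```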
stable-diffusion.cpp/ggml/examples/common.h ADDED
@@ -0,0 +1,179 @@
1
+ // Various helper functions and utilities
2
+
3
+ #pragma once
4
+
5
+ #include <string>
6
+ #include <map>
7
+ #include <vector>
8
+ #include <random>
9
+ #include <thread>
10
+
11
+ #define COMMON_SAMPLE_RATE 16000
12
+
13
+ //
14
+ // GPT CLI argument parsing
15
+ //
16
+
17
+ struct gpt_params {
18
+ int32_t seed = -1; // RNG seed
19
+ int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
20
+ int32_t n_predict = 200; // new tokens to predict
21
+ int32_t n_parallel = 1; // number of parallel streams
22
+ int32_t n_batch = 8; // batch size for prompt processing
23
+ int32_t n_ctx = 2048; // context size (this is the KV cache max size)
24
+ int32_t n_gpu_layers = 0; // number of layers to offload to the GPU
25
+
26
+ bool ignore_eos = false; // ignore EOS token when generating text
27
+
28
+ // sampling parameters
29
+ int32_t top_k = 40;
30
+ float top_p = 0.9f;
31
+ float temp = 0.9f;
32
+ int32_t repeat_last_n = 64;
33
+ float repeat_penalty = 1.00f;
34
+
35
+ std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path
36
+ std::string prompt = "";
37
+ std::string token_test = "";
38
+
39
+ bool interactive = false;
40
+ int32_t interactive_port = -1;
41
+ };
42
+
43
+ bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
44
+
45
+ void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
46
+
47
+ std::string gpt_random_prompt(std::mt19937 & rng);
48
+
49
+ //
50
+ // Vocab utils
51
+ //
52
+
53
+ std::string trim(const std::string & s);
54
+
55
+ std::string replace(
56
+ const std::string & s,
57
+ const std::string & from,
58
+ const std::string & to);
59
+
60
+ struct gpt_vocab {
61
+ using id = int32_t;
62
+ using token = std::string;
63
+
64
+ std::map<token, id> token_to_id;
65
+ std::map<id, token> id_to_token;
66
+ std::vector<std::string> special_tokens;
67
+
68
+ void add_special_token(const std::string & token);
69
+ };
70
+
71
+ // poor-man's JSON parsing
72
+ std::map<std::string, int32_t> json_parse(const std::string & fname);
73
+
74
+ std::string convert_to_utf8(const std::wstring & input);
75
+
76
+ std::wstring convert_to_wstring(const std::string & input);
77
+
78
+ void gpt_split_words(std::string str, std::vector<std::string>& words);
79
+
80
+ // split text into tokens
81
+ //
82
+ // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
83
+ //
84
+ // Regex (Python):
85
+ // r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
86
+ //
87
+ // Regex (C++):
88
+ // R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
89
+ //
90
+ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
91
+
92
+ // test outputs of gpt_tokenize
93
+ //
94
+ // - compare with tokens generated by the huggingface tokenizer
95
+ // - test cases are chosen based on the model's main language (under 'prompt' directory)
96
+ // - for each failing sentence, print the sentence, the huggingface tokens and the ggml tokens
97
+ // - finally, print the number of tests that failed out of the total
98
+ //
99
+ void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test);
100
+
101
+ // load the tokens from encoder.json
102
+ bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
103
+
104
+ // sample next token given probabilities for each embedding
105
+ //
106
+ // - consider only the top K tokens
107
+ // - from them, consider only the top tokens with cumulative probability > P
108
+ //
109
+ // TODO: not sure if this implementation is correct
110
+ // note: the logits are scaled by 1/temp before sampling
111
+ //
112
+ gpt_vocab::id gpt_sample_top_k_top_p(
113
+ const gpt_vocab & vocab,
114
+ const float * logits,
115
+ int top_k,
116
+ double top_p,
117
+ double temp,
118
+ std::mt19937 & rng);
119
+
120
+ gpt_vocab::id gpt_sample_top_k_top_p_repeat(
121
+ const gpt_vocab & vocab,
122
+ const float * logits,
123
+ const int32_t * last_n_tokens_data,
124
+ size_t last_n_tokens_data_size,
125
+ int top_k,
126
+ double top_p,
127
+ double temp,
128
+ int repeat_last_n,
129
+ float repeat_penalty,
130
+ std::mt19937 & rng);
131
+
132
+ //
133
+ // Audio utils
134
+ //
135
+
136
+ // Read WAV audio file and store the PCM data into pcmf32
137
+ // The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
138
+ // If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
139
+ bool read_wav(
140
+ const std::string & fname,
141
+ std::vector<float> & pcmf32,
142
+ std::vector<std::vector<float>> & pcmf32s,
143
+ bool stereo);
144
+
145
+ // Apply a high-pass frequency filter to PCM audio
146
+ // Suppresses frequencies below cutoff Hz
147
+ void high_pass_filter(
148
+ std::vector<float> & data,
149
+ float cutoff,
150
+ float sample_rate);
151
+
152
+ // Basic voice activity detection (VAD) using audio energy adaptive threshold
153
+ bool vad_simple(
154
+ std::vector<float> & pcmf32,
155
+ int sample_rate,
156
+ int last_ms,
157
+ float vad_thold,
158
+ float freq_thold,
159
+ bool verbose);
160
+
161
+ // compute similarity between two strings using Levenshtein distance
162
+ float similarity(const std::string & s0, const std::string & s1);
163
+
164
+ //
165
+ // SAM argument parsing
166
+ //
167
+
168
+ struct sam_params {
169
+ int32_t seed = -1; // RNG seed
170
+ int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
171
+
172
+ std::string model = "models/sam-vit-b/ggml-model-f16.bin"; // model path
173
+ std::string fname_inp = "img.jpg";
174
+ std::string fname_out = "img.out";
175
+ };
176
+
177
+ bool sam_params_parse(int argc, char ** argv, sam_params & params);
178
+
179
+ void sam_print_usage(int argc, char ** argv, const sam_params & params);
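For the audio helpers declared above, a minimal sketch of the intended call sequence ("input.wav" is an example path, not part of the upload):

```cpp
// Load a 16 kHz, 16-bit WAV file and run the energy-based VAD on it.
#include "common.h"

#include <cstdio>
#include <vector>

int main() {
    std::vector<float> pcmf32;               // mono PCM, float in [-1, 1]
    std::vector<std::vector<float>> pcmf32s; // per-channel PCM when stereo == true

    if (!read_wav("input.wav", pcmf32, pcmf32s, /*stereo=*/false)) {
        return 1; // read_wav already printed the reason
    }

    // vad_simple() returns true when the last 1000 ms are quiet
    // relative to the energy of the whole clip
    const bool ended = vad_simple(pcmf32, COMMON_SAMPLE_RATE, /*last_ms=*/1000,
                                  /*vad_thold=*/0.6f, /*freq_thold=*/100.0f,
                                  /*verbose=*/true);

    printf("speech ended: %s\n", ended ? "yes" : "no");

    return 0;
}
```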
stable-diffusion.cpp/ggml/examples/dolly-v2/CMakeLists.txt ADDED
@@ -0,0 +1,13 @@
1
+ #
2
+ # dollyv2
3
+
4
+ set(TEST_TARGET dollyv2)
5
+ add_executable(${TEST_TARGET} main.cpp)
6
+ target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
7
+
8
+ #
9
+ # dollyv2-quantize
10
+
11
+ set(TEST_TARGET dollyv2-quantize)
12
+ add_executable(${TEST_TARGET} quantize.cpp)
13
+ target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
stable-diffusion.cpp/ggml/examples/dolly-v2/README.md ADDED
@@ -0,0 +1,187 @@
1
+ # Dolly-V2
2
+
3
+ Transformer architecture: GPT-NeoX
4
+
5
+ Modeled after examples/stablelm
6
+
7
+ Ref: https://github.com/databrickslabs/dolly
8
+
9
+ Ref: https://github.com/stability-AI/stableLM/#stablelm-alpha
10
+
11
+ ## Usage
12
+
13
+ ```bash
14
+ # get the repo and build it
15
+ git clone https://github.com/ggerganov/ggml
16
+ cd ggml
17
+ mkdir build && cd build
18
+ cmake ..
19
+ make -j
20
+
21
+ # get the Dolly-V2 3B model
22
+ git clone https://huggingface.co/databricks/dolly-v2-3b
23
+
24
+ # install Python dependencies
25
+ python3 -m pip install -r ../requirements.txt
26
+
27
+ # convert model to FP16
28
+ python3 ../examples/dolly-v2/convert-h5-to-ggml.py ./dolly-v2-3b/ 1
29
+
30
+ # run inference using FP16 precision
31
+ ./bin/dollyv2 -m ./dolly-v2-3b/ggml-model-f16.bin -p "State the meaning of life." -t 6 -n 64
32
+
33
+ main: seed = 1683218142
34
+ dollyv2_model_load: loading model from './dolly-v2-3b/ggml-model-f16.bin' - please wait ...
35
+ dollyv2_model_load: n_vocab = 50280
36
+ dollyv2_model_load: n_ctx = 2048
37
+ dollyv2_model_load: n_embd = 2560
38
+ dollyv2_model_load: n_head = 32
39
+ dollyv2_model_load: n_layer = 32
40
+ dollyv2_model_load: n_rot = 20
41
+ dollyv2_model_load: ftype = 1
42
+ dollyv2_model_load: ggml ctx size = 7374.91 MB
43
+ dollyv2_model_load: memory_size = 640.00 MB, n_mem = 65536
44
+ dollyv2_model_load: ................................................ done
45
+ dollyv2_model_load: model size = 5295.10 MB / num tensors = 388
46
+ main: number of tokens in prompt = 32
47
+ main: token[0] = 30003, Below
48
+ main: token[1] = 310, is
49
+ main: token[2] = 271, an
50
+ main: token[3] = 9775, instruction
51
+ main: token[4] = 326, that
52
+ main: token[5] = 8631, describes
53
+ main: token[6] = 247, a
54
+ main: token[7] = 4836, task
55
+ main: token[8] = 964, .
56
+ main: token[9] = 19566, Write
57
+ main: token[10] = 247, a
58
+ main: token[11] = 2380, response
59
+ main: token[12] = 326, that
60
+ main: token[13] = 20420, appropriately
61
+ main: token[14] = 29141, completes
62
+ main: token[15] = 253, the
63
+ main: token[16] = 2748, request
64
+ main: token[17] = 964, .
65
+ main: token[18] = 187,
66
+
67
+ main: token[19] = 187,
68
+
69
+ main: token[20] = 50278, ### Instruction:
70
+ main: token[21] = 187,
71
+
72
+ main: token[22] = 5443, State
73
+ main: token[23] = 253, the
74
+ main: token[24] = 4495, meaning
75
+ main: token[25] = 273, of
76
+ main: token[26] = 1495, life
77
+ main: token[27] = 964, .
78
+ main: token[28] = 187,
79
+
80
+ main: token[29] = 187,
81
+
82
+ main: token[30] = 50279, ### Response:
83
+ main: token[31] = 187,
84
+
85
+
86
+ Below is an instruction that describes a task. Write a response that appropriately completes the request.
87
+
88
+ ### Instruction:
89
+ State the meaning of life.
90
+
91
+ ### Response:
92
+ The meaning of life is to love and be loved.
93
+
94
+ ### End
95
+
96
+ main: mem per token = 16136720 bytes
97
+ main: load time = 2202.58 ms
98
+ main: sample time = 2.57 ms
99
+ main: predict time = 1497.14 ms / 33.27 ms per token
100
+ main: total time = 6187.27 ms
101
+ ```
102
+
103
+ ## 5-bit integer quantization mode
104
+
105
+ ```bash
106
+ # quantize the model to 5-bits using Q5_0 quantization
107
+ ./bin/dollyv2-quantize ./dolly-v2-3b/ggml-model-f16.bin ./dolly-v2-3b/ggml-model-q5_0.bin q5_0
108
+
109
+ # run the quantized model
110
+ ./bin/dollyv2 -m ./dolly-v2-3b/ggml-model-q5_0.bin -p "State the meaning of life." -t 6 -n 64
111
+
112
+ main: seed = 1683218518
113
+ dollyv2_model_load: loading model from './dolly-v2-3b/ggml-model-q5_0.bin' - please wait ...
114
+ dollyv2_model_load: n_vocab = 50280
115
+ dollyv2_model_load: n_ctx = 2048
116
+ dollyv2_model_load: n_embd = 2560
117
+ dollyv2_model_load: n_head = 32
118
+ dollyv2_model_load: n_layer = 32
119
+ dollyv2_model_load: n_rot = 20
120
+ dollyv2_model_load: ftype = 8
121
+ dollyv2_model_load: ggml ctx size = 3902.68 MB
122
+ dollyv2_model_load: memory_size = 640.00 MB, n_mem = 65536
123
+ dollyv2_model_load: ................................................ done
124
+ dollyv2_model_load: model size = 1822.87 MB / num tensors = 388
125
+ main: number of tokens in prompt = 32
126
+ main: token[0] = 30003, Below
127
+ main: token[1] = 310, is
128
+ main: token[2] = 271, an
129
+ main: token[3] = 9775, instruction
130
+ main: token[4] = 326, that
131
+ main: token[5] = 8631, describes
132
+ main: token[6] = 247, a
133
+ main: token[7] = 4836, task
134
+ main: token[8] = 964, .
135
+ main: token[9] = 19566, Write
136
+ main: token[10] = 247, a
137
+ main: token[11] = 2380, response
138
+ main: token[12] = 326, that
139
+ main: token[13] = 20420, appropriately
140
+ main: token[14] = 29141, completes
141
+ main: token[15] = 253, the
142
+ main: token[16] = 2748, request
143
+ main: token[17] = 964, .
144
+ main: token[18] = 187,
145
+
146
+ main: token[19] = 187,
147
+
148
+ main: token[20] = 50278, ### Instruction:
149
+ main: token[21] = 187,
150
+
151
+ main: token[22] = 5443, State
152
+ main: token[23] = 253, the
153
+ main: token[24] = 4495, meaning
154
+ main: token[25] = 273, of
155
+ main: token[26] = 1495, life
156
+ main: token[27] = 964, .
157
+ main: token[28] = 187,
158
+
159
+ main: token[29] = 187,
160
+
161
+ main: token[30] = 50279, ### Response:
162
+ main: token[31] = 187,
163
+
164
+
165
+ Below is an instruction that describes a task. Write a response that appropriately completes the request.
166
+
167
+ ### Instruction:
168
+ State the meaning of life.
169
+
170
+ ### Response:
171
+ The meaning of life is the discovery of the true self.
172
+
173
+ ### End
174
+
175
+ main: mem per token = 16127760 bytes
176
+ main: load time = 1011.09 ms
177
+ main: sample time = 2.79 ms
178
+ main: predict time = 1271.62 ms / 27.64 ms per token
179
+ main: total time = 2802.51 ms
180
+ ```
181
+
182
+ ## Notes
183
+
184
+ - No guarantees for correctness
185
+ - The tokenizer is currently hacked - probably works only for English
186
+ - Non-parallel residual is not supported
187
+ - Contributions and improvements are welcome
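The transcripts above all use the same fixed prompt template. The sketch below shows how it is assembled; it mirrors the `prompt_for_generation` helper defined in `main.cpp`:

```cpp
// The Dolly-V2 instruction prompt, as built by prompt_for_generation() in main.cpp.
#include <string>

std::string dolly_prompt(const std::string & instruction) {
    const std::string intro =
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.";

    return intro + "\n\n### Instruction:\n" + instruction + "\n\n### Response:\n";
}
```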
stable-diffusion.cpp/ggml/examples/dolly-v2/convert-h5-to-ggml.py ADDED
@@ -0,0 +1,116 @@
1
+ import sys
2
+ import struct
3
+ import json
4
+ import numpy as np
5
+
6
+ from transformers import AutoModelForCausalLM, AutoTokenizer
7
+
8
+ if len(sys.argv) < 2:
9
+ print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
10
+ print(" ftype == 0 -> float32")
11
+ print(" ftype == 1 -> float16")
12
+ sys.exit(1)
13
+
14
+ # output in the same directory as the model
15
+ dir_model = sys.argv[1]
16
+ fname_out = sys.argv[1] + "/ggml-model.bin"
17
+
18
+ with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
19
+ encoder = json.load(f)
20
+
21
+ with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
22
+ hparams = json.load(f)
23
+
24
+ # possible data types
25
+ # ftype == 0 -> float32
26
+ # ftype == 1 -> float16
27
+ #
28
+ # map from ftype to string
29
+ ftype_str = ["f32", "f16"]
30
+
31
+ ftype = 1
32
+ if len(sys.argv) > 2:
33
+ ftype = int(sys.argv[2])
34
+ if ftype < 0 or ftype > 1:
35
+ print("Invalid ftype: " + str(ftype))
36
+ sys.exit(1)
37
+ fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
38
+
39
+
40
+ tokenizer = AutoTokenizer.from_pretrained(dir_model)
41
+ model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)
42
+ #print (model)
43
+
44
+ #print(tokenizer.encode('I believe the meaning of life is'))
45
+
46
+ list_vars = model.state_dict()
47
+ for name in list_vars.keys():
48
+ print(name, list_vars[name].shape, list_vars[name].dtype)
49
+
50
+ fout = open(fname_out, "wb")
51
+
52
+ print(hparams)
53
+
54
+ fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
55
+ fout.write(struct.pack("i", hparams["vocab_size"]))
56
+ fout.write(struct.pack("i", hparams["max_position_embeddings"]))
57
+ fout.write(struct.pack("i", hparams["hidden_size"]))
58
+ fout.write(struct.pack("i", hparams["num_attention_heads"]))
59
+ fout.write(struct.pack("i", hparams["num_hidden_layers"]))
60
+ fout.write(struct.pack("i", int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"]))))
61
+ fout.write(struct.pack("i", hparams["use_parallel_residual"]))
62
+ fout.write(struct.pack("i", ftype))
63
+
64
+ # TODO: temporary hack to not deal with implementing the tokenizer
65
+ dot_token = tokenizer.encode('.')[0]
66
+ for i in range(hparams["vocab_size"]):
67
+ text = tokenizer.decode([dot_token, i]).encode('utf-8')
68
+ # remove the first byte (it's always '.')
69
+ text = text[1:]
70
+ fout.write(struct.pack("i", len(text)))
71
+ fout.write(text)
72
+
73
+ for name in list_vars.keys():
74
+ data = list_vars[name].squeeze().numpy()
75
+ print("Processing variable: " + name + " with shape: ", data.shape)
76
+
77
+ # we don't need these
78
+ if name.endswith(".attention.masked_bias") or \
79
+ name.endswith(".attention.bias") or \
80
+ name.endswith(".attention.rotary_emb.inv_freq"):
81
+ print(" Skipping variable: " + name)
82
+ continue
83
+
84
+ n_dims = len(data.shape)
85
+
86
+ # ftype == 0 -> float32, ftype == 1 -> float16
87
+ ftype_cur = 0
88
+ if ftype != 0:
89
+ if name[-7:] == ".weight" and n_dims == 2:
90
+ print(" Converting to float16")
91
+ data = data.astype(np.float16)
92
+ ftype_cur = 1
93
+ else:
94
+ print(" Converting to float32")
95
+ data = data.astype(np.float32)
96
+ ftype_cur = 0
97
+ else:
98
+ if data.dtype != np.float32:
99
+ print(" Converting to float32")
100
+ data = data.astype(np.float32)
101
+ ftype_cur = 0
102
+
103
+ # header
104
+ name_bytes = name.encode('utf-8')
105
+ fout.write(struct.pack("iii", n_dims, len(name_bytes), ftype_cur))
106
+ for i in range(n_dims):
107
+ fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
108
+ fout.write(name_bytes)
109
+
110
+ # data
111
+ data.tofile(fout)
112
+
113
+ fout.close()
114
+
115
+ print("Done. Output file: " + fname_out)
116
+ print("")
stable-diffusion.cpp/ggml/examples/dolly-v2/main.cpp ADDED
@@ -0,0 +1,969 @@
1
+ #include "ggml/ggml.h"
2
+
3
+ #include "common.h"
4
+ #include "common-ggml.h"
5
+
6
+ #include <cassert>
7
+ #include <cmath>
8
+ #include <cstdio>
9
+ #include <cstring>
10
+ #include <cinttypes>
11
+ #include <fstream>
12
+ #include <iostream>
13
+ #include <map>
14
+ #include <string>
15
+ #include <vector>
16
+
17
+ #if !defined(_WIN32)
18
+ #define DOLLY_INTERACTIVE_PORT
19
+ #endif
20
+
21
+ #if defined(DOLLY_INTERACTIVE_PORT)
22
+ #include <arpa/inet.h>
23
+ #include <netinet/in.h>
24
+ #include <sys/socket.h>
25
+ #include <unistd.h>
26
+ #endif
27
+
28
+ #if defined(_MSC_VER)
29
+ #pragma warning(disable: 4244 4267) // possible loss of data
30
+ #endif
31
+
32
+ // default hparams (Dolly-V2 3B)
33
+ struct dollyv2_hparams {
34
+ int32_t n_vocab = 50254; // tokenizer.vocab_size
35
+ int32_t n_ctx = 2048; // model.config.max_position_embeddings
36
+ int32_t n_embd = 2560; // model.config.hidden_size
37
+ int32_t n_head = 32; // model.config.num_attention_heads
38
+ int32_t n_layer = 32; // model.config.num_hidden_layers
39
+ int32_t n_rot = 20; // rotary_pct[25%] * (n_embd / n_head)
40
+ int32_t par_res = 1; // 1 = true, 0 = false
41
+ int32_t ftype = GGML_FTYPE_MOSTLY_F16;
42
+ float eps = 1e-5f;
43
+ };
44
+
45
+ const std::string INSTRUCTION_KEY = "### Instruction:";
46
+ const std::string RESPONSE_KEY = "### Response:";
47
+ const std::string END_KEY = "### End";
48
+ const std::string INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request.";
49
+
50
+ // dollyv2 prompt format
51
+ std::string prompt_for_generation(const std::string& instruction) {
52
+ return INTRO_BLURB + "\n\n" + INSTRUCTION_KEY + "\n" + instruction + "\n\n" + RESPONSE_KEY + "\n";
53
+ }
54
+
55
+ struct dollyv2_layer {
56
+ // pre normalization
57
+ struct ggml_tensor * ln_1_g;
58
+ struct ggml_tensor * ln_1_b;
59
+
60
+ // attention
61
+ struct ggml_tensor * c_attn_attn_w;
62
+ struct ggml_tensor * c_attn_attn_b;
63
+
64
+ struct ggml_tensor * c_attn_proj_w;
65
+ struct ggml_tensor * c_attn_proj_b;
66
+
67
+ // post normalization
68
+ struct ggml_tensor * ln_2_g;
69
+ struct ggml_tensor * ln_2_b;
70
+
71
+ // ff
72
+ struct ggml_tensor * c_mlp_fc_w;
73
+ struct ggml_tensor * c_mlp_fc_b;
74
+
75
+ struct ggml_tensor * c_mlp_proj_w;
76
+ struct ggml_tensor * c_mlp_proj_b;
77
+ };
78
+
79
+ struct dollyv2_model {
80
+ dollyv2_hparams hparams;
81
+
82
+ // normalization
83
+ struct ggml_tensor * ln_f_g;
84
+ struct ggml_tensor * ln_f_b;
85
+
86
+ struct ggml_tensor * wte; // token embedding
87
+
88
+ struct ggml_tensor * lmh_g; // language model head
89
+ //struct ggml_tensor * lmh_b; // language model bias
90
+
91
+ std::vector<dollyv2_layer> layers;
92
+
93
+ // key + value memory
94
+ struct ggml_tensor * memory_k;
95
+ struct ggml_tensor * memory_v;
96
+
97
+ //
98
+ struct ggml_context * ctx;
99
+ std::map<std::string, struct ggml_tensor *> tensors;
100
+ };
101
+
102
+ // load the model's weights from a file
103
+ bool dollyv2_model_load(const std::string & fname, dollyv2_model & model, gpt_vocab & vocab) {
104
+ printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
105
+
106
+ auto fin = std::ifstream(fname, std::ios::binary);
107
+ if (!fin) {
108
+ fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
109
+ return false;
110
+ }
111
+
112
+ // verify magic
113
+ {
114
+ uint32_t magic;
115
+ fin.read((char *) &magic, sizeof(magic));
116
+ if (magic != GGML_FILE_MAGIC) {
117
+ fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
118
+ return false;
119
+ }
120
+ }
121
+
122
+ // load hparams
123
+ {
124
+ auto & hparams = model.hparams;
125
+
126
+ fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
127
+ fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
128
+ fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
129
+ fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
130
+ fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
131
+ fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
132
+ fin.read((char *) &hparams.par_res, sizeof(hparams.par_res));
133
+ fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
134
+
135
+ const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
136
+
137
+ printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
138
+ printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
139
+ printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
140
+ printf("%s: n_head = %d\n", __func__, hparams.n_head);
141
+ printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
142
+ printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
143
+ printf("%s: par_res = %d\n", __func__, hparams.par_res);
144
+ printf("%s: ftype = %d\n", __func__, hparams.ftype);
145
+ printf("%s: qntvr = %d\n", __func__, qntvr);
146
+
147
+ hparams.ftype %= GGML_QNT_VERSION_FACTOR;
148
+ }
149
+
150
+ // load vocab
151
+ {
152
+ const int32_t n_vocab = model.hparams.n_vocab;
153
+
154
+ std::string word;
155
+ std::vector<char> buf(128);
156
+
157
+ for (int i = 0; i < n_vocab; i++) {
158
+ uint32_t len;
159
+ fin.read((char *) &len, sizeof(len));
160
+
161
+ buf.resize(len);
162
+ fin.read((char *) buf.data(), len);
163
+ word.assign(buf.data(), len);
164
+
165
+ vocab.token_to_id[word] = i;
166
+ vocab.id_to_token[i] = word;
167
+ }
168
+
169
+ vocab.add_special_token("### End");
170
+ vocab.add_special_token("### Instruction:");
171
+ vocab.add_special_token("### Response:");
172
+ }
173
+
174
+ // for the big tensors, we have the option to store the data in 16-bit floats or quantized
175
+ // in order to save memory and also to speed up the computation
176
+ ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
177
+ if (wtype == GGML_TYPE_COUNT) {
178
+ fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
179
+ __func__, fname.c_str(), model.hparams.ftype);
180
+ return false;
181
+ }
182
+
183
+ auto & ctx = model.ctx;
184
+
185
+ size_t ctx_size = 0;
186
+
187
+ {
188
+ const auto & hparams = model.hparams;
189
+
190
+ const int n_embd = hparams.n_embd;
191
+ const int n_layer = hparams.n_layer;
192
+ const int n_ctx = hparams.n_ctx;
193
+ const int n_vocab = hparams.n_vocab;
194
+
195
+ ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
196
+ ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
197
+
198
+ ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte
199
+
200
+ ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g
201
+ //ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b
202
+
203
+ ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
204
+ ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
205
+
206
+ ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
207
+ ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
208
+
209
+ ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
210
+ ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
211
+
212
+ ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
213
+ ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
214
+
215
+ ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
216
+ ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
217
+
218
+ ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
219
+ ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
220
+
221
+ ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
222
+ ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
223
+
224
+ ctx_size += (6 + 16*n_layer)*512; // object overhead
225
+
226
+ printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
227
+ }
228
+
229
+ // create the ggml context
230
+ {
231
+ struct ggml_init_params params = {
232
+ /*.mem_size =*/ ctx_size,
233
+ /*.mem_buffer =*/ NULL,
234
+ /*.no_alloc =*/ false,
235
+ };
236
+
237
+ model.ctx = ggml_init(params);
238
+ if (!model.ctx) {
239
+ fprintf(stderr, "%s: ggml_init() failed\n", __func__);
240
+ return false;
241
+ }
242
+ }
243
+
244
+ // prepare memory for the weights
245
+ {
246
+ const auto & hparams = model.hparams;
247
+
248
+ const int n_embd = hparams.n_embd;
249
+ const int n_layer = hparams.n_layer;
250
+ const int n_vocab = hparams.n_vocab;
251
+
252
+ model.layers.resize(n_layer);
253
+
254
+ model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
255
+
256
+ model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
257
+ model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
258
+
259
+ model.lmh_g = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
260
+ //model.lmh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab);
261
+
262
+ // map by name
263
+ model.tensors["gpt_neox.embed_in.weight"] = model.wte;
264
+
265
+ model.tensors["gpt_neox.final_layer_norm.weight"] = model.ln_f_g;
266
+ model.tensors["gpt_neox.final_layer_norm.bias"] = model.ln_f_b;
267
+
268
+ model.tensors["embed_out.weight"] = model.lmh_g;
269
+ //model.tensors["lm_head.bias"] = model.lmh_b;
270
+
271
+ for (int i = 0; i < n_layer; ++i) {
272
+ auto & layer = model.layers[i];
273
+
274
+ layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
275
+ layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
276
+
277
+ layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
278
+ layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
279
+
280
+ layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
281
+ layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
282
+
283
+ layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
284
+ layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
285
+
286
+ layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
287
+ layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
288
+
289
+ layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
290
+ layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
291
+
292
+ // map by name
293
+
294
+ // unmapped: attention.rotary_emb, mlp.act
295
+
296
+ model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.weight"] = layer.ln_1_g;
297
+ model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.bias"] = layer.ln_1_b;
298
+
299
+ model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.weight"] = layer.c_attn_attn_w;
300
+ model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.bias"] = layer.c_attn_attn_b;
301
+
302
+ model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.weight"] = layer.c_attn_proj_w;
303
+ model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.bias"] = layer.c_attn_proj_b;
304
+
305
+ model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.weight"] = layer.ln_2_g;
306
+ model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.bias"] = layer.ln_2_b;
307
+
308
+ model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.weight"] = layer.c_mlp_fc_w;
309
+ model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.bias"] = layer.c_mlp_fc_b;
310
+
311
+ model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_4h_to_h.weight"] = layer.c_mlp_proj_w;
312
+ model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_4h_to_h.bias"] = layer.c_mlp_proj_b;
313
+ }
314
+ }
315
+
316
+ // key + value memory
317
+ {
318
+ const auto & hparams = model.hparams;
319
+
320
+ const int n_embd = hparams.n_embd;
321
+ const int n_layer = hparams.n_layer;
322
+ const int n_ctx = hparams.n_ctx;
323
+
324
+ const int64_t n_mem = n_layer*n_ctx;
325
+ const int64_t n_elements = n_embd*n_mem;
326
+
327
+ model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
328
+ model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
329
+
330
+ const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
331
+
332
+ printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
333
+ }
334
+
335
+ // load weights
336
+ {
337
+ int n_tensors = 0;
338
+ size_t total_size = 0;
339
+
340
+ printf("%s: ", __func__);
341
+
342
+ while (true) {
343
+ int32_t n_dims;
344
+ int32_t length;
345
+ int32_t ttype;
346
+
347
+ fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
348
+ fin.read(reinterpret_cast<char *>(&length), sizeof(length));
349
+ fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
350
+
351
+ if (fin.eof()) {
352
+ break;
353
+ }
354
+
355
+ int32_t nelements = 1;
356
+ int32_t ne[2] = { 1, 1 };
357
+ for (int i = 0; i < n_dims; ++i) {
358
+ fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
359
+ nelements *= ne[i];
360
+ }
361
+
362
+ std::string name(length, 0);
363
+ fin.read(&name[0], length);
364
+
365
+ if (model.tensors.find(name) == model.tensors.end()) {
366
+ fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
367
+ return false;
368
+ }
369
+
370
+ auto tensor = model.tensors[name];
371
+ if (ggml_nelements(tensor) != nelements) {
372
+ fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
373
+ return false;
374
+ }
375
+
376
+ if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
377
+ fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n",
378
+ __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
379
+ return false;
380
+ }
381
+
382
+ // for debugging
383
+ if (0) {
384
+ printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
385
+ }
386
+
387
+ const size_t bpe = ggml_type_size(ggml_type(ttype));
388
+
389
+ if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
390
+ fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
391
+ __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
392
+ return false;
393
+ }
394
+
395
+ fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
396
+
397
+ total_size += ggml_nbytes(tensor);
398
+ if (++n_tensors % 8 == 0) {
399
+ printf(".");
400
+ fflush(stdout);
401
+ }
402
+ }
403
+
404
+ printf(" done\n");
405
+
406
+ printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
407
+ }
408
+
409
+ fin.close();
410
+
411
+ return true;
412
+ }
413
+
414
+ // feed-forward network
415
+ ggml_tensor * gpt_neox_ff(
416
+ const dollyv2_layer & layer,
417
+ ggml_context * ctx0,
418
+ ggml_tensor * inp,
419
+ float eps) {
420
+ ggml_tensor * cur = ggml_norm(ctx0, inp, eps);
421
+
422
+ cur = ggml_add(ctx0,
423
+ ggml_mul(ctx0,
424
+ ggml_repeat(ctx0, layer.ln_2_g, cur),
425
+ cur),
426
+ ggml_repeat(ctx0, layer.ln_2_b, cur));
427
+
428
+ cur = ggml_mul_mat(ctx0,
429
+ layer.c_mlp_fc_w,
430
+ cur);
431
+
432
+ cur = ggml_add(ctx0,
433
+ ggml_repeat(ctx0, layer.c_mlp_fc_b, cur),
434
+ cur);
435
+
436
+ // GELU activation
437
+ cur = ggml_gelu(ctx0, cur);
438
+
439
+ // projection
440
+ // cur = proj_w*cur + proj_b
441
+ cur = ggml_mul_mat(ctx0,
442
+ layer.c_mlp_proj_w,
443
+ cur);
444
+
445
+ cur = ggml_add(ctx0,
446
+ ggml_repeat(ctx0, layer.c_mlp_proj_b, cur),
447
+ cur);
448
+ return cur;
449
+ }
450
+
451
+ // evaluate the transformer
452
+ //
453
+ // - model: the model
454
+ // - n_threads: number of threads to use
455
+ // - n_past: the context size so far
456
+ // - embd_inp: the embeddings of the tokens in the context
457
+ // - embd_w: the predicted logits for the next token
458
+ //
459
+ bool dollyv2_eval(
460
+ const dollyv2_model & model,
461
+ const int n_threads,
462
+ const int n_past,
463
+ const std::vector<gpt_vocab::id> & embd_inp,
464
+ std::vector<float> & embd_w,
465
+ size_t & mem_per_token) {
466
+ const int N = embd_inp.size();
467
+
468
+ const auto & hparams = model.hparams;
469
+
470
+ const int n_embd = hparams.n_embd;
471
+ const int n_layer = hparams.n_layer;
472
+ const int n_ctx = hparams.n_ctx;
473
+ const int n_head = hparams.n_head;
474
+ const int n_vocab = hparams.n_vocab;
475
+ const int n_rot = hparams.n_rot;
476
+
477
+ static size_t buf_size = 256u*1024*1024;
478
+ static void * buf = malloc(buf_size);
479
+
480
+ if (mem_per_token > 0 && mem_per_token*N > buf_size) {
481
+ const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
482
+ //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
483
+
484
+ // reallocate
485
+ buf_size = buf_size_new;
486
+ buf = realloc(buf, buf_size);
487
+ if (buf == nullptr) {
488
+ fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
489
+ return false;
490
+ }
491
+ }
492
+
493
+ struct ggml_init_params params = {
494
+ /*.mem_size =*/ buf_size,
495
+ /*.mem_buffer =*/ buf,
496
+ /*.no_alloc =*/ false,
497
+ };
498
+
499
+ struct ggml_context * ctx0 = ggml_init(params);
500
+ struct ggml_cgraph gf = { };
501
+
502
+ // KQ_pos - contains the positions
503
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
504
+ int * data = (int *) KQ_pos->data;
505
+ for (int i = 0; i < N; ++i) {
506
+ data[i] = n_past + i;
507
+ }
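+ // absolute positions n_past .. n_past + N - 1, consumed by ggml_rope_inplace below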
508
+
509
+ struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
510
+ memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
511
+
512
+ // wte
513
+ struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);
514
+
515
+ for (int il = 0; il < n_layer; ++il) {
516
+ struct ggml_tensor * cur;
517
+
518
+ // self-attention
519
+ {
520
+ {
521
+ cur = ggml_norm(ctx0, inpL, hparams.eps);
522
+
523
+ cur = ggml_add(ctx0,
524
+ ggml_mul(ctx0,
525
+ ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
526
+ cur),
527
+ ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
528
+ }
529
+
530
+ // compute QKV
531
+ {
532
+ cur = ggml_mul_mat(ctx0,
533
+ model.layers[il].c_attn_attn_w,
534
+ cur);
535
+
536
+ cur = ggml_add(ctx0,
537
+ ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
538
+ cur);
539
+ }
540
+
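+ // the fused QKV projection stores each head's q, k and v contiguously per token
+ // (row layout: [q_0 k_0 v_0 | q_1 k_1 v_1 | ...]), so the three views below pick out
+ // q, k and v with offsets of 0, 1 and 2 head sizes within each head block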
541
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head));
542
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 1*sizeof(float)*n_embd/n_head));
543
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head));
544
+
545
+ // using mode = 2 for GPT-NeoX mode
546
+ Qcur = ggml_rope_inplace(ctx0, Qcur, KQ_pos, n_rot, 2, 0);
547
+ Kcur = ggml_rope_inplace(ctx0, Kcur, KQ_pos, n_rot, 2, 0);
548
+
549
+ // store key and value to memory
550
+ {
551
+ Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N));
552
+
553
+ struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
554
+ struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd,
555
+ ( n_ctx)*ggml_element_size(model.memory_v),
556
+ (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v));
557
+
558
+ ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
559
+ ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
560
+ }
561
+
562
+ // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
563
+ struct ggml_tensor * Q =
564
+ ggml_permute(ctx0,
565
+ Qcur,
566
+ 0, 2, 1, 3);
567
+
568
+ // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
569
+ struct ggml_tensor * K =
570
+ ggml_permute(ctx0,
571
+ ggml_reshape_3d(ctx0,
572
+ ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
573
+ n_embd/n_head, n_head, n_past + N),
574
+ 0, 2, 1, 3);
575
+
576
+ // K * Q
577
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
578
+
579
+ // KQ_scaled = KQ / sqrt(n_embd/n_head)
580
+ struct ggml_tensor * KQ_scaled =
581
+ ggml_scale_inplace(ctx0,
582
+ KQ,
583
+ ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
584
+ );
585
+
586
+ // KQ_masked = mask_past(KQ_scaled)
587
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
588
+
589
+ // KQ = soft_max(KQ_masked)
590
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
591
+
592
+ // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
593
+ struct ggml_tensor * V =
594
+ ggml_view_3d(ctx0, model.memory_v,
595
+ n_past + N, n_embd/n_head, n_head,
596
+ n_ctx*ggml_element_size(model.memory_v),
597
+ n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head,
598
+ il*n_ctx*ggml_element_size(model.memory_v)*n_embd);
599
+
600
+ // KQV = transpose(V) * KQ_soft_max
601
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
602
+
603
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
604
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
605
+
606
+ // cur = KQV_merged.contiguous().view(n_embd, N)
607
+ cur = ggml_cpy(ctx0,
608
+ KQV_merged,
609
+ ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
610
+
611
+ // projection
612
+ {
613
+ cur = ggml_mul_mat(ctx0,
614
+ model.layers[il].c_attn_proj_w,
615
+ cur);
616
+
617
+ cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), cur);
618
+ }
619
+ }
620
+
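+ // par_res selects the residual wiring:
+ //   par_res == 0 (sequential):           x = x + attn(ln_1(x)), then x = x + ff(ln_2(x))
+ //   par_res != 0 (parallel, GPT-NeoX):   x = x + attn(ln_1(x)) + ff(ln_2(x))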
621
+ if (hparams.par_res == 0) {
622
+ struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
623
+
624
+ cur = gpt_neox_ff(model.layers[il], ctx0, inpFF, hparams.eps);
625
+
626
+ // input for next layer
627
+ inpL = ggml_add(ctx0, cur, inpFF);
628
+ } else {
629
+ struct ggml_tensor * inpFF = cur;
630
+
631
+ // this is independent of the self-attention result, so it could be done in parallel to the self-attention
632
+ // note here we pass inpL instead of cur
633
+ cur = gpt_neox_ff(model.layers[il], ctx0, inpL, hparams.eps);
634
+
635
+ // layer input + FF
636
+ cur = ggml_add(ctx0, cur, inpFF);
637
+
638
+ // input for next layer
639
+ inpL = ggml_add(ctx0, cur, inpL);
640
+ }
641
+
642
+ }
643
+
644
+ // norm
645
+ {
646
+ inpL = ggml_norm(ctx0, inpL, hparams.eps);
647
+
648
+ // inpL = ln_f_g*inpL + ln_f_b
649
+ inpL = ggml_add(ctx0,
650
+ ggml_mul(ctx0,
651
+ ggml_repeat(ctx0, model.ln_f_g, inpL),
652
+ inpL),
653
+ ggml_repeat(ctx0, model.ln_f_b, inpL));
654
+ }
655
+
656
+ // lm_head
657
+ {
658
+ inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
659
+
660
+ //inpL = ggml_add(ctx0,
661
+ // ggml_repeat(ctx0, model.lmh_b, inpL),
662
+ // inpL);
663
+ }
664
+
665
+ // logits -> probs
666
+ //inpL = ggml_soft_max_inplace(ctx0, inpL);
667
+
668
+ // run the computation
669
+ ggml_build_forward_expand(&gf, inpL);
670
+ ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
671
+
672
+ //if (n_past%100 == 0) {
673
+ // ggml_graph_print (&gf);
674
+ // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
675
+ //}
676
+
677
+ //embd_w.resize(n_vocab*N);
678
+ //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
679
+
680
+ // return result for just the last token
681
+ embd_w.resize(n_vocab);
682
+ memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
683
+
684
+ if (mem_per_token == 0) {
685
+ mem_per_token = ggml_used_mem(ctx0)/N;
686
+ }
687
+ //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
688
+
689
+ ggml_free(ctx0);
690
+
691
+ return true;
692
+ }
693
+
694
+ std::string execute_prompt(
695
+ const dollyv2_model &model,
696
+ gpt_vocab &vocab,
697
+ const std::string &prompt,
698
+ gpt_params &params,
699
+ std::mt19937 &rng,
700
+ int64_t t_load_us,
701
+ int64_t t_sample_us,
702
+ int64_t t_predict_us,
703
+ size_t mem_per_token,
704
+ int n_past,
705
+ bool stream_response_to_cout = false) {
706
+ std::string output = "";
707
+ std::vector<float> logits;
708
+
709
+ // tokenize the prompt
710
+ std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, prompt);
711
+
712
+ params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int)embd_inp.size());
713
+
714
+ printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
715
+ for (size_t i = 0; i < embd_inp.size(); i++) {
716
+ printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
717
+ }
718
+ printf("\n");
719
+
720
+ std::vector<gpt_vocab::id> embd;
721
+
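+ // warm-up pass over a few dummy tokens to measure mem_per_token (see dollyv2_eval)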
722
+ dollyv2_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, mem_per_token);
723
+
724
+ const int32_t end_token = vocab.token_to_id["### End"];
725
+
726
+ for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
727
+ // predict
728
+ if (embd.size() > 0) {
729
+ const int64_t t_start_us = ggml_time_us();
730
+
731
+ if (!dollyv2_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
732
+ printf("Failed to predict\n");
733
+ return output;
734
+ }
735
+
736
+ t_predict_us += ggml_time_us() - t_start_us;
737
+ }
738
+
739
+ n_past += embd.size();
740
+ embd.clear();
741
+
742
+ if (i >= embd_inp.size()) {
743
+ // sample next token
744
+ const int top_k = params.top_k;
745
+ const float top_p = params.top_p;
746
+ const float temp = params.temp;
747
+
748
+ const int n_vocab = model.hparams.n_vocab;
749
+
750
+ gpt_vocab::id id = 0;
751
+
752
+ {
753
+ const int64_t t_start_sample_us = ggml_time_us();
754
+
755
+ id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
756
+
757
+ t_sample_us += ggml_time_us() - t_start_sample_us;
758
+ }
759
+
760
+ // add it to the context
761
+ embd.push_back(id);
762
+ } else {
763
+ // if here, it means we are still processing the input prompt
764
+ for (size_t k = i; k < embd_inp.size(); k++) {
765
+ embd.push_back(embd_inp[k]);
766
+ if (int32_t(embd.size()) > params.n_batch) {
767
+ break;
768
+ }
769
+ }
770
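+ // advance i past the prompt tokens consumed in this batch (the loop's i++ supplies the final +1)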
+ i += embd.size() - 1;
771
+ }
772
+
773
+ // display text
774
+ for (auto id : embd) {
775
+ output += vocab.id_to_token[id];
776
+ if (stream_response_to_cout) {
777
+ printf("%s", vocab.id_to_token[id].c_str());
778
+ }
779
+ }
780
+ if (stream_response_to_cout) {
781
+ fflush(stdout);
782
+ }
783
+
784
+ // end of text token
785
+ if (embd.back() == 0 || (end_token > 0 && embd.back() == end_token)) {
786
+ return output;
787
+ }
788
+ }
789
+ return output;
790
+ }
791
+
792
+ #if defined(DOLLY_INTERACTIVE_PORT)
793
+ int setup_port(const int port) {
794
+ int sockfd = socket(AF_INET, SOCK_STREAM, 0);
795
+ if (sockfd < 0) {
796
+ fprintf(stderr, "%s: Failed to create new socket\n", __func__);
797
+ return -1;
798
+ }
799
+
800
+ sockaddr_in servaddr;
801
+ std::memset(&servaddr, 0, sizeof(servaddr));
802
+
803
+ servaddr.sin_family = AF_INET;
804
+ servaddr.sin_addr.s_addr = htonl(INADDR_ANY);
805
+ servaddr.sin_port = htons(port);
806
+
807
+ if (bind(sockfd, (struct sockaddr *)&servaddr, sizeof(servaddr)) < 0) {
808
+ fprintf(stderr, "%s: Failed to bind to port %i\n", __func__, port);
809
+ return -1;
810
+ }
811
+
812
+ if (listen(sockfd, 10) < 0) {
813
+ fprintf(stderr, "%s: Failed to listen to socket on port %i\n", __func__, port);
814
+ return -1;
815
+ }
816
+ return sockfd;
817
+ }
818
+
819
+ std::string read_from_port(int sockfd, int clientfd) {
820
+ if (clientfd < 0) {
821
+ fprintf(stderr, "%s: Failed to accept new connection\n", __func__);
822
+ return "";
823
+ }
824
+
825
+ char buffer[4096];
826
+ std::memset(buffer, 0, sizeof(buffer));
827
+
828
+ if (read(clientfd, buffer, sizeof(buffer) - 1) < 0) { // leave room for the null terminator
829
+ fprintf(stderr, "%s: Failed to read from client\n", __func__);
830
+ } else {
831
+ std::cout << "Received: " << buffer;
832
+ return std::string(buffer);
833
+ }
834
+ return std::string("");
835
+ }
836
+ #endif
837
+
838
+ int main(int argc, char ** argv) {
839
+ ggml_time_init();
840
+
841
+ const int64_t t_main_start_us = ggml_time_us();
842
+
843
+ gpt_params params;
844
+ params.model = "models/dolly-v2-3b/ggml-model-f16.bin";
845
+
846
+ if (gpt_params_parse(argc, argv, params) == false) {
847
+ return 1;
848
+ }
849
+
850
+ if (params.seed < 0) {
851
+ params.seed = time(NULL);
852
+ }
853
+
854
+ printf("%s: seed = %d\n", __func__, params.seed);
855
+
856
+ std::mt19937 rng(params.seed);
857
+
858
+ int64_t t_load_us = 0;
859
+ int64_t t_sample_us = 0;
860
+ int64_t t_predict_us = 0;
861
+
862
+ // determine the required inference memory per token:
863
+ size_t mem_per_token = 0;
864
+
865
+ int n_past = 0;
866
+
867
+ gpt_vocab vocab;
868
+ dollyv2_model model;
869
+
870
+ // load the model
871
+ {
872
+ const int64_t t_start_us = ggml_time_us();
873
+
874
+ if (!dollyv2_model_load(params.model, model, vocab)) {
875
+ fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
876
+ return 1;
877
+ }
878
+
879
+ t_load_us = ggml_time_us() - t_start_us;
880
+
881
+ test_gpt_tokenizer(vocab, params.token_test);
882
+ }
883
+
884
+ #if defined(DOLLY_INTERACTIVE_PORT)
885
+ int sockfd = -1;
886
+ if (params.interactive_port != -1) {
887
+ sockfd = setup_port(params.interactive_port);
888
+ if (sockfd == -1) {
889
+ return 1;
890
+ }
891
+ fprintf(stdout, "Model is ready on port %i\n", params.interactive_port);
892
+ fflush(stdout);
893
+ }
894
+ #endif
895
+
896
+ if (params.interactive || params.interactive_port != -1) {
897
+ while (true) {
898
+ std::string prompt_input;
899
+ #if defined(DOLLY_INTERACTIVE_PORT)
900
+ int clientfd = -1;
901
+ if (params.interactive_port != -1) {
902
+ sockaddr_in clientaddr;
903
+ socklen_t clientaddrlen = sizeof(clientaddr);
904
+ clientfd = accept(sockfd, (struct sockaddr *)&clientaddr, &clientaddrlen);
905
+ prompt_input = read_from_port(sockfd, clientfd);
906
+ } else
907
+ #endif
908
+ {
909
+ printf("Please enter your quesiton:\n>");
910
+ fflush(stdout);
911
+
912
+ std::getline(std::cin, prompt_input);
913
+ }
914
+
915
+ if (strcmp(prompt_input.c_str(), "exit") == 0) {
916
+ break;
917
+ }
918
+
919
+ const std::string prompt = prompt_for_generation(prompt_input);
920
+ // call the model
921
+ const std::string response = execute_prompt(model, vocab, prompt, params, rng, t_load_us, t_sample_us, t_predict_us, mem_per_token, n_past, true);
922
+
923
+ #if defined(DOLLY_INTERACTIVE_PORT)
924
+ if (params.interactive_port != -1) {
925
+ if (write(clientfd, response.c_str(), response.size()) < 0) {
926
+ fprintf(stderr, "%s: Failed to write answer '%s' to client\n", __func__, response.c_str());
927
+ }
928
+
929
+ if (close(clientfd) < 0) {
930
+ fprintf(stderr, "%s: Failed to close client socket\n", __func__);
931
+ }
932
+ } else
933
+ #endif
934
+ {
935
+ printf("%s\n\n", response.c_str());
936
+ }
937
+ fflush(stdout);
938
+ }
939
+ } else {
940
+ if (params.prompt.empty()) {
941
+ params.prompt = gpt_random_prompt(rng);
942
+ }
943
+
944
+ const std::string prompt = prompt_for_generation(params.prompt);
945
+ execute_prompt(model, vocab, prompt, params, rng, t_load_us, t_sample_us, t_predict_us, mem_per_token, n_past, true);
946
+ }
947
+
948
+ // report timing
949
+ {
950
+ const int64_t t_main_end_us = ggml_time_us();
951
+
952
+ printf("\n\n");
953
+ printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
954
+ printf("%s: load time = %8.2f ms\n", __func__, t_load_us / 1000.0f);
955
+ printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us / 1000.0f);
956
+ printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us / 1000.0f, t_predict_us / 1000.0f / n_past);
957
+ printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
958
+ }
959
+
960
+ ggml_free(model.ctx);
961
+
962
+ #if defined(DOLLY_INTERACTIVE_PORT)
963
+ if (params.interactive_port != -1 && close(sockfd) < 0) {
964
+ fprintf(stderr, "%s: Failed to close server socket\n", __func__);
965
+ }
966
+ #endif
967
+
968
+ return 0;
969
+ }
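Note: when `DOLLY_INTERACTIVE_PORT` is defined, the program above serves one prompt per TCP connection: it performs a single `read` of up to 4 KB, writes the generated text back, and closes the socket. A minimal client sketch under those assumptions (hypothetical, not part of the file above; POSIX sockets, server on localhost, port and prompt taken from argv):

```
// hypothetical client for the DOLLY_INTERACTIVE_PORT server sketched above (POSIX only)
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

#include <cstdio>
#include <cstdlib>
#include <cstring>

int main(int argc, char ** argv) {
    if (argc != 3) {
        fprintf(stderr, "usage: %s port \"prompt\"\n", argv[0]);
        return 1;
    }

    const int fd = socket(AF_INET, SOCK_STREAM, 0);
    if (fd < 0) { perror("socket"); return 1; }

    sockaddr_in addr = {};
    addr.sin_family = AF_INET;
    addr.sin_port   = htons(atoi(argv[1]));
    inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);

    if (connect(fd, (sockaddr *) &addr, sizeof(addr)) < 0) { perror("connect"); return 1; }

    // the server does a single read, so send the whole prompt in one write
    if (write(fd, argv[2], strlen(argv[2])) < 0) { perror("write"); return 1; }

    // read until the server closes the connection after responding
    char buf[4096];
    ssize_t n;
    while ((n = read(fd, buf, sizeof(buf) - 1)) > 0) {
        buf[n] = '\0';
        printf("%s", buf);
    }
    printf("\n");

    close(fd);
    return 0;
}
```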
stable-diffusion.cpp/ggml/examples/dolly-v2/quantize.cpp ADDED
@@ -0,0 +1,178 @@
1
+ #include "ggml/ggml.h"
2
+
3
+ #include "common.h"
4
+ #include "common-ggml.h"
5
+
6
+ #include <cassert>
7
+ #include <cmath>
8
+ #include <cstdio>
9
+ #include <cstring>
10
+ #include <fstream>
11
+ #include <map>
12
+ #include <string>
13
+ #include <vector>
14
+ #include <regex>
15
+
16
+ // default hparams (dollyv2 3B)
17
+ struct dollyv2_hparams {
18
+ int32_t n_vocab = 50254; // tokenizer.vocab_size
19
+ int32_t n_ctx = 2048; // model.config.max_position_embeddings
20
+ int32_t n_embd = 2560; // model.config.hidden_size
21
+ int32_t n_head = 32; // model.config.num_attention_heads
22
+ int32_t n_layer = 32; // model.config.num_hidden_layers
23
+ int32_t n_rot = 20; // rotary_pct[25%] * (n_embd / n_head)
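+ //   = 0.25 * (2560 / 32) = 0.25 * 80 = 20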
24
+ int32_t par_res = 1; // 1 = true, 0 = false
25
+ int32_t ftype = GGML_FTYPE_MOSTLY_F16;
26
+ };
27
+
28
+ // quantize a model
29
+ bool dollyv2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
30
+ gpt_vocab vocab;
31
+
32
+ printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
33
+
34
+ auto finp = std::ifstream(fname_inp, std::ios::binary);
35
+ if (!finp) {
36
+ fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
37
+ return false;
38
+ }
39
+
40
+ auto fout = std::ofstream(fname_out, std::ios::binary);
41
+ if (!fout) {
42
+ fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
43
+ return false;
44
+ }
45
+
46
+ // verify magic
47
+ {
48
+ uint32_t magic;
49
+ finp.read((char *) &magic, sizeof(magic));
50
+ if (magic != GGML_FILE_MAGIC) {
51
+ fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
52
+ return false;
53
+ }
54
+
55
+ fout.write((char *) &magic, sizeof(magic));
56
+ }
57
+
58
+ dollyv2_hparams hparams;
59
+
60
+ // load hparams
61
+ {
62
+ finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
63
+ finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
64
+ finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
65
+ finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
66
+ finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
67
+ finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
68
+ finp.read((char *) &hparams.par_res, sizeof(hparams.par_res));
69
+ finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
70
+
71
+ const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
72
+ const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
73
+
74
+ printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
75
+ printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
76
+ printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
77
+ printf("%s: n_head = %d\n", __func__, hparams.n_head);
78
+ printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
79
+ printf("%s: par_res = %d\n", __func__, hparams.par_res);
80
+ printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
81
+ printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
82
+ printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
83
+ printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
84
+
85
+ fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
86
+ fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
87
+ fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
88
+ fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
89
+ fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
90
+ fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
91
+ fout.write((char *) &hparams.par_res, sizeof(hparams.par_res));
92
+ fout.write((char *) &ftype_dst, sizeof(ftype_dst));
93
+ }
94
+
95
+ // load vocab
96
+ {
97
+ const int32_t n_vocab = hparams.n_vocab;
98
+
99
+ std::string word;
100
+ for (int i = 0; i < n_vocab; i++) {
101
+ uint32_t len;
102
+ finp.read ((char *) &len, sizeof(len));
103
+ fout.write((char *) &len, sizeof(len));
104
+
105
+ word.resize(len);
106
+ finp.read ((char *) word.data(), len);
107
+ fout.write((char *) word.data(), len);
108
+
109
+ vocab.token_to_id[word] = i;
110
+ vocab.id_to_token[i] = word;
111
+ }
112
+ }
113
+
114
+ // regexes of tensor names to be quantized
115
+ const std::vector<std::string> to_quant = {
116
+ ".*weight",
117
+ };
118
+
119
+ if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
120
+ fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
121
+ return false;
122
+ }
123
+
124
+ finp.close();
125
+ fout.close();
126
+
127
+ return true;
128
+ }
129
+
130
+ // usage:
131
+ // ./dollyv2-quantize models/dolly-v2-3B/ggml-model.bin models/dolly-v2-3B/ggml-model-quant.bin type
132
+ //
133
+ int main(int argc, char ** argv) {
134
+ if (argc != 4) {
135
+ fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
136
+ ggml_print_ftypes(stderr);
137
+ return 1;
138
+ }
139
+
140
+ // needed to initialize f16 tables
141
+ {
142
+ struct ggml_init_params params = { 0, NULL, false };
143
+ struct ggml_context * ctx = ggml_init(params);
144
+ ggml_free(ctx);
145
+ }
146
+
147
+ const std::string fname_inp = argv[1];
148
+ const std::string fname_out = argv[2];
149
+
150
+ const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
151
+
152
+ const int64_t t_main_start_us = ggml_time_us();
153
+
154
+ int64_t t_quantize_us = 0;
155
+
156
+ // load the model
157
+ {
158
+ const int64_t t_start_us = ggml_time_us();
159
+
160
+ if (!dollyv2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
161
+ fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
162
+ return 1;
163
+ }
164
+
165
+ t_quantize_us = ggml_time_us() - t_start_us;
166
+ }
167
+
168
+ // report timing
169
+ {
170
+ const int64_t t_main_end_us = ggml_time_us();
171
+
172
+ printf("\n");
173
+ printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
174
+ printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
175
+ }
176
+
177
+ return 0;
178
+ }
stable-diffusion.cpp/ggml/examples/dr_wav.h ADDED
The diff for this file is too large to render. See raw diff
 
stable-diffusion.cpp/ggml/examples/gpt-2/CMakeLists.txt ADDED
@@ -0,0 +1,36 @@
1
+ #
2
+ # gpt-2
3
+
4
+ set(TEST_TARGET gpt-2)
5
+ add_executable(${TEST_TARGET} main.cpp)
6
+ target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
7
+
8
+ #
9
+ # gpt-2-quantize
10
+
11
+ set(TEST_TARGET gpt-2-quantize)
12
+ add_executable(${TEST_TARGET} quantize.cpp)
13
+ target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
14
+
15
+ #
16
+ # gpt-2-batched
17
+
18
+ set(TEST_TARGET gpt-2-batched)
19
+ add_executable(${TEST_TARGET} main-batched.cpp)
20
+ target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
21
+
22
+
23
+ #
24
+ # For GPU offloading
25
+
26
+ if (GGML_CUBLAS)
27
+ add_compile_definitions(GGML_USE_CUBLAS)
28
+ endif()
29
+
30
+ if (GGML_CLBLAST)
31
+ add_compile_definitions(GGML_USE_CLBLAST)
32
+ endif()
33
+
34
+ if (GGML_METAL)
35
+ add_compile_definitions(GGML_USE_METAL)
36
+ endif()
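+
+ # note: these definitions only take effect when the corresponding GGML_* options
+ # are enabled at configure time, e.g. cmake -DGGML_CUBLAS=ON ..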
stable-diffusion.cpp/ggml/examples/gpt-2/README.md ADDED
@@ -0,0 +1,225 @@
1
+ # gpt-2
2
+
3
+ This is a C++ example running GPT-2 inference using the [ggml](https://github.com/ggerganov/ggml) library.
4
+
5
+ The program runs on the CPU - no video card is required.
6
+
7
+ The [Cerebras-GPT](https://huggingface.co/cerebras) models are also supported.
8
+
9
+ The example supports the following GPT-2 models:
10
+
11
+ | Model | Description | Disk Size |
12
+ | --- | --- | --- |
13
+ | 117M | Small model | 240 MB |
14
+ | 345M | Medium model | 680 MB |
15
+ | 774M | Large model | 1.5 GB |
16
+ | 1558M | XL model | 3.0 GB |
17
+
18
+ Sample performance on MacBook M1 Pro:
19
+
20
+ | Model | Size | Time / Token |
21
+ | --- | --- | --- |
22
+ | GPT-2 | 117M | 5 ms |
23
+ | GPT-2 | 345M | 12 ms |
24
+ | GPT-2 | 774M | 23 ms |
25
+ | GPT-2 | 1558M | 42 ms |
26
+
27
+ *TODO: add tables for Cerebras-GPT models*
28
+
29
+ Sample output:
30
+
31
+ ```
32
+ $ ./bin/gpt-2 -h
33
+ usage: ./bin/gpt-2 [options]
34
+
35
+ options:
36
+ -h, --help show this help message and exit
37
+ -s SEED, --seed SEED RNG seed (default: -1)
38
+ -t N, --threads N number of threads to use during computation (default: 8)
39
+ -p PROMPT, --prompt PROMPT
40
+ prompt to start generation with (default: random)
41
+ -n N, --n_predict N number of tokens to predict (default: 200)
42
+ --top_k N top-k sampling (default: 40)
43
+ --top_p N top-p sampling (default: 0.9)
44
+ --temp N temperature (default: 1.0)
45
+ -b N, --batch_size N batch size for prompt processing (default: 8)
46
+ -m FNAME, --model FNAME
47
+ model path (default: models/gpt-2-117M/ggml-model.bin)
48
+
49
+ $ ./bin/gpt-2
50
+ gpt2_model_load: loading model from 'models/gpt-2-117M/ggml-model.bin'
51
+ gpt2_model_load: n_vocab = 50257
52
+ gpt2_model_load: n_ctx = 1024
53
+ gpt2_model_load: n_embd = 768
54
+ gpt2_model_load: n_head = 12
55
+ gpt2_model_load: n_layer = 12
56
+ gpt2_model_load: f16 = 1
57
+ gpt2_model_load: ggml ctx size = 311.12 MB
58
+ gpt2_model_load: memory size = 72.00 MB, n_mem = 12288
59
+ gpt2_model_load: model size = 239.08 MB
60
+ main: number of tokens in prompt = 1
61
+
62
+ So this is going to be the end of the line for us.
63
+
64
+ If the Dolphins continue to do their business, it's possible that the team could make a bid to bring in new defensive coordinator Scott Linehan.
65
+
66
+ Linehan's job is a little daunting, but he's a great coach and an excellent coach. I don't believe we're going to make the playoffs.
67
+
68
+ We're going to have to work hard to keep our heads down and get ready to go.<|endoftext|>
69
+
70
+ main: mem per token = 2048612 bytes
71
+ main: load time = 106.32 ms
72
+ main: sample time = 7.10 ms
73
+ main: predict time = 506.40 ms / 5.06 ms per token
74
+ main: total time = 629.84 ms
75
+ ```
76
+
77
+ ## Downloading and converting the original models (GPT-2)
78
+
79
+ You can download the original model files using the [download-model.sh](download-model.sh) Bash script. The models are
80
+ in TensorFlow format, so in order to use them with ggml, you need to convert them to the appropriate format. This is done
81
+ via the [convert-ckpt-to-ggml.py](convert-ckpt-to-ggml.py) Python script.
82
+
83
+ Here is the entire process for the GPT-2 117M model (download from official site + conversion):
84
+
85
+ ```
86
+ cd ggml/build
87
+ ../examples/gpt-2/download-model.sh 117M
88
+
89
+ Downloading model 117M ...
90
+ models/gpt-2-117M/checkpoint 100%[=============================>] 77 --.-KB/s in 0s
91
+ models/gpt-2-117M/encoder.json 100%[=============================>] 1018K 1.20MB/s in 0.8s
92
+ models/gpt-2-117M/hparams.json 100%[=============================>] 90 --.-KB/s in 0s
93
+ models/gpt-2-117M/model.ckpt.data-00000-of-00001 100%[=============================>] 474.70M 1.21MB/s in 8m 39s
94
+ models/gpt-2-117M/model.ckpt.index 100%[=============================>] 5.09K --.-KB/s in 0s
95
+ models/gpt-2-117M/model.ckpt.meta 100%[=============================>] 460.11K 806KB/s in 0.6s
96
+ models/gpt-2-117M/vocab.bpe 100%[=============================>] 445.62K 799KB/s in 0.6s
97
+ Done! Model '117M' saved in 'models/gpt-2-117M/'
98
+
99
+ Run the convert-ckpt-to-ggml.py script to convert the model to ggml format.
100
+
101
+ python /Users/john/ggml/examples/gpt-2/convert-ckpt-to-ggml.py models/gpt-2-117M/ 1
102
+
103
+ ```
104
+
105
+ This conversion requires that you have Python and TensorFlow installed on your computer. If you want to avoid
106
+ this, you can download the already converted ggml models as described below.
107
+
108
+ ## Downloading and converting the original models (Cerebras-GPT)
109
+
110
+ Clone the respective repository from here: https://huggingface.co/cerebras
111
+
112
+ Use the [convert-cerebras-to-ggml.py](convert-cerebras-to-ggml.py) script to convert the model to `ggml` format:
113
+
114
+ ```
115
+ cd ggml/build
116
+ git clone https://huggingface.co/cerebras/Cerebras-GPT-111M models/
117
+ python ../examples/gpt-2/convert-cerebras-to-ggml.py models/Cerebras-GPT-111M/
118
+
119
+ ```
120
+
121
+ ## Downloading the ggml model directly (GPT-2)
122
+
123
+ For convenience, I will be hosting the converted ggml model files in order to make it easier to run the examples. This
124
+ way, you can directly download a single binary file and start using it. No Python or TensorFlow is required.
125
+
126
+ Here is how to get the 117M ggml model:
127
+
128
+ ```
129
+ cd ggml/build
130
+ ../examples/gpt-2/download-ggml-model.sh 117M
131
+
132
+ Downloading ggml model 117M ...
133
+ models/gpt-2-117M/ggml-model.bin 100%[===============================>] 239.58M 8.52MB/s in 28s
134
+ Done! Model '117M' saved in 'models/gpt-2-117M/ggml-model.bin'
135
+ You can now use it like this:
136
+
137
+ $ ./bin/gpt-2 -m models/gpt-2-117M/ggml-model.bin -p "This is an example"
138
+
139
+ ```
140
+
141
+ At some point, I might decide to stop hosting these models. In that case, simply revert to the manual process above.
142
+
143
+ ## Quantizing the models
144
+
145
+ You can also try to quantize the `ggml` models via 4-bit integer quantization.
146
+ Keep in mind that for smaller models, this will render them completely useless.
147
+ You generally want to quantize larger models.
148
+
149
+ ```
150
+ # quantize GPT-2 F16 to Q4_0 (faster but less precise)
151
+ ./bin/gpt-2-quantize models/gpt-2-1558M/ggml-model-f16.bin models/gpt-2-1558M/ggml-model-q4_0.bin 2
152
+ ./bin/gpt-2 -m models/gpt-2-1558M/ggml-model-q4_0.bin -p "This is an example"
153
+
154
+ # quantize Cerebras F16 to Q4_1 (slower but more precise)
155
+ ./bin/gpt-2-quantize models/Cerebras-GPT-6.7B/ggml-model-f16.bin models/Cerebras-GPT-6.7B/ggml-model-q4_1.bin 3
156
+ ./bin/gpt-2 -m models/Cerebras-GPT-6.7B/ggml-model-q4_1.bin -p "This is an example"
157
+
158
+ ```
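+
+ The trailing number selects the destination `ggml_ftype` (in the commands above, `2` is Q4_0 and `3` is Q4_1); see `ggml_print_ftypes` for the full list of type codes.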
159
+
160
+ ## Batched generation example
161
+
162
+ You can try batched generation from a given prompt using the gpt-2-batched binary.
163
+
164
+ Sample output:
165
+
166
+ ```
167
+ $ gpt-2-batched -np 5 -m models/gpt-2-117M/ggml-model.bin -p "Hello my name is" -n 50
168
+
169
+ main: seed = 1697037431
170
+ gpt2_model_load: loading model from 'models/gpt-2-117M/ggml-model.bin'
171
+ gpt2_model_load: n_vocab = 50257
172
+ gpt2_model_load: n_ctx = 1024
173
+ gpt2_model_load: n_embd = 768
174
+ gpt2_model_load: n_head = 12
175
+ gpt2_model_load: n_layer = 12
176
+ gpt2_model_load: ftype = 1
177
+ gpt2_model_load: qntvr = 0
178
+ gpt2_model_load: ggml tensor size = 320 bytes
179
+ gpt2_model_load: backend buffer size = 312.72 MB
180
+ ggml_init_cublas: found 1 CUDA devices:
181
+ Device 0: NVIDIA GeForce GTX 1660, compute capability 7.5
182
+ gpt2_model_load: using CPU backend
183
+ gpt2_model_load: memory size = 72.00 MB, n_mem = 12288
184
+ gpt2_model_load: model size = 239.08 MB
185
+ extract_tests_from_file : No test file found.
186
+ test_gpt_tokenizer : 0 tests failed out of 0 tests.
187
+ main: compute buffer size: 3.26 MB
188
+
189
+
190
+ main: generating 5 sequences ...
191
+ main: prompt: 'Hello my name is'
192
+ main: number of tokens in prompt = 4, first 8 tokens: 15496 616 1438 318
193
+
194
+
195
+ sequence 0:
196
+
197
+ Hello my name is John. You can call me any way you want, if you want, but for my very first date, I will be on the phone with you. We're both in our early 20s, but I feel like it's all
198
+
199
+ sequence 1:
200
+
201
+ Hello my name is Robert, and I want to say that we're proud to have your company here on the world's largest platform for sharing your stories with us. This is a huge opportunity for our community. We have hundreds of people on this team and
202
+
203
+ sequence 2:
204
+
205
+ Hello my name is Jack. I'm the one who created you.
206
+
207
+ Jack is a boy with a big smile and a big heart. He is a handsome guy. He loves the outdoors and loves the people he meets. He wants to be a
208
+
209
+ sequence 3:
210
+
211
+ Hello my name is John. I am a Canadian citizen with a large number of family in Quebec and I am interested in studying. My aim is to take up a post in the Journal of the International Academy of Sciences of Canada which I am currently finishing.
212
+
213
+ sequence 4:
214
+
215
+ Hello my name is Dan. I am an entrepreneur. I am a great father. I am a great husband. I am a great husband. I am a great dad. And I am a great husband.
216
+
217
+ I love my life. I love
218
+
219
+
220
+
221
+ main: load time = 880.80 ms
222
+ main: sample time = 91.43 ms
223
+ main: predict time = 2518.29 ms
224
+ main: total time = 3544.32 ms
225
+ ```
stable-diffusion.cpp/ggml/examples/gpt-2/convert-cerebras-to-ggml.py ADDED
@@ -0,0 +1,183 @@
1
+ # Convert Cerebras models to ggml format
2
+ #
3
+ # ref: https://www.cerebras.net/blog/cerebras-gpt-a-family-of-open-compute-efficient-large-language-models/
4
+ #
5
+
6
+ import sys
7
+ import struct
8
+ import json
9
+ import torch
10
+ import numpy as np
11
+ import re
12
+
13
+ from transformers import AutoModelForCausalLM
14
+
15
+ # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
16
+ def bytes_to_unicode():
17
+ """
18
+ Returns a list of utf-8 bytes and a corresponding list of unicode strings.
19
+ The reversible bpe codes work on unicode strings.
20
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
21
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
22
+ This is a significant percentage of your normal, say, 32K bpe vocab.
23
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
24
+ It also avoids mapping to whitespace/control characters that the bpe code barfs on.
25
+ """
26
+ bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
27
+ cs = bs[:]
28
+ n = 0
29
+ for b in range(2**8):
30
+ if b not in bs:
31
+ bs.append(b)
32
+ cs.append(2**8+n)
33
+ n += 1
34
+ cs = [chr(n) for n in cs]
35
+ return dict(zip(bs, cs))
36
+
37
+ if len(sys.argv) < 2:
38
+ print("Usage: convert-cerebras-to-ggml.py dir-model [use-f32]\n")
39
+ sys.exit(1)
40
+
41
+ # output in the same directory as the model
42
+ dir_model = sys.argv[1]
43
+ fname_out = sys.argv[1] + "/ggml-model-f16.bin"
44
+
45
+ with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
46
+ encoder = json.load(f)
47
+
48
+ with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
49
+ hparams = json.load(f)
50
+
51
+ # use 16-bit or 32-bit floats
52
+ use_f16 = True
53
+ if len(sys.argv) > 2:
54
+ use_f16 = False
55
+ fname_out = sys.argv[1] + "/ggml-model-f32.bin"
56
+
57
+ model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)
58
+ #print (model)
59
+
60
+ list_vars = model.state_dict()
61
+ #print (list_vars)
62
+
63
+ print(hparams)
64
+
65
+ fout = open(fname_out, "wb")
66
+
67
+ fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
68
+ fout.write(struct.pack("i", hparams["vocab_size"]))
69
+ fout.write(struct.pack("i", hparams["n_positions"]))
70
+ fout.write(struct.pack("i", hparams["n_embd"]))
71
+ fout.write(struct.pack("i", hparams["n_head"]))
72
+ fout.write(struct.pack("i", hparams["n_layer"]))
73
+ fout.write(struct.pack("i", use_f16))
74
+
75
+ byte_encoder = bytes_to_unicode()
76
+ byte_decoder = {v:k for k, v in byte_encoder.items()}
77
+
78
+ fout.write(struct.pack("i", len(encoder)))
79
+
80
+ for key in encoder:
81
+ text = bytearray([byte_decoder[c] for c in key])
82
+ fout.write(struct.pack("i", len(text)))
83
+ fout.write(text)
84
+
85
+ for name in list_vars.keys():
86
+ data = list_vars[name].squeeze().numpy()
87
+ print("Processing variable: " + name + " with shape: ", data.shape)
88
+
89
+ # rename headers to keep compatibility
90
+ if name == "transformer.ln_f.weight":
91
+ name = "model/ln_f/g"
92
+ elif name == "transformer.ln_f.bias":
93
+ name = "model/ln_f/b"
94
+ elif name == "transformer.wte.weight":
95
+ name = "model/wte"
96
+ elif name == "transformer.wpe.weight":
97
+ name = "model/wpe"
98
+ elif name == "lm_head.weight":
99
+ name = "model/lm_head"
100
+ elif re.match(r"transformer.h\.\d+\.ln_1\.weight", name):
101
+ i = re.findall("\d+", name)[0]
102
+ name = f"model/h{i}/ln_1/g"
103
+ elif re.match(r"transformer.h\.\d+\.ln_1\.bias", name):
104
+ i = re.findall("\d+", name)[0]
105
+ name = f"model/h{i}/ln_1/b"
106
+ elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.weight", name):
107
+ i = re.findall("\d+", name)[0]
108
+ name = f"model/h{i}/attn/c_attn/w"
109
+ elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.bias", name):
110
+ i = re.findall("\d+", name)[0]
111
+ name = f"model/h{i}/attn/c_attn/b"
112
+ elif re.match(r"transformer.h\.\d+\.attn\.c_proj\.weight", name):
113
+ i = re.findall("\d+", name)[0]
114
+ name = f"model/h{i}/attn/c_proj/w"
115
+ elif re.match(r"transformer.h.\d+.attn.c_proj.bias", name):
116
+ i = re.findall("\d+", name)[0]
117
+ name = f"model/h{i}/attn/c_proj/b"
118
+ elif re.match(r"transformer.h.\d+.ln_2.weight", name):
119
+ i = re.findall("\d+", name)[0]
120
+ name = f"model/h{i}/ln_2/g"
121
+ elif re.match(r"transformer.h.\d+.ln_2.bias", name):
122
+ i = re.findall("\d+", name)[0]
123
+ name = f"model/h{i}/ln_2/b"
124
+ elif re.match(r"transformer.h.\d+.mlp.c_fc.weight", name):
125
+ i = re.findall("\d+", name)[0]
126
+ name = f"model/h{i}/mlp/c_fc/w"
127
+ elif re.match(r"transformer.h.\d+.mlp.c_fc.bias", name):
128
+ i = re.findall("\d+", name)[0]
129
+ name = f"model/h{i}/mlp/c_fc/b"
130
+ elif re.match(r"transformer.h.\d+.mlp.c_proj.weight", name):
131
+ i = re.findall("\d+", name)[0]
132
+ name = f"model/h{i}/mlp/c_proj/w"
133
+ elif re.match(r"transformer.h.\d+.mlp.c_proj.bias", name):
134
+ i = re.findall("\d+", name)[0]
135
+ name = f"model/h{i}/mlp/c_proj/b"
136
+ else:
137
+ print("Unrecognized variable name. %s", name)
138
+
139
+ # we don't need these
140
+ if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"):
141
+ print(" Skipping variable: " + name)
142
+ continue
143
+
144
+ n_dims = len(data.shape)
145
+
146
+ # ftype == 0 -> float32, ftype == 1 -> float16
147
+ ftype = 0
148
+ if use_f16:
149
+ if (name == "model/wte" or name == "model/lm_head" or name[-2:] == "/g" or name[-2:] == "/w") and n_dims == 2:
150
+ print(" Converting to float16")
151
+ data = data.astype(np.float16)
152
+ ftype = 1
153
+ else:
154
+ print(" Converting to float32")
155
+ data = data.astype(np.float32)
156
+ ftype = 0
157
+
158
+ # for efficiency - transpose the projection matrices
159
+ # "model/h.*/attn/c_attn/w"
160
+ # "model/h.*/attn/c_proj/w"
161
+ # "model/h.*/mlp/c_fc/w"
162
+ # "model/h.*/mlp/c_proj/w"
163
+ if name[-14:] == "/attn/c_attn/w" or \
164
+ name[-14:] == "/attn/c_proj/w" or \
165
+ name[-11:] == "/mlp/c_fc/w" or \
166
+ name[-13:] == "/mlp/c_proj/w":
167
+ print(" Transposing")
168
+ data = data.transpose()
169
+
170
+ # header
171
+ str = name.encode('utf-8')
172
+ fout.write(struct.pack("iii", n_dims, len(str), ftype))
173
+ for i in range(n_dims):
174
+ fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
175
+ fout.write(str)
176
+
177
+ # data
178
+ data.tofile(fout)
179
+
180
+ fout.close()
181
+
182
+ print("Done. Output file: " + fname_out)
183
+ print("")
stable-diffusion.cpp/ggml/examples/gpt-2/convert-ckpt-to-ggml.py ADDED
@@ -0,0 +1,159 @@
1
+ # Convert a model checkpoint to a ggml compatible file
2
+ #
3
+ # Load the model using TensorFlow.
4
+ # Iterate over all variables and write them to a binary file.
5
+ #
6
+ # For each variable, write the following:
7
+ # - Number of dimensions (int)
8
+ # - Name length (int)
9
+ # - Dimensions (int[n_dims])
10
+ # - Name (char[name_length])
11
+ # - Data (float[n_dims])
12
+ #
13
+ # By default, the bigger matrices are converted to 16-bit floats.
14
+ # This can be disabled by adding the "use-f32" CLI argument.
15
+ #
16
+ # At the start of the ggml file we write the model parameters
17
+ # and vocabulary.
18
+ #
19
+
20
+ import sys
21
+ import json
22
+ import struct
23
+ import numpy as np
24
+ import tensorflow as tf
25
+
26
+ # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
27
+ def bytes_to_unicode():
28
+ """
29
+ Returns a list of utf-8 bytes and a corresponding list of unicode strings.
30
+ The reversible bpe codes work on unicode strings.
31
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
32
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
33
+ This is a significant percentage of your normal, say, 32K bpe vocab.
34
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
35
+ It also avoids mapping to whitespace/control characters that the bpe code barfs on.
36
+ """
37
+ bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
38
+ cs = bs[:]
39
+ n = 0
40
+ for b in range(2**8):
41
+ if b not in bs:
42
+ bs.append(b)
43
+ cs.append(2**8+n)
44
+ n += 1
45
+ cs = [chr(n) for n in cs]
46
+ return dict(zip(bs, cs))
47
+
48
+ # helper method to convert a numpy array to different float types
49
+ def convert_to_ftype(data, ftype):
50
+ # fp16
51
+ if ftype == 1:
52
+ return data.astype(np.float16)
53
+
54
+ assert False, "Invalid ftype: " + str(ftype)
55
+
56
+ if len(sys.argv) < 3:
57
+ print("Usage: convert-ckpt-to-ggml.py dir-model ftype\n")
58
+ print(" ftype == 0 -> float32")
59
+ print(" ftype == 1 -> float16")
60
+ sys.exit(1)
61
+
62
+ # output in the same directory as the model
63
+ dir_model = sys.argv[1]
64
+ fname_out = sys.argv[1] + "/ggml-model.bin"
65
+
66
+ with open(dir_model + "/encoder.json", "r", encoding="utf-8") as f:
67
+ encoder = json.load(f)
68
+
69
+ with open(dir_model + "/hparams.json", "r", encoding="utf-8") as f:
70
+ hparams = json.load(f)
71
+
72
+ # possible data types
73
+ # ftype == 0 -> float32
74
+ # ftype == 1 -> float16
75
+ #
76
+ # map from ftype to string
77
+ ftype_str = ["f32", "f16"]
78
+
79
+ ftype = 1
80
+ if len(sys.argv) > 2:
81
+ ftype = int(sys.argv[2])
82
+ if ftype < 0 or ftype > 1:
83
+ print("Invalid ftype: " + str(ftype))
84
+ sys.exit(1)
85
+ fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
86
+
87
+ list_vars = tf.train.list_variables(dir_model)
88
+
89
+ fout = open(fname_out, "wb")
90
+
91
+ fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
92
+ fout.write(struct.pack("i", hparams["n_vocab"]))
93
+ fout.write(struct.pack("i", hparams["n_ctx"]))
94
+ fout.write(struct.pack("i", hparams["n_embd"]))
95
+ fout.write(struct.pack("i", hparams["n_head"]))
96
+ fout.write(struct.pack("i", hparams["n_layer"]))
97
+ fout.write(struct.pack("i", ftype))
98
+
99
+ byte_encoder = bytes_to_unicode()
100
+ byte_decoder = {v:k for k, v in byte_encoder.items()}
101
+
102
+ fout.write(struct.pack("i", len(encoder)))
103
+
104
+ for key in encoder:
105
+ text = bytearray([byte_decoder[c] for c in key])
106
+ fout.write(struct.pack("i", len(text)))
107
+ fout.write(text)
108
+
109
+ for name, shape in list_vars:
110
+ print("Processing variable: " + name + " with shape: ", shape)
111
+
112
+ data = tf.train.load_variable(dir_model, name).squeeze()
113
+ n_dims = len(data.shape)
114
+
115
+ # for efficiency - transpose the projection matrices
116
+ # "model/h.*/attn/c_attn/w"
117
+ # "model/h.*/attn/c_proj/w"
118
+ # "model/h.*/mlp/c_fc/w"
119
+ # "model/h.*/mlp/c_proj/w"
120
+ if name[-14:] == "/attn/c_attn/w" or \
121
+ name[-14:] == "/attn/c_proj/w" or \
122
+ name[-11:] == "/mlp/c_fc/w" or \
123
+ name[-13:] == "/mlp/c_proj/w":
124
+ print(" Transposing")
125
+ data = data.transpose()
126
+
127
+ dshape = data.shape
128
+
129
+ ftype_cur = 0
130
+ if ftype != 0:
131
+ # match name:
132
+ # "model/wte"
133
+ # "model/h.*/attn/c_attn/w"
134
+ # "model/h.*/attn/c_proj/w"
135
+ # "model/h.*/mlp/c_fc/w"
136
+ # "model/h.*/mlp/c_proj/w"
137
+ if name == "model/wte" or name[-2:] == "/w":
138
+ print(" Converting to " + ftype_str[ftype])
139
+ data = convert_to_ftype(data, ftype)
140
+ ftype_cur = ftype
141
+ else:
142
+ print(" Converting to float32")
143
+ data = data.astype(np.float32)
144
+ ftype_cur = 0
145
+
146
+ # header
147
+ str = name.encode('utf-8')
148
+ fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
149
+ for i in range(n_dims):
150
+ fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
151
+ fout.write(str)
152
+
153
+ # data
154
+ data.tofile(fout)
155
+
156
+ fout.close()
157
+
158
+ print("Done. Output file: " + fname_out)
159
+ print("")
stable-diffusion.cpp/ggml/examples/gpt-2/convert-h5-to-ggml.py ADDED
@@ -0,0 +1,195 @@
1
+ # Convert GPT-2 h5 transformer model to ggml format
2
+ #
3
+ # Load the model using GPT2Model.
4
+ # Iterate over all variables and write them to a binary file.
5
+ #
6
+ # For each variable, write the following:
7
+ # - Number of dimensions (int)
8
+ # - Name length (int)
9
+ # - Dimensions (int[n_dims])
10
+ # - Name (char[name_length])
11
+ # - Data (float[n_dims])
12
+ #
13
+ # By default, the bigger matrices are converted to 16-bit floats.
14
+ # This can be disabled by adding the "use-f32" CLI argument.
15
+ #
16
+ # At the start of the ggml file we write the model parameters
17
+ # and vocabulary.
18
+ #
19
+
20
+ import sys
21
+ import struct
22
+ import json
23
+ import numpy as np
24
+ import re
25
+
26
+ from transformers import GPT2Model
27
+
28
+ # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
29
+ def bytes_to_unicode():
30
+ """
31
+ Returns a list of utf-8 bytes and a corresponding list of unicode strings.
32
+ The reversible bpe codes work on unicode strings.
33
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
34
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
35
+ This is a significant percentage of your normal, say, 32K bpe vocab.
36
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
37
+ It also avoids mapping to whitespace/control characters that the bpe code barfs on.
38
+ """
39
+ bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
40
+ cs = bs[:]
41
+ n = 0
42
+ for b in range(2**8):
43
+ if b not in bs:
44
+ bs.append(b)
45
+ cs.append(2**8+n)
46
+ n += 1
47
+ cs = [chr(n) for n in cs]
48
+ return dict(zip(bs, cs))
49
+
50
+ if len(sys.argv) < 2:
51
+ print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
52
+ sys.exit(1)
53
+
54
+ # output in the same directory as the model
55
+ dir_model = sys.argv[1]
56
+ fname_out = sys.argv[1] + "/ggml-model.bin"
57
+
58
+ with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
59
+ encoder = json.load(f)
60
+
61
+ with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
62
+ encoder_added = json.load(f)
63
+
64
+ with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
65
+ hparams = json.load(f)
66
+
67
+ # use 16-bit or 32-bit floats
68
+ use_f16 = True
69
+ if len(sys.argv) > 2:
70
+ use_f16 = False
71
+ fname_out = sys.argv[1] + "/ggml-model-f32.bin"
72
+
73
+ model = GPT2Model.from_pretrained(dir_model, low_cpu_mem_usage=True)
74
+ #print (model)
75
+
76
+ list_vars = model.state_dict()
77
+ #print (list_vars)
78
+
79
+ fout = open(fname_out, "wb")
80
+
81
+ fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
82
+ fout.write(struct.pack("i", hparams["vocab_size"]))
83
+ fout.write(struct.pack("i", hparams["n_positions"]))
84
+ fout.write(struct.pack("i", hparams["n_embd"]))
85
+ fout.write(struct.pack("i", hparams["n_head"]))
86
+ fout.write(struct.pack("i", hparams["n_layer"]))
87
+ #fout.write(struct.pack("i", hparams["rotary_dim"]))
88
+ fout.write(struct.pack("i", use_f16))
89
+
90
+ byte_encoder = bytes_to_unicode()
91
+ byte_decoder = {v:k for k, v in byte_encoder.items()}
92
+
93
+ fout.write(struct.pack("i", len(encoder) + len(encoder_added)))
94
+
95
+ for key in encoder:
96
+ text = bytearray([byte_decoder[c] for c in key])
97
+ fout.write(struct.pack("i", len(text)))
98
+ fout.write(text)
99
+
100
+ for key in encoder_added:
101
+ text = bytearray([byte_decoder[c] for c in key])
102
+ fout.write(struct.pack("i", len(text)))
103
+ fout.write(text)
104
+
105
+ for name in list_vars.keys():
106
+ data = list_vars[name].squeeze().numpy()
107
+ print("Processing variable: " + name + " with shape: ", data.shape)
108
+
109
+ # we don't need these
110
+ if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"):
111
+ print(" Skipping variable: " + name)
112
+ continue
113
+
114
+ n_dims = len(data.shape)
115
+
116
+ # ftype == 0 -> float32, ftype == 1 -> float16
117
+ ftype = 0
118
+ if use_f16:
119
+ if name[-7:] == ".weight" and n_dims == 2:
120
+ print(" Converting to float16")
121
+ data = data.astype(np.float16)
122
+ ftype = 1
123
+ else:
124
+ print(" Converting to float32")
125
+ data = data.astype(np.float32)
126
+ ftype = 0
127
+
+     # for efficiency - transpose these matrices:
+     #   "transformer.h.*.mlp.c_proj.weight"
+     if name.endswith(".mlp.c_proj.weight"):
+         print("  Transposing")
+         data = data.transpose()
+
+     # rename headers to keep compatibility
+     if name == "ln_f.weight":
+         name = "model/ln_f/g"
+     elif name == "ln_f.bias":
+         name = "model/ln_f/b"
+     elif name == "wte.weight":
+         name = "model/wte"
+     elif name == "wpe.weight":
+         name = "model/wpe"
+     elif re.match(r"h\.\d+\.ln_1\.weight", name):
+         i = re.findall(r"\d+", name)[0]
+         name = f"model/h{i}/ln_1/g"
+     elif re.match(r"h\.\d+\.ln_1\.bias", name):
+         i = re.findall(r"\d+", name)[0]
+         name = f"model/h{i}/ln_1/b"
+     elif re.match(r"h\.\d+\.attn\.c_attn\.weight", name):
+         i = re.findall(r"\d+", name)[0]
+         name = f"model/h{i}/attn/c_attn/w"
+     elif re.match(r"h\.\d+\.attn\.c_attn\.bias", name):
+         i = re.findall(r"\d+", name)[0]
+         name = f"model/h{i}/attn/c_attn/b"
+     elif re.match(r"h\.\d+\.attn\.c_proj\.weight", name):
+         i = re.findall(r"\d+", name)[0]
+         name = f"model/h{i}/attn/c_proj/w"
+     elif re.match(r"h\.\d+\.attn\.c_proj\.bias", name):
+         i = re.findall(r"\d+", name)[0]
+         name = f"model/h{i}/attn/c_proj/b"
+     elif re.match(r"h\.\d+\.ln_2\.weight", name):
+         i = re.findall(r"\d+", name)[0]
+         name = f"model/h{i}/ln_2/g"
+     elif re.match(r"h\.\d+\.ln_2\.bias", name):
+         i = re.findall(r"\d+", name)[0]
+         name = f"model/h{i}/ln_2/b"
+     elif re.match(r"h\.\d+\.mlp\.c_fc\.weight", name):
+         i = re.findall(r"\d+", name)[0]
+         name = f"model/h{i}/mlp/c_fc/w"
+     elif re.match(r"h\.\d+\.mlp\.c_fc\.bias", name):
+         i = re.findall(r"\d+", name)[0]
+         name = f"model/h{i}/mlp/c_fc/b"
+     elif re.match(r"h\.\d+\.mlp\.c_proj\.weight", name):
+         i = re.findall(r"\d+", name)[0]
+         name = f"model/h{i}/mlp/c_proj/w"
+     elif re.match(r"h\.\d+\.mlp\.c_proj\.bias", name):
+         i = re.findall(r"\d+", name)[0]
+         name = f"model/h{i}/mlp/c_proj/b"
+     else:
+         print("Unrecognized variable name: " + name)
+
+     name_bytes = name.encode('utf-8')
+
+     fout.write(struct.pack("iii", n_dims, len(name_bytes), ftype))
+     for i in range(n_dims):
+         fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
+     fout.write(name_bytes)
+
+     # data
+     data.tofile(fout)
+
+ fout.close()
+
+ print("Done. Output file: " + fname_out)
+ print("")
stable-diffusion.cpp/ggml/examples/gpt-2/download-ggml-model.sh ADDED
@@ -0,0 +1,69 @@
+ #!/bin/bash
+
+ # This script downloads GPT-2 model files that have already been converted to ggml format.
+ # This way you don't have to convert them yourself.
+ #
+ # If you want to download the original GPT-2 model files, use the "download-model.sh" script instead.
+
+ #src="https://ggml.ggerganov.com"
+ #pfx="ggml-model-gpt-2"
+
+ src="https://huggingface.co/ggerganov/ggml"
+ pfx="resolve/main/ggml-model-gpt-2"
+
+ ggml_path=$(dirname $(realpath $0))
+
+ # GPT-2 models
+ models=( "117M" "345M" "774M" "1558M" )
+
+ # list available models
+ function list_models {
+     printf "\n"
+     printf "  Available models:"
+     for model in "${models[@]}"; do
+         printf " $model"
+     done
+     printf "\n\n"
+ }
+
+ if [ "$#" -ne 1 ]; then
+     printf "Usage: $0 <model>\n"
+     list_models
+
+     exit 1
+ fi
+
+ model=$1
+
+ if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
+     printf "Invalid model: $model\n"
+     list_models
+
+     exit 1
+ fi
+
+ # download ggml model
+
+ printf "Downloading ggml model $model ...\n"
+
+ mkdir -p models/gpt-2-$model
+
+ if [ -x "$(command -v wget)" ]; then
+     wget --quiet --show-progress -O models/gpt-2-$model/ggml-model.bin $src/$pfx-$model.bin
+ elif [ -x "$(command -v curl)" ]; then
+     curl -L --output models/gpt-2-$model/ggml-model.bin $src/$pfx-$model.bin
+ else
+     printf "Either wget or curl is required to download models.\n"
+     exit 1
+ fi
+
+ if [ $? -ne 0 ]; then
+     printf "Failed to download ggml model $model \n"
+     printf "Please try again later or download the original GPT-2 model files and convert them yourself.\n"
+     exit 1
+ fi
+
+ printf "Done! Model '$model' saved in 'models/gpt-2-$model/ggml-model.bin'\n"
+ printf "You can now use it like this:\n\n"
+ printf "  $ ./bin/gpt-2 -m models/gpt-2-$model/ggml-model.bin -p \"This is an example\"\n"
+ printf "\n"
stable-diffusion.cpp/ggml/examples/gpt-2/download-model.sh ADDED
@@ -0,0 +1,48 @@
+ #!/bin/bash
+
+ ggml_path=$(dirname $(realpath $0))
+
+ # GPT-2 models
+ models=( "117M" "345M" "774M" "1558M" )
+
+ # list available models
+ function list_models {
+     printf "\n"
+     printf "  Available models:"
+     for model in "${models[@]}"; do
+         printf " $model"
+     done
+     printf "\n\n"
+ }
+
+ if [ "$#" -ne 1 ]; then
+     printf "Usage: $0 <model>\n"
+     list_models
+
+     exit 1
+ fi
+
+ model=$1
+
+ if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
+     printf "Invalid model: $model\n"
+     list_models
+
+     exit 1
+ fi
+
+ # download model
+
+ printf "Downloading model $model ...\n"
+
+ mkdir -p models/gpt-2-$model
+
+ for file in checkpoint encoder.json hparams.json model.ckpt.data-00000-of-00001 model.ckpt.index model.ckpt.meta vocab.bpe; do
+     wget --quiet --show-progress -O models/gpt-2-$model/$file https://openaipublic.blob.core.windows.net/gpt-2/models/$model/$file
+ done
+
+ printf "Done! Model '$model' saved in 'models/gpt-2-$model/'\n\n"
+ printf "Run the convert-ckpt-to-ggml.py script to convert the model to ggml format.\n"
+ printf "\n"
+ printf "  python $ggml_path/convert-ckpt-to-ggml.py models/gpt-2-$model/\n"
+ printf "\n"