Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +2 -0
- stable-diffusion.cpp/.dockerignore +6 -0
- stable-diffusion.cpp/.github/workflows/build.yml +201 -0
- stable-diffusion.cpp/.gitignore +5 -0
- stable-diffusion.cpp/.gitmodules +3 -0
- stable-diffusion.cpp/CMakeLists.txt +45 -0
- stable-diffusion.cpp/Dockerfile +17 -0
- stable-diffusion.cpp/LICENSE +21 -0
- stable-diffusion.cpp/README.md +198 -0
- stable-diffusion.cpp/assets/a lovely cat.png +0 -0
- stable-diffusion.cpp/assets/f16.png +0 -0
- stable-diffusion.cpp/assets/f32.png +0 -0
- stable-diffusion.cpp/assets/img2img_output.png +0 -0
- stable-diffusion.cpp/assets/q4_0.png +0 -0
- stable-diffusion.cpp/assets/q4_1.png +0 -0
- stable-diffusion.cpp/assets/q5_0.png +0 -0
- stable-diffusion.cpp/assets/q5_1.png +0 -0
- stable-diffusion.cpp/assets/q8_0.png +0 -0
- stable-diffusion.cpp/examples/CMakeLists.txt +8 -0
- stable-diffusion.cpp/examples/main.cpp +473 -0
- stable-diffusion.cpp/examples/stb_image.h +0 -0
- stable-diffusion.cpp/examples/stb_image_write.h +1741 -0
- stable-diffusion.cpp/ggml/.editorconfig +19 -0
- stable-diffusion.cpp/ggml/.github/workflows/ci.yml +137 -0
- stable-diffusion.cpp/ggml/.gitignore +37 -0
- stable-diffusion.cpp/ggml/CMakeLists.txt +197 -0
- stable-diffusion.cpp/ggml/LICENSE +21 -0
- stable-diffusion.cpp/ggml/README.md +140 -0
- stable-diffusion.cpp/ggml/build.zig +158 -0
- stable-diffusion.cpp/ggml/ci/run.sh +334 -0
- stable-diffusion.cpp/ggml/cmake/BuildTypes.cmake +54 -0
- stable-diffusion.cpp/ggml/cmake/GitVars.cmake +22 -0
- stable-diffusion.cpp/ggml/examples/CMakeLists.txt +30 -0
- stable-diffusion.cpp/ggml/examples/common-ggml.cpp +246 -0
- stable-diffusion.cpp/ggml/examples/common-ggml.h +18 -0
- stable-diffusion.cpp/ggml/examples/common.cpp +817 -0
- stable-diffusion.cpp/ggml/examples/common.h +179 -0
- stable-diffusion.cpp/ggml/examples/dolly-v2/CMakeLists.txt +13 -0
- stable-diffusion.cpp/ggml/examples/dolly-v2/README.md +187 -0
- stable-diffusion.cpp/ggml/examples/dolly-v2/convert-h5-to-ggml.py +116 -0
- stable-diffusion.cpp/ggml/examples/dolly-v2/main.cpp +969 -0
- stable-diffusion.cpp/ggml/examples/dolly-v2/quantize.cpp +178 -0
- stable-diffusion.cpp/ggml/examples/dr_wav.h +0 -0
- stable-diffusion.cpp/ggml/examples/gpt-2/CMakeLists.txt +36 -0
- stable-diffusion.cpp/ggml/examples/gpt-2/README.md +225 -0
- stable-diffusion.cpp/ggml/examples/gpt-2/convert-cerebras-to-ggml.py +183 -0
- stable-diffusion.cpp/ggml/examples/gpt-2/convert-ckpt-to-ggml.py +159 -0
- stable-diffusion.cpp/ggml/examples/gpt-2/convert-h5-to-ggml.py +195 -0
- stable-diffusion.cpp/ggml/examples/gpt-2/download-ggml-model.sh +69 -0
- stable-diffusion.cpp/ggml/examples/gpt-2/download-model.sh +48 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
stable-diffusion.cpp/ggml/examples/mnist/models/mnist/mnist_model.state_dict filter=lfs diff=lfs merge=lfs -text
|
37 |
+
stable-diffusion.cpp/ggml/examples/mnist/models/mnist/t10k-images.idx3-ubyte filter=lfs diff=lfs merge=lfs -text
|
stable-diffusion.cpp/.dockerignore
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
build*/
|
2 |
+
test/
|
3 |
+
|
4 |
+
.cache/
|
5 |
+
*.swp
|
6 |
+
models/
|
stable-diffusion.cpp/.github/workflows/build.yml
ADDED
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: CI
|
2 |
+
|
3 |
+
on:
|
4 |
+
workflow_dispatch: # allows manual triggering
|
5 |
+
inputs:
|
6 |
+
create_release:
|
7 |
+
description: 'Create new release'
|
8 |
+
required: true
|
9 |
+
type: boolean
|
10 |
+
push:
|
11 |
+
branches:
|
12 |
+
- master
|
13 |
+
- ci
|
14 |
+
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
|
15 |
+
pull_request:
|
16 |
+
types: [opened, synchronize, reopened]
|
17 |
+
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
|
18 |
+
|
19 |
+
env:
|
20 |
+
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
21 |
+
|
22 |
+
jobs:
|
23 |
+
ubuntu-latest-cmake:
|
24 |
+
runs-on: ubuntu-latest
|
25 |
+
|
26 |
+
steps:
|
27 |
+
- name: Clone
|
28 |
+
id: checkout
|
29 |
+
uses: actions/checkout@v3
|
30 |
+
with:
|
31 |
+
submodules: recursive
|
32 |
+
|
33 |
+
|
34 |
+
- name: Dependencies
|
35 |
+
id: depends
|
36 |
+
run: |
|
37 |
+
sudo apt-get update
|
38 |
+
sudo apt-get install build-essential
|
39 |
+
|
40 |
+
- name: Build
|
41 |
+
id: cmake_build
|
42 |
+
run: |
|
43 |
+
mkdir build
|
44 |
+
cd build
|
45 |
+
cmake ..
|
46 |
+
cmake --build . --config Release
|
47 |
+
|
48 |
+
#- name: Test
|
49 |
+
#id: cmake_test
|
50 |
+
#run: |
|
51 |
+
#cd build
|
52 |
+
#ctest --verbose --timeout 900
|
53 |
+
|
54 |
+
macOS-latest-cmake:
|
55 |
+
runs-on: macos-latest
|
56 |
+
|
57 |
+
steps:
|
58 |
+
- name: Clone
|
59 |
+
id: checkout
|
60 |
+
uses: actions/checkout@v3
|
61 |
+
with:
|
62 |
+
submodules: recursive
|
63 |
+
|
64 |
+
- name: Dependencies
|
65 |
+
id: depends
|
66 |
+
continue-on-error: true
|
67 |
+
run: |
|
68 |
+
brew update
|
69 |
+
|
70 |
+
- name: Build
|
71 |
+
id: cmake_build
|
72 |
+
run: |
|
73 |
+
sysctl -a
|
74 |
+
mkdir build
|
75 |
+
cd build
|
76 |
+
cmake ..
|
77 |
+
cmake --build . --config Release
|
78 |
+
|
79 |
+
#- name: Test
|
80 |
+
#id: cmake_test
|
81 |
+
#run: |
|
82 |
+
#cd build
|
83 |
+
#ctest --verbose --timeout 900
|
84 |
+
|
85 |
+
windows-latest-cmake:
|
86 |
+
runs-on: windows-latest
|
87 |
+
|
88 |
+
strategy:
|
89 |
+
matrix:
|
90 |
+
include:
|
91 |
+
- build: 'noavx'
|
92 |
+
defines: '-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
|
93 |
+
- build: 'avx2'
|
94 |
+
defines: '-DGGML_AVX2=ON'
|
95 |
+
- build: 'avx'
|
96 |
+
defines: '-DGGML_AVX2=OFF'
|
97 |
+
- build: 'avx512'
|
98 |
+
defines: '-DGGML_AVX512=ON'
|
99 |
+
|
100 |
+
steps:
|
101 |
+
- name: Clone
|
102 |
+
id: checkout
|
103 |
+
uses: actions/checkout@v3
|
104 |
+
with:
|
105 |
+
submodules: recursive
|
106 |
+
|
107 |
+
- name: Build
|
108 |
+
id: cmake_build
|
109 |
+
run: |
|
110 |
+
mkdir build
|
111 |
+
cd build
|
112 |
+
cmake .. ${{ matrix.defines }}
|
113 |
+
cmake --build . --config Release
|
114 |
+
|
115 |
+
- name: Check AVX512F support
|
116 |
+
id: check_avx512f
|
117 |
+
if: ${{ matrix.build == 'avx512' }}
|
118 |
+
continue-on-error: true
|
119 |
+
run: |
|
120 |
+
cd build
|
121 |
+
$vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
|
122 |
+
$msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
|
123 |
+
$cl = $(join-path $msvc 'bin\Hostx64\x64\cl.exe')
|
124 |
+
echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c
|
125 |
+
& $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
|
126 |
+
.\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"
|
127 |
+
|
128 |
+
#- name: Test
|
129 |
+
#id: cmake_test
|
130 |
+
#run: |
|
131 |
+
#cd build
|
132 |
+
#ctest -C Release --verbose --timeout 900
|
133 |
+
|
134 |
+
- name: Get commit hash
|
135 |
+
id: commit
|
136 |
+
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
137 |
+
uses: pr-mpt/actions-commit-hash@v2
|
138 |
+
|
139 |
+
- name: Pack artifacts
|
140 |
+
id: pack_artifacts
|
141 |
+
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
142 |
+
run: |
|
143 |
+
Copy-Item ggml/LICENSE .\build\bin\Release\ggml.txt
|
144 |
+
Copy-Item LICENSE .\build\bin\Release\stable-diffusion.cpp.txt
|
145 |
+
7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
|
146 |
+
|
147 |
+
- name: Upload artifacts
|
148 |
+
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
149 |
+
uses: actions/upload-artifact@v3
|
150 |
+
with:
|
151 |
+
path: |
|
152 |
+
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
|
153 |
+
|
154 |
+
release:
|
155 |
+
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
156 |
+
|
157 |
+
runs-on: ubuntu-latest
|
158 |
+
|
159 |
+
needs:
|
160 |
+
- ubuntu-latest-cmake
|
161 |
+
- macOS-latest-cmake
|
162 |
+
- windows-latest-cmake
|
163 |
+
|
164 |
+
steps:
|
165 |
+
- name: Download artifacts
|
166 |
+
id: download-artifact
|
167 |
+
uses: actions/download-artifact@v3
|
168 |
+
|
169 |
+
- name: Get commit hash
|
170 |
+
id: commit
|
171 |
+
uses: pr-mpt/actions-commit-hash@v2
|
172 |
+
|
173 |
+
- name: Create release
|
174 |
+
id: create_release
|
175 |
+
uses: anzz1/action-create-release@v1
|
176 |
+
env:
|
177 |
+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
178 |
+
with:
|
179 |
+
tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
|
180 |
+
|
181 |
+
- name: Upload release
|
182 |
+
id: upload_release
|
183 |
+
uses: actions/github-script@v3
|
184 |
+
with:
|
185 |
+
github-token: ${{secrets.GITHUB_TOKEN}}
|
186 |
+
script: |
|
187 |
+
const path = require('path');
|
188 |
+
const fs = require('fs');
|
189 |
+
const release_id = '${{ steps.create_release.outputs.id }}';
|
190 |
+
for (let file of await fs.readdirSync('./artifact')) {
|
191 |
+
if (path.extname(file) === '.zip') {
|
192 |
+
console.log('uploadReleaseAsset', file);
|
193 |
+
await github.repos.uploadReleaseAsset({
|
194 |
+
owner: context.repo.owner,
|
195 |
+
repo: context.repo.repo,
|
196 |
+
release_id: release_id,
|
197 |
+
name: file,
|
198 |
+
data: await fs.readFileSync(`./artifact/${file}`)
|
199 |
+
});
|
200 |
+
}
|
201 |
+
}
|
stable-diffusion.cpp/.gitignore
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
build*/
|
2 |
+
test/
|
3 |
+
|
4 |
+
.cache/
|
5 |
+
*.swp
|
stable-diffusion.cpp/.gitmodules
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
[submodule "ggml"]
|
2 |
+
path = ggml
|
3 |
+
url = https://github.com/leejet/ggml.git
|
stable-diffusion.cpp/CMakeLists.txt
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
cmake_minimum_required(VERSION 3.12)
|
2 |
+
project("stable-diffusion")
|
3 |
+
|
4 |
+
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
5 |
+
|
6 |
+
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
|
7 |
+
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
|
8 |
+
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
|
9 |
+
endif()
|
10 |
+
|
11 |
+
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
12 |
+
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
13 |
+
|
14 |
+
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
|
15 |
+
set(SD_STANDALONE ON)
|
16 |
+
else()
|
17 |
+
set(SD_STANDALONE OFF)
|
18 |
+
endif()
|
19 |
+
|
20 |
+
#
|
21 |
+
# Option list
|
22 |
+
#
|
23 |
+
|
24 |
+
# general
|
25 |
+
#option(SD_BUILD_TESTS "sd: build tests" ${SD_STANDALONE})
|
26 |
+
option(SD_BUILD_EXAMPLES "sd: build examples" ${SD_STANDALONE})
|
27 |
+
option(BUILD_SHARED_LIBS "sd: build shared libs" OFF)
|
28 |
+
#option(SD_BUILD_SERVER "sd: build server example" ON)
|
29 |
+
|
30 |
+
|
31 |
+
# deps
|
32 |
+
add_subdirectory(ggml)
|
33 |
+
|
34 |
+
set(SD_LIB stable-diffusion)
|
35 |
+
|
36 |
+
add_library(${SD_LIB} stable-diffusion.h stable-diffusion.cpp)
|
37 |
+
target_link_libraries(${SD_LIB} PUBLIC ggml)
|
38 |
+
target_include_directories(${SD_LIB} PUBLIC .)
|
39 |
+
target_compile_features(${SD_LIB} PUBLIC cxx_std_11)
|
40 |
+
|
41 |
+
|
42 |
+
if (SD_BUILD_EXAMPLES)
|
43 |
+
add_subdirectory(examples)
|
44 |
+
endif()
|
45 |
+
|
stable-diffusion.cpp/Dockerfile
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ARG UBUNTU_VERSION=22.04
|
2 |
+
|
3 |
+
FROM ubuntu:$UBUNTU_VERSION as build
|
4 |
+
|
5 |
+
RUN apt-get update && apt-get install -y build-essential git cmake
|
6 |
+
|
7 |
+
WORKDIR /sd.cpp
|
8 |
+
|
9 |
+
COPY . .
|
10 |
+
|
11 |
+
RUN mkdir build && cd build && cmake .. && cmake --build . --config Release
|
12 |
+
|
13 |
+
FROM ubuntu:$UBUNTU_VERSION as runtime
|
14 |
+
|
15 |
+
COPY --from=build /sd.cpp/build/bin/sd /sd
|
16 |
+
|
17 |
+
ENTRYPOINT [ "/sd" ]
|
stable-diffusion.cpp/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2023 leejet
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
stable-diffusion.cpp/README.md
ADDED
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<p align="center">
|
2 |
+
<img src="./assets/a%20lovely%20cat.png" width="256x">
|
3 |
+
</p>
|
4 |
+
|
5 |
+
# stable-diffusion.cpp
|
6 |
+
|
7 |
+
Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in pure C/C++
|
8 |
+
|
9 |
+
## Features
|
10 |
+
|
11 |
+
- Plain C/C++ implementation based on [ggml](https://github.com/ggerganov/ggml), working in the same way as [llama.cpp](https://github.com/ggerganov/llama.cpp)
|
12 |
+
- 16-bit, 32-bit float support
|
13 |
+
- 4-bit, 5-bit and 8-bit integer quantization support
|
14 |
+
- Accelerated memory-efficient CPU inference
|
15 |
+
- Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image
|
16 |
+
- AVX, AVX2 and AVX512 support for x86 architectures
|
17 |
+
- SD1.x and SD2.x support
|
18 |
+
- Original `txt2img` and `img2img` mode
|
19 |
+
- Negative prompt
|
20 |
+
- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now)
|
21 |
+
- Sampling method
|
22 |
+
- `Euler A`
|
23 |
+
- `Euler`
|
24 |
+
- `Heun`
|
25 |
+
- `DPM2`
|
26 |
+
- `DPM++ 2M`
|
27 |
+
- [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457)
|
28 |
+
- `DPM++ 2S a`
|
29 |
+
- Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`)
|
30 |
+
- Embedds generation parameters into png output as webui-compatible text string
|
31 |
+
- Supported platforms
|
32 |
+
- Linux
|
33 |
+
- Mac OS
|
34 |
+
- Windows
|
35 |
+
- Android (via Termux)
|
36 |
+
|
37 |
+
### TODO
|
38 |
+
|
39 |
+
- [ ] More sampling methods
|
40 |
+
- [ ] GPU support
|
41 |
+
- [ ] Make inference faster
|
42 |
+
- The current implementation of ggml_conv_2d is slow and has high memory usage
|
43 |
+
- [ ] Continuing to reduce memory usage (quantizing the weights of ggml_conv_2d)
|
44 |
+
- [ ] LoRA support
|
45 |
+
- [ ] k-quants support
|
46 |
+
|
47 |
+
## Usage
|
48 |
+
|
49 |
+
### Get the Code
|
50 |
+
|
51 |
+
```
|
52 |
+
git clone --recursive https://github.com/leejet/stable-diffusion.cpp
|
53 |
+
cd stable-diffusion.cpp
|
54 |
+
```
|
55 |
+
|
56 |
+
- If you have already cloned the repository, you can use the following command to update the repository to the latest code.
|
57 |
+
|
58 |
+
```
|
59 |
+
cd stable-diffusion.cpp
|
60 |
+
git pull origin master
|
61 |
+
git submodule init
|
62 |
+
git submodule update
|
63 |
+
```
|
64 |
+
|
65 |
+
### Convert weights
|
66 |
+
|
67 |
+
- download original weights(.ckpt or .safetensors). For example
|
68 |
+
- Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
|
69 |
+
- Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
|
70 |
+
- Stable Diffuison v2.1 from https://huggingface.co/stabilityai/stable-diffusion-2-1
|
71 |
+
|
72 |
+
```shell
|
73 |
+
curl -L -O https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt
|
74 |
+
# curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
|
75 |
+
# curl -L -O https://huggingface.co/stabilityai/stable-diffusion-2-1/blob/main/v2-1_768-nonema-pruned.safetensors
|
76 |
+
```
|
77 |
+
|
78 |
+
- convert weights to ggml model format
|
79 |
+
|
80 |
+
```shell
|
81 |
+
cd models
|
82 |
+
pip install -r requirements.txt
|
83 |
+
python convert.py [path to weights] --out_type [output precision]
|
84 |
+
# For example, python convert.py sd-v1-4.ckpt --out_type f16
|
85 |
+
```
|
86 |
+
|
87 |
+
### Quantization
|
88 |
+
|
89 |
+
You can specify the output model format using the --out_type parameter
|
90 |
+
|
91 |
+
- `f16` for 16-bit floating-point
|
92 |
+
- `f32` for 32-bit floating-point
|
93 |
+
- `q8_0` for 8-bit integer quantization
|
94 |
+
- `q5_0` or `q5_1` for 5-bit integer quantization
|
95 |
+
- `q4_0` or `q4_1` for 4-bit integer quantization
|
96 |
+
|
97 |
+
### Build
|
98 |
+
|
99 |
+
#### Build from scratch
|
100 |
+
|
101 |
+
```shell
|
102 |
+
mkdir build
|
103 |
+
cd build
|
104 |
+
cmake ..
|
105 |
+
cmake --build . --config Release
|
106 |
+
```
|
107 |
+
|
108 |
+
##### Using OpenBLAS
|
109 |
+
|
110 |
+
```
|
111 |
+
cmake .. -DGGML_OPENBLAS=ON
|
112 |
+
cmake --build . --config Release
|
113 |
+
```
|
114 |
+
|
115 |
+
### Run
|
116 |
+
|
117 |
+
```
|
118 |
+
usage: ./bin/sd [arguments]
|
119 |
+
|
120 |
+
arguments:
|
121 |
+
-h, --help show this help message and exit
|
122 |
+
-M, --mode [txt2img or img2img] generation mode (default: txt2img)
|
123 |
+
-t, --threads N number of threads to use during computation (default: -1).
|
124 |
+
If threads <= 0, then threads will be set to the number of CPU physical cores
|
125 |
+
-m, --model [MODEL] path to model
|
126 |
+
-i, --init-img [IMAGE] path to the input image, required by img2img
|
127 |
+
-o, --output OUTPUT path to write result image to (default: .\output.png)
|
128 |
+
-p, --prompt [PROMPT] the prompt to render
|
129 |
+
-n, --negative-prompt PROMPT the negative prompt (default: "")
|
130 |
+
--cfg-scale SCALE unconditional guidance scale: (default: 7.0)
|
131 |
+
--strength STRENGTH strength for noising/unnoising (default: 0.75)
|
132 |
+
1.0 corresponds to full destruction of information in init image
|
133 |
+
-H, --height H image height, in pixel space (default: 512)
|
134 |
+
-W, --width W image width, in pixel space (default: 512)
|
135 |
+
--sampling-method {euler, euler_a, heun, dpm++2m, dpm++2mv2}
|
136 |
+
sampling method (default: "euler_a")
|
137 |
+
--steps STEPS number of sample steps (default: 20)
|
138 |
+
--rng {std_default, cuda} RNG (default: cuda)
|
139 |
+
-s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)
|
140 |
+
-v, --verbose print extra info
|
141 |
+
```
|
142 |
+
|
143 |
+
#### txt2img example
|
144 |
+
|
145 |
+
```
|
146 |
+
./bin/sd -m ../models/sd-v1-4-ggml-model-f16.bin -p "a lovely cat"
|
147 |
+
```
|
148 |
+
|
149 |
+
Using formats of different precisions will yield results of varying quality.
|
150 |
+
|
151 |
+
| f32 | f16 |q8_0 |q5_0 |q5_1 |q4_0 |q4_1 |
|
152 |
+
| ---- |---- |---- |---- |---- |---- |---- |
|
153 |
+
| ![](./assets/f32.png) |![](./assets/f16.png) |![](./assets/q8_0.png) |![](./assets/q5_0.png) |![](./assets/q5_1.png) |![](./assets/q4_0.png) |![](./assets/q4_1.png) |
|
154 |
+
|
155 |
+
#### img2img example
|
156 |
+
|
157 |
+
- `./output.png` is the image generated from the above txt2img pipeline
|
158 |
+
|
159 |
+
|
160 |
+
```
|
161 |
+
./bin/sd --mode img2img -m ../models/sd-v1-4-ggml-model-f16.bin -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
|
162 |
+
```
|
163 |
+
|
164 |
+
<p align="center">
|
165 |
+
<img src="./assets/img2img_output.png" width="256x">
|
166 |
+
</p>
|
167 |
+
|
168 |
+
### Docker
|
169 |
+
|
170 |
+
#### Building using Docker
|
171 |
+
|
172 |
+
```shell
|
173 |
+
docker build -t sd .
|
174 |
+
```
|
175 |
+
|
176 |
+
#### Run
|
177 |
+
|
178 |
+
```shell
|
179 |
+
docker run -v /path/to/models:/models -v /path/to/output/:/output sd [args...]
|
180 |
+
# For example
|
181 |
+
# docker run -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4-ggml-model-f16.bin -p "a lovely cat" -v -o /output/output.png
|
182 |
+
```
|
183 |
+
|
184 |
+
## Memory/Disk Requirements
|
185 |
+
|
186 |
+
| precision | f32 | f16 |q8_0 |q5_0 |q5_1 |q4_0 |q4_1 |
|
187 |
+
| ---- | ---- |---- |---- |---- |---- |---- |---- |
|
188 |
+
| **Disk** | 2.7G | 2.0G | 1.7G | 1.6G | 1.6G | 1.5G | 1.5G |
|
189 |
+
| **Memory**(txt2img - 512 x 512) | ~2.8G | ~2.3G | ~2.1G | ~2.0G | ~2.0G | ~2.0G | ~2.0G |
|
190 |
+
|
191 |
+
|
192 |
+
## References
|
193 |
+
|
194 |
+
- [ggml](https://github.com/ggerganov/ggml)
|
195 |
+
- [stable-diffusion](https://github.com/CompVis/stable-diffusion)
|
196 |
+
- [stable-diffusion-stability-ai](https://github.com/Stability-AI/stablediffusion)
|
197 |
+
- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui)
|
198 |
+
- [k-diffusion](https://github.com/crowsonkb/k-diffusion)
|
stable-diffusion.cpp/assets/a lovely cat.png
ADDED
![]() |
stable-diffusion.cpp/assets/f16.png
ADDED
![]() |
stable-diffusion.cpp/assets/f32.png
ADDED
![]() |
stable-diffusion.cpp/assets/img2img_output.png
ADDED
![]() |
stable-diffusion.cpp/assets/q4_0.png
ADDED
![]() |
stable-diffusion.cpp/assets/q4_1.png
ADDED
![]() |
stable-diffusion.cpp/assets/q5_0.png
ADDED
![]() |
stable-diffusion.cpp/assets/q5_1.png
ADDED
![]() |
stable-diffusion.cpp/assets/q8_0.png
ADDED
![]() |
stable-diffusion.cpp/examples/CMakeLists.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# TODO: move into its own subdirectoy
|
2 |
+
# TODO: make stb libs a target (maybe common)
|
3 |
+
set(SD_TARGET sd)
|
4 |
+
|
5 |
+
add_executable(${SD_TARGET} main.cpp stb_image.h stb_image_write.h)
|
6 |
+
install(TARGETS ${SD_TARGET} RUNTIME)
|
7 |
+
target_link_libraries(${SD_TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
|
8 |
+
target_compile_features(${SD_TARGET} PUBLIC cxx_std_11)
|
stable-diffusion.cpp/examples/main.cpp
ADDED
@@ -0,0 +1,473 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include <stdio.h>
|
2 |
+
#include <ctime>
|
3 |
+
#include <fstream>
|
4 |
+
#include <iostream>
|
5 |
+
#include <random>
|
6 |
+
#include <string>
|
7 |
+
#include <thread>
|
8 |
+
#include <unordered_set>
|
9 |
+
|
10 |
+
#include "stable-diffusion.h"
|
11 |
+
|
12 |
+
#define STB_IMAGE_IMPLEMENTATION
|
13 |
+
#include "stb_image.h"
|
14 |
+
|
15 |
+
#define STB_IMAGE_WRITE_IMPLEMENTATION
|
16 |
+
#define STB_IMAGE_WRITE_STATIC
|
17 |
+
#include "stb_image_write.h"
|
18 |
+
|
19 |
+
#if defined(__APPLE__) && defined(__MACH__)
|
20 |
+
#include <sys/sysctl.h>
|
21 |
+
#include <sys/types.h>
|
22 |
+
#endif
|
23 |
+
|
24 |
+
#if !defined(_WIN32)
|
25 |
+
#include <sys/ioctl.h>
|
26 |
+
#include <unistd.h>
|
27 |
+
#endif
|
28 |
+
|
29 |
+
#define TXT2IMG "txt2img"
|
30 |
+
#define IMG2IMG "img2img"
|
31 |
+
|
32 |
+
// get_num_physical_cores is copy from
|
33 |
+
// https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
|
34 |
+
// LICENSE: https://github.com/ggerganov/llama.cpp/blob/master/LICENSE
|
35 |
+
int32_t get_num_physical_cores() {
|
36 |
+
#ifdef __linux__
|
37 |
+
// enumerate the set of thread siblings, num entries is num cores
|
38 |
+
std::unordered_set<std::string> siblings;
|
39 |
+
for (uint32_t cpu = 0; cpu < UINT32_MAX; ++cpu) {
|
40 |
+
std::ifstream thread_siblings("/sys/devices/system/cpu" + std::to_string(cpu) + "/topology/thread_siblings");
|
41 |
+
if (!thread_siblings.is_open()) {
|
42 |
+
break; // no more cpus
|
43 |
+
}
|
44 |
+
std::string line;
|
45 |
+
if (std::getline(thread_siblings, line)) {
|
46 |
+
siblings.insert(line);
|
47 |
+
}
|
48 |
+
}
|
49 |
+
if (siblings.size() > 0) {
|
50 |
+
return static_cast<int32_t>(siblings.size());
|
51 |
+
}
|
52 |
+
#elif defined(__APPLE__) && defined(__MACH__)
|
53 |
+
int32_t num_physical_cores;
|
54 |
+
size_t len = sizeof(num_physical_cores);
|
55 |
+
int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
|
56 |
+
if (result == 0) {
|
57 |
+
return num_physical_cores;
|
58 |
+
}
|
59 |
+
result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
|
60 |
+
if (result == 0) {
|
61 |
+
return num_physical_cores;
|
62 |
+
}
|
63 |
+
#elif defined(_WIN32)
|
64 |
+
// TODO: Implement
|
65 |
+
#endif
|
66 |
+
unsigned int n_threads = std::thread::hardware_concurrency();
|
67 |
+
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
|
68 |
+
}
|
69 |
+
|
70 |
+
const char* rng_type_to_str[] = {
|
71 |
+
"std_default",
|
72 |
+
"cuda",
|
73 |
+
};
|
74 |
+
|
75 |
+
// Names of the sampler method, same order as enum SampleMethod in stable-diffusion.h
|
76 |
+
const char* sample_method_str[] = {
|
77 |
+
"euler_a",
|
78 |
+
"euler",
|
79 |
+
"heun",
|
80 |
+
"dpm2",
|
81 |
+
"dpm++2s_a",
|
82 |
+
"dpm++2m",
|
83 |
+
"dpm++2mv2"};
|
84 |
+
|
85 |
+
// Names of the sigma schedule overrides, same order as Schedule in stable-diffusion.h
|
86 |
+
const char* schedule_str[] = {
|
87 |
+
"default",
|
88 |
+
"discrete",
|
89 |
+
"karras"};
|
90 |
+
|
91 |
+
struct Option {
|
92 |
+
int n_threads = -1;
|
93 |
+
std::string mode = TXT2IMG;
|
94 |
+
std::string model_path;
|
95 |
+
std::string output_path = "output.png";
|
96 |
+
std::string init_img;
|
97 |
+
std::string prompt;
|
98 |
+
std::string negative_prompt;
|
99 |
+
float cfg_scale = 7.0f;
|
100 |
+
int w = 512;
|
101 |
+
int h = 512;
|
102 |
+
SampleMethod sample_method = EULER_A;
|
103 |
+
Schedule schedule = DEFAULT;
|
104 |
+
int sample_steps = 20;
|
105 |
+
float strength = 0.75f;
|
106 |
+
RNGType rng_type = CUDA_RNG;
|
107 |
+
int64_t seed = 42;
|
108 |
+
bool verbose = false;
|
109 |
+
|
110 |
+
void print() {
|
111 |
+
printf("Option: \n");
|
112 |
+
printf(" n_threads: %d\n", n_threads);
|
113 |
+
printf(" mode: %s\n", mode.c_str());
|
114 |
+
printf(" model_path: %s\n", model_path.c_str());
|
115 |
+
printf(" output_path: %s\n", output_path.c_str());
|
116 |
+
printf(" init_img: %s\n", init_img.c_str());
|
117 |
+
printf(" prompt: %s\n", prompt.c_str());
|
118 |
+
printf(" negative_prompt: %s\n", negative_prompt.c_str());
|
119 |
+
printf(" cfg_scale: %.2f\n", cfg_scale);
|
120 |
+
printf(" width: %d\n", w);
|
121 |
+
printf(" height: %d\n", h);
|
122 |
+
printf(" sample_method: %s\n", sample_method_str[sample_method]);
|
123 |
+
printf(" schedule: %s\n", schedule_str[schedule]);
|
124 |
+
printf(" sample_steps: %d\n", sample_steps);
|
125 |
+
printf(" strength: %.2f\n", strength);
|
126 |
+
printf(" rng: %s\n", rng_type_to_str[rng_type]);
|
127 |
+
printf(" seed: %ld\n", seed);
|
128 |
+
}
|
129 |
+
};
|
130 |
+
|
131 |
+
// Print the CLI help text to stdout. Called for -h/--help and before every
// argument-error exit; argv[0] supplies the program name in the usage line.
void print_usage(int argc, const char* argv[]) {
    printf("usage: %s [arguments]\n", argv[0]);
    printf("\n");
    printf("arguments:\n");
    printf("  -h, --help                         show this help message and exit\n");
    printf("  -M, --mode [txt2img or img2img]    generation mode (default: txt2img)\n");
    printf("  -t, --threads N                    number of threads to use during computation (default: -1).\n");
    printf("                                     If threads <= 0, then threads will be set to the number of CPU physical cores\n");
    printf("  -m, --model [MODEL]                path to model\n");
    printf("  -i, --init-img [IMAGE]             path to the input image, required by img2img\n");
    printf("  -o, --output OUTPUT                path to write result image to (default: .\\output.png)\n");
    printf("  -p, --prompt [PROMPT]              the prompt to render\n");
    printf("  -n, --negative-prompt PROMPT       the negative prompt (default: \"\")\n");
    printf("  --cfg-scale SCALE                  unconditional guidance scale: (default: 7.0)\n");
    printf("  --strength STRENGTH                strength for noising/unnoising (default: 0.75)\n");
    printf("                                     1.0 corresponds to full destruction of information in init image\n");
    printf("  -H, --height H                     image height, in pixel space (default: 512)\n");
    printf("  -W, --width W                      image width, in pixel space (default: 512)\n");
    printf("  --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2}\n");
    printf("                                     sampling method (default: \"euler_a\")\n");
    printf("  --steps STEPS                      number of sample steps (default: 20)\n");
    printf("  --rng {std_default, cuda}          RNG (default: cuda)\n");
    printf("  -s SEED, --seed SEED               RNG seed (default: 42, use random seed for < 0)\n");
    // BUGFIX: the help previously claimed "(default: discrete)" and omitted
    // "default" from the choices; Option::schedule defaults to Schedule::DEFAULT
    // which maps to schedule_str[0] == "default".
    printf("  --schedule {default, discrete, karras} Denoiser sigma schedule (default: default)\n");
    printf("  -v, --verbose                      print extra info\n");
}
|
157 |
+
|
158 |
+
void parse_args(int argc, const char* argv[], Option* opt) {
|
159 |
+
bool invalid_arg = false;
|
160 |
+
|
161 |
+
for (int i = 1; i < argc; i++) {
|
162 |
+
std::string arg = argv[i];
|
163 |
+
|
164 |
+
if (arg == "-t" || arg == "--threads") {
|
165 |
+
if (++i >= argc) {
|
166 |
+
invalid_arg = true;
|
167 |
+
break;
|
168 |
+
}
|
169 |
+
opt->n_threads = std::stoi(argv[i]);
|
170 |
+
} else if (arg == "-M" || arg == "--mode") {
|
171 |
+
if (++i >= argc) {
|
172 |
+
invalid_arg = true;
|
173 |
+
break;
|
174 |
+
}
|
175 |
+
opt->mode = argv[i];
|
176 |
+
|
177 |
+
} else if (arg == "-m" || arg == "--model") {
|
178 |
+
if (++i >= argc) {
|
179 |
+
invalid_arg = true;
|
180 |
+
break;
|
181 |
+
}
|
182 |
+
opt->model_path = argv[i];
|
183 |
+
} else if (arg == "-i" || arg == "--init-img") {
|
184 |
+
if (++i >= argc) {
|
185 |
+
invalid_arg = true;
|
186 |
+
break;
|
187 |
+
}
|
188 |
+
opt->init_img = argv[i];
|
189 |
+
} else if (arg == "-o" || arg == "--output") {
|
190 |
+
if (++i >= argc) {
|
191 |
+
invalid_arg = true;
|
192 |
+
break;
|
193 |
+
}
|
194 |
+
opt->output_path = argv[i];
|
195 |
+
} else if (arg == "-p" || arg == "--prompt") {
|
196 |
+
if (++i >= argc) {
|
197 |
+
invalid_arg = true;
|
198 |
+
break;
|
199 |
+
}
|
200 |
+
opt->prompt = argv[i];
|
201 |
+
} else if (arg == "-n" || arg == "--negative-prompt") {
|
202 |
+
if (++i >= argc) {
|
203 |
+
invalid_arg = true;
|
204 |
+
break;
|
205 |
+
}
|
206 |
+
opt->negative_prompt = argv[i];
|
207 |
+
} else if (arg == "--cfg-scale") {
|
208 |
+
if (++i >= argc) {
|
209 |
+
invalid_arg = true;
|
210 |
+
break;
|
211 |
+
}
|
212 |
+
opt->cfg_scale = std::stof(argv[i]);
|
213 |
+
} else if (arg == "--strength") {
|
214 |
+
if (++i >= argc) {
|
215 |
+
invalid_arg = true;
|
216 |
+
break;
|
217 |
+
}
|
218 |
+
opt->strength = std::stof(argv[i]);
|
219 |
+
} else if (arg == "-H" || arg == "--height") {
|
220 |
+
if (++i >= argc) {
|
221 |
+
invalid_arg = true;
|
222 |
+
break;
|
223 |
+
}
|
224 |
+
opt->h = std::stoi(argv[i]);
|
225 |
+
} else if (arg == "-W" || arg == "--width") {
|
226 |
+
if (++i >= argc) {
|
227 |
+
invalid_arg = true;
|
228 |
+
break;
|
229 |
+
}
|
230 |
+
opt->w = std::stoi(argv[i]);
|
231 |
+
} else if (arg == "--steps") {
|
232 |
+
if (++i >= argc) {
|
233 |
+
invalid_arg = true;
|
234 |
+
break;
|
235 |
+
}
|
236 |
+
opt->sample_steps = std::stoi(argv[i]);
|
237 |
+
} else if (arg == "--rng") {
|
238 |
+
if (++i >= argc) {
|
239 |
+
invalid_arg = true;
|
240 |
+
break;
|
241 |
+
}
|
242 |
+
std::string rng_type_str = argv[i];
|
243 |
+
if (rng_type_str == "std_default") {
|
244 |
+
opt->rng_type = STD_DEFAULT_RNG;
|
245 |
+
} else if (rng_type_str == "cuda") {
|
246 |
+
opt->rng_type = CUDA_RNG;
|
247 |
+
} else {
|
248 |
+
invalid_arg = true;
|
249 |
+
break;
|
250 |
+
}
|
251 |
+
} else if (arg == "--schedule") {
|
252 |
+
if (++i >= argc) {
|
253 |
+
invalid_arg = true;
|
254 |
+
break;
|
255 |
+
}
|
256 |
+
const char* schedule_selected = argv[i];
|
257 |
+
int schedule_found = -1;
|
258 |
+
for (int d = 0; d < N_SCHEDULES; d++) {
|
259 |
+
if (!strcmp(schedule_selected, schedule_str[d])) {
|
260 |
+
schedule_found = d;
|
261 |
+
}
|
262 |
+
}
|
263 |
+
if (schedule_found == -1) {
|
264 |
+
invalid_arg = true;
|
265 |
+
break;
|
266 |
+
}
|
267 |
+
opt->schedule = (Schedule)schedule_found;
|
268 |
+
} else if (arg == "-s" || arg == "--seed") {
|
269 |
+
if (++i >= argc) {
|
270 |
+
invalid_arg = true;
|
271 |
+
break;
|
272 |
+
}
|
273 |
+
opt->seed = std::stoll(argv[i]);
|
274 |
+
} else if (arg == "--sampling-method") {
|
275 |
+
if (++i >= argc) {
|
276 |
+
invalid_arg = true;
|
277 |
+
break;
|
278 |
+
}
|
279 |
+
const char* sample_method_selected = argv[i];
|
280 |
+
int sample_method_found = -1;
|
281 |
+
for (int m = 0; m < N_SAMPLE_METHODS; m++) {
|
282 |
+
if (!strcmp(sample_method_selected, sample_method_str[m])) {
|
283 |
+
sample_method_found = m;
|
284 |
+
}
|
285 |
+
}
|
286 |
+
if (sample_method_found == -1) {
|
287 |
+
invalid_arg = true;
|
288 |
+
break;
|
289 |
+
}
|
290 |
+
opt->sample_method = (SampleMethod)sample_method_found;
|
291 |
+
} else if (arg == "-h" || arg == "--help") {
|
292 |
+
print_usage(argc, argv);
|
293 |
+
exit(0);
|
294 |
+
} else if (arg == "-v" || arg == "--verbose") {
|
295 |
+
opt->verbose = true;
|
296 |
+
} else {
|
297 |
+
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
298 |
+
print_usage(argc, argv);
|
299 |
+
exit(1);
|
300 |
+
}
|
301 |
+
if (invalid_arg) {
|
302 |
+
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
303 |
+
print_usage(argc, argv);
|
304 |
+
exit(1);
|
305 |
+
}
|
306 |
+
}
|
307 |
+
|
308 |
+
if (opt->n_threads <= 0) {
|
309 |
+
opt->n_threads = get_num_physical_cores();
|
310 |
+
}
|
311 |
+
|
312 |
+
if (opt->mode != TXT2IMG && opt->mode != IMG2IMG) {
|
313 |
+
fprintf(stderr, "error: invalid mode %s, must be one of ['%s', '%s']\n",
|
314 |
+
opt->mode.c_str(), TXT2IMG, IMG2IMG);
|
315 |
+
exit(1);
|
316 |
+
}
|
317 |
+
|
318 |
+
if (opt->prompt.length() == 0) {
|
319 |
+
fprintf(stderr, "error: the following arguments are required: prompt\n");
|
320 |
+
print_usage(argc, argv);
|
321 |
+
exit(1);
|
322 |
+
}
|
323 |
+
|
324 |
+
if (opt->model_path.length() == 0) {
|
325 |
+
fprintf(stderr, "error: the following arguments are required: model_path\n");
|
326 |
+
print_usage(argc, argv);
|
327 |
+
exit(1);
|
328 |
+
}
|
329 |
+
|
330 |
+
if (opt->mode == IMG2IMG && opt->init_img.length() == 0) {
|
331 |
+
fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n");
|
332 |
+
print_usage(argc, argv);
|
333 |
+
exit(1);
|
334 |
+
}
|
335 |
+
|
336 |
+
if (opt->output_path.length() == 0) {
|
337 |
+
fprintf(stderr, "error: the following arguments are required: output_path\n");
|
338 |
+
print_usage(argc, argv);
|
339 |
+
exit(1);
|
340 |
+
}
|
341 |
+
|
342 |
+
if (opt->w <= 0 || opt->w % 64 != 0) {
|
343 |
+
fprintf(stderr, "error: the width must be a multiple of 64\n");
|
344 |
+
exit(1);
|
345 |
+
}
|
346 |
+
|
347 |
+
if (opt->h <= 0 || opt->h % 64 != 0) {
|
348 |
+
fprintf(stderr, "error: the height must be a multiple of 64\n");
|
349 |
+
exit(1);
|
350 |
+
}
|
351 |
+
|
352 |
+
if (opt->sample_steps <= 0) {
|
353 |
+
fprintf(stderr, "error: the sample_steps must be greater than 0\n");
|
354 |
+
exit(1);
|
355 |
+
}
|
356 |
+
|
357 |
+
if (opt->strength < 0.f || opt->strength > 1.f) {
|
358 |
+
fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n");
|
359 |
+
exit(1);
|
360 |
+
}
|
361 |
+
|
362 |
+
if (opt->seed < 0) {
|
363 |
+
srand((int)time(NULL));
|
364 |
+
opt->seed = rand();
|
365 |
+
}
|
366 |
+
}
|
367 |
+
|
368 |
+
// Return the file-name component of a path, accepting both '/' and '\\' as
// separators (the model path may come from either a POSIX or a Windows shell).
// Returns the whole string when it contains no separator.
std::string basename(const std::string& path) {
    // BUGFIX: the old code looked for '/' first and only fell back to '\\',
    // so a mixed path like "a/b\\c.txt" yielded "b\\c.txt". Searching for
    // either separator in one pass handles every combination.
    size_t pos = path.find_last_of("/\\");
    if (pos != std::string::npos) {
        return path.substr(pos + 1);
    }
    return path;
}
|
379 |
+
|
380 |
+
int main(int argc, const char* argv[]) {
|
381 |
+
Option opt;
|
382 |
+
parse_args(argc, argv, &opt);
|
383 |
+
|
384 |
+
if (opt.verbose) {
|
385 |
+
opt.print();
|
386 |
+
printf("%s", sd_get_system_info().c_str());
|
387 |
+
set_sd_log_level(SDLogLevel::DEBUG);
|
388 |
+
}
|
389 |
+
|
390 |
+
bool vae_decode_only = true;
|
391 |
+
std::vector<uint8_t> init_img;
|
392 |
+
if (opt.mode == IMG2IMG) {
|
393 |
+
vae_decode_only = false;
|
394 |
+
|
395 |
+
int c = 0;
|
396 |
+
unsigned char* img_data = stbi_load(opt.init_img.c_str(), &opt.w, &opt.h, &c, 3);
|
397 |
+
if (img_data == NULL) {
|
398 |
+
fprintf(stderr, "load image from '%s' failed\n", opt.init_img.c_str());
|
399 |
+
return 1;
|
400 |
+
}
|
401 |
+
if (c != 3) {
|
402 |
+
fprintf(stderr, "input image must be a 3 channels RGB image, but got %d channels\n", c);
|
403 |
+
free(img_data);
|
404 |
+
return 1;
|
405 |
+
}
|
406 |
+
if (opt.w <= 0 || opt.w % 64 != 0) {
|
407 |
+
fprintf(stderr, "error: the width of image must be a multiple of 64\n");
|
408 |
+
free(img_data);
|
409 |
+
return 1;
|
410 |
+
}
|
411 |
+
if (opt.h <= 0 || opt.h % 64 != 0) {
|
412 |
+
fprintf(stderr, "error: the height of image must be a multiple of 64\n");
|
413 |
+
free(img_data);
|
414 |
+
return 1;
|
415 |
+
}
|
416 |
+
init_img.assign(img_data, img_data + (opt.w * opt.h * c));
|
417 |
+
}
|
418 |
+
|
419 |
+
StableDiffusion sd(opt.n_threads, vae_decode_only, true, opt.rng_type);
|
420 |
+
if (!sd.load_from_file(opt.model_path, opt.schedule)) {
|
421 |
+
return 1;
|
422 |
+
}
|
423 |
+
|
424 |
+
std::vector<uint8_t> img;
|
425 |
+
if (opt.mode == TXT2IMG) {
|
426 |
+
img = sd.txt2img(opt.prompt,
|
427 |
+
opt.negative_prompt,
|
428 |
+
opt.cfg_scale,
|
429 |
+
opt.w,
|
430 |
+
opt.h,
|
431 |
+
opt.sample_method,
|
432 |
+
opt.sample_steps,
|
433 |
+
opt.seed);
|
434 |
+
} else {
|
435 |
+
img = sd.img2img(init_img,
|
436 |
+
opt.prompt,
|
437 |
+
opt.negative_prompt,
|
438 |
+
opt.cfg_scale,
|
439 |
+
opt.w,
|
440 |
+
opt.h,
|
441 |
+
opt.sample_method,
|
442 |
+
opt.sample_steps,
|
443 |
+
opt.strength,
|
444 |
+
opt.seed);
|
445 |
+
}
|
446 |
+
|
447 |
+
if (img.size() == 0) {
|
448 |
+
fprintf(stderr, "generate failed\n");
|
449 |
+
return 1;
|
450 |
+
}
|
451 |
+
|
452 |
+
std::string parameter_string = opt.prompt + "\n";
|
453 |
+
if (opt.negative_prompt.size() != 0) {
|
454 |
+
parameter_string += "Negative prompt: " + opt.negative_prompt + "\n";
|
455 |
+
}
|
456 |
+
parameter_string += "Steps: " + std::to_string(opt.sample_steps) + ", ";
|
457 |
+
parameter_string += "CFG scale: " + std::to_string(opt.cfg_scale) + ", ";
|
458 |
+
parameter_string += "Seed: " + std::to_string(opt.seed) + ", ";
|
459 |
+
parameter_string += "Size: " + std::to_string(opt.w) + "x" + std::to_string(opt.h) + ", ";
|
460 |
+
parameter_string += "Model: " + basename(opt.model_path) + ", ";
|
461 |
+
parameter_string += "RNG: " + std::string(rng_type_to_str[opt.rng_type]) + ", ";
|
462 |
+
parameter_string += "Sampler: " + std::string(sample_method_str[opt.sample_method]);
|
463 |
+
if (opt.schedule == KARRAS) {
|
464 |
+
parameter_string += " karras";
|
465 |
+
}
|
466 |
+
parameter_string += ", ";
|
467 |
+
parameter_string += "Version: stable-diffusion.cpp";
|
468 |
+
|
469 |
+
stbi_write_png(opt.output_path.c_str(), opt.w, opt.h, 3, img.data(), 0, parameter_string.c_str());
|
470 |
+
printf("save result image to '%s'\n", opt.output_path.c_str());
|
471 |
+
|
472 |
+
return 0;
|
473 |
+
}
|
stable-diffusion.cpp/examples/stb_image.h
ADDED
The diff for this file is too large to render.
See raw diff
|
|
stable-diffusion.cpp/examples/stb_image_write.h
ADDED
@@ -0,0 +1,1741 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/* stb_image_write - v1.16 - public domain - http://nothings.org/stb
|
2 |
+
writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015
|
3 |
+
no warranty implied; use at your own risk
|
4 |
+
|
5 |
+
Before #including,
|
6 |
+
|
7 |
+
#define STB_IMAGE_WRITE_IMPLEMENTATION
|
8 |
+
|
9 |
+
in the file that you want to have the implementation.
|
10 |
+
|
11 |
+
Will probably not work correctly with strict-aliasing optimizations.
|
12 |
+
|
13 |
+
ABOUT:
|
14 |
+
|
15 |
+
This header file is a library for writing images to C stdio or a callback.
|
16 |
+
|
17 |
+
The PNG output is not optimal; it is 20-50% larger than the file
|
18 |
+
written by a decent optimizing implementation; though providing a custom
|
19 |
+
zlib compress function (see STBIW_ZLIB_COMPRESS) can mitigate that.
|
20 |
+
This library is designed for source code compactness and simplicity,
|
21 |
+
not optimal image file size or run-time performance.
|
22 |
+
|
23 |
+
BUILDING:
|
24 |
+
|
25 |
+
You can #define STBIW_ASSERT(x) before the #include to avoid using assert.h.
|
26 |
+
You can #define STBIW_MALLOC(), STBIW_REALLOC(), and STBIW_FREE() to replace
|
27 |
+
malloc,realloc,free.
|
28 |
+
You can #define STBIW_MEMMOVE() to replace memmove()
|
29 |
+
You can #define STBIW_ZLIB_COMPRESS to use a custom zlib-style compress function
|
30 |
+
for PNG compression (instead of the builtin one), it must have the following signature:
|
31 |
+
unsigned char * my_compress(unsigned char *data, int data_len, int *out_len, int quality);
|
32 |
+
The returned data will be freed with STBIW_FREE() (free() by default),
|
33 |
+
so it must be heap allocated with STBIW_MALLOC() (malloc() by default),
|
34 |
+
|
35 |
+
UNICODE:
|
36 |
+
|
37 |
+
If compiling for Windows and you wish to use Unicode filenames, compile
|
38 |
+
with
|
39 |
+
#define STBIW_WINDOWS_UTF8
|
40 |
+
and pass utf8-encoded filenames. Call stbiw_convert_wchar_to_utf8 to convert
|
41 |
+
Windows wchar_t filenames to utf8.
|
42 |
+
|
43 |
+
USAGE:
|
44 |
+
|
45 |
+
There are five functions, one for each image file format:
|
46 |
+
|
47 |
+
int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes);
|
48 |
+
int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
|
49 |
+
int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
|
50 |
+
int stbi_write_jpg(char const *filename, int w, int h, int comp, const void *data, int quality);
|
51 |
+
int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
|
52 |
+
|
53 |
+
void stbi_flip_vertically_on_write(int flag); // flag is non-zero to flip data vertically
|
54 |
+
|
55 |
+
There are also five equivalent functions that use an arbitrary write function. You are
|
56 |
+
expected to open/close your file-equivalent before and after calling these:
|
57 |
+
|
58 |
+
int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data, int stride_in_bytes);
|
59 |
+
int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data);
|
60 |
+
int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data);
|
61 |
+
int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
|
62 |
+
int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality);
|
63 |
+
|
64 |
+
where the callback is:
|
65 |
+
void stbi_write_func(void *context, void *data, int size);
|
66 |
+
|
67 |
+
You can configure it with these global variables:
|
68 |
+
int stbi_write_tga_with_rle; // defaults to true; set to 0 to disable RLE
|
69 |
+
int stbi_write_png_compression_level; // defaults to 8; set to higher for more compression
|
70 |
+
int stbi_write_force_png_filter; // defaults to -1; set to 0..5 to force a filter mode
|
71 |
+
|
72 |
+
|
73 |
+
You can define STBI_WRITE_NO_STDIO to disable the file variant of these
|
74 |
+
functions, so the library will not use stdio.h at all. However, this will
|
75 |
+
also disable HDR writing, because it requires stdio for formatted output.
|
76 |
+
|
77 |
+
Each function returns 0 on failure and non-0 on success.
|
78 |
+
|
79 |
+
The functions create an image file defined by the parameters. The image
|
80 |
+
is a rectangle of pixels stored from left-to-right, top-to-bottom.
|
81 |
+
Each pixel contains 'comp' channels of data stored interleaved with 8-bits
|
82 |
+
per channel, in the following order: 1=Y, 2=YA, 3=RGB, 4=RGBA. (Y is
|
83 |
+
monochrome color.) The rectangle is 'w' pixels wide and 'h' pixels tall.
|
84 |
+
The *data pointer points to the first byte of the top-left-most pixel.
|
85 |
+
For PNG, "stride_in_bytes" is the distance in bytes from the first byte of
|
86 |
+
a row of pixels to the first byte of the next row of pixels.
|
87 |
+
|
88 |
+
PNG creates output files with the same number of components as the input.
|
89 |
+
The BMP format expands Y to RGB in the file format and does not
|
90 |
+
output alpha.
|
91 |
+
|
92 |
+
PNG supports writing rectangles of data even when the bytes storing rows of
|
93 |
+
data are not consecutive in memory (e.g. sub-rectangles of a larger image),
|
94 |
+
by supplying the stride between the beginning of adjacent rows. The other
|
95 |
+
formats do not. (Thus you cannot write a native-format BMP through the BMP
|
96 |
+
writer, both because it is in BGR order and because it may have padding
|
97 |
+
at the end of the line.)
|
98 |
+
|
99 |
+
PNG allows you to set the deflate compression level by setting the global
|
100 |
+
variable 'stbi_write_png_compression_level' (it defaults to 8).
|
101 |
+
|
102 |
+
HDR expects linear float data. Since the format is always 32-bit rgb(e)
|
103 |
+
data, alpha (if provided) is discarded, and for monochrome data it is
|
104 |
+
replicated across all three channels.
|
105 |
+
|
106 |
+
TGA supports RLE or non-RLE compressed data. To use non-RLE-compressed
|
107 |
+
data, set the global variable 'stbi_write_tga_with_rle' to 0.
|
108 |
+
|
109 |
+
JPEG does ignore alpha channels in input data; quality is between 1 and 100.
|
110 |
+
Higher quality looks better but results in a bigger image.
|
111 |
+
JPEG baseline (no JPEG progressive).
|
112 |
+
|
113 |
+
CREDITS:
|
114 |
+
|
115 |
+
|
116 |
+
Sean Barrett - PNG/BMP/TGA
|
117 |
+
Baldur Karlsson - HDR
|
118 |
+
Jean-Sebastien Guay - TGA monochrome
|
119 |
+
Tim Kelsey - misc enhancements
|
120 |
+
Alan Hickman - TGA RLE
|
121 |
+
Emmanuel Julien - initial file IO callback implementation
|
122 |
+
Jon Olick - original jo_jpeg.cpp code
|
123 |
+
Daniel Gibson - integrate JPEG, allow external zlib
|
124 |
+
Aarni Koskela - allow choosing PNG filter
|
125 |
+
|
126 |
+
bugfixes:
|
127 |
+
github:Chribba
|
128 |
+
Guillaume Chereau
|
129 |
+
github:jry2
|
130 |
+
github:romigrou
|
131 |
+
Sergio Gonzalez
|
132 |
+
Jonas Karlsson
|
133 |
+
Filip Wasil
|
134 |
+
Thatcher Ulrich
|
135 |
+
github:poppolopoppo
|
136 |
+
Patrick Boettcher
|
137 |
+
github:xeekworx
|
138 |
+
Cap Petschulat
|
139 |
+
Simon Rodriguez
|
140 |
+
Ivan Tikhonov
|
141 |
+
github:ignotion
|
142 |
+
Adam Schackart
|
143 |
+
Andrew Kensler
|
144 |
+
|
145 |
+
LICENSE
|
146 |
+
|
147 |
+
See end of file for license information.
|
148 |
+
|
149 |
+
*/
|
150 |
+
|
151 |
+
#ifndef INCLUDE_STB_IMAGE_WRITE_H
|
152 |
+
#define INCLUDE_STB_IMAGE_WRITE_H
|
153 |
+
|
154 |
+
#include <stdlib.h>
|
155 |
+
|
156 |
+
// if STB_IMAGE_WRITE_STATIC causes problems, try defining STBIWDEF to 'inline' or 'static inline'
|
157 |
+
#ifndef STBIWDEF
|
158 |
+
#ifdef STB_IMAGE_WRITE_STATIC
|
159 |
+
#define STBIWDEF static
|
160 |
+
#else
|
161 |
+
#ifdef __cplusplus
|
162 |
+
#define STBIWDEF extern "C"
|
163 |
+
#else
|
164 |
+
#define STBIWDEF extern
|
165 |
+
#endif
|
166 |
+
#endif
|
167 |
+
#endif
|
168 |
+
|
169 |
+
#ifndef STB_IMAGE_WRITE_STATIC // C++ forbids static forward declarations
|
170 |
+
STBIWDEF int stbi_write_tga_with_rle;
|
171 |
+
STBIWDEF int stbi_write_png_compression_level;
|
172 |
+
STBIWDEF int stbi_write_force_png_filter;
|
173 |
+
#endif
|
174 |
+
|
175 |
+
#ifndef STBI_WRITE_NO_STDIO
|
176 |
+
STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes, const char* parameters = NULL);
|
177 |
+
STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
|
178 |
+
STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
|
179 |
+
STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
|
180 |
+
STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality);
|
181 |
+
|
182 |
+
#ifdef STBIW_WINDOWS_UTF8
|
183 |
+
STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
|
184 |
+
#endif
|
185 |
+
#endif
|
186 |
+
|
187 |
+
typedef void stbi_write_func(void *context, void *data, int size);
|
188 |
+
|
189 |
+
STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data, int stride_in_bytes);
|
190 |
+
STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data);
|
191 |
+
STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data);
|
192 |
+
STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
|
193 |
+
STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality);
|
194 |
+
|
195 |
+
STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
|
196 |
+
|
197 |
+
#endif//INCLUDE_STB_IMAGE_WRITE_H
|
198 |
+
|
199 |
+
#ifdef STB_IMAGE_WRITE_IMPLEMENTATION
|
200 |
+
|
201 |
+
#ifdef _WIN32
|
202 |
+
#ifndef _CRT_SECURE_NO_WARNINGS
|
203 |
+
#define _CRT_SECURE_NO_WARNINGS
|
204 |
+
#endif
|
205 |
+
#ifndef _CRT_NONSTDC_NO_DEPRECATE
|
206 |
+
#define _CRT_NONSTDC_NO_DEPRECATE
|
207 |
+
#endif
|
208 |
+
#endif
|
209 |
+
|
210 |
+
#ifndef STBI_WRITE_NO_STDIO
|
211 |
+
#include <stdio.h>
|
212 |
+
#endif // STBI_WRITE_NO_STDIO
|
213 |
+
|
214 |
+
#include <stdarg.h>
|
215 |
+
#include <stdlib.h>
|
216 |
+
#include <string.h>
|
217 |
+
#include <math.h>
|
218 |
+
|
219 |
+
#if defined(STBIW_MALLOC) && defined(STBIW_FREE) && (defined(STBIW_REALLOC) || defined(STBIW_REALLOC_SIZED))
|
220 |
+
// ok
|
221 |
+
#elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) && !defined(STBIW_REALLOC) && !defined(STBIW_REALLOC_SIZED)
|
222 |
+
// ok
|
223 |
+
#else
|
224 |
+
#error "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC (or STBIW_REALLOC_SIZED)."
|
225 |
+
#endif
|
226 |
+
|
227 |
+
#ifndef STBIW_MALLOC
|
228 |
+
#define STBIW_MALLOC(sz) malloc(sz)
|
229 |
+
#define STBIW_REALLOC(p,newsz) realloc(p,newsz)
|
230 |
+
#define STBIW_FREE(p) free(p)
|
231 |
+
#endif
|
232 |
+
|
233 |
+
#ifndef STBIW_REALLOC_SIZED
|
234 |
+
#define STBIW_REALLOC_SIZED(p,oldsz,newsz) STBIW_REALLOC(p,newsz)
|
235 |
+
#endif
|
236 |
+
|
237 |
+
|
238 |
+
#ifndef STBIW_MEMMOVE
|
239 |
+
#define STBIW_MEMMOVE(a,b,sz) memmove(a,b,sz)
|
240 |
+
#endif
|
241 |
+
|
242 |
+
|
243 |
+
#ifndef STBIW_ASSERT
|
244 |
+
#include <assert.h>
|
245 |
+
#define STBIW_ASSERT(x) assert(x)
|
246 |
+
#endif
|
247 |
+
|
248 |
+
#define STBIW_UCHAR(x) (unsigned char) ((x) & 0xff)
|
249 |
+
|
250 |
+
// Writer tunables. With STB_IMAGE_WRITE_STATIC the whole library gets internal
// linkage, so the globals become 'static' too; otherwise they are ordinary
// externally-visible globals (declared near the top of this header).
#ifdef STB_IMAGE_WRITE_STATIC
static int stbi_write_png_compression_level = 8;   // zlib level for PNG deflate
static int stbi_write_tga_with_rle = 1;            // non-zero: RLE-compress TGA output
static int stbi_write_force_png_filter = -1;       // -1: auto-select PNG row filter; 0..5 forces one
#else
int stbi_write_png_compression_level = 8;
int stbi_write_tga_with_rle = 1;
int stbi_write_force_png_filter = -1;
#endif

// Module-wide flag: when non-zero, image rows are written bottom-to-top.
static int stbi__flip_vertically_on_write = 0;
|
261 |
+
|
262 |
+
// Set the global vertical-flip flag: when 'flag' is non-zero, all subsequent
// writes emit rows bottom-to-top (useful for OpenGL-style framebuffer dumps).
// Not thread-safe: it mutates module-wide state shared by every writer call.
STBIWDEF void stbi_flip_vertically_on_write(int flag)
{
   stbi__flip_vertically_on_write = flag;
}
|
266 |
+
|
267 |
+
// State for callback-based output: emitted bytes are handed to 'func', staged
// through a small fixed buffer so tiny writes are batched.
typedef struct
{
   stbi_write_func *func;      // user-supplied sink callback
   void *context;              // opaque pointer forwarded to 'func'
   unsigned char buffer[64];   // staging buffer for small writes
   int buf_used;               // number of valid bytes currently in 'buffer'
} stbi__write_context;
|
274 |
+
|
275 |
+
// initialize a callback-based context
|
276 |
+
// initialize a callback-based context
static void stbi__start_write_callbacks(stbi__write_context *s, stbi_write_func *c, void *context)
{
   s->func    = c;
   s->context = context;
}
|
281 |
+
|
282 |
+
#ifndef STBI_WRITE_NO_STDIO
|
283 |
+
|
284 |
+
// stdio backend for the write callback: 'context' is a FILE*.
// NOTE(review): the fwrite return value is ignored, so short writes /
// disk-full errors are silently dropped here.
static void stbi__stdio_write(void *context, void *data, int size)
{
   fwrite(data,1,size,(FILE*) context);
}
|
288 |
+
|
289 |
+
#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
|
290 |
+
#ifdef __cplusplus
|
291 |
+
#define STBIW_EXTERN extern "C"
|
292 |
+
#else
|
293 |
+
#define STBIW_EXTERN extern
|
294 |
+
#endif
|
295 |
+
STBIW_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide);
|
296 |
+
STBIW_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default);
|
297 |
+
|
298 |
+
STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input)
|
299 |
+
{
|
300 |
+
return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
|
301 |
+
}
|
302 |
+
#endif
|
303 |
+
|
304 |
+
// fopen wrapper. On Windows with STBIW_WINDOWS_UTF8 the filename and mode
// are treated as UTF-8 and converted to wide characters first; on MSVC
// (>= VS2005) the *_s variants are used to silence deprecation warnings.
// Returns NULL on any failure.
static FILE *stbiw__fopen(char const *filename, char const *mode)
{
   FILE *f;
#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
   wchar_t wMode[64];
   wchar_t wFilename[1024];
   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
      return 0;

   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
      return 0;

#if defined(_MSC_VER) && _MSC_VER >= 1400
   if (0 != _wfopen_s(&f, wFilename, wMode))
      f = 0;
#else
   f = _wfopen(wFilename, wMode);
#endif

#elif defined(_MSC_VER) && _MSC_VER >= 1400
   if (0 != fopen_s(&f, filename, mode))
      f=0;
#else
   f = fopen(filename, mode);
#endif
   return f;
}
|
331 |
+
|
332 |
+
// Open 'filename' for binary writing and point the context at the stdio
// writer. Returns nonzero on success; on failure the context still gets a
// NULL FILE* stored, but the caller is expected to bail on the 0 return.
static int stbi__start_write_file(stbi__write_context *s, const char *filename)
{
   FILE *f = stbiw__fopen(filename, "wb");
   stbi__start_write_callbacks(s, stbi__stdio_write, (void *) f);
   return f != NULL;
}
|
338 |
+
|
339 |
+
// Close the FILE* opened by stbi__start_write_file.
static void stbi__end_write_file(stbi__write_context *s)
{
   fclose((FILE *)s->context);
}
|
343 |
+
|
344 |
+
#endif // !STBI_WRITE_NO_STDIO
|
345 |
+
|
346 |
+
typedef unsigned int stbiw_uint32;
// Compile-time size check: the array length is -1 (a compile error) unless
// stbiw_uint32 is exactly 4 bytes.
typedef int stb_image_write_test[sizeof(stbiw_uint32)==4 ? 1 : -1];
|
348 |
+
|
349 |
+
// Tiny "printf" for binary headers: each character of 'fmt' describes the
// next vararg -- ' ' is ignored, '1'/'2'/'4' write a 1-, 2- or 4-byte value
// in little-endian byte order through the context callback. Any other
// format character asserts and aborts the format loop.
static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v)
{
   while (*fmt) {
      switch (*fmt++) {
         case ' ': break;
         case '1': { unsigned char x = STBIW_UCHAR(va_arg(v, int));
                     s->func(s->context,&x,1);
                     break; }
         case '2': { int x = va_arg(v,int);
                     unsigned char b[2];
                     b[0] = STBIW_UCHAR(x);
                     b[1] = STBIW_UCHAR(x>>8);
                     s->func(s->context,b,2);
                     break; }
         case '4': { stbiw_uint32 x = va_arg(v,int);
                     unsigned char b[4];
                     b[0]=STBIW_UCHAR(x);
                     b[1]=STBIW_UCHAR(x>>8);
                     b[2]=STBIW_UCHAR(x>>16);
                     b[3]=STBIW_UCHAR(x>>24);
                     s->func(s->context,b,4);
                     break; }
         default:
            STBIW_ASSERT(0);
            return;
      }
   }
}
|
377 |
+
|
378 |
+
// Varargs front end for stbiw__writefv.
static void stbiw__writef(stbi__write_context *s, const char *fmt, ...)
{
   va_list v;
   va_start(v, fmt);
   stbiw__writefv(s, fmt, v);
   va_end(v);
}
|
385 |
+
|
386 |
+
// Flush the staging buffer (if non-empty) through the user callback and
// reset it.
static void stbiw__write_flush(stbi__write_context *s)
{
   if (s->buf_used) {
      s->func(s->context, &s->buffer, s->buf_used);
      s->buf_used = 0;
   }
}
|
393 |
+
|
394 |
+
// Write a single byte directly through the callback (bypasses the staging
// buffer).
static void stbiw__putc(stbi__write_context *s, unsigned char c)
{
   s->func(s->context, &c, 1);
}
|
398 |
+
|
399 |
+
// Append one byte to the staging buffer, flushing first if it is full.
static void stbiw__write1(stbi__write_context *s, unsigned char a)
{
   if ((size_t)s->buf_used + 1 > sizeof(s->buffer))
      stbiw__write_flush(s);
   s->buffer[s->buf_used++] = a;
}
|
405 |
+
|
406 |
+
// Append three bytes to the staging buffer, flushing first if they would
// not all fit.
static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c)
{
   int n;
   if ((size_t)s->buf_used + 3 > sizeof(s->buffer))
      stbiw__write_flush(s);
   n = s->buf_used;
   s->buf_used = n+3;
   s->buffer[n+0] = a;
   s->buffer[n+1] = b;
   s->buffer[n+2] = c;
}
|
417 |
+
|
418 |
+
// Write one pixel ('comp' channels at 'd'). rgb_dir selects RGB (+1) vs
// BGR (-1) channel order. write_alpha < 0 writes the alpha byte before the
// color bytes, > 0 after, 0 composites the color against a pink background
// instead of emitting alpha. expand_mono replicates a single channel to
// three output bytes (used for monochrome BMP).
static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, int write_alpha, int expand_mono, unsigned char *d)
{
   unsigned char bg[3] = { 255, 0, 255}, px[3];
   int k;

   if (write_alpha < 0)
      stbiw__write1(s, d[comp - 1]);

   switch (comp) {
      case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case
      case 1:
         if (expand_mono)
            stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp
         else
            stbiw__write1(s, d[0]); // monochrome TGA
         break;
      case 4:
         if (!write_alpha) {
            // composite against pink background
            for (k = 0; k < 3; ++k)
               px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255;
            stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]);
            break;
         }
         /* FALLTHROUGH */
      case 3:
         // index 1 is always green; 1 +/- rgb_dir picks red/blue vs blue/red
         stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]);
         break;
   }
   if (write_alpha > 0)
      stbiw__write1(s, d[comp - 1]);
}
|
450 |
+
|
451 |
+
// Write an x*y block of pixels. vdir selects top-down (+1) vs bottom-up
// (-1) row order and is negated again when the global vertical-flip flag is
// set. Each completed row is followed by 'scanline_pad' zero bytes (BMP row
// padding). Does nothing when y <= 0.
static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono)
{
   stbiw_uint32 zero = 0;
   int i,j, j_end;

   if (y <= 0)
      return;

   if (stbi__flip_vertically_on_write)
      vdir *= -1;

   if (vdir < 0) {
      j_end = -1; j = y-1;
   } else {
      j_end = y; j = 0;
   }

   for (; j != j_end; j += vdir) {
      for (i=0; i < x; ++i) {
         unsigned char *d = (unsigned char *) data + (j*x+i)*comp;
         stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d);
      }
      // flush buffered pixel bytes before emitting the row padding directly
      stbiw__write_flush(s);
      s->func(s->context, &zero, scanline_pad);
   }
}
|
477 |
+
|
478 |
+
// Emit a file header described by 'fmt' (see stbiw__writefv) followed by
// the pixel data. Returns 0 for negative dimensions, 1 otherwise.
static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono, void *data, int alpha, int pad, const char *fmt, ...)
{
   if (y < 0 || x < 0) {
      return 0;
   } else {
      va_list v;
      va_start(v, fmt);
      stbiw__writefv(s, fmt, v);
      va_end(v);
      stbiw__write_pixels(s,rgb_dir,vdir,x,y,comp,data,alpha,pad, expand_mono);
      return 1;
   }
}
|
491 |
+
|
492 |
+
// Write a BMP through the context: 24bpp BI_RGB for comp != 4, or 32bpp
// BI_BITFIELDS with a BITMAPV4 header for comp == 4 so the alpha channel
// survives in common readers. Returns stbiw__outfile's result.
static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data)
{
   if (comp != 4) {
      // write RGB bitmap
      int pad = (-x*3) & 3; // rows are padded to a multiple of 4 bytes
      return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad,
              "11 4 22 4" "4 44 22 444444",
              'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40, // file header
               40, x,y, 1,24, 0,0,0,0,0,0);            // bitmap header
   } else {
      // RGBA bitmaps need a v4 header
      // use BI_BITFIELDS mode with 32bpp and alpha mask
      // (straight BI_RGB with alpha mask doesn't work in most readers)
      return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *)data,1,0,
         "11 4 22 4" "4 44 22 444444 4444 4 444 444 444 444",
         'B', 'M', 14+108+x*y*4, 0, 0, 14+108, // file header
         108, x,y, 1,32, 3,0,0,0,0,0, 0xff0000,0xff00,0xff,0xff000000u, 0, 0,0,0, 0,0,0, 0,0,0, 0,0,0); // bitmap V4 header
   }
}
|
511 |
+
|
512 |
+
// Public API: write a BMP through a user-supplied callback.
STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
{
   stbi__write_context s = { 0 };
   stbi__start_write_callbacks(&s, func, context);
   return stbi_write_bmp_core(&s, x, y, comp, data);
}
|
518 |
+
|
519 |
+
#ifndef STBI_WRITE_NO_STDIO
|
520 |
+
// Public API: write a BMP to 'filename'. Returns 0 if the file cannot be
// opened or the core writer rejects the parameters.
STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data)
{
   stbi__write_context s = { 0 };
   if (stbi__start_write_file(&s,filename)) {
      int r = stbi_write_bmp_core(&s, x, y, comp, data);
      stbi__end_write_file(&s);
      return r;
   } else
      return 0;
}
|
530 |
+
#endif //!STBI_WRITE_NO_STDIO
|
531 |
+
|
532 |
+
// Write a TGA through the context. Without RLE (stbi_write_tga_with_rle
// == 0) the whole image goes through stbiw__outfile; otherwise each row is
// encoded as TGA RLE packets: a "raw" packet (header byte = len-1 followed
// by len literal pixels) for runs of differing pixels, or a "run" packet
// (header byte = len+127, i.e. len-1 with the high bit set, followed by one
// pixel) for repeats. Packet length is capped at 128 pixels.
static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, void *data)
{
   int has_alpha = (comp == 2 || comp == 4);
   int colorbytes = has_alpha ? comp-1 : comp;
   int format = colorbytes < 2 ? 3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3

   if (y < 0 || x < 0)
      return 0;

   if (!stbi_write_tga_with_rle) {
      return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void *) data, has_alpha, 0,
         "111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8);
   } else {
      int i,j,k;
      int jend, jdir;

      // image type 'format+8' marks the RLE variant in the TGA header
      stbiw__writef(s, "111 221 2222 11", 0,0,format+8, 0,0,0, 0,0,x,y, (colorbytes + has_alpha) * 8, has_alpha * 8);

      if (stbi__flip_vertically_on_write) {
         j = 0;
         jend = y;
         jdir = 1;
      } else {
         j = y-1;
         jend = -1;
         jdir = -1;
      }
      for (; j != jend; j += jdir) {
         unsigned char *row = (unsigned char *) data + j * x * comp;
         int len;

         for (i = 0; i < x; i += len) {
            unsigned char *begin = row + i * comp;
            int diff = 1;
            len = 1;

            if (i < x - 1) {
               ++len;
               diff = memcmp(begin, row + (i + 1) * comp, comp);
               if (diff) {
                  // extend the "raw" packet while consecutive pixels keep differing
                  const unsigned char *prev = begin;
                  for (k = i + 2; k < x && len < 128; ++k) {
                     if (memcmp(prev, row + k * comp, comp)) {
                        prev += comp;
                        ++len;
                     } else {
                        --len;
                        break;
                     }
                  }
               } else {
                  // extend the "run" packet while pixels keep matching 'begin'
                  for (k = i + 2; k < x && len < 128; ++k) {
                     if (!memcmp(begin, row + k * comp, comp)) {
                        ++len;
                     } else {
                        break;
                     }
                  }
               }
            }

            if (diff) {
               unsigned char header = STBIW_UCHAR(len - 1);
               stbiw__write1(s, header);
               for (k = 0; k < len; ++k) {
                  stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp);
               }
            } else {
               unsigned char header = STBIW_UCHAR(len - 129);
               stbiw__write1(s, header);
               stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin);
            }
         }
      }
      stbiw__write_flush(s);
   }
   return 1;
}
|
610 |
+
|
611 |
+
// Public API: write a TGA through a user-supplied callback.
STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
{
   stbi__write_context s = { 0 };
   stbi__start_write_callbacks(&s, func, context);
   return stbi_write_tga_core(&s, x, y, comp, (void *) data);
}
|
617 |
+
|
618 |
+
#ifndef STBI_WRITE_NO_STDIO
|
619 |
+
// Public API: write a TGA to 'filename'. Returns 0 if the file cannot be
// opened or the core writer rejects the parameters.
STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data)
{
   stbi__write_context s = { 0 };
   if (stbi__start_write_file(&s,filename)) {
      int r = stbi_write_tga_core(&s, x, y, comp, (void *) data);
      stbi__end_write_file(&s);
      return r;
   } else
      return 0;
}
|
629 |
+
#endif
|
630 |
+
|
631 |
+
// *************************************************************************************************
|
632 |
+
// Radiance RGBE HDR writer
|
633 |
+
// by Baldur Karlsson
|
634 |
+
|
635 |
+
#define stbiw__max(a, b) ((a) > (b) ? (a) : (b))
|
636 |
+
|
637 |
+
#ifndef STBI_WRITE_NO_STDIO
|
638 |
+
|
639 |
+
// Convert one linear RGB float triple to the shared-exponent RGBE byte
// encoding used by Radiance HDR. Values whose maximum component is below
// 1e-32 are encoded as all zeros.
static void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
{
   int exponent;
   float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2]));

   if (maxcomp < 1e-32f) {
      rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0;
   } else {
      // frexp yields maxcomp = f * 2^exponent with f in [0.5, 1); scaling by
      // 256/maxcomp*f maps the largest channel into [128, 256).
      float normalize = (float) frexp(maxcomp, &exponent) * 256.0f/maxcomp;

      rgbe[0] = (unsigned char)(linear[0] * normalize);
      rgbe[1] = (unsigned char)(linear[1] * normalize);
      rgbe[2] = (unsigned char)(linear[2] * normalize);
      rgbe[3] = (unsigned char)(exponent + 128); // biased shared exponent
   }
}
|
655 |
+
|
656 |
+
// Emit one HDR RLE "run" packet: a count byte (length+128) followed by the
// single repeated data byte.
static void stbiw__write_run_data(stbi__write_context *s, int length, unsigned char databyte)
{
   unsigned char lengthbyte = STBIW_UCHAR(length+128);
   STBIW_ASSERT(length+128 <= 255);
   s->func(s->context, &lengthbyte, 1);
   s->func(s->context, &databyte, 1);
}
|
663 |
+
|
664 |
+
// Emit one HDR "dump" (literal) packet: a count byte followed by 'length'
// literal data bytes.
static void stbiw__write_dump_data(stbi__write_context *s, int length, unsigned char *data)
{
   unsigned char lengthbyte = STBIW_UCHAR(length);
   STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code
   s->func(s->context, &lengthbyte, 1);
   s->func(s->context, data, length);
}
|
671 |
+
|
672 |
+
// Write one Radiance HDR scanline. Widths outside [8, 32768) are written
// as flat RGBE pixels; otherwise the "new" RLE scanline format is used:
// a 4-byte header (2, 2, width-hi, width-lo), then each of the four RGBE
// components RLE-encoded separately using run/dump packets. 'scratch' must
// hold at least width*4 bytes (one plane per component).
static void stbiw__write_hdr_scanline(stbi__write_context *s, int width, int ncomp, unsigned char *scratch, float *scanline)
{
   unsigned char scanlineheader[4] = { 2, 2, 0, 0 };
   unsigned char rgbe[4];
   float linear[3];
   int x;

   scanlineheader[2] = (width&0xff00)>>8;
   scanlineheader[3] = (width&0x00ff);

   /* skip RLE for images too small or large */
   if (width < 8 || width >= 32768) {
      for (x=0; x < width; x++) {
         switch (ncomp) {
            case 4: /* fallthrough */
            case 3: linear[2] = scanline[x*ncomp + 2];
                    linear[1] = scanline[x*ncomp + 1];
                    linear[0] = scanline[x*ncomp + 0];
                    break;
            default:
                    // 1/2-channel input: replicate the first channel to RGB
                    linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
                    break;
         }
         stbiw__linear_to_rgbe(rgbe, linear);
         s->func(s->context, rgbe, 4);
      }
   } else {
      int c,r;
      /* encode into scratch buffer */
      for (x=0; x < width; x++) {
         switch(ncomp) {
            case 4: /* fallthrough */
            case 3: linear[2] = scanline[x*ncomp + 2];
                    linear[1] = scanline[x*ncomp + 1];
                    linear[0] = scanline[x*ncomp + 0];
                    break;
            default:
                    linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
                    break;
         }
         stbiw__linear_to_rgbe(rgbe, linear);
         // de-interleave into four component planes
         scratch[x + width*0] = rgbe[0];
         scratch[x + width*1] = rgbe[1];
         scratch[x + width*2] = rgbe[2];
         scratch[x + width*3] = rgbe[3];
      }

      s->func(s->context, scanlineheader, 4);

      /* RLE each component separately */
      for (c=0; c < 4; c++) {
         unsigned char *comp = &scratch[width*c];

         x = 0;
         while (x < width) {
            // find first run
            r = x;
            while (r+2 < width) {
               if (comp[r] == comp[r+1] && comp[r] == comp[r+2])
                  break;
               ++r;
            }
            if (r+2 >= width)
               r = width;
            // dump up to first run
            while (x < r) {
               int len = r-x;
               if (len > 128) len = 128;
               stbiw__write_dump_data(s, len, &comp[x]);
               x += len;
            }
            // if there's a run, output it
            if (r+2 < width) { // same test as what we break out of in search loop, so only true if we break'd
               // find next byte after run
               while (r < width && comp[r] == comp[x])
                  ++r;
               // output run up to r
               while (x < r) {
                  int len = r-x;
                  if (len > 127) len = 127;
                  stbiw__write_run_data(s, len, comp[x]);
                  x += len;
               }
            }
         }
      }
   }
}
|
760 |
+
|
761 |
+
// Write a Radiance HDR image through the context: text header, dimension
// line, then each scanline via stbiw__write_hdr_scanline. Returns 0 for
// invalid input, 1 otherwise.
static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, float *data)
{
   if (y <= 0 || x <= 0 || data == NULL)
      return 0;
   else {
      // Each component is stored separately. Allocate scratch space for full output scanline.
      // NOTE(review): the STBIW_MALLOC result is not NULL-checked before use.
      unsigned char *scratch = (unsigned char *) STBIW_MALLOC(x*4);
      int i, len;
      char buffer[128];
      char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n";
      s->func(s->context, header, sizeof(header)-1);

#ifdef __STDC_LIB_EXT1__
      len = sprintf_s(buffer, sizeof(buffer), "EXPOSURE= 1.0000000000000\n\n-Y %d +X %d\n", y, x);
#else
      len = sprintf(buffer, "EXPOSURE= 1.0000000000000\n\n-Y %d +X %d\n", y, x);
#endif
      s->func(s->context, buffer, len);

      for(i=0; i < y; i++)
         stbiw__write_hdr_scanline(s, x, comp, scratch, data + comp*x*(stbi__flip_vertically_on_write ? y-1-i : i));
      STBIW_FREE(scratch);
      return 1;
   }
}
|
786 |
+
|
787 |
+
// Public API: write an HDR image through a user-supplied callback.
STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data)
{
   stbi__write_context s = { 0 };
   stbi__start_write_callbacks(&s, func, context);
   return stbi_write_hdr_core(&s, x, y, comp, (float *) data);
}
|
793 |
+
|
794 |
+
// Public API: write an HDR image to 'filename'. Returns 0 if the file
// cannot be opened or the core writer rejects the parameters.
STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data)
{
   stbi__write_context s = { 0 };
   if (stbi__start_write_file(&s,filename)) {
      int r = stbi_write_hdr_core(&s, x, y, comp, (float *) data);
      stbi__end_write_file(&s);
      return r;
   } else
      return 0;
}
|
804 |
+
#endif // STBI_WRITE_NO_STDIO
|
805 |
+
|
806 |
+
|
807 |
+
//////////////////////////////////////////////////////////////////////////////
|
808 |
+
//
|
809 |
+
// PNG writer
|
810 |
+
//
|
811 |
+
|
812 |
+
#ifndef STBIW_ZLIB_COMPRESS
|
813 |
+
// stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size()
|
814 |
+
#define stbiw__sbraw(a) ((int *) (void *) (a) - 2)
|
815 |
+
#define stbiw__sbm(a) stbiw__sbraw(a)[0]
|
816 |
+
#define stbiw__sbn(a) stbiw__sbraw(a)[1]
|
817 |
+
|
818 |
+
#define stbiw__sbneedgrow(a,n) ((a)==0 || stbiw__sbn(a)+n >= stbiw__sbm(a))
|
819 |
+
#define stbiw__sbmaybegrow(a,n) (stbiw__sbneedgrow(a,(n)) ? stbiw__sbgrow(a,n) : 0)
|
820 |
+
#define stbiw__sbgrow(a,n) stbiw__sbgrowf((void **) &(a), (n), sizeof(*(a)))
|
821 |
+
|
822 |
+
#define stbiw__sbpush(a, v) (stbiw__sbmaybegrow(a,1), (a)[stbiw__sbn(a)++] = (v))
|
823 |
+
#define stbiw__sbcount(a) ((a) ? stbiw__sbn(a) : 0)
|
824 |
+
#define stbiw__sbfree(a) ((a) ? STBIW_FREE(stbiw__sbraw(a)),0 : 0)
|
825 |
+
|
826 |
+
// Grow the stretchy buffer '*arr' so it can hold at least 'increment' more
// items; capacity roughly doubles each time. Memory layout: two ints
// (capacity, count) immediately precede the item storage, and '*arr'
// points at the items.
static void *stbiw__sbgrowf(void **arr, int increment, int itemsize)
{
   int m = *arr ? 2*stbiw__sbm(*arr)+increment : increment+1;
   void *p = STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? (stbiw__sbm(*arr)*itemsize + sizeof(int)*2) : 0, itemsize * m + sizeof(int)*2);
   STBIW_ASSERT(p);
   if (p) {
      if (!*arr) ((int *) p)[1] = 0; // fresh buffer: count starts at 0
      *arr = (void *) ((int *) p + 2);
      stbiw__sbm(*arr) = m;
   }
   return *arr;
}
|
838 |
+
|
839 |
+
// Move completed bytes out of the bit accumulator into the output stretchy
// buffer; returns the (possibly reallocated) buffer pointer.
static unsigned char *stbiw__zlib_flushf(unsigned char *data, unsigned int *bitbuffer, int *bitcount)
{
   while (*bitcount >= 8) {
      stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer));
      *bitbuffer >>= 8;
      *bitcount -= 8;
   }
   return data;
}
|
848 |
+
|
849 |
+
// Reverse the low 'codebits' bits of 'code'; DEFLATE stores Huffman codes
// most-significant-bit first, so codes must be bit-reversed before packing.
static int stbiw__zlib_bitrev(int code, int codebits)
{
   int reversed = 0;
   int bit;
   for (bit = 0; bit < codebits; ++bit) {
      reversed = (reversed << 1) | ((code >> bit) & 1);
   }
   return reversed;
}
|
858 |
+
|
859 |
+
// Count how many leading bytes of 'a' and 'b' match, capped at 'limit' and
// at 258 (the maximum DEFLATE match length).
static unsigned int stbiw__zlib_countm(unsigned char *a, unsigned char *b, int limit)
{
   int n = 0;
   while (n < limit && n < 258 && a[n] == b[n])
      ++n;
   return n;
}
|
866 |
+
|
867 |
+
// Hash the next 3 input bytes for the LZ match hash table; the shift/xor/
// add sequence mixes the 24-bit value across the full word.
static unsigned int stbiw__zhash(unsigned char *data)
{
   stbiw_uint32 hash = data[0] + (data[1] << 8) + (data[2] << 16);
   hash ^= hash << 3;
   hash += hash >> 5;
   hash ^= hash << 4;
   hash += hash >> 17;
   hash ^= hash << 25;
   hash += hash >> 6;
   return hash;
}
|
878 |
+
|
879 |
+
#define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount))
|
880 |
+
#define stbiw__zlib_add(code,codebits) \
|
881 |
+
(bitbuf |= (code) << bitcount, bitcount += (codebits), stbiw__zlib_flush())
|
882 |
+
#define stbiw__zlib_huffa(b,c) stbiw__zlib_add(stbiw__zlib_bitrev(b,c),c)
|
883 |
+
// default huffman tables
|
884 |
+
#define stbiw__zlib_huff1(n) stbiw__zlib_huffa(0x30 + (n), 8)
|
885 |
+
#define stbiw__zlib_huff2(n) stbiw__zlib_huffa(0x190 + (n)-144, 9)
|
886 |
+
#define stbiw__zlib_huff3(n) stbiw__zlib_huffa(0 + (n)-256,7)
|
887 |
+
#define stbiw__zlib_huff4(n) stbiw__zlib_huffa(0xc0 + (n)-280,8)
|
888 |
+
#define stbiw__zlib_huff(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : (n) <= 255 ? stbiw__zlib_huff2(n) : (n) <= 279 ? stbiw__zlib_huff3(n) : stbiw__zlib_huff4(n))
|
889 |
+
#define stbiw__zlib_huffb(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : stbiw__zlib_huff2(n))
|
890 |
+
|
891 |
+
#define stbiw__ZHASH 16384
|
892 |
+
|
893 |
+
#endif // STBIW_ZLIB_COMPRESS
|
894 |
+
|
895 |
+
// Compress 'data' into a zlib stream using a single fixed-Huffman DEFLATE
// block with a greedy hash-chained LZ matcher. Returns a heap buffer
// (freeable via STBIW_FREE) and stores its size in *out_len; returns NULL
// if the hash table cannot be allocated. 'quality' (clamped to >= 5)
// bounds the per-bucket chain length. If STBIW_ZLIB_COMPRESS is defined,
// the user-supplied compressor is called instead.
STBIWDEF unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_len, int quality)
{
#ifdef STBIW_ZLIB_COMPRESS
   // user provided a zlib compress implementation, use that
   return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality);
#else // use builtin
   // length/distance code tables from the DEFLATE spec: base values plus
   // extra-bit counts for each symbol
   static unsigned short lengthc[] = { 3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,67,83,99,115,131,163,195,227,258, 259 };
   static unsigned char  lengtheb[]= { 0,0,0,0,0,0,0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0 };
   static unsigned short distc[]   = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577, 32768 };
   static unsigned char  disteb[]  = { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 };
   unsigned int bitbuf=0;
   int i,j, bitcount=0;
   unsigned char *out = NULL;
   unsigned char ***hash_table = (unsigned char***) STBIW_MALLOC(stbiw__ZHASH * sizeof(unsigned char**));
   if (hash_table == NULL)
      return NULL;
   if (quality < 5) quality = 5;

   stbiw__sbpush(out, 0x78);   // DEFLATE 32K window
   stbiw__sbpush(out, 0x5e);   // FLEVEL = 1
   stbiw__zlib_add(1,1);  // BFINAL = 1
   stbiw__zlib_add(1,2);  // BTYPE = 1 -- fixed huffman

   for (i=0; i < stbiw__ZHASH; ++i)
      hash_table[i] = NULL;

   i=0;
   while (i < data_len-3) {
      // hash next 3 bytes of data to be compressed
      int h = stbiw__zhash(data+i)&(stbiw__ZHASH-1), best=3;
      unsigned char *bestloc = 0;
      unsigned char **hlist = hash_table[h];
      int n = stbiw__sbcount(hlist);
      for (j=0; j < n; ++j) {
         if (hlist[j]-data > i-32768) { // if entry lies within window
            int d = stbiw__zlib_countm(hlist[j], data+i, data_len-i);
            if (d >= best) { best=d; bestloc=hlist[j]; }
         }
      }
      // when hash table entry is too long, delete half the entries
      if (hash_table[h] && stbiw__sbn(hash_table[h]) == 2*quality) {
         STBIW_MEMMOVE(hash_table[h], hash_table[h]+quality, sizeof(hash_table[h][0])*quality);
         stbiw__sbn(hash_table[h]) = quality;
      }
      stbiw__sbpush(hash_table[h],data+i);

      if (bestloc) {
         // "lazy matching" - check match at *next* byte, and if it's better, do cur byte as literal
         h = stbiw__zhash(data+i+1)&(stbiw__ZHASH-1);
         hlist = hash_table[h];
         n = stbiw__sbcount(hlist);
         for (j=0; j < n; ++j) {
            if (hlist[j]-data > i-32767) {
               int e = stbiw__zlib_countm(hlist[j], data+i+1, data_len-i-1);
               if (e > best) { // if next match is better, bail on current match
                  bestloc = NULL;
                  break;
               }
            }
         }
      }

      if (bestloc) {
         int d = (int) (data+i - bestloc); // distance back
         STBIW_ASSERT(d <= 32767 && best <= 258);
         // map match length to its length code, emit code + extra bits
         for (j=0; best > lengthc[j+1]-1; ++j);
         stbiw__zlib_huff(j+257);
         if (lengtheb[j]) stbiw__zlib_add(best - lengthc[j], lengtheb[j]);
         // map distance to its distance code (5-bit, bit-reversed) + extra bits
         for (j=0; d > distc[j+1]-1; ++j);
         stbiw__zlib_add(stbiw__zlib_bitrev(j,5),5);
         if (disteb[j]) stbiw__zlib_add(d - distc[j], disteb[j]);
         i += best;
      } else {
         stbiw__zlib_huffb(data[i]);
         ++i;
      }
   }
   // write out final bytes
   for (;i < data_len; ++i)
      stbiw__zlib_huffb(data[i]);
   stbiw__zlib_huff(256); // end of block
   // pad with 0 bits to byte boundary
   while (bitcount)
      stbiw__zlib_add(0,1);

   for (i=0; i < stbiw__ZHASH; ++i)
      (void) stbiw__sbfree(hash_table[i]);
   STBIW_FREE(hash_table);

   // store uncompressed instead if compression was worse
   if (stbiw__sbn(out) > data_len + 2 + ((data_len+32766)/32767)*5) {
      stbiw__sbn(out) = 2;  // truncate to DEFLATE 32K window and FLEVEL = 1
      for (j = 0; j < data_len;) {
         int blocklen = data_len - j;
         if (blocklen > 32767) blocklen = 32767;
         stbiw__sbpush(out, data_len - j == blocklen); // BFINAL = ?, BTYPE = 0 -- no compression
         stbiw__sbpush(out, STBIW_UCHAR(blocklen)); // LEN
         stbiw__sbpush(out, STBIW_UCHAR(blocklen >> 8));
         stbiw__sbpush(out, STBIW_UCHAR(~blocklen)); // NLEN
         stbiw__sbpush(out, STBIW_UCHAR(~blocklen >> 8));
         memcpy(out+stbiw__sbn(out), data+j, blocklen);
         stbiw__sbn(out) += blocklen;
         j += blocklen;
      }
   }

   {
      // compute adler32 on input
      unsigned int s1=1, s2=0;
      int blocklen = (int) (data_len % 5552); // 5552 additions fit before a mod is required
      j=0;
      while (j < data_len) {
         for (i=0; i < blocklen; ++i) { s1 += data[j+i]; s2 += s1; }
         s1 %= 65521; s2 %= 65521;
         j += blocklen;
         blocklen = 5552;
      }
      stbiw__sbpush(out, STBIW_UCHAR(s2 >> 8));
      stbiw__sbpush(out, STBIW_UCHAR(s2));
      stbiw__sbpush(out, STBIW_UCHAR(s1 >> 8));
      stbiw__sbpush(out, STBIW_UCHAR(s1));
   }
   *out_len = stbiw__sbn(out);
   // make returned pointer freeable
   STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len);
   return (unsigned char *) stbiw__sbraw(out);
#endif // STBIW_ZLIB_COMPRESS
}
|
1023 |
+
|
1024 |
+
// CRC-32 over 'buffer' using a precomputed 256-entry table (standard
// reflected polynomial, init and final xor with all-ones). Used for PNG
// chunk checksums. A user-supplied STBIW_CRC32 overrides the builtin.
static unsigned int stbiw__crc32(unsigned char *buffer, int len)
{
#ifdef STBIW_CRC32
   return STBIW_CRC32(buffer, len);
#else
   static unsigned int crc_table[256] =
   {
      0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3,
      0x0eDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91,
      0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
      0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5,
      0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B,
      0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
      0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
      0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D,
      0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
      0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01,
      0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457,
      0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
      0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB,
      0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
      0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
      0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD,
      0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683,
      0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
      0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7,
      0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5,
      0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
      0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79,
      0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F,
      0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
      0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713,
      0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21,
      0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
      0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
      0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB,
      0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
      0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF,
      0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
   };

   unsigned int crc = ~0u;
   int i;
   for (i=0; i < len; ++i)
      crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)];
   return ~crc;
#endif
}
|
1072 |
+
|
1073 |
+
// Write four bytes at *o (each truncated to 8 bits) and advance o by 4.
#define stbiw__wpng4(o,a,b,c,d) ((o)[0]=STBIW_UCHAR(a),(o)[1]=STBIW_UCHAR(b),(o)[2]=STBIW_UCHAR(c),(o)[3]=STBIW_UCHAR(d),(o)+=4)
// Write a 32-bit value in big-endian (PNG network byte order).
// NOTE: the original definition carried a stray trailing ';', which made the
// macro unusable in unbraced if/else and produced empty ';;' statements at
// every call site (all visible callers append their own ';'); removed.
#define stbiw__wp32(data,v) stbiw__wpng4(data, (v)>>24,(v)>>16,(v)>>8,(v))
// Write a 4-character PNG chunk tag.
#define stbiw__wptag(data,s) stbiw__wpng4(data, s[0],s[1],s[2],s[3])
|
1076 |
+
|
1077 |
+
// Append the CRC32 of the chunk that was just written: the `len` data bytes
// plus the 4-byte chunk tag, both of which sit immediately before *data.
static void stbiw__wpcrc(unsigned char **data, int len)
{
   unsigned int checksum = stbiw__crc32(*data - len - 4, len + 4);
   stbiw__wp32(*data, checksum);
}
|
1082 |
+
|
1083 |
+
// Paeth predictor from the PNG spec: predict from whichever of a (left),
// b (above), c (upper-left) is closest to the linear estimate a + b - c.
static unsigned char stbiw__paeth(int a, int b, int c)
{
   int estimate = a + b - c;
   int da = abs(estimate - a);
   int db = abs(estimate - b);
   int dc = abs(estimate - c);
   if (da <= db && da <= dc)
      return STBIW_UCHAR(a);
   return (db <= dc) ? STBIW_UCHAR(b) : STBIW_UCHAR(c);
}
|
1090 |
+
|
1091 |
+
// @OPTIMIZE: provide an option that always forces left-predict or paeth predict
|
1092 |
+
static void stbiw__encode_png_line(unsigned char *pixels, int stride_bytes, int width, int height, int y, int n, int filter_type, signed char *line_buffer)
|
1093 |
+
{
|
1094 |
+
static int mapping[] = { 0,1,2,3,4 };
|
1095 |
+
static int firstmap[] = { 0,1,0,5,6 };
|
1096 |
+
int *mymap = (y != 0) ? mapping : firstmap;
|
1097 |
+
int i;
|
1098 |
+
int type = mymap[filter_type];
|
1099 |
+
unsigned char *z = pixels + stride_bytes * (stbi__flip_vertically_on_write ? height-1-y : y);
|
1100 |
+
int signed_stride = stbi__flip_vertically_on_write ? -stride_bytes : stride_bytes;
|
1101 |
+
|
1102 |
+
if (type==0) {
|
1103 |
+
memcpy(line_buffer, z, width*n);
|
1104 |
+
return;
|
1105 |
+
}
|
1106 |
+
|
1107 |
+
// first loop isn't optimized since it's just one pixel
|
1108 |
+
for (i = 0; i < n; ++i) {
|
1109 |
+
switch (type) {
|
1110 |
+
case 1: line_buffer[i] = z[i]; break;
|
1111 |
+
case 2: line_buffer[i] = z[i] - z[i-signed_stride]; break;
|
1112 |
+
case 3: line_buffer[i] = z[i] - (z[i-signed_stride]>>1); break;
|
1113 |
+
case 4: line_buffer[i] = (signed char) (z[i] - stbiw__paeth(0,z[i-signed_stride],0)); break;
|
1114 |
+
case 5: line_buffer[i] = z[i]; break;
|
1115 |
+
case 6: line_buffer[i] = z[i]; break;
|
1116 |
+
}
|
1117 |
+
}
|
1118 |
+
switch (type) {
|
1119 |
+
case 1: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-n]; break;
|
1120 |
+
case 2: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-signed_stride]; break;
|
1121 |
+
case 3: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - ((z[i-n] + z[i-signed_stride])>>1); break;
|
1122 |
+
case 4: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], z[i-signed_stride], z[i-signed_stride-n]); break;
|
1123 |
+
case 5: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - (z[i-n]>>1); break;
|
1124 |
+
case 6: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], 0,0); break;
|
1125 |
+
}
|
1126 |
+
}
|
1127 |
+
|
1128 |
+
STBIWDEF unsigned char *stbi_write_png_to_mem(const unsigned char *pixels, int stride_bytes, int x, int y, int n, int *out_len, const char* parameters)
|
1129 |
+
{
|
1130 |
+
int force_filter = stbi_write_force_png_filter;
|
1131 |
+
int param_length = 0;
|
1132 |
+
int ctype[5] = { -1, 0, 4, 2, 6 };
|
1133 |
+
unsigned char sig[8] = { 137,80,78,71,13,10,26,10 };
|
1134 |
+
unsigned char *out,*o, *filt, *zlib;
|
1135 |
+
signed char *line_buffer;
|
1136 |
+
int j,zlen;
|
1137 |
+
|
1138 |
+
if (stride_bytes == 0)
|
1139 |
+
stride_bytes = x * n;
|
1140 |
+
|
1141 |
+
if (force_filter >= 5) {
|
1142 |
+
force_filter = -1;
|
1143 |
+
}
|
1144 |
+
|
1145 |
+
filt = (unsigned char *) STBIW_MALLOC((x*n+1) * y); if (!filt) return 0;
|
1146 |
+
line_buffer = (signed char *) STBIW_MALLOC(x * n); if (!line_buffer) { STBIW_FREE(filt); return 0; }
|
1147 |
+
for (j=0; j < y; ++j) {
|
1148 |
+
int filter_type;
|
1149 |
+
if (force_filter > -1) {
|
1150 |
+
filter_type = force_filter;
|
1151 |
+
stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, force_filter, line_buffer);
|
1152 |
+
} else { // Estimate the best filter by running through all of them:
|
1153 |
+
int best_filter = 0, best_filter_val = 0x7fffffff, est, i;
|
1154 |
+
for (filter_type = 0; filter_type < 5; filter_type++) {
|
1155 |
+
stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, filter_type, line_buffer);
|
1156 |
+
|
1157 |
+
// Estimate the entropy of the line using this filter; the less, the better.
|
1158 |
+
est = 0;
|
1159 |
+
for (i = 0; i < x*n; ++i) {
|
1160 |
+
est += abs((signed char) line_buffer[i]);
|
1161 |
+
}
|
1162 |
+
if (est < best_filter_val) {
|
1163 |
+
best_filter_val = est;
|
1164 |
+
best_filter = filter_type;
|
1165 |
+
}
|
1166 |
+
}
|
1167 |
+
if (filter_type != best_filter) { // If the last iteration already got us the best filter, don't redo it
|
1168 |
+
stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, best_filter, line_buffer);
|
1169 |
+
filter_type = best_filter;
|
1170 |
+
}
|
1171 |
+
}
|
1172 |
+
// when we get here, filter_type contains the filter type, and line_buffer contains the data
|
1173 |
+
filt[j*(x*n+1)] = (unsigned char) filter_type;
|
1174 |
+
STBIW_MEMMOVE(filt+j*(x*n+1)+1, line_buffer, x*n);
|
1175 |
+
}
|
1176 |
+
STBIW_FREE(line_buffer);
|
1177 |
+
zlib = stbi_zlib_compress(filt, y*( x*n+1), &zlen, stbi_write_png_compression_level);
|
1178 |
+
STBIW_FREE(filt);
|
1179 |
+
if (!zlib) return 0;
|
1180 |
+
|
1181 |
+
if(parameters != NULL) {
|
1182 |
+
param_length = strlen(parameters);
|
1183 |
+
param_length += strlen("parameters") + 1; // For the name and the null-byte
|
1184 |
+
}
|
1185 |
+
|
1186 |
+
// each tag requires 12 bytes of overhead
|
1187 |
+
out = (unsigned char *) STBIW_MALLOC(8 + 12+13 + 12+zlen + 12 + ((parameters)?(param_length+12):0));
|
1188 |
+
if (!out) return 0;
|
1189 |
+
*out_len = 8 + 12+13 + 12+zlen + 12 + ((parameters)?(param_length+12):0);
|
1190 |
+
|
1191 |
+
o=out;
|
1192 |
+
STBIW_MEMMOVE(o,sig,8); o+= 8;
|
1193 |
+
stbiw__wp32(o, 13); // header length
|
1194 |
+
stbiw__wptag(o, "IHDR");
|
1195 |
+
stbiw__wp32(o, x);
|
1196 |
+
stbiw__wp32(o, y);
|
1197 |
+
*o++ = 8;
|
1198 |
+
*o++ = STBIW_UCHAR(ctype[n]);
|
1199 |
+
*o++ = 0;
|
1200 |
+
*o++ = 0;
|
1201 |
+
*o++ = 0;
|
1202 |
+
stbiw__wpcrc(&o,13);
|
1203 |
+
|
1204 |
+
if(parameters != NULL) {
|
1205 |
+
stbiw__wp32(o, param_length);
|
1206 |
+
stbiw__wptag(o, "tEXt");
|
1207 |
+
STBIW_MEMMOVE(o, "parameters", strlen("parameters"));
|
1208 |
+
o+=strlen("parameters");
|
1209 |
+
*o++ = 0; // Null pyte separator
|
1210 |
+
STBIW_MEMMOVE(o, parameters, strlen(parameters));
|
1211 |
+
o+=strlen(parameters);
|
1212 |
+
stbiw__wpcrc(&o, param_length);
|
1213 |
+
}
|
1214 |
+
|
1215 |
+
stbiw__wp32(o, zlen);
|
1216 |
+
stbiw__wptag(o, "IDAT");
|
1217 |
+
STBIW_MEMMOVE(o, zlib, zlen);
|
1218 |
+
o += zlen;
|
1219 |
+
STBIW_FREE(zlib);
|
1220 |
+
stbiw__wpcrc(&o, zlen);
|
1221 |
+
|
1222 |
+
stbiw__wp32(o,0);
|
1223 |
+
stbiw__wptag(o, "IEND");
|
1224 |
+
stbiw__wpcrc(&o,0);
|
1225 |
+
|
1226 |
+
STBIW_ASSERT(o == out + *out_len);
|
1227 |
+
|
1228 |
+
return out;
|
1229 |
+
}
|
1230 |
+
|
1231 |
+
#ifndef STBI_WRITE_NO_STDIO
|
1232 |
+
// Encode `data` as PNG and write it to `filename`. `parameters`, when
// non-NULL, is stored in a tEXt chunk under the keyword "parameters".
// Returns 1 on success, 0 on encode, open, or write failure.
STBIWDEF int stbi_write_png(char const *filename, int x, int y, int comp, const void *data, int stride_bytes, const char* parameters)
{
   FILE *f;
   int len;
   int ok;
   unsigned char *png = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &len, parameters);
   if (png == NULL) return 0;

   f = stbiw__fopen(filename, "wb");
   if (!f) { STBIW_FREE(png); return 0; }
   // FIX: the fwrite/fclose results were previously ignored, so a full disk
   // or I/O error still reported success; propagate the failure instead.
   ok = fwrite(png, 1, len, f) == (size_t) len;
   if (fclose(f) != 0)
      ok = 0;
   STBIW_FREE(png);
   return ok;
}
|
1246 |
+
#endif
|
1247 |
+
|
1248 |
+
// Encode `data` as PNG and deliver the whole file through `func(context, ...)`.
// Returns 1 on success, 0 if the in-memory encode failed. The callback
// variant never embeds a "parameters" tEXt chunk.
STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int stride_bytes)
{
   int len;
   unsigned char *png;

   png = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &len, NULL);
   if (!png)
      return 0;
   func(context, png, len);
   STBIW_FREE(png);
   return 1;
}
|
1257 |
+
|
1258 |
+
|
1259 |
+
/* ***************************************************************************
|
1260 |
+
*
|
1261 |
+
* JPEG writer
|
1262 |
+
*
|
1263 |
+
* This is based on Jon Olick's jo_jpeg.cpp:
|
1264 |
+
* public domain Simple, Minimalistic JPEG writer - http://www.jonolick.com/code.html
|
1265 |
+
*/
|
1266 |
+
|
1267 |
+
static const unsigned char stbiw__jpg_ZigZag[] = { 0,1,5,6,14,15,27,28,2,4,7,13,16,26,29,42,3,8,12,17,25,30,41,43,9,11,18,
|
1268 |
+
24,31,40,44,53,10,19,23,32,39,45,52,54,20,22,33,38,46,51,55,60,21,34,37,47,50,56,59,61,35,36,48,49,57,58,62,63 };
|
1269 |
+
|
1270 |
+
// Emit the Huffman code bs[0] (bs[1] bits wide) into the caller's bit
// accumulator, flushing complete bytes to the output stream. A 0x00 byte is
// stuffed after every emitted 0xFF so the data cannot be mistaken for a
// JPEG marker.
static void stbiw__jpg_writeBits(stbi__write_context *s, int *bitBufP, int *bitCntP, const unsigned short *bs) {
   int buf = *bitBufP;
   int cnt = *bitCntP;

   cnt += bs[1];
   buf |= bs[0] << (24 - cnt);   // bits are packed MSB-first in a 24-bit window
   for (; cnt >= 8; cnt -= 8, buf <<= 8) {
      unsigned char byte = (unsigned char)((buf >> 16) & 255);
      stbiw__putc(s, byte);
      if (byte == 255) {
         stbiw__putc(s, 0);      // marker byte-stuffing
      }
   }
   *bitBufP = buf;
   *bitCntP = cnt;
}
|
1286 |
+
|
1287 |
+
// In-place 8-point forward DCT (AA&N fast factorization, as in jo_jpeg).
// Operates on one row or column of an 8x8 block through the eight element
// pointers; the arithmetic order matches the original exactly.
static void stbiw__jpg_DCT(float *d0p, float *d1p, float *d2p, float *d3p, float *d4p, float *d5p, float *d6p, float *d7p) {
   float d0 = *d0p, d1 = *d1p, d2 = *d2p, d3 = *d3p, d4 = *d4p, d5 = *d5p, d6 = *d6p, d7 = *d7p;
   float z1, z2, z3, z4, z5, z11, z13;

   float t0 = d0 + d7;
   float t7 = d0 - d7;
   float t1 = d1 + d6;
   float t6 = d1 - d6;
   float t2 = d2 + d5;
   float t5 = d2 - d5;
   float t3 = d3 + d4;
   float t4 = d3 - d4;

   // Even part
   float t10 = t0 + t3;   // phase 2
   float t13 = t0 - t3;
   float t11 = t1 + t2;
   float t12 = t1 - t2;

   d0 = t10 + t11;        // phase 3
   d4 = t10 - t11;

   z1 = (t12 + t13) * 0.707106781f; // c4
   d2 = t13 + z1;         // phase 5
   d6 = t13 - z1;

   // Odd part
   t10 = t4 + t5;         // phase 2
   t11 = t5 + t6;
   t12 = t6 + t7;

   // The rotator is modified from fig 4-8 to avoid extra negations.
   z5 = (t10 - t12) * 0.382683433f; // c6
   z2 = t10 * 0.541196100f + z5;    // c2-c6
   z4 = t12 * 1.306562965f + z5;    // c2+c6
   z3 = t11 * 0.707106781f;         // c4

   z11 = t7 + z3;         // phase 5
   z13 = t7 - z3;

   *d5p = z13 + z2;       // phase 6
   *d3p = z13 - z2;
   *d1p = z11 + z4;
   *d7p = z11 - z4;

   *d0p = d0; *d2p = d2; *d4p = d4; *d6p = d6;
}
|
1334 |
+
|
1335 |
+
// Compute the entropy-coded form of a coefficient value:
// bits[1] = number of magnitude bits, bits[0] = the value bits, with
// negative values stored one's-complemented (ITU-T T.81, F.1.2.1).
static void stbiw__jpg_calcBits(int val, unsigned short bits[2]) {
   int mag = val < 0 ? -val : val;
   if (val < 0)
      --val;              // one's-complement encoding of negatives
   bits[1] = 1;
   while (mag >>= 1)
      ++bits[1];
   bits[0] = val & ((1 << bits[1]) - 1);
}
|
1344 |
+
|
1345 |
+
// Transform, quantize and entropy-code one 8x8 data unit starting at CDU
// (row stride du_stride floats). DC is the previous block's DC coefficient;
// the function returns this block's DC so the caller can carry the predictor
// across blocks.
static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf, int *bitCnt, float *CDU, int du_stride, float *fdtbl, int DC, const unsigned short HTDC[256][2], const unsigned short HTAC[256][2]) {
   const unsigned short EOB[2]       = { HTAC[0x00][0], HTAC[0x00][1] };
   const unsigned short M16zeroes[2] = { HTAC[0xF0][0], HTAC[0xF0][1] };
   int off, i, j, n, diff, end0pos, x, y;
   int DU[64];

   // 2-D DCT: all eight rows, then all eight columns.
   for (off = 0, n = du_stride*8; off < n; off += du_stride) {
      stbiw__jpg_DCT(&CDU[off], &CDU[off+1], &CDU[off+2], &CDU[off+3], &CDU[off+4], &CDU[off+5], &CDU[off+6], &CDU[off+7]);
   }
   for (off = 0; off < 8; ++off) {
      stbiw__jpg_DCT(&CDU[off], &CDU[off+du_stride], &CDU[off+du_stride*2], &CDU[off+du_stride*3], &CDU[off+du_stride*4],
                     &CDU[off+du_stride*5], &CDU[off+du_stride*6], &CDU[off+du_stride*7]);
   }
   // Quantize/descale and reorder into zigzag order, rounding half away from zero.
   for (y = 0, j = 0; y < 8; ++y) {
      for (x = 0; x < 8; ++x, ++j) {
         float v;
         i = y*du_stride + x;
         v = CDU[i]*fdtbl[j];
         // ceilf()/floorf() are C99, not C89; this manual rounding is equivalent here.
         DU[stbiw__jpg_ZigZag[j]] = (int)(v < 0 ? v - 0.5f : v + 0.5f);
      }
   }

   // Encode DC as the difference from the previous block's DC.
   diff = DU[0] - DC;
   if (diff == 0) {
      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[0]);
   } else {
      unsigned short bits[2];
      stbiw__jpg_calcBits(diff, bits);
      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[bits[1]]);
      stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
   }
   // Encode ACs: locate the last nonzero coefficient first.
   end0pos = 63;
   for (; (end0pos > 0) && (DU[end0pos] == 0); --end0pos) {
   }
   // end0pos = first element in reverse order != 0
   if (end0pos == 0) {
      stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);   // all ACs zero
      return DU[0];
   }
   for (i = 1; i <= end0pos; ++i) {
      int startpos = i;
      int nrzeroes;
      unsigned short bits[2];
      for (; DU[i] == 0 && i <= end0pos; ++i) {
      }
      nrzeroes = i - startpos;
      if (nrzeroes >= 16) {
         // Zero runs of 16+ are emitted as ZRL (0xF0) symbols.
         int lng = nrzeroes >> 4;
         int nrmarker;
         for (nrmarker = 1; nrmarker <= lng; ++nrmarker)
            stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes);
         nrzeroes &= 15;
      }
      stbiw__jpg_calcBits(DU[i], bits);
      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes << 4) + bits[1]]);
      stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
   }
   if (end0pos != 63) {
      stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);   // early end-of-block
   }
   return DU[0];
}
|
1414 |
+
|
1415 |
+
// Baseline JFIF encoder (derived from Jon Olick's jo_jpeg). Writes headers,
// quantization and Huffman tables, then encodes the image as 8x8 (or, with
// 4:2:0 subsampling, 16x16 luma + subsampled chroma) macroblocks.
// Returns 1 on success, 0 on invalid arguments.
static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality) {
   // Constants that don't pollute global namespace
   static const unsigned char std_dc_luminance_nrcodes[] = {0,0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0};
   static const unsigned char std_dc_luminance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
   static const unsigned char std_ac_luminance_nrcodes[] = {0,0,2,1,3,3,2,4,3,5,5,4,4,0,0,1,0x7d};
   static const unsigned char std_ac_luminance_values[] = {
      0x01,0x02,0x03,0x00,0x04,0x11,0x05,0x12,0x21,0x31,0x41,0x06,0x13,0x51,0x61,0x07,0x22,0x71,0x14,0x32,0x81,0x91,0xa1,0x08,
      0x23,0x42,0xb1,0xc1,0x15,0x52,0xd1,0xf0,0x24,0x33,0x62,0x72,0x82,0x09,0x0a,0x16,0x17,0x18,0x19,0x1a,0x25,0x26,0x27,0x28,
      0x29,0x2a,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,0x59,
      0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x83,0x84,0x85,0x86,0x87,0x88,0x89,
      0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,0xb5,0xb6,
      0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xe1,0xe2,
      0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
   };
   static const unsigned char std_dc_chrominance_nrcodes[] = {0,0,3,1,1,1,1,1,1,1,1,1,0,0,0,0,0};
   static const unsigned char std_dc_chrominance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
   static const unsigned char std_ac_chrominance_nrcodes[] = {0,0,2,1,2,4,4,3,4,7,5,4,4,0,1,2,0x77};
   static const unsigned char std_ac_chrominance_values[] = {
      0x00,0x01,0x02,0x03,0x11,0x04,0x05,0x21,0x31,0x06,0x12,0x41,0x51,0x07,0x61,0x71,0x13,0x22,0x32,0x81,0x08,0x14,0x42,0x91,
      0xa1,0xb1,0xc1,0x09,0x23,0x33,0x52,0xf0,0x15,0x62,0x72,0xd1,0x0a,0x16,0x24,0x34,0xe1,0x25,0xf1,0x17,0x18,0x19,0x1a,0x26,
      0x27,0x28,0x29,0x2a,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,
      0x59,0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x82,0x83,0x84,0x85,0x86,0x87,
      0x88,0x89,0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,
      0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,
      0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
   };
   // Huffman tables
   static const unsigned short YDC_HT[256][2] = { {0,2},{2,3},{3,3},{4,3},{5,3},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9}};
   static const unsigned short UVDC_HT[256][2] = { {0,2},{1,2},{2,2},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9},{1022,10},{2046,11}};
   static const unsigned short YAC_HT[256][2] = {
      {10,4},{0,2},{1,2},{4,3},{11,4},{26,5},{120,7},{248,8},{1014,10},{65410,16},{65411,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {12,4},{27,5},{121,7},{502,9},{2038,11},{65412,16},{65413,16},{65414,16},{65415,16},{65416,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {28,5},{249,8},{1015,10},{4084,12},{65417,16},{65418,16},{65419,16},{65420,16},{65421,16},{65422,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {58,6},{503,9},{4085,12},{65423,16},{65424,16},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {59,6},{1016,10},{65430,16},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {122,7},{2039,11},{65438,16},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {123,7},{4086,12},{65446,16},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {250,8},{4087,12},{65454,16},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {504,9},{32704,15},{65462,16},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {505,9},{65470,16},{65471,16},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {506,9},{65479,16},{65480,16},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {1017,10},{65488,16},{65489,16},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {1018,10},{65497,16},{65498,16},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {2040,11},{65506,16},{65507,16},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {65515,16},{65516,16},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{0,0},{0,0},{0,0},{0,0},{0,0},
      {2041,11},{65525,16},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
   };
   static const unsigned short UVAC_HT[256][2] = {
      {0,2},{1,2},{4,3},{10,4},{24,5},{25,5},{56,6},{120,7},{500,9},{1014,10},{4084,12},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {11,4},{57,6},{246,8},{501,9},{2038,11},{4085,12},{65416,16},{65417,16},{65418,16},{65419,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {26,5},{247,8},{1015,10},{4086,12},{32706,15},{65420,16},{65421,16},{65422,16},{65423,16},{65424,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {27,5},{248,8},{1016,10},{4087,12},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{65430,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {58,6},{502,9},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{65438,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {59,6},{1017,10},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{65446,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {121,7},{2039,11},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{65454,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {122,7},{2040,11},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{65462,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {249,8},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{65470,16},{65471,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {503,9},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{65479,16},{65480,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {504,9},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{65488,16},{65489,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {505,9},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{65497,16},{65498,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {506,9},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{65506,16},{65507,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {2041,11},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{65515,16},{65516,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
      {16352,14},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{65525,16},{0,0},{0,0},{0,0},{0,0},{0,0},
      {1018,10},{32707,15},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
   };
   static const int YQT[] = {16,11,10,16,24,40,51,61,12,12,14,19,26,58,60,55,14,13,16,24,40,57,69,56,14,17,22,29,51,87,80,62,18,22,
                             37,56,68,109,103,77,24,35,55,64,81,104,113,92,49,64,78,87,103,121,120,101,72,92,95,98,112,100,103,99};
   static const int UVQT[] = {17,18,24,47,99,99,99,99,18,21,26,66,99,99,99,99,24,26,56,99,99,99,99,99,47,66,99,99,99,99,99,99,
                              99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99};
   static const float aasf[] = { 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f,
                                 1.0f * 2.828427125f, 0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f };

   int row, col, i, k, subsample;
   float fdtbl_Y[64], fdtbl_UV[64];
   unsigned char YTable[64], UVTable[64];

   if(!data || !width || !height || comp > 4 || comp < 1) {
      return 0;
   }

   quality = quality ? quality : 90;
   subsample = quality <= 90 ? 1 : 0;   // 4:2:0 chroma subsampling at lower qualities
   quality = quality < 1 ? 1 : quality > 100 ? 100 : quality;
   quality = quality < 50 ? 5000 / quality : 200 - quality * 2;   // libjpeg-style quality scaling

   // Scale the base quantization tables by the quality factor, clamped to [1,255].
   for(i = 0; i < 64; ++i) {
      int uvti, yti = (YQT[i]*quality+50)/100;
      YTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (yti < 1 ? 1 : yti > 255 ? 255 : yti);
      uvti = (UVQT[i]*quality+50)/100;
      UVTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (uvti < 1 ? 1 : uvti > 255 ? 255 : uvti);
   }

   // Fold the AA&N DCT scale factors into the quantization divisors.
   for(row = 0, k = 0; row < 8; ++row) {
      for(col = 0; col < 8; ++col, ++k) {
         fdtbl_Y[k]  = 1 / (YTable [stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
         fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
      }
   }

   // Write Headers
   {
      static const unsigned char head0[] = { 0xFF,0xD8,0xFF,0xE0,0,0x10,'J','F','I','F',0,1,1,0,0,1,0,1,0,0,0xFF,0xDB,0,0x84,0 };
      static const unsigned char head2[] = { 0xFF,0xDA,0,0xC,3,1,0,2,0x11,3,0x11,0,0x3F,0 };
      const unsigned char head1[] = { 0xFF,0xC0,0,0x11,8,(unsigned char)(height>>8),STBIW_UCHAR(height),(unsigned char)(width>>8),STBIW_UCHAR(width),
                                      3,1,(unsigned char)(subsample?0x22:0x11),0,2,0x11,1,3,0x11,1,0xFF,0xC4,0x01,0xA2,0 };
      s->func(s->context, (void*)head0, sizeof(head0));          // SOI + APP0 + DQT marker
      s->func(s->context, (void*)YTable, sizeof(YTable));
      stbiw__putc(s, 1);
      s->func(s->context, UVTable, sizeof(UVTable));
      s->func(s->context, (void*)head1, sizeof(head1));          // SOF0 + DHT marker
      s->func(s->context, (void*)(std_dc_luminance_nrcodes+1), sizeof(std_dc_luminance_nrcodes)-1);
      s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values));
      stbiw__putc(s, 0x10); // HTYACinfo
      s->func(s->context, (void*)(std_ac_luminance_nrcodes+1), sizeof(std_ac_luminance_nrcodes)-1);
      s->func(s->context, (void*)std_ac_luminance_values, sizeof(std_ac_luminance_values));
      stbiw__putc(s, 1); // HTUDCinfo
      s->func(s->context, (void*)(std_dc_chrominance_nrcodes+1), sizeof(std_dc_chrominance_nrcodes)-1);
      s->func(s->context, (void*)std_dc_chrominance_values, sizeof(std_dc_chrominance_values));
      stbiw__putc(s, 0x11); // HTUACinfo
      s->func(s->context, (void*)(std_ac_chrominance_nrcodes+1), sizeof(std_ac_chrominance_nrcodes)-1);
      s->func(s->context, (void*)std_ac_chrominance_values, sizeof(std_ac_chrominance_values));
      s->func(s->context, (void*)head2, sizeof(head2));          // SOS
   }

   // Encode 8x8 macroblocks
   {
      static const unsigned short fillBits[] = {0x7F, 7};
      int DCY=0, DCU=0, DCV=0;
      int bitBuf=0, bitCnt=0;
      // comp == 2 is grey+alpha (alpha is ignored)
      int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0;
      const unsigned char *dataR = (const unsigned char *)data;
      const unsigned char *dataG = dataR + ofsG;
      const unsigned char *dataB = dataR + ofsB;
      int x, y, pos;
      if(subsample) {
         // 4:2:0 — 16x16 macroblocks: four 8x8 luma units + one subsampled U and V unit.
         for(y = 0; y < height; y += 16) {
            for(x = 0; x < width; x += 16) {
               float Y[256], U[256], V[256];
               for(row = y, pos = 0; row < y+16; ++row) {
                  // row >= height => use last input row
                  int clamped_row = (row < height) ? row : height - 1;
                  int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp;
                  for(col = x; col < x+16; ++col, ++pos) {
                     // if col >= width => use pixel from last input column
                     int p = base_p + ((col < width) ? col : (width-1))*comp;
                     float r = dataR[p], g = dataG[p], b = dataB[p];
                     Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128;
                     U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b;
                     V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b;
                  }
               }
               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+0,   16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+8,   16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+128, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+136, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);

               // subsample U,V: average each 2x2 chroma neighborhood
               {
                  float subU[64], subV[64];
                  int yy, xx;
                  for(yy = 0, pos = 0; yy < 8; ++yy) {
                     for(xx = 0; xx < 8; ++xx, ++pos) {
                        int j = yy*32+xx*2;
                        subU[pos] = (U[j+0] + U[j+1] + U[j+16] + U[j+17]) * 0.25f;
                        subV[pos] = (V[j+0] + V[j+1] + V[j+16] + V[j+17]) * 0.25f;
                     }
                  }
                  DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subU, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
                  DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subV, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
               }
            }
         }
      } else {
         // 4:4:4 — one 8x8 unit per component per macroblock.
         for(y = 0; y < height; y += 8) {
            for(x = 0; x < width; x += 8) {
               float Y[64], U[64], V[64];
               for(row = y, pos = 0; row < y+8; ++row) {
                  // row >= height => use last input row
                  int clamped_row = (row < height) ? row : height - 1;
                  int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp;
                  for(col = x; col < x+8; ++col, ++pos) {
                     // if col >= width => use pixel from last input column
                     int p = base_p + ((col < width) ? col : (width-1))*comp;
                     float r = dataR[p], g = dataG[p], b = dataB[p];
                     Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128;
                     U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b;
                     V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b;
                  }
               }

               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y, 8, fdtbl_Y,  DCY, YDC_HT,  YAC_HT);
               DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, U, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
               DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, V, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
            }
         }
      }

      // Do the bit alignment of the EOI marker
      stbiw__jpg_writeBits(s, &bitBuf, &bitCnt, fillBits);
   }

   // EOI
   stbiw__putc(s, 0xFF);
   stbiw__putc(s, 0xD9);

   return 1;
}
|
1623 |
+
|
1624 |
+
// Encode `data` as JPEG and deliver the output through `func(context, ...)`.
// Returns the result of the core encoder (1 on success, 0 on bad arguments).
STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality)
{
   stbi__write_context s = { 0 };

   stbi__start_write_callbacks(&s, func, context);
   return stbi_write_jpg_core(&s, x, y, comp, (void *) data, quality);
}
|
1630 |
+
|
1631 |
+
|
1632 |
+
#ifndef STBI_WRITE_NO_STDIO
|
1633 |
+
STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality)
|
1634 |
+
{
|
1635 |
+
stbi__write_context s = { 0 };
|
1636 |
+
if (stbi__start_write_file(&s,filename)) {
|
1637 |
+
int r = stbi_write_jpg_core(&s, x, y, comp, data, quality);
|
1638 |
+
stbi__end_write_file(&s);
|
1639 |
+
return r;
|
1640 |
+
} else
|
1641 |
+
return 0;
|
1642 |
+
}
|
1643 |
+
#endif
|
1644 |
+
|
1645 |
+
#endif // STB_IMAGE_WRITE_IMPLEMENTATION
|
1646 |
+
|
1647 |
+
/* Revision history
|
1648 |
+
1.16 (2021-07-11)
|
1649 |
+
make Deflate code emit uncompressed blocks when it would otherwise expand
|
1650 |
+
support writing BMPs with alpha channel
|
1651 |
+
1.15 (2020-07-13) unknown
|
1652 |
+
1.14 (2020-02-02) updated JPEG writer to downsample chroma channels
|
1653 |
+
1.13
|
1654 |
+
1.12
|
1655 |
+
1.11 (2019-08-11)
|
1656 |
+
|
1657 |
+
1.10 (2019-02-07)
|
1658 |
+
support utf8 filenames in Windows; fix warnings and platform ifdefs
|
1659 |
+
1.09 (2018-02-11)
|
1660 |
+
fix typo in zlib quality API, improve STB_I_W_STATIC in C++
|
1661 |
+
1.08 (2018-01-29)
|
1662 |
+
add stbi__flip_vertically_on_write, external zlib, zlib quality, choose PNG filter
|
1663 |
+
1.07 (2017-07-24)
|
1664 |
+
doc fix
|
1665 |
+
1.06 (2017-07-23)
|
1666 |
+
writing JPEG (using Jon Olick's code)
|
1667 |
+
1.05 ???
|
1668 |
+
1.04 (2017-03-03)
|
1669 |
+
monochrome BMP expansion
|
1670 |
+
1.03 ???
|
1671 |
+
1.02 (2016-04-02)
|
1672 |
+
avoid allocating large structures on the stack
|
1673 |
+
1.01 (2016-01-16)
|
1674 |
+
STBIW_REALLOC_SIZED: support allocators with no realloc support
|
1675 |
+
avoid race-condition in crc initialization
|
1676 |
+
minor compile issues
|
1677 |
+
1.00 (2015-09-14)
|
1678 |
+
installable file IO function
|
1679 |
+
0.99 (2015-09-13)
|
1680 |
+
warning fixes; TGA rle support
|
1681 |
+
0.98 (2015-04-08)
|
1682 |
+
added STBIW_MALLOC, STBIW_ASSERT etc
|
1683 |
+
0.97 (2015-01-18)
|
1684 |
+
fixed HDR asserts, rewrote HDR rle logic
|
1685 |
+
0.96 (2015-01-17)
|
1686 |
+
add HDR output
|
1687 |
+
fix monochrome BMP
|
1688 |
+
0.95 (2014-08-17)
|
1689 |
+
add monochrome TGA output
|
1690 |
+
0.94 (2014-05-31)
|
1691 |
+
rename private functions to avoid conflicts with stb_image.h
|
1692 |
+
0.93 (2014-05-27)
|
1693 |
+
warning fixes
|
1694 |
+
0.92 (2010-08-01)
|
1695 |
+
casts to unsigned char to fix warnings
|
1696 |
+
0.91 (2010-07-17)
|
1697 |
+
first public release
|
1698 |
+
0.90 first internal release
|
1699 |
+
*/
|
1700 |
+
|
1701 |
+
/*
|
1702 |
+
------------------------------------------------------------------------------
|
1703 |
+
This software is available under 2 licenses -- choose whichever you prefer.
|
1704 |
+
------------------------------------------------------------------------------
|
1705 |
+
ALTERNATIVE A - MIT License
|
1706 |
+
Copyright (c) 2017 Sean Barrett
|
1707 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
1708 |
+
this software and associated documentation files (the "Software"), to deal in
|
1709 |
+
the Software without restriction, including without limitation the rights to
|
1710 |
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
1711 |
+
of the Software, and to permit persons to whom the Software is furnished to do
|
1712 |
+
so, subject to the following conditions:
|
1713 |
+
The above copyright notice and this permission notice shall be included in all
|
1714 |
+
copies or substantial portions of the Software.
|
1715 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
1716 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
1717 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
1718 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
1719 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
1720 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
1721 |
+
SOFTWARE.
|
1722 |
+
------------------------------------------------------------------------------
|
1723 |
+
ALTERNATIVE B - Public Domain (www.unlicense.org)
|
1724 |
+
This is free and unencumbered software released into the public domain.
|
1725 |
+
Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
|
1726 |
+
software, either in source code form or as a compiled binary, for any purpose,
|
1727 |
+
commercial or non-commercial, and by any means.
|
1728 |
+
In jurisdictions that recognize copyright laws, the author or authors of this
|
1729 |
+
software dedicate any and all copyright interest in the software to the public
|
1730 |
+
domain. We make this dedication for the benefit of the public at large and to
|
1731 |
+
the detriment of our heirs and successors. We intend this dedication to be an
|
1732 |
+
overt act of relinquishment in perpetuity of all present and future rights to
|
1733 |
+
this software under copyright law.
|
1734 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
1735 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
1736 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
1737 |
+
AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
1738 |
+
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
1739 |
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
1740 |
+
------------------------------------------------------------------------------
|
1741 |
+
*/
|
stable-diffusion.cpp/ggml/.editorconfig
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# https://EditorConfig.org
|
2 |
+
|
3 |
+
# Top-most EditorConfig file
|
4 |
+
root = true
|
5 |
+
|
6 |
+
# Unix-style newlines with a newline ending every file, utf-8 charset
|
7 |
+
[*]
|
8 |
+
end_of_line = lf
|
9 |
+
insert_final_newline = true
|
10 |
+
trim_trailing_whitespace = true
|
11 |
+
charset = utf-8
|
12 |
+
indent_style = space
|
13 |
+
indent_size = 4
|
14 |
+
|
15 |
+
[Makefile]
|
16 |
+
indent_style = tab
|
17 |
+
|
18 |
+
[prompts/*.txt]
|
19 |
+
insert_final_newline = unset
|
stable-diffusion.cpp/ggml/.github/workflows/ci.yml
ADDED
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: CI
|
2 |
+
|
3 |
+
on:
|
4 |
+
push:
|
5 |
+
branches: [ master ]
|
6 |
+
pull_request:
|
7 |
+
branches: [ master ]
|
8 |
+
|
9 |
+
jobs:
|
10 |
+
test-ubuntu-opencl:
|
11 |
+
runs-on: ubuntu-latest
|
12 |
+
env:
|
13 |
+
GGML_NLOOP: 3
|
14 |
+
GGML_NITER: 1
|
15 |
+
GGML_N_THREADS: 2
|
16 |
+
|
17 |
+
steps:
|
18 |
+
- uses: actions/checkout@v3
|
19 |
+
|
20 |
+
- name: Dependencies
|
21 |
+
run: |
|
22 |
+
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
|
23 |
+
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
|
24 |
+
sudo apt-get update
|
25 |
+
sudo apt-get install -y --no-install-recommends llvm intel-oneapi-runtime-opencl intel-oneapi-runtime-compilers libclblast-dev
|
26 |
+
- name: Create Build Environment
|
27 |
+
run: mkdir build
|
28 |
+
|
29 |
+
- name: Configure CMake
|
30 |
+
working-directory: ./build
|
31 |
+
run: cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DGGML_TEST_COVERAGE=ON -DGGML_CLBLAST=ON ..
|
32 |
+
|
33 |
+
- name: Build
|
34 |
+
working-directory: ./build
|
35 |
+
run: make
|
36 |
+
|
37 |
+
- name: Test
|
38 |
+
working-directory: ./build
|
39 |
+
run: ctest --verbose --timeout 900
|
40 |
+
|
41 |
+
- name: Test Coverage
|
42 |
+
working-directory: ./build
|
43 |
+
run: |
|
44 |
+
llvm-profdata merge -sparse tests/*.profraw -o ggml.profdata
|
45 |
+
llvm-cov report ./bin/test-grad0 -instr-profile=ggml.profdata
|
46 |
+
llvm-cov report ./bin/test-opt -instr-profile=ggml.profdata
|
47 |
+
test-macos-metal:
|
48 |
+
runs-on: macos-13
|
49 |
+
env:
|
50 |
+
GGML_NLOOP: 3
|
51 |
+
GGML_NITER: 1
|
52 |
+
GGML_N_THREADS: 2
|
53 |
+
|
54 |
+
steps:
|
55 |
+
- uses: actions/checkout@v3
|
56 |
+
|
57 |
+
- name: Create Build Environment
|
58 |
+
run: mkdir build
|
59 |
+
|
60 |
+
- name: Configure CMake
|
61 |
+
working-directory: ./build
|
62 |
+
run: cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DGGML_TEST_COVERAGE=ON -DGGML_METAL=ON ..
|
63 |
+
|
64 |
+
- name: Build
|
65 |
+
working-directory: ./build
|
66 |
+
run: make
|
67 |
+
|
68 |
+
- name: Test
|
69 |
+
working-directory: ./build
|
70 |
+
run: ctest --verbose --timeout 900
|
71 |
+
|
72 |
+
- name: Test Coverage
|
73 |
+
working-directory: ./build
|
74 |
+
run: |
|
75 |
+
xcrun llvm-profdata merge -sparse tests/*.profraw -o ggml.profdata
|
76 |
+
xcrun llvm-cov report ./bin/test-grad0 -instr-profile=ggml.profdata
|
77 |
+
xcrun llvm-cov report ./bin/test-opt -instr-profile=ggml.profdata
|
78 |
+
|
79 |
+
build:
|
80 |
+
|
81 |
+
strategy:
|
82 |
+
matrix:
|
83 |
+
os: [ubuntu-latest, macos-latest]
|
84 |
+
|
85 |
+
runs-on: ${{ matrix.os }}
|
86 |
+
|
87 |
+
env:
|
88 |
+
GGML_NLOOP: 3
|
89 |
+
GGML_NITER: 1
|
90 |
+
|
91 |
+
steps:
|
92 |
+
- uses: actions/checkout@v3
|
93 |
+
|
94 |
+
- name: Dependencies for Ubuntu
|
95 |
+
if: matrix.os == 'ubuntu-latest'
|
96 |
+
run: |
|
97 |
+
sudo apt-get update
|
98 |
+
sudo apt-get install llvm
|
99 |
+
|
100 |
+
- name: Set GGML_N_THREADS for Ubuntu
|
101 |
+
run: echo "GGML_N_THREADS=2" >> $GITHUB_ENV
|
102 |
+
if: matrix.os == 'ubuntu-latest'
|
103 |
+
|
104 |
+
- name: Set GGML_N_THREADS for MacOS
|
105 |
+
run: echo "GGML_N_THREADS=2" >> $GITHUB_ENV
|
106 |
+
if: matrix.os == 'macos-latest'
|
107 |
+
|
108 |
+
- name: Create Build Environment
|
109 |
+
run: mkdir build
|
110 |
+
|
111 |
+
- name: Configure CMake
|
112 |
+
working-directory: ./build
|
113 |
+
run: cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DGGML_TEST_COVERAGE=ON ..
|
114 |
+
|
115 |
+
- name: Build
|
116 |
+
working-directory: ./build
|
117 |
+
run: make
|
118 |
+
|
119 |
+
- name: Test
|
120 |
+
working-directory: ./build
|
121 |
+
run: ctest --verbose --timeout 900
|
122 |
+
|
123 |
+
- name: Test Coverage for Ubuntu
|
124 |
+
if: matrix.os == 'ubuntu-latest'
|
125 |
+
working-directory: ./build
|
126 |
+
run: |
|
127 |
+
llvm-profdata merge -sparse tests/*.profraw -o ggml.profdata
|
128 |
+
llvm-cov report ./bin/test-grad0 -instr-profile=ggml.profdata
|
129 |
+
llvm-cov report ./bin/test-opt -instr-profile=ggml.profdata
|
130 |
+
|
131 |
+
- name: Test Coverage for MacOS
|
132 |
+
if: matrix.os == 'macos-latest'
|
133 |
+
working-directory: ./build
|
134 |
+
run: |
|
135 |
+
xcrun llvm-profdata merge -sparse tests/*.profraw -o ggml.profdata
|
136 |
+
xcrun llvm-cov report ./bin/test-grad0 -instr-profile=ggml.profdata
|
137 |
+
xcrun llvm-cov report ./bin/test-opt -instr-profile=ggml.profdata
|
stable-diffusion.cpp/ggml/.gitignore
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
build/
|
2 |
+
build-debug/
|
3 |
+
build-release/
|
4 |
+
build-sanitize-addr/
|
5 |
+
build-sanitize-thread/
|
6 |
+
build-cov/
|
7 |
+
build-ci-debug/
|
8 |
+
build-ci-release/
|
9 |
+
build-cublas/
|
10 |
+
out/
|
11 |
+
tmp/
|
12 |
+
models/
|
13 |
+
models-mnt
|
14 |
+
|
15 |
+
compile_commands.json
|
16 |
+
CMakeSettings.json
|
17 |
+
.vs/
|
18 |
+
.vscode/
|
19 |
+
.clangd
|
20 |
+
|
21 |
+
.exrc
|
22 |
+
.cache
|
23 |
+
.DS_Store
|
24 |
+
.stablelm
|
25 |
+
.gpt-2
|
26 |
+
|
27 |
+
src/arm_neon.h
|
28 |
+
tests/arm_neon.h
|
29 |
+
|
30 |
+
zig-out/
|
31 |
+
zig-cache/
|
32 |
+
|
33 |
+
*.dot
|
34 |
+
|
35 |
+
*.sw?
|
36 |
+
|
37 |
+
__pycache__/
|
stable-diffusion.cpp/ggml/CMakeLists.txt
ADDED
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
cmake_minimum_required (VERSION 3.3)
|
2 |
+
project(ggml VERSION 0.1.0)
|
3 |
+
|
4 |
+
set(CMAKE_EXPORT_COMPILE_COMMANDS "on")
|
5 |
+
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
6 |
+
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
|
7 |
+
|
8 |
+
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
|
9 |
+
set(GGML_STANDALONE ON)
|
10 |
+
include(cmake/GitVars.cmake)
|
11 |
+
include(cmake/BuildTypes.cmake)
|
12 |
+
else()
|
13 |
+
set(GGML_STANDALONE OFF)
|
14 |
+
endif()
|
15 |
+
|
16 |
+
if (EMSCRIPTEN)
|
17 |
+
set(BUILD_SHARED_LIBS_DEFAULT OFF)
|
18 |
+
else()
|
19 |
+
if (MINGW)
|
20 |
+
set(BUILD_SHARED_LIBS_DEFAULT OFF)
|
21 |
+
else()
|
22 |
+
set(BUILD_SHARED_LIBS_DEFAULT ON)
|
23 |
+
endif()
|
24 |
+
endif()
|
25 |
+
|
26 |
+
# options
|
27 |
+
|
28 |
+
option(BUILD_SHARED_LIBS "ggml: build shared libs" ${BUILD_SHARED_LIBS_DEFAULT})
|
29 |
+
|
30 |
+
option(GGML_ALL_WARNINGS "ggml: enable all compiler warnings" ON)
|
31 |
+
option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF)
|
32 |
+
|
33 |
+
option(GGML_SANITIZE_THREAD "ggml: enable thread sanitizer" OFF)
|
34 |
+
option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF)
|
35 |
+
option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
|
36 |
+
|
37 |
+
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
|
38 |
+
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
|
39 |
+
|
40 |
+
option(GGML_TEST_COVERAGE "ggml: enable test coverage" OFF)
|
41 |
+
|
42 |
+
option(GGML_PERF "ggml: enable perf timings" OFF)
|
43 |
+
option(GGML_NO_ACCELERATE "ggml: disable Accelerate framework" OFF)
|
44 |
+
option(GGML_OPENBLAS "ggml: use OpenBLAS" OFF)
|
45 |
+
option(GGML_CLBLAST "ggml: use clBLAST" OFF)
|
46 |
+
option(GGML_CUBLAS "ggml: use cuBLAS" OFF)
|
47 |
+
option(GGML_METAL "ggml: use Metal" OFF)
|
48 |
+
|
49 |
+
# sanitizers
|
50 |
+
|
51 |
+
if (GGML_SANITIZE_THREAD)
|
52 |
+
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=thread")
|
53 |
+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread")
|
54 |
+
endif()
|
55 |
+
|
56 |
+
if (GGML_SANITIZE_ADDRESS)
|
57 |
+
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
|
58 |
+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
|
59 |
+
endif()
|
60 |
+
|
61 |
+
if (GGML_SANITIZE_UNDEFINED)
|
62 |
+
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined")
|
63 |
+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined")
|
64 |
+
endif()
|
65 |
+
|
66 |
+
# instruction set specific
|
67 |
+
option(GGML_AVX "ggml: enable AVX" ON)
|
68 |
+
option(GGML_AVX2 "ggml: enable AVX2" ON)
|
69 |
+
option(GGML_AVX512 "ggml: enable AVX512" OFF)
|
70 |
+
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
|
71 |
+
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
|
72 |
+
option(GGML_FMA "ggml: enable FMA" ON)
|
73 |
+
# in MSVC F16C is implied with AVX2/AVX512
|
74 |
+
if (NOT MSVC)
|
75 |
+
option(GGML_F16C "ggml: enable F16C" ON)
|
76 |
+
endif()
|
77 |
+
|
78 |
+
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math")
|
79 |
+
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
|
80 |
+
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=native")
|
81 |
+
|
82 |
+
# warning flags
|
83 |
+
|
84 |
+
if (GGML_ALL_WARNINGS)
|
85 |
+
if (NOT MSVC)
|
86 |
+
set(c_flags -Wall -Wpedantic -Wformat=2 -Wno-unused -Wstrict-prototypes)
|
87 |
+
set(cxx_flags -Wall -Wpedantic -Wformat=2)
|
88 |
+
else()
|
89 |
+
# todo : windows
|
90 |
+
endif()
|
91 |
+
|
92 |
+
add_compile_options(
|
93 |
+
"$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
|
94 |
+
"$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
|
95 |
+
)
|
96 |
+
endif()
|
97 |
+
|
98 |
+
if (NOT MSVC)
|
99 |
+
add_compile_options(
|
100 |
+
"$<$<COMPILE_LANGUAGE:C>:-Werror=vla>"
|
101 |
+
"$<$<COMPILE_LANGUAGE:CXX>:-Werror=vla>"
|
102 |
+
"$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler;-Werror=vla>"
|
103 |
+
)
|
104 |
+
endif()
|
105 |
+
|
106 |
+
#
|
107 |
+
# POSIX conformance
|
108 |
+
#
|
109 |
+
|
110 |
+
# clock_gettime came in POSIX.1b (1993)
|
111 |
+
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
|
112 |
+
# posix_memalign came in POSIX.1-2001 / SUSv3
|
113 |
+
# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
|
114 |
+
add_compile_definitions(_XOPEN_SOURCE=600)
|
115 |
+
|
116 |
+
# Somehow in OpenBSD whenever POSIX conformance is specified
|
117 |
+
# some string functions rely on locale_t availability,
|
118 |
+
# which was introduced in POSIX.1-2008, forcing us to go higher
|
119 |
+
if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
|
120 |
+
remove_definitions(-D_XOPEN_SOURCE=600)
|
121 |
+
add_compile_definitions(_XOPEN_SOURCE=700)
|
122 |
+
endif()
|
123 |
+
|
124 |
+
# Data types, macros and functions related to controlling CPU affinity
|
125 |
+
# are available on Linux through GNU extensions in libc
|
126 |
+
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
127 |
+
add_compile_definitions(_GNU_SOURCE)
|
128 |
+
endif()
|
129 |
+
|
130 |
+
# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
|
131 |
+
# and on macOS its availability depends on enabling Darwin extensions
|
132 |
+
# similarly on DragonFly, enabling BSD extensions is necessary
|
133 |
+
if (CMAKE_SYSTEM_NAME MATCHES "Darwin")
|
134 |
+
add_compile_definitions(_DARWIN_C_SOURCE)
|
135 |
+
endif()
|
136 |
+
if (CMAKE_SYSTEM_NAME MATCHES "DragonFly")
|
137 |
+
add_compile_definitions(_DARWIN_C_SOURCE)
|
138 |
+
endif()
|
139 |
+
|
140 |
+
# alloca is a non-standard interface that is not visible on BSDs when
|
141 |
+
# POSIX conformance is specified, but not all of them provide a clean way
|
142 |
+
# to enable it in such cases
|
143 |
+
if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
|
144 |
+
add_compile_definitions(__BSD_VISIBLE)
|
145 |
+
endif()
|
146 |
+
if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
|
147 |
+
add_compile_definitions(_NETBSD_SOURCE)
|
148 |
+
endif()
|
149 |
+
if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
|
150 |
+
add_compile_definitions(_BSD_SOURCE)
|
151 |
+
endif()
|
152 |
+
|
153 |
+
if (WHISPER_PERF)
|
154 |
+
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
|
155 |
+
endif()
|
156 |
+
|
157 |
+
# dependencies
|
158 |
+
|
159 |
+
set(CMAKE_C_STANDARD 11)
|
160 |
+
set(CMAKE_CXX_STANDARD 11)
|
161 |
+
|
162 |
+
find_package(Threads REQUIRED)
|
163 |
+
|
164 |
+
# main
|
165 |
+
|
166 |
+
if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
|
167 |
+
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
|
168 |
+
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "RelWithDebInfo")
|
169 |
+
endif ()
|
170 |
+
|
171 |
+
if (GGML_BUILD_TESTS)
|
172 |
+
if (GGML_TEST_COVERAGE)
|
173 |
+
if (CMAKE_C_COMPILER_ID MATCHES "Clang")
|
174 |
+
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fprofile-instr-generate -fcoverage-mapping")
|
175 |
+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fprofile-instr-generate -fcoverage-mapping")
|
176 |
+
else()
|
177 |
+
message(WARNING "Test coverage is only supported for Clang")
|
178 |
+
endif()
|
179 |
+
endif()
|
180 |
+
endif()
|
181 |
+
|
182 |
+
add_subdirectory(src)
|
183 |
+
|
184 |
+
if (GGML_BUILD_TESTS)
|
185 |
+
enable_testing()
|
186 |
+
add_subdirectory(tests)
|
187 |
+
endif ()
|
188 |
+
|
189 |
+
if (GGML_BUILD_EXAMPLES)
|
190 |
+
add_subdirectory(examples)
|
191 |
+
endif ()
|
192 |
+
|
193 |
+
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in
|
194 |
+
${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
|
195 |
+
@ONLY)
|
196 |
+
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
|
197 |
+
DESTINATION share/pkgconfig)
|
stable-diffusion.cpp/ggml/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2022 Georgi Gerganov
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
stable-diffusion.cpp/ggml/README.md
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# ggml
|
2 |
+
|
3 |
+
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205)
|
4 |
+
|
5 |
+
Tensor library for machine learning
|
6 |
+
|
7 |
+
***Note that this project is under active development. \
|
8 |
+
Some of the development is currently happening in the [llama.cpp](https://github.com/ggerganov/llama.cpp) and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) repos***
|
9 |
+
|
10 |
+
## Features
|
11 |
+
|
12 |
+
- Written in C
|
13 |
+
- 16-bit float support
|
14 |
+
- Integer quantization support (4-bit, 5-bit, 8-bit, etc.)
|
15 |
+
- Automatic differentiation
|
16 |
+
- ADAM and L-BFGS optimizers
|
17 |
+
- Optimized for Apple Silicon
|
18 |
+
- On x86 architectures utilizes AVX / AVX2 intrinsics
|
19 |
+
- On ppc64 architectures utilizes VSX intrinsics
|
20 |
+
- No third-party dependencies
|
21 |
+
- Zero memory allocations during runtime
|
22 |
+
|
23 |
+
## Updates
|
24 |
+
|
25 |
+
- [X] Example of GPT-2 inference [examples/gpt-2](https://github.com/ggerganov/ggml/tree/master/examples/gpt-2)
|
26 |
+
- [X] Example of GPT-J inference [examples/gpt-j](https://github.com/ggerganov/ggml/tree/master/examples/gpt-j)
|
27 |
+
- [X] Example of Whisper inference [examples/whisper](https://github.com/ggerganov/ggml/tree/master/examples/whisper)
|
28 |
+
- [X] Support 4-bit integer quantization https://github.com/ggerganov/ggml/pull/27
|
29 |
+
- [X] Example of Cerebras-GPT inference [examples/gpt-2](https://github.com/ggerganov/ggml/tree/master/examples/gpt-2)
|
30 |
+
- [ ] Example of FLAN-T5 inference https://github.com/ggerganov/ggml/pull/12
|
31 |
+
- [X] Example of LLaMA inference [ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp)
|
32 |
+
- [X] Example of LLaMA training [ggerganov/llama.cpp/examples/baby-llama](https://github.com/ggerganov/llama.cpp/tree/master/examples/baby-llama)
|
33 |
+
- [X] Example of Falcon inference [cmp-nct/ggllm.cpp](https://github.com/cmp-nct/ggllm.cpp)
|
34 |
+
- [X] Example of BLOOM inference [NouamaneTazi/bloomz.cpp](https://github.com/NouamaneTazi/bloomz.cpp)
|
35 |
+
- [X] Example of RWKV inference [saharNooby/rwkv.cpp](https://github.com/saharNooby/rwkv.cpp)
|
36 |
+
- [X] Example of SAM inference [examples/sam](https://github.com/ggerganov/ggml/tree/master/examples/sam)
|
37 |
+
- [X] Idea for GPU support: https://github.com/ggerganov/llama.cpp/discussions/915
|
38 |
+
- [X] Example of StableLM (GPT-NeoX) inference [examples/gpt-neox](https://github.com/ggerganov/ggml/tree/master/examples/gpt-neox)
|
39 |
+
- [X] Example of BERT inference [skeskinen/bert.cpp](https://github.com/skeskinen/bert.cpp)
|
40 |
+
- [X] Example of 💫 StarCoder inference [examples/starcoder](https://github.com/ggerganov/ggml/tree/master/examples/starcoder)
|
41 |
+
- [X] Example of MPT inference [examples/mpt](https://github.com/ggerganov/ggml/tree/master/examples/mpt)
|
42 |
+
- [X] Example of Replit inference [examples/replit](https://github.com/ggerganov/ggml/tree/master/examples/replit)
|
43 |
+
- [X] Example of BioGPT inference [PABannier/biogpt.cpp](https://github.com/PABannier/biogpt.cpp)
|
44 |
+
- [X] Example of Encodec inference [PABannier/encodec.cpp](https://github.com/PABannier/encodec.cpp)
|
45 |
+
- [X] Example of CLIP inference [monatis/clip.cpp](https://github.com/monatis/clip.cpp)
|
46 |
+
- [X] Example of MiniGPT4 inference [Maknee/minigpt4.cpp](https://github.com/Maknee/minigpt4.cpp)
|
47 |
+
- [X] Example of ChatGLM inference [li-plus/chatglm.cpp](https://github.com/li-plus/chatglm.cpp)
|
48 |
+
- [X] Example of Stable Diffusion inference [leejet/stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp)
|
49 |
+
- [X] Example of Qwen inference [QwenLM/qwen.cpp](https://github.com/QwenLM/qwen.cpp)
|
50 |
+
|
51 |
+
## Whisper inference (example)
|
52 |
+
|
53 |
+
With ggml you can efficiently run [Whisper](examples/whisper) inference on the CPU.
|
54 |
+
|
55 |
+
Memory requirements:
|
56 |
+
|
57 |
+
| Model | Disk | Mem |
|
58 |
+
| --- | --- | --- |
|
59 |
+
| tiny | 75 MB | ~280 MB |
|
60 |
+
| base | 142 MB | ~430 MB |
|
61 |
+
| small | 466 MB | ~1.0 GB |
|
62 |
+
| medium | 1.5 GB | ~2.6 GB |
|
63 |
+
| large | 2.9 GB | ~4.7 GB |
|
64 |
+
|
65 |
+
## GPT inference (example)
|
66 |
+
|
67 |
+
With ggml you can efficiently run [GPT-2](examples/gpt-2) and [GPT-J](examples/gpt-j) inference on the CPU.
|
68 |
+
|
69 |
+
Here is how to run the example programs:
|
70 |
+
|
71 |
+
```bash
|
72 |
+
# Build ggml + examples
|
73 |
+
git clone https://github.com/ggerganov/ggml
|
74 |
+
cd ggml
|
75 |
+
mkdir build && cd build
|
76 |
+
cmake ..
|
77 |
+
make -j4 gpt-2 gpt-j
|
78 |
+
|
79 |
+
# Run the GPT-2 small 117M model
|
80 |
+
../examples/gpt-2/download-ggml-model.sh 117M
|
81 |
+
./bin/gpt-2 -m models/gpt-2-117M/ggml-model.bin -p "This is an example"
|
82 |
+
|
83 |
+
# Run the GPT-J 6B model (requires 12GB disk space and 16GB CPU RAM)
|
84 |
+
../examples/gpt-j/download-ggml-model.sh 6B
|
85 |
+
./bin/gpt-j -m models/gpt-j-6B/ggml-model.bin -p "This is an example"
|
86 |
+
|
87 |
+
# Install Python dependencies
|
88 |
+
python3 -m pip install -r ../requirements.txt
|
89 |
+
|
90 |
+
# Run the Cerebras-GPT 111M model
|
91 |
+
# Download from: https://huggingface.co/cerebras
|
92 |
+
python3 ../examples/gpt-2/convert-cerebras-to-ggml.py /path/to/Cerebras-GPT-111M/
|
93 |
+
./bin/gpt-2 -m /path/to/Cerebras-GPT-111M/ggml-model-f16.bin -p "This is an example"
|
94 |
+
```
|
95 |
+
|
96 |
+
The inference speeds that I get for the different models on my 32GB MacBook M1 Pro are as follows:
|
97 |
+
|
98 |
+
| Model | Size | Time / Token |
|
99 |
+
| --- | --- | --- |
|
100 |
+
| GPT-2 | 117M | 5 ms |
|
101 |
+
| GPT-2 | 345M | 12 ms |
|
102 |
+
| GPT-2 | 774M | 23 ms |
|
103 |
+
| GPT-2 | 1558M | 42 ms |
|
104 |
+
| --- | --- | --- |
|
105 |
+
| GPT-J | 6B | 125 ms |
|
106 |
+
|
107 |
+
For more information, checkout the corresponding programs in the [examples](examples) folder.
|
108 |
+
|
109 |
+
## Using Metal (only with GPT-2)
|
110 |
+
|
111 |
+
For GPT-2 models, offloading to GPU is possible. Note that it will not improve inference performances but will reduce power consumption and free up the CPU for other tasks.
|
112 |
+
|
113 |
+
To enable GPU offloading on MacOS:
|
114 |
+
|
115 |
+
```bash
|
116 |
+
cmake -DGGML_METAL=ON -DBUILD_SHARED_LIBS=Off ..
|
117 |
+
|
118 |
+
# add -ngl 1
|
119 |
+
./bin/gpt-2 -t 4 -ngl 100 -m models/gpt-2-117M/ggml-model.bin -p "This is an example"
|
120 |
+
```
|
121 |
+
|
122 |
+
## Using cuBLAS
|
123 |
+
|
124 |
+
```bash
|
125 |
+
# fix the path to point to your CUDA compiler
|
126 |
+
cmake -DGGML_CUBLAS=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.1/bin/nvcc ..
|
127 |
+
```
|
128 |
+
|
129 |
+
## Using clBLAST
|
130 |
+
|
131 |
+
```bash
|
132 |
+
cmake -DGGML_CLBLAST=ON ..
|
133 |
+
```
|
134 |
+
|
135 |
+
## Resources
|
136 |
+
|
137 |
+
- [GGML - Large Language Models for Everyone](https://github.com/rustformers/llm/blob/main/crates/ggml/README.md): a description of the GGML format provided by the maintainers of the `llm` Rust crate, which provides Rust bindings for GGML
|
138 |
+
- [marella/ctransformers](https://github.com/marella/ctransformers): Python bindings for GGML models.
|
139 |
+
- [go-skynet/go-ggml-transformers.cpp](https://github.com/go-skynet/go-ggml-transformers.cpp): Golang bindings for GGML models
|
140 |
+
- [smspillaz/ggml-gobject](https://github.com/smspillaz/ggml-gobject): GObject-introspectable wrapper for use of GGML on the GNOME platform.
|
stable-diffusion.cpp/ggml/build.zig
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
const std = @import("std");
const builtin = @import("builtin");

// Zig Version: 0.11.0
// Zig Build Command: zig build
// Zig Run Command: zig build -h
// zig build run_dolly-v2
// zig build run_gpt-2
// zig build run_gpt-j
// zig build run_gpt-neox
// zig build run_mnist
// zig build run_mpt
// zig build run_replit
// zig build run_starcoder
// zig build run_test-grad0
// zig build run_test-mul-mat0
// zig build run_test-mul-mat2
// zig build run_test-opt
// zig build run_test-vec1
// zig build run_test0
// zig build run_test1
// zig build run_test2
// zig build run_test3
// zig build run_zig_test0
// zig build run_zig_test1
// zig build run_zig_test2
// zig build run_zig_test3
//
// Build graph: one static ggml library, then one executable + "run_<name>"
// step per example, per C test, and per Zig test.
pub fn build(b: *std.build.Builder) void {
    const target = b.standardTargetOptions(.{});
    const optimize = b.standardOptimizeOption(.{});
    // static ggml core library that every example/test links against
    const lib = b.addStaticLibrary(.{
        .name = "ggml",
        .target = target,
        .optimize = optimize,
    });
    lib.addIncludePath(.{ .path = "./include" });
    lib.addIncludePath(.{ .path = "./include/ggml" });
    lib.addCSourceFiles(&.{
        "src/ggml.c",
    }, &.{"-std=c11"});
    lib.linkLibC();
    // libc++ is linked here so C++ consumers (the example main.cpp files)
    // can link against this library without extra flags
    lib.linkLibCpp();
    b.installArtifact(lib);

    // examples
    const examples = .{
        "dolly-v2",
        "gpt-2",
        "gpt-j",
        "gpt-neox",
        "mnist",
        "mpt",
        "replit",
        "starcoder",
        // "whisper",
    };
    inline for (examples) |name| {
        const exe = b.addExecutable(.{
            .name = name,
            .target = target,
            .optimize = optimize,
        });
        exe.addIncludePath(.{ .path = "./include" });
        exe.addIncludePath(.{ .path = "./include/ggml" });
        exe.addIncludePath(.{ .path = "./examples" });
        // exe.addIncludePath("./examples/whisper");
        exe.addCSourceFiles(&.{
            // each example has its own main.cpp; the common helpers are compiled in
            std.fmt.comptimePrint("examples/{s}/main.cpp", .{name}),
            "examples/common.cpp",
            "examples/common-ggml.cpp",
            // "examples/whisper/whisper.cpp",
        }, &.{"-std=c++11"});
        exe.linkLibrary(lib);
        b.installArtifact(exe);
        // "run_<name>" step: build, install, then run (forwarding `zig build -- args`)
        const run_cmd = b.addRunArtifact(exe);
        run_cmd.step.dependOn(b.getInstallStep());
        if (b.args) |args| run_cmd.addArgs(args);
        const run_step = b.step("run_" ++ name, "Run examples");
        run_step.dependOn(&run_cmd.step);
    }

    // tests
    // NOTE(review): the two lists differ only in "test-vec1", which is
    // presumably x86_64-only (SIMD) — confirm before enabling elsewhere
    const tests = if (builtin.target.cpu.arch == .x86_64) .{
        // "test-blas0",
        // "test-grad0",
        "test-mul-mat0",
        // "test-mul-mat1",
        "test-mul-mat2",
        // "test-opt",
        // "test-svd0",
        // "test-vec0",
        "test-vec1",
        // "test-vec2",
        "test0",
        "test1",
        "test2",
        "test3",
    } else .{
        // "test-blas0",
        // "test-grad0",
        "test-mul-mat0",
        // "test-mul-mat1",
        "test-mul-mat2",
        // "test-opt",
        // "test-svd0",
        // "test-vec0",
        // "test-vec1",
        // "test-vec2",
        "test0",
        "test1",
        "test2",
        "test3",
    };
    inline for (tests) |name| {
        const exe = b.addExecutable(.{
            .name = name,
            .target = target,
            .optimize = optimize,
        });
        exe.addIncludePath(.{ .path = "./include" });
        exe.addIncludePath(.{ .path = "./include/ggml" });
        exe.addCSourceFiles(&.{
            std.fmt.comptimePrint("tests/{s}.c", .{name}),
        }, &.{"-std=c11"});
        exe.linkLibrary(lib);
        b.installArtifact(exe);
        const run_cmd = b.addRunArtifact(exe);
        run_cmd.step.dependOn(b.getInstallStep());
        if (b.args) |args| run_cmd.addArgs(args);
        const run_step = b.step("run_" ++ name, "Run tests");
        run_step.dependOn(&run_cmd.step);
    }

    // zig_tests
    // tests written in Zig itself; step names are prefixed "run_zig_" to
    // avoid clashing with the C tests of the same name above
    const zig_tests = .{
        "test0",
        "test1",
        "test2",
        "test3",
    };
    inline for (zig_tests) |name| {
        const exe = b.addExecutable(.{
            .name = name,
            .root_source_file = .{ .path = std.fmt.comptimePrint("tests/{s}.zig", .{name}) },
            .target = target,
            .optimize = optimize,
        });
        exe.addIncludePath(.{ .path = "./include" });
        exe.addIncludePath(.{ .path = "./include/ggml" });
        exe.linkLibrary(lib);
        b.installArtifact(exe);
        const run_cmd = b.addRunArtifact(exe);
        run_cmd.step.dependOn(b.getInstallStep());
        if (b.args) |args| run_cmd.addArgs(args);
        const run_step = b.step("run_zig_" ++ name, "Run zig_tests");
        run_step.dependOn(&run_cmd.step);
    }
}
|
stable-diffusion.cpp/ggml/ci/run.sh
ADDED
@@ -0,0 +1,334 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
#
# ggml CI driver: builds in Debug/Release and runs a series of model smoke
# tests, writing logs, per-step exit codes, and a summary README.md into
# <output-dir>. Models are cached under <mnt-dir>.
#
# sample usage:
#
# mkdir tmp
#
# # CPU-only build
# bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with CUDA support
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#

if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
    exit 1
fi

mkdir -p "$1"
mkdir -p "$2"

OUT=$(realpath "$1")
MNT=$(realpath "$2")

# clean results from a previous run; -f keeps this quiet on the first run
rm -f -v "$OUT"/*.log
rm -f -v "$OUT"/*.exit
rm -f -v "$OUT"/*.md

# run everything from the repository root (parent of this script's directory)
sd=$(dirname "$0")
cd "$sd/.." || exit 1
SRC=$(pwd)
## helpers

# download url $2 into directory $1;
# wget -N skips the download when the local copy is up to date
function gg_wget {
    local out=$1
    local url=$2

    local cwd
    cwd=$(pwd)

    mkdir -p "$out"
    cd "$out" || return 1

    # should not re-download if file is the same
    wget -nv -N "$url"

    cd "$cwd" || return 1
}

# append printf-formatted text to the summary README in $OUT
function gg_printf {
    printf -- "$@" >> "$OUT/README.md"
}

# run CI step $1: execute gg_run_$1 with tracing, tee its output to a log,
# record its exit status, emit its summary, and fold the status into the
# global $ret (pipefail makes $? reflect gg_run_$1, not tee)
function gg_run {
    ci=$1

    set -o pipefail
    set -x

    gg_run_$ci | tee "$OUT/$ci.log"
    cur=$?
    echo "$cur" > "$OUT/$ci.exit"

    set +x
    set +o pipefail

    gg_sum_$ci

    ret=$((ret | cur))
}
+
## ci

# ctest_debug

# configure + build the project in Debug mode, then run the test suite;
# test-opt is excluded (too slow for a debug build)
function gg_run_ctest_debug {
    cd ${SRC}

    rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

    (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log

    set +e
}

# write the ctest_debug section (status + ctest log) of the summary README
function gg_sum_ctest_debug {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs ctest in debug mode\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
    gg_printf '```\n'
    gg_printf '\n'
}
103 |
+
# ctest_release

# configure + build the project in Release mode, then run the test suite;
# on low-perf machines (GG_BUILD_LOW_PERF set) the slow test-opt is skipped
function gg_run_ctest_release {
    cd ${SRC}

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

    # quote the expansion: an unquoted, whitespace-containing value would
    # break the [ -z ... ] test
    if [ -z "${GG_BUILD_LOW_PERF}" ]; then
        (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
    else
        (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
    fi

    set +e
}

# write the ctest_release section (status + ctest log) of the summary README
function gg_sum_ctest_release {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs ctest in release mode\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
    gg_printf '```\n'
}
134 |
+
# gpt_2

# download the GPT-2 117M model and run short text generation:
# tokenizer-test mode (-tt), a plain prompt, and batched decoding (-np 8)
function gg_run_gpt_2 {
    cd ${SRC}

    gg_wget models-mnt/gpt-2 https://huggingface.co/ggerganov/ggml/resolve/main/ggml-model-gpt-2-117M.bin

    cd build-ci-release

    set -e

    model="../models-mnt/gpt-2/ggml-model-gpt-2-117M.bin"
    prompts="../examples/prompts/gpt-2.txt"

    # fixed seed (-s 1234) keeps the runs reproducible
    (time ./bin/gpt-2 --model ${model} -s 1234 -n 64 -tt ${prompts} ) 2>&1 | tee -a $OUT/${ci}-tg.log
    (time ./bin/gpt-2 --model ${model} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log

    (time ./bin/gpt-2-batched --model ${model} -s 1234 -n 64 -np 8 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log

    set +e
}

# write the gpt_2 section (status + generation log) of the summary README
function gg_sum_gpt_2 {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs short GPT-2 text generation\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-tg.log)"
    gg_printf '```\n'
}
166 |
+
# mnist

# convert the bundled MNIST state dict to ggml format and run inference on
# the test images, once directly and once via the exported "mnist.ggml"
function gg_run_mnist {
    cd ${SRC}

    cd build-ci-release

    set -e

    mkdir -p models/mnist
    python3 ../examples/mnist/convert-h5-to-ggml.py ../examples/mnist/models/mnist/mnist_model.state_dict

    model_f32="./models/mnist/ggml-model-f32.bin"
    samples="../examples/mnist/models/mnist/t10k-images.idx3-ubyte"

    # first command runs and exports "mnist.ggml", the second command runs the exported model

    (time ./bin/mnist ${model_f32} ${samples} ) 2>&1 | tee -a $OUT/${ci}-mnist.log
    (time ./bin/mnist-cpu ./mnist.ggml ${samples} ) 2>&1 | tee -a $OUT/${ci}-mnist.log

    set +e
}

# write the mnist section (status + inference log) of the summary README
function gg_sum_mnist {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'MNIST\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-mnist.log)"
    gg_printf '```\n'
}
199 |
+
# whisper

# download the Whisper base.en model plus a sample clip, transcribe it,
# and verify the expected phrase appears in the output
function gg_run_whisper {
    cd ${SRC}

    gg_wget models-mnt/whisper/ https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin
    gg_wget models-mnt/whisper/ https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav

    cd build-ci-release

    set -e

    path_models="../models-mnt/whisper/"
    model_f16="${path_models}/ggml-base.en.bin"
    audio_0="${path_models}/jfk.wav"

    (time ./bin/whisper -m ${model_f16} -f ${audio_0} ) 2>&1 | tee -a $OUT/${ci}-main.log

    # under set -e, a missing phrase makes grep fail and aborts the step
    grep -q "And so my fellow Americans" $OUT/${ci}-main.log

    set +e
}

# write the whisper section (status + transcription log) of the summary README
function gg_sum_whisper {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs short Whisper transcription\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-main.log)"
    gg_printf '```\n'
}
232 |
+
# sam

# download SAM ViT-B weights and a test image, convert the weights to ggml,
# run segmentation, and check that the expected bounding box is reported
function gg_run_sam {
    cd ${SRC}

    gg_wget models-mnt/sam/ https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth
    gg_wget models-mnt/sam/ https://raw.githubusercontent.com/YavorGIvanov/sam.cpp/ceafb7467bff7ec98e0c4f952e58a9eb8fd0238b/img.jpg

    cd build-ci-release

    set -e

    path_models="../models-mnt/sam/"
    model_f16="${path_models}/ggml-model-f16.bin"
    img_0="${path_models}/img.jpg"

    python3 ../examples/sam/convert-pth-to-ggml.py ${path_models}/sam_vit_b_01ec64.pth ${path_models}/ 1

    (time ./bin/sam -m ${model_f16} -i ${img_0} ) 2>&1 | tee -a $OUT/${ci}-main.log

    # under set -e, a missing bbox line makes grep fail and aborts the step
    grep -q "bbox (371, 436), (144, 168)" $OUT/${ci}-main.log

    set +e
}

# write the sam section (status + segmentation log) of the summary README
function gg_sum_sam {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Run SAM\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-main.log)"
    gg_printf '```\n'
}
267 |
+
# mpt

# download MPT-7B (large weights), convert to ggml f16, quantize to q4_0,
# and run short text generation with both the f16 and q4_0 variants
function gg_run_mpt {
    cd ${SRC}

    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/config.json
    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/tokenizer.json
    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/tokenizer_config.json
    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/pytorch_model.bin.index.json
    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/configuration_mpt.py
    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/resolve/main/pytorch_model-00001-of-00002.bin
    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/resolve/main/pytorch_model-00002-of-00002.bin

    cd build-ci-release

    set -e

    path_models="../models-mnt/mpt/7B"
    model_f16="${path_models}/ggml-model-f16.bin"
    model_q4_0="${path_models}/ggml-model-q4_0.bin"

    python3 ../examples/mpt/convert-h5-to-ggml.py ${path_models} 1
    ./bin/mpt-quantize ${model_f16} ${model_q4_0} q4_0

    # fixed seed (-s 1234) keeps the runs reproducible
    (time ./bin/mpt --model ${model_f16} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
    (time ./bin/mpt --model ${model_q4_0} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log

    set +e
}

# write the mpt section (status + generation log) of the summary README
function gg_sum_mpt {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs short MPT text generation\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-tg.log)"
    gg_printf '```\n'
}
307 |
+
## main

# unless running on a low-perf machine, keep downloaded models on the mount
# dir and expose them through the ${SRC}/models-mnt symlink
if [ -z "${GG_BUILD_LOW_PERF}" ]; then
    rm -rf -- "${SRC:?}/models-mnt"

    mnt_models=${MNT}/models
    mkdir -p "${mnt_models}"
    ln -sfn "${mnt_models}" "${SRC}/models-mnt"
fi

python3 -m pip install -r "${SRC}/requirements.txt"

ret=0

# each step only runs while every previous step succeeded
test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release
test $ret -eq 0 && gg_run gpt_2
test $ret -eq 0 && gg_run mnist
test $ret -eq 0 && gg_run whisper
test $ret -eq 0 && gg_run sam

# MPT-7B downloads very large weights - only run it on capable machines
# (quoting the variables keeps [ ... ] valid for any value)
if [ -z "${GG_BUILD_LOW_PERF}" ]; then
    if [ -z "${GG_BUILD_VRAM_GB}" ] || [ "${GG_BUILD_VRAM_GB}" -ge 16 ]; then
        test $ret -eq 0 && gg_run mpt
    fi
fi

exit $ret
stable-diffusion.cpp/ggml/cmake/BuildTypes.cmake
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Add new build types
#
# Two extra configurations that behave like Release / RelWithDebInfo but do
# NOT define NDEBUG, so assert() stays active.

# ReleaseGG - Release with enabled asserts

set(CMAKE_CXX_FLAGS_RELEASEGG
    "-O3"
    CACHE STRING "Flags used by the c++ compiler during release builds with enabled asserts."
    FORCE )
set(CMAKE_C_FLAGS_RELEASEGG
    "-O3"
    CACHE STRING "Flags used by the compiler during release builds with enabled asserts."
    FORCE )
set(CMAKE_EXE_LINKER_FLAGS_RELEASEGG
    ""
    CACHE STRING "Flags used for linking binaries during release builds with enabled asserts."
    FORCE )
set(CMAKE_SHARED_LINKER_FLAGS_RELEASEGG
    ""
    CACHE STRING "Flags used by the shared libraries linker during release builds with enabled asserts."
    FORCE )
mark_as_advanced(
    CMAKE_CXX_FLAGS_RELEASEGG
    CMAKE_C_FLAGS_RELEASEGG
    CMAKE_EXE_LINKER_FLAGS_RELEASEGG
    CMAKE_SHARED_LINKER_FLAGS_RELEASEGG )

# RelWithDebInfoGG - RelWithDebInfo with enabled asserts

set(CMAKE_CXX_FLAGS_RELWITHDEBINFOGG
    "-O2 -g"
    CACHE STRING "Flags used by the c++ compiler during release builds with debug symbols and enabled asserts."
    FORCE )
set(CMAKE_C_FLAGS_RELWITHDEBINFOGG
    "-O2 -g"
    CACHE STRING "Flags used by the compiler during release builds with debug symbols and enabled asserts."
    FORCE )
set(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFOGG
    ""
    CACHE STRING "Flags used for linking binaries during release builds with debug symbols and enabled asserts."
    FORCE )
set(CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFOGG
    ""
    CACHE STRING "Flags used by the shared libraries linker during release builds with debug symbols and enabled asserts."
    FORCE )
mark_as_advanced(
    CMAKE_CXX_FLAGS_RELWITHDEBINFOGG
    CMAKE_C_FLAGS_RELWITHDEBINFOGG
    CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFOGG
    CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFOGG )

# default to Release on single-config generators when nothing was requested
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo" "ReleaseGG" "RelWithDebInfoGG")
endif()
|
stable-diffusion.cpp/ggml/cmake/GitVars.cmake
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
find_package(Git)

# the commit's SHA1
# (--match=NeVeRmAtCh never matches a tag, so `git describe --always`
# falls back to the bare abbreviated commit hash)
execute_process(COMMAND
    "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    OUTPUT_VARIABLE GIT_SHA1
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

# the date of the commit
execute_process(COMMAND
    "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    OUTPUT_VARIABLE GIT_DATE
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

# the subject of the commit
# NOTE: all three variables end up empty outside a git checkout (ERROR_QUIET)
execute_process(COMMAND
    "${GIT_EXECUTABLE}" log -1 --format=%s
    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
|
stable-diffusion.cpp/ggml/examples/CMakeLists.txt
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# optional extra warnings for the example sources (C++ only, non-MSVC)
if (GGML_ALL_WARNINGS)
    if (NOT MSVC)
        set(cxx_flags
            # TODO(marella): Add other warnings.
            -Wpedantic
            -Wunused-variable
            -Wno-unused-function
            -Wno-multichar
        )
        add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>")
    endif()
endif()

# shared helper libraries used by every example
add_library(common STATIC common.cpp)
target_include_directories(common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

add_library(common-ggml STATIC common-ggml.cpp)
target_link_libraries(common-ggml PRIVATE ggml)
target_include_directories(common-ggml PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

# one subdirectory per example program
add_subdirectory(gpt-2)
add_subdirectory(gpt-j)
add_subdirectory(whisper)
add_subdirectory(mnist)
add_subdirectory(gpt-neox)
add_subdirectory(dolly-v2)
add_subdirectory(replit)
add_subdirectory(mpt)
add_subdirectory(starcoder)
add_subdirectory(sam)
|
stable-diffusion.cpp/ggml/examples/common-ggml.cpp
ADDED
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include "common-ggml.h"

#include <regex>
#include <map>

// mapping from command-line ftype names to the corresponding ggml file type
static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
    {"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
    {"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
    {"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
    {"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
    {"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
};
13 |
+
|
14 |
+
// Print every supported ftype name together with its numeric value to fp.
void ggml_print_ftypes(FILE * fp) {
    for (const auto & entry : GGML_FTYPE_MAP) {
        fprintf(fp, " type = \"%s\" or %d\n", entry.first.c_str(), entry.second);
    }
}
19 |
+
|
20 |
+
enum ggml_ftype ggml_parse_ftype(const char * str) {
|
21 |
+
enum ggml_ftype ftype;
|
22 |
+
if (str[0] == 'q') {
|
23 |
+
const auto it = GGML_FTYPE_MAP.find(str);
|
24 |
+
if (it == GGML_FTYPE_MAP.end()) {
|
25 |
+
fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
|
26 |
+
return GGML_FTYPE_UNKNOWN;
|
27 |
+
}
|
28 |
+
ftype = it->second;
|
29 |
+
} else {
|
30 |
+
ftype = (enum ggml_ftype) atoi(str);
|
31 |
+
}
|
32 |
+
|
33 |
+
return ftype;
|
34 |
+
}
|
35 |
+
|
36 |
+
bool ggml_common_quantize_0(
|
37 |
+
std::ifstream & finp,
|
38 |
+
std::ofstream & fout,
|
39 |
+
const ggml_ftype ftype,
|
40 |
+
const std::vector<std::string> & to_quant,
|
41 |
+
const std::vector<std::string> & to_skip) {
|
42 |
+
|
43 |
+
ggml_type qtype = GGML_TYPE_F32;
|
44 |
+
|
45 |
+
switch (ftype) {
|
46 |
+
case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
|
47 |
+
case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
|
48 |
+
case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
|
49 |
+
case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
|
50 |
+
case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
|
51 |
+
case GGML_FTYPE_UNKNOWN:
|
52 |
+
case GGML_FTYPE_ALL_F32:
|
53 |
+
case GGML_FTYPE_MOSTLY_F16:
|
54 |
+
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
55 |
+
case GGML_FTYPE_MOSTLY_Q2_K:
|
56 |
+
case GGML_FTYPE_MOSTLY_Q3_K:
|
57 |
+
case GGML_FTYPE_MOSTLY_Q4_K:
|
58 |
+
case GGML_FTYPE_MOSTLY_Q5_K:
|
59 |
+
case GGML_FTYPE_MOSTLY_Q6_K:
|
60 |
+
{
|
61 |
+
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
|
62 |
+
return false;
|
63 |
+
}
|
64 |
+
};
|
65 |
+
|
66 |
+
if (!ggml_is_quantized(qtype)) {
|
67 |
+
fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));
|
68 |
+
return false;
|
69 |
+
}
|
70 |
+
|
71 |
+
size_t total_size_org = 0;
|
72 |
+
size_t total_size_new = 0;
|
73 |
+
|
74 |
+
std::vector<float> work;
|
75 |
+
|
76 |
+
std::vector<uint8_t> data_u8;
|
77 |
+
std::vector<ggml_fp16_t> data_f16;
|
78 |
+
std::vector<float> data_f32;
|
79 |
+
|
80 |
+
std::vector<int64_t> hist_all(1 << 4, 0);
|
81 |
+
|
82 |
+
while (true) {
|
83 |
+
int32_t n_dims;
|
84 |
+
int32_t length;
|
85 |
+
int32_t ttype;
|
86 |
+
|
87 |
+
finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
88 |
+
finp.read(reinterpret_cast<char *>(&length), sizeof(length));
|
89 |
+
finp.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
|
90 |
+
|
91 |
+
if (finp.eof()) {
|
92 |
+
break;
|
93 |
+
}
|
94 |
+
|
95 |
+
int32_t nelements = 1;
|
96 |
+
int32_t ne[4] = { 1, 1, 1, 1 };
|
97 |
+
for (int i = 0; i < n_dims; ++i) {
|
98 |
+
finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
99 |
+
nelements *= ne[i];
|
100 |
+
}
|
101 |
+
|
102 |
+
std::string name(length, 0);
|
103 |
+
finp.read (&name[0], length);
|
104 |
+
|
105 |
+
printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype));
|
106 |
+
|
107 |
+
bool quantize = false;
|
108 |
+
|
109 |
+
// check if we should quantize this tensor
|
110 |
+
for (const auto & s : to_quant) {
|
111 |
+
if (std::regex_match(name, std::regex(s))) {
|
112 |
+
quantize = true;
|
113 |
+
break;
|
114 |
+
}
|
115 |
+
}
|
116 |
+
|
117 |
+
// check if we should skip this tensor
|
118 |
+
for (const auto & s : to_skip) {
|
119 |
+
if (std::regex_match(name, std::regex(s))) {
|
120 |
+
quantize = false;
|
121 |
+
break;
|
122 |
+
}
|
123 |
+
}
|
124 |
+
|
125 |
+
// quantize only 2D tensors
|
126 |
+
quantize &= (n_dims == 2);
|
127 |
+
|
128 |
+
if (quantize) {
|
129 |
+
if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
|
130 |
+
fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
|
131 |
+
return false;
|
132 |
+
}
|
133 |
+
|
134 |
+
if (ttype == GGML_TYPE_F16) {
|
135 |
+
data_f16.resize(nelements);
|
136 |
+
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
|
137 |
+
data_f32.resize(nelements);
|
138 |
+
for (int i = 0; i < nelements; ++i) {
|
139 |
+
data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
|
140 |
+
}
|
141 |
+
} else {
|
142 |
+
data_f32.resize(nelements);
|
143 |
+
finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
|
144 |
+
}
|
145 |
+
|
146 |
+
ttype = qtype;
|
147 |
+
} else {
|
148 |
+
const int bpe = (ttype == 0) ? sizeof(float) : sizeof(uint16_t);
|
149 |
+
|
150 |
+
data_u8.resize(nelements*bpe);
|
151 |
+
finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
|
152 |
+
}
|
153 |
+
|
154 |
+
fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
155 |
+
fout.write(reinterpret_cast<char *>(&length), sizeof(length));
|
156 |
+
fout.write(reinterpret_cast<char *>(&ttype), sizeof(ttype));
|
157 |
+
for (int i = 0; i < n_dims; ++i) {
|
158 |
+
fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
159 |
+
}
|
160 |
+
fout.write(&name[0], length);
|
161 |
+
|
162 |
+
if (quantize) {
|
163 |
+
work.resize(nelements); // for quantization
|
164 |
+
|
165 |
+
size_t cur_size = 0;
|
166 |
+
std::vector<int64_t> hist_cur(1 << 4, 0);
|
167 |
+
|
168 |
+
switch ((ggml_type) ttype) {
|
169 |
+
case GGML_TYPE_Q4_0:
|
170 |
+
{
|
171 |
+
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
172 |
+
} break;
|
173 |
+
case GGML_TYPE_Q4_1:
|
174 |
+
{
|
175 |
+
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
176 |
+
} break;
|
177 |
+
case GGML_TYPE_Q5_0:
|
178 |
+
{
|
179 |
+
cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
180 |
+
} break;
|
181 |
+
case GGML_TYPE_Q5_1:
|
182 |
+
{
|
183 |
+
cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
184 |
+
} break;
|
185 |
+
case GGML_TYPE_Q8_0:
|
186 |
+
{
|
187 |
+
cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
188 |
+
} break;
|
189 |
+
case GGML_TYPE_F32:
|
190 |
+
case GGML_TYPE_F16:
|
191 |
+
case GGML_TYPE_I8:
|
192 |
+
case GGML_TYPE_I16:
|
193 |
+
case GGML_TYPE_I32:
|
194 |
+
case GGML_TYPE_Q8_1:
|
195 |
+
case GGML_TYPE_Q2_K:
|
196 |
+
case GGML_TYPE_Q3_K:
|
197 |
+
case GGML_TYPE_Q4_K:
|
198 |
+
case GGML_TYPE_Q5_K:
|
199 |
+
case GGML_TYPE_Q6_K:
|
200 |
+
case GGML_TYPE_Q8_K:
|
201 |
+
case GGML_TYPE_COUNT:
|
202 |
+
{
|
203 |
+
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
|
204 |
+
return false;
|
205 |
+
}
|
206 |
+
}
|
207 |
+
|
208 |
+
fout.write(reinterpret_cast<char *>(work.data()), cur_size);
|
209 |
+
total_size_new += cur_size;
|
210 |
+
|
211 |
+
printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
|
212 |
+
for (int i = 0; i < (int) hist_cur.size(); ++i) {
|
213 |
+
hist_all[i] += hist_cur[i];
|
214 |
+
}
|
215 |
+
|
216 |
+
for (int i = 0; i < (int) hist_cur.size(); ++i) {
|
217 |
+
printf("%5.3f ", hist_cur[i] / (float)nelements);
|
218 |
+
}
|
219 |
+
printf("\n");
|
220 |
+
} else {
|
221 |
+
printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
|
222 |
+
fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
|
223 |
+
total_size_new += data_u8.size();
|
224 |
+
}
|
225 |
+
|
226 |
+
total_size_org += nelements * sizeof(float);
|
227 |
+
}
|
228 |
+
|
229 |
+
printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
|
230 |
+
printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
|
231 |
+
|
232 |
+
{
|
233 |
+
int64_t sum_all = 0;
|
234 |
+
for (int i = 0; i < (int) hist_all.size(); ++i) {
|
235 |
+
sum_all += hist_all[i];
|
236 |
+
}
|
237 |
+
|
238 |
+
printf("%s: hist: ", __func__);
|
239 |
+
for (int i = 0; i < (int) hist_all.size(); ++i) {
|
240 |
+
printf("%5.3f ", hist_all[i] / (float)sum_all);
|
241 |
+
}
|
242 |
+
printf("\n");
|
243 |
+
}
|
244 |
+
|
245 |
+
return true;
|
246 |
+
}
|
stable-diffusion.cpp/ggml/examples/common-ggml.h
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#pragma once
|
2 |
+
|
3 |
+
#include "ggml.h"
|
4 |
+
|
5 |
+
#include <fstream>
|
6 |
+
#include <vector>
|
7 |
+
#include <string>
|
8 |
+
|
9 |
+
enum ggml_ftype ggml_parse_ftype(const char * str);
|
10 |
+
|
11 |
+
void ggml_print_ftypes(FILE * fp = stderr);
|
12 |
+
|
13 |
+
bool ggml_common_quantize_0(
|
14 |
+
std::ifstream & finp,
|
15 |
+
std::ofstream & fout,
|
16 |
+
const ggml_ftype ftype,
|
17 |
+
const std::vector<std::string> & to_quant,
|
18 |
+
const std::vector<std::string> & to_skip);
|
stable-diffusion.cpp/ggml/examples/common.cpp
ADDED
@@ -0,0 +1,817 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#define _USE_MATH_DEFINES // for M_PI
|
2 |
+
|
3 |
+
#include "common.h"
|
4 |
+
|
5 |
+
// third-party utilities
|
6 |
+
// use your favorite implementations
|
7 |
+
#define DR_WAV_IMPLEMENTATION
|
8 |
+
#include "dr_wav.h"
|
9 |
+
|
10 |
+
#include <cmath>
|
11 |
+
#include <cstring>
|
12 |
+
#include <fstream>
|
13 |
+
#include <regex>
|
14 |
+
#include <locale>
|
15 |
+
#include <codecvt>
|
16 |
+
#include <sstream>
|
17 |
+
|
18 |
+
#if defined(_MSC_VER)
|
19 |
+
#pragma warning(disable: 4244 4267) // possible loss of data
|
20 |
+
#endif
|
21 |
+
|
22 |
+
// Function to check if the next argument exists
|
23 |
+
std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) {
|
24 |
+
if (i + 1 < argc && argv[i + 1][0] != '-') {
|
25 |
+
return argv[++i];
|
26 |
+
} else {
|
27 |
+
fprintf(stderr, "error: %s requires one argument.\n", flag.c_str());
|
28 |
+
gpt_print_usage(argc, argv, params);
|
29 |
+
exit(0);
|
30 |
+
}
|
31 |
+
}
|
32 |
+
|
33 |
+
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
34 |
+
for (int i = 1; i < argc; i++) {
|
35 |
+
std::string arg = argv[i];
|
36 |
+
|
37 |
+
if (arg == "-s" || arg == "--seed") {
|
38 |
+
params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
39 |
+
} else if (arg == "-t" || arg == "--threads") {
|
40 |
+
params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
41 |
+
} else if (arg == "-p" || arg == "--prompt") {
|
42 |
+
params.prompt = get_next_arg(i, argc, argv, arg, params);
|
43 |
+
} else if (arg == "-n" || arg == "--n_predict") {
|
44 |
+
params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
45 |
+
} else if (arg == "-np" || arg == "--n_parallel") {
|
46 |
+
params.n_parallel = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
47 |
+
} else if (arg == "--top_k") {
|
48 |
+
params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
49 |
+
} else if (arg == "--top_p") {
|
50 |
+
params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params));
|
51 |
+
} else if (arg == "--temp") {
|
52 |
+
params.temp = std::stof(get_next_arg(i, argc, argv, arg, params));
|
53 |
+
} else if (arg == "--repeat-last-n") {
|
54 |
+
params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
55 |
+
} else if (arg == "--repeat-penalty") {
|
56 |
+
params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params));
|
57 |
+
} else if (arg == "-b" || arg == "--batch_size") {
|
58 |
+
params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params));
|
59 |
+
} else if (arg == "-c" || arg == "--context") {
|
60 |
+
params.n_ctx= std::stoi(get_next_arg(i, argc, argv, arg, params));
|
61 |
+
} else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
|
62 |
+
params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
63 |
+
} else if (arg == "--ignore-eos") {
|
64 |
+
params.ignore_eos = true;
|
65 |
+
} else if (arg == "-m" || arg == "--model") {
|
66 |
+
params.model = get_next_arg(i, argc, argv, arg, params);
|
67 |
+
} else if (arg == "-i" || arg == "--interactive") {
|
68 |
+
params.interactive = true;
|
69 |
+
} else if (arg == "-ip" || arg == "--interactive-port") {
|
70 |
+
params.interactive = true;
|
71 |
+
params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
72 |
+
} else if (arg == "-h" || arg == "--help") {
|
73 |
+
gpt_print_usage(argc, argv, params);
|
74 |
+
exit(0);
|
75 |
+
} else if (arg == "-f" || arg == "--file") {
|
76 |
+
get_next_arg(i, argc, argv, arg, params);
|
77 |
+
std::ifstream file(argv[i]);
|
78 |
+
if (!file) {
|
79 |
+
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
80 |
+
break;
|
81 |
+
}
|
82 |
+
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
|
83 |
+
if (params.prompt.back() == '\n') {
|
84 |
+
params.prompt.pop_back();
|
85 |
+
}
|
86 |
+
} else if (arg == "-tt" || arg == "--token_test") {
|
87 |
+
params.token_test = get_next_arg(i, argc, argv, arg, params);
|
88 |
+
}
|
89 |
+
else {
|
90 |
+
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
91 |
+
gpt_print_usage(argc, argv, params);
|
92 |
+
exit(0);
|
93 |
+
}
|
94 |
+
}
|
95 |
+
|
96 |
+
return true;
|
97 |
+
}
|
98 |
+
|
99 |
+
// Print the help/usage text for the shared GPT example options to stderr.
// Default values are taken from the params instance passed in.
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h, --help            show this help message and exit\n");
    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1)\n");
    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
    fprintf(stderr, "                        prompt to start generation with (default: random)\n");
    fprintf(stderr, "  -f FNAME, --file FNAME\n");
    fprintf(stderr, "                        load prompt from a file\n");
    fprintf(stderr, "  -tt TOKEN_TEST, --token_test TOKEN_TEST\n");
    fprintf(stderr, "                        test tokenization\n");
    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d)\n", params.n_predict);
    fprintf(stderr, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", params.top_p);
    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", params.temp);
    fprintf(stderr, "  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n);
    fprintf(stderr, "  --repeat-penalty N    penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty);
    fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
    fprintf(stderr, "  -c N, --context N     context / KV cache size (default: %d)\n", params.n_ctx);
    fprintf(stderr, "  --ignore-eos          ignore EOS token during generation\n");
    fprintf(stderr, "  -ngl N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers);
    fprintf(stderr, "  -m FNAME, --model FNAME\n");
    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
    fprintf(stderr, "\n");
}
|
126 |
+
|
127 |
+
// Return one of ten fixed prompt starters, chosen uniformly from rng.
// The "To" default arm is unreachable in practice (rng() % 10 is always
// 0..9) but keeps the switch exhaustive for the compiler; the trailing
// unreachable `return "The";` that used to follow the switch was removed.
std::string gpt_random_prompt(std::mt19937 & rng) {
    const int r = rng() % 10;
    switch (r) {
        case 0: return "So";
        case 1: return "Once upon a time";
        case 2: return "When";
        case 3: return "The";
        case 4: return "After";
        case 5: return "If";
        case 6: return "import";
        case 7: return "He";
        case 8: return "She";
        case 9: return "They";
        default: return "To";
    }
}
|
145 |
+
|
146 |
+
// Return s with leading and trailing whitespace removed.
// Uses find_first_not_of/find_last_not_of over the same character class as
// the regex \s (" \t\n\v\f\r") instead of constructing a std::regex on every
// call; an all-whitespace or empty input yields "".
std::string trim(const std::string & s) {
    static const char * k_whitespace = " \t\n\v\f\r";
    const size_t first = s.find_first_not_of(k_whitespace);
    if (first == std::string::npos) {
        return "";
    }
    const size_t last = s.find_last_not_of(k_whitespace);
    return s.substr(first, last - first + 1);
}
|
150 |
+
|
151 |
+
// Return s with every non-overlapping occurrence of `from` replaced by `to`.
// Advancing past the inserted text means replacements never re-match inside
// `to`. An empty `from` returns s unchanged - without this guard the loop
// below never terminates (find("") always succeeds and pos never advances
// when `to` is also empty).
std::string replace(const std::string & s, const std::string & from, const std::string & to) {
    if (from.empty()) {
        return s;
    }
    std::string result = s;
    size_t pos = 0;
    while ((pos = result.find(from, pos)) != std::string::npos) {
        result.replace(pos, from.length(), to);
        pos += to.length();
    }
    return result;
}
|
160 |
+
|
161 |
+
// Register a token string (e.g. "<|endoftext|>") that the tokenizer must
// match verbatim before applying the regular word-splitting rules.
void gpt_vocab::add_special_token(const std::string & token) {
    special_tokens.push_back(token);
}
|
164 |
+
|
165 |
+
// Minimal ad-hoc parser for flat JSON vocab files of the form
// { "token": id, ... }. Returns an empty map when the content does not start
// with '{'; exits the process if the file cannot be opened.
// NOTE: this is not a general JSON parser - it handles only a single level
// of string keys with integer (or unquoted) values, plus a few GPT-2
// byte-level escapes in the keys.
std::map<std::string, int32_t> json_parse(const std::string & fname) {
    std::map<std::string, int32_t> result;

    // read file into string
    std::string json;
    {
        std::ifstream ifs(fname);
        if (!ifs) {
            fprintf(stderr, "Failed to open %s\n", fname.c_str());
            exit(1);
        }

        json = std::string((std::istreambuf_iterator<char>(ifs)),
                (std::istreambuf_iterator<char>()));
    }

    if (json[0] != '{') {
        return result;
    }

    // parse json
    {
        bool has_key = false;  // true once a key has been read and a value is expected
        bool in_token = false; // true while inside a quoted string

        std::string str_key = "";
        std::string str_val = "";

        int n = json.size();
        for (int i = 1; i < n; ++i) {
            if (!in_token) {
                if (json[i] == ' ') continue;
                if (json[i] == '"') {
                    in_token = true;
                    continue;
                }
            } else {
                // keep backslash escapes verbatim (decoded further below)
                if (json[i] == '\\' && i+1 < n) {
                    if (has_key == false) {
                        str_key += json[i];
                    } else {
                        str_val += json[i];
                    }
                    ++i;
                } else if (json[i] == '"') {
                    if (has_key == false) {
                        // closing quote of a key: skip whitespace, the ':'
                        // separator, and whitespace before the value
                        has_key = true;
                        ++i;
                        while (json[i] == ' ') ++i;
                        ++i; // :
                        while (json[i] == ' ') ++i;
                        if (json[i] != '\"') {
                            // unquoted value: consume up to ',' or '}'
                            while (json[i] != ',' && json[i] != '}') {
                                str_val += json[i++];
                            }
                            has_key = false;
                        } else {
                            // quoted value follows; stay in string mode
                            in_token = true;
                            continue;
                        }
                    } else {
                        has_key = false;
                    }

                    // decode the GPT-2 byte-level escapes used in vocab keys
                    str_key = ::replace(str_key, "\\u0120", " " ); // \u0120 -> space
                    str_key = ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
                    str_key = ::replace(str_key, "\\\"", "\""); // \\\" -> "

                    // non-numeric values are silently ignored
                    try {
                        result[str_key] = std::stoi(str_val);
                    } catch (...) {
                        //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());

                    }
                    str_key = "";
                    str_val = "";
                    in_token = false;
                    continue;
                }
            }
            if (has_key == false) {
                str_key += json[i];
            } else {
                str_val += json[i];
            }
        }
    }

    return result;
}
|
255 |
+
|
256 |
+
// Convert a wide string to a UTF-8 encoded std::string.
// NOTE(review): std::wstring_convert / std::codecvt_utf8 are deprecated since
// C++17; kept as-is to match the project's current toolchain requirements.
std::string convert_to_utf8(const std::wstring & input) {
    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
    return converter.to_bytes(input);
}
|
260 |
+
|
261 |
+
|
262 |
+
// Convert a UTF-8 encoded std::string to a wide string.
// NOTE(review): std::wstring_convert / std::codecvt_utf8 are deprecated since
// C++17; kept as-is to match the project's current toolchain requirements.
std::wstring convert_to_wstring(const std::string & input) {
    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
    return converter.from_bytes(input);
}
|
266 |
+
|
267 |
+
// Split str into GPT-2 style word pieces (common contractions, optional-space
// letter runs, digit runs, punctuation runs, and whitespace) and append each
// piece to words in order.
void gpt_split_words(std::string str, std::vector<std::string>& words) {
    const std::regex word_re(R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)");

    std::smatch match;
    while (std::regex_search(str, match, word_re)) {
        for (const auto & piece : match) {
            words.push_back(piece);
        }
        str = match.suffix();
    }
}
|
279 |
+
|
280 |
+
// Tokenize text against vocab using a greedy longest-match strategy:
//   1) split off verbatim occurrences of vocab.special_tokens (if any),
//   2) split the remaining text into GPT-style word pieces (gpt_split_words),
//   3) for each word, repeatedly take the longest prefix found in
//      vocab.token_to_id; single characters with no match are skipped
//      with a warning on stderr.
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
    std::vector<std::string> words;

    // first split the text into words
    {
        std::string str = text;

        // Generate the subpattern from the special_tokens vector if it's not empty
        if (!vocab.special_tokens.empty()) {
            // escape regex metacharacters so special tokens match literally
            const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])");
            std::string special_tokens_subpattern;
            for (const auto & token : vocab.special_tokens) {
                if (!special_tokens_subpattern.empty()) {
                    special_tokens_subpattern += "|";
                }
                special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)");
            }

            std::regex re(special_tokens_subpattern);
            std::smatch m;
            // Split the text by special tokens.
            while (std::regex_search(str, m, re)) {
                // Split the substrings in-between special tokens into words.
                gpt_split_words(m.prefix(), words);
                // Add matched special tokens as words.
                for (auto x : m) {
                    words.push_back(x);
                }
                str = m.suffix();
            }
            // Remaining text without special tokens will be handled below.
        }

        gpt_split_words(str, words);
    }

    // find the longest token that forms each word in words:
    std::vector<gpt_vocab::id> tokens;
    for (const auto & word : words) {
        for (int i = 0; i < (int) word.size(); ){
            // try candidates from the longest suffix range down to one char
            for (int j = word.size() - 1; j >= i; j--){
                auto cand = word.substr(i, j-i+1);
                auto it = vocab.token_to_id.find(cand);
                if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab
                    tokens.push_back(it->second);
                    i = j + 1;
                    break;
                }
                else if (j == i){ // word.substr(i, 1) has no matching
                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
                    i++;
                }
            }
        }
    }

    return tokens;
}
|
338 |
+
|
339 |
+
std::vector<gpt_vocab::id> parse_tokens_from_string(const std::string& input, char delimiter) {
|
340 |
+
std::vector<gpt_vocab::id> output;
|
341 |
+
std::stringstream ss(input);
|
342 |
+
std::string token;
|
343 |
+
|
344 |
+
while (std::getline(ss, token, delimiter)) {
|
345 |
+
output.push_back(std::stoi(token));
|
346 |
+
}
|
347 |
+
|
348 |
+
return output;
|
349 |
+
}
|
350 |
+
|
351 |
+
// Load tokenizer reference cases from fpath_test. Each line has the form
// "<text> => <id>,<id>,..."; lines without the " => " separator are skipped.
// An empty path yields an empty map after a warning on stderr.
std::map<std::string, std::vector<gpt_vocab::id>> extract_tests_from_file(const std::string & fpath_test){
    if (fpath_test.empty()){
        fprintf(stderr, "%s : No test file found.\n", __func__);
        return std::map<std::string, std::vector<gpt_vocab::id>>();
    }

    std::map<std::string, std::vector<gpt_vocab::id>> tests;

    const std::string separator = " => ";
    const char token_delim = ',';

    std::ifstream fin(fpath_test, std::ios_base::in);
    for (std::string line; std::getline(fin, line); ) {
        const size_t sep_pos = line.find(separator);
        if (sep_pos == std::string::npos) {
            continue;
        }
        const std::string text     = line.substr(0, sep_pos);
        const std::string s_tokens = line.substr(sep_pos + separator.size());
        tests[text] = parse_tokens_from_string(s_tokens, token_delim);
    }

    return tests;
}
|
373 |
+
|
374 |
+
// Run the tokenizer against the reference cases in fpath_test and report each
// mismatch on stderr (the expected "hf" tokens vs the ones produced here),
// followed by a summary line with the failure count.
void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test){
    std::map<std::string, std::vector<gpt_vocab::id>> tests = extract_tests_from_file(fpath_test);

    size_t n_fails = 0;

    for (const auto & test : tests) {
        std::vector<gpt_vocab::id> tokens = gpt_tokenize(vocab, test.first);

        if (tokens != test.second){
            n_fails++;

            // print out failure cases
            fprintf(stderr, "%s : failed test: '%s'\n", __func__, test.first.c_str());
            fprintf(stderr, "%s : tokens in hf:   ", __func__);
            for (const auto & t : test.second) {
                fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t);
            }
            fprintf(stderr, "\n");
            fprintf(stderr, "%s : tokens in ggml: ", __func__);
            for (const auto & t : tokens) {
                fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t);
            }
            fprintf(stderr, "\n");
        }
    }

    fprintf(stderr, "%s : %zu tests failed out of %zu tests.\n", __func__, n_fails, tests.size());
}
|
402 |
+
|
403 |
+
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
|
404 |
+
printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
|
405 |
+
|
406 |
+
vocab.token_to_id = ::json_parse(fname);
|
407 |
+
|
408 |
+
for (const auto & kv : vocab.token_to_id) {
|
409 |
+
vocab.id_to_token[kv.second] = kv.first;
|
410 |
+
}
|
411 |
+
|
412 |
+
printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());
|
413 |
+
|
414 |
+
// print the vocabulary
|
415 |
+
//for (auto kv : vocab.token_to_id) {
|
416 |
+
// printf("'%s' -> %d\n", kv.first.data(), kv.second);
|
417 |
+
//}
|
418 |
+
|
419 |
+
return true;
|
420 |
+
}
|
421 |
+
|
422 |
+
// Sample a token id from logits using top-k followed by top-p (nucleus)
// filtering at the given temperature; rng drives the final categorical draw.
gpt_vocab::id gpt_sample_top_k_top_p(
        const gpt_vocab & vocab,
        const float * logits,
        int top_k,
        double top_p,
        double temp,
        std::mt19937 & rng) {
    int n_logits = vocab.id_to_token.size();

    // clamp top_k to the vocabulary size: std::partial_sort with a middle
    // iterator past end() is undefined behavior
    if (top_k > n_logits) {
        top_k = n_logits;
    }

    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
    logits_id.reserve(n_logits);

    {
        const double scale = 1.0/temp;
        for (int i = 0; i < n_logits; ++i) {
            logits_id.push_back(std::make_pair(logits[i]*scale, i));
        }
    }

    // find the top K tokens
    std::partial_sort(
            logits_id.begin(),
            logits_id.begin() + top_k, logits_id.end(),
            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
        return a.first > b.first;
    });

    logits_id.resize(top_k);

    double maxl = -INFINITY;
    for (const auto & kv : logits_id) {
        maxl = std::max(maxl, kv.first);
    }

    // compute probs for the top K tokens (softmax shifted by maxl for
    // numerical stability)
    std::vector<double> probs;
    probs.reserve(logits_id.size());

    double sum = 0.0;
    for (const auto & kv : logits_id) {
        double p = exp(kv.first - maxl);
        probs.push_back(p);
        sum += p;
    }

    // normalize the probs
    for (auto & p : probs) {
        p /= sum;
    }

    if (top_p < 1.0f) {
        // keep the smallest prefix of tokens whose cumulative probability
        // reaches top_p, then renormalize
        double cumsum = 0.0f;
        for (int i = 0; i < top_k; i++) {
            cumsum += probs[i];
            if (cumsum >= top_p) {
                top_k = i + 1;
                probs.resize(top_k);
                logits_id.resize(top_k);
                break;
            }
        }

        cumsum = 1.0/cumsum;
        for (int i = 0; i < (int) probs.size(); i++) {
            probs[i] *= cumsum;
        }
    }

    std::discrete_distribution<> dist(probs.begin(), probs.end());
    int idx = dist(rng);

    return logits_id[idx].second;
}
|
501 |
+
|
502 |
+
// Sample a token id from logits with top-k / top-p filtering, temperature,
// and a CTRL-style repetition penalty over the last repeat_last_n tokens.
// temp <= 0 degenerates to a greedy argmax over the raw logits.
gpt_vocab::id gpt_sample_top_k_top_p_repeat(
        const gpt_vocab & vocab,
        const float * logits,
        const int32_t * last_n_tokens_data,
        size_t last_n_tokens_data_size,
        int top_k,
        double top_p,
        double temp,
        int repeat_last_n,
        float repeat_penalty,
        std::mt19937 & rng) {

    int n_logits = vocab.id_to_token.size();

    // clamp top_k to the vocabulary size: std::partial_sort with a middle
    // iterator past end() is undefined behavior
    if (top_k > n_logits) {
        top_k = n_logits;
    }

    const auto * plogits = logits;

    const auto last_n_tokens = std::vector<int32_t>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_data_size);

    if (temp <= 0) {
        // select the token with the highest logit directly
        float max_logit = plogits[0];
        gpt_vocab::id max_id = 0;

        for (int i = 1; i < n_logits; ++i) {
            if (plogits[i] > max_logit) {
                max_logit = plogits[i];
                max_id = i;
            }
        }
        return max_id;
    }


    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
    logits_id.reserve(n_logits);

    {
        const float scale = 1.0f/temp;
        // NOTE(review): assumes repeat_last_n <= last_n_tokens_data_size;
        // otherwise end()-repeat_last_n walks before begin() - verify callers
        for (int i = 0; i < n_logits; ++i) {
            // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
            // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
            if (repeat_last_n > 0 && std::find(last_n_tokens.end()-repeat_last_n, last_n_tokens.end(), i) != last_n_tokens.end()) {
                // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
                if (plogits[i] < 0.0f) {
                    logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
                } else {
                    logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
                }
            } else {
                logits_id.push_back(std::make_pair(plogits[i]*scale, i));
            }
        }
    }

    // find the top K tokens
    std::partial_sort(
            logits_id.begin(),
            logits_id.begin() + top_k, logits_id.end(),
            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
        return a.first > b.first;
    });

    logits_id.resize(top_k);

    double maxl = -INFINITY;
    for (const auto & kv : logits_id) {
        maxl = std::max(maxl, kv.first);
    }

    // compute probs for the top K tokens (softmax shifted by maxl for
    // numerical stability)
    std::vector<double> probs;
    probs.reserve(logits_id.size());

    double sum = 0.0;
    for (const auto & kv : logits_id) {
        double p = exp(kv.first - maxl);
        probs.push_back(p);
        sum += p;
    }

    // normalize the probs
    for (auto & p : probs) {
        p /= sum;
    }

    if (top_p < 1.0f) {
        // keep the smallest prefix of tokens whose cumulative probability
        // reaches top_p, then renormalize
        double cumsum = 0.0f;
        for (int i = 0; i < top_k; i++) {
            cumsum += probs[i];
            if (cumsum >= top_p) {
                top_k = i + 1;
                probs.resize(top_k);
                logits_id.resize(top_k);
                break;
            }
        }

        cumsum = 1.0/cumsum;
        for (int i = 0; i < (int) probs.size(); i++) {
            probs[i] *= cumsum;
        }
    }

    std::discrete_distribution<> dist(probs.begin(), probs.end());
    int idx = dist(rng);

    return logits_id[idx].second;

}
|
617 |
+
|
618 |
+
// Read a 16-bit PCM WAV file (or stdin when fname == "-") into mono float
// samples in [-1, 1). The file must be mono or stereo at COMMON_SAMPLE_RATE;
// when stereo == true it must be 2-channel and the individual channels are
// additionally written to pcmf32s (used for diarization).
// Returns false (with a message on stderr) on any validation or open error.
bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
    drwav wav;
    std::vector<uint8_t> wav_data; // used for pipe input from stdin

    if (fname == "-") {
        {
            // slurp all of stdin into memory before handing it to drwav
            uint8_t buf[1024];
            while (true)
            {
                const size_t n = fread(buf, 1, sizeof(buf), stdin);
                if (n == 0) {
                    break;
                }
                wav_data.insert(wav_data.end(), buf, buf + n);
            }
        }

        if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
            fprintf(stderr, "error: failed to open WAV file from stdin\n");
            return false;
        }

        fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
    }
    else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
        fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
        return false;
    }

    if (wav.channels != 1 && wav.channels != 2) {
        fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
        return false;
    }

    if (stereo && wav.channels != 2) {
        fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
        return false;
    }

    if (wav.sampleRate != COMMON_SAMPLE_RATE) {
        fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
        return false;
    }

    if (wav.bitsPerSample != 16) {
        fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
        return false;
    }

    // NOTE(review): for stdin input the frame count is estimated from the raw
    // byte size, which still includes the WAV header - a slight over-count
    const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);

    std::vector<int16_t> pcm16;
    pcm16.resize(n*wav.channels);
    drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
    drwav_uninit(&wav);

    // convert to mono, float
    pcmf32.resize(n);
    if (wav.channels == 1) {
        for (uint64_t i = 0; i < n; i++) {
            pcmf32[i] = float(pcm16[i])/32768.0f;
        }
    } else {
        // average the two channels into the mono buffer
        for (uint64_t i = 0; i < n; i++) {
            pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
        }
    }

    if (stereo) {
        // convert to stereo, float
        pcmf32s.resize(2);

        pcmf32s[0].resize(n);
        pcmf32s[1].resize(n);
        for (uint64_t i = 0; i < n; i++) {
            pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
            pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
        }
    }

    return true;
}
|
700 |
+
|
701 |
+
void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
|
702 |
+
const float rc = 1.0f / (2.0f * M_PI * cutoff);
|
703 |
+
const float dt = 1.0f / sample_rate;
|
704 |
+
const float alpha = dt / (rc + dt);
|
705 |
+
|
706 |
+
float y = data[0];
|
707 |
+
|
708 |
+
for (size_t i = 1; i < data.size(); i++) {
|
709 |
+
y = alpha * (y + data[i] - data[i - 1]);
|
710 |
+
data[i] = y;
|
711 |
+
}
|
712 |
+
}
|
713 |
+
|
714 |
+
bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
|
715 |
+
const int n_samples = pcmf32.size();
|
716 |
+
const int n_samples_last = (sample_rate * last_ms) / 1000;
|
717 |
+
|
718 |
+
if (n_samples_last >= n_samples) {
|
719 |
+
// not enough samples - assume no speech
|
720 |
+
return false;
|
721 |
+
}
|
722 |
+
|
723 |
+
if (freq_thold > 0.0f) {
|
724 |
+
high_pass_filter(pcmf32, freq_thold, sample_rate);
|
725 |
+
}
|
726 |
+
|
727 |
+
float energy_all = 0.0f;
|
728 |
+
float energy_last = 0.0f;
|
729 |
+
|
730 |
+
for (int i = 0; i < n_samples; i++) {
|
731 |
+
energy_all += fabsf(pcmf32[i]);
|
732 |
+
|
733 |
+
if (i >= n_samples - n_samples_last) {
|
734 |
+
energy_last += fabsf(pcmf32[i]);
|
735 |
+
}
|
736 |
+
}
|
737 |
+
|
738 |
+
energy_all /= n_samples;
|
739 |
+
energy_last /= n_samples_last;
|
740 |
+
|
741 |
+
if (verbose) {
|
742 |
+
fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
|
743 |
+
}
|
744 |
+
|
745 |
+
if (energy_last > vad_thold*energy_all) {
|
746 |
+
return false;
|
747 |
+
}
|
748 |
+
|
749 |
+
return true;
|
750 |
+
}
|
751 |
+
|
752 |
+
// Return a similarity score in [0, 1] between s0 and s1 based on Levenshtein
// edit distance normalized by the longer string's length (1 = identical).
// Uses the standard two-column dynamic-programming formulation in O(len0*len1)
// time and O(len1) space.
float similarity(const std::string & s0, const std::string & s1) {
    const size_t len0 = s0.size() + 1;
    const size_t len1 = s1.size() + 1;

    // two empty strings are identical; also avoids the 0/0 (NaN) that the
    // normalization below would otherwise produce
    if (s0.empty() && s1.empty()) {
        return 1.0f;
    }

    std::vector<int> col(len1, 0);
    std::vector<int> prevCol(len1, 0);

    for (size_t i = 0; i < len1; i++) {
        prevCol[i] = i;
    }

    for (size_t i = 0; i < len0; i++) {
        col[0] = i;
        for (size_t j = 1; j < len1; j++) {
            col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (i > 0 && s0[i - 1] == s1[j - 1] ? 0 : 1));
        }
        col.swap(prevCol);
    }

    const float dist = prevCol[len1 - 1];

    return 1.0f - (dist / std::max(s0.size(), s1.size()));
}
|
775 |
+
|
776 |
+
bool sam_params_parse(int argc, char ** argv, sam_params & params) {
|
777 |
+
for (int i = 1; i < argc; i++) {
|
778 |
+
std::string arg = argv[i];
|
779 |
+
|
780 |
+
if (arg == "-s" || arg == "--seed") {
|
781 |
+
params.seed = std::stoi(argv[++i]);
|
782 |
+
} else if (arg == "-t" || arg == "--threads") {
|
783 |
+
params.n_threads = std::stoi(argv[++i]);
|
784 |
+
} else if (arg == "-m" || arg == "--model") {
|
785 |
+
params.model = argv[++i];
|
786 |
+
} else if (arg == "-i" || arg == "--inp") {
|
787 |
+
params.fname_inp = argv[++i];
|
788 |
+
} else if (arg == "-o" || arg == "--out") {
|
789 |
+
params.fname_out = argv[++i];
|
790 |
+
} else if (arg == "-h" || arg == "--help") {
|
791 |
+
sam_print_usage(argc, argv, params);
|
792 |
+
exit(0);
|
793 |
+
} else {
|
794 |
+
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
795 |
+
sam_print_usage(argc, argv, params);
|
796 |
+
exit(0);
|
797 |
+
}
|
798 |
+
}
|
799 |
+
|
800 |
+
return true;
|
801 |
+
}
|
802 |
+
|
803 |
+
// Print the SAM example's command-line help (with current defaults) to stderr.
void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) {
    fprintf(stderr, "usage: %s [options]\n\n", argv[0]);
    fprintf(stderr,
            "options:\n"
            "  -h, --help            show this help message and exit\n"
            "  -s SEED, --seed SEED  RNG seed (default: -1)\n");
    fprintf(stderr,
            "  -t N, --threads N     number of threads to use during computation (default: %d)\n"
            "  -m FNAME, --model FNAME\n"
            "                        model path (default: %s)\n",
            params.n_threads, params.model.c_str());
    fprintf(stderr,
            "  -i FNAME, --inp FNAME\n"
            "                        input file (default: %s)\n"
            "  -o FNAME, --out FNAME\n"
            "                        output file (default: %s)\n"
            "\n",
            params.fname_inp.c_str(), params.fname_out.c_str());
}
|
stable-diffusion.cpp/ggml/examples/common.h
ADDED
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Various helper functions and utilities

#pragma once

#include <string>
#include <map>
#include <vector>
#include <random>
#include <thread>

#define COMMON_SAMPLE_RATE 16000

//
// GPT CLI argument parsing
//

struct gpt_params {
    int32_t seed         = -1;   // RNG seed
    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
    int32_t n_predict    = 200;  // new tokens to predict
    int32_t n_parallel   = 1;    // number of parallel streams
    int32_t n_batch      = 8;    // batch size for prompt processing
    int32_t n_ctx        = 2048; // context size (this is the KV cache max size)
    int32_t n_gpu_layers = 0;    // number of layers to offload to the GPU

    bool ignore_eos = false; // ignore EOS token when generating text

    // sampling parameters
    int32_t top_k          = 40;
    float   top_p          = 0.9f;
    float   temp           = 0.9f;
    int32_t repeat_last_n  = 64;
    float   repeat_penalty = 1.00f;

    std::string model      = "models/gpt-2-117M/ggml-model.bin"; // model path
    std::string prompt     = "";
    std::string token_test = "";

    bool    interactive      = false;
    int32_t interactive_port = -1;
};

bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

std::string gpt_random_prompt(std::mt19937 & rng);

//
// Vocab utils
//

std::string trim(const std::string & s);

std::string replace(
        const std::string & s,
        const std::string & from,
        const std::string & to);

// Token <-> id mapping plus registered multi-character special tokens.
struct gpt_vocab {
    using id    = int32_t;
    using token = std::string;

    std::map<token, id> token_to_id;
    std::map<id, token> id_to_token;
    std::vector<std::string> special_tokens;

    void add_special_token(const std::string & token);
};

// poor-man's JSON parsing
std::map<std::string, int32_t> json_parse(const std::string & fname);

std::string convert_to_utf8(const std::wstring & input);

std::wstring convert_to_wstring(const std::string & input);

void gpt_split_words(std::string str, std::vector<std::string>& words);

// split text into tokens
//
// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
//
// Regex (Python):
// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
//
// Regex (C++):
// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
//
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);

// test outputs of gpt_tokenize
//
// - compare with tokens generated by the huggingface tokenizer
// - test cases are chosen based on the model's main language (under 'prompt' directory)
// - if all sentences are tokenized identically, print 'All tests passed.'
// - otherwise, print sentence, huggingface tokens, ggml tokens
//
void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test);

// load the tokens from encoder.json
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);

// sample next token given probabilities for each embedding
//
// - consider only the top K tokens
// - from them, consider only the top tokens with cumulative probability > P
//
// TODO: not sure if this implementation is correct
// TODO: temperature is not implemented
//
gpt_vocab::id gpt_sample_top_k_top_p(
        const gpt_vocab & vocab,
        const float * logits,
        int    top_k,
        double top_p,
        double temp,
        std::mt19937 & rng);

gpt_vocab::id gpt_sample_top_k_top_p_repeat(
        const gpt_vocab & vocab,
        const float * logits,
        const int32_t * last_n_tokens_data,
        size_t last_n_tokens_data_size,
        int    top_k,
        double top_p,
        double temp,
        int repeat_last_n,
        float repeat_penalty,
        std::mt19937 & rng);

//
// Audio utils
//

// Read WAV audio file and store the PCM data into pcmf32
// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
bool read_wav(
        const std::string & fname,
        std::vector<float> & pcmf32,
        std::vector<std::vector<float>> & pcmf32s,
        bool stereo);

// Apply a high-pass frequency filter to PCM audio
// Suppresses frequencies below cutoff Hz
void high_pass_filter(
        std::vector<float> & data,
        float cutoff,
        float sample_rate);

// Basic voice activity detection (VAD) using audio energy adaptive threshold
bool vad_simple(
        std::vector<float> & pcmf32,
        int   sample_rate,
        int   last_ms,
        float vad_thold,
        float freq_thold,
        bool  verbose);

// compute similarity between two strings using Levenshtein distance
float similarity(const std::string & s0, const std::string & s1);

//
// SAM argument parsing
//

struct sam_params {
    int32_t seed      = -1; // RNG seed
    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());

    std::string model     = "models/sam-vit-b/ggml-model-f16.bin"; // model path
    std::string fname_inp = "img.jpg";
    std::string fname_out = "img.out";
};

bool sam_params_parse(int argc, char ** argv, sam_params & params);

void sam_print_usage(int argc, char ** argv, const sam_params & params);
|
stable-diffusion.cpp/ggml/examples/dolly-v2/CMakeLists.txt
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#
# dollyv2 — main inference example
#
set(TEST_TARGET dollyv2)
add_executable(${TEST_TARGET} main.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)

#
# dollyv2-quantize — model quantization tool
#
set(TEST_TARGET dollyv2-quantize)
add_executable(${TEST_TARGET} quantize.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
|
stable-diffusion.cpp/ggml/examples/dolly-v2/README.md
ADDED
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Dolly-V2
|
2 |
+
|
3 |
+
Transformer architecture: GPT-NeoX
|
4 |
+
|
5 |
+
Modeled after the examples/stablelm implementation
|
6 |
+
|
7 |
+
Ref: https://github.com/databrickslabs/dolly
|
8 |
+
|
9 |
+
Ref: https://github.com/stability-AI/stableLM/#stablelm-alpha
|
10 |
+
|
11 |
+
## Usage
|
12 |
+
|
13 |
+
```bash
|
14 |
+
# get the repo and build it
|
15 |
+
git clone https://github.com/ggerganov/ggml
|
16 |
+
cd ggml
|
17 |
+
mkdir build && cd build
|
18 |
+
cmake ..
|
19 |
+
make -j
|
20 |
+
|
21 |
+
# get the Dolly-V2 3B model
|
22 |
+
git clone https://huggingface.co/databricks/dolly-v2-3b
|
23 |
+
|
24 |
+
# install Python dependencies
|
25 |
+
python3 -m pip install -r ../requirements.txt
|
26 |
+
|
27 |
+
# convert model to FP16
|
28 |
+
python3 ../examples/dolly-v2/convert-h5-to-ggml.py ./dolly-v2-3b/ 1
|
29 |
+
|
30 |
+
# run inference using FP16 precision
|
31 |
+
./bin/dollyv2 -m ./dolly-v2-3b/ggml-model-f16.bin -p "State the meaning of life." -t 6 -n 64
|
32 |
+
|
33 |
+
main: seed = 1683218142
|
34 |
+
dollyv2_model_load: loading model from './dolly-v2-3b/ggml-model-f16.bin' - please wait ...
|
35 |
+
dollyv2_model_load: n_vocab = 50280
|
36 |
+
dollyv2_model_load: n_ctx = 2048
|
37 |
+
dollyv2_model_load: n_embd = 2560
|
38 |
+
dollyv2_model_load: n_head = 32
|
39 |
+
dollyv2_model_load: n_layer = 32
|
40 |
+
dollyv2_model_load: n_rot = 20
|
41 |
+
dollyv2_model_load: ftype = 1
|
42 |
+
dollyv2_model_load: ggml ctx size = 7374.91 MB
|
43 |
+
dollyv2_model_load: memory_size = 640.00 MB, n_mem = 65536
|
44 |
+
dollyv2_model_load: ................................................ done
|
45 |
+
dollyv2_model_load: model size = 5295.10 MB / num tensors = 388
|
46 |
+
main: number of tokens in prompt = 32
|
47 |
+
main: token[0] = 30003, Below
|
48 |
+
main: token[1] = 310, is
|
49 |
+
main: token[2] = 271, an
|
50 |
+
main: token[3] = 9775, instruction
|
51 |
+
main: token[4] = 326, that
|
52 |
+
main: token[5] = 8631, describes
|
53 |
+
main: token[6] = 247, a
|
54 |
+
main: token[7] = 4836, task
|
55 |
+
main: token[8] = 964, .
|
56 |
+
main: token[9] = 19566, Write
|
57 |
+
main: token[10] = 247, a
|
58 |
+
main: token[11] = 2380, response
|
59 |
+
main: token[12] = 326, that
|
60 |
+
main: token[13] = 20420, appropriately
|
61 |
+
main: token[14] = 29141, completes
|
62 |
+
main: token[15] = 253, the
|
63 |
+
main: token[16] = 2748, request
|
64 |
+
main: token[17] = 964, .
|
65 |
+
main: token[18] = 187,
|
66 |
+
|
67 |
+
main: token[19] = 187,
|
68 |
+
|
69 |
+
main: token[20] = 50278, ### Instruction:
|
70 |
+
main: token[21] = 187,
|
71 |
+
|
72 |
+
main: token[22] = 5443, State
|
73 |
+
main: token[23] = 253, the
|
74 |
+
main: token[24] = 4495, meaning
|
75 |
+
main: token[25] = 273, of
|
76 |
+
main: token[26] = 1495, life
|
77 |
+
main: token[27] = 964, .
|
78 |
+
main: token[28] = 187,
|
79 |
+
|
80 |
+
main: token[29] = 187,
|
81 |
+
|
82 |
+
main: token[30] = 50279, ### Response:
|
83 |
+
main: token[31] = 187,
|
84 |
+
|
85 |
+
|
86 |
+
Below is an instruction that describes a task. Write a response that appropriately completes the request.
|
87 |
+
|
88 |
+
### Instruction:
|
89 |
+
State the meaning of life.
|
90 |
+
|
91 |
+
### Response:
|
92 |
+
The meaning of life is to love and be loved.
|
93 |
+
|
94 |
+
### End
|
95 |
+
|
96 |
+
main: mem per token = 16136720 bytes
|
97 |
+
main: load time = 2202.58 ms
|
98 |
+
main: sample time = 2.57 ms
|
99 |
+
main: predict time = 1497.14 ms / 33.27 ms per token
|
100 |
+
main: total time = 6187.27 ms
|
101 |
+
```
|
102 |
+
|
103 |
+
## 5-bit integer quantization mode
|
104 |
+
|
105 |
+
```bash
|
106 |
+
# quantize the model to 5-bits using Q5_0 quantization
|
107 |
+
./bin/dollyv2-quantize ./dolly-v2-3b/ggml-model-f16.bin ./dolly-v2-3b/ggml-model-q5_0.bin q5_0
|
108 |
+
|
109 |
+
# run the quantized model
|
110 |
+
./bin/dollyv2 -m ./dolly-v2-3b/ggml-model-q5_0.bin -p "State the meaning of life." -t 6 -n 64
|
111 |
+
|
112 |
+
main: seed = 1683218518
|
113 |
+
dollyv2_model_load: loading model from './dolly-v2-3b/ggml-model-q5_0.bin' - please wait ...
|
114 |
+
dollyv2_model_load: n_vocab = 50280
|
115 |
+
dollyv2_model_load: n_ctx = 2048
|
116 |
+
dollyv2_model_load: n_embd = 2560
|
117 |
+
dollyv2_model_load: n_head = 32
|
118 |
+
dollyv2_model_load: n_layer = 32
|
119 |
+
dollyv2_model_load: n_rot = 20
|
120 |
+
dollyv2_model_load: ftype = 8
|
121 |
+
dollyv2_model_load: ggml ctx size = 3902.68 MB
|
122 |
+
dollyv2_model_load: memory_size = 640.00 MB, n_mem = 65536
|
123 |
+
dollyv2_model_load: ................................................ done
|
124 |
+
dollyv2_model_load: model size = 1822.87 MB / num tensors = 388
|
125 |
+
main: number of tokens in prompt = 32
|
126 |
+
main: token[0] = 30003, Below
|
127 |
+
main: token[1] = 310, is
|
128 |
+
main: token[2] = 271, an
|
129 |
+
main: token[3] = 9775, instruction
|
130 |
+
main: token[4] = 326, that
|
131 |
+
main: token[5] = 8631, describes
|
132 |
+
main: token[6] = 247, a
|
133 |
+
main: token[7] = 4836, task
|
134 |
+
main: token[8] = 964, .
|
135 |
+
main: token[9] = 19566, Write
|
136 |
+
main: token[10] = 247, a
|
137 |
+
main: token[11] = 2380, response
|
138 |
+
main: token[12] = 326, that
|
139 |
+
main: token[13] = 20420, appropriately
|
140 |
+
main: token[14] = 29141, completes
|
141 |
+
main: token[15] = 253, the
|
142 |
+
main: token[16] = 2748, request
|
143 |
+
main: token[17] = 964, .
|
144 |
+
main: token[18] = 187,
|
145 |
+
|
146 |
+
main: token[19] = 187,
|
147 |
+
|
148 |
+
main: token[20] = 50278, ### Instruction:
|
149 |
+
main: token[21] = 187,
|
150 |
+
|
151 |
+
main: token[22] = 5443, State
|
152 |
+
main: token[23] = 253, the
|
153 |
+
main: token[24] = 4495, meaning
|
154 |
+
main: token[25] = 273, of
|
155 |
+
main: token[26] = 1495, life
|
156 |
+
main: token[27] = 964, .
|
157 |
+
main: token[28] = 187,
|
158 |
+
|
159 |
+
main: token[29] = 187,
|
160 |
+
|
161 |
+
main: token[30] = 50279, ### Response:
|
162 |
+
main: token[31] = 187,
|
163 |
+
|
164 |
+
|
165 |
+
Below is an instruction that describes a task. Write a response that appropriately completes the request.
|
166 |
+
|
167 |
+
### Instruction:
|
168 |
+
State the meaning of life.
|
169 |
+
|
170 |
+
### Response:
|
171 |
+
The meaning of life is the discovery of the true self.
|
172 |
+
|
173 |
+
### End
|
174 |
+
|
175 |
+
main: mem per token = 16127760 bytes
|
176 |
+
main: load time = 1011.09 ms
|
177 |
+
main: sample time = 2.79 ms
|
178 |
+
main: predict time = 1271.62 ms / 27.64 ms per token
|
179 |
+
main: total time = 2802.51 ms
|
180 |
+
```
|
181 |
+
|
182 |
+
## Notes
|
183 |
+
|
184 |
+
- No guarantees for correctness
|
185 |
+
- The tokenizer is currently hacked - probably works only for English
|
186 |
+
- Non-parallel residual is not supported
|
187 |
+
- Contributions and improvements are welcome
|
stable-diffusion.cpp/ggml/examples/dolly-v2/convert-h5-to-ggml.py
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Convert a Dolly-V2 (GPT-NeoX) Hugging Face checkpoint to the ggml format.

Usage: convert-h5-to-ggml.py dir-model [ftype]
    ftype == 0 -> float32
    ftype == 1 -> float16 (default)

The output file is written next to the model as ggml-model-{f32,f16}.bin.
"""
import sys
import struct
import json
import numpy as np

from transformers import AutoModelForCausalLM, AutoTokenizer

# fix: the ftype argument is optional (the usage text and the
# 'if len(sys.argv) > 2' default-handling below both say so), but the
# original guard required 3 arguments ('< 3'); only dir-model is mandatory
if len(sys.argv) < 2:
    print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
    print("  ftype == 0 -> float32")
    print("  ftype == 1 -> float16")
    sys.exit(1)

# output in the same directory as the model
dir_model = sys.argv[1]
fname_out = sys.argv[1] + "/ggml-model.bin"

with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
    encoder = json.load(f)

with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

# possible data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]

ftype = 1  # default: float16
if len(sys.argv) > 2:
    ftype = int(sys.argv[2])
    if ftype < 0 or ftype > 1:
        print("Invalid ftype: " + str(ftype))
        sys.exit(1)
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"


tokenizer = AutoTokenizer.from_pretrained(dir_model)
model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)

list_vars = model.state_dict()
for name in list_vars.keys():
    print(name, list_vars[name].shape, list_vars[name].dtype)

fout = open(fname_out, "wb")

print(hparams)

# file header: magic number followed by the model hyperparameters
fout.write(struct.pack("i", 0x67676d6c))  # magic: ggml in hex
fout.write(struct.pack("i", hparams["vocab_size"]))
fout.write(struct.pack("i", hparams["max_position_embeddings"]))
fout.write(struct.pack("i", hparams["hidden_size"]))
fout.write(struct.pack("i", hparams["num_attention_heads"]))
fout.write(struct.pack("i", hparams["num_hidden_layers"]))
fout.write(struct.pack("i", int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"]))))
fout.write(struct.pack("i", hparams["use_parallel_residual"]))
fout.write(struct.pack("i", ftype))

# TODO: temporary hack to not deal with implementing the tokenizer
dot_token = tokenizer.encode('.')[0]
for i in range(hparams["vocab_size"]):
    text = tokenizer.decode([dot_token, i]).encode('utf-8')
    # remove the first byte (it's always '.')
    text = text[1:]
    fout.write(struct.pack("i", len(text)))
    fout.write(text)

for name in list_vars.keys():
    data = list_vars[name].squeeze().numpy()
    print("Processing variable: " + name + " with shape: ", data.shape)

    # we don't need these (recomputed at load time)
    if name.endswith(".attention.masked_bias") or \
       name.endswith(".attention.bias") or \
       name.endswith(".attention.rotary_emb.inv_freq"):
        print("  Skipping variable: " + name)
        continue

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    # only 2D ".weight" tensors are stored as f16; everything else stays f32
    ftype_cur = 0
    if ftype != 0:
        if name[-7:] == ".weight" and n_dims == 2:
            print("  Converting to float16")
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    else:
        if data.dtype != np.float32:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0

    # tensor header: n_dims, name length, ftype, dims (innermost first), name
    name_bytes = name.encode('utf-8')  # renamed from 'str' to avoid shadowing the builtin
    fout.write(struct.pack("iii", n_dims, len(name_bytes), ftype_cur))
    for i in range(n_dims):
        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
    fout.write(name_bytes)

    # raw tensor data
    data.tofile(fout)

fout.close()

print("Done. Output file: " + fname_out)
print("")
|
stable-diffusion.cpp/ggml/examples/dolly-v2/main.cpp
ADDED
@@ -0,0 +1,969 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include "ggml/ggml.h"
|
2 |
+
|
3 |
+
#include "common.h"
|
4 |
+
#include "common-ggml.h"
|
5 |
+
|
6 |
+
#include <cassert>
|
7 |
+
#include <cmath>
|
8 |
+
#include <cstdio>
|
9 |
+
#include <cstring>
|
10 |
+
#include <cinttypes>
|
11 |
+
#include <fstream>
|
12 |
+
#include <iostream>
|
13 |
+
#include <map>
|
14 |
+
#include <string>
|
15 |
+
#include <vector>
|
16 |
+
|
17 |
+
#if !defined(_WIN32)
|
18 |
+
#define DOLLY_INTERACTIVE_PORT
|
19 |
+
#endif
|
20 |
+
|
21 |
+
#if defined(DOLLY_INTERACTIVE_PORT)
|
22 |
+
#include <arpa/inet.h>
|
23 |
+
#include <netinet/in.h>
|
24 |
+
#include <sys/socket.h>
|
25 |
+
#include <unistd.h>
|
26 |
+
#endif
|
27 |
+
|
28 |
+
#if defined(_MSC_VER)
|
29 |
+
#pragma warning(disable: 4244 4267) // possible loss of data
|
30 |
+
#endif
|
31 |
+
|
32 |
+
// default hparams (Dolly-V2 3B)
// All values are overwritten from the model file header in dollyv2_model_load;
// the defaults here document the 3B checkpoint's configuration.
struct dollyv2_hparams {
    int32_t n_vocab = 50254; // tokenizer.vocab_size
    int32_t n_ctx   = 2048;  // model.config.max_position_embeddings
    int32_t n_embd  = 2560;  // model.config.hidden_size
    int32_t n_head  = 32;    // model.config.num_attention_heads
    int32_t n_layer = 32;    // model.config.num_hidden_layers
    int32_t n_rot   = 20;    // rotary_pct[25%] * (n_embd / n_head)
    int32_t par_res = 1;     // use_parallel_residual: 1 = true, 0 = false
    int32_t ftype   = GGML_FTYPE_MOSTLY_F16; // weight data type
    float   eps     = 1e-5f; // layer-norm epsilon
};
|
44 |
+
|
45 |
+
// Section markers of the Dolly-V2 instruction-following prompt template.
const std::string INSTRUCTION_KEY = "### Instruction:";
const std::string RESPONSE_KEY    = "### Response:";
const std::string END_KEY         = "### End";
const std::string INTRO_BLURB     = "Below is an instruction that describes a task. Write a response that appropriately completes the request.";

// Wrap a raw user instruction in the Dolly-V2 prompt template: the intro
// blurb, then the instruction section, then an empty response section that
// the model is expected to complete.
std::string prompt_for_generation(const std::string& instruction) {
    std::string prompt = INTRO_BLURB;
    prompt += "\n\n";
    prompt += INSTRUCTION_KEY;
    prompt += "\n";
    prompt += instruction;
    prompt += "\n\n";
    prompt += RESPONSE_KEY;
    prompt += "\n";
    return prompt;
}
|
54 |
+
|
55 |
+
// Weights of one GPT-NeoX transformer block. Members are non-owning
// pointers; the tensors themselves live in the model's ggml_context
// (presumably allocated by dollyv2_model_load — confirm against the loader).
struct dollyv2_layer {
    // pre normalization
    struct ggml_tensor * ln_1_g; // layer-norm gain
    struct ggml_tensor * ln_1_b; // layer-norm bias

    // attention
    struct ggml_tensor * c_attn_attn_w; // combined attention input projection weight
    struct ggml_tensor * c_attn_attn_b; // combined attention input projection bias

    struct ggml_tensor * c_attn_proj_w; // attention output projection weight
    struct ggml_tensor * c_attn_proj_b; // attention output projection bias

    // post normalization
    struct ggml_tensor * ln_2_g; // layer-norm gain
    struct ggml_tensor * ln_2_b; // layer-norm bias

    // ff
    struct ggml_tensor * c_mlp_fc_w;   // feed-forward up-projection weight
    struct ggml_tensor * c_mlp_fc_b;   // feed-forward up-projection bias

    struct ggml_tensor * c_mlp_proj_w; // feed-forward down-projection weight
    struct ggml_tensor * c_mlp_proj_b; // feed-forward down-projection bias
};
|
78 |
+
|
79 |
+
// Complete Dolly-V2 (GPT-NeoX) model: hyperparameters, weights, KV cache,
// and the ggml context that owns all tensor storage.
struct dollyv2_model {
    dollyv2_hparams hparams;

    // final normalization (applied before the language-model head)
    struct ggml_tensor * ln_f_g;
    struct ggml_tensor * ln_f_b;

    struct ggml_tensor * wte; // token embedding table (original comment said "position embedding"; GPT-NeoX uses rotary position embeddings — see hparams.n_rot — so this is the word-token embedding)

    struct ggml_tensor * lmh_g; // language model head
    //struct ggml_tensor * lmh_b; // language model bias

    std::vector<dollyv2_layer> layers; // one entry per transformer block

    // key + value memory (KV cache for incremental decoding)
    struct ggml_tensor * memory_k;
    struct ggml_tensor * memory_v;

    // ggml context holding the tensor data, plus a name -> tensor index
    struct ggml_context * ctx;
    std::map<std::string, struct ggml_tensor *> tensors;
};
|
101 |
+
|
102 |
+
// load the model's weights from a file
|
103 |
+
bool dollyv2_model_load(const std::string & fname, dollyv2_model & model, gpt_vocab & vocab) {
|
104 |
+
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
105 |
+
|
106 |
+
auto fin = std::ifstream(fname, std::ios::binary);
|
107 |
+
if (!fin) {
|
108 |
+
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
|
109 |
+
return false;
|
110 |
+
}
|
111 |
+
|
112 |
+
// verify magic
|
113 |
+
{
|
114 |
+
uint32_t magic;
|
115 |
+
fin.read((char *) &magic, sizeof(magic));
|
116 |
+
if (magic != GGML_FILE_MAGIC) {
|
117 |
+
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
|
118 |
+
return false;
|
119 |
+
}
|
120 |
+
}
|
121 |
+
|
122 |
+
// load hparams
|
123 |
+
{
|
124 |
+
auto & hparams = model.hparams;
|
125 |
+
|
126 |
+
fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
127 |
+
fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
128 |
+
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
129 |
+
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
130 |
+
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
131 |
+
fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
|
132 |
+
fin.read((char *) &hparams.par_res, sizeof(hparams.par_res));
|
133 |
+
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
|
134 |
+
|
135 |
+
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
136 |
+
|
137 |
+
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
138 |
+
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
139 |
+
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
140 |
+
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
141 |
+
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
142 |
+
printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
|
143 |
+
printf("%s: par_res = %d\n", __func__, hparams.par_res);
|
144 |
+
printf("%s: ftype = %d\n", __func__, hparams.ftype);
|
145 |
+
printf("%s: qntvr = %d\n", __func__, qntvr);
|
146 |
+
|
147 |
+
hparams.ftype %= GGML_QNT_VERSION_FACTOR;
|
148 |
+
}
|
149 |
+
|
150 |
+
// load vocab
|
151 |
+
{
|
152 |
+
const int32_t n_vocab = model.hparams.n_vocab;
|
153 |
+
|
154 |
+
std::string word;
|
155 |
+
std::vector<char> buf(128);
|
156 |
+
|
157 |
+
for (int i = 0; i < n_vocab; i++) {
|
158 |
+
uint32_t len;
|
159 |
+
fin.read((char *) &len, sizeof(len));
|
160 |
+
|
161 |
+
buf.resize(len);
|
162 |
+
fin.read((char *) buf.data(), len);
|
163 |
+
word.assign(buf.data(), len);
|
164 |
+
|
165 |
+
vocab.token_to_id[word] = i;
|
166 |
+
vocab.id_to_token[i] = word;
|
167 |
+
}
|
168 |
+
|
169 |
+
vocab.add_special_token("### End");
|
170 |
+
vocab.add_special_token("### Instruction:");
|
171 |
+
vocab.add_special_token("### Response:");
|
172 |
+
}
|
173 |
+
|
174 |
+
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
|
175 |
+
// in order to save memory and also to speed up the computation
|
176 |
+
ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
|
177 |
+
if (wtype == GGML_TYPE_COUNT) {
|
178 |
+
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
|
179 |
+
__func__, fname.c_str(), model.hparams.ftype);
|
180 |
+
return false;
|
181 |
+
}
|
182 |
+
|
183 |
+
auto & ctx = model.ctx;
|
184 |
+
|
185 |
+
size_t ctx_size = 0;
|
186 |
+
|
187 |
+
{
|
188 |
+
const auto & hparams = model.hparams;
|
189 |
+
|
190 |
+
const int n_embd = hparams.n_embd;
|
191 |
+
const int n_layer = hparams.n_layer;
|
192 |
+
const int n_ctx = hparams.n_ctx;
|
193 |
+
const int n_vocab = hparams.n_vocab;
|
194 |
+
|
195 |
+
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
|
196 |
+
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
|
197 |
+
|
198 |
+
ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte
|
199 |
+
|
200 |
+
ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g
|
201 |
+
//ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b
|
202 |
+
|
203 |
+
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
|
204 |
+
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
|
205 |
+
|
206 |
+
ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
|
207 |
+
ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
|
208 |
+
|
209 |
+
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
|
210 |
+
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
|
211 |
+
|
212 |
+
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
|
213 |
+
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
|
214 |
+
|
215 |
+
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
|
216 |
+
ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
|
217 |
+
|
218 |
+
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
|
219 |
+
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
|
220 |
+
|
221 |
+
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
|
222 |
+
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
|
223 |
+
|
224 |
+
ctx_size += (6 + 16*n_layer)*512; // object overhead
|
225 |
+
|
226 |
+
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
227 |
+
}
|
228 |
+
|
229 |
+
// create the ggml context
|
230 |
+
{
|
231 |
+
struct ggml_init_params params = {
|
232 |
+
/*.mem_size =*/ ctx_size,
|
233 |
+
/*.mem_buffer =*/ NULL,
|
234 |
+
/*.no_alloc =*/ false,
|
235 |
+
};
|
236 |
+
|
237 |
+
model.ctx = ggml_init(params);
|
238 |
+
if (!model.ctx) {
|
239 |
+
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
240 |
+
return false;
|
241 |
+
}
|
242 |
+
}
|
243 |
+
|
244 |
+
// prepare memory for the weights
|
245 |
+
{
|
246 |
+
const auto & hparams = model.hparams;
|
247 |
+
|
248 |
+
const int n_embd = hparams.n_embd;
|
249 |
+
const int n_layer = hparams.n_layer;
|
250 |
+
const int n_vocab = hparams.n_vocab;
|
251 |
+
|
252 |
+
model.layers.resize(n_layer);
|
253 |
+
|
254 |
+
model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
255 |
+
|
256 |
+
model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
257 |
+
model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
258 |
+
|
259 |
+
model.lmh_g = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
260 |
+
//model.lmh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab);
|
261 |
+
|
262 |
+
// map by name
|
263 |
+
model.tensors["gpt_neox.embed_in.weight"] = model.wte;
|
264 |
+
|
265 |
+
model.tensors["gpt_neox.final_layer_norm.weight"] = model.ln_f_g;
|
266 |
+
model.tensors["gpt_neox.final_layer_norm.bias"] = model.ln_f_b;
|
267 |
+
|
268 |
+
model.tensors["embed_out.weight"] = model.lmh_g;
|
269 |
+
//model.tensors["lm_head.bias"] = model.lmh_b;
|
270 |
+
|
271 |
+
for (int i = 0; i < n_layer; ++i) {
|
272 |
+
auto & layer = model.layers[i];
|
273 |
+
|
274 |
+
layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
275 |
+
layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
276 |
+
|
277 |
+
layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
|
278 |
+
layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
|
279 |
+
|
280 |
+
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
281 |
+
layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
282 |
+
|
283 |
+
layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
284 |
+
layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
285 |
+
|
286 |
+
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
|
287 |
+
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
|
288 |
+
|
289 |
+
layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
|
290 |
+
layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
291 |
+
|
292 |
+
// map by name
|
293 |
+
|
294 |
+
// unmapped: attention.rotary_emb, mlp.act
|
295 |
+
|
296 |
+
model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.weight"] = layer.ln_1_g;
|
297 |
+
model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.bias"] = layer.ln_1_b;
|
298 |
+
|
299 |
+
model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.weight"] = layer.c_attn_attn_w;
|
300 |
+
model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.bias"] = layer.c_attn_attn_b;
|
301 |
+
|
302 |
+
model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.weight"] = layer.c_attn_proj_w;
|
303 |
+
model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.bias"] = layer.c_attn_proj_b;
|
304 |
+
|
305 |
+
model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.weight"] = layer.ln_2_g;
|
306 |
+
model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.bias"] = layer.ln_2_b;
|
307 |
+
|
308 |
+
model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.weight"] = layer.c_mlp_fc_w;
|
309 |
+
model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.bias"] = layer.c_mlp_fc_b;
|
310 |
+
|
311 |
+
model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_4h_to_h.weight"] = layer.c_mlp_proj_w;
|
312 |
+
model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_4h_to_h.bias"] = layer.c_mlp_proj_b;
|
313 |
+
}
|
314 |
+
}
|
315 |
+
|
316 |
+
// key + value memory
|
317 |
+
{
|
318 |
+
const auto & hparams = model.hparams;
|
319 |
+
|
320 |
+
const int n_embd = hparams.n_embd;
|
321 |
+
const int n_layer = hparams.n_layer;
|
322 |
+
const int n_ctx = hparams.n_ctx;
|
323 |
+
|
324 |
+
const int64_t n_mem = n_layer*n_ctx;
|
325 |
+
const int64_t n_elements = n_embd*n_mem;
|
326 |
+
|
327 |
+
model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
|
328 |
+
model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
|
329 |
+
|
330 |
+
const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
|
331 |
+
|
332 |
+
printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
|
333 |
+
}
|
334 |
+
|
335 |
+
// load weights
|
336 |
+
{
|
337 |
+
int n_tensors = 0;
|
338 |
+
size_t total_size = 0;
|
339 |
+
|
340 |
+
printf("%s: ", __func__);
|
341 |
+
|
342 |
+
while (true) {
|
343 |
+
int32_t n_dims;
|
344 |
+
int32_t length;
|
345 |
+
int32_t ttype;
|
346 |
+
|
347 |
+
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
348 |
+
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
349 |
+
fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
|
350 |
+
|
351 |
+
if (fin.eof()) {
|
352 |
+
break;
|
353 |
+
}
|
354 |
+
|
355 |
+
int32_t nelements = 1;
|
356 |
+
int32_t ne[2] = { 1, 1 };
|
357 |
+
for (int i = 0; i < n_dims; ++i) {
|
358 |
+
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
359 |
+
nelements *= ne[i];
|
360 |
+
}
|
361 |
+
|
362 |
+
std::string name(length, 0);
|
363 |
+
fin.read(&name[0], length);
|
364 |
+
|
365 |
+
if (model.tensors.find(name) == model.tensors.end()) {
|
366 |
+
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
|
367 |
+
return false;
|
368 |
+
}
|
369 |
+
|
370 |
+
auto tensor = model.tensors[name];
|
371 |
+
if (ggml_nelements(tensor) != nelements) {
|
372 |
+
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
|
373 |
+
return false;
|
374 |
+
}
|
375 |
+
|
376 |
+
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
377 |
+
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n",
|
378 |
+
__func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
|
379 |
+
return false;
|
380 |
+
}
|
381 |
+
|
382 |
+
// for debugging
|
383 |
+
if (0) {
|
384 |
+
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
|
385 |
+
}
|
386 |
+
|
387 |
+
const size_t bpe = ggml_type_size(ggml_type(ttype));
|
388 |
+
|
389 |
+
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
|
390 |
+
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
391 |
+
__func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
|
392 |
+
return false;
|
393 |
+
}
|
394 |
+
|
395 |
+
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
|
396 |
+
|
397 |
+
total_size += ggml_nbytes(tensor);
|
398 |
+
if (++n_tensors % 8 == 0) {
|
399 |
+
printf(".");
|
400 |
+
fflush(stdout);
|
401 |
+
}
|
402 |
+
}
|
403 |
+
|
404 |
+
printf(" done\n");
|
405 |
+
|
406 |
+
printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
|
407 |
+
}
|
408 |
+
|
409 |
+
fin.close();
|
410 |
+
|
411 |
+
return true;
|
412 |
+
}
|
413 |
+
|
414 |
+
// feed-forward network
|
415 |
+
ggml_tensor * gpt_neox_ff(
|
416 |
+
const dollyv2_layer & layer,
|
417 |
+
ggml_context * ctx0,
|
418 |
+
ggml_tensor * inp,
|
419 |
+
float eps) {
|
420 |
+
ggml_tensor * cur = ggml_norm(ctx0, inp, eps);
|
421 |
+
|
422 |
+
cur = ggml_add(ctx0,
|
423 |
+
ggml_mul(ctx0,
|
424 |
+
ggml_repeat(ctx0, layer.ln_2_g, cur),
|
425 |
+
cur),
|
426 |
+
ggml_repeat(ctx0, layer.ln_2_b, cur));
|
427 |
+
|
428 |
+
cur = ggml_mul_mat(ctx0,
|
429 |
+
layer.c_mlp_fc_w,
|
430 |
+
cur);
|
431 |
+
|
432 |
+
cur = ggml_add(ctx0,
|
433 |
+
ggml_repeat(ctx0, layer.c_mlp_fc_b, cur),
|
434 |
+
cur);
|
435 |
+
|
436 |
+
// GELU activation
|
437 |
+
cur = ggml_gelu(ctx0, cur);
|
438 |
+
|
439 |
+
// projection
|
440 |
+
// cur = proj_w*cur + proj_b
|
441 |
+
cur = ggml_mul_mat(ctx0,
|
442 |
+
layer.c_mlp_proj_w,
|
443 |
+
cur);
|
444 |
+
|
445 |
+
cur = ggml_add(ctx0,
|
446 |
+
ggml_repeat(ctx0, layer.c_mlp_proj_b, cur),
|
447 |
+
cur);
|
448 |
+
return cur;
|
449 |
+
}
|
450 |
+
|
451 |
+
// evaluate the transformer
//
//   - model:     the model
//   - n_threads: number of threads to use
//   - n_past:    the context size so far
//   - embd_inp:  the embeddings of the tokens in the context
//   - embd_w:    the predicted logits for the next token
//   - mem_per_token: in/out — when 0, measured from this evaluation;
//     when non-zero, used to size the scratch buffer for N tokens
//
// Returns true on success, false if the scratch buffer could not be grown.
bool dollyv2_eval(
        const dollyv2_model & model,
        const int n_threads,
        const int n_past,
        const std::vector<gpt_vocab::id> & embd_inp,
              std::vector<float>         & embd_w,
              size_t                     & mem_per_token) {
    const int N = embd_inp.size();

    const auto & hparams = model.hparams;

    const int n_embd  = hparams.n_embd;
    const int n_layer = hparams.n_layer;
    const int n_ctx   = hparams.n_ctx;
    const int n_head  = hparams.n_head;
    const int n_vocab = hparams.n_vocab;
    const int n_rot   = hparams.n_rot;

    // scratch buffer shared across calls; grown on demand, never freed
    // (intentional: lives for the whole process). NOTE(review): static
    // state makes this function non-reentrant — confirm single-threaded use.
    static size_t buf_size = 256u*1024*1024;
    static void * buf = malloc(buf_size);

    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);

        // reallocate
        // NOTE(review): if realloc fails the old buffer is leaked, but we
        // abort evaluation anyway, so this is benign in practice.
        buf_size = buf_size_new;
        buf = realloc(buf, buf_size);
        if (buf == nullptr) {
            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
            return false;
        }
    }

    struct ggml_init_params params = {
        /*.mem_size   =*/ buf_size,
        /*.mem_buffer =*/ buf,
        /*.no_alloc   =*/ false,
    };

    struct ggml_context * ctx0 = ggml_init(params);
    struct ggml_cgraph gf = { };

    // KQ_pos - contains the positions (n_past .. n_past+N-1) used by RoPE
    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    int * data = (int *) KQ_pos->data;
    for (int i = 0; i < N; ++i) {
        data[i] = n_past + i;
    }

    // token ids for this batch
    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));

    // wte — token embedding lookup
    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);

    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * cur;

        // self-attention
        {
            // input layer norm: cur = ln_1_g*norm(inpL) + ln_1_b
            {
                cur = ggml_norm(ctx0, inpL, hparams.eps);

                cur = ggml_add(ctx0,
                    ggml_mul(ctx0,
                        ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
                        cur),
                    ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
            }

            // compute QKV (fused projection: output is 3*n_embd wide)
            {
                cur = ggml_mul_mat(ctx0,
                    model.layers[il].c_attn_attn_w,
                    cur);

                cur = ggml_add(ctx0,
                    ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
                    cur);
            }

            // slice the fused QKV into per-head Q, K, V views; the byte
            // offsets assume F32 activations (sizeof(float) per element)
            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head));
            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 1*sizeof(float)*n_embd/n_head));
            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head));

            // using mode = 2 for GPT-NeoX mode
            Qcur = ggml_rope_inplace(ctx0, Qcur, KQ_pos, n_rot, 2, 0);
            Kcur = ggml_rope_inplace(ctx0, Kcur, KQ_pos, n_rot, 2, 0);

            // store key and value to memory (KV cache slots for this layer,
            // starting at position n_past)
            {
                Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N));

                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
                struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd,
                        (   n_ctx)*ggml_element_size(model.memory_v),
                        (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v));

                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
            }

            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
            struct ggml_tensor * Q =
                ggml_permute(ctx0,
                        Qcur,
                        0, 2, 1, 3);

            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
            struct ggml_tensor * K =
                ggml_permute(ctx0,
                        ggml_reshape_3d(ctx0,
                            ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
                            n_embd/n_head, n_head, n_past + N),
                        0, 2, 1, 3);

            // K * Q
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            struct ggml_tensor * KQ_scaled =
                ggml_scale_inplace(ctx0,
                        KQ,
                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
                        );

            // KQ_masked = mask_past(KQ_scaled) — causal mask
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);

            // KQ = soft_max(KQ_masked)
            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);

            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
            struct ggml_tensor * V =
                ggml_view_3d(ctx0, model.memory_v,
                        n_past + N, n_embd/n_head, n_head,
                        n_ctx*ggml_element_size(model.memory_v),
                        n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head,
                        il*n_ctx*ggml_element_size(model.memory_v)*n_embd);

            // KQV = transpose(V) * KQ_soft_max
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);

            // cur = KQV_merged.contiguous().view(n_embd, N)
            cur = ggml_cpy(ctx0,
                    KQV_merged,
                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

            // projection (attention output dense layer)
            {
                cur = ggml_mul_mat(ctx0,
                        model.layers[il].c_attn_proj_w,
                        cur);

                cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), cur);
            }
        }

        if (hparams.par_res == 0) {
            // sequential residual: x = x + attn(ln1(x)); x = x + ff(ln2(x))
            struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);

            cur = gpt_neox_ff(model.layers[il], ctx0, inpFF, hparams.eps);

            // input for next layer
            inpL = ggml_add(ctx0, cur, inpFF);
        } else {
            // parallel residual: x = x + attn(ln1(x)) + ff(ln2(x))
            struct ggml_tensor * inpFF = cur;

            // this is independent of the self-attention result, so it could be done in parallel to the self-attention
            // note here we pass inpL instead of cur
            cur = gpt_neox_ff(model.layers[il], ctx0, inpL, hparams.eps);

            // layer input + FF
            cur = ggml_add(ctx0, cur, inpFF);

            // input for next layer
            inpL = ggml_add(ctx0, cur, inpL);
        }

    }

    // norm (final layer norm)
    {
        inpL = ggml_norm(ctx0, inpL, hparams.eps);

        // inpL = ln_f_g*inpL + ln_f_b
        inpL = ggml_add(ctx0,
                ggml_mul(ctx0,
                    ggml_repeat(ctx0, model.ln_f_g, inpL),
                    inpL),
                ggml_repeat(ctx0, model.ln_f_b, inpL));
    }

    // lm_head — project hidden states to vocabulary logits
    {
        inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);

        //inpL = ggml_add(ctx0,
        //        ggml_repeat(ctx0, model.lmh_b, inpL),
        //        inpL);
    }

    // logits -> probs
    //inpL = ggml_soft_max_inplace(ctx0, inpL);

    // run the computation
    ggml_build_forward_expand(&gf, inpL);
    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);

    //if (n_past%100 == 0) {
    //    ggml_graph_print   (&gf);
    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
    //}

    //embd_w.resize(n_vocab*N);
    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);

    // return result for just the last token
    embd_w.resize(n_vocab);
    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);

    if (mem_per_token == 0) {
        mem_per_token = ggml_used_mem(ctx0)/N;
    }
    //printf("used_mem = %zu\n", ggml_used_mem(ctx0));

    ggml_free(ctx0);

    return true;
}
|
693 |
+
|
694 |
+
std::string execute_prompt(
|
695 |
+
const dollyv2_model &model,
|
696 |
+
gpt_vocab &vocab,
|
697 |
+
const std::string &prompt,
|
698 |
+
gpt_params ¶ms,
|
699 |
+
std::mt19937 &rng,
|
700 |
+
int64_t t_load_us,
|
701 |
+
int64_t t_sample_us,
|
702 |
+
int64_t t_predict_us,
|
703 |
+
size_t mem_per_token,
|
704 |
+
int n_past,
|
705 |
+
bool stream_response_to_cout = false) {
|
706 |
+
std::string output = "";
|
707 |
+
std::vector<float> logits;
|
708 |
+
|
709 |
+
// tokenize the prompt
|
710 |
+
std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, prompt);
|
711 |
+
|
712 |
+
params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int)embd_inp.size());
|
713 |
+
|
714 |
+
printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
715 |
+
for (size_t i = 0; i < embd_inp.size(); i++) {
|
716 |
+
printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
|
717 |
+
}
|
718 |
+
printf("\n");
|
719 |
+
|
720 |
+
std::vector<gpt_vocab::id> embd;
|
721 |
+
|
722 |
+
dollyv2_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, mem_per_token);
|
723 |
+
|
724 |
+
const int32_t end_token = vocab.token_to_id["### End"];
|
725 |
+
|
726 |
+
for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
|
727 |
+
// predict
|
728 |
+
if (embd.size() > 0) {
|
729 |
+
const int64_t t_start_us = ggml_time_us();
|
730 |
+
|
731 |
+
if (!dollyv2_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
|
732 |
+
printf("Failed to predict\n");
|
733 |
+
return output;
|
734 |
+
}
|
735 |
+
|
736 |
+
t_predict_us += ggml_time_us() - t_start_us;
|
737 |
+
}
|
738 |
+
|
739 |
+
n_past += embd.size();
|
740 |
+
embd.clear();
|
741 |
+
|
742 |
+
if (i >= embd_inp.size()) {
|
743 |
+
// sample next token
|
744 |
+
const int top_k = params.top_k;
|
745 |
+
const float top_p = params.top_p;
|
746 |
+
const float temp = params.temp;
|
747 |
+
|
748 |
+
const int n_vocab = model.hparams.n_vocab;
|
749 |
+
|
750 |
+
gpt_vocab::id id = 0;
|
751 |
+
|
752 |
+
{
|
753 |
+
const int64_t t_start_sample_us = ggml_time_us();
|
754 |
+
|
755 |
+
id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
|
756 |
+
|
757 |
+
t_sample_us += ggml_time_us() - t_start_sample_us;
|
758 |
+
}
|
759 |
+
|
760 |
+
// add it to the context
|
761 |
+
embd.push_back(id);
|
762 |
+
} else {
|
763 |
+
// if here, it means we are still processing the input prompt
|
764 |
+
for (size_t k = i; k < embd_inp.size(); k++) {
|
765 |
+
embd.push_back(embd_inp[k]);
|
766 |
+
if (int32_t(embd.size()) > params.n_batch) {
|
767 |
+
break;
|
768 |
+
}
|
769 |
+
}
|
770 |
+
i += embd.size() - 1;
|
771 |
+
}
|
772 |
+
|
773 |
+
// display text
|
774 |
+
for (auto id : embd) {
|
775 |
+
output += vocab.id_to_token[id];
|
776 |
+
if (stream_response_to_cout) {
|
777 |
+
printf("%s", vocab.id_to_token[id].c_str());
|
778 |
+
}
|
779 |
+
}
|
780 |
+
if (stream_response_to_cout) {
|
781 |
+
fflush(stdout);
|
782 |
+
}
|
783 |
+
|
784 |
+
// end of text token
|
785 |
+
if (embd.back() == 0 || (end_token > 0 && embd.back() == end_token)) {
|
786 |
+
return output;
|
787 |
+
}
|
788 |
+
}
|
789 |
+
return output;
|
790 |
+
}
|
791 |
+
|
792 |
+
#if defined(DOLLY_INTERACTIVE_PORT)
|
793 |
+
// Create, bind, and listen on a TCP server socket for the given port.
// Returns the listening socket fd, or -1 on failure. Fix: the socket is
// now closed on the bind/listen error paths (previously the descriptor
// was leaked), and SO_REUSEADDR is set so the server can be restarted
// immediately without waiting out TIME_WAIT.
int setup_port(const int port) {
    int sockfd = socket(AF_INET, SOCK_STREAM, 0);
    if (sockfd < 0) {
        fprintf(stderr, "%s: Failed to create new socket\n", __func__);
        return -1;
    }

    // allow fast restart on the same port; failure here is non-fatal
    const int reuse = 1;
    setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse));

    sockaddr_in servaddr;
    std::memset(&servaddr, 0, sizeof(servaddr));

    servaddr.sin_family = AF_INET;
    servaddr.sin_addr.s_addr = htonl(INADDR_ANY);
    servaddr.sin_port = htons(port);

    if (bind(sockfd, (struct sockaddr *)&servaddr, sizeof(servaddr)) < 0) {
        fprintf(stderr, "%s: Failed to bind to port %i\n", __func__, port);
        close(sockfd); // don't leak the descriptor on failure
        return -1;
    }

    if (listen(sockfd, 10) < 0) {
        fprintf(stderr, "%s: Failed to listen to socket on port %i\n", __func__, port);
        close(sockfd); // don't leak the descriptor on failure
        return -1;
    }
    return sockfd;
}
|
818 |
+
|
819 |
+
// Read one message from an accepted client connection and return it as a
// string ("" on error). Fix: read at most sizeof(buffer)-1 bytes and build
// the string from the actual byte count — the original read() could fill
// the entire buffer, leaving it without a NUL terminator, after which the
// std::string(buffer) construction read past the end of the array.
std::string read_from_port(int sockfd, int clientfd) {
    (void) sockfd; // unused; kept for interface compatibility
    if (clientfd < 0) {
        fprintf(stderr, "%s: Failed to accept new connection\n", __func__);
        return "";
    }

    char buffer[4096];
    std::memset(buffer, 0, sizeof(buffer));

    // leave room for the terminating NUL so `buffer` is always a C string
    const ssize_t n = read(clientfd, buffer, sizeof(buffer) - 1);
    if (n < 0) {
        fprintf(stderr, "%s: Failed to read from client\n", __func__);
        return std::string("");
    }

    std::cout << "Received: " << buffer;
    return std::string(buffer, (size_t) n);
}
|
836 |
+
#endif
|
837 |
+
|
838 |
+
int main(int argc, char ** argv) {
|
839 |
+
ggml_time_init();
|
840 |
+
|
841 |
+
const int64_t t_main_start_us = ggml_time_us();
|
842 |
+
|
843 |
+
gpt_params params;
|
844 |
+
params.model = "models/dolly-v2-3b/ggml-model-f16.bin";
|
845 |
+
|
846 |
+
if (gpt_params_parse(argc, argv, params) == false) {
|
847 |
+
return 1;
|
848 |
+
}
|
849 |
+
|
850 |
+
if (params.seed < 0) {
|
851 |
+
params.seed = time(NULL);
|
852 |
+
}
|
853 |
+
|
854 |
+
printf("%s: seed = %d\n", __func__, params.seed);
|
855 |
+
|
856 |
+
std::mt19937 rng(params.seed);
|
857 |
+
|
858 |
+
int64_t t_load_us = 0;
|
859 |
+
int64_t t_sample_us = 0;
|
860 |
+
int64_t t_predict_us = 0;
|
861 |
+
|
862 |
+
// determine the required inference memory per token:
|
863 |
+
size_t mem_per_token = 0;
|
864 |
+
|
865 |
+
int n_past = 0;
|
866 |
+
|
867 |
+
gpt_vocab vocab;
|
868 |
+
dollyv2_model model;
|
869 |
+
|
870 |
+
// load the model
|
871 |
+
{
|
872 |
+
const int64_t t_start_us = ggml_time_us();
|
873 |
+
|
874 |
+
if (!dollyv2_model_load(params.model, model, vocab)) {
|
875 |
+
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
876 |
+
return 1;
|
877 |
+
}
|
878 |
+
|
879 |
+
t_load_us = ggml_time_us() - t_start_us;
|
880 |
+
|
881 |
+
test_gpt_tokenizer(vocab, params.token_test);
|
882 |
+
}
|
883 |
+
|
884 |
+
#if defined(DOLLY_INTERACTIVE_PORT)
|
885 |
+
int sockfd = -1;
|
886 |
+
if (params.interactive_port != -1) {
|
887 |
+
sockfd = setup_port(params.interactive_port);
|
888 |
+
if (sockfd == -1) {
|
889 |
+
return 1;
|
890 |
+
}
|
891 |
+
fprintf(stdout, "Model is ready on port %i\n", params.interactive_port);
|
892 |
+
fflush(stdout);
|
893 |
+
}
|
894 |
+
#endif
|
895 |
+
|
896 |
+
if (params.interactive || params.interactive_port != -1) {
|
897 |
+
while (true) {
|
898 |
+
std::string prompt_input;
|
899 |
+
#if defined(DOLLY_INTERACTIVE_PORT)
|
900 |
+
int clientfd = -1;
|
901 |
+
if (params.interactive_port != -1) {
|
902 |
+
sockaddr_in clientaddr;
|
903 |
+
socklen_t clientaddrlen = sizeof(clientaddr);
|
904 |
+
clientfd = accept(sockfd, (struct sockaddr *)&clientaddr, &clientaddrlen);
|
905 |
+
prompt_input = read_from_port(sockfd, clientfd);
|
906 |
+
} else
|
907 |
+
#endif
|
908 |
+
{
|
909 |
+
printf("Please enter your quesiton:\n>");
|
910 |
+
fflush(stdout);
|
911 |
+
|
912 |
+
std::getline(std::cin, prompt_input);
|
913 |
+
}
|
914 |
+
|
915 |
+
if (strcmp(prompt_input.c_str(), "exit") == 0) {
|
916 |
+
break;
|
917 |
+
}
|
918 |
+
|
919 |
+
const std::string prompt = prompt_for_generation(prompt_input);
|
920 |
+
// call the model
|
921 |
+
const std::string response = execute_prompt(model, vocab, prompt, params, rng, t_load_us, t_sample_us, t_predict_us, mem_per_token, n_past, true);
|
922 |
+
|
923 |
+
#if defined(DOLLY_INTERACTIVE_PORT)
|
924 |
+
if (params.interactive_port != -1) {
|
925 |
+
if (write(clientfd, response.c_str(), response.size()) < 0) {
|
926 |
+
fprintf(stderr, "%s: Failed to write answer '%s' to client\n", __func__, response.c_str());
|
927 |
+
}
|
928 |
+
|
929 |
+
if (close(clientfd) < 0) {
|
930 |
+
fprintf(stderr, "%s: Failed to close client socket\n", __func__);
|
931 |
+
}
|
932 |
+
} else
|
933 |
+
#endif
|
934 |
+
{
|
935 |
+
printf("%s\n\n", response.c_str());
|
936 |
+
}
|
937 |
+
fflush(stdout);
|
938 |
+
}
|
939 |
+
} else {
|
940 |
+
if (params.prompt.empty()) {
|
941 |
+
params.prompt = gpt_random_prompt(rng);
|
942 |
+
}
|
943 |
+
|
944 |
+
const std::string prompt = prompt_for_generation(params.prompt);
|
945 |
+
execute_prompt(model, vocab, prompt, params, rng, t_load_us, t_sample_us, t_predict_us, mem_per_token, n_past, true);
|
946 |
+
}
|
947 |
+
|
948 |
+
// report timing
|
949 |
+
{
|
950 |
+
const int64_t t_main_end_us = ggml_time_us();
|
951 |
+
|
952 |
+
printf("\n\n");
|
953 |
+
printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
|
954 |
+
printf("%s: load time = %8.2f ms\n", __func__, t_load_us / 1000.0f);
|
955 |
+
printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us / 1000.0f);
|
956 |
+
printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us / 1000.0f, t_predict_us / 1000.0f / n_past);
|
957 |
+
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
|
958 |
+
}
|
959 |
+
|
960 |
+
ggml_free(model.ctx);
|
961 |
+
|
962 |
+
#if defined(DOLLY_INTERACTIVE_PORT)
|
963 |
+
if (params.interactive_port != -1 && close(sockfd) < 0) {
|
964 |
+
fprintf(stderr, "%s: Failed to close server socket\n", __func__);
|
965 |
+
}
|
966 |
+
#endif
|
967 |
+
|
968 |
+
return 0;
|
969 |
+
}
|
stable-diffusion.cpp/ggml/examples/dolly-v2/quantize.cpp
ADDED
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include "ggml/ggml.h"
|
2 |
+
|
3 |
+
#include "common.h"
|
4 |
+
#include "common-ggml.h"
|
5 |
+
|
6 |
+
#include <cassert>
|
7 |
+
#include <cmath>
|
8 |
+
#include <cstdio>
|
9 |
+
#include <cstring>
|
10 |
+
#include <fstream>
|
11 |
+
#include <map>
|
12 |
+
#include <string>
|
13 |
+
#include <vector>
|
14 |
+
#include <regex>
|
15 |
+
|
16 |
+
// default hparams (dollyv2 3B)
// Mirrors the fields serialized in the ggml model file header; the
// per-field comments name the corresponding HuggingFace config entry.
struct dollyv2_hparams {
    int32_t n_vocab = 50254; // tokenizer.vocab_size
    int32_t n_ctx   = 2048;  // model.config.max_position_embeddings
    int32_t n_embd  = 2560;  // model.config.hidden_size
    int32_t n_head  = 32;    // model.config.num_attention_heads
    int32_t n_layer = 32;    // model.config.num_hidden_layers
    int32_t n_rot   = 20;    // rotary_pct[25%] * (n_embd / n_head)
    int32_t par_res = 1;     // 1 = true, 0 = false (parallel-residual flag)
    int32_t ftype   = GGML_FTYPE_MOSTLY_F16; // weight format/quantization type
};
|
27 |
+
|
28 |
+
// quantize a model
//
// Streams a dolly-v2 ggml model file from fname_inp to fname_out:
//  - copies the magic and hparams header (rewriting the stored ftype),
//  - copies the vocabulary verbatim,
//  - quantizes all tensors matching the to_quant regexes via
//    ggml_common_quantize_0.
// Returns false (after printing a diagnostic to stderr) on any open/format
// error; returns true on success.
bool dollyv2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
    gpt_vocab vocab;

    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());

    auto finp = std::ifstream(fname_inp, std::ios::binary);
    if (!finp) {
        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
        return false;
    }

    auto fout = std::ofstream(fname_out, std::ios::binary);
    if (!fout) {
        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
        return false;
    }

    // verify magic
    {
        uint32_t magic;
        finp.read((char *) &magic, sizeof(magic));
        if (magic != GGML_FILE_MAGIC) {
            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
            return false;
        }

        // output keeps the same magic
        fout.write((char *) &magic, sizeof(magic));
    }

    dollyv2_hparams hparams;

    // load hparams — the read order must match the field order emitted by the
    // dolly-v2 conversion script; every field is copied through except ftype,
    // which is replaced with the destination type + quantization version
    {
        finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
        finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
        finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
        finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
        finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
        finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
        finp.read((char *) &hparams.par_res, sizeof(hparams.par_res));
        finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));

        // the stored ftype packs both the tensor type and the quantization
        // format version: stored = qntvr * GGML_QNT_VERSION_FACTOR + type
        const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;

        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
        printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
        printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
        printf("%s: n_head = %d\n", __func__, hparams.n_head);
        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
        printf("%s: par_res = %d\n", __func__, hparams.par_res);
        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);

        fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
        fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
        fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
        fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
        fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
        fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
        fout.write((char *) &hparams.par_res, sizeof(hparams.par_res));
        fout.write((char *) &ftype_dst, sizeof(ftype_dst));
    }

    // load vocab — copied through byte-for-byte; the in-memory `vocab` maps
    // are also populated but not used again in this function
    {
        const int32_t n_vocab = hparams.n_vocab;

        std::string word;
        for (int i = 0; i < n_vocab; i++) {
            uint32_t len;
            finp.read ((char *) &len, sizeof(len));
            fout.write((char *) &len, sizeof(len));

            word.resize(len);
            finp.read ((char *) word.data(), len);
            fout.write((char *) word.data(), len);

            vocab.token_to_id[word] = i;
            vocab.id_to_token[i] = word;
        }
    }

    // regexes of tensor names to be quantized — anything ending in "weight";
    // all other tensors (biases, norms) are copied unquantized
    const std::vector<std::string> to_quant = {
        ".*weight",
    };

    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
        fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
        return false;
    }

    finp.close();
    fout.close();

    return true;
}
|
129 |
+
|
130 |
+
// usage:
|
131 |
+
// ./dollyv2-quantize models/dolly-v2-3B/ggml-model.bin models/dolly-v2-3B/ggml-model-quant.bin type
|
132 |
+
//
|
133 |
+
int main(int argc, char ** argv) {
|
134 |
+
if (argc != 4) {
|
135 |
+
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
|
136 |
+
ggml_print_ftypes(stderr);
|
137 |
+
return 1;
|
138 |
+
}
|
139 |
+
|
140 |
+
// needed to initialize f16 tables
|
141 |
+
{
|
142 |
+
struct ggml_init_params params = { 0, NULL, false };
|
143 |
+
struct ggml_context * ctx = ggml_init(params);
|
144 |
+
ggml_free(ctx);
|
145 |
+
}
|
146 |
+
|
147 |
+
const std::string fname_inp = argv[1];
|
148 |
+
const std::string fname_out = argv[2];
|
149 |
+
|
150 |
+
const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
|
151 |
+
|
152 |
+
const int64_t t_main_start_us = ggml_time_us();
|
153 |
+
|
154 |
+
int64_t t_quantize_us = 0;
|
155 |
+
|
156 |
+
// load the model
|
157 |
+
{
|
158 |
+
const int64_t t_start_us = ggml_time_us();
|
159 |
+
|
160 |
+
if (!dollyv2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
|
161 |
+
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
|
162 |
+
return 1;
|
163 |
+
}
|
164 |
+
|
165 |
+
t_quantize_us = ggml_time_us() - t_start_us;
|
166 |
+
}
|
167 |
+
|
168 |
+
// report timing
|
169 |
+
{
|
170 |
+
const int64_t t_main_end_us = ggml_time_us();
|
171 |
+
|
172 |
+
printf("\n");
|
173 |
+
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
|
174 |
+
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
|
175 |
+
}
|
176 |
+
|
177 |
+
return 0;
|
178 |
+
}
|
stable-diffusion.cpp/ggml/examples/dr_wav.h
ADDED
The diff for this file is too large to render.
See raw diff
|
|
stable-diffusion.cpp/ggml/examples/gpt-2/CMakeLists.txt
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#
# gpt-2
#
# Main inference example; links against ggml plus the shared example helpers.

set(TEST_TARGET gpt-2)
add_executable(${TEST_TARGET} main.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)

#
# gpt-2-quantize
#
# Standalone tool that converts an f16/f32 ggml model to a quantized one.

set(TEST_TARGET gpt-2-quantize)
add_executable(${TEST_TARGET} quantize.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)

#
# gpt-2-batched
#
# Variant of the example that generates several sequences in one batch.

set(TEST_TARGET gpt-2-batched)
add_executable(${TEST_TARGET} main-batched.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)


#
# For GPU offloading
#
# These add directory-scope compile definitions so the example sources can
# detect which ggml GPU backend was enabled at configure time.

if (GGML_CUBLAS)
    add_compile_definitions(GGML_USE_CUBLAS)
endif()

if (GGML_CLBLAST)
    add_compile_definitions(GGML_USE_CLBLAST)
endif()

if (GGML_METAL)
    add_compile_definitions(GGML_USE_METAL)
endif()
|
stable-diffusion.cpp/ggml/examples/gpt-2/README.md
ADDED
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# gpt-2
|
2 |
+
|
3 |
+
This is a C++ example running GPT-2 inference using the [ggml](https://github.com/ggerganov/ggml) library.
|
4 |
+
|
5 |
+
The program runs on the CPU - no video card is required.
|
6 |
+
|
7 |
+
The [Cerebras-GPT](https://huggingface.co/cerebras) models are also supported.
|
8 |
+
|
9 |
+
The example supports the following GPT-2 models:
|
10 |
+
|
11 |
+
| Model | Description | Disk Size |
|
12 |
+
| --- | --- | --- |
|
13 |
+
| 117M | Small model | 240 MB |
|
14 |
+
| 345M | Medium model | 680 MB |
|
15 |
+
| 774M | Large model | 1.5 GB |
|
16 |
+
| 1558M | XL model | 3.0 GB |
|
17 |
+
|
18 |
+
Sample performance on MacBook M1 Pro:
|
19 |
+
|
20 |
+
| Model | Size | Time / Token |
|
21 |
+
| --- | --- | --- |
|
22 |
+
| GPT-2 | 117M | 5 ms |
|
23 |
+
| GPT-2 | 345M | 12 ms |
|
24 |
+
| GPT-2 | 774M | 23 ms |
|
25 |
+
| GPT-2 | 1558M | 42 ms |
|
26 |
+
|
27 |
+
*TODO: add tables for Cerebras-GPT models*
|
28 |
+
|
29 |
+
Sample output:
|
30 |
+
|
31 |
+
```
|
32 |
+
$ ./bin/gpt-2 -h
|
33 |
+
usage: ./bin/gpt-2 [options]
|
34 |
+
|
35 |
+
options:
|
36 |
+
-h, --help show this help message and exit
|
37 |
+
-s SEED, --seed SEED RNG seed (default: -1)
|
38 |
+
-t N, --threads N number of threads to use during computation (default: 8)
|
39 |
+
-p PROMPT, --prompt PROMPT
|
40 |
+
prompt to start generation with (default: random)
|
41 |
+
-n N, --n_predict N number of tokens to predict (default: 200)
|
42 |
+
--top_k N top-k sampling (default: 40)
|
43 |
+
--top_p N top-p sampling (default: 0.9)
|
44 |
+
--temp N temperature (default: 1.0)
|
45 |
+
-b N, --batch_size N batch size for prompt processing (default: 8)
|
46 |
+
-m FNAME, --model FNAME
|
47 |
+
model path (default: models/gpt-2-117M/ggml-model.bin)
|
48 |
+
|
49 |
+
$ ./bin/gpt-2
|
50 |
+
gpt2_model_load: loading model from 'models/gpt-2-117M/ggml-model.bin'
|
51 |
+
gpt2_model_load: n_vocab = 50257
|
52 |
+
gpt2_model_load: n_ctx = 1024
|
53 |
+
gpt2_model_load: n_embd = 768
|
54 |
+
gpt2_model_load: n_head = 12
|
55 |
+
gpt2_model_load: n_layer = 12
|
56 |
+
gpt2_model_load: f16 = 1
|
57 |
+
gpt2_model_load: ggml ctx size = 311.12 MB
|
58 |
+
gpt2_model_load: memory size = 72.00 MB, n_mem = 12288
|
59 |
+
gpt2_model_load: model size = 239.08 MB
|
60 |
+
main: number of tokens in prompt = 1
|
61 |
+
|
62 |
+
So this is going to be the end of the line for us.
|
63 |
+
|
64 |
+
If the Dolphins continue to do their business, it's possible that the team could make a bid to bring in new defensive coordinator Scott Linehan.
|
65 |
+
|
66 |
+
Linehan's job is a little daunting, but he's a great coach and an excellent coach. I don't believe we're going to make the playoffs.
|
67 |
+
|
68 |
+
We're going to have to work hard to keep our heads down and get ready to go.<|endoftext|>
|
69 |
+
|
70 |
+
main: mem per token = 2048612 bytes
|
71 |
+
main: load time = 106.32 ms
|
72 |
+
main: sample time = 7.10 ms
|
73 |
+
main: predict time = 506.40 ms / 5.06 ms per token
|
74 |
+
main: total time = 629.84 ms
|
75 |
+
```
|
76 |
+
|
77 |
+
## Downloading and converting the original models (GPT-2)
|
78 |
+
|
79 |
+
You can download the original model files using the [download-model.sh](download-model.sh) Bash script. The models are
|
80 |
+
in Tensorflow format, so in order to use them with ggml, you need to convert them to the appropriate format. This is done
|
81 |
+
via the [convert-ckpt-to-ggml.py](convert-ckpt-to-ggml.py) python script.
|
82 |
+
|
83 |
+
Here is the entire process for the GPT-2 117M model (download from official site + conversion):
|
84 |
+
|
85 |
+
```
|
86 |
+
cd ggml/build
|
87 |
+
../examples/gpt-2/download-model.sh 117M
|
88 |
+
|
89 |
+
Downloading model 117M ...
|
90 |
+
models/gpt-2-117M/checkpoint 100%[=============================>] 77 --.-KB/s in 0s
|
91 |
+
models/gpt-2-117M/encoder.json 100%[=============================>] 1018K 1.20MB/s in 0.8s
|
92 |
+
models/gpt-2-117M/hparams.json 100%[=============================>] 90 --.-KB/s in 0s
|
93 |
+
models/gpt-2-117M/model.ckpt.data-00000-of-00001 100%[=============================>] 474.70M 1.21MB/s in 8m 39s
|
94 |
+
models/gpt-2-117M/model.ckpt.index 100%[=============================>] 5.09K --.-KB/s in 0s
|
95 |
+
models/gpt-2-117M/model.ckpt.meta 100%[=============================>] 460.11K 806KB/s in 0.6s
|
96 |
+
models/gpt-2-117M/vocab.bpe 100%[=============================>] 445.62K 799KB/s in 0.6s
|
97 |
+
Done! Model '117M' saved in 'models/gpt-2-117M/'
|
98 |
+
|
99 |
+
Run the convert-ckpt-to-ggml.py script to convert the model to ggml format.
|
100 |
+
|
101 |
+
python /Users/john/ggml/examples/gpt-2/convert-ckpt-to-ggml.py models/gpt-2-117M/ 1
|
102 |
+
|
103 |
+
```
|
104 |
+
|
105 |
+
This conversion requires that you have python and Tensorflow installed on your computer. Still, if you want to avoid
|
106 |
+
this, you can download the already converted ggml models as described below.
|
107 |
+
|
108 |
+
## Downloading and converting the original models (Cerebras-GPT)
|
109 |
+
|
110 |
+
Clone the respective repository from here: https://huggingface.co/cerebras
|
111 |
+
|
112 |
+
Use the [convert-cerebras-to-ggml.py](convert-cerebras-to-ggml.py) script to convert the model to `ggml` format:
|
113 |
+
|
114 |
+
```
|
115 |
+
cd ggml/build
|
116 |
+
git clone https://huggingface.co/cerebras/Cerebras-GPT-111M models/
|
117 |
+
python ../examples/gpt-2/convert-cerebras-to-ggml.py models/Cerebras-GPT-111M/
|
118 |
+
|
119 |
+
```
|
120 |
+
|
121 |
+
## Downloading the ggml model directly (GPT-2)
|
122 |
+
|
123 |
+
For convenience, I will be hosting the converted ggml model files in order to make it easier to run the examples. This
|
124 |
+
way, you can directly download a single binary file and start using it. No python or Tensorflow is required.
|
125 |
+
|
126 |
+
Here is how to get the 117M ggml model:
|
127 |
+
|
128 |
+
```
|
129 |
+
cd ggml/build
|
130 |
+
../examples/gpt-2/download-ggml-model.sh 117M
|
131 |
+
|
132 |
+
Downloading ggml model 117M ...
|
133 |
+
models/gpt-2-117M/ggml-model.bin 100%[===============================>] 239.58M 8.52MB/s in 28s
|
134 |
+
Done! Model '117M' saved in 'models/gpt-2-117M/ggml-model.bin'
|
135 |
+
You can now use it like this:
|
136 |
+
|
137 |
+
$ ./bin/gpt-2 -m models/gpt-2-117M/ggml-model.bin -p "This is an example"
|
138 |
+
|
139 |
+
```
|
140 |
+
|
141 |
+
At some point, I might decide to stop hosting these models. So in that case, simply revert to the manual process above.
|
142 |
+
|
143 |
+
## Quantizing the models
|
144 |
+
|
145 |
+
You can also try to quantize the `ggml` models via 4-bit integer quantization.
|
146 |
+
Keep in mind that for smaller models, this will render them completely useless.
|
147 |
+
You generally want to quantize larger models.
|
148 |
+
|
149 |
+
```
|
150 |
+
# quantize GPT-2 F16 to Q4_0 (faster but less precise)
|
151 |
+
./bin/gpt-2-quantize models/gpt-2-1558M/ggml-model-f16.bin models/gpt-2-1558M/ggml-model-q4_0.bin 2
|
152 |
+
./bin/gpt-2 -m models/gpt-2-1558M/ggml-model-q4_0.bin -p "This is an example"
|
153 |
+
|
154 |
+
# quantize Cerebras F16 to Q4_1 (slower but more precise)
|
155 |
+
./bin/gpt-2-quantize models/Cerebras-GPT-6.7B/ggml-model-f16.bin models/Cerebras-GPT-6.7B/ggml-model-q4_1.bin 3
|
156 |
+
./bin/gpt-2 -m models/Cerebras-GPT-6.7B/ggml-model-q4_1.bin -p "This is an example"
|
157 |
+
|
158 |
+
```
|
159 |
+
|
160 |
+
## Batched generation example
|
161 |
+
|
162 |
+
You can try the batched generation from a given prompt using the gpt-2-batched binary.
|
163 |
+
|
164 |
+
Sample output:
|
165 |
+
|
166 |
+
```
|
167 |
+
$ gpt-2-batched -np 5 -m models/gpt-2-117M/ggml-model.bin -p "Hello my name is" -n 50
|
168 |
+
|
169 |
+
main: seed = 1697037431
|
170 |
+
gpt2_model_load: loading model from 'models/gpt-2-117M/ggml-model.bin'
|
171 |
+
gpt2_model_load: n_vocab = 50257
|
172 |
+
gpt2_model_load: n_ctx = 1024
|
173 |
+
gpt2_model_load: n_embd = 768
|
174 |
+
gpt2_model_load: n_head = 12
|
175 |
+
gpt2_model_load: n_layer = 12
|
176 |
+
gpt2_model_load: ftype = 1
|
177 |
+
gpt2_model_load: qntvr = 0
|
178 |
+
gpt2_model_load: ggml tensor size = 320 bytes
|
179 |
+
gpt2_model_load: backend buffer size = 312.72 MB
|
180 |
+
ggml_init_cublas: found 1 CUDA devices:
|
181 |
+
Device 0: NVIDIA GeForce GTX 1660, compute capability 7.5
|
182 |
+
gpt2_model_load: using CPU backend
|
183 |
+
gpt2_model_load: memory size = 72.00 MB, n_mem = 12288
|
184 |
+
gpt2_model_load: model size = 239.08 MB
|
185 |
+
extract_tests_from_file : No test file found.
|
186 |
+
test_gpt_tokenizer : 0 tests failed out of 0 tests.
|
187 |
+
main: compute buffer size: 3.26 MB
|
188 |
+
|
189 |
+
|
190 |
+
main: generating 5 sequences ...
|
191 |
+
main: prompt: 'Hello my name is'
|
192 |
+
main: number of tokens in prompt = 4, first 8 tokens: 15496 616 1438 318
|
193 |
+
|
194 |
+
|
195 |
+
sequence 0:
|
196 |
+
|
197 |
+
Hello my name is John. You can call me any way you want, if you want, but for my very first date, I will be on the phone with you. We're both in our early 20s, but I feel like it's all
|
198 |
+
|
199 |
+
sequence 1:
|
200 |
+
|
201 |
+
Hello my name is Robert, and I want to say that we're proud to have your company here on the world's largest platform for sharing your stories with us. This is a huge opportunity for our community. We have hundreds of people on this team and
|
202 |
+
|
203 |
+
sequence 2:
|
204 |
+
|
205 |
+
Hello my name is Jack. I'm the one who created you.
|
206 |
+
|
207 |
+
Jack is a boy with a big smile and a big heart. He is a handsome guy. He loves the outdoors and loves the people he meets. He wants to be a
|
208 |
+
|
209 |
+
sequence 3:
|
210 |
+
|
211 |
+
Hello my name is John. I am a Canadian citizen with a large number of family in Quebec and I am interested in studying. My aim is to take up a post in the Journal of the International Academy of Sciences of Canada which I am currently finishing.
|
212 |
+
|
213 |
+
sequence 4:
|
214 |
+
|
215 |
+
Hello my name is Dan. I am an entrepreneur. I am a great father. I am a great husband. I am a great husband. I am a great dad. And I am a great husband.
|
216 |
+
|
217 |
+
I love my life. I love
|
218 |
+
|
219 |
+
|
220 |
+
|
221 |
+
main: load time = 880.80 ms
|
222 |
+
main: sample time = 91.43 ms
|
223 |
+
main: predict time = 2518.29 ms
|
224 |
+
main: total time = 3544.32 ms
|
225 |
+
```
|
stable-diffusion.cpp/ggml/examples/gpt-2/convert-cerebras-to-ggml.py
ADDED
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Convert Cerebras models to ggml format
|
2 |
+
#
|
3 |
+
# ref: https://www.cerebras.net/blog/cerebras-gpt-a-family-of-open-compute-efficient-large-language-models/
|
4 |
+
#
|
5 |
+
|
6 |
+
import sys
|
7 |
+
import struct
|
8 |
+
import json
|
9 |
+
import torch
|
10 |
+
import numpy as np
|
11 |
+
import re
|
12 |
+
|
13 |
+
from transformers import AutoModelForCausalLM
|
14 |
+
|
15 |
+
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
    """
    Build the reversible byte <-> unicode-character table used by the GPT-2
    BPE tokenizer.

    Printable byte values (the ASCII range '!'..'~' and two Latin-1 ranges)
    map to themselves; every remaining byte value is assigned a fresh code
    point starting at 256. The result is a dict covering all 256 byte values,
    with no entry mapping to whitespace/control characters that the BPE code
    cannot handle.
    """
    printable = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    byte_vals = printable[:]
    char_codes = printable[:]
    extra = 0
    for value in range(2**8):
        if value not in byte_vals:
            byte_vals.append(value)
            char_codes.append(2**8 + extra)
            extra += 1
    return {b: chr(c) for b, c in zip(byte_vals, char_codes)}
|
36 |
+
|
37 |
+
if len(sys.argv) < 2:
    print("Usage: convert-cerebras-to-ggml.py dir-model [use-f32]\n")
    sys.exit(1)

# output in the same directory as the model
dir_model = sys.argv[1]
fname_out = sys.argv[1] + "/ggml-model-f16.bin"

# vocab.json holds the BPE token -> id table; config.json the hyper-parameters
with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
    encoder = json.load(f)

with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

# use 16-bit or 32-bit floats
# NOTE: any extra CLI argument (the usage string says "use-f32") selects f32;
# the argument's value is not inspected
use_f16 = True
if len(sys.argv) > 2:
    use_f16 = False
    fname_out = sys.argv[1] + "/ggml-model-f32.bin"

model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)
#print (model)

list_vars = model.state_dict()
#print (list_vars)

print(hparams)

fout = open(fname_out, "wb")

# file header: magic, hyper-parameters, then the f16 flag
fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
fout.write(struct.pack("i", hparams["vocab_size"]))
fout.write(struct.pack("i", hparams["n_positions"]))
fout.write(struct.pack("i", hparams["n_embd"]))
fout.write(struct.pack("i", hparams["n_head"]))
fout.write(struct.pack("i", hparams["n_layer"]))
fout.write(struct.pack("i", use_f16))

# invert the byte<->unicode table so each vocab entry can be written back as
# its raw utf-8 bytes
byte_encoder = bytes_to_unicode()
byte_decoder = {v:k for k, v in byte_encoder.items()}

fout.write(struct.pack("i", len(encoder)))

# vocab entries: length-prefixed raw bytes, in the json's iteration order
for key in encoder:
    text = bytearray([byte_decoder[c] for c in key])
    fout.write(struct.pack("i", len(text)))
    fout.write(text)
|
84 |
+
|
85 |
+
# Write every tensor from the state dict: rename to the OpenAI GPT-2 layout
# the ggml loader expects, optionally convert to f16, transpose the projection
# matrices, then emit the ggml tensor header followed by the raw data.
for name in list_vars.keys():
    data = list_vars[name].squeeze().numpy()
    print("Processing variable: " + name + " with shape: ", data.shape)

    # rename headers to keep compatibility
    # (regexes are raw strings with escaped dots; the original used plain
    # strings, where "\d" is an invalid escape that warns on Python >= 3.12)
    if name == "transformer.ln_f.weight":
        name = "model/ln_f/g"
    elif name == "transformer.ln_f.bias":
        name = "model/ln_f/b"
    elif name == "transformer.wte.weight":
        name = "model/wte"
    elif name == "transformer.wpe.weight":
        name = "model/wpe"
    elif name == "lm_head.weight":
        name = "model/lm_head"
    elif re.match(r"transformer\.h\.\d+\.ln_1\.weight", name):
        i = re.findall(r"\d+", name)[0]
        name = f"model/h{i}/ln_1/g"
    elif re.match(r"transformer\.h\.\d+\.ln_1\.bias", name):
        i = re.findall(r"\d+", name)[0]
        name = f"model/h{i}/ln_1/b"
    elif re.match(r"transformer\.h\.\d+\.attn\.c_attn\.weight", name):
        i = re.findall(r"\d+", name)[0]
        name = f"model/h{i}/attn/c_attn/w"
    elif re.match(r"transformer\.h\.\d+\.attn\.c_attn\.bias", name):
        i = re.findall(r"\d+", name)[0]
        name = f"model/h{i}/attn/c_attn/b"
    elif re.match(r"transformer\.h\.\d+\.attn\.c_proj\.weight", name):
        i = re.findall(r"\d+", name)[0]
        name = f"model/h{i}/attn/c_proj/w"
    elif re.match(r"transformer\.h\.\d+\.attn\.c_proj\.bias", name):
        i = re.findall(r"\d+", name)[0]
        name = f"model/h{i}/attn/c_proj/b"
    elif re.match(r"transformer\.h\.\d+\.ln_2\.weight", name):
        i = re.findall(r"\d+", name)[0]
        name = f"model/h{i}/ln_2/g"
    elif re.match(r"transformer\.h\.\d+\.ln_2\.bias", name):
        i = re.findall(r"\d+", name)[0]
        name = f"model/h{i}/ln_2/b"
    elif re.match(r"transformer\.h\.\d+\.mlp\.c_fc\.weight", name):
        i = re.findall(r"\d+", name)[0]
        name = f"model/h{i}/mlp/c_fc/w"
    elif re.match(r"transformer\.h\.\d+\.mlp\.c_fc\.bias", name):
        i = re.findall(r"\d+", name)[0]
        name = f"model/h{i}/mlp/c_fc/b"
    elif re.match(r"transformer\.h\.\d+\.mlp\.c_proj\.weight", name):
        i = re.findall(r"\d+", name)[0]
        name = f"model/h{i}/mlp/c_proj/w"
    elif re.match(r"transformer\.h\.\d+\.mlp\.c_proj\.bias", name):
        i = re.findall(r"\d+", name)[0]
        name = f"model/h{i}/mlp/c_proj/b"
    else:
        # fix: the original passed `name` as a second print() argument, so
        # the "%s" placeholder was printed literally and never substituted
        print("Unrecognized variable name. %s" % name)

    # we don't need these (the check runs on the possibly-renamed name; the
    # skipped tensors fall into the "unrecognized" branch above, which leaves
    # the name unchanged, so the suffix test still matches)
    if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"):
        print(" Skipping variable: " + name)
        continue

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype = 0
    if use_f16:
        # only the big 2-D matrices are stored as f16; everything else
        # (biases, norms, 1-D tensors) stays f32
        if (name == "model/wte" or name == "model/lm_head" or name[-2:] == "/g" or name[-2:] == "/w") and n_dims == 2:
            print(" Converting to float16")
            data = data.astype(np.float16)
            ftype = 1
        else:
            print(" Converting to float32")
            data = data.astype(np.float32)
            ftype = 0

    # for efficiency - transpose the projection matrices
    # "model/h.*/attn/c_attn/w"
    # "model/h.*/attn/c_proj/w"
    # "model/h.*/mlp/c_fc/w"
    # "model/h.*/mlp/c_proj/w"
    if name[-14:] == "/attn/c_attn/w" or \
       name[-14:] == "/attn/c_proj/w" or \
       name[-11:] == "/mlp/c_fc/w" or \
       name[-13:] == "/mlp/c_proj/w":
        print(" Transposing")
        data = data.transpose()

    # header: n_dims, name length, ftype, the dims in reverse order, the name
    # (renamed from `str`, which shadowed the builtin)
    name_bytes = name.encode('utf-8')
    fout.write(struct.pack("iii", n_dims, len(name_bytes), ftype))
    for i in range(n_dims):
        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
    fout.write(name_bytes)

    # data
    data.tofile(fout)

fout.close()

print("Done. Output file: " + fname_out)
print("")
|
stable-diffusion.cpp/ggml/examples/gpt-2/convert-ckpt-to-ggml.py
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Convert a model checkpoint to a ggml compatible file
|
2 |
+
#
|
3 |
+
# Load the model using TensorFlow.
|
4 |
+
# Iterate over all variables and write them to a binary file.
|
5 |
+
#
|
6 |
+
# For each variable, write the following:
|
7 |
+
# - Number of dimensions (int)
|
8 |
+
# - Name length (int)
|
9 |
+
# - Dimensions (int[n_dims])
|
10 |
+
# - Name (char[name_length])
|
11 |
+
# - Data (float[n_dims])
|
12 |
+
#
|
13 |
+
# By default, the bigger matrices are converted to 16-bit floats.
|
14 |
+
# This can be disabled by adding the "use-f32" CLI argument.
|
15 |
+
#
|
16 |
+
# At the start of the ggml file we write the model parameters
|
17 |
+
# and vocabulary.
|
18 |
+
#
|
19 |
+
|
20 |
+
import sys
|
21 |
+
import json
|
22 |
+
import struct
|
23 |
+
import numpy as np
|
24 |
+
import tensorflow as tf
|
25 |
+
|
26 |
+
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
    """
    Returns a dict mapping every utf-8 byte value to a unicode string.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    # printable ASCII ('!'..'~') plus two Latin-1 ranges map to themselves
    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    n = 0
    # every remaining byte value gets a fresh code point starting at 256,
    # so all 256 bytes end up with a unique, printable character
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8+n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))
|
47 |
+
|
48 |
+
# helper method to convert a numpy array to different float types
def convert_to_ftype(data, ftype):
    """Convert a numpy array to the dtype selected by ftype.

    Args:
        data:  numpy array to convert.
        ftype: 0 -> float32, 1 -> float16 (matches this script's CLI values).

    Returns:
        The converted numpy array.

    Raises:
        AssertionError: if ftype is not 0 or 1.
    """
    # fp32 — the original asserted here even though the script's usage text
    # documents ftype == 0 as a valid choice
    if ftype == 0:
        return data.astype(np.float32)

    # fp16
    if ftype == 1:
        return data.astype(np.float16)

    assert False, "Invalid ftype: " + str(ftype)
|
55 |
+
|
56 |
+
if len(sys.argv) < 3:
    print("Usage: convert-ckpt-to-ggml.py dir-model ftype\n")
    print(" ftype == 0 -> float32")
    print(" ftype == 1 -> float16")
    sys.exit(1)

# output in the same directory as the model
dir_model = sys.argv[1]
fname_out = sys.argv[1] + "/ggml-model.bin"

with open(dir_model + "/encoder.json", "r", encoding="utf-8") as f:
    encoder = json.load(f)

with open(dir_model + "/hparams.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

# possible data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]

ftype = 1
if len(sys.argv) > 2:
    ftype = int(sys.argv[2])
    if ftype < 0 or ftype > 1:
        print("Invalid ftype: " + str(ftype))
        sys.exit(1)
    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"

list_vars = tf.train.list_variables(dir_model)

fout = open(fname_out, "wb")

# file header: magic number, hyper-parameters, then the chosen ftype
fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
fout.write(struct.pack("i", hparams["n_vocab"]))
fout.write(struct.pack("i", hparams["n_ctx"]))
fout.write(struct.pack("i", hparams["n_embd"]))
fout.write(struct.pack("i", hparams["n_head"]))
fout.write(struct.pack("i", hparams["n_layer"]))
fout.write(struct.pack("i", ftype))

byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

# vocabulary: token count, then (length, raw bytes) for each token
fout.write(struct.pack("i", len(encoder)))

for key in encoder:
    text = bytearray([byte_decoder[c] for c in key])
    fout.write(struct.pack("i", len(text)))
    fout.write(text)

for name, shape in list_vars:
    print("Processing variable: " + name + " with shape: ", shape)

    data = tf.train.load_variable(dir_model, name).squeeze()
    n_dims = len(data.shape)

    # for efficiency - transpose the projection matrices
    # "model/h.*/attn/c_attn/w"
    # "model/h.*/attn/c_proj/w"
    # "model/h.*/mlp/c_fc/w"
    # "model/h.*/mlp/c_proj/w"
    if name[-14:] == "/attn/c_attn/w" or \
       name[-14:] == "/attn/c_proj/w" or \
       name[-11:] == "/mlp/c_fc/w" or \
       name[-13:] == "/mlp/c_proj/w":
        print(" Transposing")
        data = data.transpose()

    dshape = data.shape

    ftype_cur = 0
    if ftype != 0:
        # match name:
        # "model/wte"
        # "model/h.*/attn/c_attn/w"
        # "model/h.*/attn/c_proj/w"
        # "model/h.*/mlp/c_fc/w"
        # "model/h.*/mlp/c_proj/w"
        if name == "model/wte" or name[-2:] == "/w":
            print(" Converting to " + ftype_str[ftype])
            data = convert_to_ftype(data, ftype)
            ftype_cur = ftype
        else:
            print(" Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0

    # per-tensor header: n_dims, name length, ftype, then reversed dims + name
    # (renamed from `str` so the builtin is not shadowed)
    name_bytes = name.encode('utf-8')
    fout.write(struct.pack("iii", n_dims, len(name_bytes), ftype_cur))
    for i in range(n_dims):
        fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
    fout.write(name_bytes)

    # data
    data.tofile(fout)

fout.close()

print("Done. Output file: " + fname_out)
print("")
|
stable-diffusion.cpp/ggml/examples/gpt-2/convert-h5-to-ggml.py
ADDED
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Convert GPT-2 h5 transformer model to ggml format
|
2 |
+
#
|
3 |
+
# Load the model using GPT2Model.
|
4 |
+
# Iterate over all variables and write them to a binary file.
|
5 |
+
#
|
6 |
+
# For each variable, write the following:
|
7 |
+
# - Number of dimensions (int)
|
8 |
+
# - Name length (int)
|
9 |
+
# - Dimensions (int[n_dims])
|
10 |
+
# - Name (char[name_length])
|
11 |
+
# - Data (float[n_dims])
|
12 |
+
#
|
13 |
+
# By default, the bigger matrices are converted to 16-bit floats.
|
14 |
+
# This can be disabled by adding the "use-f32" CLI argument.
|
15 |
+
#
|
16 |
+
# At the start of the ggml file we write the model parameters
|
17 |
+
# and vocabulary.
|
18 |
+
#
|
19 |
+
|
20 |
+
import sys
|
21 |
+
import struct
|
22 |
+
import json
|
23 |
+
import numpy as np
|
24 |
+
import re
|
25 |
+
|
26 |
+
from transformers import GPT2Model
|
27 |
+
|
28 |
+
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
    """Build a reversible byte -> unicode-character lookup table.

    BPE tokenizers operate on unicode strings, so every possible byte value
    (0-255) must map to a printable, non-whitespace character the BPE code
    will not choke on. Printable latin-1 bytes map to themselves; every
    remaining byte is assigned a fresh code point starting at 256.
    """
    printable = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    codepoints = printable[:]
    extra = 0
    for byte in range(2**8):
        if byte not in printable:
            printable.append(byte)
            codepoints.append(2**8 + extra)
            extra += 1
    return dict(zip(printable, (chr(cp) for cp in codepoints)))
|
49 |
+
|
50 |
+
if len(sys.argv) < 2:
    print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
    sys.exit(1)

# output in the same directory as the model
dir_model = sys.argv[1]
fname_out = sys.argv[1] + "/ggml-model.bin"

with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
    encoder = json.load(f)

with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
    encoder_added = json.load(f)

with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

# use 16-bit or 32-bit floats
use_f16 = True
if len(sys.argv) > 2:
    use_f16 = False
    fname_out = sys.argv[1] + "/ggml-model-f32.bin"

model = GPT2Model.from_pretrained(dir_model, low_cpu_mem_usage=True)
#print (model)

list_vars = model.state_dict()
#print (list_vars)

fout = open(fname_out, "wb")

# file header: magic number, hyper-parameters, ftype flag
fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
fout.write(struct.pack("i", hparams["vocab_size"]))
fout.write(struct.pack("i", hparams["n_positions"]))
fout.write(struct.pack("i", hparams["n_embd"]))
fout.write(struct.pack("i", hparams["n_head"]))
fout.write(struct.pack("i", hparams["n_layer"]))
#fout.write(struct.pack("i", hparams["rotary_dim"]))
fout.write(struct.pack("i", use_f16))

byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

# vocabulary: total count, then (length, raw bytes) for each token
fout.write(struct.pack("i", len(encoder) + len(encoder_added)))

for key in encoder:
    text = bytearray([byte_decoder[c] for c in key])
    fout.write(struct.pack("i", len(text)))
    fout.write(text)

for key in encoder_added:
    text = bytearray([byte_decoder[c] for c in key])
    fout.write(struct.pack("i", len(text)))
    fout.write(text)

# rename HF checkpoint tensors to the names expected by the ggml gpt-2
# example. Fixed names first, then per-layer patterns (dots escaped and
# raw strings used — the original left half the patterns unescaped).
_fixed_renames = {
    "ln_f.weight": "model/ln_f/g",
    "ln_f.bias":   "model/ln_f/b",
    "wte.weight":  "model/wte",
    "wpe.weight":  "model/wpe",
}
_layer_renames = [
    (re.compile(r"h\.(\d+)\.ln_1\.weight"),         "model/h{}/ln_1/g"),
    (re.compile(r"h\.(\d+)\.ln_1\.bias"),           "model/h{}/ln_1/b"),
    (re.compile(r"h\.(\d+)\.attn\.c_attn\.weight"), "model/h{}/attn/c_attn/w"),
    (re.compile(r"h\.(\d+)\.attn\.c_attn\.bias"),   "model/h{}/attn/c_attn/b"),
    (re.compile(r"h\.(\d+)\.attn\.c_proj\.weight"), "model/h{}/attn/c_proj/w"),
    (re.compile(r"h\.(\d+)\.attn\.c_proj\.bias"),   "model/h{}/attn/c_proj/b"),
    (re.compile(r"h\.(\d+)\.ln_2\.weight"),         "model/h{}/ln_2/g"),
    (re.compile(r"h\.(\d+)\.ln_2\.bias"),           "model/h{}/ln_2/b"),
    (re.compile(r"h\.(\d+)\.mlp\.c_fc\.weight"),    "model/h{}/mlp/c_fc/w"),
    (re.compile(r"h\.(\d+)\.mlp\.c_fc\.bias"),      "model/h{}/mlp/c_fc/b"),
    (re.compile(r"h\.(\d+)\.mlp\.c_proj\.weight"),  "model/h{}/mlp/c_proj/w"),
    (re.compile(r"h\.(\d+)\.mlp\.c_proj\.bias"),    "model/h{}/mlp/c_proj/b"),
]

for name in list_vars.keys():
    data = list_vars[name].squeeze().numpy()
    print("Processing variable: " + name + " with shape: ", data.shape)

    # we don't need these
    if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"):
        print(" Skipping variable: " + name)
        continue

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype = 0
    if use_f16:
        if name[-7:] == ".weight" and n_dims == 2:
            print(" Converting to float16")
            data = data.astype(np.float16)
            ftype = 1
        else:
            print(" Converting to float32")
            data = data.astype(np.float32)
            ftype = 0

    # for efficiency - transpose these matrices:
    # "transformer.h.*.mlp.c_proj.weight
    if name.endswith(".mlp.c_proj.weight"):
        print(" Transposing")
        data = data.transpose()

    # rename headers to keep compatibility
    if name in _fixed_renames:
        name = _fixed_renames[name]
    else:
        for pattern, template in _layer_renames:
            m = pattern.fullmatch(name)
            if m:
                name = template.format(m.group(1))
                break
        else:
            # fixed: the original used printf-style "%s" with print(),
            # which printed the literal "%s" instead of the name
            print("Unrecognized variable name. " + name)

    # renamed from `str` so the builtin is not shadowed
    name_bytes = name.encode('utf-8')

    fout.write(struct.pack("iii", n_dims, len(name_bytes), ftype))
    for i in range(n_dims):
        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
    fout.write(name_bytes)

    # data
    data.tofile(fout)

fout.close()

print("Done. Output file: " + fname_out)
print("")
|
stable-diffusion.cpp/ggml/examples/gpt-2/download-ggml-model.sh
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash

# This script downloads GPT-2 model files that have already been converted to ggml format.
# This way you don't have to convert them yourself.
#
# If you want to download the original GPT-2 model files, use the "download-model.sh" script instead.

#src="https://ggml.ggerganov.com"
#pfx="ggml-model-gpt-2"

src="https://huggingface.co/ggerganov/ggml"
pfx="resolve/main/ggml-model-gpt-2"

# quoted so paths containing spaces do not word-split
ggml_path=$(dirname "$(realpath "$0")")

# GPT-2 models
models=( "117M" "345M" "774M" "1558M" )

# list available models
function list_models {
    printf "\n"
    printf " Available models:"
    for model in "${models[@]}"; do
        # use %s so the model name is never treated as a format string
        printf ' %s' "$model"
    done
    printf "\n\n"
}
|
28 |
+
|
29 |
+
if [ "$#" -ne 1 ]; then
    printf "Usage: $0 <model>\n"
    list_models

    exit 1
fi

model=$1

if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
    printf "Invalid model: $model\n"
    list_models

    exit 1
fi

# download ggml model

printf "Downloading ggml model $model ...\n"

mkdir -p "models/gpt-2-$model"

# capture the downloader's exit status explicitly instead of relying on
# $? surviving past the fi (fragile if more commands are added later)
status=0
if command -v wget >/dev/null 2>&1; then
    wget --quiet --show-progress -O "models/gpt-2-$model/ggml-model.bin" "$src/$pfx-$model.bin" || status=$?
elif command -v curl >/dev/null 2>&1; then
    curl -L --output "models/gpt-2-$model/ggml-model.bin" "$src/$pfx-$model.bin" || status=$?
else
    printf "Either wget or curl is required to download models.\n"
    exit 1
fi

if [ "$status" -ne 0 ]; then
    printf "Failed to download ggml model $model \n"
    printf "Please try again later or download the original GPT-2 model files and convert them yourself.\n"
    exit 1
fi

printf "Done! Model '$model' saved in 'models/gpt-2-$model/ggml-model.bin'\n"
printf "You can now use it like this:\n\n"
printf " $ ./bin/gpt-2 -m models/gpt-2-$model/ggml-model.bin -p \"This is an example\"\n"
printf "\n"
|
stable-diffusion.cpp/ggml/examples/gpt-2/download-model.sh
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash

# quoted so paths containing spaces do not word-split
ggml_path=$(dirname "$(realpath "$0")")

# GPT-2 models
models=( "117M" "345M" "774M" "1558M" )

# list available models
function list_models {
    printf "\n"
    printf " Available models:"
    for model in "${models[@]}"; do
        # use %s so the model name is never treated as a format string
        printf ' %s' "$model"
    done
    printf "\n\n"
}
|
17 |
+
|
18 |
+
if [ "$#" -ne 1 ]; then
    printf "Usage: $0 <model>\n"
    list_models

    exit 1
fi

model=$1

if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
    printf "Invalid model: $model\n"
    list_models

    exit 1
fi

# download model

printf "Downloading model $model ...\n"

mkdir -p "models/gpt-2-$model"

for file in checkpoint encoder.json hparams.json model.ckpt.data-00000-of-00001 model.ckpt.index model.ckpt.meta vocab.bpe; do
    # fail fast on a broken download instead of silently continuing
    # with a partial model directory
    wget --quiet --show-progress -O "models/gpt-2-$model/$file" "https://openaipublic.blob.core.windows.net/gpt-2/models/$model/$file" || {
        printf "Failed to download %s\n" "$file"
        exit 1
    }
done

printf "Done! Model '$model' saved in 'models/gpt-2-$model/'\n\n"
printf "Run the convert-ckpt-to-ggml.py script to convert the model to ggml format.\n"
printf "\n"
printf " python $ggml_path/convert-ckpt-to-ggml.py models/gpt-2-$model/\n"
printf "\n"
|