SixOpen committed on
Commit
87a3f98
·
1 Parent(s): 349817e
Files changed (6) hide show
  1. Dockerfile +1 -1
  2. Dockerfile.bak +63 -0
  3. app.py +15 -11
  4. app.py.bak +375 -0
  5. start.sh +1 -5
  6. start.sh.bak +5 -0
Dockerfile CHANGED
@@ -60,4 +60,4 @@ ENV PYTHONPATH=${HOME}/app \
60
  PATH=/usr/local/nvidia/bin:${PATH}
61
 
62
 
63
- ENTRYPOINT ["/bin/bash", "-c", "cd llama.cpp && LLAMA_CUDA=1 make -j && cd .. && /bin/sh start.sh"]
 
60
  PATH=/usr/local/nvidia/bin:${PATH}
61
 
62
 
63
+ ENTRYPOINT ["/bin/bash", "-c", "cd llama.cpp && LLAMA_CUDA=1 make -j quantize gguf-split imatrix && cd .. && /bin/sh start.sh"]
Dockerfile.bak ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Image for the GGUF quantization app: CUDA development base, a pyenv-managed
# Python, the app sources and a llama.cpp checkout that is compiled at start-up.
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04

# Keep apt from prompting during the build.
ENV DEBIAN_FRONTEND=noninteractive
# Remove the apt package lists in the same layer so they do not bloat the image.
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install -y --no-install-recommends \
    git \
    git-lfs \
    wget \
    curl \
    # python build dependencies \
    build-essential \
    libssl-dev \
    zlib1g-dev \
    libbz2-dev \
    libreadline-dev \
    libsqlite3-dev \
    libncursesw5-dev \
    xz-utils \
    tk-dev \
    libxml2-dev \
    libxmlsec1-dev \
    libffi-dev \
    liblzma-dev \
    ffmpeg \
    nvidia-driver-515 && \
    rm -rf /var/lib/apt/lists/*

# Everything below runs as an unprivileged user with uid 1000.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:${PATH}
WORKDIR ${HOME}/app

# Install Python through pyenv and the Python packages the app imports.
RUN curl https://pyenv.run | bash
ENV PATH=${HOME}/.pyenv/shims:${HOME}/.pyenv/bin:${PATH}
ARG PYTHON_VERSION=3.10.13
RUN pyenv install ${PYTHON_VERSION} && \
    pyenv global ${PYTHON_VERSION} && \
    pyenv rehash && \
    pip install --no-cache-dir -U pip setuptools wheel && \
    pip install "huggingface-hub" "hf-transfer" "gradio[oauth]>=4.28.0" "gradio_huggingfacehub_search==0.0.7" "APScheduler"

# App sources plus a llama.cpp checkout and its Python requirements.
COPY --chown=1000 . ${HOME}/app
RUN git clone https://github.com/ggerganov/llama.cpp
RUN pip install -r llama.cpp/requirements.txt

# The calibration file is also placed inside llama.cpp, where imatrix is run.
# --chown matches the ownership used for the rest of the app files.
COPY --chown=1000 imatrix_calibration.txt ${HOME}/app/llama.cpp/

ENV PYTHONPATH=${HOME}/app \
    PYTHONUNBUFFERED=1 \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    GRADIO_ALLOW_FLAGGING=never \
    GRADIO_NUM_PORTS=1 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_THEME=huggingface \
    TQDM_POSITION=-1 \
    TQDM_MININTERVAL=1 \
    SYSTEM=spaces \
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH} \
    PATH=/usr/local/nvidia/bin:${PATH}


# llama.cpp is built with CUDA when the container starts, then start.sh runs.
ENTRYPOINT ["/bin/bash", "-c", "cd llama.cpp && LLAMA_CUDA=1 make -j && cd .. && /bin/sh start.sh"]
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  import shutil
3
  import subprocess
 
4
  os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
5
  import gradio as gr
6
 
@@ -18,16 +19,10 @@ from textwrap import dedent
18
  HF_TOKEN = os.environ.get("HF_TOKEN")
19
 
20
  def generate_importance_matrix(model_path, train_data_path):
21
- imatrix_command = f"./imatrix -m ../{model_path} -f {train_data_path} -ngl 99"
22
 
23
  os.chdir("llama.cpp")
24
 
25
- compile_command = "LLAMA_CUDA=1 make -j"
26
- compile_result = subprocess.run(compile_command, shell=True, capture_output=True, text=True)
27
- if compile_result.returncode != 0:
28
- raise Exception(f"Error compiling imatrix: {compile_result.stderr}")
29
-
30
-
31
  print(f"Current working directory: {os.getcwd()}")
32
  print(f"Files in the current directory: {os.listdir('.')}")
33
 
@@ -35,13 +30,22 @@ def generate_importance_matrix(model_path, train_data_path):
35
  raise Exception(f"Model file not found: {model_path}")
36
 
37
  print("Running imatrix command...")
38
- result = subprocess.run(imatrix_command, shell=True, capture_output=True, text=True)
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  os.chdir("..")
41
 
42
- if result.returncode != 0:
43
- raise Exception(f"Error generating importance matrix: {result.stderr}")
44
- print("Importance matrix generated successfully!")
45
 
46
  def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
47
  if oauth_token.token is None:
 
1
  import os
2
  import shutil
3
  import subprocess
4
+ import signal
5
  os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
6
  import gradio as gr
7
 
 
19
  HF_TOKEN = os.environ.get("HF_TOKEN")
20
 
21
  def generate_importance_matrix(model_path, train_data_path):
22
+ imatrix_command = f"./imatrix -m ../{model_path} -f {train_data_path} -ngl 99 --output-frequency 10"
23
 
24
  os.chdir("llama.cpp")
25
 
 
 
 
 
 
 
26
  print(f"Current working directory: {os.getcwd()}")
27
  print(f"Files in the current directory: {os.listdir('.')}")
28
 
 
30
  raise Exception(f"Model file not found: {model_path}")
31
 
32
  print("Running imatrix command...")
33
+ process = subprocess.Popen(imatrix_command, shell=True)
34
+
35
+ try:
36
+ process.wait(timeout=60) # added wait
37
+ except subprocess.TimeoutExpired:
38
+ print("Imatrix computation timed out. Sending SIGINT to allow graceful termination...")
39
+ process.send_signal(signal.SIGINT)
40
+ try:
41
+ process.wait(timeout=5) # grace period
42
+ except subprocess.TimeoutExpired:
43
+ print("Imatrix proc still didn't term. Forecfully terming process...")
44
+ process.kill()
45
 
46
  os.chdir("..")
47
 
48
+ print("Importance matrix generation completed.")
 
 
49
 
50
  def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
51
  if oauth_token.token is None:
app.py.bak ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import subprocess
4
+ os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
5
+ import gradio as gr
6
+
7
+ from huggingface_hub import create_repo, HfApi
8
+ from huggingface_hub import snapshot_download
9
+ from huggingface_hub import whoami
10
+ from huggingface_hub import ModelCard
11
+
12
+ from gradio_huggingfacehub_search import HuggingfaceHubSearch
13
+
14
+ from apscheduler.schedulers.background import BackgroundScheduler
15
+
16
+ from textwrap import dedent
17
+
18
+ HF_TOKEN = os.environ.get("HF_TOKEN")
19
+
20
def generate_importance_matrix(model_path, train_data_path):
    """Compile llama.cpp and run its ``imatrix`` tool on a converted model.

    Both commands are executed from inside the ``llama.cpp`` directory, so
    ``model_path`` is looked up as ``../{model_path}`` and a relative
    ``train_data_path`` is resolved against ``llama.cpp``. The caller's
    working directory is restored before returning, including on failure.

    Args:
        model_path: Path of the GGUF model, relative to the directory that
            contains ``llama.cpp``.
        train_data_path: Calibration text file handed to ``imatrix -f``.

    Raises:
        Exception: If compilation fails, the model file does not exist, or
            ``imatrix`` exits with a non-zero status.
    """
    previous_dir = os.getcwd()
    os.chdir("llama.cpp")
    try:
        # Argument lists (no shell) keep unusual characters in the paths from
        # being interpreted by a shell.
        compile_result = subprocess.run(
            ["make", "-j"],
            env={**os.environ, "LLAMA_CUDA": "1"},
            capture_output=True,
            text=True,
        )
        if compile_result.returncode != 0:
            raise Exception(f"Error compiling imatrix: {compile_result.stderr}")

        print(f"Current working directory: {os.getcwd()}")
        print(f"Files in the current directory: {os.listdir('.')}")

        if not os.path.isfile(f"../{model_path}"):
            raise Exception(f"Model file not found: {model_path}")

        print("Running imatrix command...")
        result = subprocess.run(
            ["./imatrix", "-m", f"../{model_path}", "-f", train_data_path, "-ngl", "99"],
            capture_output=True,
            text=True,
        )
    finally:
        # Always undo the chdir; otherwise one failed run would leave every
        # later call resolving its relative paths from inside llama.cpp.
        os.chdir(previous_dir)

    if result.returncode != 0:
        raise Exception(f"Error generating importance matrix: {result.stderr}")
    print("Importance matrix generated successfully!")
45
+
46
def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
    """Shard a GGUF file with ``gguf-split`` and upload every shard.

    Args:
        model_path: Path of the GGUF file to split.
        repo_id: Hub repository that receives the shards.
        oauth_token: OAuth token of the logged-in user.
        split_max_tensors: Maximum tensors per shard; used only when
            ``split_max_size`` is empty.
        split_max_size: Optional value for ``--split-max-size``.

    Raises:
        ValueError: If no user is logged in.
        Exception: If splitting fails, no shard is produced, or an upload
            fails.
    """
    import re

    if oauth_token is None or oauth_token.token is None:
        raise ValueError("You have to be logged in.")

    # Strip only the final extension so dots inside the model name
    # (e.g. "model-v0.1-q4_k_m.gguf") stay part of the shard prefix.
    shard_prefix = os.path.splitext(model_path)[0]

    # Only one split mode is passed: the size limit when given, otherwise
    # the tensor-count limit.
    split_cmd = ["llama.cpp/gguf-split", "--split"]
    if split_max_size:
        split_cmd += ["--split-max-size", str(split_max_size)]
    else:
        split_cmd += ["--split-max-tensors", str(int(split_max_tensors))]
    split_cmd += [model_path, shard_prefix]

    print(f"Split command: {' '.join(split_cmd)}")

    # An argument list (no shell) keeps the file names from being shell-parsed.
    result = subprocess.run(split_cmd, capture_output=True, text=True)
    print(f"Split command stdout: {result.stdout}")
    print(f"Split command stderr: {result.stderr}")

    if result.returncode != 0:
        raise Exception(f"Error splitting the model: {result.stderr}")
    print("Model split successfully!")

    # Match only "<prefix>-NNNNN-of-NNNNN.gguf" so the unsplit model and other
    # files sharing the prefix are not uploaded as shards.
    shard_dir = os.path.dirname(model_path) or "."
    shard_name = re.compile(re.escape(os.path.basename(shard_prefix)) + r"-\d{5}-of-\d{5}\.gguf")
    sharded_model_files = sorted(f for f in os.listdir(shard_dir) if shard_name.fullmatch(f))
    if not sharded_model_files:
        raise Exception("No sharded files found.")

    print(f"Sharded model files: {sharded_model_files}")
    api = HfApi(token=oauth_token.token)
    for file in sharded_model_files:
        file_path = os.path.join(shard_dir, file)
        print(f"Uploading file: {file_path}")
        try:
            api.upload_file(
                path_or_fileobj=file_path,
                path_in_repo=file,
                repo_id=repo_id,
            )
        except Exception as e:
            raise Exception(f"Error uploading file {file_path}: {e}") from e

    print("Sharded model has been uploaded successfully!")
85
+
86
def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
    """Download a Hub model, convert and quantize it to GGUF, and upload it.

    Steps: download the repo files, convert to an fp16 GGUF, optionally build
    an importance matrix, quantize, create the target repo under the user's
    namespace, write a model card, then upload the quantized file (optionally
    sharded), the importance matrix (imatrix runs only) and the README.

    Args:
        model_id: Hub repo id of the source model.
        q_method: Quantization type used when ``use_imatrix`` is false.
        use_imatrix: Whether to quantize with an importance matrix.
        imatrix_q_method: Quantization type used when ``use_imatrix`` is true.
        private_repo: Whether the created repo is private.
        train_data_file: Optional uploaded calibration file; its ``name``
            attribute is used as the path.
        split_model: Whether to shard the quantized file before uploading.
        split_max_tensors: Forwarded to ``split_upload_model``.
        split_max_size: Forwarded to ``split_upload_model``.
        oauth_token: OAuth token of the logged-in user.

    Returns:
        A ``(message, image_path)`` tuple: a link to the new repo with
        ``"llama.png"`` on success, or the error text with ``"error.png"``.

    Raises:
        ValueError: If no user is logged in.
    """
    if oauth_token is None or oauth_token.token is None:
        raise ValueError("You must be logged in to use GGUF-my-repo")
    model_name = model_id.split('/')[-1]
    fp16 = f"{model_name}.fp16.gguf"
    selected_method = imatrix_q_method if use_imatrix else q_method

    try:
        api = HfApi(token=oauth_token.token)

        dl_pattern = ["*.md", "*.json", "*.model"]

        has_safetensors = any(
            file.path.endswith(".safetensors")
            for file in api.list_repo_tree(
                repo_id=model_id,
                recursive=True,
            )
        )
        # append, not +=: extending a list with a string adds its characters
        # one by one, and the resulting "*" pattern would match every file.
        dl_pattern.append("*.safetensors" if has_safetensors else "*.bin")

        api.snapshot_download(repo_id=model_id, local_dir=model_name, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
        print("Model downloaded successfully!")
        print(f"Current working directory: {os.getcwd()}")
        print(f"Model directory contents: {os.listdir(model_name)}")

        conversion_script = "convert-hf-to-gguf.py"
        # Argument lists (no shell) keep model names from being shell-parsed;
        # text mode makes stderr readable in the error message.
        fp16_conversion = ["python", f"llama.cpp/{conversion_script}", model_name, "--outtype", "f16", "--outfile", fp16]
        result = subprocess.run(fp16_conversion, capture_output=True, text=True, errors="replace")
        print(result)
        if result.returncode != 0:
            raise Exception(f"Error converting to fp16: {result.stderr}")
        print("Model converted to fp16 successfully!")
        print(f"Converted model path: {fp16}")

        imatrix_path = "llama.cpp/imatrix.dat"

        if use_imatrix:
            if train_data_file:
                train_data_path = train_data_file.name
            else:
                train_data_path = "imatrix_calibration.txt"

            print(f"Training data file path: {train_data_path}")

            if not os.path.isfile(train_data_path):
                raise Exception(f"Training data file not found: {train_data_path}")

            generate_importance_matrix(fp16, train_data_path)
        else:
            print("Not using imatrix quantization.")
        username = whoami(oauth_token.token)["name"]
        quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
        quantized_gguf_path = quantized_gguf_name
        if use_imatrix:
            quantise_ggml = ["./llama.cpp/quantize", "--imatrix", imatrix_path, fp16, quantized_gguf_path, imatrix_q_method]
        else:
            quantise_ggml = ["./llama.cpp/quantize", fp16, quantized_gguf_path, q_method]
        result = subprocess.run(quantise_ggml, capture_output=True, text=True, errors="replace")
        if result.returncode != 0:
            raise Exception(f"Error quantizing: {result.stderr}")
        print(f"Quantized successfully with {selected_method} option!")
        print(f"Quantized model path: {quantized_gguf_path}")

        # Create empty repo
        new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{selected_method}-GGUF", exist_ok=True, private=private_repo)
        new_repo_id = new_repo_url.repo_id
        print("Repo created successfully!", new_repo_url)

        # Fall back to an empty card when the source card cannot be loaded.
        try:
            card = ModelCard.load(model_id, token=oauth_token.token)
        except Exception:
            card = ModelCard("")
        if card.data.tags is None:
            card.data.tags = []
        card.data.tags.append("llama-cpp")
        card.data.tags.append("gguf-my-repo")
        card.data.base_model = model_id
        card.text = dedent(
            f"""
            # {new_repo_id}
            This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id}) using llama.cpp via the ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
            Refer to the [original model card](https://huggingface.co/{model_id}) for more details on the model.

            ## Use with llama.cpp
            Install llama.cpp through brew (works on Mac and Linux)

            ```bash
            brew install llama.cpp

            ```
            Invoke the llama.cpp server or the CLI.

            ### CLI:
            ```bash
            llama --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
            ```

            ### Server:
            ```bash
            llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
            ```

            Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo as well.

            Step 1: Clone llama.cpp from GitHub.
            ```
            git clone https://github.com/ggerganov/llama.cpp
            ```

            Step 2: Move into the llama.cpp folder and build it with `LLAMA_CURL=1` flag along with other hardware-specific flags (for ex: LLAMA_CUDA=1 for Nvidia GPUs on Linux).
            ```
            cd llama.cpp && LLAMA_CURL=1 make
            ```

            Step 3: Run inference through the main binary.
            ```
            ./main --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
            ```
            or
            ```
            ./server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
            ```
            """
        )
        card.save("README.md")

        if split_model:
            split_upload_model(quantized_gguf_path, new_repo_id, oauth_token, split_max_tensors, split_max_size)
        else:
            try:
                print(f"Uploading quantized model: {quantized_gguf_path}")
                api.upload_file(
                    path_or_fileobj=quantized_gguf_path,
                    path_in_repo=quantized_gguf_name,
                    repo_id=new_repo_id,
                )
            except Exception as e:
                raise Exception(f"Error uploading quantized model: {e}") from e

        # Upload the matrix only for imatrix runs: a file left behind by an
        # earlier run belongs to a different model and must not be published.
        if use_imatrix and os.path.isfile(imatrix_path):
            try:
                print(f"Uploading imatrix.dat: {imatrix_path}")
                api.upload_file(
                    path_or_fileobj=imatrix_path,
                    path_in_repo="imatrix.dat",
                    repo_id=new_repo_id,
                )
            except Exception as e:
                raise Exception(f"Error uploading imatrix.dat: {e}") from e

        api.upload_file(
            path_or_fileobj="README.md",
            path_in_repo="README.md",
            repo_id=new_repo_id,
        )
        print(f"Uploaded successfully with {selected_method} option!")

        return (
            f'Find your repo <a href=\'{new_repo_url}\' target="_blank" style="text-decoration:underline">here</a>',
            "llama.png",
        )
    except Exception as e:
        # Failures are reported through the UI outputs rather than raised.
        return (f"Error: {e}", "error.png")
    finally:
        shutil.rmtree(model_name, ignore_errors=True)
        print("Folder cleaned up successfully!")
259
+
260
+
261
# Create Gradio interface
# Components are created in the same order as before; only the spelling of the
# calls differs.
with gr.Blocks(css=".gradio-container {max-height: 600px; overflow-y: auto;}") as demo:
    gr.Markdown("You must be logged in to use GGUF-my-repo.")
    gr.LoginButton(min_width=250)

    # Source model picker.
    model_id = HuggingfaceHubSearch(
        label="Hub Model ID",
        placeholder="Search for model id on Huggingface",
        search_type="model",
    )

    # Standard quantization types; hidden while imatrix mode is enabled.
    q_method = gr.Dropdown(
        choices=["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
        label="Quantization Method",
        info="GGML quantization type",
        value="Q4_K_M",
        filterable=False,
        visible=True,
    )

    # Imatrix quantization types; shown only while imatrix mode is enabled.
    imatrix_q_method = gr.Dropdown(
        choices=["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
        label="Imatrix Quantization Method",
        info="GGML imatrix quants type",
        value="IQ4_NL",
        filterable=False,
        visible=False,
    )

    use_imatrix = gr.Checkbox(
        value=False,
        label="Use Imatrix Quantization",
        info="Use importance matrix for quantization.",
    )

    private_repo = gr.Checkbox(
        value=False,
        label="Private Repo",
        info="Create a private repo under your username.",
    )

    # Optional calibration data; shown only while imatrix mode is enabled.
    train_data_file = gr.File(
        label="Training Data File",
        file_types=["txt"],
        visible=False,
    )

    split_model = gr.Checkbox(
        value=False,
        label="Split Model",
        info="Shard the model using gguf-split.",
    )

    # Split options; shown only while "Split Model" is ticked.
    split_max_tensors = gr.Number(
        value=256,
        label="Max Tensors per File",
        info="Maximum number of tensors per file when splitting model.",
        visible=False,
    )

    split_max_size = gr.Textbox(
        label="Max File Size",
        info="Maximum file size when splitting model (--split-max-size). May leave empty to use the default.",
        visible=False,
    )

    def update_visibility(use_imatrix):
        """Return visibility updates for the two dropdowns and the file input."""
        standard_dropdown = gr.update(visible=not use_imatrix)
        imatrix_dropdown = gr.update(visible=use_imatrix)
        calibration_upload = gr.update(visible=use_imatrix)
        return standard_dropdown, imatrix_dropdown, calibration_upload

    use_imatrix.change(
        fn=update_visibility,
        inputs=use_imatrix,
        outputs=[q_method, imatrix_q_method, train_data_file],
    )

    # process_model takes one more parameter (oauth_token) than is listed in
    # `inputs`; presumably Gradio supplies it from the login session - confirm
    # against the Gradio OAuth documentation.
    iface = gr.Interface(
        fn=process_model,
        inputs=[
            model_id,
            q_method,
            use_imatrix,
            imatrix_q_method,
            private_repo,
            train_data_file,
            split_model,
            split_max_tensors,
            split_max_size,
        ],
        outputs=[
            gr.Markdown(label="output"),
            gr.Image(show_label=False),
        ],
        title="Create your own GGUF Quants, blazingly fast ⚡!",
        description="The space takes an HF repo as an input, quantizes it and creates a Public repo containing the selected quant under your HF user namespace.",
        api_name=False,
    )

    def update_split_visibility(split_model):
        """Return visibility updates for the two split-option inputs."""
        max_tensors_field = gr.update(visible=split_model)
        max_size_field = gr.update(visible=split_model)
        return max_tensors_field, max_size_field

    split_model.change(
        fn=update_split_visibility,
        inputs=split_model,
        outputs=[split_max_tensors, split_max_size],
    )
366
+
367
def restart_space():
    """Request a factory reboot of the ``ggml-org/gguf-my-repo`` Space."""
    hub_client = HfApi()
    hub_client.restart_space(
        repo_id="ggml-org/gguf-my-repo",
        token=HF_TOKEN,
        factory_reboot=True,
    )
369
+
370
# Call restart_space every 21600 seconds (6 hours) from a background scheduler.
scheduler = BackgroundScheduler()
scheduler.add_job(func=restart_space, trigger="interval", seconds=21600)
scheduler.start()

# Launch the interface
queued_demo = demo.queue(default_concurrency_limit=1, max_size=5)
queued_demo.launch(debug=True, show_api=False)
start.sh CHANGED
@@ -1,5 +1 @@
1
- cd llama.cpp
2
- make -j quantize gguf-split imatrix
3
-
4
- cd ..
5
- python app.py
 
1
+ python app.py
 
 
 
 
start.sh.bak ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# Build the llama.cpp tools the app invokes, then start the app.
# Stop at the first failing command instead of launching without the binaries.
set -e

# -C builds inside llama.cpp without changing this script's working directory,
# so app.py is always started from the app root.
make -C llama.cpp -j quantize gguf-split imatrix

python app.py