SixOpen committed on
Commit
349817e
·
1 Parent(s): c360795

Imatrix support

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. Dockerfile +11 -5
  3. app.py +117 -25
  4. imatrix_calibration.txt +3 -0
  5. start.sh +2 -1
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  llama.png filter=lfs diff=lfs merge=lfs -text
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  llama.png filter=lfs diff=lfs merge=lfs -text
37
+ imatrix_calibration.txt filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -1,4 +1,5 @@
1
- FROM python:3.9
 
2
  ENV DEBIAN_FRONTEND=noninteractive
3
  RUN apt-get update && \
4
  apt-get upgrade -y && \
@@ -21,8 +22,8 @@ RUN apt-get update && \
21
  libxmlsec1-dev \
22
  libffi-dev \
23
  liblzma-dev \
24
- # gradio dependencies \
25
- ffmpeg
26
 
27
  RUN useradd -m -u 1000 user
28
  USER user
@@ -43,6 +44,8 @@ COPY --chown=1000 . ${HOME}/app
43
  RUN git clone https://github.com/ggerganov/llama.cpp
44
  RUN pip install -r llama.cpp/requirements.txt
45
 
 
 
46
  ENV PYTHONPATH=${HOME}/app \
47
  PYTHONUNBUFFERED=1 \
48
  HF_HUB_ENABLE_HF_TRANSFER=1 \
@@ -52,6 +55,9 @@ ENV PYTHONPATH=${HOME}/app \
52
  GRADIO_THEME=huggingface \
53
  TQDM_POSITION=-1 \
54
  TQDM_MININTERVAL=1 \
55
- SYSTEM=spaces
 
 
 
56
 
57
- ENTRYPOINT /bin/sh start.sh
 
1
+ FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
2
+
3
  ENV DEBIAN_FRONTEND=noninteractive
4
  RUN apt-get update && \
5
  apt-get upgrade -y && \
 
22
  libxmlsec1-dev \
23
  libffi-dev \
24
  liblzma-dev \
25
+ ffmpeg \
26
+ nvidia-driver-515
27
 
28
  RUN useradd -m -u 1000 user
29
  USER user
 
44
  RUN git clone https://github.com/ggerganov/llama.cpp
45
  RUN pip install -r llama.cpp/requirements.txt
46
 
47
+ COPY imatrix_calibration.txt ${HOME}/app/llama.cpp/
48
+
49
  ENV PYTHONPATH=${HOME}/app \
50
  PYTHONUNBUFFERED=1 \
51
  HF_HUB_ENABLE_HF_TRANSFER=1 \
 
55
  GRADIO_THEME=huggingface \
56
  TQDM_POSITION=-1 \
57
  TQDM_MININTERVAL=1 \
58
+ SYSTEM=spaces \
59
+ LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH} \
60
+ PATH=/usr/local/nvidia/bin:${PATH}
61
+
62
 
63
+ ENTRYPOINT ["/bin/bash", "-c", "cd llama.cpp && LLAMA_CUDA=1 make -j && cd .. && /bin/sh start.sh"]
app.py CHANGED
@@ -17,6 +17,32 @@ from textwrap import dedent
17
 
18
  HF_TOKEN = os.environ.get("HF_TOKEN")
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
21
  if oauth_token.token is None:
22
  raise ValueError("You have to be logged in.")
@@ -57,7 +83,7 @@ def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, s
57
 
58
  print("Sharded model has been uploaded successfully!")
59
 
60
- def process_model(model_id, q_method, private_repo, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
61
  if oauth_token.token is None:
62
  raise ValueError("You must be logged in to use GGUF-my-repo")
63
  model_name = model_id.split('/')[-1]
@@ -96,18 +122,37 @@ def process_model(model_id, q_method, private_repo, split_model, split_max_tenso
96
  print("Model converted to fp16 successfully!")
97
  print(f"Converted model path: {fp16}")
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  username = whoami(oauth_token.token)["name"]
100
- quantized_gguf_name = f"{model_name.lower()}-{q_method.lower()}.gguf"
101
  quantized_gguf_path = quantized_gguf_name
102
- quantise_ggml = f"./llama.cpp/quantize {fp16} {quantized_gguf_path} {q_method}"
 
 
 
103
  result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
104
  if result.returncode != 0:
105
  raise Exception(f"Error quantizing: {result.stderr}")
106
- print(f"Quantized successfully with {q_method} option!")
107
  print(f"Quantized model path: {quantized_gguf_path}")
108
 
109
  # Create empty repo
110
- new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{q_method}-GGUF", exist_ok=True, private=private_repo)
111
  new_repo_id = new_repo_url.repo_id
112
  print("Repo created successfully!", new_repo_url)
113
 
@@ -181,13 +226,26 @@ def process_model(model_id, q_method, private_repo, split_model, split_max_tenso
181
  )
182
  except Exception as e:
183
  raise Exception(f"Error uploading quantized model: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
184
 
185
  api.upload_file(
186
  path_or_fileobj=f"README.md",
187
  path_in_repo=f"README.md",
188
  repo_id=new_repo_id,
189
  )
190
- print(f"Uploaded successfully with {q_method} option!")
191
 
192
  return (
193
  f'Find your repo <a href=\'{new_repo_url}\' target="_blank" style="text-decoration:underline">here</a>',
@@ -201,58 +259,92 @@ def process_model(model_id, q_method, private_repo, split_model, split_max_tenso
201
 
202
 
203
  # Create Gradio interface
204
- with gr.Blocks() as demo:
205
  gr.Markdown("You must be logged in to use GGUF-my-repo.")
206
  gr.LoginButton(min_width=250)
207
 
208
- model_id_input = HuggingfaceHubSearch(
209
  label="Hub Model ID",
210
  placeholder="Search for model id on Huggingface",
211
  search_type="model",
212
  )
213
 
214
- q_method_input = gr.Dropdown(
215
  ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
216
  label="Quantization Method",
217
  info="GGML quantization type",
218
  value="Q4_K_M",
219
- filterable=False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  )
221
 
222
- private_repo_input = gr.Checkbox(
223
  value=False,
224
  label="Private Repo",
225
  info="Create a private repo under your username."
226
  )
227
 
228
- split_model_input = gr.Checkbox(
 
 
 
 
 
 
229
  value=False,
230
  label="Split Model",
231
  info="Shard the model using gguf-split."
232
  )
233
 
234
- split_max_tensors_input = gr.Number(
235
  value=256,
236
  label="Max Tensors per File",
237
  info="Maximum number of tensors per file when splitting model.",
238
  visible=False
239
  )
240
 
241
- split_max_size_input = gr.Textbox(
242
  label="Max File Size",
243
  info="Maximum file size when splitting model (--split-max-size). May leave empty to use the default.",
244
  visible=False
245
  )
246
 
 
 
 
 
 
 
 
 
 
247
  iface = gr.Interface(
248
  fn=process_model,
249
  inputs=[
250
- model_id_input,
251
- q_method_input,
252
- private_repo_input,
253
- split_model_input,
254
- split_max_tensors_input,
255
- split_max_size_input,
 
 
 
256
  ],
257
  outputs=[
258
  gr.Markdown(label="output"),
@@ -263,13 +355,13 @@ with gr.Blocks() as demo:
263
  api_name=False
264
  )
265
 
266
- def update_visibility(split_model):
267
  return gr.update(visible=split_model), gr.update(visible=split_model)
268
 
269
- split_model_input.change(
270
- fn=update_visibility,
271
- inputs=split_model_input,
272
- outputs=[split_max_tensors_input, split_max_size_input]
273
  )
274
 
275
  def restart_space():
 
17
 
18
  HF_TOKEN = os.environ.get("HF_TOKEN")
19
 
20
+ def generate_importance_matrix(model_path, train_data_path):
21
+ imatrix_command = f"./imatrix -m ../{model_path} -f {train_data_path} -ngl 99"
22
+
23
+ os.chdir("llama.cpp")
24
+
25
+ compile_command = "LLAMA_CUDA=1 make -j"
26
+ compile_result = subprocess.run(compile_command, shell=True, capture_output=True, text=True)
27
+ if compile_result.returncode != 0:
28
+ raise Exception(f"Error compiling imatrix: {compile_result.stderr}")
29
+
30
+
31
+ print(f"Current working directory: {os.getcwd()}")
32
+ print(f"Files in the current directory: {os.listdir('.')}")
33
+
34
+ if not os.path.isfile(f"../{model_path}"):
35
+ raise Exception(f"Model file not found: {model_path}")
36
+
37
+ print("Running imatrix command...")
38
+ result = subprocess.run(imatrix_command, shell=True, capture_output=True, text=True)
39
+
40
+ os.chdir("..")
41
+
42
+ if result.returncode != 0:
43
+ raise Exception(f"Error generating importance matrix: {result.stderr}")
44
+ print("Importance matrix generated successfully!")
45
+
46
  def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
47
  if oauth_token.token is None:
48
  raise ValueError("You have to be logged in.")
 
83
 
84
  print("Sharded model has been uploaded successfully!")
85
 
86
+ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
87
  if oauth_token.token is None:
88
  raise ValueError("You must be logged in to use GGUF-my-repo")
89
  model_name = model_id.split('/')[-1]
 
122
  print("Model converted to fp16 successfully!")
123
  print(f"Converted model path: {fp16}")
124
 
125
+ imatrix_path = "llama.cpp/imatrix.dat"
126
+
127
+ if use_imatrix:
128
+ if train_data_file:
129
+ train_data_path = train_data_file.name
130
+ else:
131
+ train_data_path = "imatrix_calibration.txt"
132
+
133
+ print(f"Training data file path: {train_data_path}")
134
+
135
+ if not os.path.isfile(train_data_path):
136
+ raise Exception(f"Training data file not found: {train_data_path}")
137
+
138
+ generate_importance_matrix(fp16, train_data_path)
139
+ else:
140
+ print("Not using imatrix quantization.")
141
  username = whoami(oauth_token.token)["name"]
142
+ quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
143
  quantized_gguf_path = quantized_gguf_name
144
+ if use_imatrix:
145
+ quantise_ggml = f"./llama.cpp/quantize --imatrix {imatrix_path} {fp16} {quantized_gguf_path} {imatrix_q_method}"
146
+ else:
147
+ quantise_ggml = f"./llama.cpp/quantize {fp16} {quantized_gguf_path} {q_method}"
148
  result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
149
  if result.returncode != 0:
150
  raise Exception(f"Error quantizing: {result.stderr}")
151
+ print(f"Quantized successfully with {imatrix_q_method if use_imatrix else q_method} option!")
152
  print(f"Quantized model path: {quantized_gguf_path}")
153
 
154
  # Create empty repo
155
+ new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{imatrix_q_method if use_imatrix else q_method}-GGUF", exist_ok=True, private=private_repo)
156
  new_repo_id = new_repo_url.repo_id
157
  print("Repo created successfully!", new_repo_url)
158
 
 
226
  )
227
  except Exception as e:
228
  raise Exception(f"Error uploading quantized model: {e}")
229
+
230
+
231
+ imatrix_path = "llama.cpp/imatrix.dat"
232
+ if os.path.isfile(imatrix_path):
233
+ try:
234
+ print(f"Uploading imatrix.dat: {imatrix_path}")
235
+ api.upload_file(
236
+ path_or_fileobj=imatrix_path,
237
+ path_in_repo="imatrix.dat",
238
+ repo_id=new_repo_id,
239
+ )
240
+ except Exception as e:
241
+ raise Exception(f"Error uploading imatrix.dat: {e}")
242
 
243
  api.upload_file(
244
  path_or_fileobj=f"README.md",
245
  path_in_repo=f"README.md",
246
  repo_id=new_repo_id,
247
  )
248
+ print(f"Uploaded successfully with {imatrix_q_method if use_imatrix else q_method} option!")
249
 
250
  return (
251
  f'Find your repo <a href=\'{new_repo_url}\' target="_blank" style="text-decoration:underline">here</a>',
 
259
 
260
 
261
  # Create Gradio interface
262
+ with gr.Blocks(css=".gradio-container {max-height: 600px; overflow-y: auto;}") as demo:
263
  gr.Markdown("You must be logged in to use GGUF-my-repo.")
264
  gr.LoginButton(min_width=250)
265
 
266
+ model_id = HuggingfaceHubSearch(
267
  label="Hub Model ID",
268
  placeholder="Search for model id on Huggingface",
269
  search_type="model",
270
  )
271
 
272
+ q_method = gr.Dropdown(
273
  ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
274
  label="Quantization Method",
275
  info="GGML quantization type",
276
  value="Q4_K_M",
277
+ filterable=False,
278
+ visible=True
279
+ )
280
+
281
+ imatrix_q_method = gr.Dropdown(
282
+ ["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
283
+ label="Imatrix Quantization Method",
284
+ info="GGML imatrix quants type",
285
+ value="IQ4_NL",
286
+ filterable=False,
287
+ visible=False
288
+ )
289
+
290
+ use_imatrix = gr.Checkbox(
291
+ value=False,
292
+ label="Use Imatrix Quantization",
293
+ info="Use importance matrix for quantization."
294
  )
295
 
296
+ private_repo = gr.Checkbox(
297
  value=False,
298
  label="Private Repo",
299
  info="Create a private repo under your username."
300
  )
301
 
302
+ train_data_file = gr.File(
303
+ label="Training Data File",
304
+ file_types=["txt"],
305
+ visible=False
306
+ )
307
+
308
+ split_model = gr.Checkbox(
309
  value=False,
310
  label="Split Model",
311
  info="Shard the model using gguf-split."
312
  )
313
 
314
+ split_max_tensors = gr.Number(
315
  value=256,
316
  label="Max Tensors per File",
317
  info="Maximum number of tensors per file when splitting model.",
318
  visible=False
319
  )
320
 
321
+ split_max_size = gr.Textbox(
322
  label="Max File Size",
323
  info="Maximum file size when splitting model (--split-max-size). May leave empty to use the default.",
324
  visible=False
325
  )
326
 
327
+ def update_visibility(use_imatrix):
328
+ return gr.update(visible=not use_imatrix), gr.update(visible=use_imatrix), gr.update(visible=use_imatrix)
329
+
330
+ use_imatrix.change(
331
+ fn=update_visibility,
332
+ inputs=use_imatrix,
333
+ outputs=[q_method, imatrix_q_method, train_data_file]
334
+ )
335
+
336
  iface = gr.Interface(
337
  fn=process_model,
338
  inputs=[
339
+ model_id,
340
+ q_method,
341
+ use_imatrix,
342
+ imatrix_q_method,
343
+ private_repo,
344
+ train_data_file,
345
+ split_model,
346
+ split_max_tensors,
347
+ split_max_size,
348
  ],
349
  outputs=[
350
  gr.Markdown(label="output"),
 
355
  api_name=False
356
  )
357
 
358
+ def update_split_visibility(split_model):
359
  return gr.update(visible=split_model), gr.update(visible=split_model)
360
 
361
+ split_model.change(
362
+ fn=update_split_visibility,
363
+ inputs=split_model,
364
+ outputs=[split_max_tensors, split_max_size]
365
  )
366
 
367
  def restart_space():
imatrix_calibration.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52062b7643edddbbc83435331ed1bc6ffc3eb463fae9df3551df52fb5638f0e8
3
+ size 201119
start.sh CHANGED
@@ -1,4 +1,5 @@
1
  cd llama.cpp
2
- make -j quantize gguf-split
 
3
  cd ..
4
  python app.py
 
1
  cd llama.cpp
2
+ make -j quantize gguf-split imatrix
3
+
4
  cd ..
5
  python app.py