SixOpen committed on
Commit 65be081 (1 parent: 72cbfdf)

Update app.py

Files changed (1)
  1. app.py +98 -15
app.py CHANGED
@@ -28,6 +28,30 @@ def script_to_use(model_id, api):
     arch = arch[0]
     return "convert.py" if arch in LLAMA_LIKE_ARCHS else "convert-hf-to-gguf.py"
 
+def generate_importance_matrix(model_path, train_data_path):
+    imatrix_command = f"./imatrix -m ../{model_path} -f {train_data_path} -ngl 0" #No GPU on the basic spaces unlike main, it works regardless but takes >2 hours
+
+    os.chdir("llama.cpp")
+
+    compile_command = "make"
+    compile_result = subprocess.run(compile_command, shell=True, capture_output=True, text=True)
+    if compile_result.returncode != 0:
+        raise Exception(f"Error compiling imatrix: {compile_result.stderr}")
+
+    print(f"Current working directory: {os.getcwd()}")
+    print(f"Files in the current directory: {os.listdir('.')}")
+
+    if not os.path.isfile(f"../{model_path}"):
+        raise Exception(f"Model file not found: {model_path}")
+
+    result = subprocess.run(imatrix_command, shell=True, capture_output=True, text=True)
+
+    os.chdir("..")
+
+    if result.returncode != 0:
+        raise Exception(f"Error generating importance matrix: {result.stderr}")
+    print("Importance matrix generated successfully!")
+
 def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
     if oauth_token.token is None:
         raise ValueError("You have to be logged in.")
@@ -68,11 +92,11 @@ def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, s
 
     print("Sharded model has been uploaded successfully!")
 
-def process_model(model_id, q_method, private_repo, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
+def process_model(model_id, q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
     if oauth_token.token is None:
         raise ValueError("You must be logged in to use GGUF-my-repo")
     model_name = model_id.split('/')[-1]
-    fp16 = f"{model_name}.fp16.gguf"
+    fp16 = f"llama.cpp/{model_name}.fp16.gguf"
 
     try:
         api = HfApi(token=oauth_token.token)
@@ -107,18 +131,60 @@ def process_model(model_id, q_method, private_repo, split_model, split_max_tenso
         print("Model converted to fp16 successfully!")
         print(f"Converted model path: {fp16}")
 
+        imatrix_path = "llama.cpp/imatrix.dat"
+        use_imatrix = q_method.startswith("IQ")
+
+        if use_imatrix:
+            if train_data_file:
+
+                train_data_path = train_data_file.name
+
+
+                print(f"Training data file path: {train_data_path}")
+
+
+                if not os.path.isfile(train_data_path):
+                    raise Exception(f"Training data file not found: {train_data_path}")
+            else:
+                # for now it's a decent fallback/default
+                train_data_path = "imatrix_calibration.txt"
+
+
+                print(f"Using fallback training data file: {train_data_path}")
+
+
+                if not os.path.isfile(train_data_path):
+                    raise Exception(f"Fallback training data file not found: {train_data_path}")
+
+            generate_importance_matrix(fp16, train_data_path)
+        else:
+            print("Not using imatrix quantization. Skipping importance matrix generation.")
+
+
         username = whoami(oauth_token.token)["name"]
-        quantized_gguf_name = f"{model_name.lower()}-{q_method.lower()}.gguf"
-        quantized_gguf_path = quantized_gguf_name
-        quantise_ggml = f"./llama.cpp/quantize {fp16} {quantized_gguf_path} {q_method}"
-        result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
+        quantized_gguf_name = f"{model_name.lower()}-{q_method.lower()}-imat.gguf"
+        quantized_gguf_path = f"llama.cpp/{quantized_gguf_name}"
+        if use_imatrix:
+            quantise_ggml = f"./llama.cpp/quantize --imatrix {imatrix_path} {fp16} {quantized_gguf_path} {q_method}"
+        else:
+            quantise_ggml = f"./llama.cpp/quantize {fp16} {quantized_gguf_path} {q_method}"
+
+
+        print(f"Quantization command: {quantise_ggml}")
+
+        result = subprocess.run(quantise_ggml, shell=True, capture_output=True, text=True)
+
+
+        print(f"Quantization command stdout: {result.stdout}")
+        print(f"Quantization command stderr: {result.stderr}")
+
         if result.returncode != 0:
             raise Exception(f"Error quantizing: {result.stderr}")
         print(f"Quantized successfully with {q_method} option!")
         print(f"Quantized model path: {quantized_gguf_path}")
 
         # Create empty repo
-        new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{q_method}-GGUF", exist_ok=True, private=private_repo)
+        new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{q_method}-imat.gguf", exist_ok=True, private=private_repo)
         new_repo_id = new_repo_url.repo_id
         print("Repo created successfully!", new_repo_url)
 
@@ -173,6 +239,19 @@ def process_model(model_id, q_method, private_repo, split_model, split_max_tenso
         except Exception as e:
             raise Exception(f"Error uploading quantized model: {e}")
 
+
+        imatrix_path = "llama.cpp/imatrix.dat"
+        if os.path.isfile(imatrix_path):
+            try:
+                print(f"Uploading imatrix.dat: {imatrix_path}")
+                api.upload_file(
+                    path_or_fileobj=imatrix_path,
+                    path_in_repo="imatrix.dat",
+                    repo_id=new_repo_id,
+                )
+            except Exception as e:
+                raise Exception(f"Error uploading imatrix.dat: {e}")
+
         api.upload_file(
             path_or_fileobj=f"README.md",
             path_in_repo=f"README.md",
@@ -203,7 +282,7 @@ with gr.Blocks() as demo:
     )
 
     q_method_input = gr.Dropdown(
-        ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
+        ["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S", "Q6_K", "Q8_0"],
         label="Quantization Method",
         info="GGML quantization type",
         value="Q4_K_M",
@@ -216,6 +295,11 @@ with gr.Blocks() as demo:
         info="Create a private repo under your username."
     )
 
+    train_data_file_input = gr.File(
+        label="Training Data File",
+        file_types=["txt"]
+    )
+
     split_model_input = gr.Checkbox(
         value=False,
         label="Split Model",
@@ -241,6 +325,7 @@ with gr.Blocks() as demo:
             model_id_input,
             q_method_input,
             private_repo_input,
+            train_data_file_input,
             split_model_input,
             split_max_tensors_input,
             split_max_size_input,
@@ -258,16 +343,14 @@ with gr.Blocks() as demo:
 
     split_model_input.change(
         fn=update_visibility,
-        inputs=split_model_input,
-        outputs=[split_max_tensors_input, split_max_size_input]
-    )
-
-def restart_space():
-    HfApi().restart_space(repo_id="ggml-org/gguf-my-repo", token=HF_TOKEN, factory_reboot=True)
+        inputs=split_model_input, outputs=[split_max_tensors_input, split_max_size_input]
+    )
+    def restart_space():
+        HfApi().restart_space(repo_id="ggml-org/gguf-my-repo", token=HF_TOKEN, factory_reboot=True)
 
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=21600)
 scheduler.start()
 
-# Launch the interface
+#Launch the interface
 demo.queue(default_concurrency_limit=1, max_size=5).launch(debug=True)
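update_visibility is referenced by the reflowed .change() call but is not defined anywhere in this diff. A plausible definition (an assumption, not taken from the commit) that shows the two split inputs only when the checkbox is ticked:

import gradio as gr

def update_visibility(split_model):
    # One gr.update per output component listed in split_model_input.change(...).
    return gr.update(visible=split_model), gr.update(visible=split_model)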
 