gguf-my-repo-sp_imat

Sleeping

App Files Files Community

SixOpen committed on May 20, 2024

Commit

65be081

•

1 Parent(s): 72cbfdf

Update app.py

Browse files

Files changed (1) hide show

app.py +98 -15

app.py CHANGED Viewed

@@ -28,6 +28,30 @@ def script_to_use(model_id, api):
     arch = arch[0]
     return "convert.py" if arch in LLAMA_LIKE_ARCHS else "convert-hf-to-gguf.py"
 def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
     if oauth_token.token is None:
         raise ValueError("You have to be logged in.")
@@ -68,11 +92,11 @@ def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, s
     print("Sharded model has been uploaded successfully!")
-def process_model(model_id, q_method, private_repo, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
     if oauth_token.token is None:
         raise ValueError("You must be logged in to use GGUF-my-repo")
     model_name = model_id.split('/')[-1]
-    fp16 = f"{model_name}.fp16.gguf"
     try:
         api = HfApi(token=oauth_token.token)
@@ -107,18 +131,60 @@ def process_model(model_id, q_method, private_repo, split_model, split_max_tenso
         print("Model converted to fp16 successfully!")
         print(f"Converted model path: {fp16}")
         username = whoami(oauth_token.token)["name"]
-        quantized_gguf_name = f"{model_name.lower()}-{q_method.lower()}.gguf"
-        quantized_gguf_path = quantized_gguf_name
-        quantise_ggml = f"./llama.cpp/quantize {fp16} {quantized_gguf_path} {q_method}"
-        result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
         if result.returncode != 0:
             raise Exception(f"Error quantizing: {result.stderr}")
         print(f"Quantized successfully with {q_method} option!")
         print(f"Quantized model path: {quantized_gguf_path}")
         # Create empty repo
-        new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{q_method}-GGUF", exist_ok=True, private=private_repo)
         new_repo_id = new_repo_url.repo_id
         print("Repo created successfully!", new_repo_url)
@@ -173,6 +239,19 @@ def process_model(model_id, q_method, private_repo, split_model, split_max_tenso
             except Exception as e:
                 raise Exception(f"Error uploading quantized model: {e}")
         api.upload_file(
             path_or_fileobj=f"README.md",
             path_in_repo=f"README.md",
@@ -203,7 +282,7 @@ with gr.Blocks() as demo:
     )
     q_method_input = gr.Dropdown(
-        ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
         label="Quantization Method",
         info="GGML quantization type",
         value="Q4_K_M",
@@ -216,6 +295,11 @@ with gr.Blocks() as demo:
         info="Create a private repo under your username."
     )
     split_model_input = gr.Checkbox(
         value=False,
         label="Split Model",
@@ -241,6 +325,7 @@ with gr.Blocks() as demo:
             model_id_input,
             q_method_input,
             private_repo_input,
             split_model_input,
             split_max_tensors_input,
             split_max_size_input,
@@ -258,16 +343,14 @@ with gr.Blocks() as demo:
     split_model_input.change(
         fn=update_visibility,
-        inputs=split_model_input,
-        outputs=[split_max_tensors_input, split_max_size_input]
-    )
-def restart_space():
-    HfApi().restart_space(repo_id="ggml-org/gguf-my-repo", token=HF_TOKEN, factory_reboot=True)
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=21600)
 scheduler.start()
-# Launch the interface
 demo.queue(default_concurrency_limit=1, max_size=5).launch(debug=True)

     arch = arch[0]
     return "convert.py" if arch in LLAMA_LIKE_ARCHS else "convert-hf-to-gguf.py"
+def generate_importance_matrix(model_path, train_data_path):
+    """Compile llama.cpp and run its ``imatrix`` tool on a converted model.
+
+    All commands run from inside the ``llama.cpp`` directory. The caller's
+    working directory is restored before returning or raising, so a failed
+    run cannot leave the long-running app stuck inside ``llama.cpp``.
+
+    Args:
+        model_path: Path to the model file relative to the caller's working
+            directory; it is handed to ``imatrix`` as ``../{model_path}``.
+        train_data_path: Calibration text file passed to ``imatrix -f``.
+            NOTE(review): a relative path is resolved from inside
+            ``llama.cpp``, not from the caller's directory - confirm that
+            callers expect this.
+
+    Raises:
+        Exception: If ``make`` fails, the model file does not exist, or
+            ``imatrix`` exits with a non-zero status.
+    """
+    # Argument list rather than a shell string, so the paths are never
+    # interpreted by a shell.
+    # -ngl 0: No GPU on the basic spaces unlike main, it works regardless but takes >2 hours
+    imatrix_command = ["./imatrix", "-m", f"../{model_path}", "-f", train_data_path, "-ngl", "0"]
+    previous_cwd = os.getcwd()
+    os.chdir("llama.cpp")
+    try:
+        compile_result = subprocess.run(["make"], capture_output=True, text=True)
+        if compile_result.returncode != 0:
+            raise Exception(f"Error compiling imatrix: {compile_result.stderr}")
+        print(f"Current working directory: {os.getcwd()}")
+        print(f"Files in the current directory: {os.listdir('.')}")
+        if not os.path.isfile(f"../{model_path}"):
+            raise Exception(f"Model file not found: {model_path}")
+        result = subprocess.run(imatrix_command, capture_output=True, text=True)
+    finally:
+        # Runs on every exit path, including the raises above.
+        os.chdir(previous_cwd)
+    if result.returncode != 0:
+        raise Exception(f"Error generating importance matrix: {result.stderr}")
+    print("Importance matrix generated successfully!")
 def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
     if oauth_token.token is None:
         raise ValueError("You have to be logged in.")
     print("Sharded model has been uploaded successfully!")
+def process_model(model_id, q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
     if oauth_token.token is None:
         raise ValueError("You must be logged in to use GGUF-my-repo")
     model_name = model_id.split('/')[-1]
+    fp16 = f"llama.cpp/{model_name}.fp16.gguf"
     try:
         api = HfApi(token=oauth_token.token)
         print("Model converted to fp16 successfully!")
         print(f"Converted model path: {fp16}")
+        imatrix_path = "llama.cpp/imatrix.dat"
+        use_imatrix = q_method.startswith("IQ")
+        if use_imatrix:
+            if train_data_file:
+                train_data_path = train_data_file.name
+                print(f"Training data file path: {train_data_path}")
+                if not os.path.isfile(train_data_path):
+                    raise Exception(f"Training data file not found: {train_data_path}")
+            else:
+                # for now it's a decent fallback/default
+                train_data_path = "imatrix_calibration.txt"
+                print(f"Using fallback training data file: {train_data_path}")
+                if not os.path.isfile(train_data_path):
+                    raise Exception(f"Fallback training data file not found: {train_data_path}")
+            generate_importance_matrix(fp16, train_data_path)
+        else:
+            print("Not using imatrix quantization. Skipping importance matrix generation.")
         username = whoami(oauth_token.token)["name"]
+        quantized_gguf_name = f"{model_name.lower()}-{q_method.lower()}-imat.gguf"
+        quantized_gguf_path = f"llama.cpp/{quantized_gguf_name}"
+        if use_imatrix:
+            quantise_ggml = f"./llama.cpp/quantize --imatrix {imatrix_path} {fp16} {quantized_gguf_path} {q_method}"
+        else:
+            quantise_ggml = f"./llama.cpp/quantize {fp16} {quantized_gguf_path} {q_method}"
+        print(f"Quantization command: {quantise_ggml}")
+        result = subprocess.run(quantise_ggml, shell=True, capture_output=True, text=True)
+        print(f"Quantization command stdout: {result.stdout}")
+        print(f"Quantization command stderr: {result.stderr}")
         if result.returncode != 0:
             raise Exception(f"Error quantizing: {result.stderr}")
         print(f"Quantized successfully with {q_method} option!")
         print(f"Quantized model path: {quantized_gguf_path}")
         # Create empty repo
+        new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{q_method}-imat.gguf", exist_ok=True, private=private_repo)
         new_repo_id = new_repo_url.repo_id
         print("Repo created successfully!", new_repo_url)
             except Exception as e:
                 raise Exception(f"Error uploading quantized model: {e}")
+        imatrix_path = "llama.cpp/imatrix.dat"
+        if os.path.isfile(imatrix_path):
+            try:
+                print(f"Uploading imatrix.dat: {imatrix_path}")
+                api.upload_file(
+                    path_or_fileobj=imatrix_path,
+                    path_in_repo="imatrix.dat",
+                    repo_id=new_repo_id,
+                )
+            except Exception as e:
+                raise Exception(f"Error uploading imatrix.dat: {e}")
         api.upload_file(
             path_or_fileobj=f"README.md",
             path_in_repo=f"README.md",
     )
     q_method_input = gr.Dropdown(
+        ["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S", "Q6_K", "Q8_0"],
         label="Quantization Method",
         info="GGML quantization type",
         value="Q4_K_M",
         info="Create a private repo under your username."
     )
+    train_data_file_input = gr.File(
+        label="Training Data File",
+        file_types=["txt"]
+    )
     split_model_input = gr.Checkbox(
         value=False,
         label="Split Model",
             model_id_input,
             q_method_input,
             private_repo_input,
+            train_data_file_input,
             split_model_input,
             split_max_tensors_input,
             split_max_size_input,
     split_model_input.change(
         fn=update_visibility,
+        inputs=split_model_input, outputs=[split_max_tensors_input, split_max_size_input]
+)
+    def restart_space():
+        """Request a factory reboot of the "ggml-org/gguf-my-repo" Space using HF_TOKEN."""
+        # NOTE(review): repo_id is hard-coded; confirm it names the Space this app is deployed as.
+        hub_client = HfApi()
+        hub_client.restart_space(
+            repo_id="ggml-org/gguf-my-repo",
+            token=HF_TOKEN,
+            factory_reboot=True,
+        )
+# Call restart_space from a background scheduler once every 21600 seconds (6 hours).
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=21600)
 scheduler.start()
+# Launch the interface with a request queue (default_concurrency_limit=1, max_size=5) in debug mode.
 demo.queue(default_concurrency_limit=1, max_size=5).launch(debug=True)