Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 6,524 Bytes
57c7ce1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 |
import os
import subprocess
import signal
# Switch Gradio analytics off via its environment flag. The assignment sits
# ahead of the `import gradio` statement that follows it.
# NOTE(review): presumably Gradio reads this flag at import time - confirm.
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
import gradio as gr
import tempfile
from huggingface_hub import HfApi, ModelCard, whoami
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from pathlib import Path
from textwrap import dedent
from apscheduler.schedulers.background import BackgroundScheduler
# Hub token taken from the environment (None when the variable is unset);
# it is the token handed to the periodic Space-restart call.
HF_TOKEN = os.environ.get("HF_TOKEN")
# File name of the conversion script; it is invoked as llama.cpp/<name>.
CONVERSION_SCRIPT = "convert_lora_to_gguf.py"
def process_model(peft_model_id: str, q_method: str, private_repo: bool, oauth_token: gr.OAuthToken | None) -> str:
    """Convert a PEFT LoRA adapter from the Hub to GGUF and publish the result.

    The adapter files of ``peft_model_id`` are downloaded into a temporary
    directory, converted with ``llama.cpp/convert_lora_to_gguf.py``, and the
    resulting GGUF file plus a generated README are uploaded to
    ``<username>/<model_name>-<q_method>-GGUF`` (created if missing).

    Args:
        peft_model_id: Hub repo id of the PEFT LoRA adapter to convert.
        q_method: Output type; passed lower-cased to ``--outtype``.
        private_repo: Whether the destination repo is created as private.
        oauth_token: OAuth token of the logged-in user, or ``None``.

    Returns:
        An HTML snippet: a success message linking to the new repo, or an
        error message when any step after the login check fails.

    Raises:
        ValueError: If there is no OAuth token (user is not logged in).
    """
    import html  # local import: only needed to escape the error message

    # The token object itself may be None, so check it before reading
    # ``.token``; otherwise a logged-out user triggers an AttributeError.
    if oauth_token is None or oauth_token.token is None:
        raise ValueError("You must be logged in to use GGUF-my-lora")

    model_name = peft_model_id.split("/")[-1]
    gguf_output_name = f"{model_name}-{q_method.lower()}.gguf"

    try:
        api = HfApi(token=oauth_token.token)

        # Prefer safetensors weights when the repo has any, else fall back to *.bin.
        has_safetensors = any(
            file.path.endswith(".safetensors")
            for file in api.list_repo_tree(repo_id=peft_model_id, recursive=True)
        )
        dl_pattern = ["*.md", "*.json", "*.model"]
        dl_pattern.append("*.safetensors" if has_safetensors else "*.bin")

        os.makedirs("downloads", exist_ok=True)

        # Every intermediate artifact (download, GGUF file, README) lives in
        # the temporary directory so nothing is left behind in the working
        # directory after the request finishes or fails.
        with tempfile.TemporaryDirectory(dir="downloads") as tmpdir:
            # Keep the model name as the dirname so the model name metadata is populated correctly
            local_dir = Path(tmpdir) / model_name
            gguf_path = Path(tmpdir) / gguf_output_name
            readme_path = Path(tmpdir) / "README.md"
            print(local_dir)
            api.snapshot_download(repo_id=peft_model_id, local_dir=local_dir, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
            print("Model downloaded successfully!")
            print(f"Current working directory: {os.getcwd()}")
            print(f"Model directory contents: {os.listdir(local_dir)}")

            adapter_config_path = local_dir / "adapter_config.json"
            if not adapter_config_path.exists():
                raise Exception("adapter_config.json not found. Please ensure the selected repo is a PEFT LoRA model.")

            # Argument list instead of a shell string: the repo name and the
            # quantization method come from the request and must never be
            # interpreted by a shell.
            conversion_cmd = [
                "python",
                f"llama.cpp/{CONVERSION_SCRIPT}",
                str(local_dir),
                "--outtype",
                q_method.lower(),
                "--outfile",
                str(gguf_path),
            ]
            # text=True so stderr is a readable string rather than a bytes repr.
            result = subprocess.run(conversion_cmd, capture_output=True, text=True, errors="replace")
            print(result)
            if result.returncode != 0:
                raise Exception(f"Error converting to GGUF {q_method}: {result.stderr}")
            print("Model converted to GGUF successfully!")
            print(f"Converted model path: {gguf_path}")

            # Create empty repo
            username = whoami(oauth_token.token)["name"]
            new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{q_method}-GGUF", exist_ok=True, private=private_repo)
            new_repo_id = new_repo_url.repo_id
            print("Repo created successfully!", new_repo_url)

            # Upload the GGUF model
            api.upload_file(
                path_or_fileobj=str(gguf_path),
                path_in_repo=gguf_output_name,
                repo_id=new_repo_id,
            )
            print("Uploaded", gguf_output_name)

            # Best effort: start from the source repo's card when it can be
            # loaded, otherwise fall back to an empty one.
            try:
                card = ModelCard.load(peft_model_id, token=oauth_token.token)
            except Exception:
                card = ModelCard("")
            if card.data.tags is None:
                card.data.tags = []
            card.data.tags.append("llama-cpp")
            card.data.tags.append("gguf-my-lora")
            card.data.base_model = peft_model_id
            card.text = dedent(
                f"""
                # {new_repo_id}
                This LoRA adapter was converted to GGUF format from [`{peft_model_id}`](https://huggingface.co/{peft_model_id}) via the ggml.ai's [GGUF-my-lora](https://huggingface.co/spaces/ggml-org/gguf-my-lora) space.
                Refer to the [original adapter repository](https://huggingface.co/{peft_model_id}) for more details.
                ## Use with llama.cpp
                ```bash
                # with cli
                llama-cli -m base_model.gguf --lora {gguf_output_name} (...other args)
                # with server
                llama-server -m base_model.gguf --lora {gguf_output_name} (...other args)
                ```
                To know more about LoRA usage with llama.cpp server, refer to the [llama.cpp server documentation](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md).
                """
            )
            card.save(str(readme_path))

            api.upload_file(
                path_or_fileobj=str(readme_path),
                path_in_repo="README.md",
                repo_id=new_repo_id,
            )

        return (
            f'<h1>✅ DONE</h1><br/><br/>Find your repo here: <a href="{new_repo_url}" target="_blank" style="text-decoration:underline">{new_repo_id}</a>'
        )
    except Exception as e:
        # Escape the message: converter stderr often contains fragments such
        # as "<module>" that would otherwise be consumed as HTML tags.
        return f"<h1>❌ ERROR</h1><br/><br/>{html.escape(str(e))}"
# Extra stylesheet passed to gr.Blocks(css=...); its single rule sets
# overflow-y: auto on the .gradio-container element.
css="""/* Custom CSS to allow scrolling */
.gradio-container {overflow-y: auto;}
"""
# Create Gradio interface
# Components are created inside the Blocks context in the order below:
# login notice, login button, the three inputs, then the Interface that
# wires them to process_model.
with gr.Blocks(css=css) as demo:
    gr.Markdown("You must be logged in to use GGUF-my-lora.")
    gr.LoginButton(min_width=250)
    # Search box used to pick the PEFT LoRA repository to convert.
    peft_model_id = HuggingfaceHubSearch(
        label="PEFT LoRA repository",
        placeholder="Search for repository on Huggingface",
        search_type="model",
    )
    # Output type choice; process_model lower-cases it for --outtype.
    q_method = gr.Dropdown(
        ["F32", "F16", "Q8_0"],
        label="Quantization Method",
        info="(Note: Quantization less than Q8 produces very poor results)",
        value="F16",
        filterable=False,
        visible=True
    )
    private_repo = gr.Checkbox(
        value=False,
        label="Private Repo",
        info="Create a private repo under your username."
    )
    # Only three inputs are listed although process_model takes a fourth
    # parameter, oauth_token.
    # NOTE(review): presumably Gradio injects oauth_token from the
    # gr.OAuthToken type hint - confirm against the Gradio OAuth docs.
    # NOTE(review): the description says a "Public repo" is created, yet the
    # Private Repo checkbox above can request a private one - confirm wording.
    iface = gr.Interface(
        fn=process_model,
        inputs=[
            peft_model_id,
            q_method,
            private_repo,
        ],
        outputs=[
            gr.Markdown(label="output"),
        ],
        title="Convert PEFT LoRA adapters to GGUF, blazingly fast ⚡!",
        description="The space takes a PEFT LoRA (stored on a HF repo) as an input, converts it to GGUF and creates a Public repo under your HF user namespace.",
        api_name=False
    )
def restart_space():
    """Request a factory reboot of the ggml-org/gguf-my-lora Space."""
    hub_client = HfApi()
    hub_client.restart_space(
        repo_id="ggml-org/gguf-my-lora",
        token=HF_TOKEN,
        factory_reboot=True,
    )
# Run restart_space on a fixed interval of 21600 seconds (six hours).
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=6 * 60 * 60)
scheduler.start()

# Launch the interface: one request at a time, at most five waiting.
demo.queue(default_concurrency_limit=1, max_size=5)
demo.launch(debug=True, show_api=False)