Upload 6 files

- app.py +11 -5
- joycaption.py +82 -26
- packages.txt +1 -0
- pre-requirements.txt +1 -0
- requirements.txt +4 -2
app.py
CHANGED

@@ -1,6 +1,6 @@
 import spaces
 import gradio as gr
-from joycaption import stream_chat_mod, get_text_model, change_text_model
+from joycaption import stream_chat_mod, get_text_model, change_text_model, get_repo_gguf
 
 JC_TITLE_MD = "<h1><center>JoyCaption Pre-Alpha Mod</center></h1>"
 JC_DESC_MD = """This space is mod of [fancyfeast/joy-caption-pre-alpha](https://huggingface.co/spaces/fancyfeast/joy-caption-pre-alpha),
@@ -17,9 +17,14 @@ with gr.Blocks() as demo:
     with gr.Group():
         jc_input_image = gr.Image(type="pil", label="Input Image", sources=["upload", "clipboard"], height=384)
         with gr.Accordion("Advanced", open=False):
-
-
-
+            with gr.Row():
+                jc_text_model = gr.Dropdown(label="LLM Model", info="You can enter a huggingface model repo_id to want to use.",
+                                            choices=get_text_model(), value=get_text_model()[0],
+                                            allow_custom_value=True, interactive=True, min_width=320)
+                jc_gguf = gr.Dropdown(label=f"GGUF Filename", choices=[], value="",
+                                      allow_custom_value=True, min_width=320, visible=False)
+                jc_nf4 = gr.Checkbox(label="Use NF4 quantization", value=True)
+                jc_text_model_button = gr.Button("Load Model", variant="secondary")
             jc_use_inference_client = gr.Checkbox(label="Use Inference Client", value=False, visible=False)
             with gr.Row():
                 jc_tokens = gr.Slider(minimum=1, maximum=4096, value=300, step=1, label="Max tokens")
@@ -32,7 +37,8 @@ with gr.Blocks() as demo:
     gr.Markdown(JC_DESC_MD, elem_classes="info")
 
     jc_run_button.click(fn=stream_chat_mod, inputs=[jc_input_image, jc_tokens, jc_topk, jc_temperature], outputs=[jc_output_caption])
-
+    jc_text_model_button.click(change_text_model, [jc_text_model, jc_use_inference_client, jc_gguf, jc_nf4], [jc_text_model], show_api=False)
+    #jc_text_model.change(get_repo_gguf, [jc_text_model], [jc_gguf], show_api=False)
     jc_use_inference_client.change(change_text_model, [jc_text_model, jc_use_inference_client], [jc_text_model], show_api=False)
 
 if __name__ == "__main__":
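A minimal, self-contained sketch of the control pattern this commit wires into app.py: a free-form model dropdown, a hidden GGUF-filename dropdown, an NF4 checkbox, and a "Load Model" button bound to a loader callback. The names `demo_models` and `fake_load_model` are placeholders for illustration only, not identifiers from this Space.

import gradio as gr

demo_models = ["meta-llama/Meta-Llama-3.1-8B"]  # stand-in for get_text_model()

def fake_load_model(repo_id, gguf_file, use_nf4):
    # The real callback loads the LLM; here we only record the choice and
    # refresh the dropdown, mirroring `return gr.update(choices=...)`.
    if repo_id not in demo_models:
        demo_models.append(repo_id)
    gr.Info(f"Would load {repo_id} (gguf={gguf_file or 'none'}, nf4={use_nf4})")
    return gr.update(choices=demo_models, value=repo_id)

with gr.Blocks() as demo:
    with gr.Row():
        model = gr.Dropdown(label="LLM Model", choices=demo_models, value=demo_models[0], allow_custom_value=True)
        gguf = gr.Dropdown(label="GGUF Filename", choices=[], value="", allow_custom_value=True, visible=False)
        nf4 = gr.Checkbox(label="Use NF4 quantization", value=True)
        load_btn = gr.Button("Load Model")
    load_btn.click(fake_load_model, [model, gguf, nf4], [model])

if __name__ == "__main__":
    demo.launch()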
joycaption.py
CHANGED

@@ -12,17 +12,16 @@ import gc
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-llm_models = [
-    "Sao10K/Llama-3.1-8B-Stheno-v3.4",
-    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
-    "mergekit-community/L3.1-Boshima-b-FIX",
-    "meta-llama/Meta-Llama-3.1-8B",
-]
-
+llm_models = {
+    "Sao10K/Llama-3.1-8B-Stheno-v3.4": None,
+    "unsloth/Meta-Llama-3.1-8B-bnb-4bit": None,
+    "mergekit-community/L3.1-Boshima-b-FIX": None,
+    "meta-llama/Meta-Llama-3.1-8B": None,
+}
 
 CLIP_PATH = "google/siglip-so400m-patch14-384"
 VLM_PROMPT = "A descriptive caption for this image:\n"
-MODEL_PATH = llm_models[0]
+MODEL_PATH = list(llm_models.keys())[0]
 CHECKPOINT_PATH = Path("wpkklhc6")
 TITLE = "<h1><center>JoyCaption Pre-Alpha (2024-07-30a)</center></h1>"
 
@@ -42,21 +41,41 @@ class ImageAdapter(nn.Module):
         x = self.linear2(x)
         return x
 
+# https://huggingface.co/docs/transformers/v4.44.2/gguf
+# https://github.com/city96/ComfyUI-GGUF/issues/7
+# https://github.com/THUDM/ChatGLM-6B/issues/18
+# https://github.com/meta-llama/llama/issues/394
+# https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/discussions/109
 # https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
 # https://huggingface.co/google/flan-ul2/discussions/8
-
+# https://huggingface.co/blog/4bit-transformers-bitsandbytes
+tokenizer = None
 text_model_client = None
 text_model = None
 image_adapter = None
-def load_text_model(model_name: str=MODEL_PATH):
+def load_text_model(model_name: str=MODEL_PATH, gguf_file: str | None=None, is_nf4: bool=True):
+    global tokenizer
     global text_model
     global image_adapter
-    global text_model_client
-    global use_inference_client
+    global text_model_client #
+    global use_inference_client #
     try:
+        from transformers import BitsAndBytesConfig
+        nf4_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
+                                        bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.bfloat16)
+        print("Loading tokenizer")
+        if gguf_file: tokenizer = AutoTokenizer.from_pretrained(model_name, gguf_file=gguf_file, use_fast=True, legacy=False)
+        else: tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, legacy=False)
+        assert isinstance(tokenizer, PreTrainedTokenizer) or isinstance(tokenizer, PreTrainedTokenizerFast), f"Tokenizer is of type {type(tokenizer)}"
         print(f"Loading LLM: {model_name}")
-        if
-
+        if gguf_file:
+            if device == "cpu": text_model = AutoModelForCausalLM.from_pretrained(model_name, gguf_file=gguf_file, device_map=device, torch_dtype=torch.bfloat16).eval()
+            elif is_nf4: text_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=nf4_config, device_map=device, torch_dtype=torch.bfloat16).eval()
+            else: text_model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device, torch_dtype=torch.bfloat16).eval()
+        else:
+            if device == "cpu": text_model = AutoModelForCausalLM.from_pretrained(model_name, gguf_file=gguf_file, device_map=device, torch_dtype=torch.bfloat16).eval()
+            elif is_nf4: text_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=nf4_config, device_map=device, torch_dtype=torch.bfloat16).eval()
+            else: text_model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device, torch_dtype=torch.bfloat16).eval()
         print("Loading image adapter")
         image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size).eval().to("cpu")
         image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu", weights_only=True))
@@ -76,10 +95,6 @@ clip_processor = AutoProcessor.from_pretrained(CLIP_PATH)
 clip_model = AutoModel.from_pretrained(CLIP_PATH).vision_model.eval().requires_grad_(False).to(device)
 
 # Tokenizer
-print("Loading tokenizer")
-tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False)
-assert isinstance(tokenizer, PreTrainedTokenizer) or isinstance(tokenizer, PreTrainedTokenizerFast), f"Tokenizer is of type {type(tokenizer)}"
-
 # LLM
 # Image Adapter
 load_text_model()
@@ -176,11 +191,17 @@ def stream_chat_mod(input_image: Image.Image, max_new_tokens: int=300, top_k: in
     ], dim=1).to(device)
     attention_mask = torch.ones_like(input_ids)
 
+    # https://huggingface.co/docs/transformers/v4.44.2/main_classes/text_generation#transformers.FlaxGenerationMixin.generate
+    # https://github.com/huggingface/transformers/issues/6535
+    # https://zenn.dev/hijikix/articles/8c445f4373fdcc ja
+    # https://github.com/ggerganov/llama.cpp/discussions/7712
    # https://huggingface.co/docs/huggingface_hub/guides/inference#openai-compatibility
    # https://huggingface.co/docs/huggingface_hub/v0.24.6/en/package_reference/inference_client#huggingface_hub.InferenceClient.text_generation
    #generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=False, suppress_tokens=None)
    generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask,
                                       max_new_tokens=max_new_tokens, do_sample=True, top_k=top_k, temperature=temperature, suppress_tokens=None)
+
+    print(prompt)
 
    # Trim off the prompt
    generate_ids = generate_ids[:, input_ids.shape[1]:]
@@ -199,8 +220,8 @@ def is_repo_name(s):
 
 def is_repo_exists(repo_id):
     from huggingface_hub import HfApi
-    api = HfApi()
     try:
+        api = HfApi(token=HF_TOKEN)
         if api.repo_exists(repo_id=repo_id): return True
         else: return False
     except Exception as e:
@@ -210,24 +231,59 @@ def is_repo_exists(repo_id):
 
 
 def get_text_model():
-    return llm_models
+    return list(llm_models.keys())
+
+
+def is_gguf_repo(repo_id: str):
+    from huggingface_hub import HfApi
+    try:
+        api = HfApi(token=HF_TOKEN)
+        if not is_repo_name(repo_id) or not is_repo_exists(repo_id): return False
+        files = api.list_repo_files(repo_id=repo_id)
+    except Exception as e:
+        print(f"Error: Failed to get {repo_id}'s info.")
+        print(e)
+        gr.Warning(f"Error: Failed to get {repo_id}'s info.")
+        return False
+    files = [f for f in files if f.endswith(".gguf")]
+    if len(files) == 0: return False
+    else: return True
+
+
+def get_repo_gguf(repo_id: str):
+    from huggingface_hub import HfApi
+    try:
+        api = HfApi(token=HF_TOKEN)
+        if not is_repo_name(repo_id) or not is_repo_exists(repo_id): return gr.update(value="", choices=[])
+        files = api.list_repo_files(repo_id=repo_id)
+    except Exception as e:
+        print(f"Error: Failed to get {repo_id}'s info.")
+        print(e)
+        gr.Warning(f"Error: Failed to get {repo_id}'s info.")
+        return gr.update(value="", choices=[])
+    files = [f for f in files if f.endswith(".gguf")]
+    if len(files) == 0: return gr.update(value="", choices=[])
+    else: return gr.update(value=files[0], choices=files)
 
 
 @spaces.GPU()
-def change_text_model(model_name: str=MODEL_PATH, use_client: bool=False,
+def change_text_model(model_name: str=MODEL_PATH, use_client: bool=False, gguf_file: str | None=None,
+                      is_nf4: bool=True, progress=gr.Progress(track_tqdm=True)):
     global use_inference_client
-    global text_model
     global llm_models
     use_inference_client = use_client
     try:
         if not is_repo_name(model_name) or not is_repo_exists(model_name):
             raise gr.Error(f"Repo doesn't exist: {model_name}")
+        if not gguf_file and is_gguf_repo(model_name):
+            gr.Info(f"Please select a gguf file.")
+            return gr.update(visible=True)
         if use_inference_client:
-            pass
+            pass #
         else:
-            load_text_model(model_name)
-            if model_name not in llm_models: llm_models
-            return gr.update(
+            load_text_model(model_name, gguf_file, is_nf4)
+            if model_name not in llm_models: llm_models[model_name] = gguf_file if gguf_file else None
+            return gr.update(choices=get_text_model())
     except Exception as e:
         raise gr.Error(f"Model load error: {model_name}, {e}")
 
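For readers following the new load_text_model() above, a compact, hedged sketch of the same two loading paths: ordinary repos are loaded in 4-bit NF4 through BitsAndBytesConfig, GGUF repos through transformers' gguf_file argument (dequantized on load; requires the gguf package). `load_llm` and the CUDA guard are illustrative additions, not code from this Space.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

def load_llm(repo_id: str, gguf_file: str | None = None, is_nf4: bool = True):
    # 4-bit NF4 config, as in load_text_model()
    nf4_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
                                    bnb_4bit_use_double_quant=True,
                                    bnb_4bit_compute_dtype=torch.bfloat16)
    if gguf_file:
        # GGUF checkpoint: transformers dequantizes it on load (needs `gguf` installed)
        tokenizer = AutoTokenizer.from_pretrained(repo_id, gguf_file=gguf_file)
        model = AutoModelForCausalLM.from_pretrained(repo_id, gguf_file=gguf_file,
                                                     torch_dtype=torch.bfloat16)
    elif is_nf4 and torch.cuda.is_available():
        # bitsandbytes NF4 path needs a CUDA device
        tokenizer = AutoTokenizer.from_pretrained(repo_id)
        model = AutoModelForCausalLM.from_pretrained(repo_id, quantization_config=nf4_config,
                                                     device_map="auto", torch_dtype=torch.bfloat16)
    else:
        tokenizer = AutoTokenizer.from_pretrained(repo_id)
        model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.bfloat16)
    return tokenizer, model.eval()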
packages.txt
ADDED

@@ -0,0 +1 @@
+git-lfs
pre-requirements.txt
ADDED

@@ -0,0 +1 @@
+pip>=23.0.0
requirements.txt
CHANGED

@@ -1,8 +1,10 @@
 huggingface_hub
 accelerate
 torch
-transformers
+git+https://github.com/huggingface/transformers
 sentencepiece
 bitsandbytes
 Pillow
-protobuf
+protobuf
+gguf
+numpy<2.0.0