John6666 committed on
Commit 65df304 · verified · 1 Parent(s): 07279b8

Upload 3 files

Files changed (3)
  1. README.md +15 -15
  2. app.py +108 -94
  3. requirements.txt +5 -4
README.md CHANGED
@@ -1,16 +1,16 @@
- ---
- title: moondream2
- emoji: 🌔
- colorFrom: indigo
- colorTo: blue
- sdk: gradio
- sdk_version: 4.39.0
- app_file: app.py
- pinned: false
- short_description: a tiny vision language model
- license: apache-2.0
- preload_from_hub:
- - vikhyatk/moondream2
- ---
-
+ ---
+ title: moondream2
+ emoji: 🌔
+ colorFrom: indigo
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 5.5.0
+ app_file: app.py
+ pinned: false
+ short_description: a tiny vision language model
+ license: apache-2.0
+ preload_from_hub:
+ - vikhyatk/moondream2
+ ---
+
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,94 +1,108 @@
- import spaces
- import torch
- import re
- import gradio as gr
- from threading import Thread
- from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
- from PIL import ImageDraw
- from torchvision.transforms.v2 import Resize
-
- import subprocess
- subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
-
- model_id = "vikhyatk/moondream2"
- revision = "2024-08-26"
- tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
- moondream = AutoModelForCausalLM.from_pretrained(
-     model_id, trust_remote_code=True, revision=revision,
-     torch_dtype=torch.bfloat16, device_map={"": "cuda"},
-     attn_implementation="flash_attention_2"
- )
- moondream.eval()
-
-
- @spaces.GPU(duration=10)
- def answer_question(img, prompt):
-     image_embeds = moondream.encode_image(img)
-     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
-     thread = Thread(
-         target=moondream.answer_question,
-         kwargs={
-             "image_embeds": image_embeds,
-             "question": prompt,
-             "tokenizer": tokenizer,
-             "streamer": streamer,
-         },
-     )
-     thread.start()
-
-     buffer = ""
-     for new_text in streamer:
-         buffer += new_text
-         yield buffer.strip()
-
- def extract_floats(text):
-     # Regular expression to match an array of four floating point numbers
-     pattern = r"\[\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*\]"
-     match = re.search(pattern, text)
-     if match:
-         # Extract the numbers and convert them to floats
-         return [float(num) for num in match.groups()]
-     return None # Return None if no match is found
-
-
- def extract_bbox(text):
-     bbox = None
-     if extract_floats(text) is not None:
-         x1, y1, x2, y2 = extract_floats(text)
-         bbox = (x1, y1, x2, y2)
-     return bbox
-
- def process_answer(img, answer):
-     if extract_bbox(answer) is not None:
-         x1, y1, x2, y2 = extract_bbox(answer)
-         draw_image = Resize(768)(img)
-         width, height = draw_image.size
-         x1, x2 = int(x1 * width), int(x2 * width)
-         y1, y2 = int(y1 * height), int(y2 * height)
-         bbox = (x1, y1, x2, y2)
-         ImageDraw.Draw(draw_image).rectangle(bbox, outline="red", width=3)
-         return gr.update(visible=True, value=draw_image)
-
-     return gr.update(visible=False, value=None)
-
- with gr.Blocks() as demo:
-     gr.Markdown(
-         """
-         # 🌔 moondream2
-         A tiny vision language model. [GitHub](https://github.com/vikhyat/moondream)
-         """
-     )
-     with gr.Row():
-         prompt = gr.Textbox(label="Input", value="Describe this image.", scale=4)
-         submit = gr.Button("Submit")
-     with gr.Row():
-         img = gr.Image(type="pil", label="Upload an Image")
-         with gr.Column():
-             output = gr.Markdown(label="Response")
-             ann = gr.Image(visible=False, label="Annotated Image")
-
-     submit.click(answer_question, [img, prompt], output)
-     prompt.submit(answer_question, [img, prompt], output)
-     output.change(process_answer, [img, output], ann, show_progress=False)
-
- demo.queue().launch()
+ import os
+ if os.environ.get("SPACES_ZERO_GPU") is not None:
+     import spaces
+ else:
+     class spaces:
+         @staticmethod
+         def GPU(func):
+             def wrapper(*args, **kwargs):
+                 return func(*args, **kwargs)
+             return wrapper
+ import torch
+ import re
+ import gradio as gr
+ from threading import Thread
+ from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
+ from PIL import ImageDraw
+ from torchvision.transforms.v2 import Resize
+
+ import subprocess
+ subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ model_id = "vikhyatk/moondream2"
+ #model_id = "zesquirrelnator/moondream2-finetuneV2"
+ #revision = "2024-08-26"
+ #tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ moondream = AutoModelForCausalLM.from_pretrained(
+     model_id, trust_remote_code=True, #revision=revision,
+     torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32, #device_map="auto",
+     #ignore_mismatched_sizes=True,
+     #attn_implementation="flash_attention_2"
+ ).to(device)
+ moondream.eval()
+
+
+ @spaces.GPU
+ def answer_question(img, prompt):
+     image_embeds = moondream.encode_image(img)
+     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
+     thread = Thread(
+         target=moondream.answer_question,
+         kwargs={
+             "image_embeds": image_embeds,
+             "question": prompt,
+             "tokenizer": tokenizer,
+             "streamer": streamer,
+         },
+     )
+     thread.start()
+
+     buffer = ""
+     for new_text in streamer:
+         buffer += new_text
+         yield buffer.strip()
+
+ def extract_floats(text):
+     # Regular expression to match an array of four floating point numbers
+     pattern = r"\[\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*\]"
+     match = re.search(pattern, text)
+     if match:
+         # Extract the numbers and convert them to floats
+         return [float(num) for num in match.groups()]
+     return None # Return None if no match is found
+
+
+ def extract_bbox(text):
+     bbox = None
+     if extract_floats(text) is not None:
+         x1, y1, x2, y2 = extract_floats(text)
+         bbox = (x1, y1, x2, y2)
+     return bbox
+
+ def process_answer(img, answer):
+     if extract_bbox(answer) is not None:
+         x1, y1, x2, y2 = extract_bbox(answer)
+         draw_image = Resize(768)(img)
+         width, height = draw_image.size
+         x1, x2 = int(x1 * width), int(x2 * width)
+         y1, y2 = int(y1 * height), int(y2 * height)
+         bbox = (x1, y1, x2, y2)
+         ImageDraw.Draw(draw_image).rectangle(bbox, outline="red", width=3)
+         return gr.update(visible=True, value=draw_image)
+
+     return gr.update(visible=False, value=None)
+
+ with gr.Blocks() as demo:
+     gr.Markdown(
+         """
+         # 🌔 moondream2
+         A tiny vision language model. [GitHub](https://github.com/vikhyat/moondream)
+         """
+     )
+     with gr.Row():
+         prompt = gr.Textbox(label="Input", value="Describe this image.", scale=4)
+         submit = gr.Button("Submit")
+     with gr.Row():
+         img = gr.Image(type="pil", label="Upload an Image")
+         with gr.Column():
+             output = gr.Markdown(label="Response")
+             ann = gr.Image(visible=False, label="Annotated Image")
+
+     submit.click(answer_question, [img, prompt], output)
+     prompt.submit(answer_question, [img, prompt], output)
+     output.change(process_answer, [img, output], ann, show_progress=False)
+
+ demo.queue().launch()
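
Note on the ZeroGPU fallback added above: the stub class only supports the bare `@spaces.GPU` form, which is why the old `@spaces.GPU(duration=10)` decorator was reduced to `@spaces.GPU`. A minimal sketch (not part of this commit) of a stub that accepts both the bare and the parameterized form could look like this:

```python
import os

if os.environ.get("SPACES_ZERO_GPU") is not None:
    import spaces
else:
    class spaces:
        @staticmethod
        def GPU(func=None, **kwargs):
            # Bare use: @spaces.GPU passes the function directly.
            # Parameterized use: @spaces.GPU(duration=10) passes only keyword
            # arguments and expects a decorator back.
            def decorator(fn):
                def wrapper(*args, **kw):
                    return fn(*args, **kw)
                return wrapper
            return decorator(func) if callable(func) else decorator
```
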
requirements.txt CHANGED
@@ -1,4 +1,5 @@
- timm==0.9.12
- transformers==4.44.0
- einops==0.8.0
- accelerate==0.32.1
+ timm>=0.9.12
+ transformers>=4.44.0
+ einops>=0.8.0
+ accelerate>=0.32.1
+ numpy<2
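
For a quick check outside the Space, a minimal local smoke test might look like the sketch below. It is not part of this commit; it assumes the same moondream2 remote-code API that app.py uses (`encode_image` / `answer_question`), and `example.jpg` is a placeholder path for any local test image. Without a streamer, `answer_question` is expected to return the answer as a string.

```python
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

# Mirror the device/dtype fallback used in app.py above.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "vikhyatk/moondream2"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
).to(device)
model.eval()

image = Image.open("example.jpg")  # placeholder: any local test image
image_embeds = model.encode_image(image)
print(model.answer_question(image_embeds, "Describe this image.", tokenizer))
```
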