John6666 committed on
Commit 65df304 · verified · 1 Parent(s): 07279b8

Upload 3 files

Files changed (3)
  1. README.md +15 -15
  2. app.py +108 -94
  3. requirements.txt +5 -4
README.md CHANGED
@@ -1,16 +1,16 @@
- ---
- title: moondream2
- emoji: 🌔
- colorFrom: indigo
- colorTo: blue
- sdk: gradio
- sdk_version: 4.39.0
- app_file: app.py
- pinned: false
- short_description: a tiny vision language model
- license: apache-2.0
- preload_from_hub:
- - vikhyatk/moondream2
- ---
-
+ ---
+ title: moondream2
+ emoji: 🌔
+ colorFrom: indigo
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 5.5.0
+ app_file: app.py
+ pinned: false
+ short_description: a tiny vision language model
+ license: apache-2.0
+ preload_from_hub:
+ - vikhyatk/moondream2
+ ---
+
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,94 +1,108 @@
- import spaces
- import torch
- import re
- import gradio as gr
- from threading import Thread
- from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
- from PIL import ImageDraw
- from torchvision.transforms.v2 import Resize
-
- import subprocess
- subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
-
- model_id = "vikhyatk/moondream2"
- revision = "2024-08-26"
- tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
- moondream = AutoModelForCausalLM.from_pretrained(
-     model_id, trust_remote_code=True, revision=revision,
-     torch_dtype=torch.bfloat16, device_map={"": "cuda"},
-     attn_implementation="flash_attention_2"
- )
- moondream.eval()
-
-
- @spaces.GPU(duration=10)
- def answer_question(img, prompt):
-     image_embeds = moondream.encode_image(img)
-     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
-     thread = Thread(
-         target=moondream.answer_question,
-         kwargs={
-             "image_embeds": image_embeds,
-             "question": prompt,
-             "tokenizer": tokenizer,
-             "streamer": streamer,
-         },
-     )
-     thread.start()
-
-     buffer = ""
-     for new_text in streamer:
-         buffer += new_text
-         yield buffer.strip()
-
- def extract_floats(text):
-     # Regular expression to match an array of four floating point numbers
-     pattern = r"\[\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*\]"
-     match = re.search(pattern, text)
-     if match:
-         # Extract the numbers and convert them to floats
-         return [float(num) for num in match.groups()]
-     return None # Return None if no match is found
-
-
- def extract_bbox(text):
-     bbox = None
-     if extract_floats(text) is not None:
-         x1, y1, x2, y2 = extract_floats(text)
-         bbox = (x1, y1, x2, y2)
-     return bbox
-
- def process_answer(img, answer):
-     if extract_bbox(answer) is not None:
-         x1, y1, x2, y2 = extract_bbox(answer)
-         draw_image = Resize(768)(img)
-         width, height = draw_image.size
-         x1, x2 = int(x1 * width), int(x2 * width)
-         y1, y2 = int(y1 * height), int(y2 * height)
-         bbox = (x1, y1, x2, y2)
-         ImageDraw.Draw(draw_image).rectangle(bbox, outline="red", width=3)
-         return gr.update(visible=True, value=draw_image)
-
-     return gr.update(visible=False, value=None)
-
- with gr.Blocks() as demo:
-     gr.Markdown(
-         """
-         # 🌔 moondream2
-         A tiny vision language model. [GitHub](https://github.com/vikhyat/moondream)
-         """
-     )
-     with gr.Row():
-         prompt = gr.Textbox(label="Input", value="Describe this image.", scale=4)
-         submit = gr.Button("Submit")
-     with gr.Row():
-         img = gr.Image(type="pil", label="Upload an Image")
-         with gr.Column():
-             output = gr.Markdown(label="Response")
-             ann = gr.Image(visible=False, label="Annotated Image")
-
-     submit.click(answer_question, [img, prompt], output)
-     prompt.submit(answer_question, [img, prompt], output)
-     output.change(process_answer, [img, output], ann, show_progress=False)
-
- demo.queue().launch()
+ import os
+ if os.environ.get("SPACES_ZERO_GPU") is not None:
+     import spaces
+ else:
+     class spaces:
+         @staticmethod
+         def GPU(func):
+             def wrapper(*args, **kwargs):
+                 return func(*args, **kwargs)
+             return wrapper
+ import torch
+ import re
+ import gradio as gr
+ from threading import Thread
+ from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
+ from PIL import ImageDraw
+ from torchvision.transforms.v2 import Resize
+
+ import subprocess
+ subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ model_id = "vikhyatk/moondream2"
+ #model_id = "zesquirrelnator/moondream2-finetuneV2"
+ #revision = "2024-08-26"
+ #tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ moondream = AutoModelForCausalLM.from_pretrained(
+     model_id, trust_remote_code=True, #revision=revision,
+     torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32, #device_map="auto",
+     #ignore_mismatched_sizes=True,
+     #attn_implementation="flash_attention_2"
+ ).to(device)
+ moondream.eval()
+
+
+ @spaces.GPU
+ def answer_question(img, prompt):
+     image_embeds = moondream.encode_image(img)
+     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
+     thread = Thread(
+         target=moondream.answer_question,
+         kwargs={
+             "image_embeds": image_embeds,
+             "question": prompt,
+             "tokenizer": tokenizer,
+             "streamer": streamer,
+         },
+     )
+     thread.start()
+
+     buffer = ""
+     for new_text in streamer:
+         buffer += new_text
+         yield buffer.strip()
+
+ def extract_floats(text):
+     # Regular expression to match an array of four floating point numbers
+     pattern = r"\[\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*\]"
+     match = re.search(pattern, text)
+     if match:
+         # Extract the numbers and convert them to floats
+         return [float(num) for num in match.groups()]
+     return None # Return None if no match is found
+
+
+ def extract_bbox(text):
+     bbox = None
+     if extract_floats(text) is not None:
+         x1, y1, x2, y2 = extract_floats(text)
+         bbox = (x1, y1, x2, y2)
+     return bbox
+
+ def process_answer(img, answer):
+     if extract_bbox(answer) is not None:
+         x1, y1, x2, y2 = extract_bbox(answer)
+         draw_image = Resize(768)(img)
+         width, height = draw_image.size
+         x1, x2 = int(x1 * width), int(x2 * width)
+         y1, y2 = int(y1 * height), int(y2 * height)
+         bbox = (x1, y1, x2, y2)
+         ImageDraw.Draw(draw_image).rectangle(bbox, outline="red", width=3)
+         return gr.update(visible=True, value=draw_image)
+
+     return gr.update(visible=False, value=None)
+
+ with gr.Blocks() as demo:
+     gr.Markdown(
+         """
+         # 🌔 moondream2
+         A tiny vision language model. [GitHub](https://github.com/vikhyat/moondream)
+         """
+     )
+     with gr.Row():
+         prompt = gr.Textbox(label="Input", value="Describe this image.", scale=4)
+         submit = gr.Button("Submit")
+     with gr.Row():
+         img = gr.Image(type="pil", label="Upload an Image")
+         with gr.Column():
+             output = gr.Markdown(label="Response")
+             ann = gr.Image(visible=False, label="Annotated Image")
+
+     submit.click(answer_question, [img, prompt], output)
+     prompt.submit(answer_question, [img, prompt], output)
+     output.change(process_answer, [img, output], ann, show_progress=False)
+
+ demo.queue().launch()
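
Note on the ZeroGPU fallback added above: the stub class only supports the bare `@spaces.GPU` form, which is why the old `@spaces.GPU(duration=10)` decorator was reduced to `@spaces.GPU`. A minimal sketch (not part of this commit) of a stub that accepts both the bare and the parameterized form could look like this:

```python
import os

if os.environ.get("SPACES_ZERO_GPU") is not None:
    import spaces
else:
    class spaces:
        @staticmethod
        def GPU(func=None, **kwargs):
            # Bare use: @spaces.GPU passes the function directly.
            # Parameterized use: @spaces.GPU(duration=10) passes only keyword
            # arguments and expects a decorator back.
            def decorator(fn):
                def wrapper(*args, **kw):
                    return fn(*args, **kw)
                return wrapper
            return decorator(func) if callable(func) else decorator
```
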
requirements.txt CHANGED
@@ -1,4 +1,5 @@
- timm==0.9.12
- transformers==4.44.0
- einops==0.8.0
- accelerate==0.32.1
+ timm>=0.9.12
+ transformers>=4.44.0
+ einops>=0.8.0
+ accelerate>=0.32.1
+ numpy<2
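
For a quick check outside the Space, a minimal local smoke test might look like the sketch below. It is not part of this commit; it assumes the same moondream2 remote-code API that app.py uses (`encode_image` / `answer_question`), and `example.jpg` is a placeholder path for any local test image. Without a streamer, `answer_question` is expected to return the answer as a string.

```python
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

# Mirror the device/dtype fallback used in app.py above.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "vikhyatk/moondream2"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
).to(device)
model.eval()

image = Image.open("example.jpg")  # placeholder: any local test image
image_embeds = model.encode_image(image)
print(model.answer_question(image_embeds, "Describe this image.", tokenizer))
```
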