File size: 4,188 Bytes
fda8dae
5781b89
 
 
 
 
07d11bb
 
5781b89
1322687
 
5781b89
 
6d0cb8a
12e7969
5781b89
60e7a28
1322687
 
bac7d5d
5781b89
 
5c4fa84
1250026
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b3a8054
 
1250026
5781b89
fda8dae
5781b89
 
 
 
 
 
 
 
 
 
b2d3b41
 
 
 
5781b89
 
 
 
 
 
6b26249
 
5781b89
07d11bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5781b89
 
 
 
25d4410
1250026
5781b89
 
 
1322687
5781b89
 
 
07d11bb
 
 
 
5781b89
 
07d11bb
5781b89
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import spaces
import torch
import re
import gradio as gr
from threading import Thread
from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
from PIL import ImageDraw
from torchvision.transforms.v2 import Resize

import os
import subprocess

# Install flash-attn when the script starts, with
# FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE set for the pip process.
# The variable is merged into a copy of the current environment: passing a
# bare dict as `env` would replace the child's whole environment and drop
# PATH, HOME and everything else pip may rely on.
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)

# Checkpoint id and revision; both are passed to the tokenizer and model loaders.
model_id = "vikhyatk/moondream2"
revision = "2024-05-20"

tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)

moondream = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=revision,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map={"": "cuda"},
    attn_implementation="flash_attention_2",
)
moondream.eval()

# Deserialize the control vectors onto the CPU, then move each one to the
# GPU as bfloat16 (the same dtype requested for the model above).
control_vectors = [
    vector.to('cuda', dtype=torch.bfloat16)
    for vector in torch.load("control_vectors.pt", map_location="cpu")
]

class LayerWrapper(torch.nn.Module):
    """Wrap a layer and add a scaled control vector to its primary output.

    Args:
        og_layer: The wrapped layer (any callable). It is invoked with exactly
            the positional and keyword arguments the wrapper receives.
        control_vectors: Tensor that is multiplied by ``scale`` and added to
            the layer's primary output; it must broadcast against that output.
        scale: Multiplier applied to ``control_vectors`` before the addition.
    """

    def __init__(self, og_layer, control_vectors, scale=4.2):
        super().__init__()
        self.og_layer = og_layer
        self.control_vectors = control_vectors
        self.scale = scale

    def forward(self, *args, **kwargs):
        """Run the wrapped layer and steer its primary output.

        Returns:
            If the wrapped layer returns a bare tensor, the steered tensor.
            Otherwise a tuple whose first element is the steered first output
            and whose remaining elements are passed through unchanged.
        """
        layer_outputs = self.og_layer(*args, **kwargs)
        offset = self.scale * self.control_vectors

        # A bare tensor must not be indexed: ``layer_outputs[0]`` would pick
        # the first slice along dim 0 instead of the whole output.
        if isinstance(layer_outputs, torch.Tensor):
            return layer_outputs + offset

        return (layer_outputs[0] + offset, *layer_outputs[1:])

# Replace every layer in the text model's stack with a LayerWrapper that adds
# the matching control vector. zip() stops at the shorter input, so a vector
# list shorter than the layer stack would silently drop the trailing layers
# from the rebuilt ModuleList; fail loudly instead.
if len(control_vectors) < len(moondream.text_model.transformer.h):
    raise ValueError(
        f"control_vectors has {len(control_vectors)} entries but the text model "
        f"has {len(moondream.text_model.transformer.h)} layers; "
        "one vector per layer is required."
    )

moondream.text_model.transformer.h = torch.nn.ModuleList([
    LayerWrapper(layer, vector, 4.2)
    for layer, vector in zip(moondream.text_model.transformer.h, control_vectors)
])

@spaces.GPU(duration=10)
def answer_question(img, prompt):
    """Stream the model's answer to ``prompt`` about ``img``.

    Generation runs on a background thread that feeds a
    ``TextIteratorStreamer``; this generator consumes the streamer and yields
    the text accumulated so far after every new chunk.

    Args:
        img: The image to ask about (the UI supplies a PIL image).
        prompt: The question text.

    Yields:
        The accumulated answer so far, with surrounding whitespace stripped.

    Raises:
        gr.Error: If no image was provided.
    """
    # Without an image, encode_image would fail with an opaque traceback;
    # surface a readable message in the UI instead.
    if img is None:
        raise gr.Error("Please upload an image first.")

    image_embeds = moondream.encode_image(img)
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    thread = Thread(
        target=moondream.answer_question,
        kwargs={
            "image_embeds": image_embeds,
            "question": prompt,
            "tokenizer": tokenizer,
            "streamer": streamer,
            "repetition_penalty": 1.2,
            "temperature": 0.1,
            "do_sample": True,
            "length_penalty": 1.2
        },
    )
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer.strip()

    # The streamer is exhausted, so generation is finished; wait for the
    # worker to exit so it does not outlive this call.
    thread.join()

def extract_floats(text):
    """Find the first bracketed list of four decimal numbers in ``text``.

    Each number must have digits on both sides of a decimal point and may
    carry a leading minus sign; whitespace around the numbers is allowed.

    Returns:
        A list of four floats, or None when ``text`` contains no such list.
    """
    pattern = r"\[\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*\]"
    found = re.search(pattern, text)
    if found is None:
        return None
    # One capture group per number, in the order they appear.
    return list(map(float, found.groups()))


def extract_bbox(text):
    """Return the first bounding box found in ``text``.

    Returns:
        A ``(x1, y1, x2, y2)`` tuple of floats taken from the first list that
        ``extract_floats`` finds, or None when it finds none.
    """
    # Parse once and reuse the result instead of running the search twice.
    floats = extract_floats(text)
    if floats is None:
        return None
    x1, y1, x2, y2 = floats
    return (x1, y1, x2, y2)

def process_answer(img, answer):
    """Draw the bounding box mentioned in ``answer`` onto a resized copy of ``img``.

    The box coordinates are multiplied by the resized image's width and
    height, so they are treated as fractions of the image size.

    Args:
        img: The image the answer refers to (the UI supplies a PIL image).
        answer: The answer text to scan for a bounding box.

    Returns:
        A ``gr.update`` that shows the annotated image when a box was found,
        or one that hides and clears the image component otherwise.
    """
    hidden = gr.update(visible=False, value=None)

    # Nothing to annotate without both an answer and an image.
    if not answer or img is None:
        return hidden

    # Parse once and reuse the result instead of running the search twice.
    bbox = extract_bbox(answer)
    if bbox is None:
        return hidden

    x1, y1, x2, y2 = bbox
    draw_image = Resize(768)(img)
    width, height = draw_image.size

    # Order each pair so the first corner is top-left: a box given with its
    # corners reversed describes the same rectangle, but rectangle() may
    # reject it (NOTE(review): recent Pillow releases raise ValueError).
    x1, x2 = sorted((int(x1 * width), int(x2 * width)))
    y1, y2 = sorted((int(y1 * height), int(y2 * height)))

    ImageDraw.Draw(draw_image).rectangle((x1, y1, x2, y2), outline="red", width=3)
    return gr.update(visible=True, value=draw_image)

# Build the demo UI: a header, a prompt row, and an image/response row.
with gr.Blocks() as demo:
    # Page header rendered as Markdown.
    gr.Markdown(
        """
        # 🌜 contemplative moondream
        a demo of [moondream](http://moondream.ai) steered to discuss the meaning of life using [activation vectors](https://github.com/vikhyat/moondream/blob/main/notebooks/RepEng.ipynb)
        """
    )
    # Row 1: the question textbox (pre-filled with a default prompt) and the submit button.
    with gr.Row():
        prompt = gr.Textbox(label="Input", value="Describe this image.", scale=4)
        submit = gr.Button("Submit")
    # Row 2: the uploaded image next to a column holding the response and the
    # annotated image, which starts hidden.
    with gr.Row():
        img = gr.Image(type="pil", label="Upload an Image")
        with gr.Column():
            output = gr.Markdown(label="Response")
            ann = gr.Image(visible=False, label="Annotated Image")

    # Both the button's click event and the textbox's submit event run
    # answer_question on (img, prompt) and send its results to `output`.
    submit.click(answer_question, [img, prompt], output)
    prompt.submit(answer_question, [img, prompt], output)
    # Whenever the response changes, process_answer runs on (img, output) and
    # its result updates the annotated-image component.
    output.change(process_answer, [img, output], ann, show_progress=False)

# Enable the request queue and start the app.
demo.queue().launch()