import os

# On a ZeroGPU Space the real `spaces` package is available; otherwise fall back to a
# no-op `spaces.GPU` decorator so the script also runs locally or on a plain GPU.
if os.environ.get("SPACES_ZERO_GPU") is not None:
    import spaces
else:
    class spaces:
        @staticmethod
        def GPU(func):
            def wrapper(*args, **kwargs):
                return func(*args, **kwargs)
            return wrapper
import gradio as gr
import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
#model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model_id = "unsloth/Llama-3.2-11B-Vision-Instruct"
#model_id = "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit"
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"

model = MllamaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
processor = AutoProcessor.from_pretrained(model_id)

@spaces.GPU
def infer(message: str, url: str):
    # Download the image only when an http(s) URL is given; otherwise run text-only.
    image = Image.open(requests.get(url, stream=True).raw) if url and url.startswith("http") else None
    kwargs = {}
    content = [{"type": "text", "text": message}]
    if image is not None:
        kwargs["images"] = image
        # The image placeholder must appear in the prompt only when an image is actually passed.
        content.insert(0, {"type": "image"})
    messages = [{"role": "user", "content": content}]

    # Render the chat template into a prompt string, then tokenize (and preprocess the image).
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    #input_text = "<|image|> If I had to write a haiku for this one, it would be: "
    inputs = processor(
        text=input_text,
        add_special_tokens=False,
        return_tensors="pt",
        **kwargs,
    ).to(model.device)

    output = model.generate(**inputs, max_new_tokens=30)
    output_str = processor.decode(output[0])
    print(message)
    print(url)
    print(output_str)
    return output_str
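
# Sketch (not part of the original app): infer() could also be called directly, without
# launching the Gradio UI, e.g.:
#   print(infer("Describe the image.", url))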

with gr.Blocks() as demo:
    with gr.Row():
        message = gr.Textbox(label="Message", value="Describe the image.", lines=1)
        image_url = gr.Textbox(label="Image URL", value=url, lines=1)
    run_button = gr.Button("Run", variant="primary")
    info_md = gr.Markdown("<br><br><br>")

    run_button.click(infer, [message, image_url], [info_md])

demo.launch()

"""
Describe the image.
https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

<|image|>Describe the image.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

This image features a charming anthropomorphic rabbit, attired in a brown waistcoat and tan pants, with a blue coat draped over his shoulders, standing
If I had to write a haiku for this one, it would be: 

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

<|image|>If I had to write a haiku for this one, it would be: <|eot_id|><|start_header_id|>assistant<|end_header_id|>

It seems like you started to write a haiku but didn't finish. Would you like to complete it?<|eot_id|>
Who are you?

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

<|image|>Who are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

I'm an artificial intelligence model known as Llama. Llama stands for "Large Language Model Meta AI."<|eot_id|>
"""