File size: 3,562 Bytes
36a6fc9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import io
import os
import subprocess

import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM
import spaces
from PIL import Image
# Install flash-attn at startup with FLASH_ATTENTION_SKIP_CUDA_BUILD set.
# The flag is merged into a copy of the current environment: a bare dict passed
# as ``env`` replaces the whole environment, so PATH, HOME and similar
# variables would be missing for the pip subprocess.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

# Checkpoint ids served by the demo; each one is loaded once at import time.
_MODEL_IDS = ("maxiw/Florence-2-ScreenQA-base",)

# Model per checkpoint id, moved to the "cuda" device and switched to eval mode.
models = {
    checkpoint: AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True).to("cuda").eval()
    for checkpoint in _MODEL_IDS
}

# Matching processor per checkpoint id.
processors = {
    checkpoint: AutoProcessor.from_pretrained(checkpoint, trust_remote_code=True)
    for checkpoint in _MODEL_IDS
}


# Markdown heading rendered at the top of the page.
DESCRIPTION = "# [Florence-2-ScreenQA Demo](https://huggingface.co/maxiw/Florence-2-ScreenQA-base)"


@spaces.GPU
def run_example(task_prompt, image, text_input=None, model_id="maxiw/Florence-2-ScreenQA-base"):
    """Generate and post-process the model output for a single image.

    Args:
        task_prompt: Task token that starts the prompt; also passed as
            ``task`` to the processor's post-processing step.
        image: Input image; its ``width`` and ``height`` attributes are read.
        text_input: Optional text appended directly after ``task_prompt``.
        model_id: Key used to look up the entry in ``models`` and
            ``processors``.

    Returns:
        The value stored under ``"<SQA>"`` in the post-processed result when
        that key is present, otherwise the post-processed result unchanged.
    """
    model = models[model_id]
    processor = processors[model_id]

    prompt = task_prompt if text_input is None else task_prompt + text_input
    encoded = processor(text=prompt, images=image, return_tensors="pt").to("cuda")

    # Beam search with sampling disabled.
    output_ids = model.generate(
        input_ids=encoded["input_ids"],
        pixel_values=encoded["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )

    # Special tokens are kept in the decoded text handed to post-processing.
    decoded = processor.batch_decode(output_ids, skip_special_tokens=False)[0]
    answer = processor.post_process_generation(
        decoded,
        task=task_prompt,
        image_size=(image.width, image.height),
    )
    return answer["<SQA>"] if "<SQA>" in answer else answer


def process_image(image, task_prompt, text_input=None, model_id="maxiw/Florence-2-ScreenQA-base"):
    """Validate the UI inputs and run the ScreenQA task on one image.

    Every path returns a single value, because the function is wired to
    exactly one output component.

    Args:
        image: Array-like image data passed to ``Image.fromarray``, or
            ``None`` when no image was provided.
        task_prompt: Task name selected in the UI; only ``"ScreenQA"`` is
            handled.
        text_input: Question text forwarded to ``run_example``.
        model_id: Model key forwarded to ``run_example``.

    Returns:
        The result of ``run_example`` for the ``"<SQA>"`` task, an empty
        string for an unknown task, or a short message when no image was
        provided.
    """
    if task_prompt != "ScreenQA":
        print("Unknown task prompt")
        return ""

    # Without an image, Image.fromarray would fail with an opaque error.
    if image is None:
        return "Please upload an image."

    pil_image = Image.fromarray(image)  # Convert NumPy array to PIL Image
    return run_example("<SQA>", pil_image, text_input, model_id=model_id)

# Stylesheet passed to gr.Blocks: a fixed-height, scrollable, bordered box
# for the element with id "output".
css = (
    "\n"
    "  #output {\n"
    "    height: 500px; \n"
    "    overflow: auto; \n"
    "    border: 1px solid #ccc; \n"
    "  }\n"
)


# Task names offered in the "Task Prompt" dropdown.
single_task_list = ["ScreenQA"]


# Page layout: inputs in the left column, the answer in the right column,
# and a set of cached examples underneath.
with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Tab(label="Florence-2 Input"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
                model_selector = gr.Dropdown(
                    choices=list(models),
                    label="Model",
                    value="maxiw/Florence-2-ScreenQA-base",
                )
                task_prompt = gr.Dropdown(
                    choices=single_task_list,
                    label="Task Prompt",
                    value="ScreenQA",
                )
                text_input = gr.Textbox(label="Question")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text")

        # Each row is (image file, task, question), matching the inputs list below.
        example_rows = [
            ["image1.jpg", "ScreenQA", "What is the version of the settings?"],
            ["image1.jpg", "ScreenQA", "What is the state of use lower resolution images?"],
            ["image2.jpg", "ScreenQA", "How much is the discount for the product?"],
        ]
        gr.Examples(
            examples=example_rows,
            inputs=[input_img, task_prompt, text_input],
            outputs=[output_text],
            fn=process_image,
            cache_examples=True,
            label="Try examples",
        )

        # The submit button additionally forwards the selected model id.
        submit_btn.click(
            fn=process_image,
            inputs=[input_img, task_prompt, text_input, model_selector],
            outputs=[output_text],
        )

demo.launch(debug=True)