from typing import Tuple, Optional

import gradio as gr
import supervision as sv
import torch
from PIL import Image

from utils.florence import (
    load_florence_model,
    run_florence_inference,
    FLORENCE_DETAILED_CAPTION_TASK,
    FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK,
    FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
)
from utils.modes import INFERENCE_MODES, OPEN_VOCABULARY_DETECTION, CAPTION_GROUNDING_MASKS
from utils.sam import load_sam_model, run_sam_inference

MARKDOWN = """
# Florence2 + SAM2 🔥

This demo integrates the Florence2 and SAM2 models for detailed image captioning and
object detection. Florence2 generates detailed captions that are then used to perform
phrase grounding. The Segment Anything Model 2 (SAM2) converts these phrase-grounded
boxes into masks.
"""

EXAMPLES = [
    [OPEN_VOCABULARY_DETECTION, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", 'straw'],
    [OPEN_VOCABULARY_DETECTION, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", 'napkin'],
    [OPEN_VOCABULARY_DETECTION, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", 'tail'],
    [CAPTION_GROUNDING_MASKS, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    [CAPTION_GROUNDING_MASKS, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
]

# Fall back to CPU so the demo can still start on machines without a GPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
SAM_MODEL = load_sam_model(device=DEVICE)

BOX_ANNOTATOR = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
LABEL_ANNOTATOR = sv.LabelAnnotator(
    color_lookup=sv.ColorLookup.INDEX,
    text_position=sv.Position.CENTER_OF_MASS,
    text_color=sv.Color.from_hex("#FFFFFF"),
    border_radius=5
)
MASK_ANNOTATOR = sv.MaskAnnotator(color_lookup=sv.ColorLookup.INDEX)


def annotate_image(image, detections):
    output_image = image.copy()
    output_image = MASK_ANNOTATOR.annotate(output_image, detections)
    output_image = BOX_ANNOTATOR.annotate(output_image, detections)
    output_image = LABEL_ANNOTATOR.annotate(output_image, detections)
    return output_image


def on_mode_dropdown_change(text):
    # Show the text prompt only in open-vocabulary mode and the caption output
    # only in caption-grounding mode.
    return [
        gr.Textbox(visible=text == OPEN_VOCABULARY_DETECTION),
        gr.Textbox(visible=text == CAPTION_GROUNDING_MASKS),
    ]


def process(
    mode_dropdown, image_input, text_input
) -> Tuple[Optional[Image.Image], Optional[str]]:
    if not image_input:
        return None, None

    if mode_dropdown == OPEN_VOCABULARY_DETECTION:
        if not text_input:
            return None, None

        _, result = run_florence_inference(
            model=FLORENCE_MODEL,
            processor=FLORENCE_PROCESSOR,
            device=DEVICE,
            image=image_input,
            task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
            text=text_input
        )
        detections = sv.Detections.from_lmm(
            lmm=sv.LMM.FLORENCE_2,
            result=result,
            resolution_wh=image_input.size
        )
        # Refine the Florence2 boxes into segmentation masks with SAM2.
        detections = run_sam_inference(SAM_MODEL, image_input, detections)
        return annotate_image(image_input, detections), None

    if mode_dropdown == CAPTION_GROUNDING_MASKS:
        # Step 1: generate a detailed caption with Florence2.
        _, result = run_florence_inference(
            model=FLORENCE_MODEL,
            processor=FLORENCE_PROCESSOR,
            device=DEVICE,
            image=image_input,
            task=FLORENCE_DETAILED_CAPTION_TASK
        )
        caption = result[FLORENCE_DETAILED_CAPTION_TASK]
        # Step 2: ground the caption phrases back onto the image.
        _, result = run_florence_inference(
            model=FLORENCE_MODEL,
            processor=FLORENCE_PROCESSOR,
            device=DEVICE,
            image=image_input,
            task=FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK,
            text=caption
        )
        detections = sv.Detections.from_lmm(
            lmm=sv.LMM.FLORENCE_2,
            result=result,
            resolution_wh=image_input.size
        )
        # Step 3: convert the grounded boxes into masks with SAM2.
        detections = run_sam_inference(SAM_MODEL, image_input, detections)
        return annotate_image(image_input, detections), caption

    return None, None


with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    mode_dropdown_component = gr.Dropdown(
        choices=INFERENCE_MODES,
        value=INFERENCE_MODES[0],
        label="Mode",
        info="Select a mode to use.",
        interactive=True
    )
    with gr.Row():
        with gr.Column():
            image_input_component = gr.Image(type='pil', label='Upload image')
            text_input_component = gr.Textbox(label='Text prompt')
            submit_button_component = gr.Button(value='Submit', variant='primary')
        with gr.Column():
            image_output_component = gr.Image(type='pil', label='Image output')
            text_output_component = gr.Textbox(label='Caption output', visible=False)
    with gr.Row():
        gr.Examples(
            fn=process,
            examples=EXAMPLES,
            inputs=[
                mode_dropdown_component,
                image_input_component,
                text_input_component
            ],
            outputs=[
                image_output_component,
                text_output_component
            ],
            run_on_click=True
        )

    submit_button_component.click(
        fn=process,
        inputs=[
            mode_dropdown_component,
            image_input_component,
            text_input_component
        ],
        outputs=[
            image_output_component,
            text_output_component
        ]
    )
    mode_dropdown_component.change(
        on_mode_dropdown_change,
        inputs=[mode_dropdown_component],
        outputs=[
            text_input_component,
            text_output_component
        ]
    )

demo.launch(debug=False, show_error=True, max_threads=1)