Spaces:

SkalskiP
/

florence-sam

Running on Zero

App Files Files Community

florence-sam / app.py

SkalskiP

open vocabulary detection with Florence2 + masks with SAM2

576e22a 7 months ago

raw

history blame

5.73 kB

	from typing import Tuple, Optional

	import gradio as gr
	import supervision as sv
	import torch
	from PIL import Image

	from utils.florence import load_florence_model, run_florence_inference, \
	FLORENCE_DETAILED_CAPTION_TASK, \
	FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK, FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
	from utils.modes import INFERENCE_MODES, OPEN_VOCABULARY_DETECTION, \
	CAPTION_GROUNDING_MASKS
	from utils.sam import load_sam_model, run_sam_inference

	# Introductory Markdown shown at the top of the page; it is passed to
	# gr.Markdown when the Blocks layout is built.
	MARKDOWN = """
	# Florence2 + SAM2 🔥

	This demo integrates Florence2 and SAM2 models for detailed image captioning and object
	detection. Florence2 generates detailed captions that are then used to perform phrase
	grounding. The Segment Anything Model 2 (SAM2) converts these phrase-grounded boxes
	into masks.
	"""

	# Sample images referenced by the example rows below.
	_DOG_2_IMAGE_URL = "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"
	_DOG_3_IMAGE_URL = "https://media.roboflow.com/notebooks/examples/dog-3.jpeg"

	# Rows fed to gr.Examples. Each row is [mode, image, text prompt], matching
	# the order of the input components the examples are bound to. The
	# caption-grounding rows carry no prompt (None) because that mode does not
	# read the text input.
	EXAMPLES = [
	    [OPEN_VOCABULARY_DETECTION, _DOG_2_IMAGE_URL, "straw"],
	    [OPEN_VOCABULARY_DETECTION, _DOG_2_IMAGE_URL, "napkin"],
	    [OPEN_VOCABULARY_DETECTION, _DOG_3_IMAGE_URL, "tail"],
	    [CAPTION_GROUNDING_MASKS, _DOG_2_IMAGE_URL, None],
	    [CAPTION_GROUNDING_MASKS, _DOG_3_IMAGE_URL, None],
	]

	# Models are loaded once, at import time, and shared by every request.
	# The device is hard-coded to CUDA and handed to both loaders.
	DEVICE = torch.device("cuda")
	FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
	SAM_MODEL = load_sam_model(device=DEVICE)

	# All three annotators share the same colour-lookup strategy so that the
	# mask, box and label drawn for one detection use the same setting.
	_COLOR_LOOKUP = sv.ColorLookup.INDEX
	_LABEL_TEXT_COLOR = sv.Color.from_hex("#FFFFFF")

	MASK_ANNOTATOR = sv.MaskAnnotator(color_lookup=_COLOR_LOOKUP)
	BOX_ANNOTATOR = sv.BoxAnnotator(color_lookup=_COLOR_LOOKUP)
	LABEL_ANNOTATOR = sv.LabelAnnotator(
	    color_lookup=_COLOR_LOOKUP,
	    text_position=sv.Position.CENTER_OF_MASS,
	    text_color=_LABEL_TEXT_COLOR,
	    border_radius=5,
	)


	def annotate_image(image, detections):
	    """Return a copy of ``image`` with the detections drawn on it.

	    The annotators are applied to a copy of the input, in the order
	    mask, box, label, each one receiving the previous one's output.
	    """
	    annotated = image.copy()
	    for annotator in (MASK_ANNOTATOR, BOX_ANNOTATOR, LABEL_ANNOTATOR):
	        annotated = annotator.annotate(annotated, detections)
	    return annotated


	def on_mode_dropdown_change(text):
	    """Build visibility updates for the two textboxes from the selected mode.

	    Returns a list of two ``gr.Textbox`` updates, in order: the first is
	    visible only when ``text`` equals ``OPEN_VOCABULARY_DETECTION``, the
	    second only when it equals ``CAPTION_GROUNDING_MASKS``.
	    """
	    show_prompt = text == OPEN_VOCABULARY_DETECTION
	    show_caption = text == CAPTION_GROUNDING_MASKS
	    return [
	        gr.Textbox(visible=show_prompt),
	        gr.Textbox(visible=show_caption),
	    ]


	def _segment_and_annotate(image_input, result):
	    """Turn a Florence-2 result into detections, run SAM on them, draw them."""
	    detections = sv.Detections.from_lmm(
	        lmm=sv.LMM.FLORENCE_2,
	        result=result,
	        resolution_wh=image_input.size
	    )
	    detections = run_sam_inference(SAM_MODEL, image_input, detections)
	    return annotate_image(image_input, detections)


	def process(
	    mode_dropdown, image_input, text_input
	) -> Tuple[Optional[Image.Image], Optional[str]]:
	    """Run the selected inference mode and return ``(image, caption)``.

	    Args:
	        mode_dropdown: The selected mode; compared against
	            ``OPEN_VOCABULARY_DETECTION`` and ``CAPTION_GROUNDING_MASKS``.
	        image_input: The uploaded image (the UI supplies a PIL image), or
	            a falsy value when nothing was uploaded.
	        text_input: The text prompt. Only read in open-vocabulary
	            detection mode, where it is required.

	    Returns:
	        A two-tuple matching the two output components:
	        * open-vocabulary detection: ``(annotated image, None)``;
	        * caption grounding: ``(annotated image, generated caption)``;
	        * ``(None, None)`` when the image is missing, when the prompt is
	          missing in open-vocabulary mode, or when the mode is not
	          recognised.
	    """
	    if not image_input:
	        return None, None

	    if mode_dropdown == OPEN_VOCABULARY_DETECTION:
	        if not text_input:
	            return None, None

	        _, result = run_florence_inference(
	            model=FLORENCE_MODEL,
	            processor=FLORENCE_PROCESSOR,
	            device=DEVICE,
	            image=image_input,
	            task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
	            text=text_input
	        )
	        return _segment_and_annotate(image_input, result), None

	    if mode_dropdown == CAPTION_GROUNDING_MASKS:
	        # Two Florence passes: first generate a caption, then ground the
	        # phrases of that caption in the image.
	        _, result = run_florence_inference(
	            model=FLORENCE_MODEL,
	            processor=FLORENCE_PROCESSOR,
	            device=DEVICE,
	            image=image_input,
	            task=FLORENCE_DETAILED_CAPTION_TASK
	        )
	        caption = result[FLORENCE_DETAILED_CAPTION_TASK]
	        _, result = run_florence_inference(
	            model=FLORENCE_MODEL,
	            processor=FLORENCE_PROCESSOR,
	            device=DEVICE,
	            image=image_input,
	            task=FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK,
	            text=caption
	        )
	        return _segment_and_annotate(image_input, result), caption

	    # Unrecognised mode: previously the function fell off the end and
	    # returned a bare None, which does not match the two outputs bound to
	    # this callback. Return the same empty pair as the other guards.
	    return None, None


	with gr.Blocks() as demo:
	    gr.Markdown(MARKDOWN)
	    mode_dropdown_component = gr.Dropdown(
	        choices=INFERENCE_MODES,
	        value=INFERENCE_MODES[0],
	        label="Mode",
	        info="Select a mode to use.",
	        interactive=True,
	    )

	    # Two-column layout: inputs on the left, results on the right.
	    with gr.Row():
	        with gr.Column():
	            image_input_component = gr.Image(type='pil', label='Upload image')
	            text_input_component = gr.Textbox(label='Text prompt')
	            submit_button_component = gr.Button(
	                value='Submit', variant='primary')
	        with gr.Column():
	            image_output_component = gr.Image(
	                type='pil', label='Image output')
	            # Starts hidden; the mode-change handler toggles its visibility.
	            text_output_component = gr.Textbox(
	                label='Caption output', visible=False)

	    # The examples and the submit button drive the same callback with the
	    # same components, so the bindings are spelled out once.
	    process_inputs = [
	        mode_dropdown_component,
	        image_input_component,
	        text_input_component,
	    ]
	    process_outputs = [
	        image_output_component,
	        text_output_component,
	    ]

	    with gr.Row():
	        gr.Examples(
	            fn=process,
	            examples=EXAMPLES,
	            inputs=process_inputs,
	            outputs=process_outputs,
	            run_on_click=True,
	        )

	    submit_button_component.click(
	        fn=process,
	        inputs=process_inputs,
	        outputs=process_outputs,
	    )
	    # Switching mode shows/hides the prompt box and the caption box.
	    mode_dropdown_component.change(
	        on_mode_dropdown_change,
	        inputs=[mode_dropdown_component],
	        outputs=[
	            text_input_component,
	            text_output_component,
	        ],
	    )

	demo.launch(debug=False, show_error=True, max_threads=1)