import gradio as gr
import torch
from PIL import ImageDraw
from transformers import pipeline

# Zero-shot object detector and monocular depth estimator.
detector = pipeline("zero-shot-object-detection", model="google/owlvit-base-patch32")
depth_estimator = pipeline("depth-estimation", model="Intel/dpt-large")


def visualize_preds(image, predictions):
    """Draw the detected bounding boxes and label/score text on a copy of the image."""
    new_image = image.copy()
    draw = ImageDraw.Draw(new_image)
    for prediction in predictions:
        box = prediction["box"]
        label = prediction["label"]
        score = prediction["score"]
        draw.rectangle(
            (box["xmin"], box["ymin"], box["xmax"], box["ymax"]), outline="red", width=1
        )
        draw.text((box["xmin"], box["ymin"]), f"{label}: {round(score, 2)}", fill="white")
    return new_image


def compute_depth(image, preds):
    """Sample the estimated depth map at the center of each detected box."""
    depth_output = depth_estimator(image)
    # Upsample the predicted depth map to the original image resolution.
    # PIL's image.size is (width, height); interpolate expects (height, width).
    depth_map = torch.nn.functional.interpolate(
        depth_output["predicted_depth"].unsqueeze(1),
        size=image.size[::-1],
        mode="bicubic",
        align_corners=False,
    ).squeeze().cpu().numpy()

    results = []
    for pred in preds:
        x = (pred["box"]["xmax"] + pred["box"]["xmin"]) // 2
        y = (pred["box"]["ymax"] + pred["box"]["ymin"]) // 2
        # Clamp to the depth map bounds in case a box extends past the image edge.
        x = min(max(x, 0), depth_map.shape[1] - 1)
        y = min(max(y, 0), depth_map.shape[0] - 1)
        # Note: DPT predicts relative depth, not metric distance in meters.
        results.append({"class": pred["label"], "distance": float(depth_map[y][x])})
    return results


def process(image, text):
    # Candidate labels are given as a period-separated list, e.g. "a cat. a dog".
    items = [item.strip() for item in text.split(".") if item.strip()]
    preds = detector(image, candidate_labels=items)
    return [visualize_preds(image, preds), compute_depth(image, preds)]


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=1):
            image = gr.Image(type="pil")
            labels = gr.Textbox(label="Objects to detect (separated by '.')")
            detect_btn = gr.Button("Detect")
        with gr.Column(scale=1):
            output_detection = gr.Image(type="pil")
            output_distance = gr.JSON(label="Distance")
    detect_btn.click(
        fn=process,
        inputs=[image, labels],
        outputs=[output_detection, output_distance],
        api_name="process",
    )

demo.launch()
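# Because the click handler registers api_name="process", the launched demo also
# exposes that function as an API endpoint. Below is a minimal sketch of calling
# it with the gradio_client package (a recent version that provides handle_file);
# the server URL and the "example.jpg" path are placeholder assumptions. Run it
# from a separate process while the demo above is up, since demo.launch() blocks:
#
#     from gradio_client import Client, handle_file
#
#     client = Client("http://127.0.0.1:7860/")  # assumed default local URL
#     annotated, distances = client.predict(
#         handle_file("example.jpg"),  # hypothetical input image path
#         "a cat. a remote control",   # period-separated candidate labels
#         api_name="/process",
#     )
#     print(distances)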