# Source: Hugging Face Space "shenyunhang/APE_demo" (Space status when captured: build error).
# File size: 34,262 bytes.

import gc
import multiprocessing as mp
import os
import shutil
import sys
import time
from os import path

import cv2
import torch
from huggingface_hub import hf_hub_download
from PIL import Image

import ape
import detectron2.data.transforms as T
import gradio as gr
from ape.model_zoo import get_config_file
from demo_lazy import get_parser, setup_cfg
from detectron2.config import CfgNode
from detectron2.data.detection_utils import read_image
from detectron2.evaluation.coco_evaluation import instances_to_coco_json
from detectron2.utils.logger import setup_logger
from predictor_lazy import VisualizationDemo

# Absolute directory containing this script; the example image paths below are built from it.
this_dir = path.dirname(path.abspath(__file__))

# os.system("git clone https://github.com/shenyunhang/APE.git")
# os.system("python3.10 -m pip install -e APE/")

example_list = [
    [
        this_dir + "/examples/Totoro01.png",
        # "Sky, Water, Tree, The biggest Chinchilla, The older girl wearing skirt on branch, Grass",
        "Girl with hat",
        # 0.05,
        0.25,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/Totoro01.png",
        "Sky, Water, Tree, Chinchilla, Grass, Girl",
        0.15,
        ["semantic segmentation"],
    ],
    [
        this_dir + "/examples/199_3946193540.jpg",
        "chess piece of horse head",
        0.30,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/TheGreatWall.jpg",
        "The Great Wall",
        0.1,
        ["semantic segmentation"],
    ],
    [
        this_dir + "/examples/Pisa.jpg",
        "Pisa",
        0.01,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/SolvayConference1927.jpg",
        # "Albert Einstein, Madame Curie",
        "Madame Curie",
        # 0.01,
        0.03,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/Transformers.webp",
        "Optimus Prime",
        0.11,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/Terminator3.jpg",
        "Humanoid Robot",
        0.10,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/MatrixRevolutionForZion.jpg",
        """machine killer with gun in fighting,
donut with colored granules on the surface,
railings being crossed by horses, 
a horse running or jumping,
equestrian rider's helmet,
outdoor dog led by rope, 
a dog being touched, 
clothed dog, 
basketball in hand, 
a basketball player with both feet off the ground, 
player with basketball in the hand, 
spoon on the plate, 
coffee cup with coffee, 
the nearest dessert to the coffee cup, 
the bartender who is mixing wine, 
a bartender in a suit, 
wine glass with wine, 
a person in aprons, 
pot with food, 
a knife being used to cut vegetables, 
striped sofa in the room, 
a sofa with pillows on it in the room, 
lights on in the room, 
an indoor lying pet, 
a cat on the sofa, 
one pet looking directly at the camera indoors, 
a bed with patterns in the room, 
the lamp on the table beside the bed, 
pillow placed at the head of the bed, 
a blackboard full of words in the classroom, 
child sitting at desks in the classroom, 
a person standing in front of bookshelves in the library, 
the table someone is using in the library, 
a person who touches books in the library, 
a person standing in front of the cake counter, 
a square plate full of cakes, 
a cake decorated with cream, 
hot dog with vegetables, 
hot dog with sauce on the surface, 
red sausage, 
flowerpot with flowers potted inside, 
monochrome flowerpot, 
a flowerpot filled with black soil, 
apple growing on trees, 
red complete apple, 
apple with a stalk, 
a woman brushing her teeth, 
toothbrush held by someone, 
toilet brush with colored bristles, 
a customer whose hair is being cut by barber, 
a barber at work, 
cloth covering the barber, 
shopping cart pushed by people in the supermarket, 
shopping cart with people in the supermarket, 
shopping cart full of goods, 
a child wearing a mask, 
refrigerator with fruit, 
a drink bottle in the refrigerator, 
refrigerator with more than two doors, 
a watch placed on a table or cloth, 
a watch with three or more watch hands can be seen, 
a watch with one or more small dials, 
clothes hanger, 
a piece of clothing hanging on the hanger, 
a piece of clothing worn on plastic models, 
leather bag with glossy surface, 
backpack, 
open package, 
a fish held by people, 
a person who is fishing with a fishing rod, 
a fisherman standing on the shore with his body soaked in water, camera hold on someone's shoulder,
a person being interviewed, 
a person with microphone hold in hand,
        """,
        0.20,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/094_56726435.jpg",
        # "donut with colored granules on the surface",
        """donut with colored granules on the surface,
railings being crossed by horses, 
a horse running or jumping,
equestrian rider's helmet,
outdoor dog led by rope, 
a dog being touched, 
clothed dog, 
basketball in hand, 
a basketball player with both feet off the ground, 
player with basketball in the hand, 
spoon on the plate, 
coffee cup with coffee, 
the nearest dessert to the coffee cup, 
the bartender who is mixing wine, 
a bartender in a suit, 
wine glass with wine, 
a person in aprons, 
pot with food, 
a knife being used to cut vegetables, 
striped sofa in the room, 
a sofa with pillows on it in the room, 
lights on in the room, 
an indoor lying pet, 
a cat on the sofa, 
one pet looking directly at the camera indoors, 
a bed with patterns in the room, 
the lamp on the table beside the bed, 
pillow placed at the head of the bed, 
a blackboard full of words in the classroom, 
a blackboard or whiteboard with something pasted, 
child sitting at desks in the classroom, 
a person standing in front of bookshelves in the library, 
the table someone is using in the library, 
a person who touches books in the library, 
a person standing in front of the cake counter, 
a square plate full of cakes, 
a cake decorated with cream, 
hot dog with vegetables, 
hot dog with sauce on the surface, 
red sausage, 
flowerpot with flowers potted inside, 
monochrome flowerpot, 
a flowerpot filled with black soil, 
apple growing on trees, 
red complete apple, 
apple with a stalk, 
a woman brushing her teeth, 
toothbrush held by someone, 
toilet brush with colored bristles, 
a customer whose hair is being cut by barber, 
a barber at work, 
cloth covering the barber, 
a plastic toy, 
a plush toy, 
a humanoid toy, 
shopping cart pushed by people in the supermarket, 
shopping cart with people in the supermarket, 
shopping cart full of goods, 
a child wearing a mask, 
a mask on face with half a face exposed, 
a mask on face with only eyes exposed, 
refrigerator with fruit, 
a drink bottle in the refrigerator, 
refrigerator with more than two doors, 
a watch placed on a table or cloth, 
a watch with three or more watch hands can be seen, 
a watch with one or more small dials, 
clothes hanger, 
a piece of clothing hanging on the hanger, 
a piece of clothing worn on plastic models, 
leather bag with glossy surface, 
backpack, 
open package, 
a fish held by people, 
a person who is fishing with a fishing rod, 
a fisherman standing on the shore with his body soaked in water, camera hold on someone's shoulder,
a person being interviewed, 
a person with microphone hold in hand,
        """,
        0.50,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/013_438973263.jpg",
        # "a male lion with a mane",
        """a male lion with a mane,
railings being crossed by horses, 
a horse running or jumping,
equestrian rider's helmet,
outdoor dog led by rope, 
a dog being touched, 
clothed dog, 
basketball in hand, 
a basketball player with both feet off the ground, 
player with basketball in the hand, 
spoon on the plate, 
coffee cup with coffee, 
the nearest dessert to the coffee cup, 
the bartender who is mixing wine, 
a bartender in a suit, 
wine glass with wine, 
a person in aprons, 
pot with food, 
a knife being used to cut vegetables, 
striped sofa in the room, 
a sofa with pillows on it in the room, 
lights on in the room, 
an indoor lying pet, 
a cat on the sofa, 
one pet looking directly at the camera indoors, 
a bed with patterns in the room, 
the lamp on the table beside the bed, 
pillow placed at the head of the bed, 
a blackboard full of words in the classroom, 
a blackboard or whiteboard with something pasted, 
child sitting at desks in the classroom, 
a person standing in front of bookshelves in the library, 
the table someone is using in the library, 
a person who touches books in the library, 
a person standing in front of the cake counter, 
a square plate full of cakes, 
a cake decorated with cream, 
hot dog with vegetables, 
hot dog with sauce on the surface, 
red sausage, 
flowerpot with flowers potted inside, 
monochrome flowerpot, 
a flowerpot filled with black soil, 
apple growing on trees, 
red complete apple, 
apple with a stalk, 
a woman brushing her teeth, 
toothbrush held by someone, 
toilet brush with colored bristles, 
a customer whose hair is being cut by barber, 
a barber at work, 
cloth covering the barber, 
a plastic toy, 
a plush toy, 
a humanoid toy, 
shopping cart pushed by people in the supermarket, 
shopping cart with people in the supermarket, 
shopping cart full of goods, 
a child wearing a mask, 
a mask on face with half a face exposed, 
a mask on face with only eyes exposed, 
refrigerator with fruit, 
a drink bottle in the refrigerator, 
refrigerator with more than two doors, 
a watch placed on a table or cloth, 
a watch with three or more watch hands can be seen, 
a watch with one or more small dials, 
clothes hanger, 
a piece of clothing hanging on the hanger, 
a piece of clothing worn on plastic models, 
leather bag with glossy surface, 
backpack, 
open package, 
a fish held by people, 
a person who is fishing with a fishing rod, 
a fisherman standing on the shore with his body soaked in water, camera hold on someone's shoulder,
a person being interviewed, 
a person with microphone hold in hand,
        """,
        # 0.25,
        0.50,
        ["object detection", "instance segmentation"],
    ],
]

# Hugging Face Hub repository id passed to hf_hub_download() by the load_APE_* functions.
ckpt_repo_id = "shenyunhang/APE"


def setup_model(name):
    """Release cached memory and, in memory-saving mode, activate one model.

    When the module-level ``save_memory`` flag is set, the model registered in
    ``all_demo`` under ``name`` is moved to ``running_device`` and every other
    registered model is moved to the CPU. When the flag is not set, only the
    initial memory clean-up happens.

    Args:
        name: Key in ``all_demo`` of the model that is about to be used.
    """

    def _release_memory():
        # Drop unreachable Python objects first, then ask torch to release
        # its cached CUDA memory.
        gc.collect()
        torch.cuda.empty_cache()

    _release_memory()

    # Without memory saving the models stay wherever they were loaded.
    if not save_memory:
        return

    for model_name, model_demo in all_demo.items():
        target_device = running_device if model_name == name else "cpu"
        model_demo.predictor.model.to(target_device)

    # Clean up again after the models have been moved between devices.
    _release_memory()


def run_on_image_A(input_image_path, input_text, score_threshold, output_type):
    """Run the "APE_A" model on one image.

    Args:
        input_image_path: Image input forwarded unchanged to ``run_on_image``.
        input_text: Text prompt forwarded unchanged to ``run_on_image``.
        score_threshold: Value written to the vision model's
            ``test_score_thresh`` before inference.
        output_type: Selected output kinds, forwarded to ``run_on_image``.

    Returns:
        Whatever ``run_on_image`` returns for this model.
    """
    # Log this function's own name (it previously logged "run_on_image",
    # which is the name of the shared helper, unlike the _C/_D variants).
    logger.info("run_on_image_A")

    setup_model("APE_A")
    demo = all_demo["APE_A"]
    cfg = all_cfg["APE_A"]
    # The threshold lives on the model object, so it is updated per request.
    demo.predictor.model.model_vision.test_score_thresh = score_threshold

    return run_on_image(
        input_image_path,
        input_text,
        output_type,
        demo,
        cfg,
    )


def run_on_image_C(input_image_path, input_text, score_threshold, output_type):
    """Run the "APE_C" model on one image.

    Args:
        input_image_path: Image input forwarded unchanged to ``run_on_image``.
        input_text: Text prompt forwarded unchanged to ``run_on_image``.
        score_threshold: Value written to the vision model's
            ``test_score_thresh`` before inference.
        output_type: Selected output kinds, forwarded to ``run_on_image``.

    Returns:
        Whatever ``run_on_image`` returns for this model.
    """
    model_key = "APE_C"
    logger.info("run_on_image_C")

    setup_model(model_key)
    demo, cfg = all_demo[model_key], all_cfg[model_key]
    # The threshold lives on the model object, so it is updated per request.
    demo.predictor.model.model_vision.test_score_thresh = score_threshold

    return run_on_image(input_image_path, input_text, output_type, demo, cfg)


def run_on_image_D(input_image_path, input_text, score_threshold, output_type):
    """Run the "APE_D" model on one image.

    Args:
        input_image_path: Image input forwarded unchanged to ``run_on_image``.
        input_text: Text prompt forwarded unchanged to ``run_on_image``.
        score_threshold: Value written to the vision model's
            ``test_score_thresh`` before inference.
        output_type: Selected output kinds, forwarded to ``run_on_image``.

    Returns:
        Whatever ``run_on_image`` returns for this model.
    """
    model_key = "APE_D"
    logger.info("run_on_image_D")

    setup_model(model_key)
    demo, cfg = all_demo[model_key], all_cfg[model_key]
    # The threshold lives on the model object, so it is updated per request.
    demo.predictor.model.model_vision.test_score_thresh = score_threshold

    return run_on_image(input_image_path, input_text, output_type, demo, cfg)


def run_on_image_comparison(input_image_path, input_text, score_threshold, output_type):
    """Run every registered model on the same input and collect the images.

    Args:
        input_image_path: Image input forwarded unchanged to ``run_on_image``.
        input_text: Text prompt forwarded unchanged to ``run_on_image``.
        score_threshold: Value written to each vision model's
            ``test_score_thresh`` before inference.
        output_type: Selected output kinds, forwarded to ``run_on_image``.

    Returns:
        A list with one output image per entry of ``all_demo``, in the
        dictionary's iteration order. The JSON part of each result is dropped.
    """
    logger.info("run_on_image_comparison")

    images = []
    for key, demo in all_demo.items():
        logger.info(f"run_on_image_comparison {key}")
        setup_model(key)
        cfg = all_cfg[key]
        demo.predictor.model.model_vision.test_score_thresh = score_threshold

        # Only the visualization is shown in the comparison tab.
        image, _ = run_on_image(input_image_path, input_text, output_type, demo, cfg)
        images.append(image)

    return images


def run_on_image(
    input_image_path,
    input_text,
    output_type,
    demo,
    cfg,
):
    """Run one demo model on one image and return its visualization and JSON.

    Args:
        input_image_path: Path of the image to read, or a dict whose
            ``"image"`` and ``"mask"`` entries are the image path and the
            mask-prompt path.
        input_text: Text prompt, passed to the model as ``text_prompt``.
        output_type: Container of selected output names. The names
            ``"object detection"``, ``"instance segmentation"`` and
            ``"semantic segmentation"`` are recognized.
        demo: Object providing ``run_on_image(...)`` and ``cpu_device``.
        cfg: Configuration belonging to ``demo``. It is not read here; the
            parameter is kept so existing callers keep working.

    Returns:
        A pair ``(image, json_results)``. ``image`` is a PIL image of the
        visualization, or the unmodified input with its channel order
        reversed when no output type is selected. ``json_results`` is the
        list produced by ``instances_to_coco_json`` with ``"category_name"``
        added and ``"image_id"`` removed; it is empty when the predictions
        contain no ``"instances"`` entry or no output type is selected.
    """
    with_box = "object detection" in output_type
    with_mask = "instance segmentation" in output_type
    with_sseg = "semantic segmentation" in output_type

    if isinstance(input_image_path, dict):
        input_mask_path = input_image_path["mask"]
        input_image_path = input_image_path["image"]
        print("input_image_path", input_image_path)
        print("input_mask_path", input_mask_path)
    else:
        input_mask_path = None

    print("input_text", input_text)

    # The image is always read as BGR here, independent of the model config.
    input_image = read_image(input_image_path, format="BGR")

    if input_mask_path is not None:
        input_mask = read_image(input_mask_path, "L").squeeze(2)
        print("input_mask", input_mask)
        print("input_mask", input_mask.shape)
    else:
        input_mask = None

    if not (with_box or with_mask or with_sseg):
        # Nothing to predict: hand back the input (channels reversed) together
        # with an empty result list, so callers that unpack two values or bind
        # two Gradio outputs keep working.
        return input_image[:, :, ::-1], []

    # Images with a side above 1024 pixels are resized with the module-level
    # ``aug`` before inference; the transform is kept to map the result back.
    # NOTE(review): the mask prompt is not resized along with the image -
    # confirm that demo.run_on_image tolerates differing sizes.
    if input_image.shape[0] > 1024 or input_image.shape[1] > 1024:
        transform = aug.get_transform(input_image)
        input_image = transform.apply_image(input_image)
    else:
        transform = None

    start_time = time.time()
    predictions, visualized_output, _, metadata = demo.run_on_image(
        input_image,
        text_prompt=input_text,
        mask_prompt=input_mask,
        with_box=with_box,
        with_mask=with_mask,
        with_sseg=with_sseg,
    )
    has_instances = "instances" in predictions

    logger.info(
        "{} in {:.2f}s".format(
            "detected {} instances".format(len(predictions["instances"]))
            if has_instances
            else "finished",
            time.time() - start_time,
        )
    )

    output_image = visualized_output.get_image()
    print("output_image", output_image.shape)
    if transform is not None:
        # Undo the resize so the visualization matches the original image size.
        output_image = transform.inverse().apply_image(output_image)
    print("output_image", output_image.shape)

    output_image = Image.fromarray(output_image)

    gc.collect()
    torch.cuda.empty_cache()

    # The predictions do not always contain "instances" (the log line above
    # already allows for that), so only convert them when they are present.
    json_results = []
    if has_instances:
        json_results = instances_to_coco_json(predictions["instances"].to(demo.cpu_device), 0)
        for json_result in json_results:
            json_result["category_name"] = metadata.thing_classes[json_result["category_id"]]
            del json_result["image_id"]

    return output_image, json_results


def load_APE_A():
    """Build the "APE_A" demo and register it in ``all_demo`` / ``all_cfg``.

    The checkpoint is fetched from the hub repository ``ckpt_repo_id``, the
    arguments come from ``get_parser().parse_args()`` with the config file,
    confidence threshold and option overrides replaced below, and the model
    is placed on the CPU when ``save_memory`` is set, else on
    ``running_device``.
    """
    checkpoint_path = hf_hub_download(
        repo_id=ckpt_repo_id,
        filename="configs/LVISCOCOCOCOSTUFF_O365_OID_VG/ape_deta/ape_deta_vitl_eva02_lsj_cp_720k_20230504_002019/model_final.pth",
    )

    args = get_parser().parse_args()
    args.config_file = get_config_file(
        "LVISCOCOCOCOSTUFF_O365_OID_VG/ape_deta/ape_deta_vitl_eva02_lsj1024_cp_720k.py"
    )
    args.confidence_threshold = 0.01
    args.opts = [
        f"train.init_checkpoint='{checkpoint_path}'",
        "model.model_language.cache_dir=''",
        "model.model_vision.select_box_nums_for_evaluation=500",
        "model.model_vision.backbone.net.xattn=False",
        "model.model_vision.transformer.encoder.pytorch_attn=True",
        "model.model_vision.transformer.decoder.pytorch_attn=True",
    ]
    if running_device == "cpu":
        # On CPU the language model dtype is overridden to float32.
        args.opts.append("model.model_language.dtype='float32'")
    logger.info("Arguments: " + str(args))
    cfg = setup_cfg(args)

    for criterion_index in (0, 2):
        cfg.model.model_vision.criterion[criterion_index].use_fed_loss = False
    cfg.train.device = running_device

    # Patch the CLIP vision settings of the configured language model in place
    # before the demo is built.
    clip_vision_cfg = ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[
        cfg.model.model_language.clip_model
    ]["vision_cfg"]
    clip_vision_cfg["layers"] = 1
    clip_vision_cfg["fusedLN"] = False

    demo = VisualizationDemo(cfg, args=args)
    demo.predictor.model.to("cpu" if save_memory else running_device)

    all_demo["APE_A"] = demo
    all_cfg["APE_A"] = cfg


def load_APE_B():
    """Build the "APE_B" demo and register it in ``all_demo`` / ``all_cfg``.

    The checkpoint is fetched from the hub repository ``ckpt_repo_id``, the
    arguments come from ``get_parser().parse_args()`` with the config file,
    confidence threshold and option overrides replaced below, and the model
    is placed on the CPU when ``save_memory`` is set, else on
    ``running_device``.
    """
    checkpoint_path = hf_hub_download(
        repo_id=ckpt_repo_id,
        filename="configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj_cp_1080k_20230702_225418/model_final.pth",
    )

    args = get_parser().parse_args()
    args.config_file = get_config_file(
        "LVISCOCOCOCOSTUFF_O365_OID_VGR_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj1024_cp_1080k.py"
    )
    args.confidence_threshold = 0.01
    args.opts = [
        f"train.init_checkpoint='{checkpoint_path}'",
        "model.model_language.cache_dir=''",
        "model.model_vision.select_box_nums_for_evaluation=500",
        "model.model_vision.text_feature_bank_reset=True",
        "model.model_vision.backbone.net.xattn=False",
        "model.model_vision.transformer.encoder.pytorch_attn=True",
        "model.model_vision.transformer.decoder.pytorch_attn=True",
    ]
    if running_device == "cpu":
        # On CPU the language model dtype is overridden to float32.
        args.opts.append("model.model_language.dtype='float32'")
    logger.info("Arguments: " + str(args))
    cfg = setup_cfg(args)

    for criterion_index in (0, 2):
        cfg.model.model_vision.criterion[criterion_index].use_fed_loss = False
    cfg.train.device = running_device

    # Patch the CLIP vision settings of the configured language model in place
    # before the demo is built.
    clip_vision_cfg = ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[
        cfg.model.model_language.clip_model
    ]["vision_cfg"]
    clip_vision_cfg["layers"] = 1
    clip_vision_cfg["fusedLN"] = False

    demo = VisualizationDemo(cfg, args=args)
    demo.predictor.model.to("cpu" if save_memory else running_device)

    all_demo["APE_B"] = demo
    all_cfg["APE_B"] = cfg


def load_APE_C():
    """Build the "APE_C" demo and register it in ``all_demo`` / ``all_cfg``.

    The checkpoint is fetched from the hub repository ``ckpt_repo_id``, the
    arguments come from ``get_parser().parse_args()`` with the config file,
    confidence threshold and option overrides replaced below, and the model
    is placed on the CPU when ``save_memory`` is set, else on
    ``running_device``.
    """
    checkpoint_path = hf_hub_download(
        repo_id=ckpt_repo_id,
        filename="configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj_cp_1080k_20230702_210950/model_final.pth",
    )

    args = get_parser().parse_args()
    args.config_file = get_config_file(
        "LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj1024_cp_1080k.py"
    )
    args.confidence_threshold = 0.01
    args.opts = [
        f"train.init_checkpoint='{checkpoint_path}'",
        "model.model_language.cache_dir=''",
        "model.model_vision.select_box_nums_for_evaluation=500",
        "model.model_vision.text_feature_bank_reset=True",
        "model.model_vision.backbone.net.xattn=False",
        "model.model_vision.transformer.encoder.pytorch_attn=True",
        "model.model_vision.transformer.decoder.pytorch_attn=True",
    ]
    if running_device == "cpu":
        # On CPU the language model dtype is overridden to float32.
        args.opts.append("model.model_language.dtype='float32'")
    logger.info("Arguments: " + str(args))
    cfg = setup_cfg(args)

    for criterion_index in (0, 2):
        cfg.model.model_vision.criterion[criterion_index].use_fed_loss = False
    cfg.train.device = running_device

    # Patch the CLIP vision settings of the configured language model in place
    # before the demo is built.
    clip_vision_cfg = ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[
        cfg.model.model_language.clip_model
    ]["vision_cfg"]
    clip_vision_cfg["layers"] = 1
    clip_vision_cfg["fusedLN"] = False

    demo = VisualizationDemo(cfg, args=args)
    demo.predictor.model.to("cpu" if save_memory else running_device)

    all_demo["APE_C"] = demo
    all_cfg["APE_C"] = cfg


def load_APE_D():
    """Build the "APE_D" demo and register it in ``all_demo`` / ``all_cfg``.

    The checkpoint is fetched from the hub repository ``ckpt_repo_id``, the
    arguments come from ``get_parser().parse_args()`` with the config file,
    confidence threshold and option overrides replaced below, and the model
    is placed on the CPU when ``save_memory`` is set, else on
    ``running_device``. Unlike the other loaders, this one patches the
    ``eva02_clip`` model configs and only changes ``"layers"``.
    """
    checkpoint_path = hf_hub_download(
        repo_id=ckpt_repo_id,
        filename="configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitl_eva02_clip_vlf_lsj1024_cp_16x4_1080k_mdl_20230829_162438/model_final.pth",
    )

    args = get_parser().parse_args()
    args.config_file = get_config_file(
        "LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitl_eva02_clip_vlf_lsj1024_cp_16x4_1080k.py"
    )
    args.confidence_threshold = 0.01
    args.opts = [
        f"train.init_checkpoint='{checkpoint_path}'",
        "model.model_language.cache_dir=''",
        "model.model_vision.select_box_nums_for_evaluation=500",
        "model.model_vision.text_feature_bank_reset=True",
        "model.model_vision.backbone.net.xattn=False",
        "model.model_vision.transformer.encoder.pytorch_attn=True",
        "model.model_vision.transformer.decoder.pytorch_attn=True",
    ]
    if running_device == "cpu":
        # On CPU the language model dtype is overridden to float32.
        args.opts.append("model.model_language.dtype='float32'")
    logger.info("Arguments: " + str(args))
    cfg = setup_cfg(args)

    for criterion_index in (0, 2):
        cfg.model.model_vision.criterion[criterion_index].use_fed_loss = False
    cfg.train.device = running_device

    # Patch the CLIP vision settings of the configured language model in place
    # before the demo is built.
    clip_vision_cfg = ape.modeling.text.eva02_clip.factory._MODEL_CONFIGS[
        cfg.model.model_language.clip_model
    ]["vision_cfg"]
    clip_vision_cfg["layers"] = 1

    demo = VisualizationDemo(cfg, args=args)
    demo.predictor.model.to("cpu" if save_memory else running_device)

    all_demo["APE_D"] = demo
    all_cfg["APE_D"] = cfg


def APE_A_tab():
    """Add the "APE A" tab to the enclosing ``gr.Blocks`` context.

    The tab offers an image upload, a text prompt, a score-threshold slider
    and an output-type selector limited to object detection and instance
    segmentation. Pressing "Run" calls ``run_on_image_A`` and shows the
    returned image and JSON results.

    The tab is built the same way as ``APE_D_tab``: examples are provided
    through ``gr.Examples`` (the previous code referenced the undefined names
    ``examples`` and ``set_example``), the legacy ``.style(...)`` call is not
    used, and the button is wired to the model-specific wrapper rather than to
    ``run_on_image``, whose signature does not match the four inputs.
    """
    with gr.Tab("APE A"):
        with gr.Row(equal_height=False):
            with gr.Column(scale=1):
                input_image = gr.Image(
                    sources=["upload"],
                    type="filepath",
                )
                input_text = gr.Textbox(
                    label="Object Prompt (optional, if not provided, will only find COCO object.)",
                    info="格式: word1,word2,word3,...",
                )

                score_threshold = gr.Slider(
                    label="Score Threshold", minimum=0.01, maximum=1.0, value=0.3, step=0.01
                )

                output_type = gr.CheckboxGroup(
                    ["object detection", "instance segmentation"],
                    value=["object detection", "instance segmentation"],
                    label="Output Type",
                    info="Which kind of output is displayed?",
                )

                run_button = gr.Button("Run")

            with gr.Column(scale=2):
                gallery = gr.Image(
                    type="pil",
                )

        # Only the first three columns of each example are used: the fourth
        # column may contain "semantic segmentation", which this tab's
        # output-type selector does not offer.
        gr.Examples(
            examples=[sample[:3] for sample in example_list],
            inputs=[input_image, input_text, score_threshold],
            examples_per_page=5,
        )

        output_json = gr.JSON(label="json results")

        run_button.click(
            fn=run_on_image_A,
            inputs=[input_image, input_text, score_threshold, output_type],
            outputs=[gallery, output_json],
        )


def APE_C_tab():
    """Add the "APE C" tab to the enclosing ``gr.Blocks`` context.

    The tab offers an image upload, a text prompt, a score-threshold slider
    and an output-type selector. Pressing "Run" calls ``run_on_image_C`` and
    shows the returned image and JSON results.

    The tab is built the same way as ``APE_D_tab``: examples are provided
    through ``gr.Examples`` (the previous code referenced the undefined name
    ``set_example``) and the legacy ``.style(...)`` call is not used.
    """
    with gr.Tab("APE C"):
        with gr.Row(equal_height=False):
            with gr.Column(scale=1):
                input_image = gr.Image(
                    sources=["upload"],
                    type="filepath",
                )
                input_text = gr.Textbox(
                    label="Object Prompt (optional, if not provided, will only find COCO object.)",
                    info="格式: word1,word2,sentence1,sentence2,...",
                )

                score_threshold = gr.Slider(
                    label="Score Threshold", minimum=0.01, maximum=1.0, value=0.3, step=0.01
                )

                output_type = gr.CheckboxGroup(
                    ["object detection", "instance segmentation", "semantic segmentation"],
                    value=["object detection", "instance segmentation"],
                    label="Output Type",
                    info="Which kind of output is displayed?",
                )

                run_button = gr.Button("Run")

            with gr.Column(scale=2):
                gallery = gr.Image(
                    type="pil",
                )

        # Each example row supplies image path, prompt, threshold and output
        # types, matching the four inputs below.
        gr.Examples(
            examples=example_list,
            inputs=[input_image, input_text, score_threshold, output_type],
            examples_per_page=5,
        )

        output_json = gr.JSON(label="json results")

        run_button.click(
            fn=run_on_image_C,
            inputs=[input_image, input_text, score_threshold, output_type],
            outputs=[gallery, output_json],
        )


def APE_D_tab():
    """Add the "APE D" tab to the enclosing ``gr.Blocks`` context.

    The tab offers an image upload, a text prompt, a score-threshold slider
    and an output-type selector, plus the shared examples. Pressing "Run"
    calls ``run_on_image_D`` and shows the returned image and JSON results.
    """
    output_choices = ["object detection", "instance segmentation", "semantic segmentation"]

    with gr.Tab("APE D"):
        with gr.Row(equal_height=False):
            # Left column: all user inputs and the run button.
            with gr.Column(scale=1):
                input_image = gr.Image(sources=["upload"], type="filepath")
                input_text = gr.Textbox(
                    label="Object Prompt (optional, if not provided, will only find COCO object.)",
                    info="格式: word1,word2,sentence1,sentence2,...",
                )
                score_threshold = gr.Slider(
                    label="Score Threshold",
                    minimum=0.01,
                    maximum=1.0,
                    value=0.1,
                    step=0.01,
                )
                output_type = gr.CheckboxGroup(
                    output_choices,
                    value=output_choices[:2],
                    label="Output Type",
                    info="Which kind of output is displayed?",
                )
                run_button = gr.Button("Run")

            # Right column: the visualization.
            with gr.Column(scale=2):
                gallery = gr.Image(type="pil")

        model_inputs = (input_image, input_text, score_threshold, output_type)

        gr.Examples(
            examples=example_list,
            inputs=list(model_inputs),
            examples_per_page=20,
        )

        output_json = gr.JSON(label="json results")

        run_button.click(
            fn=run_on_image_D,
            inputs=list(model_inputs),
            outputs=[gallery, output_json],
        )


def comparison_tab():
    """Add the "APE all" tab to the enclosing ``gr.Blocks`` context.

    The inputs are the same as in the single-model tabs. The output column
    holds one image component per model registered in ``all_demo`` at the
    time the tab is built, and pressing "Run" calls
    ``run_on_image_comparison`` to fill them.
    """
    output_choices = ["object detection", "instance segmentation", "semantic segmentation"]

    with gr.Tab("APE all"):
        with gr.Row(equal_height=False):
            # Left column: all user inputs and the run button.
            with gr.Column(scale=1):
                input_image = gr.Image(sources=["upload"], type="filepath")
                input_text = gr.Textbox(
                    label="Object Prompt (optional, if not provided, will only find COCO object.)",
                    info="格式: word1,word2,sentence1,sentence2,...",
                )
                score_threshold = gr.Slider(
                    label="Score Threshold",
                    minimum=0.01,
                    maximum=1.0,
                    value=0.1,
                    step=0.01,
                )
                output_type = gr.CheckboxGroup(
                    output_choices,
                    value=output_choices[:2],
                    label="Output Type",
                    info="Which kind of output is displayed?",
                )
                run_button = gr.Button("Run")

            # Right column: one labelled image per registered model.
            with gr.Column(scale=2):
                gallery_all = [gr.Image(label=key, type="pil") for key in all_demo]

        model_inputs = (input_image, input_text, score_threshold, output_type)

        gr.Examples(
            examples=example_list,
            inputs=list(model_inputs),
            examples_per_page=20,
        )

        run_button.click(
            fn=run_on_image_comparison,
            inputs=list(model_inputs),
            outputs=gallery_all,
        )


def is_port_in_use(port: int) -> bool:
    """Return True when a TCP connection to ``localhost:port`` succeeds."""
    import socket

    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    with probe:
        # connect_ex reports the outcome as an error code instead of raising;
        # 0 means the connection was established.
        status = probe.connect_ex(("localhost", port))
    return status == 0


def add_head_info(max_available_memory):
    """Render the page heading and usage notes as Markdown components.

    Args:
        max_available_memory: Truthy when a GPU is used, which selects the
            GPU note; otherwise the two CPU notes are shown.
    """
    notes = ["# APE: Aligning and Prompting Everything All at Once for Universal Visual Perception"]

    if max_available_memory:
        notes.append(
            "Note multiple models are deployed on single GPU, so it may take several minutes to run the models and visualize the results."
        )
    else:
        notes.extend(
            [
                "Note multiple models are deployed on CPU, so it may take a while to run the models and visualize the results.",
                "Noted results computed by CPU are slightly different to results computed by GPU, and some libraries are disabled on CPU.",
            ]
        )

    notes.append(
        "If the demo is out of memory, try to ***decrease*** the number of object prompt and ***increase*** score threshold."
    )
    # A horizontal rule closes the header.
    notes.append("---")

    for note in notes:
        gr.Markdown(note)


def add_tail_info():
    """Render the footer: a heading and a table of location-prompt examples."""
    # Markdown table pairing each mask image with its prediction image.
    prompt_table = """
    |  Location prompt   | result |  Location prompt   | result  |
    |  ----  | ----  |  ----  | ----  |
    | ![Location prompt](/file=examples/prompt/20230627-131346_11.176.20.67_mask.PNG)  | ![结果](/file=examples/prompt/20230627-131346_11.176.20.67_pred.png) | ![Location prompt](/file=examples/prompt/20230627-131530_11.176.20.67_mask.PNG)  | ![结果](/file=examples/prompt/20230627-131530_11.176.20.67_pred.png) |
    | ![Location prompt](/file=examples/prompt/20230627-131520_11.176.20.67_mask.PNG)  | ![结果](/file=examples/prompt/20230627-131520_11.176.20.67_pred.png) | ![Location prompt](/file=examples/prompt/20230627-114219_11.176.20.67_mask.PNG)  | ![结果](/file=examples/prompt/20230627-114219_11.176.20.67_pred.png) |
    """

    for section in ("---", "### We also support Prompt", prompt_table, "---"):
        gr.Markdown(section)


if __name__ == "__main__":
    # Script entry point: choose a device, load the model(s), build the Gradio
    # UI and launch it. The names assigned here (running_device, save_memory,
    # logger, aug, all_demo, all_cfg) are module-level globals that the
    # functions above read, so no ``global`` statements are needed.

    # Pick the first candidate port that is free. ``server_port`` is None when
    # all candidates are busy; previously the name was left unbound in that
    # case and the print below raised NameError.
    available_port = [80, 8080]
    server_port = next((port for port in available_port if not is_port_in_use(port)), None)
    print("server_port", server_port)

    # Free memory per visible CUDA device, in GiB (empty list without CUDA).
    available_memory = [
        torch.cuda.mem_get_info(i)[0] / 1024**3 for i in range(torch.cuda.device_count())
    ]

    if available_memory:
        # Run on the CUDA device that currently has the most free memory.
        max_available_memory = max(available_memory)
        device_id = available_memory.index(max_available_memory)

        running_device = "cuda:" + str(device_id)
    else:
        max_available_memory = 0
        running_device = "cpu"

    # Memory-saving mode is selected when a GPU is present but has less than
    # 40 GiB free.
    save_memory = 0 < max_available_memory < 40

    print("available_memory", available_memory)
    print("max_available_memory", max_available_memory)
    print("running_device", running_device)
    print("save_memory", save_memory)

    # ==========================================================================================

    mp.set_start_method("spawn", force=True)
    setup_logger(name="fvcore")
    setup_logger(name="ape")
    logger = setup_logger()

    # Resize transform used by run_on_image for images larger than 1024 pixels.
    aug = T.ResizeShortestEdge([1024, 1024], 1024)

    # Registries filled by the load_APE_* functions, keyed by model name.
    all_demo = {}
    all_cfg = {}

    # load_APE_A()
    # load_APE_B()
    # load_APE_C()
    # The value computed above is overridden here - presumably because only
    # one model is loaded, so it can stay on ``running_device``.
    save_memory = False
    load_APE_D()

    title = "APE: Aligning and Prompting Everything All at Once for Universal Visual Perception"
    block = gr.Blocks(title=title).queue()
    with block:
        add_head_info(max_available_memory)

        # APE_A_tab()
        # APE_C_tab()
        APE_D_tab()

        comparison_tab()

        # add_tail_info()

    block.launch(
        share=False,
        # server_name="0.0.0.0",
        # server_port=server_port,
        show_api=False,
        show_error=True,
    )