import argparse
import ast
import os

import cv2
import numpy as np
import torch
from ultralytics import YOLO
from FastSAM.tools import *
from groundingdino.util.inference import load_model, load_image, predict, annotate, Model
from torchvision.ops import box_convert

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_path", type=str, default="./FastSAM/FastSAM-x.pt", help="model"
    )
    parser.add_argument(
        "--img_path", type=str, default="./images/dogs.jpg", help="path to image file"
    )
    parser.add_argument(
        "--text", type=str, default="the black dog.", help="text prompt for GroundingDINO"
    )
    parser.add_argument("--imgsz", type=int, default=1024, help="image size")
    parser.add_argument(
        "--iou",
        type=float,
        default=0.9,
        help="iou threshold for filtering the annotations",
    )
    parser.add_argument(
        "--conf", type=float, default=0.4, help="object confidence threshold"
    )
    parser.add_argument(
        "--output", type=str, default="./output/", help="image save path"
    )
    parser.add_argument(
        "--randomcolor", type=bool, default=True, help="mask random color"
    )
    parser.add_argument(
        "--point_prompt", type=str, default="[[0,0]]", help="[[x1,y1],[x2,y2]]"
    )
    parser.add_argument(
        "--point_label",
        type=str,
        default="[0]",
        help="[1,0] 0:background, 1:foreground",
    )
    parser.add_argument("--box_prompt", type=str, default="[0,0,0,0]", help="[x,y,w,h]")
    parser.add_argument(
        "--better_quality",
        type=str,
        default=False,
        help="better quality using morphologyEx",
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    parser.add_argument(
        "--device", type=str, default=device, help="cuda:[0,1,2,3,4] or cpu"
    )
    parser.add_argument(
        "--retina",
        type=bool,
        default=True,
        help="draw high-resolution segmentation masks",
    )
    parser.add_argument(
        "--withContours", type=bool, default=False, help="draw the edges of the masks"
    )
    return parser.parse_args()
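

# The three prompt flags above arrive as strings; FastSAM's own CLI turns them
# into Python lists with ast.literal_eval, which is presumably why `ast` is
# imported here. A minimal sketch (the helper name is ours, not part of the repo):
def parse_prompt_strings(args):
    # e.g. "[[0,0]]" -> [[0, 0]], "[0]" -> [0], "[0,0,0,0]" -> [0, 0, 0, 0]
    args.point_prompt = ast.literal_eval(args.point_prompt)
    args.point_label = ast.literal_eval(args.point_label)
    args.box_prompt = ast.literal_eval(args.box_prompt)
    return args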


def main(args):
    # Input image and text prompt
    img_path = args.img_path
    text = args.text

    # Directory to save the outputs
    save_path = args.output
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    basename = os.path.splitext(os.path.basename(img_path))[0]

    # Build the FastSAM model (a YOLO checkpoint) and run it over the image
    # ckpt_path = "/comp_robot/rentianhe/code/Grounded-Segment-Anything/FastSAM/FastSAM-x.pt"
    model = YOLO(args.model_path)
    results = model(
        img_path,
        imgsz=args.imgsz,
        device=args.device,
        retina_masks=args.retina,
        iou=args.iou,
        conf=args.conf,
        max_det=100,
    )
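    # Sanity check (our addition): with retina_masks=True, results[0].masks.data
    # is expected to be an (N, H, W) torch tensor holding FastSAM's N candidate
    # masks at the original image resolution.
    # print(tuple(results[0].masks.data.shape))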

    # Build the GroundingDINO model and detect boxes for the text prompt
    groundingdino_config = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
    groundingdino_ckpt_path = "./groundingdino_swint_ogc.pth"
    image_source, image = load_image(img_path)
    model = load_model(groundingdino_config, groundingdino_ckpt_path)
    boxes, logits, phrases = predict(
        model=model,
        image=image,
        caption=text,
        box_threshold=0.3,
        text_threshold=0.25,
        device=args.device,
    )
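    # Note (our comment): predict() returns boxes in normalized (cx, cy, w, h)
    # format in [0, 1], per-box confidence logits, and the caption phrase
    # matched to each box.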

    # Grounded-FastSAM: feed each GroundingDINO box to FastSAM as a box prompt
    ori_img = cv2.imread(img_path)
    ori_h, ori_w = ori_img.shape[:2]

    # Rescale the normalized boxes to pixel coordinates and convert them from
    # center (cx, cy, w, h) to corner (x1, y1, x2, y2) format. One output image
    # is saved per box, since FastSAM's post-processing works box by box.
    boxes = boxes * torch.Tensor([ori_w, ori_h, ori_w, ori_h])
    print(f"Detected Boxes: {len(boxes)}")
    boxes = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").cpu().numpy().tolist()
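    # Worked example (ours): a normalized box (0.5, 0.5, 0.2, 0.4) on a
    # 640x480 image becomes (cx, cy, w, h) = (320, 240, 128, 192) in pixels,
    # i.e. (x1, y1, x2, y2) = (256, 144, 384, 336).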
    for box_idx in range(len(boxes)):
        # box_prompt selects, from FastSAM's candidate masks, the one that
        # best overlaps the GroundingDINO box
        mask, _ = box_prompt(
            results[0].masks.data,
            boxes[box_idx],
            ori_h,
            ori_w,
        )
        annotations = np.array([mask])
        img_array = fast_process(
            annotations=annotations,
            args=args,
            mask_random_color=True,
            bbox=boxes[box_idx],
        )
        cv2.imwrite(
            os.path.join(save_path, f"{basename}_{box_idx}_caption_{phrases[box_idx]}.jpg"),
            cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR),
        )


if __name__ == "__main__":
    args = parse_args()
    main(args)
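
# Example invocation, assuming this file is saved as grounded_fast_sam.py and
# the default checkpoint/image paths from the flags above exist:
#
#   python grounded_fast_sam.py --img_path ./images/dogs.jpg \
#       --text "the black dog." --output ./output/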