"""Built-in query strings and the hand-written plans that answer them.

``BUILTIN_PLANS`` maps each query string to a plan.  A plan is a list of
stages and each stage is a list of ``Action`` objects.
``load_builtin_plans`` builds a mapping of the same shape from a JSON file.
"""
import json
import os
import sys

# The working directory is put on the import path before the project import
# below (presumably so ``cllm`` resolves when run from the repository root).
sys.path.append(os.getcwd())

from cllm.agents.base import Action

BUILTIN_SEG_BY_POINTS = "Segment the given image based on the prompt points."
BUILTIN_SEG_BY_MASK = "Segment the given image based on the prompt mask."
# BUILTIN_REMOVE_BY_MASK = "Remove the object based on the given mask."
BUILTIN_IMAGE_TO_EDGE = "Generate the edge from the given image."
BUILTIN_GENERATE_SIMILAR_IMAGE = "Generate a new image similar to the input image"
# BUILTIN_GENERATE_SIMILAR_IMAGE2 = "Generate a similar image from the given image 2"
# BUILTIN_GENERATE_SIMILAR_IMAGE3 = "Image to image. 3"
BUILTIN_GENERATE_SIMILAR_IMAGE4 = "Generate a new image similar to image 4"
BUILTIN_GENERATE_IMAGE_HED = "Generate a new image based on HED result from input image"
BUILTIN_GENERATE_IMAGE_DEPTH = (
    "Generate a new image based on depth map from input image"
)
BUILTIN_GENERATE_IMAGE_OCR = "Please extract the text from the image"
BUILTIN_TEXT_EDGE_TO_IMAGE = "Generate an image based on the given edge map."
BUILTIN_GENERATE_IMAGE = "Generate a new image that shows a woman is skiing"
BUILTIN_IMAGE_TO_VIDEO = "Generate a video from the image"
BUILTIN_COUNT_OBJECTS = "Provide me with the count of bears in the input image"
BUILTIN_VIDEO_TO_WEBPAGE = "Generate a web page for input video"
BUILTIN_TEXT_TO_MUSIC = "Please generate a piece of music based on given prompt. Here is the prompt: An 80s driving pop song with heavy drums and synth pads in the background"
BUILTIN_IMAGE_ERASING1 = "Erase the wine glass from the photo"
BUILTIN_IMAGE_ERASING2 = "Erase the cats in the photo"
BUILTIN_IMAGE_CROPPING = "Crop the cats from the photo"
BUILTIN_IMAGE_SEG = "give me the mask of elephant."
BUILTIN_IMAGE_HIGHLIGHT = "highlight the elephant."
BUILTIN_TEXT_SPEECH = "translate text into speech"
BUILTIN_DUBBING = "dub this video with the given audio"
BUILTIN_COUNT_OBJECTS2 = "Count the horse in the image."
BUILTIN_IMAGE_TO_VIDEO2 = "Generate an image that shows a serene and beautiful landscape with a calm lake reflecting the blue sky and white clouds. Then generate a video to introduce this image."
BUILTIN_IMAGE_TO_VIDEO3 = "Create a visual and auditory representation of a peaceful and scenic landscape. The image should depict a serene and beautiful landscape with a calm lake reflecting the blue sky. The music should match the image. Finally, combine the image and the music into a video that showcases the beauty of nature."
BUILTIN_VIDEO_CLS = "Recognize the action in the video"
BUILTIN_AUDIO_CLS = "Recognize the event in this audio"
BUILTIN_IMAGE2MUSIC = "Generate a piece of music for this image"
BUILTIN_VIDEO2MUSIC = (
    "Generate a piece of music for this video and dub the video with generated music"
)

# Query string -> plan.  Each plan is a list of stages; each stage is a list
# of Action objects.  Within a plan, an ``outputs`` name written by one action
# is reused verbatim as an ``inputs`` value of a later action, so the two
# spellings must match exactly.
BUILTIN_PLANS = {
    # BUILTIN_REMOVE_BY_MASK: [
    #     [
    #         Action(
    #             tool_name="image_inpainting",
    #             inputs={"image": "image", "mask": "image.mask"},
    #             outputs=["-0"],
    #         )
    #     ]
    # ],
    BUILTIN_IMAGE_TO_EDGE: [
        [
            Action(
                tool_name="image_to_edge",
                inputs={"image": "image"},
                outputs=["-0"],
            )
        ]
    ],
    BUILTIN_TEXT_EDGE_TO_IMAGE: [
        [
            Action(
                tool_name="image_captioning",
                inputs={"image": "image"},
                outputs=["-prompt"],
            ),
            Action(
                tool_name="edge_text_to_image",
                inputs={
                    "edge": "image.edge",
                    "text": "-prompt",
                },
                outputs=["-0"],
            ),
        ]
    ],
    BUILTIN_GENERATE_SIMILAR_IMAGE: [
        [
            Action(
                tool_name="image_to_edge",
                inputs={"image": "image"},
                outputs=["-edge"],
            ),
            Action(
                tool_name="image_captioning",
                inputs={"image": "image"},
                outputs=["-prompt"],
            ),
            Action(
                tool_name="edge_text_to_image",
                inputs={
                    "edge": "-edge",
                    "text": "-prompt",
                },
                outputs=["-0"],
            ),
        ]
    ],
    # BUILTIN_GENERATE_SIMILAR_IMAGE2: [
    #     [
    #         Action(
    #             tool_name="image_captioning",
    #             inputs={"image": "image"},
    #             outputs=["-prompt"],
    #         ),
    #         Action(
    #             tool_name="text_to_image",
    #             inputs={"text": "-prompt"},
    #             outputs=["-0"],
    #         ),
    #     ]
    # ],
    # BUILTIN_GENERATE_SIMILAR_IMAGE3: [
    #     [
    #         Action(
    #             tool_name="image_to_image",
    #             inputs={"image": "image"},
    #             outputs=["-0"],
    #         ),
    #     ]
    # ],
    BUILTIN_GENERATE_IMAGE_HED: [
        [
            Action(
                tool_name="image_to_hed",
                inputs={"image": "image"},
                outputs=["-image_to_hed-hed-0"],
            ),
            Action(
                tool_name="hed_text_to_image",
                inputs={
                    "text": "beautiful mountains and sunset",
                    "hed": "-image_to_hed-hed-0",
                },
                outputs=["-0"],
            ),
        ]
    ],
    BUILTIN_GENERATE_IMAGE_DEPTH: [
        [
            Action(
                tool_name="image_captioning",
                inputs={
                    "image": "image",
                },
                outputs=["-image_captioning-text-0"],
            ),
            Action(
                tool_name="image_to_depth",
                inputs={"image": "image"},
                outputs=["-image_to_depth-depth-0"],
            ),
            Action(
                tool_name="depth_text_to_image",
                inputs={
                    "text": "-image_captioning-text-0",
                    "depth": "-image_to_depth-depth-0",
                },
                outputs=["-0"],
            ),
        ]
    ],
    BUILTIN_GENERATE_IMAGE_OCR: [
        [
            Action(
                tool_name="optical_character_recognition",
                inputs={"image": "image"},
                outputs=["-0"],
            )
        ]
    ],
    BUILTIN_COUNT_OBJECTS: [
        [
            Action(
                tool_name="object_detection",
                inputs={"image": "image"},
                outputs=["-object_detection-bbox-0"],
            ),
            Action(
                tool_name="select_bbox",
                inputs={
                    "bbox_list": "-object_detection-bbox-0",
                    "condition": "bear",
                },
                outputs=["-select_bbox-bbox-0"],
            ),
            Action(
                tool_name="count_objects",
                inputs={"bbox_list": "-select_bbox-bbox-0"},
                outputs=["-0"],
            ),
        ],
        [
            Action(
                tool_name="image_question_answering",
                inputs={
                    "text": "Provide me with the count of bears in the input image",
                    "image": "image",
                },
                outputs=["-1"],
            )
        ],
    ],
    BUILTIN_VIDEO_TO_WEBPAGE: [
        [
            Action(
                tool_name="video_captioning",
                inputs={"video": "video"},
                outputs=["-text-0"],
            ),
            Action(
                tool_name="text_to_music",
                inputs={"text": "-text-0"},
                outputs=["-text_to_music-audio-0"],
            ),
            Action(
                tool_name="dub_video",
                inputs={
                    "video": "video",
                    "audio": "-text_to_music-audio-0",
                },
                outputs=["-dub_video-video-0"],
            ),
            Action(
                tool_name="title_generation",
                inputs={"text": "-text-0"},
                outputs=["-text-1"],
            ),
            Action(
                tool_name="text_to_tags",
                inputs={"text": "-text-0"},
                outputs=["-tags-0"],
            ),
            Action(
                tool_name="video_to_webpage",
                inputs={
                    "video": "-dub_video-video-0",
                    "title": "-text-1",
                    "tags": "-tags-0",
                    "description": "-text-0",
                },
                outputs=["-0"],
            ),
        ]
    ],
    BUILTIN_TEXT_TO_MUSIC: [
        [
            Action(
                tool_name="text_to_music",
                inputs={
                    "text": "An 80s driving pop song with heavy drums and synth pads in the background"
                },
                outputs=["-audio-0"],
            )
        ]
    ],
    BUILTIN_IMAGE_ERASING1: [
        [
            Action(
                tool_name="image_instance_segmentation",
                inputs={"image": "image"},
                outputs=["-image_instance_segmentation-mask-0"],
            ),
            Action(
                tool_name="select_mask",
                inputs={
                    "mask_list": "-image_instance_segmentation-mask-0",
                    "condition": "wine glass",
                },
                # Must match the "mask" input of image_inpainting below; this
                # was previously "-select_mask-mask-1", which no later action
                # read, leaving the inpainting step without its mask.
                outputs=["-select_mask-mask-0"],
            ),
            Action(
                tool_name="image_inpainting",
                inputs={
                    "image": "image",
                    "mask": "-select_mask-mask-0",
                },
                outputs=["-0"],
            ),
        ]
    ],
    BUILTIN_IMAGE_ERASING2: [
        [
            Action(
                tool_name="image_instance_segmentation",
                inputs={"image": "image"},
                outputs=["-image_instance_segmentation-mask-0"],
            ),
            Action(
                tool_name="select_mask",
                inputs={
                    "mask_list": "-image_instance_segmentation-mask-0",
                    "condition": "cat",
                },
                outputs=["-select_mask-mask-0"],
            ),
            Action(
                tool_name="image_inpainting",
                inputs={
                    "image": "image",
                    "mask": "-select_mask-mask-0",
                },
                outputs=["-0"],
            ),
        ]
    ],
    BUILTIN_IMAGE_CROPPING: [
        [
            Action(
                tool_name="object_detection",
                inputs={"image": "image"},
                outputs=["-object_detection-bbox-0"],
            ),
            Action(
                tool_name="select_bbox",
                inputs={
                    "bbox_list": "-object_detection-bbox-0",
                    "condition": "cat",
                },
                outputs=["-select_bbox-bbox-0"],
            ),
            Action(
                tool_name="image_cropping",
                inputs={
                    "image": "image",
                    "object": "-select_bbox-bbox-0",
                },
                outputs=["-0"],
            ),
        ]
    ],
    BUILTIN_IMAGE_SEG: [
        [
            Action(
                tool_name="image_instance_segmentation",
                inputs={"image": "image"},
                outputs=["-image_instance_segmentation-mask-0"],
            ),
            Action(
                tool_name="select_mask",
                inputs={
                    "mask_list": "-image_instance_segmentation-mask-0",
                    "condition": "elephant",
                },
                outputs=["-0"],
            ),
        ]
    ],
    BUILTIN_IMAGE_HIGHLIGHT: [
        [
            Action(
                tool_name="object_detection",
                inputs={"image": "image"},
                outputs=["-object_detection-bbox-0"],
            ),
            Action(
                tool_name="select_bbox",
                inputs={
                    "bbox_list": "-object_detection-bbox-0",
                    "condition": "elephant",
                },
                outputs=["-select_bbox-bbox-0"],
            ),
            Action(
                tool_name="highlight_object_on_image",
                inputs={
                    "image": "image",
                    "bbox": "-select_bbox-bbox-0",
                },
                outputs=["-0"],
            ),
        ]
    ],
    BUILTIN_TEXT_SPEECH: [
        [
            Action(
                tool_name="text_to_speech",
                inputs={
                    "text": "Hope is the thing with feathers That perches in the soul, And sings the tune without the words, And never stops at all"
                },
                outputs=["-0"],
            )
        ]
    ],
    BUILTIN_DUBBING: [
        [
            Action(
                tool_name="dub_video",
                inputs={"video": "video", "audio": "audio"},
                outputs=["-0"],
            )
        ]
    ],
    BUILTIN_GENERATE_SIMILAR_IMAGE4: [
        [
            Action(
                tool_name="segment_anything",
                inputs={"image": "image"},
                outputs=["-seg"],
            ),
            Action(
                tool_name="image_captioning",
                inputs={"image": "image"},
                outputs=["-prompt"],
            ),
            Action(
                tool_name="segmentation_text_to_image",
                inputs={
                    "segmentation": "-seg",
                    "text": "-prompt",
                },
                outputs=["-0"],
            ),
        ]
    ],
    BUILTIN_GENERATE_IMAGE: [
        [
            Action(
                tool_name="text_to_image",
                inputs={"text": "a woman is skiing"},
                outputs=["-0"],
            )
        ]
    ],
    BUILTIN_IMAGE_TO_VIDEO: [
        [
            Action(
                tool_name="image_to_video",
                inputs={"image": "image"},
                outputs=["-0"],
            )
        ]
    ],
    BUILTIN_COUNT_OBJECTS2: [
        [
            Action(
                tool_name="object_detection",
                inputs={"image": "image"},
                outputs=["-object_detection-bbox-0"],
            ),
            Action(
                tool_name="select_bbox",
                inputs={
                    "bbox_list": "-object_detection-bbox-0",
                    "condition": "horse",
                },
                outputs=["-select_bbox-bbox-0"],
            ),
            Action(
                tool_name="count_objects",
                inputs={"bbox_list": "-select_bbox-bbox-0"},
                outputs=["-0"],
            ),
        ],
        [
            Action(
                tool_name="image_question_answering",
                inputs={
                    "text": "Provide me with the count of horses in the input image",
                    "image": "image",
                },
                outputs=["-1"],
            )
        ],
    ],
    BUILTIN_IMAGE_TO_VIDEO2: [
        [
            Action(
                tool_name="text_to_image",
                inputs={
                    "text": "A serene and beautiful landscape with a calm lake reflecting the blue sky and white clouds."
                },
                outputs=["-0"],
            ),
        ],
        [
            Action(
                tool_name="image_captioning",
                inputs={"image": "-0"},
                outputs=["-text-0"],
            ),
            Action(
                tool_name="text_to_speech",
                inputs={"text": "-text-0"},
                outputs=["-text_to_speech-audio-0"],
            ),
            Action(
                tool_name="image_audio_to_video",
                inputs={
                    "image": "-0",
                    "audio": "-text_to_speech-audio-0",
                },
                outputs=["-1"],
            ),
        ],
    ],
    BUILTIN_IMAGE_TO_VIDEO3: [
        [
            Action(
                tool_name="text_to_image",
                inputs={
                    "text": "A serene and beautiful landscape with a calm lake reflecting the blue sky."
                },
                outputs=["-0"],
            ),
        ],
        [
            Action(
                tool_name="image_captioning",
                inputs={"image": "-0"},
                outputs=["-text-0"],
            ),
            Action(
                tool_name="text_to_music",
                inputs={"text": "-text-0"},
                outputs=["-1"],
            ),
        ],
        [
            Action(
                tool_name="image_to_video",
                inputs={
                    "image": "-0",
                },
                outputs=["-image_to_video-video-0"],
            ),
            Action(
                tool_name="dub_video",
                inputs={
                    "video": "-image_to_video-video-0",
                    "audio": "-1",
                },
                outputs=["-2"],
            ),
        ],
    ],
    BUILTIN_VIDEO_CLS: [
        [
            Action(
                tool_name="video_classification",
                inputs={"video": "video"},
                outputs=["-0"],
            )
        ]
    ],
    BUILTIN_AUDIO_CLS: [
        [
            Action(
                tool_name="audio_classification",
                inputs={"audio": "audio"},
                outputs=["-0"],
            )
        ]
    ],
    BUILTIN_IMAGE2MUSIC: [
        [
            Action(
                tool_name="image_captioning",
                inputs={"image": "image"},
                outputs=["-text-0"],
            ),
            Action(
                tool_name="text_to_music",
                inputs={"text": "-text-0"},
                outputs=["-0"],
            ),
        ]
    ],
    BUILTIN_VIDEO2MUSIC: [
        [
            Action(
                tool_name="video_captioning",
                inputs={"video": "video"},
                outputs=["-text-0"],
            ),
            Action(
                tool_name="text_to_music",
                inputs={"text": "-text-0"},
                outputs=["-0"],
            ),
        ],
        [
            Action(
                tool_name="dub_video",
                inputs={
                    "video": "video",
                    "audio": "-0",
                },
                outputs=["-1"],
            ),
        ],
    ],
    BUILTIN_SEG_BY_POINTS: [
        [
            Action(
                tool_name="image_segmentation_by_points",
                inputs={"image": "image", "prompt_points": "prompt_points"},
                outputs=["-0"],
            )
        ]
    ],
    # BUILTIN_SEG_BY_MASK: [
    #     [
    #         Action(
    #             tool_name='image_segmentation_by_mask',
    #             inputs={'image': 'image', 'prompt_mask': 'prompt_mask'},
    #             outputs=['-0'],
    #         )
    #     ]
    # ],
}


def load_builtin_plans(path):
    """Load a query-to-plan mapping from a JSON file.

    The file must hold a JSON object that maps each query string to a list
    whose first element is a list of action objects, each with the keys
    ``tool_name``, ``inputs`` and ``outputs``.

    Args:
        path: Location of the JSON file to read.

    Returns:
        A dict mapping each query string to a one-element list containing
        the list of ``Action`` objects built from that first element.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        IndexError: If a query maps to an empty list.
        KeyError: If an action object lacks one of the required keys.
    """
    # The context manager closes the file even when parsing fails.
    with open(path, "r") as plan_file:
        plans = json.load(plan_file)

    # NOTE(review): only the first stage (``stages[0]``) of every plan is
    # converted; any further stages in the file are ignored.  Confirm whether
    # multi-stage plans are meant to be loadable from JSON.
    return {
        query: [
            [
                Action(
                    tool_name=spec["tool_name"],
                    inputs=spec["inputs"],
                    outputs=spec["outputs"],
                )
                for spec in stages[0]
            ]
        ]
        for query, stages in plans.items()
    }