from typing import List, Union

from vision_functions import find_in_image, simple_qa, verify_property, best_text_match, compute_depth


def bool_to_yesno(bool_answer: bool) -> str:
    """Converts a boolean into the string "yes" (True) or "no" (False)."""
    return "yes" if bool_answer else "no"


class ImagePatch:
    """A Python class containing a crop of an image centered around a particular object, as well as relevant information.

    Attributes
    ----------
    cropped_image : array_like
        An array-like of the cropped image taken from the original image.
    left : int
        An int describing the position of the left border of the crop's bounding box in the original image.
    lower : int
        An int describing the position of the bottom border of the crop's bounding box in the original image.
    right : int
        An int describing the position of the right border of the crop's bounding box in the original image.
    upper : int
        An int describing the position of the top border of the crop's bounding box in the original image.
    width : int
        Width of the crop, read from cropped_image.shape[2].
    height : int
        Height of the crop, read from cropped_image.shape[1].
    horizontal_center : float
        Midpoint between the left and right borders.
    vertical_center : float
        Midpoint between the lower and upper borders.

    Methods
    -------
    find(object_name: str) -> List[ImagePatch]
        Returns a list of new ImagePatch objects containing crops of the image centered around any objects found in
        the image matching the object_name.
    simple_query(question: str=None) -> str
        Returns the answer to a basic question asked about the image. If no question is provided, returns the answer
        to "What is this?".
    exists(object_name: str) -> bool
        Returns True if the object specified by object_name is found in the image, and False otherwise.
    verify_property(object_name: str, property: str) -> bool
        Returns True if the object possesses the property, and False otherwise.
    compute_depth() -> float
        Returns the median depth of the image crop.
    best_text_match(option_list: List[str]) -> str
        Returns the string from option_list that best matches the image.
    crop(left: int, lower: int, right: int, upper: int) -> ImagePatch
        Returns a new ImagePatch object containing a crop of the image at the given coordinates.
    """

    def __init__(self, image, left: int = None, lower: int = None, right: int = None, upper: int = None):
        """Initializes an ImagePatch object by cropping the image at the given coordinates and stores the coordinates
        as attributes. If no coordinates are provided, the image is left unmodified, and the coordinates are set to
        the dimensions of the image. A coordinate left as None while others are given defaults to the matching image
        border.

        Parameters
        -------
        image : array_like
            An array-like of the original image. It is indexed with three axes; axis 1 is used as height and axis 2
            as width.
        left : int
            An int describing the position of the left border of the crop's bounding box in the original image.
        lower : int
            An int describing the position of the bottom border of the crop's bounding box in the original image.
        right : int
            An int describing the position of the right border of the crop's bounding box in the original image.
        upper : int
            An int describing the position of the top border of the crop's bounding box in the original image.
        """
        if left is None and right is None and upper is None and lower is None:
            self.cropped_image = image
            self.left = 0
            self.lower = 0
            self.right = image.shape[2]  # width
            self.upper = image.shape[1]  # height
        else:
            # Fill in any missing border so the stored coordinates are always numbers; otherwise the center
            # computations below would fail on None.
            if left is None:
                left = 0
            if lower is None:
                lower = 0
            if right is None:
                right = image.shape[2]  # width
            if upper is None:
                upper = image.shape[1]  # height
            self.cropped_image = image[:, lower:upper, left:right]
            self.left = left
            self.upper = upper
            self.right = right
            self.lower = lower

        self.width = self.cropped_image.shape[2]
        self.height = self.cropped_image.shape[1]

        self.horizontal_center = (self.left + self.right) / 2
        self.vertical_center = (self.lower + self.upper) / 2

    def find(self, object_name: str) -> List["ImagePatch"]:
        """Returns a list of ImagePatch objects containing crops of the image centered around the objects matching
        object_name. The result is whatever find_in_image returns for this crop.

        Parameters
        -------
        object_name : str
            A string describing the name of the object to be found in the image.

        Examples
        --------
        >>> # Given an image: Find the foo.
        >>> def execute_command(image) -> List[ImagePatch]:
        >>>     image_patch = ImagePatch(image)
        >>>     foo_patches = image_patch.find("foo")
        >>>     return foo_patches
        """
        return find_in_image(self.cropped_image, object_name)

    def simple_query(self, question: str = None) -> str:
        """Returns the answer to a basic question asked about the image. If no question is provided, returns the
        answer to "What is this?".

        Parameters
        -------
        question : str
            A string describing the question to be asked.

        Examples
        -------

        >>> # Given an image: Which kind of animal is not eating?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     animal_patches = image_patch.find("animal")
        >>>     for animal_patch in animal_patches:
        >>>         if not animal_patch.verify_property("animal", "eating"):
        >>>             return animal_patch.simple_query("What kind of animal is eating?") # crop would include eating so keep it in the query
        >>>     # If no animal is not eating, query the image directly
        >>>     return image_patch.simple_query("Which kind of animal is not eating?")

        >>> # Given an image: What is in front of the horse?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     # contains a relation (around, next to, on, near, on top of, in front of, behind, etc), so ask directly
        >>>     return image_patch.simple_query("What is in front of the horse?")
        """
        # Apply the documented default explicitly instead of forwarding None to the backend.
        if question is None:
            question = "What is this?"
        return simple_qa(self.cropped_image, question)

    def exists(self, object_name: str) -> bool:
        """Returns True if the object specified by object_name is found in the image, and False otherwise.

        Parameters
        -------
        object_name : str
            A string describing the name of the object to be found in the image.

        Examples
        -------
        >>> # Given an image: Are there both cakes and gummy bears in the photo?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     is_cake = image_patch.exists("cake")
        >>>     is_gummy_bear = image_patch.exists("gummy bear")
        >>>     return bool_to_yesno(is_cake and is_gummy_bear)
        """
        return len(self.find(object_name)) > 0

    def verify_property(self, object_name: str, property: str) -> bool:
        """Returns True if the object possesses the property, and False otherwise.
        Differs from 'exists' in that it presupposes the existence of the object specified by object_name, instead
        checking whether the object possesses the property.

        Parameters
        -------
        object_name : str
            A string describing the name of the object to be found in the image.
        property : str
            A string describing the property to be checked.

        Examples
        -------
        >>> # Given an image: Do the letters have blue color?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     letters_patches = image_patch.find("letters")
        >>>     # Question assumes only one letter patch
        >>>     if len(letters_patches) == 0:
        >>>         # If no letters are found, query the image directly
        >>>         return image_patch.simple_query("Do the letters have blue color?")
        >>>     return bool_to_yesno(letters_patches[0].verify_property("letters", "blue"))
        """
        # Inside the method body the bare name resolves to the module-level verify_property imported above.
        return verify_property(self.cropped_image, object_name, property)

    def compute_depth(self) -> float:
        """Returns the median depth of the image crop

        Returns
        -------
        float
            the median depth of the image crop

        Examples
        --------
        >>> # Given an image: Find the bar furthest away.
        >>> def execute_command(image)->ImagePatch:
        >>>     image_patch = ImagePatch(image)
        >>>     bar_patches = image_patch.find("bar")
        >>>     bar_patches.sort(key=lambda bar: bar.compute_depth())
        >>>     return bar_patches[-1]
        """
        # Inside the method body the bare name resolves to the module-level compute_depth imported above.
        depth_map = compute_depth(self.cropped_image)
        return depth_map.median()

    def best_text_match(self, option_list: List[str]) -> str:
        """Returns the string that best matches the image.

        Parameters
        -------
        option_list : List[str]
            A list with the names of the different options

        Examples
        -------
        >>> # Given an image: Is the cap gold or white?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     cap_patches = image_patch.find("cap")
        >>>     # Question assumes one cap patch
        >>>     if len(cap_patches) == 0:
        >>>         # If no cap is found, query the image directly
        >>>         return image_patch.simple_query("Is the cap gold or white?")
        >>>     return cap_patches[0].best_text_match(["gold", "white"])
        """
        return best_text_match(self.cropped_image, option_list)

    def crop(self, left: int, lower: int, right: int, upper: int) -> "ImagePatch":
        """Returns a new ImagePatch cropped from the current ImagePatch.

        Parameters
        -------
        left : int
            The leftmost pixel of the cropped image.
        lower : int
            The lowest pixel of the cropped image.
        right : int
            The rightmost pixel of the cropped image.
        upper : int
            The uppermost pixel of the cropped image.

        Returns
        -------
        ImagePatch
            A new patch built from this patch's cropped_image and the given coordinates.
        """
        return ImagePatch(self.cropped_image, left, lower, right, upper)


def best_image_match(list_patches: List[ImagePatch], content: List[str], return_index=False) -> Union[ImagePatch, int]:
    """Returns the patch most likely to contain the content.

    Parameters
    ----------
    list_patches : List[ImagePatch]
    content : List[str]
        the object of interest
    return_index : bool
        if True, returns the index of the patch most likely to contain the object

    Returns
    -------
    ImagePatch or int
        Patch most likely to contain the object, or its index when return_index is True
    """
    # NOTE(review): as written in this file the function calls itself; presumably the real implementation is
    # supplied elsewhere at execution time -- confirm before running this module directly.
    return best_image_match(list_patches, content, return_index)


def distance(patch_a: ImagePatch, patch_b: ImagePatch) -> float:
    """
    Returns the distance between the edges of two ImagePatches. If the patches overlap, it returns a negative distance
    corresponding to the negative intersection over union.

    Parameters
    ----------
    patch_a : ImagePatch
    patch_b : ImagePatch

    Examples
    --------
    # Return the qux that is closest to the foo
    >>> def execute_command(image):
    >>>     image_patch = ImagePatch(image)
    >>>     qux_patches = image_patch.find('qux')
    >>>     foo_patches = image_patch.find('foo')
    >>>     foo_patch = foo_patches[0]
    >>>     qux_patches.sort(key=lambda x: distance(x, foo_patch))
    >>>     return qux_patches[0]
    """
    # NOTE(review): as written in this file the function calls itself; presumably the real implementation is
    # supplied elsewhere at execution time -- confirm before running this module directly.
    return distance(patch_a, patch_b)


# Examples of using ImagePatch


# Given an image: What toy is wearing a shirt?
def execute_command(image) -> str:
    # not a relational verb so go step by step
    image_patch = ImagePatch(image)
    toy_patches = image_patch.find("toy")
    # Question assumes only one toy patch
    if len(toy_patches) == 0:
        # If no toy is found, query the image directly
        return image_patch.simple_query("What toy is wearing a shirt?")
    for toy_patch in toy_patches:
        is_wearing_shirt = (toy_patch.simple_query("Is the toy wearing a shirt?") == "yes")
        if is_wearing_shirt:
            return toy_patch.simple_query(
                "What toy is wearing a shirt?")  # crop would include the shirt so keep it in the query
    # If no toy is wearing a shirt, pick the first toy
    return toy_patches[0].simple_query("What toy is wearing a shirt?")


# Given an image: Who is the man staring at?
def execute_command(image) -> str:
    # asks for the predicate of a relational verb (staring at), so ask directly
    image_patch = ImagePatch(image)
    return image_patch.simple_query("Who is the man staring at?")


# Given an image: Find more visible chair.
def execute_command(image) -> ImagePatch:
    # Return the chair
    image_patch = ImagePatch(image)
    chair_patches = image_patch.find("chair")
    if len(chair_patches) == 0:
        # If no chair is found, fall back to the whole image so the indexing below cannot fail
        chair_patches = [image_patch]
    # Remember: return the chair
    return chair_patches[0]


# Given an image: Find lamp on the bottom.
def execute_command(image) -> ImagePatch:
    # Return the lamp
    image_patch = ImagePatch(image)
    lamp_patches = image_patch.find("lamp")
    if len(lamp_patches) == 0:
        # If no lamp is found, fall back to the whole image so the indexing below cannot fail
        lamp_patches = [image_patch]
    # Ascending vertical_center puts the bottommost patch first
    lamp_patches.sort(key=lambda lamp: lamp.vertical_center)
    # Remember: return the lamp
    return lamp_patches[0]  # Return the bottommost lamp


# Given a list of images: Does the pole that is near a building that is near a green sign and the pole that is near bushes that are near a green sign have the same material?
def execute_command(image_list) -> str:
    material_1 = None
    material_2 = None
    for image in image_list:
        image = ImagePatch(image)
        # find the building
        building_patches = image.find("building")
        for building_patch in building_patches:
            poles = building_patch.find("pole")
            signs = building_patch.find("sign")
            greensigns = [sign for sign in signs if sign.verify_property('sign', 'green')]
            if len(poles) > 0 and len(greensigns) > 0:
                material_1 = poles[0].simple_query("What is the material of the pole?")
        # find the bush
        bushes_patches = image.find("bushes")
        for bushes_patch in bushes_patches:
            poles = bushes_patch.find("pole")
            signs = bushes_patch.find("sign")
            greensigns = [sign for sign in signs if sign.verify_property('sign', 'green')]
            if len(poles) > 0 and len(greensigns) > 0:
                material_2 = poles[0].simple_query("What is the material of the pole?")
    # NOTE: if neither pole is found, both materials are still None and the comparison answers "yes"
    return bool_to_yesno(material_1 == material_2)


# Given an image: Find middle kid.
def execute_command(image) -> ImagePatch:
    # Return the kid
    image_patch = ImagePatch(image)
    kid_patches = image_patch.find("kid")
    if len(kid_patches) == 0:
        kid_patches = [image_patch]
    kid_patches.sort(key=lambda kid: kid.horizontal_center)
    # Remember: return the kid
    return kid_patches[len(kid_patches) // 2]  # Return the middle kid


# Given an image: Is that blanket to the right of a pillow?
def execute_command(image) -> str:
    image_patch = ImagePatch(image)
    blanket_patches = image_patch.find("blanket")
    # Question assumes only one blanket patch
    if len(blanket_patches) == 0:
        # If no blanket is found, query the image directly
        return image_patch.simple_query("Is that blanket to the right of a pillow?")
    # The pillow search does not depend on the blanket, so run it once
    pillow_patches = image_patch.find("pillow")
    for blanket_patch in blanket_patches:
        for pillow_patch in pillow_patches:
            # The blanket is to the right when its center lies further right than the pillow's
            if blanket_patch.horizontal_center > pillow_patch.horizontal_center:
                return "yes"
    return "no"


# Given an image: How many people are there?
def execute_command(image) -> str:
    image_patch = ImagePatch(image)
    person_patches = image_patch.find("person")
    return str(len(person_patches))


# Given a list of images: Is the man that is wearing dark pants driving?.
def execute_command(image_list) -> str:
    for image in image_list:
        image = ImagePatch(image)
        man_patches = image.find("man")
        for man_patch in man_patches:
            pants = man_patch.find("pants")
            if len(pants) == 0:
                continue
            if pants[0].verify_property("pants", "dark"):
                return man_patch.simple_query("Is this man driving?")
    # No man with dark pants was found in any image, so ask the first image directly
    return ImagePatch(image_list[0]).simple_query("Is the man that is wearing dark pants driving?")


# Given an image: Is there a backpack to the right of the man?
def execute_command(image) -> str:
    image_patch = ImagePatch(image)
    man_patches = image_patch.find("man")
    # Question assumes one man patch
    if len(man_patches) == 0:
        # If no man is found, query the image directly
        return image_patch.simple_query("Is there a backpack to the right of the man?")
    man_patch = man_patches[0]
    backpack_patches = image_patch.find("backpack")
    # Question assumes one backpack patch
    if len(backpack_patches) == 0:
        return "no"
    for backpack_patch in backpack_patches:
        if backpack_patch.horizontal_center > man_patch.horizontal_center:
            return "yes"
    return "no"


# Given a list of images: What is the pizza with red tomato on it on?
def execute_command(image_list) -> str:
    for image in image_list:
        image = ImagePatch(image)
        pizza_patches = image.find("pizza")
        for pizza_patch in pizza_patches:
            tomato_patches = pizza_patch.find("tomato")
            has_red_tomato = False
            for tomato_patch in tomato_patches:
                if tomato_patch.verify_property("tomato", "red"):
                    has_red_tomato = True
            if has_red_tomato:
                return pizza_patch.simple_query("What is the pizza on?")
    # No pizza with a red tomato was found in any image, so ask the first image directly
    return ImagePatch(image_list[0]).simple_query("What is the pizza with red tomato on it on?")


# Given an image: Find chair to the right near the couch.
def execute_command(image) -> ImagePatch:
    # Return the chair
    image_patch = ImagePatch(image)
    chair_patches = image_patch.find("chair")
    if len(chair_patches) == 0:
        chair_patches = [image_patch]
    elif len(chair_patches) == 1:
        return chair_patches[0]
    chair_patches_right = [c for c in chair_patches if c.horizontal_center > image_patch.horizontal_center]
    if len(chair_patches_right) == 0:
        # No chair lies in the right half (this includes the whole-image fallback), so consider every chair
        chair_patches_right = chair_patches
    couch_patches = image_patch.find("couch")
    if len(couch_patches) == 0:
        couch_patches = [image_patch]
    couch_patch = couch_patches[0]
    chair_patches_right.sort(key=lambda c: distance(c, couch_patch))
    chair_patch = chair_patches_right[0]
    # Remember: return the chair
    return chair_patch


# Given an image: Are there bagels or lemons?
def execute_command(image) -> str:
    image_patch = ImagePatch(image)
    is_bagel = image_patch.exists("bagel")
    is_lemon = image_patch.exists("lemon")
    return bool_to_yesno(is_bagel or is_lemon)


# Given an image: In which part is the bread, the bottom or the top?
def execute_command(image) -> str:
    image_patch = ImagePatch(image)
    bread_patches = image_patch.find("bread")
    # Question assumes only one bread patch
    if len(bread_patches) == 0:
        # If no bread is found, query the image directly
        return image_patch.simple_query("In which part is the bread, the bottom or the top?")
    if bread_patches[0].vertical_center < image_patch.vertical_center:
        return "bottom"
    else:
        return "top"


# Given an image: Find foo to bottom left.
def execute_command(image) -> ImagePatch:
    # Return the foo
    image_patch = ImagePatch(image)
    foo_patches = image_patch.find("foo")
    if len(foo_patches) == 0:
        # If no foo is found, fall back to the whole image so min() below is never given an empty list
        foo_patches = [image_patch]
    lowermost_coordinate = min([patch.vertical_center for patch in foo_patches])
    # Keep the patches whose center is within 100 of the lowest one
    foo_patches_bottom = [patch for patch in foo_patches if patch.vertical_center - lowermost_coordinate < 100]
    if len(foo_patches_bottom) == 0:
        foo_patches_bottom = foo_patches
    elif len(foo_patches_bottom) == 1:
        return foo_patches_bottom[0]
    # Among the bottom patches, the leftmost one has the smallest horizontal_center
    foo_patches_bottom.sort(key=lambda foo: foo.horizontal_center)
    foo_patch = foo_patches_bottom[0]
    # Remember: return the foo
    return foo_patch


# Given an image: Find number 17.
def execute_command(image) -> ImagePatch:
    # Return the person
    image_patch = ImagePatch(image)
    person_patches = image_patch.find("person")
    if len(person_patches) == 0:
        # If no person is found, fall back to the whole image so the indexing below cannot fail
        person_patches = [image_patch]
    for patch in person_patches:
        if patch.exists("17"):
            return patch
    # Remember: return the person
    return person_patches[0]


# Given a list of images: Is the statement true? There is at least 1 image with a brown dog that is near a bicycle and is wearing a collar.
def execute_command(image_list) -> str:
    for image in image_list:
        image = ImagePatch(image)
        dog_patches = image.find("dog")
        for dog in dog_patches:
            # The statement is about a brown dog, so skip dogs of any other color
            if not dog.verify_property("dog", "brown"):
                continue
            near_bicycle = dog.simple_query("Is the dog near a bicycle?")
            wearing_collar = dog.simple_query("Is the dog wearing a collar?")
            if near_bicycle == "yes" and wearing_collar == "yes":
                return 'yes'
    return 'no'


# Given an image: Find dog to the left of the post who is closest to girl wearing a shirt with text that says "I love you".
def execute_command(image) -> ImagePatch:
    # Return the dog
    image_patch = ImagePatch(image)
    shirt_patches = image_patch.find("shirt")
    if len(shirt_patches) == 0:
        shirt_patches = [image_patch]
    shirt_patch = best_image_match(list_patches=shirt_patches, content=["I love you shirt"])
    post_patches = image_patch.find("post")
    if len(post_patches) == 0:
        # If no post is found, fall back to the whole image so the indexing below cannot fail
        post_patches = [image_patch]
    post_patches.sort(key=lambda post: distance(post, shirt_patch))
    post_patch = post_patches[0]
    dog_patches = image_patch.find("dog")
    if len(dog_patches) == 0:
        # If no dog is found, fall back to the whole image so the indexing below cannot fail
        dog_patches = [image_patch]
    dogs_left_patch = [dog for dog in dog_patches if dog.left < post_patch.left]
    if len(dogs_left_patch) == 0:
        dogs_left_patch = dog_patches
    dogs_left_patch.sort(key=lambda dog: distance(dog, post_patch))
    dog_patch = dogs_left_patch[0]
    # Remember: return the dog
    return dog_patch


# Given an image: Find balloon on the right and second from the bottom.
def execute_command(image) -> ImagePatch:
    # Return the balloon
    image_patch = ImagePatch(image)
    balloon_patches = image_patch.find("balloon")
    if len(balloon_patches) == 0:
        balloon_patches = [image_patch]
    if len(balloon_patches) == 1:
        # A single candidate (or the whole-image fallback) has no "second from the bottom"
        return balloon_patches[0]
    # Measure from the rightmost balloon so that the selection keeps the balloons on the right
    rightmost_coordinate = max([patch.horizontal_center for patch in balloon_patches])
    balloon_patches_right = [patch for patch in balloon_patches if rightmost_coordinate - patch.horizontal_center < 100]
    if len(balloon_patches_right) < 2:
        # Not enough balloons on the right to pick the second one, so consider every balloon
        balloon_patches_right = balloon_patches
    # Ascending vertical_center puts the bottommost balloon first, so index 1 is second from the bottom
    balloon_patches_right.sort(key=lambda p: p.vertical_center)
    balloon_patch = balloon_patches_right[1]
    # Remember: return the balloon
    return balloon_patch


# Given an image: Find girl in white next to man in left.
def execute_command(image) -> ImagePatch:
    # Return the girl
    image_patch = ImagePatch(image)
    girl_patches = image_patch.find("girl")
    if len(girl_patches) == 0:
        # If no girl is found, fall back to the whole image so the indexing below cannot fail
        girl_patches = [image_patch]
    girl_in_white_patches = [g for g in girl_patches if g.verify_property("girl", "white clothing")]
    if len(girl_in_white_patches) == 0:
        girl_in_white_patches = girl_patches
    man_patches = image_patch.find("man")
    if len(man_patches) == 0:
        # If no man is found, fall back to the whole image so the indexing below cannot fail
        man_patches = [image_patch]
    man_patches.sort(key=lambda man: man.horizontal_center)
    leftmost_man = man_patches[0]  # First from the left
    girl_in_white_patches.sort(key=lambda girl: distance(girl, leftmost_man))
    girl_patch = girl_in_white_patches[0]
    # Remember: return the girl
    return girl_patch


# Given a list of images: Is the statement true? There is 1 table that is in front of woman that is wearing jacket.
def execute_command(image_list) -> str:
    for image in image_list:
        image = ImagePatch(image)
        woman_patches = image.find("woman")
        for woman in woman_patches:
            if woman.simple_query("Is the woman wearing jacket?") == "yes":
                # The answer is decided by the first woman found wearing a jacket
                tables = woman.find("table")
                return bool_to_yesno(len(tables) == 1)
    return 'no'


# Given an image: Find top left.
def execute_command(image) -> ImagePatch:
    # Return the person
    image_patch = ImagePatch(image)
    # Figure out what thing the caption is referring to. We need a subject for every caption
    persons = image_patch.find("person")
    if len(persons) == 0:
        # If no person is found, fall back to the whole image so max() below is never given an empty list
        persons = [image_patch]
    top_all_objects = max([obj.vertical_center for obj in persons])
    # Select objects that are close to the top
    # We do this because the caption is asking first about vertical and then about horizontal
    persons_top = [p for p in persons if top_all_objects - p.vertical_center < 100]
    if len(persons_top) == 0:
        persons_top = persons
    # And after that, obtain the leftmost object among them
    persons_top.sort(key=lambda obj: obj.horizontal_center)
    person_leftmost = persons_top[0]
    # Remember: return the person
    return person_leftmost


# Given an image: What type of weather do you see in the photograph?
def execute_command(image) -> str:
    image_patch = ImagePatch(image)
    return image_patch.simple_query("What type of weather do you see in the photograph?")


# Given an image: How many orange life vests can be seen?
def execute_command(image) -> str:
    image_patch = ImagePatch(image)
    life_vest_patches = image_patch.find("life vest")
    orange_life_vest_patches = []
    for life_vest_patch in life_vest_patches:
        if life_vest_patch.verify_property('life vest', 'orange'):
            orange_life_vest_patches.append(life_vest_patch)
    return str(len(orange_life_vest_patches))


# Given an image: What is behind the pole?
def execute_command(image) -> str:
    image_patch = ImagePatch(image)
    # contains a relation (around, next to, on, near, on top of, in front of, behind, etc), so ask directly
    return image_patch.simple_query("What is behind the pole?")


# Given an image: Find second to top flower.
def execute_command(image) -> ImagePatch:
    # Return the flower
    image_patch = ImagePatch(image)
    flower_patches = image_patch.find("flower")
    if len(flower_patches) == 0:
        # If no flower is found, fall back to the whole image so the indexing below cannot fail
        flower_patches = [image_patch]
    if len(flower_patches) == 1:
        # With a single candidate there is no "second to top", so return it
        return flower_patches[0]
    # Ascending vertical_center puts the topmost flower last, so index -2 is second to top
    flower_patches.sort(key=lambda flower: flower.vertical_center)
    flower_patch = flower_patches[-2]
    # Remember: return the flower
    return flower_patch


# Given an image: Find back.
def execute_command(image) -> ImagePatch:
    # Return the person
    image_patch = ImagePatch(image)
    person_patches = image_patch.find("person")
    if len(person_patches) == 0:
        # If no person is found, fall back to the whole image so the indexing below cannot fail
        person_patches = [image_patch]
    # Ascending depth puts the patch with the largest median depth last
    person_patches.sort(key=lambda person: person.compute_depth())
    person_patch = person_patches[-1]
    # Remember: return the person
    return person_patch


# Given an image: Find chair at the front.
def execute_command(image) -> ImagePatch:
    # Return the chair
    image_patch = ImagePatch(image)
    chair_patches = image_patch.find("chair")
    if len(chair_patches) == 0:
        # If no chair is found, fall back to the whole image so the indexing below cannot fail
        chair_patches = [image_patch]
    # Ascending depth puts the patch with the smallest median depth first
    chair_patches.sort(key=lambda chair: chair.compute_depth())
    chair_patch = chair_patches[0]
    # Remember: return the chair
    return chair_patch


# Given an image: Find white and yellow pants.
def execute_command(image) -> ImagePatch:
    # Return the person
    image_patch = ImagePatch(image)
    # Clothing always requires returning the person
    person_patches = image_patch.find("person")
    person_patch = best_image_match(person_patches, ["white pants", "yellow pants"])
    # Remember: return the person
    return person_patch


# Given an image: Find cow facing the camera.
def execute_command(image) -> ImagePatch:
    # Return the cow
    image_patch = ImagePatch(image)
    cow_patches = image_patch.find("cow")
    if len(cow_patches) == 0:
        cow_patches = [image_patch]
    cow_patch = best_image_match(list_patches=cow_patches, content=["cow facing the camera"])
    # Remember: return the cow
    return cow_patch


# Given a list of images: Is the statement true? There is 1 image that contains exactly 3 blue papers.
def execute_command(image_list) -> str:
    image_cnt = 0
    for image in image_list:
        image = ImagePatch(image)
        paper_patches = image.find("paper")
        blue_paper_patches = []
        for paper in paper_patches:
            if paper.verify_property("paper", "blue"):
                blue_paper_patches.append(paper)
        if len(blue_paper_patches) == 3:
            image_cnt += 1
    return bool_to_yesno(image_cnt == 1)


# Given an image: Find black car just under stop sign.
def execute_command(image) -> ImagePatch:
    # Return the car
    image_patch = ImagePatch(image)
    stop_sign_patches = image_patch.find("stop sign")
    if len(stop_sign_patches) == 0:
        stop_sign_patches = [image_patch]
    stop_sign_patch = stop_sign_patches[0]
    car_patches = image_patch.find("black car")
    car_under_stop = []
    for car in car_patches:
        if car.upper < stop_sign_patch.upper:
            car_under_stop.append(car)
    if len(car_under_stop) == 0:
        # No car qualifies as being under the stop sign, so consider every black car
        car_under_stop = car_patches
    if len(car_under_stop) == 0:
        # No black car at all: fall back to the whole image so the indexing below cannot fail
        car_under_stop = [image_patch]
    # Find car that is closest to the stop sign. Use the absolute vertical gap: a signed difference is negative
    # for cars below the sign, so sorting on it would put the farthest car first.
    car_under_stop.sort(key=lambda car: abs(car.vertical_center - stop_sign_patch.vertical_center))
    # Remember: return the car
    return car_under_stop[0]


# Given a list of images: Is there either a standing man that is holding a cell phone or a sitting man that is holding a cell phone?
def execute_command(image_list) -> str:
    for image in image_list:
        image = ImagePatch(image)
        for man in image.find("man"):
            # Only men holding a cell phone are relevant to the question
            if man.simple_query("Is this man holding a cell phone?") != "yes":
                continue
            # Either posture satisfies the question
            if man.simple_query("Is this man sitting?") == "yes":
                return 'yes'
            if man.simple_query("Is this man standing?") == "yes":
                return 'yes'
    return 'no'


# Given an image: How many people are running while looking at their cell phone?
def execute_command(image) -> str:
    image_patch = ImagePatch(image)
    people_patches = image_patch.find("person")
    # Question assumes only one person patch
    if len(people_patches) == 0:
        # If no people are found, query the image directly
        return image_patch.simple_query("How many people are running while looking at their cell phone?")
    people_count = 0
    for person_patch in people_patches:
        # Verify two conditions: (1) running (2) looking at cell phone
        is_running = person_patch.simple_query("Is the person running?") == "yes"
        # The second question is only asked when the first one was answered with "yes"
        if is_running and person_patch.simple_query("Is the person looking at cell phone?") == "yes":
            people_count += 1
    return str(people_count)


# Given a list of images: Does the car that is on a highway and the car that is on a street have the same color?
def execute_command(image_list) -> str:
    color_1 = None
    color_2 = None
    for image in image_list:
        image = ImagePatch(image)
        for car_patch in image.find("car"):
            if car_patch.simple_query("Is the car on the highway?") == "yes":
                color_1 = car_patch.simple_query("What is the color of the car?")
                continue
            # Only cars that are not on the highway are checked for being on a street
            if car_patch.simple_query("Is the car on a street?") == "yes":
                color_2 = car_patch.simple_query("What is the color of the car?")
    return bool_to_yesno(color_1 == color_2)


# Given a list of images: Is the statement true? There are 3 magazine that are on table.
def execute_command(image_list) -> str:
    # Count, across all images, the magazines that are reported to be on a table
    count = 0
    for image in image_list:
        image = ImagePatch(image)
        for magazine_patch in image.find("magazine"):
            if magazine_patch.simple_query("Is the magazine on a table?") == "yes":
                count += 1
    return bool_to_yesno(count == 3)


# INSERT_QUERY_HERE