import json
from pathlib import Path

import numpy as np
from tqdm import tqdm

from infer_utils import run_inference_single, create_mask


def run_s2looking_inference(
    model,
    dataset_path,
    processor,
    tokenizer,
    conv_mode,
    use_video_data=False,
    open_prompt=None,
    repeat_frames=True,
    prompt_strategy="interleave",
    chronological_prefix=True,
    data_frac=1,
    data_size=None,
    delete_system_prompt=False,
    last_image=False,
    print_prompt=False,
    answer_path=None,
    start_ind=None,
    end_ind=None,
):
    # Load the S2Looking evaluation annotations (a JSON list of questions).
    dataset_file = Path(dataset_path)
    with open(dataset_file) as f:
        s2looking_data = json.load(f)

    # Optionally subsample the dataset, either to a fixed size or to a fraction.
    if data_size is not None:
        data_size = min(data_size, len(s2looking_data))
        idx = np.random.choice(len(s2looking_data), data_size, replace=False)
        s2looking_data = [s2looking_data[i] for i in idx]
    elif data_frac < 1:
        idx = np.random.choice(
            len(s2looking_data), int(len(s2looking_data) * data_frac), replace=False
        )
        s2looking_data = [s2looking_data[i] for i in idx]

    answers = {}
    for question in tqdm(s2looking_data):
        # Unpack the question: the first conversation turn is the prompt,
        # the second is the ground-truth answer.
        question_id = question["id"]
        inp = question["conversations"][0]["value"]
        answer_str = question["conversations"][1]["value"]
        metadata = question["metadata"]
        task = question["task"]
        image_paths = question["video"]
        original_input_polygon = question["original_input_polygon"]

        # Run single-sample inference over the image sequence for this question.
        outputs = run_inference_single(
            model=model,
            processor=processor,
            tokenizer=tokenizer,
            conv_mode=conv_mode,
            inp=inp,
            image_paths=image_paths,
            metadata=metadata,
            repeat_frames=repeat_frames,
            use_video_data=use_video_data,
            prompt_strategy=prompt_strategy,
            chronological_prefix=chronological_prefix,
            delete_system_prompt=delete_system_prompt,
            last_image=last_image,
            print_prompt=print_prompt,
        )

        # Store the prediction alongside the ground truth for later evaluation,
        # keyed by question id.
        answers[question_id] = {
            "predicted": outputs,
            "ground_truth": answer_str,
            "question": inp,
            "task": task,
            "original_input_polygon": original_input_polygon,
        }

    return answers
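

# Minimal usage sketch. The dataset path, conv_mode value, output filename,
# and the already-loaded model/processor/tokenizer objects below are
# assumptions for illustration; they are not defined in this module.
#
#     answers = run_s2looking_inference(
#         model=model,
#         dataset_path="data/s2looking_eval.json",
#         processor=processor,
#         tokenizer=tokenizer,
#         conv_mode="vicuna_v1",
#         data_size=100,
#     )
#     with open("s2looking_answers.json", "w") as f:
#         json.dump(answers, f, indent=2)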