File size: 2,455 Bytes
134cb11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import json
from tqdm import tqdm
from pathlib import Path

from infer_utils import run_inference_single
import numpy as np



# Legacy location of the incremental progress log. It is only used when the
# caller does not pass ``answer_path``, so existing call sites behave as before.
_LEGACY_PROGRESS_LOG = '/deep/u/joycech/aicc-working/geovlm_xbd_localization.json'


def _subsample(records, data_size, data_frac):
    """Return a random subset of *records* drawn without replacement.

    ``data_size`` takes precedence over ``data_frac``. When neither requests a
    reduction (``data_size is None`` and ``data_frac >= 1``) the input list is
    returned unchanged and no random numbers are consumed.
    """
    total = len(records)
    if data_size is not None:
        keep = min(data_size, total)
    elif data_frac < 1:
        keep = int(total * data_frac)
    else:
        return records
    chosen = np.random.choice(total, keep, replace=False)
    return [records[i] for i in chosen]


def _append_progress_record(log_path, question_id, record):
    """Append ``{question_id: record}`` as JSON, followed by a comma, to *log_path*.

    The file is reopened for every record so that each answer is on disk as
    soon as it is produced. The resulting file is a sequence of
    comma-terminated objects, not a valid JSON document on its own.
    """
    with open(log_path, 'a') as f:
        f.write(json.dumps({question_id: record}) + ',')


def run_xbd_inference(
    model,
    dataset_path,
    processor,
    tokenizer,
    conv_mode,
    use_video_data=False,
    open_prompt=None,
    repeat_frames=None,
    prompt_strategy="interleave",
    chronological_prefix=True,
    data_frac=1,
    data_size=None,
    last_image=False,
    delete_system_prompt=False,
    print_prompt=False,
    answer_path=None,
    start_ind=None,
    end_ind=None,
):
    """Run single-sample inference over an xBD question file and collect answers.

    The dataset at ``dataset_path`` is a JSON list. Every item must provide the
    keys ``id``, ``conversations`` (entry 0 holds the prompt and entry 1 the
    ground-truth answer, each under ``value``), ``metadata``, ``video``,
    ``task`` and ``original_input_polygon``.

    Args:
        model, processor, tokenizer, conv_mode: Forwarded unchanged to
            ``run_inference_single``.
        dataset_path: Path of the JSON question file.
        use_video_data, repeat_frames, prompt_strategy, chronological_prefix,
        last_image, print_prompt: Forwarded unchanged to
            ``run_inference_single``.
        data_frac: If below 1 (and ``data_size`` is None), evaluate only a
            random ``int(len(data) * data_frac)`` subset.
        data_size: If given, evaluate a random subset of at most this many
            items. Takes precedence over ``data_frac``.
        answer_path: If given, the incremental progress log is appended to
            ``"<answer_path>.partial"``. If None, the legacy hard-coded
            location is used. The log is deliberately not written to
            ``answer_path`` itself because it is not valid JSON and would
            corrupt a file the caller may write there
            (NOTE(review): confirm how the caller uses ``answer_path``).
        open_prompt, delete_system_prompt, start_ind, end_ind: Accepted but
            not used by this function.

    Returns:
        dict mapping each question id to a dict with the keys ``question``,
        ``predicted``, ``ground_truth``, ``task`` and
        ``original_input_polygon``.

    Raises:
        OSError: If the dataset cannot be read or the progress log cannot be
            appended to.
        KeyError: If a dataset item lacks one of the required keys.
    """
    # JSON interchange files are UTF-8; do not depend on the locale default.
    with open(dataset_path, encoding="utf-8") as f:
        xbd_data = json.load(f)

    xbd_data = _subsample(xbd_data, data_size, data_frac)

    if answer_path is None:
        progress_log = _LEGACY_PROGRESS_LOG
    else:
        progress_log = f"{answer_path}.partial"

    answers = {}
    for question in tqdm(xbd_data):
        question_id = question["id"]
        inp = question["conversations"][0]['value']
        answer_str = question["conversations"][1]['value']

        # Look up every required field before the (expensive) model call so a
        # malformed item fails fast.
        metadata = question['metadata']
        image_paths = question['video']
        task = question['task']
        original_input_polygon = question['original_input_polygon']

        # TODO: check if you want to add closed framing for yes/no questions
        outputs = run_inference_single(
            model=model,
            processor=processor,
            tokenizer=tokenizer,
            conv_mode=conv_mode,
            inp=inp,
            image_paths=image_paths,
            metadata=metadata,
            repeat_frames=repeat_frames,
            use_video_data=use_video_data,
            prompt_strategy=prompt_strategy,
            chronological_prefix=chronological_prefix,
            last_image=last_image,
            print_prompt=print_prompt,
        )

        answers[question_id] = {
            "question": inp,
            "predicted": outputs,
            "ground_truth": answer_str,
            "task": task,
            "original_input_polygon": original_input_polygon,
        }

        # Record each answer as inference runs so partial results survive a crash.
        _append_progress_record(progress_log, question_id, answers[question_id])

    return answers