|
|
|
|
|
|
|
|
|
|
|
|
|
import numpy as np |
|
|
|
from typing import Iterable, Mapping, Tuple, Union |
|
|
|
|
|
def compute_tapvid_metrics(
    query_points: np.ndarray,
    gt_occluded: np.ndarray,
    gt_tracks: np.ndarray,
    pred_occluded: np.ndarray,
    pred_tracks: np.ndarray,
    query_mode: str,
) -> Mapping[str, np.ndarray]:
    """Computes TAP-Vid metrics (Jaccard, Pts. Within Thresh, Occ. Acc.)

    See the TAP-Vid paper for details on the metric computation. All inputs
    are given in raster coordinates. The first three arguments should be the
    direct outputs of the reader: the 'query_points', 'occluded', and
    'target_points'. The paper metrics assume these are scaled relative to
    256x256 images. pred_occluded and pred_tracks are your algorithm's
    predictions.

    This function takes a batch of inputs, and computes metrics separately
    for each video: every returned array has shape [b], and every metric
    (including occlusion accuracy) is normalized by that video's own counts.
    The metrics for the full benchmark are a simple mean of the metrics
    across the full set of videos. These numbers are between 0 and 1, but
    the paper multiplies them by 100 to ease reading.

    Args:
        query_points: The query points, in the format [t, y, x]. Its size is
            [b, n, 3], where b is the batch size and n is the number of
            queries. Only the t component is used here; it is rounded to
            the nearest integer frame index.
        gt_occluded: A boolean array of shape [b, n, t], where t is the
            number of frames. True indicates that the point is occluded.
        gt_tracks: The target points, of shape [b, n, t, 2]. Each point is
            in the format [x, y].
        pred_occluded: A boolean array of predicted occlusions, in the same
            format as gt_occluded.
        pred_tracks: An array of track predictions from your algorithm, in
            the same format as gt_tracks.
        query_mode: Either 'first' or 'strided', depending on how queries
            are sampled. If 'first', we assume the prior knowledge that all
            points before the query point are occluded, and these are
            removed from the evaluation.

    Returns:
        A dict with the following keys:

        occlusion_accuracy: Accuracy at predicting occlusion.
        pts_within_{x} for x in [1, 2, 4, 8, 16]: Fraction of points
            predicted to be within the given pixel threshold, ignoring
            occlusion prediction.
        jaccard_{x} for x in [1, 2, 4, 8, 16]: Jaccard metric for the given
            threshold.
        average_pts_within_thresh: average across pts_within_{x}.
        average_jaccard: average across jaccard_{x}.

    Raises:
        ValueError: If query_mode is neither 'first' nor 'strided'.

    Note:
        A video with no evaluated points (or no evaluated visible points)
        leads to a division by zero, which NumPy reports as NaN for that
        video rather than raising.
    """
    thresholds = (1, 2, 4, 8, 16)
    metrics = {}

    # Build a [t, t] lookup table: row q marks (with 1) the frames that are
    # evaluated for a query placed on frame q.
    eye = np.eye(gt_tracks.shape[2], dtype=np.int32)
    if query_mode == "first":
        # Evaluate only the frames strictly after the query frame.
        query_frame_to_eval_frames = np.cumsum(eye, axis=1) - eye
    elif query_mode == "strided":
        # Evaluate every frame except the query frame itself.
        query_frame_to_eval_frames = 1 - eye
    else:
        raise ValueError("Unknown query mode " + query_mode)

    # Index the table with each query's frame to get a [b, n, t] mask.
    query_frame = np.round(query_points[..., 0]).astype(np.int32)
    evaluation_points = query_frame_to_eval_frames[query_frame] > 0

    # Occlusion accuracy: how often the predicted occlusion flag equals the
    # ground truth. The denominator is reduced over the same axes as the
    # numerator so that each video is normalized by its own point count
    # instead of by the total over the whole batch.
    num_evaluated = np.sum(evaluation_points, axis=(1, 2))
    occ_acc = (
        np.sum(
            np.equal(pred_occluded, gt_occluded) & evaluation_points,
            axis=(1, 2),
        )
        / num_evaluated
    )
    metrics["occlusion_accuracy"] = occ_acc

    visible = np.logical_not(gt_occluded)
    pred_visible = np.logical_not(pred_occluded)

    # These quantities do not depend on the threshold, so compute them once.
    squared_error = np.sum(np.square(pred_tracks - gt_tracks), axis=-1)
    count_visible_points = np.sum(visible & evaluation_points, axis=(1, 2))

    all_frac_within = []
    all_jaccard = []
    for thresh in thresholds:
        # Compare squared distances to avoid a square root. A prediction is
        # "correct" when it is within the threshold and the ground-truth
        # point is visible.
        within_dist = squared_error < np.square(thresh)
        is_correct = np.logical_and(within_dist, visible)

        # Position accuracy over ground-truth-visible points, regardless of
        # what the model predicted for occlusion.
        count_correct = np.sum(is_correct & evaluation_points, axis=(1, 2))
        frac_correct = count_correct / count_visible_points
        metrics["pts_within_" + str(thresh)] = frac_correct
        all_frac_within.append(frac_correct)

        # Jaccard = TP / (GT positives + FP).
        # True positive: predicted visible, truly visible, within threshold.
        true_positives = np.sum(
            is_correct & pred_visible & evaluation_points, axis=(1, 2)
        )
        # False positive: predicted visible, but the ground truth is either
        # occluded or farther away than the threshold.
        false_positives = (~visible) & pred_visible
        false_positives = false_positives | ((~within_dist) & pred_visible)
        false_positives = np.sum(
            false_positives & evaluation_points, axis=(1, 2)
        )
        # Ground-truth positives are exactly the visible evaluated points.
        jaccard = true_positives / (count_visible_points + false_positives)
        metrics["jaccard_" + str(thresh)] = jaccard
        all_jaccard.append(jaccard)

    metrics["average_jaccard"] = np.mean(
        np.stack(all_jaccard, axis=1),
        axis=1,
    )
    metrics["average_pts_within_thresh"] = np.mean(
        np.stack(all_frac_within, axis=1),
        axis=1,
    )
    return metrics
|
|