ViTGaze / data /
yhsong's picture
initial commit
f9561b9 verified
import math
from os import path as osp
from typing import Callable, Optional
import glob
import torch
from import Dataset
import torchvision.transforms.functional as TF
import numpy as np
from PIL import Image, ImageOps
import pandas as pd
from .masking import MaskGenerator
from . import data_utils as utils
class VideoAttentionTargetVideo(Dataset):
def __init__(
# Gather one annotation table per sequence from anno_root and record the
# dataset configuration on the instance.
# NOTE(review): this copy of the file has lost its indentation and several
# lines; the `self` parameter and the closing `):` of this signature are among
# the missing pieces. Comments below describe only the lines that remain.
image_root: str,
anno_root: str,
head_root: str,
transform: Callable,
input_size: int,
output_size: int,
quant_labelmap: bool = True,
is_train: bool = True,
seq_len: int = 8,
max_len: int = 32,
mask_generator: Optional[MaskGenerator] = None,
# The six values below are thresholds: __getitem__ applies the matching
# augmentation when a uniform [0, 1) sample falls below the value.
bbox_jitter: float = 0.5,
rand_crop: float = 0.5,
rand_flip: float = 0.5,
color_jitter: float = 0.5,
rand_rotate: float = 0.0,
rand_lsj: float = 0.0,
dfs = []
# Annotation layout walked here: anno_root/<show>/<clip>/<sequence>.txt
for show_dir in glob.glob(osp.join(anno_root, "*")):
for sequence_path in glob.glob(osp.join(show_dir, "*", "*.txt")):
# NOTE(review): the arguments of this read_csv call are missing in this copy.
df = pd.read_csv(
# Show and clip names are taken from path components, which assumes "/" is
# the path separator.
show_name = sequence_path.split("/")[-3]
clip = sequence_path.split("/")[-2]
# Prefix each frame's file name with "<show>/<clip>/"; __getitem__ later joins
# this value onto the image and head-mask roots.
df["path"] = df["path"].apply(
lambda path: osp.join(show_name, clip, path)
cur_len = len(df.index)
# NOTE(review): the branch bodies and `else:` lines of the splitting logic
# below are incomplete in this copy. What remains shows sequences being cut
# into consecutive chunks of max_len rows in one place and seq_len rows in
# another; the exact conditions must be restored from the upstream file.
if is_train:
if cur_len <= max_len:
if cur_len >= seq_len:
remainder = cur_len % max_len
df_splits = [
df[i : i + max_len]
for i in range(0, cur_len - max_len, max_len)
if remainder >= seq_len:
if cur_len < seq_len:
df_splits = [
df[i : i + seq_len]
for i in range(0, cur_len - seq_len, seq_len)
# NOTE(review): the body of this loop is missing in this copy.
for df in dfs:
self.dfs = dfs
self.length = len(dfs)
self.data_dir = image_root
self.head_dir = head_root
self.transform = transform
# Pick the label-map drawing helper once, based on quant_labelmap.
self.draw_labelmap = (
utils.draw_labelmap if quant_labelmap else utils.draw_labelmap_no_quant
self.is_train = is_train
self.input_size = input_size
self.output_size = output_size
self.seq_len = seq_len
# Augmentation settings. NOTE(review): with the indentation lost it is not
# visible how many of the following assignments belong to this `if`.
if self.is_train:
self.bbox_jitter = bbox_jitter
self.rand_crop = rand_crop
self.rand_flip = rand_flip
self.color_jitter = color_jitter
self.rand_rotate = rand_rotate
self.rand_lsj = rand_lsj
self.mask_generator = mask_generator
def __getitem__(self, index):
# Build the sample for sequence `index`: draw the augmentation parameters once
# for the whole sequence, process the frames row by row, then stack the
# per-frame lists into tensors and return them in a dict.
# NOTE(review): indentation and a number of lines (else-branches, closing
# brackets, list appends, image-loading calls) are missing from this copy;
# the comments describe only the lines that remain.
df = self.dfs[index]
seq_len = len(df.index)
# Overwrite each box coordinate column with the output of
# utils.smooth_by_conv (first argument 11 — presumably a window size; confirm
# in data_utils).
# NOTE(review): `df` is the object stored in self.dfs, so this assignment
# modifies the stored table on every access.
for coord in ["x_min", "y_min", "x_max", "y_max"]:
df[coord] = utils.smooth_by_conv(11, df, coord)
if self.is_train:
# cond for data augmentation
cond_jitter = np.random.random_sample()
cond_flip = np.random.random_sample()
cond_color = np.random.random_sample()
# Brightness / contrast / saturation factors, each uniform in [0.5, 1.5).
if cond_color < self.color_jitter:
n1 = np.random.uniform(0.5, 1.5)
n2 = np.random.uniform(0.5, 1.5)
n3 = np.random.uniform(0.5, 1.5)
cond_crop = np.random.random_sample()
cond_rotate = np.random.random_sample()
# Rotation angle: uniform in [-20, 20) degrees, stored as negated radians.
if cond_rotate < self.rand_rotate:
angle = (2 * np.random.random_sample() - 1) * 20
angle = -math.radians(angle)
cond_lsj = np.random.random_sample()
# Downscale factor in [0.1, 1.0) used by the resize-and-pad step further down.
if cond_lsj < self.rand_lsj:
lsj_scale = 0.1 + np.random.random_sample() * 0.9
# if longer than seq_len_limit, cut it down to the limit with the init index randomly sampled
# NOTE(review): np.random.randint excludes its upper bound, so the last
# possible window start (seq_len - self.seq_len) is never drawn. The
# `sampled_ind = 0` line below presumably belongs to a missing `else:`.
if seq_len > self.seq_len:
sampled_ind = np.random.randint(0, seq_len - self.seq_len)
seq_len = self.seq_len
sampled_ind = 0
if cond_crop < self.rand_crop:
# Restrict the crop computation to the rows of the sampled window.
sliced_x_min = df["x_min"].iloc[sampled_ind : sampled_ind + seq_len]
sliced_x_max = df["x_max"].iloc[sampled_ind : sampled_ind + seq_len]
sliced_y_min = df["y_min"].iloc[sampled_ind : sampled_ind + seq_len]
sliced_y_max = df["y_max"].iloc[sampled_ind : sampled_ind + seq_len]
sliced_gaze_x = df["gaze_x"].iloc[sampled_ind : sampled_ind + seq_len]
sliced_gaze_y = df["gaze_y"].iloc[sampled_ind : sampled_ind + seq_len]
# The sum equals -2 * seq_len when every frame carries the (-1, -1) gaze
# value that the per-frame code below treats as "not inside".
check_sum = sliced_gaze_x.sum() + sliced_gaze_y.sum()
all_outside = check_sum == -2 * seq_len
# Calculate the minimum valid range of the crop that doesn't exclude the face and the gaze target
# NOTE(review): the `else:` between the box-only bounds and the box+gaze
# bounds, and the closing brackets of the np.min/np.max calls, are missing.
if all_outside:
crop_x_min = np.min([sliced_x_min.min(), sliced_x_max.min()])
crop_y_min = np.min([sliced_y_min.min(), sliced_y_max.min()])
crop_x_max = np.max([sliced_x_min.max(), sliced_x_max.max()])
crop_y_max = np.max([sliced_y_min.max(), sliced_y_max.max()])
crop_x_min = np.min(
[sliced_gaze_x.min(), sliced_x_min.min(), sliced_x_max.min()]
crop_y_min = np.min(
[sliced_gaze_y.min(), sliced_y_min.min(), sliced_y_max.min()]
crop_x_max = np.max(
[sliced_gaze_x.max(), sliced_x_min.max(), sliced_x_max.max()]
crop_y_max = np.max(
[sliced_gaze_y.max(), sliced_y_min.max(), sliced_y_max.max()]
# Randomly select a random top left corner
if crop_x_min >= 0:
crop_x_min = np.random.uniform(0, crop_x_min)
if crop_y_min >= 0:
crop_y_min = np.random.uniform(0, crop_y_min)
# Get image size
# NOTE(review): the right-hand side of `img =` is missing in this copy; the
# surrounding lines show the first frame being read only for its size.
path = osp.join(self.data_dir, df["path"].iloc[0])
img =
img = img.convert("RGB")
width, height = img.size
# Find the range of valid crop width and height starting from the (crop_x_min, crop_y_min)
crop_width_min = crop_x_max - crop_x_min
crop_height_min = crop_y_max - crop_y_min
crop_width_max = width - crop_x_min
crop_height_max = height - crop_y_min
# Randomly select a width and a height
crop_width = np.random.uniform(crop_width_min, crop_width_max)
crop_height = np.random.uniform(crop_height_min, crop_height_max)
# Round to integers
crop_y_min, crop_x_min, crop_height, crop_width = map(
int, map(round, (crop_y_min, crop_x_min, crop_height, crop_width))
# NOTE(review): presumably the non-training branch (start from the first
# frame); the `else:` that would introduce it is not present here.
sampled_ind = 0
# Per-frame accumulators, stacked into tensors at the end of the method.
images = []
head_channels = []
heatmaps = []
gazes = []
gaze_inouts = []
imsizes = []
head_masks = []
if self.is_train and self.mask_generator is not None:
image_masks = []
# `i` is the DataFrame index label of the row; it is compared against the
# positional window start — assumes the index runs 0..len-1, TODO confirm.
# NOTE(review): the statement that skips out-of-window frames is missing.
for i, row in df.iterrows():
if self.is_train and (i < sampled_ind or i >= (sampled_ind + self.seq_len)):
x_min = row["x_min"] # note: Already in image coordinates
y_min = row["y_min"] # note: Already in image coordinates
x_max = row["x_max"] # note: Already in image coordinates
y_max = row["y_max"] # note: Already in image coordinates
gaze_x = row["gaze_x"] # note: Already in image coordinates
gaze_y = row["gaze_y"] # note: Already in image coordinates
# Make sure min <= max on both axes.
if x_min > x_max:
x_min, x_max = x_max, x_min
if y_min > y_max:
y_min, y_max = y_max, y_min
path = row["path"]
# NOTE(review): the start of the image-loading expression is missing.
img =, path)).convert("RGB")
width, height = img.size
imsize = torch.FloatTensor([width, height])
# Since we finetune from weights trained on GazeFollow,
# we don't incorporate the auxiliary task for VAT.
# Use the head mask stored under head_dir when it exists, resized to the
# frame size; the all-zero float mask is presumably the missing `else:` case.
if osp.exists(osp.join(self.head_dir, path)):
head_mask =, path)).resize(
(width, height)
head_mask = Image.fromarray(np.zeros((height, width), dtype=np.float32))
x_min, y_min, x_max, y_max = map(float, [x_min, y_min, x_max, y_max])
gaze_x, gaze_y = map(float, [gaze_x, gaze_y])
# A gaze of exactly (-1, -1) is flagged as not inside; negative coordinates
# are clamped to 0 and flagged inside (the `else:` line is missing here).
if gaze_x == -1 and gaze_y == -1:
gaze_inside = False
if (
gaze_x < 0
): # move gaze point that was sliglty outside the image back in
gaze_x = 0
if gaze_y < 0:
gaze_y = 0
gaze_inside = True
if self.is_train:
## data augmentation
# Jitter (expansion-only) bounding box size.
# k lies in [0, 0.1 * bbox_jitter). The max-side updates use the already
# moved min values, so that side grows slightly more than the min side.
if cond_jitter < self.bbox_jitter:
k = cond_jitter * 0.1
x_min -= k * abs(x_max - x_min)
y_min -= k * abs(y_max - y_min)
x_max += k * abs(x_max - x_min)
y_max += k * abs(y_max - y_min)
# Clamp the box to the image bounds.
x_min = np.clip(x_min, 0, width - 1)
x_max = np.clip(x_max, 0, width - 1)
y_min = np.clip(y_min, 0, height - 1)
y_max = np.clip(y_max, 0, height - 1)
# Random color change
if cond_color < self.color_jitter:
img = TF.adjust_brightness(img, brightness_factor=n1)
img = TF.adjust_contrast(img, contrast_factor=n2)
img = TF.adjust_saturation(img, saturation_factor=n3)
# Random Crop
if cond_crop < self.rand_crop:
# Crop it
img = TF.crop(img, crop_y_min, crop_x_min, crop_height, crop_width)
head_mask = TF.crop(
head_mask, crop_y_min, crop_x_min, crop_height, crop_width
# Record the crop's (x, y) offset
offset_x, offset_y = crop_x_min, crop_y_min
# convert coordinates into the cropped frame
x_min, y_min, x_max, y_max = (
x_min - offset_x,
y_min - offset_y,
x_max - offset_x,
y_max - offset_y,
# Shift the gaze into the cropped frame when it is inside; the two -1
# assignments are presumably the missing `else:` branch.
if gaze_inside:
gaze_x, gaze_y = (gaze_x - offset_x), (gaze_y - offset_y)
gaze_x = -1
gaze_y = -1
width, height = crop_width, crop_height
# Flip?
if cond_flip < self.rand_flip:
img = img.transpose(Image.FLIP_LEFT_RIGHT)
head_mask = head_mask.transpose(Image.FLIP_LEFT_RIGHT)
# Mirror the box horizontally; the old min becomes the new max.
x_max_2 = width - x_min
x_min_2 = width - x_max
x_max = x_max_2
x_min = x_min_2
if gaze_x != -1 and gaze_y != -1:
gaze_x = width - gaze_x
# Random Rotation
if cond_rotate < self.rand_rotate:
# 2x3 affine matrix stored flat; the translation entries (indices 2 and 5)
# are filled in below. NOTE(review): the list in this copy is missing
# those entries and its closing bracket.
rot_mat = [
round(math.cos(angle), 15),
round(math.sin(angle), 15),
round(-math.sin(angle), 15),
round(math.cos(angle), 15),
# Apply the flat 2x3 affine matrix to the point (x, y).
def _transform(x, y, matrix):
return (
matrix[0] * x + matrix[1] * y + matrix[2],
matrix[3] * x + matrix[4] * y + matrix[5],
# Undo the affine map: remove the translation, then multiply by the
# transpose of the 2x2 part.
def _inv_transform(x, y, matrix):
x, y = x - matrix[2], y - matrix[5]
return (
matrix[0] * x + matrix[3] * y,
matrix[1] * x + matrix[4] * y,
# Calculate offsets
rot_center = (width / 2.0, height / 2.0)
rot_mat[2], rot_mat[5] = _transform(
-rot_center[0], -rot_center[1], rot_mat
rot_mat[2] += rot_center[0]
rot_mat[5] += rot_center[1]
# Map the four image corners to size the enlarged canvas (nw, nh).
# NOTE(review): the appends that fill xx / yy are missing in this copy.
xx = []
yy = []
for x, y in ((0, 0), (width, 0), (width, height), (0, height)):
x, y = _transform(x, y, rot_mat)
nw = math.ceil(max(xx)) - math.floor(min(xx))
nh = math.ceil(max(yy)) - math.floor(min(yy))
rot_mat[2], rot_mat[5] = _transform(
-(nw - width) / 2.0, -(nh - height) / 2.0, rot_mat
img = img.transform((nw, nh), Image.AFFINE, rot_mat, Image.BILINEAR)
head_mask = head_mask.transform(
(nw, nh), Image.AFFINE, rot_mat, Image.BILINEAR
# Map the box corners with _inv_transform and take their axis-aligned
# bounds, clamped to the new canvas.
xx = []
yy = []
for x, y in (
(x_min, y_min),
(x_min, y_max),
(x_max, y_min),
(x_max, y_max),
x, y = _inv_transform(x, y, rot_mat)
x_max, x_min = min(max(xx), nw), max(min(xx), 0)
y_max, y_min = min(max(yy), nh), max(min(yy), 0)
gaze_x, gaze_y = _inv_transform(gaze_x, gaze_y, rot_mat)
width, height = nw, nh
# Shrink the frame by lsj_scale, then pad the right and bottom edges so
# the overall size stays (width, height); coordinates scale with it.
if cond_lsj < self.rand_lsj:
nh, nw = int(height * lsj_scale), int(width * lsj_scale)
img = TF.resize(img, (nh, nw))
img = ImageOps.expand(img, (0, 0, width - nw, height - nh))
head_mask = TF.resize(head_mask, (nh, nw))
head_mask = ImageOps.expand(
head_mask, (0, 0, width - nw, height - nh)
x_min, y_min, x_max, y_max = (
x_min * lsj_scale,
y_min * lsj_scale,
x_max * lsj_scale,
y_max * lsj_scale,
gaze_x, gaze_y = gaze_x * lsj_scale, gaze_y * lsj_scale
# NOTE(review): the arguments of this call are missing in this copy.
head_channel = utils.get_head_box_channel(
# The mask generator receives the box normalised by the frame size.
if self.is_train and self.mask_generator is not None:
image_mask = self.mask_generator(
x_min / width,
y_min / height,
x_max / width,
y_max / height,
if self.transform is not None:
img = self.transform(img)
head_mask = TF.to_tensor(
TF.resize(head_mask, (self.input_size, self.input_size))
# Inside: normalise the gaze to [0, 1] fractions and draw the label map at
# output resolution. The zero map with gaze (-1, -1) is presumably the
# missing `else:` branch for out-of-frame gaze.
if gaze_inside:
gaze_x /= float(width) # fractional gaze
gaze_y /= float(height)
gaze_heatmap = torch.zeros(
self.output_size, self.output_size
) # set the size of the output
gaze_map = self.draw_labelmap(
[gaze_x * self.output_size, gaze_y * self.output_size],
gazes.append(torch.FloatTensor([gaze_x, gaze_y]))
gaze_map = torch.zeros(self.output_size, self.output_size)
gazes.append(torch.FloatTensor([-1, -1]))
# Stack each per-frame list along a new leading dimension.
# NOTE(review): apart from `gazes`, the appends that fill these lists are
# not present in this copy.
images = torch.stack(images)
head_channels = torch.stack(head_channels)
heatmaps = torch.stack(heatmaps)
gazes = torch.stack(gazes)
gaze_inouts = torch.stack(gaze_inouts)
head_masks = torch.stack(head_masks)
imsizes = torch.stack(imsizes)
out_dict = {
"images": images,
"head_channels": head_channels,
"heatmaps": heatmaps,
"gazes": gazes,
"gaze_inouts": gaze_inouts,
"head_masks": head_masks,
"imsize": imsizes,
# "image_masks" is only added in training when a mask generator is set.
if self.is_train and self.mask_generator is not None:
out_dict["image_masks"] = torch.stack(image_masks)
return out_dict
def __len__(self):
# Number of sequence tables collected by __init__ (self.length = len(dfs)).
return self.length
def video_collate(batch):
    """Merge a list of per-sequence sample dicts into a single batch dict.

    Args:
        batch: Non-empty sequence of dicts that all provide the keys of
            ``batch[0]``. Every value must be a tensor; for a given key the
            tensors must agree on all dimensions except the first.

    Returns:
        A dict with the keys of ``batch[0]``; each value is the samples'
        tensors for that key concatenated along dimension 0, so sequences of
        different lengths can share one batch.

    Raises:
        IndexError: If ``batch`` is empty.
    """
    keys = batch[0].keys()
    # The original comprehension had an unmatched ")" because the call that
    # wrapped the list was lost; torch.cat joins along the existing leading
    # dimension and, unlike torch.stack, accepts sequences of unequal length.
    return {key: torch.cat([item[key] for item in batch]) for key in keys}