"""Keypoint helpers and the per-frame animation loop used by the face renderer."""

import numpy as np
import torch
import torch.nn.functional as F
from scipy.spatial import ConvexHull
from torch.cuda.amp import autocast

try:
    from tqdm import tqdm
except ImportError:  # the progress bar is cosmetic; degrade to a plain iterator
    def tqdm(iterable, *args, **kwargs):
        """Fallback used when tqdm is not installed: iterate without a bar."""
        return iterable


# Number of classification bins consumed by ``headpose_pred_to_degree``.
_HEADPOSE_BINS = 66

# Degree -> radian factor uses 3.14 rather than math.pi. This is kept as-is on
# purpose: switching to the exact constant would change every rotation matrix
# slightly. NOTE(review): presumably pretrained weights were produced with this
# value -- confirm before "fixing" it.
_PI_APPROX = 3.14


def normalize_kp(kp_source, kp_driving, kp_driving_initial, adapt_movement_scale=False,
                 use_relative_movement=False, use_relative_jacobian=False):
    """Re-express driving keypoints relative to the source keypoints.

    Args:
        kp_source: dict with a ``'value'`` tensor (and ``'jacobian'`` when
            ``use_relative_jacobian`` is set).
        kp_driving: dict with the same keys for the current driving frame.
        kp_driving_initial: dict with the same keys for the first driving frame.
        adapt_movement_scale: when true, scale the keypoint displacement by
            ``sqrt(hull(source)) / sqrt(hull(driving_initial))`` computed on the
            first batch element only.
        use_relative_movement: when true, replace ``'value'`` with
            ``source + scale * (driving - driving_initial)``.
        use_relative_jacobian: when true, replace ``'jacobian'`` with
            ``driving @ inverse(driving_initial) @ source``.

    Returns:
        A new dict (shallow copy of ``kp_driving``) with the adjusted entries.
        The input dicts are not modified.
    """
    if adapt_movement_scale:
        # ConvexHull.volume is the enclosed area for 2-D points and the
        # enclosed volume for 3-D points.
        source_area = ConvexHull(kp_source['value'][0].detach().cpu().numpy()).volume
        driving_area = ConvexHull(kp_driving_initial['value'][0].detach().cpu().numpy()).volume
        movement_scale = float(np.sqrt(source_area) / np.sqrt(driving_area))
    else:
        movement_scale = 1

    kp_new = dict(kp_driving)

    if use_relative_movement:
        kp_value_diff = (kp_driving['value'] - kp_driving_initial['value']) * movement_scale
        kp_new['value'] = kp_value_diff + kp_source['value']

        # NOTE(review): indentation was lost in the collapsed source; nesting
        # the jacobian branch under ``use_relative_movement`` follows the
        # first-order-model reference implementation -- confirm.
        if use_relative_jacobian:
            jacobian_diff = torch.matmul(kp_driving['jacobian'],
                                         torch.inverse(kp_driving_initial['jacobian']))
            kp_new['jacobian'] = torch.matmul(jacobian_diff, kp_source['jacobian'])

    return kp_new


def headpose_pred_to_degree(pred):
    """Convert 66-bin head-pose logits to an angle in degrees.

    The softmax over the bin axis gives a distribution; its expected bin index
    is mapped to degrees with ``index * 3 - 99``.

    Args:
        pred: tensor whose dimension 1 holds the 66 bin logits.

    Returns:
        Tensor of expected angles with dimension 1 reduced away.
    """
    bin_index = torch.arange(_HEADPOSE_BINS, dtype=torch.float32, device=pred.device)
    # dim is explicit: the implicit-dim form is deprecated and, for 3-D input,
    # would normalise over dim 0 while the sum below reduces dim 1.
    probabilities = F.softmax(pred, dim=1)
    return torch.sum(probabilities * bin_index, 1) * 3 - 99


def get_rotation_matrix(yaw, pitch, roll):
    """Build batched 3x3 rotation matrices from Euler angles given in degrees.

    Args:
        yaw, pitch, roll: 1-D tensors of equal length ``bs`` (degrees).

    Returns:
        Tensor of shape ``(bs, 3, 3)`` equal to ``pitch_mat @ yaw_mat @ roll_mat``.
    """
    yaw = (yaw / 180 * _PI_APPROX).unsqueeze(1)
    pitch = (pitch / 180 * _PI_APPROX).unsqueeze(1)
    roll = (roll / 180 * _PI_APPROX).unsqueeze(1)

    # Rotation about the x axis.
    pitch_mat = torch.cat([
        torch.ones_like(pitch), torch.zeros_like(pitch), torch.zeros_like(pitch),
        torch.zeros_like(pitch), torch.cos(pitch), -torch.sin(pitch),
        torch.zeros_like(pitch), torch.sin(pitch), torch.cos(pitch),
    ], dim=1)
    pitch_mat = pitch_mat.view(pitch_mat.shape[0], 3, 3)

    # Rotation about the y axis.
    yaw_mat = torch.cat([
        torch.cos(yaw), torch.zeros_like(yaw), torch.sin(yaw),
        torch.zeros_like(yaw), torch.ones_like(yaw), torch.zeros_like(yaw),
        -torch.sin(yaw), torch.zeros_like(yaw), torch.cos(yaw),
    ], dim=1)
    yaw_mat = yaw_mat.view(yaw_mat.shape[0], 3, 3)

    # Rotation about the z axis.
    roll_mat = torch.cat([
        torch.cos(roll), -torch.sin(roll), torch.zeros_like(roll),
        torch.sin(roll), torch.cos(roll), torch.zeros_like(roll),
        torch.zeros_like(roll), torch.zeros_like(roll), torch.ones_like(roll),
    ], dim=1)
    roll_mat = roll_mat.view(roll_mat.shape[0], 3, 3)

    return torch.einsum('bij,bjk,bkm->bim', pitch_mat, yaw_mat, roll_mat)


def keypoint_transformation(kp_canonical, he, wo_exp=False):
    """Rotate, translate and deform canonical keypoints with head-pose data.

    Args:
        kp_canonical: dict with ``'value'`` of shape ``(bs, k, 3)``.
        he: dict with ``'t'`` (translation) and ``'exp'`` (expression offsets,
            reshaped here to ``(bs, -1, 3)``). For each of yaw/pitch/roll it
            must hold either the bin logits (``'yaw'`` ...) or an explicit
            angle override in degrees (``'yaw_in'`` ...); the override wins.
        wo_exp: when true the expression offsets are zeroed.

    Returns:
        ``{'value': transformed_keypoints}``. ``he`` and its tensors are left
        untouched.
    """
    kp = kp_canonical['value']  # (bs, k, 3)

    yaw = he['yaw_in'] if 'yaw_in' in he else headpose_pred_to_degree(he['yaw'])
    pitch = he['pitch_in'] if 'pitch_in' in he else headpose_pred_to_degree(he['pitch'])
    roll = he['roll_in'] if 'roll_in' in he else headpose_pred_to_degree(he['roll'])

    rot_mat = get_rotation_matrix(yaw, pitch, roll)  # (bs, 3, 3)

    exp = he['exp']
    if wo_exp:
        exp = exp * 0

    # keypoint rotation
    kp_rotated = torch.einsum('bmp,bkp->bkm', rot_mat, kp)

    # keypoint translation: only component 1 is kept. Work on a clone so the
    # caller's he['t'] is not overwritten by the in-place zeroing.
    t = he['t'].clone()
    t[:, 0] = t[:, 0] * 0
    t[:, 2] = t[:, 2] * 0
    kp_t = kp_rotated + t.unsqueeze(1)  # broadcasts over the k keypoints

    # add expression deviation
    exp = exp.view(exp.shape[0], -1, 3)
    kp_transformed = kp_t + exp

    return {'value': kp_transformed}


def make_animation(source_image, source_semantics, target_semantics,
                   generator, kp_detector, he_estimator, mapping,
                   yaw_c_seq=None, pitch_c_seq=None, roll_c_seq=None,
                   use_exp=True):
    """Render one generator output per driving frame and stack them.

    Args:
        source_image: input passed to ``kp_detector`` and ``generator``.
        source_semantics: input passed to ``mapping`` for the source pose.
        target_semantics: per-frame driving input; dimension 1 is the frame
            axis (``target_semantics[:, i]`` is fed to ``mapping``).
        generator: callable ``(source_image, kp_source=, kp_driving=)`` that
            returns a dict with a ``'prediction'`` tensor.
        kp_detector: callable returning the canonical keypoint dict.
        he_estimator: accepted for interface compatibility; not used here.
        mapping: callable returning the head-pose/expression dict consumed by
            ``keypoint_transformation``.
        yaw_c_seq, pitch_c_seq, roll_c_seq: optional per-frame angle overrides,
            indexed as ``seq[:, frame_idx]``.
        use_exp: accepted for interface compatibility; not used here.

    Returns:
        The per-frame ``'prediction'`` tensors stacked along a new dimension 1.
    """
    # Mixed precision and explicit synchronisation only make sense with CUDA;
    # calling torch.cuda.synchronize() on a CPU-only host raises.
    use_cuda = torch.cuda.is_available()

    with torch.no_grad():  # inference only
        predictions = []

        kp_canonical = kp_detector(source_image)
        he_source = mapping(source_semantics)
        kp_source = keypoint_transformation(kp_canonical, he_source)

        for frame_idx in tqdm(range(target_semantics.shape[1]), desc='Face Renderer:', unit='frame'):
            target_semantics_frame = target_semantics[:, frame_idx]
            he_driving = mapping(target_semantics_frame)

            if yaw_c_seq is not None:
                he_driving['yaw_in'] = yaw_c_seq[:, frame_idx]
            if pitch_c_seq is not None:
                he_driving['pitch_in'] = pitch_c_seq[:, frame_idx]
            if roll_c_seq is not None:
                he_driving['roll_in'] = roll_c_seq[:, frame_idx]

            kp_driving = keypoint_transformation(kp_canonical, he_driving)

            # The driving keypoints are used directly; normalize_kp is
            # intentionally not applied in this pipeline.
            kp_norm = kp_driving

            # Use mixed precision for faster computation (GPU only).
            with autocast(enabled=use_cuda):
                out = generator(source_image, kp_source=kp_source, kp_driving=kp_norm)

            predictions.append(out['prediction'])

            # Keeps the progress bar in step with actual GPU work.
            if use_cuda:
                torch.cuda.synchronize()

        # Stack predictions into a single tensor
        predictions_ts = torch.stack(predictions, dim=1)

    return predictions_ts


class AnimateModel(torch.nn.Module):
    """
    Merge all generator related updates into single model for better multi-gpu usage
    """

    def __init__(self, generator, kp_extractor, mapping):
        super().__init__()
        self.kp_extractor = kp_extractor
        self.generator = generator
        self.mapping = mapping

        # The wrapped networks are used for inference only.
        self.kp_extractor.eval()
        self.generator.eval()
        self.mapping.eval()

    def forward(self, x):
        """Run ``make_animation`` on a batch dict.

        Args:
            x: mapping with ``'source_image'``, ``'source_semantics'`` and
                ``'target_semantics'``; ``'yaw_c_seq'``, ``'pitch_c_seq'`` and
                ``'roll_c_seq'`` are optional and default to ``None``.

        Returns:
            The stacked predictions returned by ``make_animation``.
        """
        # Keyword arguments are used on purpose: make_animation takes both
        # ``he_estimator`` and ``mapping``, and passing ``self.mapping``
        # positionally would land in the ``he_estimator`` slot and leave
        # ``mapping`` unset.
        return make_animation(
            x['source_image'],
            x['source_semantics'],
            x['target_semantics'],
            generator=self.generator,
            kp_detector=self.kp_extractor,
            he_estimator=None,
            mapping=self.mapping,
            yaw_c_seq=x.get('yaw_c_seq'),
            pitch_c_seq=x.get('pitch_c_seq'),
            roll_c_seq=x.get('roll_c_seq'),
            use_exp=True,
        )