import argparse
import glob
import os

import cv2
import dlib
import face_alignment
import numpy as np
import torch

from dataset import MyDataset
from model import LipCoordNet


def get_position(size, padding=0.25):
    # Mean-face template: normalized coordinates of the 51 non-jawline
    # landmarks (eyebrows, nose, eyes, mouth) used as the alignment target.
    x = [
        # eyebrows
        0.000213256, 0.0752622, 0.18113, 0.29077, 0.393397,
        0.586856, 0.689483, 0.799124, 0.904991, 0.98004,
        # nose
        0.490127, 0.490127, 0.490127, 0.490127,
        0.36688, 0.426036, 0.490127, 0.554217, 0.613373,
        # eyes
        0.121737, 0.187122, 0.265825, 0.334606, 0.260918, 0.182743,
        0.645647, 0.714428, 0.793132, 0.858516, 0.79751, 0.719335,
        # mouth
        0.254149, 0.340985, 0.428858, 0.490127, 0.551395, 0.639268,
        0.726104, 0.642159, 0.556721, 0.490127, 0.423532, 0.338094,
        0.290379, 0.428096, 0.490127, 0.552157, 0.689874, 0.553364,
        0.490127, 0.42689,
    ]
    y = [
        # eyebrows
        0.106454, 0.038915, 0.0187482, 0.0344891, 0.0773906,
        0.0773906, 0.0344891, 0.0187482, 0.038915, 0.106454,
        # nose
        0.203352, 0.307009, 0.409805, 0.515625,
        0.587326, 0.609345, 0.628106, 0.609345, 0.587326,
        # eyes
        0.216423, 0.178758, 0.179852, 0.231733, 0.245099, 0.244077,
        0.231733, 0.179852, 0.178758, 0.216423, 0.244077, 0.245099,
        # mouth
        0.780233, 0.745405, 0.727388, 0.742578, 0.727388, 0.745405,
        0.780233, 0.864805, 0.902192, 0.909281, 0.902192, 0.864805,
        0.784792, 0.778746, 0.785343, 0.778746, 0.784792, 0.824182,
        0.831803, 0.824182,
    ]
    x, y = np.array(x), np.array(y)

    # rescale from the padded template space onto a size x size pixel grid
    x = (x + padding) / (2 * padding + 1)
    y = (y + padding) / (2 * padding + 1)
    x = x * size
    y = y * size
    return np.array(list(zip(x, y)))


def transformation_from_points(points1, points2):
    # Least-squares similarity transform (Procrustes analysis) mapping
    # points1 onto points2, returned as a 3x3 homogeneous matrix.
    points1 = points1.astype(np.float64)
    points2 = points2.astype(np.float64)

    # center and scale-normalize both point sets
    c1 = np.mean(points1, axis=0)
    c2 = np.mean(points2, axis=0)
    points1 -= c1
    points2 -= c2
    s1 = np.std(points1)
    s2 = np.std(points2)
    points1 /= s1
    points2 /= s2

    # optimal rotation from the SVD of the correlation matrix (the inputs
    # are np.matrix, so * is matrix multiplication here)
    U, S, Vt = np.linalg.svd(points1.T * points2)
    R = (U * Vt).T

    return np.vstack(
        [
            np.hstack(((s2 / s1) * R, c2.T - (s2 / s1) * R * c1.T)),
            np.matrix([0.0, 0.0, 1.0]),
        ]
    )


def load_video(file, device: str):
    # create the samples directory if it doesn't exist
    if not os.path.exists("samples"):
        os.makedirs("samples")
    p = "samples"
    output = os.path.join("samples", "%04d.jpg")

    # extract frames at 25 fps with ffmpeg
    cmd = "ffmpeg -hide_banner -loglevel error -i {} -qscale:v 2 -r 25 {}".format(
        file, output
    )
    os.system(cmd)

    files = os.listdir(p)
    files = sorted(files, key=lambda x: int(os.path.splitext(x)[0]))

    array = [cv2.imread(os.path.join(p, file)) for file in files]
    array = list(filter(lambda im: im is not None, array))

    fa = face_alignment.FaceAlignment(
        face_alignment.LandmarksType._2D, flip_input=False, device=device
    )
    points = [fa.get_landmarks(I) for I in array]

    front256 = get_position(256)
    video = []
    for point, scene in zip(points, array):
        if point is not None:
            shape = np.array(point[0])
            shape = shape[17:]  # drop the 17 jawline landmarks
            M = transformation_from_points(np.matrix(shape), np.matrix(front256))
            img = cv2.warpAffine(scene, M[:2], (256, 256))
            # crop a 160x80 patch centered on the mean mouth landmark
            (x, y) = front256[-20:].mean(0).astype(np.int32)
            w = 160 // 2
            img = img[y - w // 2 : y + w // 2, x - w : x + w, ...]
            img = cv2.resize(img, (128, 64))
            video.append(img)

    # (T, H, W, C) -> (C, T, H, W), scaled to [0, 1]
    video = np.stack(video, axis=0).astype(np.float32)
    video = torch.FloatTensor(video.transpose(3, 0, 1, 2)) / 255.0

    return video
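# --- illustrative sanity check, not called by the pipeline ------------------
# transformation_from_points solves an orthogonal Procrustes problem: after
# centering and scale-normalizing both point sets, the SVD of their
# correlation matrix yields the least-squares rotation, which is combined
# with the scale ratio and centroid offset into one similarity transform.
# The helper below (a hypothetical name, not part of the pipeline) verifies
# that property on synthetic points built from a known similarity transform.
def _check_similarity_transform():
    rng = np.random.default_rng(0)
    pts2 = rng.random((10, 2)) * 100.0  # target landmarks
    # build pts1 by rotating, scaling, and translating pts2
    theta = np.deg2rad(30)
    rot = np.array(
        [[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]
    )
    pts1 = (pts2 @ rot.T) * 0.5 + np.array([7.0, -3.0])
    M = transformation_from_points(np.matrix(pts1), np.matrix(pts2))
    # apply the 2x3 affine part to homogeneous row vectors
    ones = np.ones((len(pts1), 1))
    mapped = np.hstack([pts1, ones]) @ np.asarray(M[:2]).T
    # the recovered transform should map pts1 back onto pts2 almost exactly
    assert np.abs(mapped - pts2).max() < 1e-6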
def extract_lip_coordinates(detector, predictor, img_path):
    image = cv2.imread(img_path)
    image = cv2.resize(image, (600, 500))
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # detect the face; if we do not find exactly one, retry with increasing
    # upsampling so smaller faces have a chance of being detected
    rects = detector(gray)
    retries = 3
    upsample = 1
    while len(rects) != 1 and retries > 0:
        rects = detector(gray, upsample)
        upsample += 1
        retries -= 1
    assert len(rects) == 1, "expected exactly one face in {}".format(img_path)

    for rect in rects:
        # apply the shape predictor to the face ROI; landmarks 48-67 are
        # the mouth in the 68-point scheme
        shape = predictor(gray, rect)
        x = []
        y = []
        for n in range(48, 68):
            x.append(shape.part(n).x)
            y.append(shape.part(n).y)
    return [x, y]


def generate_lip_coordinates(frame_images_directory, detector, predictor):
    frames = glob.glob(frame_images_directory + "/*.jpg")
    frames.sort()

    img = cv2.imread(frames[0])
    height, width, layers = img.shape

    # build a (T, 20, 2) tensor of mouth landmarks, normalized by the raw
    # frame dimensions (the landmarks themselves are measured on the
    # 600x500 resize inside extract_lip_coordinates)
    coords = []
    for frame in frames:
        x_coords, y_coords = extract_lip_coordinates(detector, predictor, frame)
        normalized_coords = []
        for x, y in zip(x_coords, y_coords):
            normalized_x = x / width
            normalized_y = y / height
            normalized_coords.append((normalized_x, normalized_y))
        coords.append(normalized_coords)
    coords_array = np.array(coords, dtype=np.float32)
    coords_array = torch.from_numpy(coords_array)
    return coords_array


def ctc_decode(y):
    # greedy decoding: take the argmax class per time step and let
    # MyDataset.ctc_arr2txt collapse repeats and drop the CTC blank.
    # Decoding every prefix y[:i] yields one partial transcript per frame
    # for the video overlay; the last entry is the full-sequence decode.
    y = y.argmax(-1)
    t = y.size(0)
    result = []
    for i in range(t + 1):
        result.append(MyDataset.ctc_arr2txt(y[:i], start=1))
    return result


def output_video(p, txt, output_path):
    files = os.listdir(p)
    files = sorted(files, key=lambda x: int(os.path.splitext(x)[0]))

    font = cv2.FONT_HERSHEY_SIMPLEX

    for file, line in zip(files, txt):
        img = cv2.imread(os.path.join(p, file))
        h, w, _ = img.shape
        # draw the partial transcript twice: a thick black outline under a
        # thin white fill (putText requires a positive thickness)
        img = cv2.putText(
            img, line, (w // 8, 11 * h // 12), font, 1.2, (0, 0, 0), 3, cv2.LINE_AA
        )
        img = cv2.putText(
            img,
            line,
            (w // 8, 11 * h // 12),
            font,
            1.2,
            (255, 255, 255),
            1,
            cv2.LINE_AA,
        )
        h = h // 2
        w = w // 2
        img = cv2.resize(img, (w, h))
        cv2.imwrite(os.path.join(p, file), img)

    # create the output_videos directory if it doesn't exist
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    output = os.path.join(output_path, "output.mp4")

    # re-encode the annotated frames into a 25 fps video
    cmd = "ffmpeg -hide_banner -loglevel error -y -i {}/%04d.jpg -r 25 {}".format(
        p, output
    )
    os.system(cmd)
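# --- illustrative sketch, not called by the pipeline ------------------------
# ctc_decode above delegates the collapsing step to MyDataset.ctc_arr2txt.
# Assuming the LipNet-style convention this script appears to use (index 0 is
# the CTC blank and real labels start at 1, per start=1), the rule is: merge
# consecutive duplicates, then drop blanks. The helper below is a
# hypothetical, self-contained version of that rule; the letter table is an
# assumed example, not necessarily the dataset's actual alphabet.
def _greedy_ctc_collapse(indices, letters="ABCDEFGHIJKLMNOPQRSTUVWXYZ "):
    decoded = []
    prev = None
    for idx in indices:
        if idx != prev and idx != 0:  # skip repeats and the blank (index 0)
            decoded.append(letters[idx - 1])  # labels are offset past the blank
        prev = idx
    return "".join(decoded)


# e.g. _greedy_ctc_collapse([8, 8, 0, 9, 9]) == "HI"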
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--weights",
        type=str,
        default="pretrain/LipCoordNet_coords_loss_0.025581153109669685_wer_0.01746208431890914_cer_0.006488426950253695.pt",
        help="path to the weights file",
    )
    parser.add_argument(
        "--input_video",
        type=str,
        help="path to the input video file",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
        help="device to run the model on",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        default="output_videos",
        help="directory to save the output video",
    )
    args = parser.parse_args()

    # validate the requested device
    if args.device not in ("cuda", "cpu"):
        raise ValueError("Invalid device, must be either cuda or cpu")
    device = args.device

    # load model; map_location lets CPU-only machines load GPU-trained weights
    model = LipCoordNet()
    model.load_state_dict(torch.load(args.weights, map_location=device))
    model = model.to(device)
    model.eval()

    detector = dlib.get_frontal_face_detector()
    predictor = dlib.shape_predictor(
        "lip_coordinate_extraction/shape_predictor_68_face_landmarks_GTX.dat"
    )

    # extract, align, and crop the video frames
    video = load_video(args.input_video, device)

    # generate normalized lip coordinates from the extracted frames
    coords = generate_lip_coordinates("samples", detector, predictor)

    # inference only, so disable gradient tracking
    with torch.no_grad():
        pred = model(video[None, ...].to(device), coords[None, ...].to(device))
    output = ctc_decode(pred[0])

    print(output[-1])  # the last prefix decode is the full transcript
    output_video("samples", output, args.output_path)


if __name__ == "__main__":
    main()
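# Example invocation (assuming this file is saved as predict.py; the
# checkpoint name is a placeholder for the file shipped under pretrain/):
#
#   python predict.py \
#       --weights pretrain/<checkpoint>.pt \
#       --input_video path/to/clip.mpg \
#       --device cuda
#
# The script extracts 25 fps frames into samples/, aligns each face to the
# mean-face template, crops the mouth region, runs LipCoordNet on the frame
# tensor plus the normalized lip coordinates, prints the final greedy-CTC
# transcript, and writes the annotated video to output_videos/output.mp4.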