# TalkingFaceGeneration/FONT/process_data.py
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 24 11:36:01 2021
@author: Xinya
"""
import os
import glob

import cv2
import dlib
import imageio
import numpy as np
from skimage import transform as tf

detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor('./shape_predictor_68_face_landmarks.dat')
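# The landmark model above is dlib's standard 68-point predictor; it can be
# downloaded from http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2
# and decompressed next to this script.
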
def save(path, frames, fmt):
    """Save frames as an .mp4 video or as numbered .png images."""
    if fmt == '.mp4':
        imageio.mimsave(path, frames)
    elif fmt == '.png':
        if not os.path.exists(path):
            os.makedirs(path)
        for j, frame in enumerate(frames):
            cv2.imwrite(os.path.join(path, str(j) + '.png'), frame)
    else:
        raise ValueError('Unknown format %s' % fmt)
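
# Note: cv2 produces BGR frames while imageio expects RGB, so the '.mp4'
# branch above writes swapped channels. A minimal color-safe sketch
# (save_mp4_rgb is a hypothetical helper, not part of the original pipeline):
def save_mp4_rgb(path, frames, fps=25):
    # convert each BGR frame to RGB before handing the list to imageio
    rgb_frames = [cv2.cvtColor(f, cv2.COLOR_BGR2RGB) for f in frames]
    imageio.mimsave(path, rgb_frames, fps=fps)
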
def crop_image(image_path, out_path):
    """Align a face image to the M003 landmark template and save a 256x256 crop."""
    template = np.load('./M003_template.npy')
    image = cv2.imread(image_path)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    rects = detector(gray, 1)  # detect human faces
    if len(rects) != 1:
        return 0
    for rect in rects:
        shape = predictor(gray, rect)  # detect the 68 facial landmarks
        shape = shape_to_np(shape)
        pts2 = np.float32(template[:47, :])  # first 47 template landmarks
        pts1 = np.float32(shape[:47, :])     # corresponding detected landmarks
        # Estimate the similarity transform mapping the template points onto
        # the detected points; warp() uses it as the inverse map, so the
        # output image is the face normalized onto the template.
        tform = tf.SimilarityTransform()
        tform.estimate(pts2, pts1)
        dst = tf.warp(image, tform, output_shape=(256, 256))
        dst = np.array(dst * 255, dtype=np.uint8)  # warp() returns floats in [0, 1]
        cv2.imwrite(out_path, dst)
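
# A minimal batch sketch over a folder of images (crop_image_dir and the
# '*.jpg' pattern are assumptions, mirroring the single-image call in
# __main__ below):
def crop_image_dir(in_dir, out_dir):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    for image_path in sorted(glob.glob(os.path.join(in_dir, '*.jpg'))):
        out_path = os.path.join(out_dir, os.path.basename(image_path))
        crop_image(image_path, out_path)
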
def shape_to_np(shape, dtype="int"):
    """Convert a dlib landmark detection into a (num_parts, 2) NumPy array."""
    # initialize the array of (x, y)-coordinates
    coords = np.zeros((shape.num_parts, 2), dtype=dtype)
    # loop over all facial landmarks and convert them
    # to 2-tuples of (x, y)-coordinates
    for i in range(shape.num_parts):
        coords[i] = (shape.part(i).x, shape.part(i).y)
    # return the array of (x, y)-coordinates
    return coords

def crop_image_tem(video_path, out_path):
    """Align every frame of a video to the M003 landmark template, saving PNGs."""
    image_all = []
    videoCapture = cv2.VideoCapture(video_path)
    success, frame = videoCapture.read()
    while success:
        image_all.append(frame)
        success, frame = videoCapture.read()
    videoCapture.release()
    if len(image_all) != 0:
        template = np.load('./M003_template.npy')
        # Estimate the alignment transform on the first frame only, then
        # apply it to every frame so the clip stays temporally stable.
        image = image_all[0]
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        rects = detector(gray, 1)  # detect human faces
        if len(rects) != 1:
            return 0
        for rect in rects:
            shape = predictor(gray, rect)  # detect the 68 facial landmarks
            shape = shape_to_np(shape)
            pts2 = np.float32(template[:47, :])  # first 47 template landmarks
            pts1 = np.float32(shape[:47, :])     # corresponding detected landmarks
            tform = tf.SimilarityTransform()
            tform.estimate(pts2, pts1)
        out = []
        for image in image_all:
            dst = tf.warp(image, tform, output_shape=(256, 256))
            dst = np.array(dst * 255, dtype=np.uint8)
            out.append(dst)
        save(out_path, out, '.png')
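
# Usage note: crop_image_tem() returns 0 when the first frame does not
# contain exactly one detectable face, and None on success, so a caller can
# skip failed clips, e.g.:
#     if crop_image_tem(video_path, out_path) == 0:
#         print('skipped %s: face detection failed' % video_path)
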
def proc_audio(src_mouth_path, dst_audio_path):
    """Extract the audio track as 16 kHz, 16-bit PCM WAV via ffmpeg."""
    audio_command = 'ffmpeg -i "{}" -loglevel error -y -f wav -acodec pcm_s16le ' \
                    '-ar 16000 "{}"'.format(src_mouth_path, dst_audio_path)
    os.system(audio_command)
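
# A shell-free variant using subprocess with the same ffmpeg flags (a sketch,
# not the interface the rest of the pipeline relies on):
def proc_audio_subprocess(src_mouth_path, dst_audio_path):
    import subprocess
    subprocess.run(
        ['ffmpeg', '-i', src_mouth_path, '-loglevel', 'error', '-y',
         '-f', 'wav', '-acodec', 'pcm_s16le', '-ar', '16000', dst_audio_path],
        check=True)  # raise CalledProcessError if ffmpeg fails
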
if __name__ == "__main__":
    # video alignment
    video_path = './test/crop/M030_sad_3_001.mp4'
    out_path = './test/crop/M030_sad_3_001'
    crop_image_tem(video_path, out_path)

    # image alignment
    image_path = './test/raw_image/brade2.jpg'
    out_path = './test/image/brade2.jpg'
    crop_image(image_path, out_path)

    # resample the audio to a 16 kHz WAV file
    src_mouth_path = './test/audio/00015.mp3'
    dst_audio_path = './test/audio/00015.wav'
    proc_audio(src_mouth_path, dst_audio_path)