import gc
import os
import shutil
import sys
import uuid

import torch
from pydub import AudioSegment
from scipy.io import savemat, loadmat

from Demo_TFR_Pirenderer.src.utils.preprocess import CropAndExtract
from Demo_TFR_Pirenderer.src.test_audio2coeff import Audio2Coeff
from Demo_TFR_Pirenderer.src.generate_batch import get_data
from Demo_TFR_Pirenderer.src.generate_facerender_batch import get_facerender_data
from Demo_TFR_Pirenderer.src.pirenderer.animate import AnimateFromCoeff


def mp3_to_wav(mp3_filename, wav_filename, frame_rate):
    """Convert an audio file to WAV at the requested frame rate.

    Args:
        mp3_filename: Path of the audio file to read (loaded with
            ``AudioSegment.from_file``).
        wav_filename: Destination path of the exported WAV file.
        frame_rate: Target frame rate passed to ``set_frame_rate``.
    """
    mp3_file = AudioSegment.from_file(file=mp3_filename)
    mp3_file.set_frame_rate(frame_rate).export(wav_filename, format="wav")


class OPT():
    """Audio-driven talking-face pipeline: image + audio -> rendered result.

    The constructor resolves every checkpoint/config path from the two base
    directories. With ``lazy_load=False`` the audio-to-coefficient and
    preprocessing models are built once in ``__init__``; with
    ``lazy_load=True`` they are built at the start of each ``test`` call and
    dropped again when that call finishes.
    """

    def __init__(self, checkpoint_path='checkpoints', config_path='src/config', lazy_load=False):
        """Resolve model paths and (unless lazy) load the models.

        Args:
            checkpoint_path: Directory holding the model checkpoint files.
                Also exported as the ``TORCH_HOME`` environment variable.
            config_path: Directory holding the YAML configuration files.
            lazy_load: When true, defer model construction to ``test``.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        os.environ['TORCH_HOME'] = checkpoint_path

        self.checkpoint_path = checkpoint_path
        self.config_path = config_path

        self.path_of_lm_croper = os.path.join(checkpoint_path, 'shape_predictor_68_face_landmarks.dat')
        self.path_of_net_recon_model = os.path.join(checkpoint_path, 'epoch_20.pth')
        self.dir_of_BFM_fitting = os.path.join(checkpoint_path, 'BFM_Fitting')
        self.wav2lip_checkpoint = os.path.join(checkpoint_path, 'wav2lip.pth')

        # NOTE(review): the "auido" spelling is kept on purpose -- these are
        # file names looked up on disk; presumably the shipped files use this
        # spelling. Verify before "correcting" it.
        self.audio2pose_checkpoint = os.path.join(checkpoint_path, 'auido2pose.pth')
        self.audio2pose_yaml_path = os.path.join(config_path, 'auido2pose.yaml')

        self.audio2exp_checkpoint = os.path.join(checkpoint_path, 'auido2exp.pth')
        self.audio2exp_yaml_path = os.path.join(config_path, 'auido2exp.yaml')

        self.pirenderer_checkpoint = os.path.join(checkpoint_path, 'epoch_00190_iteration_000400000_checkpoint.pt')
        self.pirenderer_yaml_path = os.path.join(config_path, 'face.yaml')

        self.lazy_load = lazy_load

        if not self.lazy_load:
            self._load_models()

    def _load_models(self):
        """Build the audio-to-coefficient and preprocessing models on ``self.device``."""
        self.audio_to_coeff = Audio2Coeff(self.audio2pose_checkpoint, self.audio2pose_yaml_path,
                                          self.audio2exp_checkpoint, self.audio2exp_yaml_path,
                                          self.wav2lip_checkpoint, self.device)
        self.preprocess_model = CropAndExtract(self.path_of_lm_croper, self.path_of_net_recon_model,
                                               self.dir_of_BFM_fitting, self.device)

    def _release_models(self):
        """Drop the lazily loaded models; tolerate a partially completed load."""
        for attr_name in ('preprocess_model', 'audio_to_coeff'):
            if hasattr(self, attr_name):
                delattr(self, attr_name)

    @staticmethod
    def _stage_audio(driven_audio, input_dir):
        """Place the driving audio in ``input_dir`` and return its new path.

        Files whose extension is ``.mp3`` (case-insensitive) are converted to
        a 16 kHz WAV; anything else is copied unchanged.

        Raises:
            AttributeError: If ``driven_audio`` is not an existing file.
        """
        if not os.path.isfile(driven_audio):
            raise AttributeError("error audio")

        audio_path = os.path.join(input_dir, os.path.basename(driven_audio))
        # Look only at the real extension: a substring test / str.replace on
        # the whole path would also rewrite ".mp3" inside directory names.
        stem, extension = os.path.splitext(audio_path)
        if extension.lower() == '.mp3':
            audio_path = stem + '.wav'
            mp3_to_wav(driven_audio, audio_path, 16000)
        else:
            shutil.copy(driven_audio, input_dir)
        return audio_path

    def test(self, source_image, driven_audio, preprocess='full', still_mode=False, result_dir='./results/'):
        """Run the full pipeline for one image/audio pair.

        Steps: stage the inputs in a fresh UUID-named directory under
        ``result_dir``, crop the image and extract its 3DMM coefficients,
        generate coefficients from the audio, then render with the
        PIRenderer model.

        Args:
            source_image: Path of the source image file.
            driven_audio: Path of the driving audio file.
            preprocess: Forwarded to ``CropAndExtract.generate`` and
                ``get_facerender_data``.
            still_mode: Forwarded to ``get_data`` (as ``still``) and
                ``get_facerender_data`` (as ``still_mode``).
            result_dir: Parent directory for this run's output directory.

        Returns:
            The value returned by ``AnimateFromCoeff.generate`` (presumably
            the path of the rendered result -- confirm against that class).

        Raises:
            AttributeError: If ``driven_audio`` is not a file, or if
                preprocessing returns no coefficient path (no face found).
        """
        try:
            if self.lazy_load:
                self._load_models()

            # NOTE(review): the renderer is rebuilt on every call and is not
            # released afterwards, even in lazy mode -- kept as in the
            # original; consider caching or releasing it.
            self.pirender = AnimateFromCoeff(self.pirenderer_checkpoint, self.pirenderer_yaml_path, self.device)

            time_tag = str(uuid.uuid4())
            save_dir = os.path.join(result_dir, time_tag)
            input_dir = os.path.join(save_dir, 'input')
            os.makedirs(input_dir, exist_ok=True)  # also creates save_dir

            pic_path = os.path.join(input_dir, os.path.basename(source_image))
            shutil.copy(source_image, input_dir)

            audio_path = self._stage_audio(driven_audio, input_dir)

            pose_style = 0

            # Crop the image and extract 3DMM coefficients from it.
            first_frame_dir = os.path.join(save_dir, 'first_frame_dir')
            os.makedirs(first_frame_dir, exist_ok=True)
            first_coeff_path, crop_pic_path, crop_info = self.preprocess_model.generate(
                pic_path, first_frame_dir, preprocess)

            if first_coeff_path is None:
                raise AttributeError("No face is detected")

            # Audio -> coefficients.
            # TODO: longer audio?
            batch = get_data(first_coeff_path, audio_path, self.device,
                             ref_eyeblink_coeff_path=None, still=still_mode)
            coeff_path = self.audio_to_coeff.generate(batch, save_dir, pose_style)

            # Coefficients -> video.
            batch_size = 1
            data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path,
                                       batch_size, still_mode=still_mode, preprocess=preprocess)
            return_path = self.pirender.generate(data, save_dir)
            return return_path
        finally:
            # Runs on failure too, so a raised error does not leave the lazily
            # loaded models resident.
            if self.lazy_load:
                self._release_models()
            # Collect first so tensors only reachable through reference cycles
            # are freed before the CUDA cache is emptied.
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()