diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d550eaaeaf3ec284e5299a9a2b995efebadc790c --- /dev/null +++ b/.gitignore @@ -0,0 +1,153 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ \ No newline at end of file diff --git a/app.py b/app.py index 02cfb17df79ef9ec49c1f9a884450e6eee73c151..f30bd2bcfd5bb0f220bde2af8bad9b802329a0cb 100644 --- a/app.py +++ b/app.py @@ -27,15 +27,15 @@ def sadtalker_demo(result_dir='./tmp/'): Homepage       \ Github ") - with gr.Row().style(equal_height=False): + with gr.Row(): with gr.Column(variant='panel'): with gr.Tabs(elem_id="sadtalker_source_image"): with gr.TabItem('Upload image'): with gr.Row(): - source_image = gr.Image(label="Source image", source="upload", type="filepath").style(height=256,width=256) + source_image = gr.Image(label="Source image", source="upload", type="filepath").style(height=256) with gr.Tabs(elem_id="sadtalker_driven_audio"): - with gr.TabItem('Upload audio(wav only currently)'): + with gr.TabItem('Upload audio (wav/mp3 only currently)'): with gr.Column(variant='panel'): driven_audio = gr.Audio(label="Input audio", source="upload", type="filepath") @@ -43,12 +43,13 @@ def sadtalker_demo(result_dir='./tmp/'): with gr.Tabs(elem_id="sadtalker_checkbox"): with gr.TabItem('Settings'): with gr.Column(variant='panel'): - is_still_mode = gr.Checkbox(label="w/ Still Mode (fewer head motion)") - enhancer = gr.Checkbox(label="w/ GFPGAN as Face enhancer") + is_still_mode = gr.Checkbox(label="Still Mode (less head motion)").style(container=True) + is_resize_mode = gr.Checkbox(label="Resize Mode (⚠️ requires a manually cropped image first; can handle larger crops)").style(container=True) + is_enhance_mode = gr.Checkbox(label="Enhance Mode (better face quality)").style(container=True) submit = gr.Button('Generate', elem_id="sadtalker_generate", variant='primary') with gr.Tabs(elem_id="sadtalker_genearted"): - gen_video = gr.Video(label="Generated video", format="mp4").style(height=256,width=256) + gen_video = gr.Video(label="Generated video", format="mp4").style(width=256) gen_text = gr.Textbox(visible=False) with gr.Row(): @@ -57,7 +58,22 @@ def sadtalker_demo(result_dir='./tmp/'): 'examples/source_image/art_10.png', 'examples/driven_audio/deyu.wav', True, + False, False + ], + [ + 'examples/source_image/art_1.png', + 'examples/driven_audio/fayu.wav', + True, + True, + False + ], + [ + 'examples/source_image/art_9.png', + 'examples/driven_audio/itosinger1.wav', + True, + False, + True ] ] gr.Examples(examples=examples, @@ -65,7 +81,8 @@ def sadtalker_demo(result_dir='./tmp/'): source_image, driven_audio, is_still_mode, - enhancer, + is_resize_mode, + is_enhance_mode, gr.Textbox(value=result_dir, visible=False)], outputs=[gen_video, gen_text], fn=sad_talker.test, @@ -76,7 +93,8 @@ def sadtalker_demo(result_dir='./tmp/'): inputs=[source_image, driven_audio, is_still_mode, - enhancer, + is_resize_mode, + is_enhance_mode, gr.Textbox(value=result_dir, visible=False)], outputs=[gen_video, gen_text] ) diff --git a/modules/__pycache__/sadtalker_test.cpython-38.pyc b/modules/__pycache__/sadtalker_test.cpython-38.pyc index
5eb11acf9e267816b4086e1b582013476729c533..6377ea45cbb1a59c93ae5dd63fb699a3f4288be2 100644 Binary files a/modules/__pycache__/sadtalker_test.cpython-38.pyc and b/modules/__pycache__/sadtalker_test.cpython-38.pyc differ diff --git a/modules/__pycache__/text2speech.cpython-38.pyc b/modules/__pycache__/text2speech.cpython-38.pyc index 4b4bb06e0b09743093bac9edf6c0f5a16acac5f8..90ad4127ce0050c2215bdb797974ad849d12a96c 100644 Binary files a/modules/__pycache__/text2speech.cpython-38.pyc and b/modules/__pycache__/text2speech.cpython-38.pyc differ diff --git a/modules/sadtalker_test.py b/modules/sadtalker_test.py index f2404421dc132dcf05cddcb9422f47175f229bc9..f15e3aafe95aea1d082cbca4a3d5f4c6ce10fea4 100644 --- a/modules/sadtalker_test.py +++ b/modules/sadtalker_test.py @@ -60,7 +60,7 @@ class SadTalker(): facerender_yaml_path, device) self.device = device - def test(self, source_image, driven_audio, still_mode, use_enhancer, result_dir='./'): + def test(self, source_image, driven_audio, still_mode, resize_mode, use_enhancer, result_dir='./'): time_tag = str(uuid.uuid4()) # strftime("%Y_%m_%d_%H.%M.%S") save_dir = os.path.join(result_dir, time_tag) @@ -91,7 +91,7 @@ class SadTalker(): #crop image and extract 3dmm from image first_frame_dir = os.path.join(save_dir, 'first_frame_dir') os.makedirs(first_frame_dir, exist_ok=True) - first_coeff_path, crop_pic_path = self.preprocess_model.generate(pic_path, first_frame_dir) + first_coeff_path, crop_pic_path, original_size = self.preprocess_model.generate(pic_path, first_frame_dir, crop_or_resize='resize' if resize_mode else 'crop') if first_coeff_path is None: raise AttributeError("No face is detected") @@ -101,7 +101,7 @@ class SadTalker(): #coeff2video batch_size = 4 data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path, batch_size, still_mode=still_mode) - self.animate_from_coeff.generate(data, save_dir, enhancer='gfpgan' if use_enhancer else None) + self.animate_from_coeff.generate(data, save_dir, enhancer='gfpgan' if use_enhancer else None, original_size=original_size) video_name = data['video_name'] print(f'The generated video is named {video_name} in {save_dir}') diff --git a/src/__pycache__/generate_batch.cpython-38.pyc b/src/__pycache__/generate_batch.cpython-38.pyc index 5032a29f3d4291f8f90539857bd726dc58679445..c68dd09e49933b52115307195bf3aa446d924922 100644 Binary files a/src/__pycache__/generate_batch.cpython-38.pyc and b/src/__pycache__/generate_batch.cpython-38.pyc differ diff --git a/src/__pycache__/generate_facerender_batch.cpython-38.pyc b/src/__pycache__/generate_facerender_batch.cpython-38.pyc index 3dbfc8c5c193db5bcb198d2fdd6c3775c9f1f9ff..6a30615ed3eaa5902a2fa553ed3ed17a9ae92a51 100644 Binary files a/src/__pycache__/generate_facerender_batch.cpython-38.pyc and b/src/__pycache__/generate_facerender_batch.cpython-38.pyc differ diff --git a/src/__pycache__/test_audio2coeff.cpython-38.pyc b/src/__pycache__/test_audio2coeff.cpython-38.pyc index 145f0df05b72a17711ca0d2bc8fa960f2e760d20..c2553cc97f50096d7c7005ad39274a8653cb6ad4 100644 Binary files a/src/__pycache__/test_audio2coeff.cpython-38.pyc and b/src/__pycache__/test_audio2coeff.cpython-38.pyc differ diff --git a/src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc b/src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc index 28f97d6138ff61dd5d40dbde67c8b62135623a9e..460563d74a990c40a3c5bd6f3209acca6d86b550 100644 Binary files a/src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc and
b/src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc differ diff --git a/src/audio2exp_models/__pycache__/networks.cpython-38.pyc b/src/audio2exp_models/__pycache__/networks.cpython-38.pyc index 76c987956e3de068b8817aca70cdd05479e82232..766660615f22f94c740dd420ccef83ed442c4fac 100644 Binary files a/src/audio2exp_models/__pycache__/networks.cpython-38.pyc and b/src/audio2exp_models/__pycache__/networks.cpython-38.pyc differ diff --git a/src/audio2exp_models/audio2exp.py b/src/audio2exp_models/audio2exp.py index 8231007799891ca4dd7f81b04226d82ddfab292d..5f6e6b77b0ceb2089539caa440f7106c7b1e8aa2 100644 --- a/src/audio2exp_models/audio2exp.py +++ b/src/audio2exp_models/audio2exp.py @@ -1,3 +1,4 @@ +from tqdm import tqdm import torch from torch import nn @@ -15,15 +16,24 @@ class Audio2Exp(nn.Module): bs = mel_input.shape[0] T = mel_input.shape[1] - ref = batch['ref'][:, :, :64].repeat((1,T,1)) #bs T 64 - ratio = batch['ratio_gt'] #bs T + exp_coeff_pred = [] - audiox = mel_input.view(-1, 1, 80, 16) # bs*T 1 80 16 - exp_coeff_pred = self.netG(audiox, ref, ratio) # bs T 64 + for i in tqdm(range(0, T, 10),'audio2exp:'): # every 10 frames + + current_mel_input = mel_input[:,i:i+10] + + ref = batch['ref'][:, :, :64].repeat((1,current_mel_input.shape[1],1)) #bs T 64 + ratio = batch['ratio_gt'][:, i:i+10] #bs T + + audiox = current_mel_input.view(-1, 1, 80, 16) # bs*T 1 80 16 + + curr_exp_coeff_pred = self.netG(audiox, ref, ratio) # bs T 64 + + exp_coeff_pred += [curr_exp_coeff_pred] # BS x T x 64 results_dict = { - 'exp_coeff_pred': exp_coeff_pred + 'exp_coeff_pred': torch.cat(exp_coeff_pred, axis=1) } return results_dict diff --git a/src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc b/src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc index 57e4e23d75532766340f6ebb4ebc3eb038b73564..20fa93168344012f0bdb77727b5b5669fac8a10b 100644 Binary files a/src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc and b/src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc differ diff --git a/src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc b/src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc index 253503524dff693f19b69d927124b99b3ac145e5..97d9bdf072c5bd356cc312357646c6eae2b798d0 100644 Binary files a/src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc and b/src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc differ diff --git a/src/audio2pose_models/__pycache__/cvae.cpython-38.pyc b/src/audio2pose_models/__pycache__/cvae.cpython-38.pyc index ff9c8bc5279ab0e4675f9d23344f1af8c126f57e..0d9aaee3ad4caa8afc40f723d224eb5b25e8afcd 100644 Binary files a/src/audio2pose_models/__pycache__/cvae.cpython-38.pyc and b/src/audio2pose_models/__pycache__/cvae.cpython-38.pyc differ diff --git a/src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc b/src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc index 12552f8a5cdc38657fa5769fb9055641b0c511f6..c7ebfcd0dd3538cedeb7eba984f94d9763b392c6 100644 Binary files a/src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc and b/src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc differ diff --git a/src/audio2pose_models/__pycache__/networks.cpython-38.pyc b/src/audio2pose_models/__pycache__/networks.cpython-38.pyc index 14e8a5b95b2b165c9a8e501ac6b87801dea9481f..239626089b91321b1c00cfba2dfe0a3ba1ccb0b9 100644 Binary files a/src/audio2pose_models/__pycache__/networks.cpython-38.pyc and b/src/audio2pose_models/__pycache__/networks.cpython-38.pyc differ diff --git 
a/src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc b/src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc index 1609cc2e18291e92dc25d290d47e215079c91972..0e6b40591fd932ddb2cf686b72afd08c90de1a44 100644 Binary files a/src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc and b/src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc differ diff --git a/src/audio2pose_models/audio2pose.py b/src/audio2pose_models/audio2pose.py index fc2499bd22f7fecabd3849a117a35d6a44d16269..3a37179e221340662a817628df3d01ae9e34404f 100644 --- a/src/audio2pose_models/audio2pose.py +++ b/src/audio2pose_models/audio2pose.py @@ -76,6 +76,7 @@ class Audio2Pose(nn.Module): batch['audio_emb'] = audio_emb batch = self.netG.test(batch) pose_motion_pred_list.append(batch['pose_motion_pred']) #list of bs seq_len 6 + if re != 0: z = torch.randn(bs, self.latent_dim).to(ref.device) batch['z'] = z diff --git a/src/audio2pose_models/audio_encoder.py b/src/audio2pose_models/audio_encoder.py index 8dc0f372a20f874ec7513d37b61859dc46e2669a..0ce036df119f86ef28c3ac8d6c834264571c309a 100644 --- a/src/audio2pose_models/audio_encoder.py +++ b/src/audio2pose_models/audio_encoder.py @@ -19,7 +19,7 @@ class Conv2d(nn.Module): return self.act(out) class AudioEncoder(nn.Module): - def __init__(self, wav2lip_checkpoint, device='cpu'): + def __init__(self, wav2lip_checkpoint): super(AudioEncoder, self).__init__() self.audio_encoder = nn.Sequential( @@ -42,7 +42,7 @@ class AudioEncoder(nn.Module): Conv2d(512, 512, kernel_size=1, stride=1, padding=0),) #### load the pre-trained audio_encoder\ - wav2lip_state_dict = torch.load(wav2lip_checkpoint, map_location=device)['state_dict'] + wav2lip_state_dict = torch.load(wav2lip_checkpoint)['state_dict'] state_dict = self.audio_encoder.state_dict() for k,v in wav2lip_state_dict.items(): diff --git a/src/face3d/__pycache__/extract_kp_videos.cpython-38.pyc b/src/face3d/__pycache__/extract_kp_videos.cpython-38.pyc index d8dba7f62d303d81702181344c6a01a2d1671592..0469c877400338fae921f4aedf1159b03abbb101 100644 Binary files a/src/face3d/__pycache__/extract_kp_videos.cpython-38.pyc and b/src/face3d/__pycache__/extract_kp_videos.cpython-38.pyc differ diff --git a/src/face3d/extract_kp_videos.py b/src/face3d/extract_kp_videos.py index f1e3b7ee33ed44eb487a5856d8f835e7473e9466..f12e9ec3488d99a29620b744beaa46814b66db8f 100644 --- a/src/face3d/extract_kp_videos.py +++ b/src/face3d/extract_kp_videos.py @@ -71,7 +71,7 @@ def read_video(filename): def run(data): filename, opt, device = data os.environ['CUDA_VISIBLE_DEVICES'] = device - kp_extractor = KeypointExtractor(device) + kp_extractor = KeypointExtractor() images = read_video(filename) name = filename.split('/')[-2:] os.makedirs(os.path.join(opt.output_dir, name[-2]), exist_ok=True) diff --git a/src/face3d/models/__pycache__/__init__.cpython-38.pyc b/src/face3d/models/__pycache__/__init__.cpython-38.pyc index 9226ef2a75f88c1bf2b87e8e173b3539d67e6b7e..886f0b184346c5530d0bf8d6f4b2300079511225 100644 Binary files a/src/face3d/models/__pycache__/__init__.cpython-38.pyc and b/src/face3d/models/__pycache__/__init__.cpython-38.pyc differ diff --git a/src/face3d/models/__pycache__/base_model.cpython-38.pyc b/src/face3d/models/__pycache__/base_model.cpython-38.pyc index 960d0df171da67b761b6b279c66ee32d1d698611..e42691ec8e26c5c38baf6bd0172dff8110754da1 100644 Binary files a/src/face3d/models/__pycache__/base_model.cpython-38.pyc and b/src/face3d/models/__pycache__/base_model.cpython-38.pyc differ diff --git 
a/src/face3d/models/__pycache__/networks.cpython-38.pyc b/src/face3d/models/__pycache__/networks.cpython-38.pyc index 00d82c97fa7a1dca5d0495fb1a2cbeb664f09813..1a97b5cd3309786e87448c4478ae2d19a18e096b 100644 Binary files a/src/face3d/models/__pycache__/networks.cpython-38.pyc and b/src/face3d/models/__pycache__/networks.cpython-38.pyc differ diff --git a/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc b/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc index 0acba565bc3928508199e41a8da56f09ad955ccc..83f6ad3ed4af3cc3d3cfa9067e345cdffb058638 100644 Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc and b/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc differ diff --git a/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc b/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc index 1d5ddd6e5ded7e34e7aaa8b3d945a0167454ee3c..f59247d26d9210b5fd2960df842753a903a90b3d 100644 Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc and b/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc differ diff --git a/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc b/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc index 46f43a3ae7bc4653ccbef934b56566b667ec0028..d8edc64d28aa3e3fb8c26ba795d04a8ef35b1540 100644 Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc and b/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc differ diff --git a/src/face3d/util/__pycache__/__init__.cpython-38.pyc b/src/face3d/util/__pycache__/__init__.cpython-38.pyc index 72c836e4ebd2b5baf1273f61fc024eb1e0347085..22771f3169f2da9a37c1bd619a0e5d05003492b9 100644 Binary files a/src/face3d/util/__pycache__/__init__.cpython-38.pyc and b/src/face3d/util/__pycache__/__init__.cpython-38.pyc differ diff --git a/src/face3d/util/__pycache__/load_mats.cpython-38.pyc b/src/face3d/util/__pycache__/load_mats.cpython-38.pyc index 0457c4f9ef0a394f926a5f792178dc9b25c60cfb..8a48b59ca078ef709825d54c069f518c15103c4e 100644 Binary files a/src/face3d/util/__pycache__/load_mats.cpython-38.pyc and b/src/face3d/util/__pycache__/load_mats.cpython-38.pyc differ diff --git a/src/face3d/util/__pycache__/preprocess.cpython-38.pyc b/src/face3d/util/__pycache__/preprocess.cpython-38.pyc index 8100839a4aafba54f8a6a034b03e0660a48368ec..7900dafbd8b74629c391eb8972f615650d4461df 100644 Binary files a/src/face3d/util/__pycache__/preprocess.cpython-38.pyc and b/src/face3d/util/__pycache__/preprocess.cpython-38.pyc differ diff --git a/src/facerender/__pycache__/animate.cpython-38.pyc b/src/facerender/__pycache__/animate.cpython-38.pyc index 91ca41e74ce67feb6bdb9948a59183d7b92eb3b9..11fb3d0ee467093c0cb318003c52eb4c78f11cc9 100644 Binary files a/src/facerender/__pycache__/animate.cpython-38.pyc and b/src/facerender/__pycache__/animate.cpython-38.pyc differ diff --git a/src/facerender/animate.py b/src/facerender/animate.py index e9743a6cdf8eaa60f7cb25d97d831d99638d855e..be2d62ebaeffe06a8dee1e268d832690b1937320 100644 --- a/src/facerender/animate.py +++ b/src/facerender/animate.py @@ -1,4 +1,5 @@ import os +import cv2 import yaml import numpy as np import warnings @@ -106,7 +107,7 @@ class AnimateFromCoeff(): return checkpoint['epoch'] - def generate(self, x, video_save_dir, enhancer=None): + def 
generate(self, x, video_save_dir, enhancer=None, original_size=None): source_image=x['source_image'].type(torch.FloatTensor) source_semantics=x['source_semantics'].type(torch.FloatTensor) @@ -137,6 +138,10 @@ class AnimateFromCoeff(): video.append(image) result = img_as_ubyte(video) + ### the generated video is 256x256, so we keep the aspect ratio, + if original_size: + result = [ cv2.resize(result_i,(256, int(256.0 * original_size[1]/original_size[0]) )) for result_i in result ] + video_name = x['video_name'] + '.mp4' path = os.path.join(video_save_dir, 'temp_'+video_name) imageio.mimsave(path, result, fps=float(25)) @@ -146,6 +151,10 @@ class AnimateFromCoeff(): av_path_enhancer = os.path.join(video_save_dir, video_name_enhancer) enhanced_path = os.path.join(video_save_dir, 'temp_'+video_name_enhancer) enhanced_images = face_enhancer(result, method=enhancer) + + if original_size: + enhanced_images = [ cv2.resize(result_i,(256, int(256.0 * original_size[1]/original_size[0]) )) for result_i in enhanced_images ] + imageio.mimsave(enhanced_path, enhanced_images, fps=float(25)) av_path = os.path.join(video_save_dir, video_name) diff --git a/src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc b/src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc index 35a44da77686faa8c09392f15f119518059cd288..5178c3763bc9f6fcff3a8a410deff7d3c30060db 100644 Binary files a/src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc and b/src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc differ diff --git a/src/facerender/modules/__pycache__/generator.cpython-38.pyc b/src/facerender/modules/__pycache__/generator.cpython-38.pyc index f18ac48e6c06f902f22d051588a9023bcd163dda..8d132f05d36e505f21c864d4c95931472ba58051 100644 Binary files a/src/facerender/modules/__pycache__/generator.cpython-38.pyc and b/src/facerender/modules/__pycache__/generator.cpython-38.pyc differ diff --git a/src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc b/src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc index 2eb85ab75af6c9d5c47d5a0ecbcb929310c2d902..ccc5d4543365bfc022a06a72d6ed9d388249279a 100644 Binary files a/src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc and b/src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc differ diff --git a/src/facerender/modules/__pycache__/make_animation.cpython-38.pyc b/src/facerender/modules/__pycache__/make_animation.cpython-38.pyc index de5ff5d0819a9aa6adc7d3aaa92343361a098048..1b54bcc293d742f70db165849b9764666b0f9a8b 100644 Binary files a/src/facerender/modules/__pycache__/make_animation.cpython-38.pyc and b/src/facerender/modules/__pycache__/make_animation.cpython-38.pyc differ diff --git a/src/facerender/modules/__pycache__/mapping.cpython-38.pyc b/src/facerender/modules/__pycache__/mapping.cpython-38.pyc index b8ff701e5c431228481c008a4110c97f45b4803a..7e1a2baa2bfab28fe7e3904f94a644633124b56c 100644 Binary files a/src/facerender/modules/__pycache__/mapping.cpython-38.pyc and b/src/facerender/modules/__pycache__/mapping.cpython-38.pyc differ diff --git a/src/facerender/modules/__pycache__/util.cpython-38.pyc b/src/facerender/modules/__pycache__/util.cpython-38.pyc index 92a821d2ec9c6c655bd82b333490adca2f020304..1e1c92955be38c880c52cc70b8051fd8ef4fa63a 100644 Binary files a/src/facerender/modules/__pycache__/util.cpython-38.pyc and b/src/facerender/modules/__pycache__/util.cpython-38.pyc differ diff --git a/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc 
b/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc index 1712d28c2f1564aecdec0971496514e6b9a85109..03d5fdb5ff0e14c08894b394b8c1cae7e1f324c4 100644 Binary files a/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc and b/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc differ diff --git a/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc b/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc index d204555804699761f59a2d639d89d33e0e1a8637..20a4560fc425087d5d63c70cc08fd12c2d8a7ea1 100644 Binary files a/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc and b/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc differ diff --git a/src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc b/src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc index efda35f1eb336382fc5cb5943db3aaeda8042de6..eb7252b8ad1b6aec2f5566979db0494f71a63d91 100644 Binary files a/src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc and b/src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc differ diff --git a/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc b/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc index 6fdcebc6259dc3c8ebf212c6d26bf9f00316ede7..30c9811579d75333db1b60fe4622f682013f719b 100644 Binary files a/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc and b/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc differ diff --git a/src/generate_batch.py b/src/generate_batch.py index 35a785b182ff4bfbdcc55fd8cfc7c5471cb9a1af..2d9e19b6aa4c19c13caf0a208e1189cd6c19f796 100644 --- a/src/generate_batch.py +++ b/src/generate_batch.py @@ -1,18 +1,11 @@ import os + +from tqdm import tqdm import torch import numpy as np import random import scipy.io as scio import src.utils.audio as audio -import subprocess, platform - -from pydub import AudioSegment - -def mp3_to_wav(mp3_filename,wav_filename,frame_rate): - mp3_file = AudioSegment.from_mp3(file=mp3_filename) - mp3_file.set_frame_rate(frame_rate).export(wav_filename,format="wav") - - def crop_pad_audio(wav, audio_length): if len(wav) > audio_length: @@ -33,7 +26,6 @@ def generate_blink_seq(num_frames): ratio = np.zeros((num_frames,1)) frame_id = 0 while frame_id in range(num_frames): - #start = random.choice(range(60,70)) start = 80 if frame_id+start+9<=num_frames - 1: ratio[frame_id+start:frame_id+start+9, 0] = [0.5,0.6,0.7,0.9,1, 0.9, 0.7,0.6,0.5] @@ -48,7 +40,6 @@ def generate_blink_seq_randomly(num_frames): return ratio frame_id = 0 while frame_id in range(num_frames): - #start = random.choice(range(60,70)) start = random.choice(range(min(10,num_frames), min(int(num_frames/2), 70))) if frame_id+start+5<=num_frames - 1: ratio[frame_id+start:frame_id+start+5, 0] = [0.5, 0.9, 1.0, 0.9, 0.5] @@ -60,8 +51,6 @@ def generate_blink_seq_randomly(num_frames): def get_data(first_coeff_path, audio_path, device): syncnet_mel_step_size = 16 - syncnet_T = 5 - MAX_FRAME = 32 fps = 25 pic_name = os.path.splitext(os.path.split(first_coeff_path)[-1])[0] @@ -71,23 +60,14 @@ def get_data(first_coeff_path, audio_path, device): source_semantics_dict = scio.loadmat(source_semantics_path) ref_coeff = source_semantics_dict['coeff_3dmm'][:1,:70] #1 70 - print(audio_path) - if '.mp3' in audio_path: - print(audio_path) - mp3_to_wav(audio_path, audio_path.replace('.mp3','.wav'), 16000) - new_audio = audio_path.replace('.mp3','.wav') - else: - new_audio = audio_path - - wav = 
audio.load_wav(new_audio, 16000) - + wav = audio.load_wav(audio_path, 16000) wav_length, num_frames = parse_audio_length(len(wav), 16000, 25) wav = crop_pad_audio(wav, wav_length) orig_mel = audio.melspectrogram(wav).T spec = orig_mel.copy() # nframes 80 indiv_mels = [] - for i in range(num_frames): + for i in tqdm(range(num_frames), 'mel:'): start_frame_num = i-2 start_idx = int(80. * (start_frame_num / float(fps))) end_idx = start_idx + syncnet_mel_step_size @@ -97,7 +77,6 @@ def get_data(first_coeff_path, audio_path, device): indiv_mels.append(m.T) indiv_mels = np.asarray(indiv_mels) # T 80 16 ratio = generate_blink_seq_randomly(num_frames) # T - indiv_mels = torch.FloatTensor(indiv_mels).unsqueeze(1).unsqueeze(0) # bs T 1 80 16 ratio = torch.FloatTensor(ratio).unsqueeze(0) # bs T diff --git a/src/gradio_demo.py b/src/gradio_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..4f78c97349652e23cf463c49527191fcec795564 --- /dev/null +++ b/src/gradio_demo.py @@ -0,0 +1,113 @@ +import torch, uuid +from time import gmtime, strftime +import os, sys, shutil +from src.utils.preprocess import CropAndExtract +from src.test_audio2coeff import Audio2Coeff +from src.facerender.animate import AnimateFromCoeff +from src.generate_batch import get_data +from src.generate_facerender_batch import get_facerender_data +from src.utils.text2speech import text2speech + +from pydub import AudioSegment + +def mp3_to_wav(mp3_filename,wav_filename,frame_rate): + mp3_file = AudioSegment.from_file(file=mp3_filename) + mp3_file.set_frame_rate(frame_rate).export(wav_filename,format="wav") + + +class SadTalker(): + + def __init__(self, checkpoint_path='checkpoints', config_path='src/config'): + + if torch.cuda.is_available() : + device = "cuda" + else: + device = "cpu" + + os.environ['TORCH_HOME']= checkpoint_path + + path_of_lm_croper = os.path.join( checkpoint_path, 'shape_predictor_68_face_landmarks.dat') + path_of_net_recon_model = os.path.join( checkpoint_path, 'epoch_20.pth') + dir_of_BFM_fitting = os.path.join( checkpoint_path, 'BFM_Fitting') + wav2lip_checkpoint = os.path.join( checkpoint_path, 'wav2lip.pth') + + audio2pose_checkpoint = os.path.join( checkpoint_path, 'auido2pose_00140-model.pth') + audio2pose_yaml_path = os.path.join( config_path, 'auido2pose.yaml') + + audio2exp_checkpoint = os.path.join( checkpoint_path, 'auido2exp_00300-model.pth') + audio2exp_yaml_path = os.path.join( config_path, 'auido2exp.yaml') + + free_view_checkpoint = os.path.join( checkpoint_path, 'facevid2vid_00189-model.pth.tar') + mapping_checkpoint = os.path.join( checkpoint_path, 'mapping_00229-model.pth.tar') + facerender_yaml_path = os.path.join( config_path, 'facerender.yaml') + + #init model + print(path_of_lm_croper) + self.preprocess_model = CropAndExtract(path_of_lm_croper, path_of_net_recon_model, dir_of_BFM_fitting, device) + + print(audio2pose_checkpoint) + self.audio_to_coeff = Audio2Coeff(audio2pose_checkpoint, audio2pose_yaml_path, + audio2exp_checkpoint, audio2exp_yaml_path, wav2lip_checkpoint, device) + print(free_view_checkpoint) + self.animate_from_coeff = AnimateFromCoeff(free_view_checkpoint, mapping_checkpoint, + facerender_yaml_path, device) + self.device = device + + def test(self, source_image, driven_audio, still_mode, use_enhancer, result_dir='./'): + + time_tag = str(uuid.uuid4()) + save_dir = os.path.join(result_dir, time_tag) + os.makedirs(save_dir, exist_ok=True) + + input_dir = os.path.join(save_dir, 'input') + os.makedirs(input_dir, exist_ok=True) + + print(source_image) + 
pic_path = os.path.join(input_dir, os.path.basename(source_image)) + shutil.move(source_image, input_dir) + + if os.path.isfile(driven_audio): + audio_path = os.path.join(input_dir, os.path.basename(driven_audio)) + + #### mp3 to wav + if '.mp3' in audio_path: + mp3_to_wav(driven_audio, audio_path.replace('.mp3', '.wav'), 16000) + audio_path = audio_path.replace('.mp3', '.wav') + else: + shutil.move(driven_audio, input_dir) + else: + # driven_audio is not an existing file: treat it as text and synthesize a wav (output file name chosen here is arbitrary) + audio_path = text2speech(driven_audio, os.path.join(input_dir, 'driven_audio.wav')) + + os.makedirs(save_dir, exist_ok=True) + pose_style = 0 + #crop image and extract 3dmm from image + first_frame_dir = os.path.join(save_dir, 'first_frame_dir') + os.makedirs(first_frame_dir, exist_ok=True) + first_coeff_path, crop_pic_path, original_size = self.preprocess_model.generate(pic_path, first_frame_dir) + + if first_coeff_path is None: + raise AttributeError("No face is detected") + + #audio2coeff + batch = get_data(first_coeff_path, audio_path, self.device) # longer audio? + coeff_path = self.audio_to_coeff.generate(batch, save_dir, pose_style) + #coeff2video + batch_size = 4 + data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path, batch_size, still_mode=still_mode) + self.animate_from_coeff.generate(data, save_dir, enhancer='gfpgan' if use_enhancer else None, original_size=original_size) + video_name = data['video_name'] + print(f'The generated video is named {video_name} in {save_dir}') + + torch.cuda.empty_cache() + torch.cuda.synchronize() + import gc; gc.collect() + + if use_enhancer: + return os.path.join(save_dir, video_name+'_enhanced.mp4'), os.path.join(save_dir, video_name+'_enhanced.mp4') + + else: + return os.path.join(save_dir, video_name+'.mp4'), os.path.join(save_dir, video_name+'.mp4') + + + \ No newline at end of file diff --git a/src/test_audio2coeff.py b/src/test_audio2coeff.py index 3de26514660d9a12853c45e4e5278c7cfce7a7cd..3db6be3af59b0319c50106d9a92c903118f28410 100644 --- a/src/test_audio2coeff.py +++ b/src/test_audio2coeff.py @@ -81,7 +81,7 @@ class Audio2Coeff(): savemat(os.path.join(coeff_save_dir, '%s##%s.mat'%(batch['pic_name'], batch['audio_name'])), {'coeff_3dmm': coeffs_pred_numpy}) - torch.cuda.empty_cache() + return os.path.join(coeff_save_dir, '%s##%s.mat'%(batch['pic_name'], batch['audio_name'])) diff --git a/src/utils/__pycache__/audio.cpython-38.pyc b/src/utils/__pycache__/audio.cpython-38.pyc index 71fec496164398d674b966d4146ed59d46ac7e0f..c9037ed6e9b29bf1f5ba29b25ed9c067103bb361 100644 Binary files a/src/utils/__pycache__/audio.cpython-38.pyc and b/src/utils/__pycache__/audio.cpython-38.pyc differ diff --git a/src/utils/__pycache__/croper.cpython-38.pyc b/src/utils/__pycache__/croper.cpython-38.pyc index 66904192497cde94208a03a315a9017c41850d1d..addfae662741dd661426427e2f29d506c399adba 100644 Binary files a/src/utils/__pycache__/croper.cpython-38.pyc and b/src/utils/__pycache__/croper.cpython-38.pyc differ diff --git a/src/utils/__pycache__/face_enhancer.cpython-38.pyc b/src/utils/__pycache__/face_enhancer.cpython-38.pyc index c2c91bd41ae88985f474551e32f12ba2b07bfc0e..51b465795f49c49c741a7fb510d02564337deb28 100644 Binary files a/src/utils/__pycache__/face_enhancer.cpython-38.pyc and b/src/utils/__pycache__/face_enhancer.cpython-38.pyc differ diff --git a/src/utils/__pycache__/hparams.cpython-38.pyc b/src/utils/__pycache__/hparams.cpython-38.pyc index d0e429fd2d0a979e1704da0a94b4cce691d2ccef..29278c1421204d040aa03f77ed43e18f9b60dad8 100644 Binary files a/src/utils/__pycache__/hparams.cpython-38.pyc and b/src/utils/__pycache__/hparams.cpython-38.pyc differ diff --git
a/src/utils/__pycache__/preprocess.cpython-38.pyc b/src/utils/__pycache__/preprocess.cpython-38.pyc index 58abea029b3e9d96718a6812624cbba961ef5202..e5e0b7f2a4c29050bfbb30405816311acd3060f0 100644 Binary files a/src/utils/__pycache__/preprocess.cpython-38.pyc and b/src/utils/__pycache__/preprocess.cpython-38.pyc differ diff --git a/src/utils/preprocess.py b/src/utils/preprocess.py index e5362752551bba79bfacb3dd304b47ff525f4e2f..4e3dad8d4a49080a3300f672965a11a8a2054fa2 100644 --- a/src/utils/preprocess.py +++ b/src/utils/preprocess.py @@ -1,5 +1,5 @@ import numpy as np -import cv2, os, sys,torch +import cv2, os, sys, torch from tqdm import tqdm from PIL import Image @@ -51,7 +51,7 @@ class CropAndExtract(): self.lm3d_std = load_lm3d(dir_of_BFM_fitting) self.device = device - def generate(self, input_path, save_dir): + def generate(self, input_path, save_dir, crop_or_resize='crop'): pic_size = 256 pic_name = os.path.splitext(os.path.split(input_path)[-1])[0] @@ -81,7 +81,7 @@ class CropAndExtract(): break x_full_frames = [cv2.cvtColor(full_frames[0], cv2.COLOR_BGR2RGB) ] - if True: + if crop_or_resize.lower() == 'crop': # default crop x_full_frames, crop, quad = self.croper.crop(x_full_frames, xsize=pic_size) clx, cly, crx, cry = crop lx, ly, rx, ry = quad @@ -90,7 +90,9 @@ class CropAndExtract(): original_size = (ox2 - ox1, oy2 - oy1) else: oy1, oy2, ox1, ox2 = 0, x_full_frames[0].shape[0], 0, x_full_frames[0].shape[1] - frames_pil = [Image.fromarray(cv2.resize(frame,(pic_size,pic_size))) for frame in x_full_frames] + original_size = (ox2 - ox1, oy2 - oy1) + + frames_pil = [Image.fromarray(cv2.resize(frame,(pic_size, pic_size))) for frame in x_full_frames] if len(frames_pil) == 0: print('No face is detected in the input file') - return None, None + return None, None, None @@ -110,7 +112,7 @@ class CropAndExtract(): if not os.path.isfile(coeff_path): # load 3dmm paramter generator from Deep3DFaceRecon_pytorch video_coeffs, full_coeffs = [], [] - for idx in tqdm(range(len(frames_pil)), desc=' 3DMM Extraction In Video:'): + for idx in tqdm(range(len(frames_pil)), desc='3DMM Extraction In Video:'): frame = frames_pil[idx] W,H = frame.size lm1 = lm[idx].reshape([-1, 2]) @@ -147,4 +149,4 @@ class CropAndExtract(): savemat(coeff_path, {'coeff_3dmm': semantic_npy, 'full_3dmm': np.array(full_coeffs)[0]}) - return coeff_path, png_path \ No newline at end of file + return coeff_path, png_path, original_size \ No newline at end of file diff --git a/src/utils/text2speech.py b/src/utils/text2speech.py new file mode 100644 index 0000000000000000000000000000000000000000..3ecaef36961494c8b2b1f5771a70b997efa04ffd --- /dev/null +++ b/src/utils/text2speech.py @@ -0,0 +1,11 @@ +import os + +def text2speech(txt, audio_path): + print(txt) + cmd = f'tts --text "{txt}" --out_path {audio_path}' + print(cmd) + status = os.system(cmd) + if status == 0: + return audio_path + print("Error: failed to convert text to audio") + return None \ No newline at end of file
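
For reference, the updated entry point in modules/sadtalker_test.py now threads a resize flag and the original image size from preprocessing through to rendering. A minimal invocation sketch, not part of the patch: it assumes the SadTalker constructor in modules/sadtalker_test.py takes no required arguments (mirroring the checkpoint/config defaults used in src/gradio_demo.py), reuses the example assets referenced in app.py, and the result_dir value is arbitrary. The two return values mirror the (video, text) outputs wired up in app.py.

# Hypothetical usage sketch for the new test() signature (illustrative only).
from modules.sadtalker_test import SadTalker

sad_talker = SadTalker()  # assumed defaults: ./checkpoints and src/config, as in src/gradio_demo.py

video_path, _ = sad_talker.test(
    source_image='examples/source_image/art_10.png',  # portrait to animate
    driven_audio='examples/driven_audio/deyu.wav',    # wav input (mp3 is converted via pydub in the gradio demo)
    still_mode=True,        # less head motion
    resize_mode=False,      # False -> crop preprocessing, True -> resize the full frame
    use_enhancer=False,     # True -> run GFPGAN on the rendered frames
    result_dir='./tmp/',
)
print(video_path)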
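The audio2exp change above replaces a single full-sequence forward pass with 10-frame windows whose predictions are concatenated along the time axis, which bounds memory use for long audio. A stripped-down sketch of that windowing pattern, with netG standing in for the actual expression network (its (audio, ref, ratio) -> (bs, t, 64) behaviour is taken from the diff; everything else here is illustrative):

import torch
from tqdm import tqdm

def chunked_exp_predict(netG, mel_input, ref, ratio_gt, win=10):
    # mel_input: (bs, T, 80, 16) mel window per output frame
    # ref:       (bs, 1, 64)     reference expression coefficients
    # ratio_gt:  (bs, T)         blink ratio sequence
    T = mel_input.shape[1]
    preds = []
    for i in tqdm(range(0, T, win), 'audio2exp:'):
        mel = mel_input[:, i:i + win]              # (bs, t, 80, 16) with t <= win
        ref_t = ref.repeat(1, mel.shape[1], 1)     # broadcast reference to t frames
        ratio = ratio_gt[:, i:i + win]
        audiox = mel.reshape(-1, 1, 80, 16)        # fold time into the batch dimension
        preds.append(netG(audiox, ref_t, ratio))   # (bs, t, 64) per window
    return torch.cat(preds, dim=1)                 # (bs, T, 64) stitched over time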