diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d550eaaeaf3ec284e5299a9a2b995efebadc790c --- /dev/null +++ b/.gitignore @@ -0,0 +1,153 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ \ No newline at end of file diff --git a/app.py b/app.py index 02cfb17df79ef9ec49c1f9a884450e6eee73c151..f30bd2bcfd5bb0f220bde2af8bad9b802329a0cb 100644 --- a/app.py +++ b/app.py @@ -27,15 +27,15 @@ def sadtalker_demo(result_dir='./tmp/'): Homepage       \ Github ") - with gr.Row().style(equal_height=False): + with gr.Row(): with gr.Column(variant='panel'): with gr.Tabs(elem_id="sadtalker_source_image"): with gr.TabItem('Upload image'): with gr.Row(): - source_image = gr.Image(label="Source image", source="upload", type="filepath").style(height=256,width=256) + source_image = gr.Image(label="Source image", source="upload", type="filepath").style(height=256) with gr.Tabs(elem_id="sadtalker_driven_audio"): - with gr.TabItem('Upload audio(wav only currently)'): + with gr.TabItem('Upload audio (wav/mp3 only currently)'): with gr.Column(variant='panel'): driven_audio = gr.Audio(label="Input audio", source="upload", type="filepath") @@ -43,12 +43,13 @@ def sadtalker_demo(result_dir='./tmp/'): with gr.Tabs(elem_id="sadtalker_checkbox"): with gr.TabItem('Settings'): with gr.Column(variant='panel'): - is_still_mode = gr.Checkbox(label="w/ Still Mode (fewer head motion)") - enhancer = gr.Checkbox(label="w/ GFPGAN as Face enhancer") + is_still_mode = gr.Checkbox(label="Still Mode (less head motion)").style(container=True) + is_resize_mode = gr.Checkbox(label="Resize Mode (⚠️ requires a manually cropped image first; can handle larger crops)").style(container=True) + is_enhance_mode = gr.Checkbox(label="Enhance Mode (better face quality)").style(container=True) submit = gr.Button('Generate', elem_id="sadtalker_generate", variant='primary') with gr.Tabs(elem_id="sadtalker_genearted"): - gen_video = gr.Video(label="Generated video", format="mp4").style(height=256,width=256) + gen_video = gr.Video(label="Generated video", format="mp4").style(width=256) gen_text = gr.Textbox(visible=False) with gr.Row(): @@ -57,7 +58,22 @@ def sadtalker_demo(result_dir='./tmp/'): 'examples/source_image/art_10.png', 'examples/driven_audio/deyu.wav', True, + False, False + ], + [ + 'examples/source_image/art_1.png', + 'examples/driven_audio/fayu.wav', + True, + True, + False + ], + [ + 'examples/source_image/art_9.png', + 'examples/driven_audio/itosinger1.wav', + True, + False, + True ] ] gr.Examples(examples=examples, @@ -65,7 +81,8 @@ def sadtalker_demo(result_dir='./tmp/'): source_image, driven_audio, is_still_mode, - enhancer, + is_resize_mode, + is_enhance_mode, gr.Textbox(value=result_dir, visible=False)], outputs=[gen_video, gen_text], fn=sad_talker.test, @@ -76,7 +93,8 @@ def sadtalker_demo(result_dir='./tmp/'): inputs=[source_image, driven_audio, is_still_mode, - enhancer, + is_resize_mode, + is_enhance_mode, gr.Textbox(value=result_dir, visible=False)], outputs=[gen_video, gen_text] ) diff --git a/modules/__pycache__/sadtalker_test.cpython-38.pyc b/modules/__pycache__/sadtalker_test.cpython-38.pyc index
5eb11acf9e267816b4086e1b582013476729c533..6377ea45cbb1a59c93ae5dd63fb699a3f4288be2 100644 Binary files a/modules/__pycache__/sadtalker_test.cpython-38.pyc and b/modules/__pycache__/sadtalker_test.cpython-38.pyc differ diff --git a/modules/__pycache__/text2speech.cpython-38.pyc b/modules/__pycache__/text2speech.cpython-38.pyc index 4b4bb06e0b09743093bac9edf6c0f5a16acac5f8..90ad4127ce0050c2215bdb797974ad849d12a96c 100644 Binary files a/modules/__pycache__/text2speech.cpython-38.pyc and b/modules/__pycache__/text2speech.cpython-38.pyc differ diff --git a/modules/sadtalker_test.py b/modules/sadtalker_test.py index f2404421dc132dcf05cddcb9422f47175f229bc9..f15e3aafe95aea1d082cbca4a3d5f4c6ce10fea4 100644 --- a/modules/sadtalker_test.py +++ b/modules/sadtalker_test.py @@ -60,7 +60,7 @@ class SadTalker(): facerender_yaml_path, device) self.device = device - def test(self, source_image, driven_audio, still_mode, use_enhancer, result_dir='./'): + def test(self, source_image, driven_audio, still_mode, resize_mode, use_enhancer, result_dir='./'): time_tag = str(uuid.uuid4()) # strftime("%Y_%m_%d_%H.%M.%S") save_dir = os.path.join(result_dir, time_tag) @@ -91,7 +91,7 @@ class SadTalker(): #crop image and extract 3dmm from image first_frame_dir = os.path.join(save_dir, 'first_frame_dir') os.makedirs(first_frame_dir, exist_ok=True) - first_coeff_path, crop_pic_path = self.preprocess_model.generate(pic_path, first_frame_dir) + first_coeff_path, crop_pic_path, original_size = self.preprocess_model.generate(pic_path, first_frame_dir, crop_or_resize='resize' if resize_mode else 'crop') if first_coeff_path is None: raise AttributeError("No face is detected") @@ -101,7 +101,7 @@ class SadTalker(): #coeff2video batch_size = 4 data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path, batch_size, still_mode=still_mode) - self.animate_from_coeff.generate(data, save_dir, enhancer='gfpgan' if use_enhancer else None) + self.animate_from_coeff.generate(data, save_dir, enhancer='gfpgan' if use_enhancer else None, original_size=original_size) video_name = data['video_name'] print(f'The generated video is named {video_name} in {save_dir}') diff --git a/src/__pycache__/generate_batch.cpython-38.pyc b/src/__pycache__/generate_batch.cpython-38.pyc index 5032a29f3d4291f8f90539857bd726dc58679445..c68dd09e49933b52115307195bf3aa446d924922 100644 Binary files a/src/__pycache__/generate_batch.cpython-38.pyc and b/src/__pycache__/generate_batch.cpython-38.pyc differ diff --git a/src/__pycache__/generate_facerender_batch.cpython-38.pyc b/src/__pycache__/generate_facerender_batch.cpython-38.pyc index 3dbfc8c5c193db5bcb198d2fdd6c3775c9f1f9ff..6a30615ed3eaa5902a2fa553ed3ed17a9ae92a51 100644 Binary files a/src/__pycache__/generate_facerender_batch.cpython-38.pyc and b/src/__pycache__/generate_facerender_batch.cpython-38.pyc differ diff --git a/src/__pycache__/test_audio2coeff.cpython-38.pyc b/src/__pycache__/test_audio2coeff.cpython-38.pyc index 145f0df05b72a17711ca0d2bc8fa960f2e760d20..c2553cc97f50096d7c7005ad39274a8653cb6ad4 100644 Binary files a/src/__pycache__/test_audio2coeff.cpython-38.pyc and b/src/__pycache__/test_audio2coeff.cpython-38.pyc differ diff --git a/src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc b/src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc index 28f97d6138ff61dd5d40dbde67c8b62135623a9e..460563d74a990c40a3c5bd6f3209acca6d86b550 100644 Binary files a/src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc and
b/src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc differ diff --git a/src/audio2exp_models/__pycache__/networks.cpython-38.pyc b/src/audio2exp_models/__pycache__/networks.cpython-38.pyc index 76c987956e3de068b8817aca70cdd05479e82232..766660615f22f94c740dd420ccef83ed442c4fac 100644 Binary files a/src/audio2exp_models/__pycache__/networks.cpython-38.pyc and b/src/audio2exp_models/__pycache__/networks.cpython-38.pyc differ diff --git a/src/audio2exp_models/audio2exp.py b/src/audio2exp_models/audio2exp.py index 8231007799891ca4dd7f81b04226d82ddfab292d..5f6e6b77b0ceb2089539caa440f7106c7b1e8aa2 100644 --- a/src/audio2exp_models/audio2exp.py +++ b/src/audio2exp_models/audio2exp.py @@ -1,3 +1,4 @@ +from tqdm import tqdm import torch from torch import nn @@ -15,15 +16,24 @@ class Audio2Exp(nn.Module): bs = mel_input.shape[0] T = mel_input.shape[1] - ref = batch['ref'][:, :, :64].repeat((1,T,1)) #bs T 64 - ratio = batch['ratio_gt'] #bs T + exp_coeff_pred = [] - audiox = mel_input.view(-1, 1, 80, 16) # bs*T 1 80 16 - exp_coeff_pred = self.netG(audiox, ref, ratio) # bs T 64 + for i in tqdm(range(0, T, 10),'audio2exp:'): # every 10 frames + + current_mel_input = mel_input[:,i:i+10] + + ref = batch['ref'][:, :, :64].repeat((1,current_mel_input.shape[1],1)) #bs T 64 + ratio = batch['ratio_gt'][:, i:i+10] #bs T + + audiox = current_mel_input.view(-1, 1, 80, 16) # bs*T 1 80 16 + + curr_exp_coeff_pred = self.netG(audiox, ref, ratio) # bs T 64 + + exp_coeff_pred += [curr_exp_coeff_pred] # BS x T x 64 results_dict = { - 'exp_coeff_pred': exp_coeff_pred + 'exp_coeff_pred': torch.cat(exp_coeff_pred, axis=1) } return results_dict diff --git a/src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc b/src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc index 57e4e23d75532766340f6ebb4ebc3eb038b73564..20fa93168344012f0bdb77727b5b5669fac8a10b 100644 Binary files a/src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc and b/src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc differ diff --git a/src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc b/src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc index 253503524dff693f19b69d927124b99b3ac145e5..97d9bdf072c5bd356cc312357646c6eae2b798d0 100644 Binary files a/src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc and b/src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc differ diff --git a/src/audio2pose_models/__pycache__/cvae.cpython-38.pyc b/src/audio2pose_models/__pycache__/cvae.cpython-38.pyc index ff9c8bc5279ab0e4675f9d23344f1af8c126f57e..0d9aaee3ad4caa8afc40f723d224eb5b25e8afcd 100644 Binary files a/src/audio2pose_models/__pycache__/cvae.cpython-38.pyc and b/src/audio2pose_models/__pycache__/cvae.cpython-38.pyc differ diff --git a/src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc b/src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc index 12552f8a5cdc38657fa5769fb9055641b0c511f6..c7ebfcd0dd3538cedeb7eba984f94d9763b392c6 100644 Binary files a/src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc and b/src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc differ diff --git a/src/audio2pose_models/__pycache__/networks.cpython-38.pyc b/src/audio2pose_models/__pycache__/networks.cpython-38.pyc index 14e8a5b95b2b165c9a8e501ac6b87801dea9481f..239626089b91321b1c00cfba2dfe0a3ba1ccb0b9 100644 Binary files a/src/audio2pose_models/__pycache__/networks.cpython-38.pyc and b/src/audio2pose_models/__pycache__/networks.cpython-38.pyc differ diff --git 
a/src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc b/src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc index 1609cc2e18291e92dc25d290d47e215079c91972..0e6b40591fd932ddb2cf686b72afd08c90de1a44 100644 Binary files a/src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc and b/src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc differ diff --git a/src/audio2pose_models/audio2pose.py b/src/audio2pose_models/audio2pose.py index fc2499bd22f7fecabd3849a117a35d6a44d16269..3a37179e221340662a817628df3d01ae9e34404f 100644 --- a/src/audio2pose_models/audio2pose.py +++ b/src/audio2pose_models/audio2pose.py @@ -76,6 +76,7 @@ class Audio2Pose(nn.Module): batch['audio_emb'] = audio_emb batch = self.netG.test(batch) pose_motion_pred_list.append(batch['pose_motion_pred']) #list of bs seq_len 6 + if re != 0: z = torch.randn(bs, self.latent_dim).to(ref.device) batch['z'] = z diff --git a/src/audio2pose_models/audio_encoder.py b/src/audio2pose_models/audio_encoder.py index 8dc0f372a20f874ec7513d37b61859dc46e2669a..0ce036df119f86ef28c3ac8d6c834264571c309a 100644 --- a/src/audio2pose_models/audio_encoder.py +++ b/src/audio2pose_models/audio_encoder.py @@ -19,7 +19,7 @@ class Conv2d(nn.Module): return self.act(out) class AudioEncoder(nn.Module): - def __init__(self, wav2lip_checkpoint, device='cpu'): + def __init__(self, wav2lip_checkpoint): super(AudioEncoder, self).__init__() self.audio_encoder = nn.Sequential( @@ -42,7 +42,7 @@ class AudioEncoder(nn.Module): Conv2d(512, 512, kernel_size=1, stride=1, padding=0),) #### load the pre-trained audio_encoder\ - wav2lip_state_dict = torch.load(wav2lip_checkpoint, map_location=device)['state_dict'] + wav2lip_state_dict = torch.load(wav2lip_checkpoint)['state_dict'] state_dict = self.audio_encoder.state_dict() for k,v in wav2lip_state_dict.items(): diff --git a/src/face3d/__pycache__/extract_kp_videos.cpython-38.pyc b/src/face3d/__pycache__/extract_kp_videos.cpython-38.pyc index d8dba7f62d303d81702181344c6a01a2d1671592..0469c877400338fae921f4aedf1159b03abbb101 100644 Binary files a/src/face3d/__pycache__/extract_kp_videos.cpython-38.pyc and b/src/face3d/__pycache__/extract_kp_videos.cpython-38.pyc differ diff --git a/src/face3d/extract_kp_videos.py b/src/face3d/extract_kp_videos.py index f1e3b7ee33ed44eb487a5856d8f835e7473e9466..f12e9ec3488d99a29620b744beaa46814b66db8f 100644 --- a/src/face3d/extract_kp_videos.py +++ b/src/face3d/extract_kp_videos.py @@ -71,7 +71,7 @@ def read_video(filename): def run(data): filename, opt, device = data os.environ['CUDA_VISIBLE_DEVICES'] = device - kp_extractor = KeypointExtractor(device) + kp_extractor = KeypointExtractor() images = read_video(filename) name = filename.split('/')[-2:] os.makedirs(os.path.join(opt.output_dir, name[-2]), exist_ok=True) diff --git a/src/face3d/models/__pycache__/__init__.cpython-38.pyc b/src/face3d/models/__pycache__/__init__.cpython-38.pyc index 9226ef2a75f88c1bf2b87e8e173b3539d67e6b7e..886f0b184346c5530d0bf8d6f4b2300079511225 100644 Binary files a/src/face3d/models/__pycache__/__init__.cpython-38.pyc and b/src/face3d/models/__pycache__/__init__.cpython-38.pyc differ diff --git a/src/face3d/models/__pycache__/base_model.cpython-38.pyc b/src/face3d/models/__pycache__/base_model.cpython-38.pyc index 960d0df171da67b761b6b279c66ee32d1d698611..e42691ec8e26c5c38baf6bd0172dff8110754da1 100644 Binary files a/src/face3d/models/__pycache__/base_model.cpython-38.pyc and b/src/face3d/models/__pycache__/base_model.cpython-38.pyc differ diff --git 
a/src/face3d/models/__pycache__/networks.cpython-38.pyc b/src/face3d/models/__pycache__/networks.cpython-38.pyc index 00d82c97fa7a1dca5d0495fb1a2cbeb664f09813..1a97b5cd3309786e87448c4478ae2d19a18e096b 100644 Binary files a/src/face3d/models/__pycache__/networks.cpython-38.pyc and b/src/face3d/models/__pycache__/networks.cpython-38.pyc differ diff --git a/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc b/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc index 0acba565bc3928508199e41a8da56f09ad955ccc..83f6ad3ed4af3cc3d3cfa9067e345cdffb058638 100644 Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc and b/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc differ diff --git a/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc b/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc index 1d5ddd6e5ded7e34e7aaa8b3d945a0167454ee3c..f59247d26d9210b5fd2960df842753a903a90b3d 100644 Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc and b/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc differ diff --git a/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc b/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc index 46f43a3ae7bc4653ccbef934b56566b667ec0028..d8edc64d28aa3e3fb8c26ba795d04a8ef35b1540 100644 Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc and b/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc differ diff --git a/src/face3d/util/__pycache__/__init__.cpython-38.pyc b/src/face3d/util/__pycache__/__init__.cpython-38.pyc index 72c836e4ebd2b5baf1273f61fc024eb1e0347085..22771f3169f2da9a37c1bd619a0e5d05003492b9 100644 Binary files a/src/face3d/util/__pycache__/__init__.cpython-38.pyc and b/src/face3d/util/__pycache__/__init__.cpython-38.pyc differ diff --git a/src/face3d/util/__pycache__/load_mats.cpython-38.pyc b/src/face3d/util/__pycache__/load_mats.cpython-38.pyc index 0457c4f9ef0a394f926a5f792178dc9b25c60cfb..8a48b59ca078ef709825d54c069f518c15103c4e 100644 Binary files a/src/face3d/util/__pycache__/load_mats.cpython-38.pyc and b/src/face3d/util/__pycache__/load_mats.cpython-38.pyc differ diff --git a/src/face3d/util/__pycache__/preprocess.cpython-38.pyc b/src/face3d/util/__pycache__/preprocess.cpython-38.pyc index 8100839a4aafba54f8a6a034b03e0660a48368ec..7900dafbd8b74629c391eb8972f615650d4461df 100644 Binary files a/src/face3d/util/__pycache__/preprocess.cpython-38.pyc and b/src/face3d/util/__pycache__/preprocess.cpython-38.pyc differ diff --git a/src/facerender/__pycache__/animate.cpython-38.pyc b/src/facerender/__pycache__/animate.cpython-38.pyc index 91ca41e74ce67feb6bdb9948a59183d7b92eb3b9..11fb3d0ee467093c0cb318003c52eb4c78f11cc9 100644 Binary files a/src/facerender/__pycache__/animate.cpython-38.pyc and b/src/facerender/__pycache__/animate.cpython-38.pyc differ diff --git a/src/facerender/animate.py b/src/facerender/animate.py index e9743a6cdf8eaa60f7cb25d97d831d99638d855e..be2d62ebaeffe06a8dee1e268d832690b1937320 100644 --- a/src/facerender/animate.py +++ b/src/facerender/animate.py @@ -1,4 +1,5 @@ import os +import cv2 import yaml import numpy as np import warnings @@ -106,7 +107,7 @@ class AnimateFromCoeff(): return checkpoint['epoch'] - def generate(self, x, video_save_dir, enhancer=None): + def 
generate(self, x, video_save_dir, enhancer=None, original_size=None): source_image=x['source_image'].type(torch.FloatTensor) source_semantics=x['source_semantics'].type(torch.FloatTensor) @@ -137,6 +138,10 @@ class AnimateFromCoeff(): video.append(image) result = img_as_ubyte(video) + ### the generated video is 256x256, so we keep the aspect ratio, + if original_size: + result = [ cv2.resize(result_i,(256, int(256.0 * original_size[1]/original_size[0]) )) for result_i in result ] + video_name = x['video_name'] + '.mp4' path = os.path.join(video_save_dir, 'temp_'+video_name) imageio.mimsave(path, result, fps=float(25)) @@ -146,6 +151,10 @@ class AnimateFromCoeff(): av_path_enhancer = os.path.join(video_save_dir, video_name_enhancer) enhanced_path = os.path.join(video_save_dir, 'temp_'+video_name_enhancer) enhanced_images = face_enhancer(result, method=enhancer) + + if original_size: + enhanced_images = [ cv2.resize(result_i,(256, int(256.0 * original_size[1]/original_size[0]) )) for result_i in enhanced_images ] + imageio.mimsave(enhanced_path, enhanced_images, fps=float(25)) av_path = os.path.join(video_save_dir, video_name) diff --git a/src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc b/src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc index 35a44da77686faa8c09392f15f119518059cd288..5178c3763bc9f6fcff3a8a410deff7d3c30060db 100644 Binary files a/src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc and b/src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc differ diff --git a/src/facerender/modules/__pycache__/generator.cpython-38.pyc b/src/facerender/modules/__pycache__/generator.cpython-38.pyc index f18ac48e6c06f902f22d051588a9023bcd163dda..8d132f05d36e505f21c864d4c95931472ba58051 100644 Binary files a/src/facerender/modules/__pycache__/generator.cpython-38.pyc and b/src/facerender/modules/__pycache__/generator.cpython-38.pyc differ diff --git a/src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc b/src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc index 2eb85ab75af6c9d5c47d5a0ecbcb929310c2d902..ccc5d4543365bfc022a06a72d6ed9d388249279a 100644 Binary files a/src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc and b/src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc differ diff --git a/src/facerender/modules/__pycache__/make_animation.cpython-38.pyc b/src/facerender/modules/__pycache__/make_animation.cpython-38.pyc index de5ff5d0819a9aa6adc7d3aaa92343361a098048..1b54bcc293d742f70db165849b9764666b0f9a8b 100644 Binary files a/src/facerender/modules/__pycache__/make_animation.cpython-38.pyc and b/src/facerender/modules/__pycache__/make_animation.cpython-38.pyc differ diff --git a/src/facerender/modules/__pycache__/mapping.cpython-38.pyc b/src/facerender/modules/__pycache__/mapping.cpython-38.pyc index b8ff701e5c431228481c008a4110c97f45b4803a..7e1a2baa2bfab28fe7e3904f94a644633124b56c 100644 Binary files a/src/facerender/modules/__pycache__/mapping.cpython-38.pyc and b/src/facerender/modules/__pycache__/mapping.cpython-38.pyc differ diff --git a/src/facerender/modules/__pycache__/util.cpython-38.pyc b/src/facerender/modules/__pycache__/util.cpython-38.pyc index 92a821d2ec9c6c655bd82b333490adca2f020304..1e1c92955be38c880c52cc70b8051fd8ef4fa63a 100644 Binary files a/src/facerender/modules/__pycache__/util.cpython-38.pyc and b/src/facerender/modules/__pycache__/util.cpython-38.pyc differ diff --git a/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc 
b/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc index 1712d28c2f1564aecdec0971496514e6b9a85109..03d5fdb5ff0e14c08894b394b8c1cae7e1f324c4 100644 Binary files a/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc and b/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc differ diff --git a/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc b/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc index d204555804699761f59a2d639d89d33e0e1a8637..20a4560fc425087d5d63c70cc08fd12c2d8a7ea1 100644 Binary files a/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc and b/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc differ diff --git a/src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc b/src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc index efda35f1eb336382fc5cb5943db3aaeda8042de6..eb7252b8ad1b6aec2f5566979db0494f71a63d91 100644 Binary files a/src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc and b/src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc differ diff --git a/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc b/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc index 6fdcebc6259dc3c8ebf212c6d26bf9f00316ede7..30c9811579d75333db1b60fe4622f682013f719b 100644 Binary files a/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc and b/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc differ diff --git a/src/generate_batch.py b/src/generate_batch.py index 35a785b182ff4bfbdcc55fd8cfc7c5471cb9a1af..2d9e19b6aa4c19c13caf0a208e1189cd6c19f796 100644 --- a/src/generate_batch.py +++ b/src/generate_batch.py @@ -1,18 +1,11 @@ import os + +from tqdm import tqdm import torch import numpy as np import random import scipy.io as scio import src.utils.audio as audio -import subprocess, platform - -from pydub import AudioSegment - -def mp3_to_wav(mp3_filename,wav_filename,frame_rate): - mp3_file = AudioSegment.from_mp3(file=mp3_filename) - mp3_file.set_frame_rate(frame_rate).export(wav_filename,format="wav") - - def crop_pad_audio(wav, audio_length): if len(wav) > audio_length: @@ -33,7 +26,6 @@ def generate_blink_seq(num_frames): ratio = np.zeros((num_frames,1)) frame_id = 0 while frame_id in range(num_frames): - #start = random.choice(range(60,70)) start = 80 if frame_id+start+9<=num_frames - 1: ratio[frame_id+start:frame_id+start+9, 0] = [0.5,0.6,0.7,0.9,1, 0.9, 0.7,0.6,0.5] @@ -48,7 +40,6 @@ def generate_blink_seq_randomly(num_frames): return ratio frame_id = 0 while frame_id in range(num_frames): - #start = random.choice(range(60,70)) start = random.choice(range(min(10,num_frames), min(int(num_frames/2), 70))) if frame_id+start+5<=num_frames - 1: ratio[frame_id+start:frame_id+start+5, 0] = [0.5, 0.9, 1.0, 0.9, 0.5] @@ -60,8 +51,6 @@ def generate_blink_seq_randomly(num_frames): def get_data(first_coeff_path, audio_path, device): syncnet_mel_step_size = 16 - syncnet_T = 5 - MAX_FRAME = 32 fps = 25 pic_name = os.path.splitext(os.path.split(first_coeff_path)[-1])[0] @@ -71,23 +60,14 @@ def get_data(first_coeff_path, audio_path, device): source_semantics_dict = scio.loadmat(source_semantics_path) ref_coeff = source_semantics_dict['coeff_3dmm'][:1,:70] #1 70 - print(audio_path) - if '.mp3' in audio_path: - print(audio_path) - mp3_to_wav(audio_path, audio_path.replace('.mp3','.wav'), 16000) - new_audio = audio_path.replace('.mp3','.wav') - else: - new_audio = audio_path - - wav = 
audio.load_wav(new_audio, 16000) - + wav = audio.load_wav(audio_path, 16000) wav_length, num_frames = parse_audio_length(len(wav), 16000, 25) wav = crop_pad_audio(wav, wav_length) orig_mel = audio.melspectrogram(wav).T spec = orig_mel.copy() # nframes 80 indiv_mels = [] - for i in range(num_frames): + for i in tqdm(range(num_frames), 'mel:'): start_frame_num = i-2 start_idx = int(80. * (start_frame_num / float(fps))) end_idx = start_idx + syncnet_mel_step_size @@ -97,7 +77,6 @@ def get_data(first_coeff_path, audio_path, device): indiv_mels.append(m.T) indiv_mels = np.asarray(indiv_mels) # T 80 16 ratio = generate_blink_seq_randomly(num_frames) # T - indiv_mels = torch.FloatTensor(indiv_mels).unsqueeze(1).unsqueeze(0) # bs T 1 80 16 ratio = torch.FloatTensor(ratio).unsqueeze(0) # bs T diff --git a/src/gradio_demo.py b/src/gradio_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..4f78c97349652e23cf463c49527191fcec795564 --- /dev/null +++ b/src/gradio_demo.py @@ -0,0 +1,113 @@ +import torch, uuid +from time import gmtime, strftime +import os, sys, shutil +from src.utils.preprocess import CropAndExtract +from src.test_audio2coeff import Audio2Coeff +from src.facerender.animate import AnimateFromCoeff +from src.generate_batch import get_data +from src.generate_facerender_batch import get_facerender_data +from src.utils.text2speech import text2speech + +from pydub import AudioSegment + +def mp3_to_wav(mp3_filename,wav_filename,frame_rate): + mp3_file = AudioSegment.from_file(file=mp3_filename) + mp3_file.set_frame_rate(frame_rate).export(wav_filename,format="wav") + + +class SadTalker(): + + def __init__(self, checkpoint_path='checkpoints', config_path='src/config'): + + if torch.cuda.is_available() : + device = "cuda" + else: + device = "cpu" + + os.environ['TORCH_HOME']= checkpoint_path + + path_of_lm_croper = os.path.join( checkpoint_path, 'shape_predictor_68_face_landmarks.dat') + path_of_net_recon_model = os.path.join( checkpoint_path, 'epoch_20.pth') + dir_of_BFM_fitting = os.path.join( checkpoint_path, 'BFM_Fitting') + wav2lip_checkpoint = os.path.join( checkpoint_path, 'wav2lip.pth') + + audio2pose_checkpoint = os.path.join( checkpoint_path, 'auido2pose_00140-model.pth') + audio2pose_yaml_path = os.path.join( config_path, 'auido2pose.yaml') + + audio2exp_checkpoint = os.path.join( checkpoint_path, 'auido2exp_00300-model.pth') + audio2exp_yaml_path = os.path.join( config_path, 'auido2exp.yaml') + + free_view_checkpoint = os.path.join( checkpoint_path, 'facevid2vid_00189-model.pth.tar') + mapping_checkpoint = os.path.join( checkpoint_path, 'mapping_00229-model.pth.tar') + facerender_yaml_path = os.path.join( config_path, 'facerender.yaml') + + #init model + print(path_of_lm_croper) + self.preprocess_model = CropAndExtract(path_of_lm_croper, path_of_net_recon_model, dir_of_BFM_fitting, device) + + print(audio2pose_checkpoint) + self.audio_to_coeff = Audio2Coeff(audio2pose_checkpoint, audio2pose_yaml_path, + audio2exp_checkpoint, audio2exp_yaml_path, wav2lip_checkpoint, device) + print(free_view_checkpoint) + self.animate_from_coeff = AnimateFromCoeff(free_view_checkpoint, mapping_checkpoint, + facerender_yaml_path, device) + self.device = device + + def test(self, source_image, driven_audio, still_mode, use_enhancer, result_dir='./'): + + time_tag = str(uuid.uuid4()) + save_dir = os.path.join(result_dir, time_tag) + os.makedirs(save_dir, exist_ok=True) + + input_dir = os.path.join(save_dir, 'input') + os.makedirs(input_dir, exist_ok=True) + + print(source_image) + 
pic_path = os.path.join(input_dir, os.path.basename(source_image)) + shutil.move(source_image, input_dir) + + if os.path.isfile(driven_audio): + audio_path = os.path.join(input_dir, os.path.basename(driven_audio)) + + #### mp3 to wav + if '.mp3' in audio_path: + mp3_to_wav(driven_audio, audio_path.replace('.mp3', '.wav'), 16000) + audio_path = audio_path.replace('.mp3', '.wav') + else: + shutil.move(driven_audio, input_dir) + else: + # driven_audio is not an existing file: treat it as text and synthesize a wav (output file name chosen here is arbitrary) + audio_path = text2speech(driven_audio, os.path.join(input_dir, 'driven_audio.wav')) + + os.makedirs(save_dir, exist_ok=True) + pose_style = 0 + #crop image and extract 3dmm from image + first_frame_dir = os.path.join(save_dir, 'first_frame_dir') + os.makedirs(first_frame_dir, exist_ok=True) + first_coeff_path, crop_pic_path, original_size = self.preprocess_model.generate(pic_path, first_frame_dir) + + if first_coeff_path is None: + raise AttributeError("No face is detected") + + #audio2coeff + batch = get_data(first_coeff_path, audio_path, self.device) # longer audio? + coeff_path = self.audio_to_coeff.generate(batch, save_dir, pose_style) + #coeff2video + batch_size = 4 + data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path, batch_size, still_mode=still_mode) + self.animate_from_coeff.generate(data, save_dir, enhancer='gfpgan' if use_enhancer else None, original_size=original_size) + video_name = data['video_name'] + print(f'The generated video is named {video_name} in {save_dir}') + + torch.cuda.empty_cache() + torch.cuda.synchronize() + import gc; gc.collect() + + if use_enhancer: + return os.path.join(save_dir, video_name+'_enhanced.mp4'), os.path.join(save_dir, video_name+'_enhanced.mp4') + + else: + return os.path.join(save_dir, video_name+'.mp4'), os.path.join(save_dir, video_name+'.mp4') + + + \ No newline at end of file diff --git a/src/test_audio2coeff.py b/src/test_audio2coeff.py index 3de26514660d9a12853c45e4e5278c7cfce7a7cd..3db6be3af59b0319c50106d9a92c903118f28410 100644 --- a/src/test_audio2coeff.py +++ b/src/test_audio2coeff.py @@ -81,7 +81,7 @@ class Audio2Coeff(): savemat(os.path.join(coeff_save_dir, '%s##%s.mat'%(batch['pic_name'], batch['audio_name'])), {'coeff_3dmm': coeffs_pred_numpy}) - torch.cuda.empty_cache() + return os.path.join(coeff_save_dir, '%s##%s.mat'%(batch['pic_name'], batch['audio_name'])) diff --git a/src/utils/__pycache__/audio.cpython-38.pyc b/src/utils/__pycache__/audio.cpython-38.pyc index 71fec496164398d674b966d4146ed59d46ac7e0f..c9037ed6e9b29bf1f5ba29b25ed9c067103bb361 100644 Binary files a/src/utils/__pycache__/audio.cpython-38.pyc and b/src/utils/__pycache__/audio.cpython-38.pyc differ diff --git a/src/utils/__pycache__/croper.cpython-38.pyc b/src/utils/__pycache__/croper.cpython-38.pyc index 66904192497cde94208a03a315a9017c41850d1d..addfae662741dd661426427e2f29d506c399adba 100644 Binary files a/src/utils/__pycache__/croper.cpython-38.pyc and b/src/utils/__pycache__/croper.cpython-38.pyc differ diff --git a/src/utils/__pycache__/face_enhancer.cpython-38.pyc b/src/utils/__pycache__/face_enhancer.cpython-38.pyc index c2c91bd41ae88985f474551e32f12ba2b07bfc0e..51b465795f49c49c741a7fb510d02564337deb28 100644 Binary files a/src/utils/__pycache__/face_enhancer.cpython-38.pyc and b/src/utils/__pycache__/face_enhancer.cpython-38.pyc differ diff --git a/src/utils/__pycache__/hparams.cpython-38.pyc b/src/utils/__pycache__/hparams.cpython-38.pyc index d0e429fd2d0a979e1704da0a94b4cce691d2ccef..29278c1421204d040aa03f77ed43e18f9b60dad8 100644 Binary files a/src/utils/__pycache__/hparams.cpython-38.pyc and b/src/utils/__pycache__/hparams.cpython-38.pyc differ diff --git
a/src/utils/__pycache__/preprocess.cpython-38.pyc b/src/utils/__pycache__/preprocess.cpython-38.pyc index 58abea029b3e9d96718a6812624cbba961ef5202..e5e0b7f2a4c29050bfbb30405816311acd3060f0 100644 Binary files a/src/utils/__pycache__/preprocess.cpython-38.pyc and b/src/utils/__pycache__/preprocess.cpython-38.pyc differ diff --git a/src/utils/preprocess.py b/src/utils/preprocess.py index e5362752551bba79bfacb3dd304b47ff525f4e2f..4e3dad8d4a49080a3300f672965a11a8a2054fa2 100644 --- a/src/utils/preprocess.py +++ b/src/utils/preprocess.py @@ -1,5 +1,5 @@ import numpy as np -import cv2, os, sys,torch +import cv2, os, sys, torch from tqdm import tqdm from PIL import Image @@ -51,7 +51,7 @@ class CropAndExtract(): self.lm3d_std = load_lm3d(dir_of_BFM_fitting) self.device = device - def generate(self, input_path, save_dir): + def generate(self, input_path, save_dir, crop_or_resize='crop'): pic_size = 256 pic_name = os.path.splitext(os.path.split(input_path)[-1])[0] @@ -81,7 +81,7 @@ class CropAndExtract(): break x_full_frames = [cv2.cvtColor(full_frames[0], cv2.COLOR_BGR2RGB) ] - if True: + if crop_or_resize.lower() == 'crop': # default crop x_full_frames, crop, quad = self.croper.crop(x_full_frames, xsize=pic_size) clx, cly, crx, cry = crop lx, ly, rx, ry = quad @@ -90,7 +90,9 @@ class CropAndExtract(): original_size = (ox2 - ox1, oy2 - oy1) else: oy1, oy2, ox1, ox2 = 0, x_full_frames[0].shape[0], 0, x_full_frames[0].shape[1] - frames_pil = [Image.fromarray(cv2.resize(frame,(pic_size,pic_size))) for frame in x_full_frames] + original_size = (ox2 - ox1, oy2 - oy1) + + frames_pil = [Image.fromarray(cv2.resize(frame,(pic_size, pic_size))) for frame in x_full_frames] if len(frames_pil) == 0: print('No face is detected in the input file') - return None, None + return None, None, None @@ -110,7 +112,7 @@ class CropAndExtract(): if not os.path.isfile(coeff_path): # load 3dmm paramter generator from Deep3DFaceRecon_pytorch video_coeffs, full_coeffs = [], [] - for idx in tqdm(range(len(frames_pil)), desc=' 3DMM Extraction In Video:'): + for idx in tqdm(range(len(frames_pil)), desc='3DMM Extraction In Video:'): frame = frames_pil[idx] W,H = frame.size lm1 = lm[idx].reshape([-1, 2]) @@ -147,4 +149,4 @@ class CropAndExtract(): savemat(coeff_path, {'coeff_3dmm': semantic_npy, 'full_3dmm': np.array(full_coeffs)[0]}) - return coeff_path, png_path \ No newline at end of file + return coeff_path, png_path, original_size \ No newline at end of file diff --git a/src/utils/text2speech.py b/src/utils/text2speech.py new file mode 100644 index 0000000000000000000000000000000000000000..3ecaef36961494c8b2b1f5771a70b997efa04ffd --- /dev/null +++ b/src/utils/text2speech.py @@ -0,0 +1,11 @@ +import os + +def text2speech(txt, audio_path): + print(txt) + cmd = f'tts --text "{txt}" --out_path {audio_path}' + print(cmd) + status = os.system(cmd) + if status == 0: + return audio_path + print("Error: failed to convert text to audio") + return None \ No newline at end of file
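
For reference, the updated entry point in modules/sadtalker_test.py now threads a resize flag and the original image size from preprocessing through to rendering. A minimal invocation sketch, not part of the patch: it assumes the SadTalker constructor in modules/sadtalker_test.py takes no required arguments (mirroring the checkpoint/config defaults used in src/gradio_demo.py), reuses the example assets referenced in app.py, and the result_dir value is arbitrary. The two return values mirror the (video, text) outputs wired up in app.py.

# Hypothetical usage sketch for the new test() signature (illustrative only).
from modules.sadtalker_test import SadTalker

sad_talker = SadTalker()  # assumed defaults: ./checkpoints and src/config, as in src/gradio_demo.py

video_path, _ = sad_talker.test(
    source_image='examples/source_image/art_10.png',  # portrait to animate
    driven_audio='examples/driven_audio/deyu.wav',    # wav input (mp3 is converted via pydub in the gradio demo)
    still_mode=True,        # less head motion
    resize_mode=False,      # False -> crop preprocessing, True -> resize the full frame
    use_enhancer=False,     # True -> run GFPGAN on the rendered frames
    result_dir='./tmp/',
)
print(video_path)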
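The audio2exp change above replaces a single full-sequence forward pass with 10-frame windows whose predictions are concatenated along the time axis, which bounds memory use for long audio. A stripped-down sketch of that windowing pattern, with netG standing in for the actual expression network (its (audio, ref, ratio) -> (bs, t, 64) behaviour is taken from the diff; everything else here is illustrative):

import torch
from tqdm import tqdm

def chunked_exp_predict(netG, mel_input, ref, ratio_gt, win=10):
    # mel_input: (bs, T, 80, 16) mel window per output frame
    # ref:       (bs, 1, 64)     reference expression coefficients
    # ratio_gt:  (bs, T)         blink ratio sequence
    T = mel_input.shape[1]
    preds = []
    for i in tqdm(range(0, T, win), 'audio2exp:'):
        mel = mel_input[:, i:i + win]              # (bs, t, 80, 16) with t <= win
        ref_t = ref.repeat(1, mel.shape[1], 1)     # broadcast reference to t frames
        ratio = ratio_gt[:, i:i + win]
        audiox = mel.reshape(-1, 1, 80, 16)        # fold time into the batch dimension
        preds.append(netG(audiox, ref_t, ratio))   # (bs, t, 64) per window
    return torch.cat(preds, dim=1)                 # (bs, T, 64) stitched over time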