VictorMais committed
Commit e355904 · verified · 1 parent: 9098160

Added the ZeroGPU processing

Files changed (1)
  1. webgui.py +250 -293
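The commit message is terse, so a quick orientation helps: on a ZeroGPU Space, a GPU is attached to the process only while a function decorated with `@spaces.GPU` is running, so model loading and inference have to be reachable from such a function instead of running unconditionally at import time. A minimal sketch of the pattern this diff applies, with illustrative names only (the real functions are `initialize_models` and `process_video` further down):

```python
# Minimal ZeroGPU sketch; names here are illustrative, not the Space's code.
import spaces
import torch

model = None  # filled lazily, once a GPU is actually attached

@spaces.GPU(duration=120)  # borrow a GPU for up to ~120 s per call
def run_inference(batch: torch.Tensor) -> torch.Tensor:
    global model
    if model is None:
        # placeholder loader; the real Space builds its diffusion pipeline here
        model = torch.nn.Linear(8, 8).to("cuda")
    with torch.no_grad():
        return model(batch.to("cuda")).cpu()
```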
webgui.py CHANGED
@@ -3,11 +3,8 @@
 '''
 webui
 '''
- import spaces
- import os

- os.system('pip install scikit-image')
- os.system('pip install IPython')
 import random
 from datetime import datetime
 from pathlib import Path
@@ -29,22 +26,17 @@ from facenet_pytorch import MTCNN
 import argparse

 import gradio as gr
-
- import huggingface_hub
-
- import pickle
- from src.utils.draw_utils import FaceMeshVisualizer
- from src.utils.motion_utils import motion_sync
- from src.utils.mp_utils import LMKExtractor
-

 huggingface_hub.snapshot_download(
 repo_id='BadToBest/EchoMimic',
- local_dir='./pretrained_weights',
- local_dir_use_symlinks=False,
 )

- is_shared_ui = True if "fffiloni/EchoMimic" in os.environ['SPACE_ID'] else False
 available_property = False if is_shared_ui else True
 advanced_settings_label = "Advanced Configuration (only for duplicated spaces)" if is_shared_ui else "Advanced Configuration"
 
@@ -67,7 +59,7 @@ default_values = {
 ffmpeg_path = os.getenv('FFMPEG_PATH')
 if ffmpeg_path is None:
 print("please download ffmpeg-static and export to FFMPEG_PATH. \nFor example: export FFMPEG_PATH=/musetalk/ffmpeg-4.4-amd64-static")
- elif ffmpeg_path not in os.getenv('PATH'):
 print("add ffmpeg to path")
 os.environ["PATH"] = f"{ffmpeg_path}:{os.environ['PATH']}"
 
@@ -86,64 +78,91 @@ if not torch.cuda.is_available():
 inference_config_path = config.inference_config
 infer_config = OmegaConf.load(inference_config_path)

- ############# model_init started #############
- ## vae init
- vae = AutoencoderKL.from_pretrained(config.pretrained_vae_path).to("cuda", dtype=weight_dtype)
-
- ## reference net init
- reference_unet = UNet2DConditionModel.from_pretrained(
- config.pretrained_base_model_path,
- subfolder="unet",
- ).to(dtype=weight_dtype, device=device)
- reference_unet.load_state_dict(torch.load(config.reference_unet_path, map_location="cpu"))
-
- ## denoising net init
- if os.path.exists(config.motion_module_path):
- ### stage1 + stage2
- denoising_unet = EchoUNet3DConditionModel.from_pretrained_2d(
- config.pretrained_base_model_path,
- config.motion_module_path,
- subfolder="unet",
- unet_additional_kwargs=infer_config.unet_additional_kwargs,
- ).to(dtype=weight_dtype, device=device)
- else:
- ### only stage1
- denoising_unet = EchoUNet3DConditionModel.from_pretrained_2d(
 config.pretrained_base_model_path,
- "",
 subfolder="unet",
- unet_additional_kwargs={
- "use_motion_module": False,
- "unet_use_temporal_attention": False,
- "cross_attention_dim": infer_config.unet_additional_kwargs.cross_attention_dim
- }
 ).to(dtype=weight_dtype, device=device)
-
- denoising_unet.load_state_dict(torch.load(config.denoising_unet_path, map_location="cpu"), strict=False)
-
- ## face locator init
- face_locator = FaceLocator(320, conditioning_channels=1, block_out_channels=(16, 32, 96, 256)).to(dtype=weight_dtype, device="cuda")
- face_locator.load_state_dict(torch.load(config.face_locator_path, map_location='cpu'))
-
- ## load audio processor params
- audio_processor = load_audio_model(model_path=config.audio_model_path, device=device)
-
- ## load face detector params
- face_detector = MTCNN(image_size=320, margin=0, min_face_size=20, thresholds=[0.6, 0.7, 0.7], factor=0.709, post_process=True, device=device)
-
- ############# model_init finished #############
-
- sched_kwargs = OmegaConf.to_container(infer_config.noise_scheduler_kwargs)
- scheduler = DDIMScheduler(**sched_kwargs)
-
- pipe = Audio2VideoPipeline(
- vae=vae,
- reference_unet=reference_unet,
- denoising_unet=denoising_unet,
- audio_guider=audio_processor,
- face_locator=face_locator,
- scheduler=scheduler,
- ).to("cuda", dtype=weight_dtype)

 def select_face(det_bboxes, probs):
 ## max face from faces that the prob is above 0.8
@@ -159,25 +178,58 @@ def select_face(det_bboxes, probs):
 sorted_bboxes = sorted(filtered_bboxes, key=lambda x:(x[3]-x[1]) * (x[2] - x[0]), reverse=True)
 return sorted_bboxes[0]

- lmk_extractor = LMKExtractor()

- def face_detection(uploaded_img, facemask_dilation_ratio, facecrop_dilation_ratio, width, height):
 face_img = cv2.imread(uploaded_img)
- if face_img is None:
- raise gr.Error("input image should be uploaded or selected.")
 face_mask = np.zeros((face_img.shape[0], face_img.shape[1])).astype('uint8')
 det_bboxes, probs = face_detector.detect(face_img)
 select_bbox = select_face(det_bboxes, probs)
 if select_bbox is None:
 face_mask[:, :] = 255
 else:
 xyxy = select_bbox[:4]
 xyxy = np.round(xyxy).astype('int')
 rb, re, cb, ce = xyxy[1], xyxy[3], xyxy[0], xyxy[2]
 r_pad = int((re - rb) * facemask_dilation_ratio)
 c_pad = int((ce - cb) * facemask_dilation_ratio)
 face_mask[rb - r_pad : re + r_pad, cb - c_pad : ce + c_pad] = 255
-
 r_pad_crop = int((re - rb) * facecrop_dilation_ratio)
 c_pad_crop = int((ce - cb) * facecrop_dilation_ratio)
 crop_rect = [max(0, cb - c_pad_crop), max(0, rb - r_pad_crop), min(ce + c_pad_crop, face_img.shape[1]), min(re + r_pad_crop, face_img.shape[0])]
@@ -185,15 +237,10 @@ def face_detection(uploaded_img, facemask_dilation_ratio, facecrop_dilation_rati
 face_mask = crop_and_pad(face_mask, crop_rect)
 face_img = cv2.resize(face_img, (width, height))
 face_mask = cv2.resize(face_mask, (width, height))
-
- print('face detect done.')
- return face_img, face_mask

- @spaces.GPU(duration=300)
- def video_pipe(face_img, face_mask, uploaded_audio, width, height, length, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
- face_mask_tensor = torch.Tensor(face_mask).to(dtype=weight_dtype, device="cuda").unsqueeze(0).unsqueeze(0).unsqueeze(0) / 255.0
 ref_image_pil = Image.fromarray(face_img[:, :, [2, 1, 0]])
-
 video = pipe(
 ref_image_pil,
 uploaded_audio,
@@ -203,12 +250,12 @@ def video_pipe(face_img, face_mask, uploaded_audio, width, height, length, conte
 length,
 steps,
 cfg,
 audio_sample_rate=sample_rate,
 context_frames=context_frames,
 fps=fps,
 context_overlap=context_overlap
 ).videos
- print('video pipe done.')

 save_dir = Path("output/tmp")
 save_dir.mkdir(exist_ok=True, parents=True)
@@ -223,107 +270,27 @@ def video_pipe(face_img, face_mask, uploaded_audio, width, height, length, conte

 return final_output_path

- def process_video(uploaded_img, uploaded_audio, width, height, length, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
- face_img, face_mask = face_detection(uploaded_img, facemask_dilation_ratio, facecrop_dilation_ratio, width, height)
- final_output_path = video_pipe(face_img, face_mask, uploaded_audio, width, height, length, context_frames, context_overlap, cfg, steps, sample_rate, fps, device)
- return final_output_path
-
-
- # @spaces.GPU
- # def process_video(uploaded_img, uploaded_audio, width, height, length, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
- # #### face musk prepare
- # face_img = cv2.imread(uploaded_img)
- # if face_img is None:
- # raise gr.Error("input image should be uploaded or selected.")
- # face_mask = np.zeros((face_img.shape[0], face_img.shape[1])).astype('uint8')
- # det_bboxes, probs = face_detector.detect(face_img)
- # select_bbox = select_face(det_bboxes, probs)
- # if select_bbox is None:
- # face_mask[:, :] = 255
- # else:
- # xyxy = select_bbox[:4]
- # xyxy = np.round(xyxy).astype('int')
- # rb, re, cb, ce = xyxy[1], xyxy[3], xyxy[0], xyxy[2]
- # r_pad = int((re - rb) * facemask_dilation_ratio)
- # c_pad = int((ce - cb) * facemask_dilation_ratio)
- # face_mask[rb - r_pad : re + r_pad, cb - c_pad : ce + c_pad] = 255
-
- # #### face crop
- # r_pad_crop = int((re - rb) * facecrop_dilation_ratio)
- # c_pad_crop = int((ce - cb) * facecrop_dilation_ratio)
- # crop_rect = [max(0, cb - c_pad_crop), max(0, rb - r_pad_crop), min(ce + c_pad_crop, face_img.shape[1]), min(re + r_pad_crop, face_img.shape[0])]
- # face_img = crop_and_pad(face_img, crop_rect)
- # face_mask = crop_and_pad(face_mask, crop_rect)
- # face_img = cv2.resize(face_img, (width, height))
- # face_mask = cv2.resize(face_mask, (width, height))
- # print('face detect done.')
- # # ==================== face_locator =====================
- # '''
- # driver_video = "./assets/driven_videos/c.mp4"
-
- # input_frames_cv2 = [cv2.resize(center_crop_cv2(pil_to_cv2(i)), (512, 512)) for i in pils_from_video(driver_video)]
- # ref_det = lmk_extractor(face_img)
-
- # visualizer = FaceMeshVisualizer(draw_iris=False, draw_mouse=False)

- # pose_list = []
- # sequence_driver_det = []
- # try:
- # for frame in input_frames_cv2:
- # result = lmk_extractor(frame)
- # assert result is not None, "{}, bad video, face not detected".format(driver_video)
- # sequence_driver_det.append(result)
- # except:
- # print("face detection failed")
- # exit()
-
- # sequence_det_ms = motion_sync(sequence_driver_det, ref_det)
- # for p in sequence_det_ms:
- # tgt_musk = visualizer.draw_landmarks((width, height), p)
- # tgt_musk_pil = Image.fromarray(np.array(tgt_musk).astype(np.uint8)).convert('RGB')
- # pose_list.append(torch.Tensor(np.array(tgt_musk_pil)).to(dtype=weight_dtype, device="cuda").permute(2,0,1) / 255.0)
- # '''
- # # face_mask_tensor = torch.stack(pose_list, dim=1).unsqueeze(0)
- # face_mask_tensor = torch.Tensor(face_mask).to(dtype=weight_dtype, device="cuda").unsqueeze(0).unsqueeze(0).unsqueeze(0) / 255.0
-
- # ref_image_pil = Image.fromarray(face_img[:, :, [2, 1, 0]])
-
- # #del pose_list, sequence_det_ms, sequence_driver_det, input_frames_cv2
-
- # video = pipe(
- # ref_image_pil,
- # uploaded_audio,
- # face_mask_tensor,
- # width,
- # height,
- # length,
- # steps,
- # cfg,
- # #generator=generator,
- # audio_sample_rate=sample_rate,
- # context_frames=context_frames,
- # fps=fps,
- # context_overlap=context_overlap
- # ).videos
- # print('video pipe done.')
-
- # save_dir = Path("output/tmp")
- # save_dir.mkdir(exist_ok=True, parents=True)
- # output_video_path = save_dir / "output_video.mp4"
- # save_videos_grid(video, str(output_video_path), n_rows=1, fps=fps)
-
- # video_clip = VideoFileClip(str(output_video_path))
- # audio_clip = AudioFileClip(uploaded_audio)
- # final_output_path = save_dir / "output_video_with_audio.mp4"
- # video_clip = video_clip.set_audio(audio_clip)
- # video_clip.write_videofile(str(final_output_path), codec="libx264", audio_codec="aac")
-
- # return final_output_path

 with gr.Blocks() as demo:
 gr.Markdown('# EchoMimic')
 gr.Markdown('## Lifelike Audio-Driven Portrait Animations through Editable Landmark Conditioning')
- gr.Markdown('Inference time: from ~7mins/240frames to ~50s/240frames on V100 GPU')
 gr.HTML("""
 <div style="display:flex;column-gap:4px;">
 <a href='https://badtobest.github.io/echomimic.html'><img src='https://img.shields.io/badge/Project-Page-blue'></a>
@@ -331,12 +298,24 @@ with gr.Blocks() as demo:
 <a href='https://arxiv.org/abs/2407.08136'><img src='https://img.shields.io/badge/Paper-Arxiv-red'></a>
 </div>
 """)
-
 with gr.Row():
- with gr.Column(min_width=250):
 uploaded_img = gr.Image(type="filepath", label="Reference Image")
- with gr.Column(min_width=250):
- uploaded_audio = gr.Audio(type="filepath", label="Input Audio")
 with gr.Accordion(label=advanced_settings_label, open=False):
 with gr.Row():
 width = gr.Slider(label="Width", minimum=128, maximum=1024, value=default_values["width"], interactive=available_property)
@@ -357,132 +336,110 @@ with gr.Blocks() as demo:
 sample_rate = gr.Slider(label="Sample Rate", minimum=8000, maximum=48000, step=1000, value=default_values["sample_rate"], interactive=available_property)
 fps = gr.Slider(label="FPS", minimum=1, maximum=60, step=1, value=default_values["fps"], interactive=available_property)
 device = gr.Radio(label="Device", choices=["cuda", "cpu"], value=default_values["device"], interactive=available_property)
-
- with gr.Column(min_width=250):
 generate_button = gr.Button("Generate Video")
 output_video = gr.Video()
- with gr.Row():
-
- gr.Examples(
- label = "Portrait examples",
- examples = [
- ['assets/test_imgs/a.png'],
- ['assets/test_imgs/b.png'],
- ['assets/test_imgs/c.png'],
- ['assets/test_imgs/d.png'],
- ['assets/test_imgs/e.png']
- ],
- inputs = [uploaded_img]
- )
- gr.Examples(
- label = "Audio examples",
- examples = [
- ['assets/test_audios/chunnuanhuakai.wav'],
- ['assets/test_audios/chunwang.wav'],
- ['assets/test_audios/echomimic_en_girl.wav'],
- ['assets/test_audios/echomimic_en.wav'],
- ['assets/test_audios/echomimic_girl.wav'],
- ['assets/test_audios/echomimic.wav'],
- ['assets/test_audios/jane.wav'],
- ['assets/test_audios/mei.wav'],
- ['assets/test_audios/walden.wav'],
- ['assets/test_audios/yun.wav'],
- ],
- inputs = [uploaded_audio]
- )
- # gr.HTML("""
- # <div style="display:flex;column-gap:4px;">
- # <a href="https://huggingface.co/spaces/fffiloni/EchoMimic?duplicate=true">
- # <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-xl.svg" alt="Duplicate this Space">
- # </a>
- # <a href="https://huggingface.co/fffiloni">
- # <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/follow-me-on-HF-xl-dark.svg" alt="Follow me on HF">
- # </a>
- # </div>
- # """)
 
- # def generate_video(uploaded_img, uploaded_audio, facemask_dilation_ratio=default_values["facemask_dilation_ratio"],
- # facecrop_dilation_ratio=default_values["facecrop_dilation_ratio"],
- # context_frames=default_values["context_frames"],
- # context_overlap=default_values["context_overlap"],
- # cfg=default_values["cfg"],
- # steps=default_values["steps"],
- # sample_rate=default_values["sample_rate"],
- # fps=default_values["fps"],
- # device=default_values["device"],
- # width=default_values["width"],
- # height=default_values["height"],
- # length=default_values["length"] ):
-
- # final_output_path = process_video(
- # uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device
- # )
- # output_video= final_output_path
- # return final_output_path
-
- # generate_button.click(
- # generate_video,
- # inputs=[
- # uploaded_img,
- # uploaded_audio,
- # # width,
- # # height,
- # # length,
- # # seed,
- # # facemask_dilation_ratio,
- # # facecrop_dilation_ratio,
- # # context_frames,
- # # context_overlap,
- # # cfg,
- # # steps,
- # # sample_rate,
- # # fps,
- # # device
- # ],
- # outputs=output_video,
- # show_api=False
- # )
- def generate_video(uploaded_img, uploaded_audio,
- facemask_dilation_ratio=default_values["facemask_dilation_ratio"],
- facecrop_dilation_ratio=default_values["facecrop_dilation_ratio"],
- context_frames=default_values["context_frames"],
- context_overlap=default_values["context_overlap"],
- cfg=default_values["cfg"],
- steps=default_values["steps"],
- sample_rate=default_values["sample_rate"],
- fps=default_values["fps"],
- device=default_values["device"],
- width=default_values["width"],
- height=default_values["height"],
- length=default_values["length"] ):
-
 final_output_path = process_video(
- uploaded_img,
- uploaded_audio, width, height,
- length, facemask_dilation_ratio,
- facecrop_dilation_ratio, context_frames,
- context_overlap, cfg, steps,
- sample_rate, fps, device
 )
- output_video = final_output_path
 return final_output_path

 generate_button.click(
 generate_video,
 inputs=[
 uploaded_img,
- uploaded_audio
 ],
 outputs=output_video,
- show_progress=True
 )
- parser = argparse.ArgumentParser(description='EchoMimic')
 parser.add_argument('--server_name', type=str, default='0.0.0.0', help='Server name')
 parser.add_argument('--server_port', type=int, default=7680, help='Server port')
 args = parser.parse_args()

- # demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)
-
 if __name__ == '__main__':
- demo.queue(max_size=3).launch(show_api=False, show_error=True)
 #demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)
 
 '''
 webui
 '''

+ import os
 import random
 from datetime import datetime
 from pathlib import Path
 
 import argparse

 import gradio as gr
+ from gradio_client import Client, handle_file
+ from pydub import AudioSegment
+ import huggingface_hub
+ import spaces # Import spaces module for ZeroGPU support

 huggingface_hub.snapshot_download(
 repo_id='BadToBest/EchoMimic',
+ local_dir='./pretrained_weights'
 )

+ is_shared_ui = True if "fffiloni/EchoMimic" in os.environ.get('SPACE_ID', '') else False
 available_property = False if is_shared_ui else True
 advanced_settings_label = "Advanced Configuration (only for duplicated spaces)" if is_shared_ui else "Advanced Configuration"

 ffmpeg_path = os.getenv('FFMPEG_PATH')
 if ffmpeg_path is None:
 print("please download ffmpeg-static and export to FFMPEG_PATH. \nFor example: export FFMPEG_PATH=/musetalk/ffmpeg-4.4-amd64-static")
+ elif ffmpeg_path not in os.getenv('PATH', ''):
 print("add ffmpeg to path")
 os.environ["PATH"] = f"{ffmpeg_path}:{os.environ['PATH']}"
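The second half of the page lists the new side of webgui.py. The import block now pulls in `gradio_client`, `pydub`, and `spaces`, and the FFMPEG_PATH check guards `os.getenv('PATH', '')` with a default so a missing PATH variable no longer raises. Since moviepy and pydub both shell out to an ffmpeg binary, a small availability check can turn a late, cryptic muxing error into an early warning; a sketch under that assumption (the helper is ours, not part of the commit):

```python
import os
import shutil

def ffmpeg_available() -> bool:
    """True if an ffmpeg binary is reachable via PATH or FFMPEG_PATH."""
    if shutil.which("ffmpeg"):
        return True
    ffmpeg_dir = os.getenv("FFMPEG_PATH", "")
    # FFMPEG_PATH is expected to point at the directory holding the binary
    return os.path.isfile(os.path.join(ffmpeg_dir, "ffmpeg"))

if not ffmpeg_available():
    print("warning: ffmpeg not found; pydub/moviepy export will fail")
```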
 
 
 inference_config_path = config.inference_config
 infer_config = OmegaConf.load(inference_config_path)

+ # Model initialization is performed on-demand with ZeroGPU
+
+ # Function to initialize models when needed
+ @spaces.GPU
+ def initialize_models():
+ global vae, reference_unet, denoising_unet, face_locator, audio_processor, face_detector, pipe
+
+ ## vae init
+ vae = AutoencoderKL.from_pretrained(config.pretrained_vae_path).to(device, dtype=weight_dtype)
+
+ ## reference net init
+ reference_unet = UNet2DConditionModel.from_pretrained(
 config.pretrained_base_model_path,
 subfolder="unet",
 ).to(dtype=weight_dtype, device=device)
+ reference_unet.load_state_dict(torch.load(config.reference_unet_path, map_location="cpu"))
+
+ ## denoising net init
+ if os.path.exists(config.motion_module_path):
+ ### stage1 + stage2
+ denoising_unet = EchoUNet3DConditionModel.from_pretrained_2d(
+ config.pretrained_base_model_path,
+ config.motion_module_path,
+ subfolder="unet",
+ unet_additional_kwargs=infer_config.unet_additional_kwargs,
+ ).to(dtype=weight_dtype, device=device)
+ else:
+ ### only stage1
+ denoising_unet = EchoUNet3DConditionModel.from_pretrained_2d(
+ config.pretrained_base_model_path,
+ "",
+ subfolder="unet",
+ unet_additional_kwargs={
+ "use_motion_module": False,
+ "unet_use_temporal_attention": False,
+ "cross_attention_dim": infer_config.unet_additional_kwargs.cross_attention_dim
+ }
+ ).to(dtype=weight_dtype, device=device)
+
+ denoising_unet.load_state_dict(torch.load(config.denoising_unet_path, map_location="cpu"), strict=False)
+
+ ## face locator init
+ face_locator = FaceLocator(320, conditioning_channels=1, block_out_channels=(16, 32, 96, 256)).to(dtype=weight_dtype, device=device)
+ face_locator.load_state_dict(torch.load(config.face_locator_path))
+
+ ## load audio processor params
+ audio_processor = load_audio_model(model_path=config.audio_model_path, device=device)
+
+ ## load face detector params
+ face_detector = MTCNN(image_size=320, margin=0, min_face_size=20, thresholds=[0.6, 0.7, 0.7], factor=0.709, post_process=True, device=device)
+
+ sched_kwargs = OmegaConf.to_container(infer_config.noise_scheduler_kwargs)
+ scheduler = DDIMScheduler(**sched_kwargs)
+
+ pipe = Audio2VideoPipeline(
+ vae=vae,
+ reference_unet=reference_unet,
+ denoising_unet=denoising_unet,
+ audio_guider=audio_processor,
+ face_locator=face_locator,
+ scheduler=scheduler,
+ ).to(device, dtype=weight_dtype)
+
+ # Global variables for models
+ vae = None
+ reference_unet = None
+ denoising_unet = None
+ face_locator = None
+ audio_processor = None
+ face_detector = None
+ pipe = None
+
+ def ensure_png(image_path):
+ # Load the image with Pillow
+ with Image.open(image_path) as img:
+ # Check if the image is already a PNG
+ if img.format != "PNG":
+ # Convert and save as PNG
+ png_path = os.path.splitext(image_path)[0] + ".png"
+ img.save(png_path, format="PNG")
+ print(f"Image converted to PNG and saved as {png_path}")
+ return png_path
+ else:
+ print("Image is already a PNG.")
+ return image_path
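Everything that used to run at import time on "cuda" now lives inside `initialize_models()`, which is decorated with `@spaces.GPU` and writes into module-level globals; the globals start as `None` and act as a cache, and `process_video()` below only calls the initializer when `vae is None`. The same guard can be written as a single accessor, sketched here as a condensed illustration (not the code the commit ships):

```python
# Condensed illustration of the lazy-initialisation guard used above.
_pipe = None

def get_pipe():
    """Build the heavy pipeline on first use, then reuse the cached object."""
    global _pipe
    if _pipe is None:
        _pipe = build_pipeline()  # stands in for initialize_models()
    return _pipe
```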
 
 def select_face(det_bboxes, probs):
 ## max face from faces that the prob is above 0.8

 sorted_bboxes = sorted(filtered_bboxes, key=lambda x:(x[3]-x[1]) * (x[2] - x[0]), reverse=True)
 return sorted_bboxes[0]

+ @spaces.GPU(duration=120) # Allow up to 2 minutes for video processing (maximum allowed)
+ def process_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
+ # Ensure models are initialized
+ if vae is None:
+ initialize_models()

+ if seed is not None and seed > -1:
+ generator = torch.manual_seed(seed)
+ else:
+ generator = torch.manual_seed(random.randint(100, 1000000))
+
+ uploaded_img = ensure_png(uploaded_img)
+
+ #### face mask prepare
 face_img = cv2.imread(uploaded_img)
+
+ # Get the original dimensions
+ original_height, original_width = face_img.shape[:2]
+
+ # Set the new width to 512 pixels
+ new_width = 512
+
+ # Calculate the new height with the same aspect ratio
+ new_height = int(original_height * (new_width / original_width))
+
+ # Ensure both width and height are divisible by 8
+ new_width = (new_width // 8) * 8 # Force target width to be divisible by 8
+ new_height = (new_height // 8) * 8 # Floor the height to the nearest multiple of 8
+
+ # Resize the image to the calculated dimensions
+ face_img = cv2.resize(face_img, (new_width, new_height))
+
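Before detection, the reference image is rescaled to a 512-pixel width and both sides are floored to a multiple of 8, presumably because the VAE/UNet stack downsamples by factors of 8. A tiny worked example of that rounding, standalone and only for illustration:

```python
def snap_down(value: int, multiple: int = 8) -> int:
    """Floor `value` to the nearest lower multiple of `multiple`."""
    return (value // multiple) * multiple

# A 1280x853 portrait scaled to width 512:
#   new_height = int(853 * (512 / 1280)) = 341
#   snap_down(341) -> 336 and snap_down(512) -> 512
assert snap_down(341) == 336
assert snap_down(512) == 512
```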
 face_mask = np.zeros((face_img.shape[0], face_img.shape[1])).astype('uint8')
 det_bboxes, probs = face_detector.detect(face_img)
 select_bbox = select_face(det_bboxes, probs)
 if select_bbox is None:
+ print("SELECT_BBOX IS NONE")
 face_mask[:, :] = 255
+ face_img = cv2.resize(face_img, (width, height))
+ face_mask = cv2.resize(face_mask, (width, height))
+ raise gr.Error("Face Detector could not detect a face in your image. Try with a 512 squared image where the face is clearly visible.")
 else:
+ print("SELECT_BBOX IS NOT NONE")
 xyxy = select_bbox[:4]
 xyxy = np.round(xyxy).astype('int')
 rb, re, cb, ce = xyxy[1], xyxy[3], xyxy[0], xyxy[2]
 r_pad = int((re - rb) * facemask_dilation_ratio)
 c_pad = int((ce - cb) * facemask_dilation_ratio)
 face_mask[rb - r_pad : re + r_pad, cb - c_pad : ce + c_pad] = 255
+
+ #### face crop
 r_pad_crop = int((re - rb) * facecrop_dilation_ratio)
 c_pad_crop = int((ce - cb) * facecrop_dilation_ratio)
 crop_rect = [max(0, cb - c_pad_crop), max(0, rb - r_pad_crop), min(ce + c_pad_crop, face_img.shape[1]), min(re + r_pad_crop, face_img.shape[0])]

 face_mask = crop_and_pad(face_mask, crop_rect)
 face_img = cv2.resize(face_img, (width, height))
 face_mask = cv2.resize(face_mask, (width, height))

 ref_image_pil = Image.fromarray(face_img[:, :, [2, 1, 0]])
+ face_mask_tensor = torch.Tensor(face_mask).to(dtype=weight_dtype, device=device).unsqueeze(0).unsqueeze(0).unsqueeze(0) / 255.0
+
 video = pipe(
 ref_image_pil,
 uploaded_audio,

 length,
 steps,
 cfg,
+ generator=generator,
 audio_sample_rate=sample_rate,
 context_frames=context_frames,
 fps=fps,
 context_overlap=context_overlap
 ).videos

 save_dir = Path("output/tmp")
 save_dir.mkdir(exist_ok=True, parents=True)

 return final_output_path

+ @spaces.GPU(duration=60) # Allow 1 minute for voice cloning
+ def get_maskGCT_TTS(prompt_audio_maskGCT, audio_to_clone):
+ try:
+ client = Client("amphion/maskgct")
+ except:
+ raise gr.Error(f"amphion/maskgct space's api might not be ready, please wait, or upload an audio instead.")

+ result = client.predict(
+ prompt_wav = handle_file(audio_to_clone),
+ target_text = prompt_audio_maskGCT,
+ target_len=-1,
+ n_timesteps=25,
+ api_name="/predict"
+ )
+ print(result)
+ return result, gr.update(value=result, visible=True)
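The `get_maskGCT_TTS` helper above drives the remote amphion/maskgct Space through `gradio_client`; the keyword names (`prompt_wav`, `target_text`, `target_len`, `n_timesteps`) have to match whatever `/predict` currently exposes, and hosted Spaces do change their signatures. When in doubt, the client can describe the endpoint before anything is hardcoded; a small inspection-only sketch:

```python
from gradio_client import Client

# Print the remote Space's named endpoints and their parameters
# before relying on specific keyword arguments.
client = Client("amphion/maskgct")
client.view_api()
```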
 
 with gr.Blocks() as demo:
 gr.Markdown('# EchoMimic')
 gr.Markdown('## Lifelike Audio-Driven Portrait Animations through Editable Landmark Conditioning')
+ gr.Markdown('Running on Spaces ZeroGPU: Dynamic GPU allocation for optimal resource usage')
 gr.HTML("""
 <div style="display:flex;column-gap:4px;">
 <a href='https://badtobest.github.io/echomimic.html'><img src='https://img.shields.io/badge/Project-Page-blue'></a>

 <a href='https://arxiv.org/abs/2407.08136'><img src='https://img.shields.io/badge/Paper-Arxiv-red'></a>
 </div>
 """)
 with gr.Row():
+ with gr.Column():
 uploaded_img = gr.Image(type="filepath", label="Reference Image")
+ uploaded_audio = gr.Audio(type="filepath", label="Input Audio", format="wav")
+ preprocess_audio_file = gr.File(visible=False)
+ with gr.Accordion(label="Voice cloning with MaskGCT", open=False):
+ prompt_audio_maskGCT = gr.Textbox(
+ label = "Text to synthetize",
+ lines = 2,
+ max_lines = 2,
+ elem_id = "text-synth-maskGCT"
+ )
+ audio_to_clone_maskGCT = gr.Audio(
+ label = "Voice to clone",
+ type = "filepath",
+ elem_id = "audio-clone-elm-maskGCT"
+ )
+ gen_maskGCT_voice_btn = gr.Button("Generate voice clone (optional)")
 with gr.Accordion(label=advanced_settings_label, open=False):
 with gr.Row():
 width = gr.Slider(label="Width", minimum=128, maximum=1024, value=default_values["width"], interactive=available_property)

 sample_rate = gr.Slider(label="Sample Rate", minimum=8000, maximum=48000, step=1000, value=default_values["sample_rate"], interactive=available_property)
 fps = gr.Slider(label="FPS", minimum=1, maximum=60, step=1, value=default_values["fps"], interactive=available_property)
 device = gr.Radio(label="Device", choices=["cuda", "cpu"], value=default_values["device"], interactive=available_property)
 generate_button = gr.Button("Generate Video")
+ with gr.Column():
 output_video = gr.Video()
+ gr.Examples(
+ label = "Portrait examples",
+ examples = [
+ ['assets/test_imgs/a.png'],
+ ['assets/test_imgs/b.png'],
+ ['assets/test_imgs/c.png'],
+ ['assets/test_imgs/d.png'],
+ ['assets/test_imgs/e.png']
+ ],
+ inputs = [uploaded_img]
+ )
+ gr.Examples(
+ label = "Audio examples",
+ examples = [
+ ['assets/test_audios/chunnuanhuakai.wav'],
+ ['assets/test_audios/chunwang.wav'],
+ ['assets/test_audios/echomimic_en_girl.wav'],
+ ['assets/test_audios/echomimic_en.wav'],
+ ['assets/test_audios/echomimic_girl.wav'],
+ ['assets/test_audios/echomimic.wav'],
+ ['assets/test_audios/jane.wav'],
+ ['assets/test_audios/mei.wav'],
+ ['assets/test_audios/walden.wav'],
+ ['assets/test_audios/yun.wav'],
+ ],
+ inputs = [uploaded_audio]
+ )
+ gr.HTML("""
+ <div style="display:flex;column-gap:4px;">
+ <a href="https://huggingface.co/spaces/fffiloni/EchoMimic?duplicate=true">
+ <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-xl.svg" alt="Duplicate this Space">
+ </a>
+ <a href="https://huggingface.co/fffiloni">
+ <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/follow-me-on-HF-xl-dark.svg" alt="Follow me on HF">
+ </a>
+ </div>
+ """)
+
+ def trim_audio(file_path, output_path, max_duration=10):
+ # Load the audio file
+ audio = AudioSegment.from_wav(file_path)

+ # Convert max duration to milliseconds
+ max_duration_ms = max_duration * 1000
+
+ # Trim the audio if it's longer than max_duration
+ if len(audio) > max_duration_ms:
+ audio = audio[:max_duration_ms]
+
+ # Export the trimmed audio
+ audio.export(output_path, format="wav")
+ print(f"Audio trimmed and saved as {output_path}")
+ return output_path
+
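`trim_audio()` reads the clip with `AudioSegment.from_wav`, which is sufficient here because the `gr.Audio` input is pinned to `format="wav"`. If other containers ever had to be accepted, pydub's general entry point is `AudioSegment.from_file`, which decodes through ffmpeg; a two-line sketch with a hypothetical filename:

```python
from pydub import AudioSegment

# Decode any ffmpeg-supported container, then keep the first 10 seconds.
clip = AudioSegment.from_file("voice_sample.m4a")[:10 * 1000]
clip.export("voice_sample_trimmed.wav", format="wav")
```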
+ def generate_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device, progress=gr.Progress(track_tqdm=True)):
+ # First, check and trim audio if needed
+ if is_shared_ui:
+ gr.Info("Trimming audio to max 10 seconds. Duplicate the space for unlimited audio length.")
+ uploaded_audio = trim_audio(uploaded_audio, "trimmed_audio.wav")
+
+ # Process the video with ZeroGPU support
 final_output_path = process_video(
+ uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device
 )
 return final_output_path

+ gen_maskGCT_voice_btn.click(
+ fn = get_maskGCT_TTS,
+ inputs = [prompt_audio_maskGCT, audio_to_clone_maskGCT],
+ outputs = [uploaded_audio, preprocess_audio_file],
+ queue = False,
+ show_api = False
+ )
+
 generate_button.click(
 generate_video,
 inputs=[
 uploaded_img,
+ uploaded_audio,
+ width,
+ height,
+ length,
+ seed,
+ facemask_dilation_ratio,
+ facecrop_dilation_ratio,
+ context_frames,
+ context_overlap,
+ cfg,
+ steps,
+ sample_rate,
+ fps,
+ device
 ],
 outputs=output_video,
+ show_api=False
 )
+ parser = argparse.ArgumentParser(description='EchoMimic with ZeroGPU Support')
 parser.add_argument('--server_name', type=str, default='0.0.0.0', help='Server name')
 parser.add_argument('--server_port', type=int, default=7680, help='Server port')
 args = parser.parse_args()

 if __name__ == '__main__':
+ demo.queue(max_size=3).launch(show_api=False, show_error=True, ssr_mode=False)
 #demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)