import os
import argparse
import uuid

from huggingface_hub import snapshot_download
import gradio as gr
from PIL import Image
import soundfile as sf

# Run relative to this file so config and model paths resolve correctly.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

from scripts.inference import inference_process

# Detect whether this is the original shared Space; inference is only allowed
# on duplicated (private) instances. Fall back to "" when SPACE_ID is unset
# (e.g. running locally) to avoid a KeyError.
is_shared_ui = "fudan-generative-ai/hallo" in os.environ.get("SPACE_ID", "")

if not is_shared_ui:
    hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")


def check_image_square(image_path):
    """Reject source images that are not square."""
    image = Image.open(image_path)
    if image.width != image.height:
        raise gr.Error("The uploaded image is not square. Please upload a square image.")
    return image_path


def convert_audio_to_wav(audio_path):
    """Convert the driving audio to WAV if it is in another format."""
    if not audio_path.lower().endswith(".wav"):
        audio_data, samplerate = sf.read(audio_path)
        wav_path = audio_path.rsplit(".", 1)[0] + ".wav"
        sf.write(wav_path, audio_data, samplerate)
        return wav_path
    return audio_path


def run_inference(source_image, driving_audio, pose_weight, face_weight, lip_weight, face_expand_ratio,
                  progress=gr.Progress(track_tqdm=True)):
    if is_shared_ui:
        raise gr.Error("This Space only works in duplicated instances")

    # Unique output name so concurrent runs do not overwrite each other.
    unique_id = uuid.uuid4()
    args = argparse.Namespace(
        config="configs/inference/default.yaml",
        source_image=source_image,
        driving_audio=driving_audio,
        output=f"output-{unique_id}.mp4",
        pose_weight=pose_weight,
        face_weight=face_weight,
        lip_weight=lip_weight,
        face_expand_ratio=face_expand_ratio,
        checkpoint=None,
    )
    inference_process(args)
    return f"output-{unique_id}.mp4"


with gr.Blocks(theme="freddyaboulton/dracula_revamped@0.3.8") as demo:
    gr.Markdown(
        """
        # Talking Head Generation
        Upload a face image and driving audio, and adjust the weights to generate a talking head video.

        > **Note:**
        > - The face should be the main focus, making up 50%-70% of the image.
        > - The face should be facing forward, with a rotation angle of less than 30° (no side profiles).
        > - To make it work, duplicate the Space and run it on your own profile using a private GPU.
        > - An L4 costs US$0.80/h.
        """
    )
    with gr.Row():
        with gr.Column():
            avatar_face = gr.Image(type="filepath", label="Face", elem_id="face-input")
            driving_audio = gr.Audio(type="filepath", label="Driving Audio", elem_id="audio-input")
        with gr.Column():
            with gr.Accordion("Advanced Settings", open=False):
                pose_weight = gr.Slider(minimum=0.0, value=1.5, label="Pose Weight")
                face_weight = gr.Slider(minimum=0.0, value=1.0, label="Face Weight")
                lip_weight = gr.Slider(minimum=0.0, value=1.1, label="Lip Weight")
                face_expand_ratio = gr.Slider(minimum=0.0, value=1.2, label="Face Expand Ratio")
            generate = gr.Button("Generate", elem_id="generate-button")
            output_video = gr.Video(label="Your Talking Head", elem_id="output-video")

    # Validate/normalize inputs as soon as they are uploaded.
    avatar_face.change(fn=check_image_square, inputs=avatar_face, outputs=avatar_face)
    driving_audio.change(fn=convert_audio_to_wav, inputs=driving_audio, outputs=driving_audio)

    generate.click(
        fn=run_inference,
        inputs=[avatar_face, driving_audio, pose_weight, face_weight, lip_weight, face_expand_ratio],
        outputs=output_video,
    )

demo.launch()
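
# The sketch below is hypothetical and not part of the app: it shows how the same
# inference step could be invoked programmatically, without the Gradio UI.
# "face.jpg" and "speech.wav" are placeholder paths you would replace with a real
# square portrait and an audio file.
#
#   args = argparse.Namespace(
#       config="configs/inference/default.yaml",
#       source_image="face.jpg",
#       driving_audio="speech.wav",
#       output="output-demo.mp4",
#       pose_weight=1.5,
#       face_weight=1.0,
#       lip_weight=1.1,
#       face_expand_ratio=1.2,
#       checkpoint=None,
#   )
#   inference_process(args)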