import argparse
import os
import uuid

import gradio as gr
from huggingface_hub import snapshot_download

# Run from the script's directory so relative paths (configs/, pretrained_models/) resolve.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

from scripts.inference import inference_process

# Download the Hallo checkpoints into ./pretrained_models (cached across restarts).
hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")


def run_inference(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)):
    # Unique output filename so concurrent requests don't overwrite each other.
    unique_id = uuid.uuid4()
    args = argparse.Namespace(
        config='configs/inference/default.yaml',
        source_image=source_image,
        driving_audio=driving_audio,
        output=f'output-{unique_id}.mp4',
        pose_weight=1.0,
        face_weight=1.0,
        lip_weight=1.0,
        face_expand_ratio=1.2,
        checkpoint=None,
    )
    inference_process(args)
    return f'output-{unique_id}.mp4'


iface = gr.Interface(
    title="Demo for Hallo: Hierarchical Audio-Driven Visual Synthesis for Portrait Image Animation",
    description="Generate talking-head avatars driven by audio. **Every 10 seconds of output takes ~1 minute to generate.** Duplicate the Space for private use, or try it for free on Google Colab.",
    fn=run_inference,
    inputs=[gr.Image(type="filepath"), gr.Audio(type="filepath")],
    outputs="video",
    cache_examples=False,
)

iface.launch(share=True)
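# Note: each request can run for minutes, so the plain launch() above may hit
# HTTP timeouts under concurrent load. A minimal sketch of one mitigation,
# assuming a Gradio version that provides the queue() API, is to serialize
# requests through Gradio's built-in queue by replacing the launch call with:
#
#   iface.queue().launch(share=True)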