# Hugging Face Space source file (revision f7e8357, file size: 2,614 bytes).
# Space status: Running on L4.
import os
import shutil
from huggingface_hub import snapshot_download
import gradio as gr
os.chdir(os.path.dirname(os.path.abspath(__file__)))
hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")
joyhallo_dir = snapshot_download(repo_id="jdh-algo/JoyHallo-v1", local_dir="pretrained_models/joyhallo")
wav_dir = snapshot_download(repo_id="TencentGameMate/chinese-wav2vec2-base", local_dir="pretrained_models/chinese-wav2vec2-base")
print(hallo_dir, joyhallo_dir)
print(os.listdir(hallo_dir))
from scripts.inference import predict
def run_inference(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)):
return predict(source_image, driving_audio, 1.0, 1.0, 1.0, 1.2)
# Custom stylesheet passed to gr.Blocks(css=...). It styles two banner
# containers, div#warning-ready and div#warning-duplicate (background,
# padding, margins, text colors), the links/paragraph inside their ".actions"
# row, and a dark-theme override for #warning-duplicate.
# NOTE(review): no component visible in this file is given the
# "warning-ready" or "warning-duplicate" element id, so these rules may
# currently match nothing - confirm before pruning.
css = '''
div#warning-ready {
background-color: #ecfdf5;
padding: 0 16px 16px;
margin: 20px 0;
color: #030303!important;
}
div#warning-ready > .gr-prose > h2, div#warning-ready > .gr-prose > p {
color: #057857!important;
}
div#warning-duplicate {
background-color: #ebf5ff;
padding: 0 16px 16px;
margin: 20px 0;
color: #030303!important;
}
div#warning-duplicate > .gr-prose > h2, div#warning-duplicate > .gr-prose > p {
color: #0f4592!important;
}
div#warning-duplicate strong {
color: #0f4592;
}
p.actions {
display: flex;
align-items: center;
margin: 20px 0;
}
div#warning-duplicate .actions a {
display: inline-block;
margin-right: 10px;
}
.dark #warning-duplicate {
background-color: #0c0c0c !important;
border: 1px solid white !important;
}
'''
# Build the demo page: a title, a short description, the input requirements,
# and a two-column row with the inputs on the left and the result on the right.
with gr.Blocks(css=css) as demo:
    gr.Markdown("# JoyHallo: Digital human model for Mandarin")
    gr.Markdown("Generate talking head avatars driven with Mandarin speech.")
    gr.Markdown("""
Data requirements:
Image:
1. Cropped to square shape.
2. Face should be facing forward and occupy 50%-70% of the image area.
Audio:
1. Audio in wav format.
2. Mandarin or English or mixed, with clear audio and suitable background music.
! Important: Too long audio will cause a very long processing time, please keep the audio length within 5s.
""")
    with gr.Row():
        with gr.Column():
            # Both inputs use type="filepath", so the callback receives
            # file paths rather than decoded image/audio data.
            avatar_face = gr.Image(type="filepath", label="Face")
            driving_audio = gr.Audio(type="filepath", label="Driving audio")
            generate = gr.Button("Generate")
        with gr.Column():
            output_video = gr.Video(label="Your talking head")

    # Event listeners must be registered inside the Blocks context.
    generate.click(
        fn=run_inference,
        inputs=[avatar_face, driving_audio],
        outputs=output_video,
    )

# Start the Gradio server for the demo.
demo.launch()