fffiloni committed
Commit f64097e
1 parent: f905233

Update app.py

Files changed (1): app.py (+70 -28)
app.py CHANGED
@@ -17,15 +17,15 @@ def load_audio(audio_listed):
 def execute_command(command: str) -> None:
     subprocess.run(command, check=True)
 
-def infer(audio_input, image_path):
+def infer(audio_input, image_path, emotional_style):
 
-    output_name = "acknowledgement_english@M030_front_neutral_level1_001@male_face"
+    output_name = "lipsynced_result"
 
     command = [
         f"python",
         f"inference_for_demo_video.py",
         f"--wav_path={audio_input}",
-        f"--style_clip_path=data/style_clip/3DMM/M030_front_neutral_level1_001.mat",
+        f"--style_clip_path=data/style_clip/3DMM/{emotional_style}",
         f"--pose_path=data/pose/RichardShelby_front_neutral_level1_001.mat",
         f"--image_path={image_path}",
         f"--cfg_scale=1.0",
@@ -46,8 +46,11 @@ def infer(audio_input, image_path):
 with gr.Blocks() as demo:
     with gr.Column():
         gr.HTML("""
-        <h2 style="text-align: center;">DreamTalk</h2>
-        <p style="text-align: center;"></p>
+        <h2 style="text-align: center;">DreamTalk: When Expressive Talking Head Generation Meets Diffusion Probabilistic Models</h2>
+        <p style="text-align: center;">
+            DreamTalk is a diffusion-based audio-driven expressive talking head generation framework that can produce high-quality talking head videos across diverse speaking styles. DreamTalk exhibits robust performance with a diverse array of inputs, including songs, speech in multiple languages, noisy audio, and out-of-domain portraits.
+            <img src="https://github.com/ali-vilab/dreamtalk/raw/main/media/teaser.gif" />
+        </p>
         """)
         with gr.Row():
             with gr.Column():
@@ -56,38 +59,77 @@ with gr.Blocks() as demo:
                 examples = [
                     "data/src_img/uncropped/face3.png",
                     "data/src_img/uncropped/male_face.png",
-                    "data/src_img/uncropped/uncut_src_img.jpg"
+                    "data/src_img/uncropped/uncut_src_img.jpg",
+                    "data/src_img/cropped/chpa5.png",
+                    "data/src_img/cropped/cut_img.png",
+                    "data/src_img/cropped/f30.png",
+                    "data/src_img/cropped/menglu2.png",
+                    "data/src_img/cropped/nscu2.png",
+                    "data/src_img/cropped/zp1.png",
+                    "data/src_img/cropped/zt12.png"
                 ],
                 inputs=[image_path]
             )
-            audio_input = gr.Audio(label="Audio input", type="filepath", sources=["upload"])
-            audio_list = gr.Dropdown(
-                label="Choose an audio (optional)",
-                choices=[
-                    "German1.wav", "German2.wav", "German3.wav", "German4.wav",
-                    "acknowledgement_chinese.m4a", "acknowledgement_english.m4a",
-                    "chinese1_haierlizhi.wav", "chinese2_guanyu.wav",
-                    "french1.wav", "french2.wav", "french3.wav",
-                    "italian1.wav", "italian2.wav", "italian3.wav",
-                    "japan1.wav", "japan2.wav", "japan3.wav",
-                    "korean1.wav", "korean2.wav", "korean3.wav",
-                    "noisy_audio_cafeter_snr_0.wav", "noisy_audio_meeting_snr_0.wav", "noisy_audio_meeting_snr_10.wav", "noisy_audio_meeting_snr_20.wav", "noisy_audio_narrative.wav", "noisy_audio_office_snr_0.wav", "out_of_domain_narrative.wav",
-                    "spanish1.wav", "spanish2.wav", "spanish3.wav"
-                ]
-            )
-            audio_list.change(
-                fn = load_audio,
-                inputs = [audio_list],
-                outputs = [audio_input]
-            )
+            audio_input = gr.Audio(label="Audio input", type="filepath", sources=["upload"], value="data/audio/acknowledgement_english.m4a")
+            with gr.Row():
+                audio_list = gr.Dropdown(
+                    label="Choose an audio (optional)",
+                    choices=[
+                        "German1.wav", "German2.wav", "German3.wav", "German4.wav",
+                        "acknowledgement_chinese.m4a", "acknowledgement_english.m4a",
+                        "chinese1_haierlizhi.wav", "chinese2_guanyu.wav",
+                        "french1.wav", "french2.wav", "french3.wav",
+                        "italian1.wav", "italian2.wav", "italian3.wav",
+                        "japan1.wav", "japan2.wav", "japan3.wav",
+                        "korean1.wav", "korean2.wav", "korean3.wav",
+                        "noisy_audio_cafeter_snr_0.wav", "noisy_audio_meeting_snr_0.wav", "noisy_audio_meeting_snr_10.wav", "noisy_audio_meeting_snr_20.wav", "noisy_audio_narrative.wav", "noisy_audio_office_snr_0.wav", "out_of_domain_narrative.wav",
+                        "spanish1.wav", "spanish2.wav", "spanish3.wav"
+                    ],
+                    value = "acknowledgement_english.m4a"
+                )
+                audio_list.change(
+                    fn = load_audio,
+                    inputs = [audio_list],
+                    outputs = [audio_input]
+                )
+                emotional_style = gr.Dropdown(
+                    label = "emotional style",
+                    choices = [
+                        "M030_front_angry_level3_001.mat",
+                        "M030_front_contempt_level3_001.mat",
+                        "M030_front_disgusted_level3_001.mat",
+                        "M030_front_fear_level3_001.mat",
+                        "M030_front_happy_level3_001.mat",
+                        "M030_front_neutral_level1_001.mat",
+                        "M030_front_sad_level3_001.mat",
+                        "M030_front_surprised_level3_001.mat",
+                        "W009_front_angry_level3_001.mat",
+                        "W009_front_contempt_level3_001.mat",
+                        "W009_front_disgusted_level3_001.mat",
+                        "W009_front_fear_level3_001.mat",
+                        "W009_front_happy_level3_001.mat",
+                        "W009_front_neutral_level1_001.mat",
+                        "W009_front_sad_level3_001.mat",
+                        "W009_front_surprised_level3_001.mat",
+                        "W011_front_angry_level3_001.mat",
+                        "W011_front_contempt_level3_001.mat",
+                        "W011_front_disgusted_level3_001.mat",
+                        "W011_front_fear_level3_001.mat",
+                        "W011_front_happy_level3_001.mat",
+                        "W011_front_neutral_level1_001.mat",
+                        "W011_front_sad_level3_001.mat",
+                        "W011_front_surprised_level3_001.mat"
+                    ],
+                    value = "M030_front_neutral_level1_001.mat"
+                )
             run_btn = gr.Button("Run")
         with gr.Column():
             output_video = gr.Video(format="mp4")
 
     run_btn.click(
         fn = infer,
-        inputs = [audio_input, image_path],
+        inputs = [audio_input, image_path, emotional_style],
         outputs = [output_video]
     )
 
-demo.launch()
+demo.queue().launch()
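`load_audio`, the callback wired to `audio_list.change` above, sits outside every hunk. A sketch of the shape such a callback typically takes; the `data/audio/` prefix is an assumption inferred from the new `gr.Audio` default value:

    def load_audio(audio_listed):
        # Map the dropdown choice to a file path; gr.Audio(type="filepath")
        # then picks it up as the new input. Directory layout is assumed.
        if audio_listed is None:
            return None
        return f"data/audio/{audio_listed}"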
 
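The final change swaps `demo.launch()` for `demo.queue().launch()`, which routes events through Gradio's request queue; that suits a click handler that blocks on a long-running subprocess. Queue capacity can also be capped, e.g. (illustrative, not part of this commit):

    demo.queue(max_size=20).launch()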