fffiloni committed on
Commit 446a654
1 Parent(s): e2b4c82

Update app.py

Files changed (1): app.py +277 -51
app.py CHANGED
@@ -2,6 +2,10 @@ import os
 import shutil
 from huggingface_hub import snapshot_download
 import gradio as gr
 os.chdir(os.path.dirname(os.path.abspath(__file__)))
 from scripts.inference import inference_process
 import argparse
@@ -12,7 +16,118 @@ is_shared_ui = True if "fudan-generative-ai/hallo" in os.environ['SPACE_ID'] else False
 if(not is_shared_ui):
     hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")
 
- def run_inference(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)):
     if is_shared_ui:
         raise gr.Error("This Space only works in duplicated instances")
 
@@ -33,8 +148,61 @@ def run_inference(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)):
     inference_process(args)
     return f'output-{unique_id}.mp4'
 
 css = '''
 div#warning-ready {
     background-color: #ecfdf5;
     padding: 0 16px 16px;
@@ -72,54 +240,112 @@ div#warning-duplicate .actions a {
 '''
 
 with gr.Blocks(css=css) as demo:
-     if is_shared_ui:
-         top_description = gr.HTML(f'''
-         <div class="gr-prose">
-             <h2 class="custom-color"><svg xmlns="http://www.w3.org/2000/svg" width="18px" height="18px" style="margin-right: 0px;display: inline-block;"fill="none"><path fill="#fff" d="M7 13.2a6.3 6.3 0 0 0 4.4-10.7A6.3 6.3 0 0 0 .6 6.9 6.3 6.3 0 0 0 7 13.2Z"/><path fill="#fff" fill-rule="evenodd" d="M7 0a6.9 6.9 0 0 1 4.8 11.8A6.9 6.9 0 0 1 0 7 6.9 6.9 0 0 1 7 0Zm0 0v.7V0ZM0 7h.6H0Zm7 6.8v-.6.6ZM13.7 7h-.6.6ZM9.1 1.7c-.7-.3-1.4-.4-2.2-.4a5.6 5.6 0 0 0-4 1.6 5.6 5.6 0 0 0-1.6 4 5.6 5.6 0 0 0 1.6 4 5.6 5.6 0 0 0 4 1.7 5.6 5.6 0 0 0 4-1.7 5.6 5.6 0 0 0 1.7-4 5.6 5.6 0 0 0-1.7-4c-.5-.5-1.1-.9-1.8-1.2Z" clip-rule="evenodd"/><path fill="#000" fill-rule="evenodd" d="M7 2.9a.8.8 0 1 1 0 1.5A.8.8 0 0 1 7 3ZM5.8 5.7c0-.4.3-.6.6-.6h.7c.3 0 .6.2.6.6v3.7h.5a.6.6 0 0 1 0 1.3H6a.6.6 0 0 1 0-1.3h.4v-3a.6.6 0 0 1-.6-.7Z" clip-rule="evenodd"/></svg>
-             Attention: this Space need to be duplicated to work</h2>
-             <p class="main-message custom-color">
-                 To make it work, <strong>duplicate the Space</strong> and run it on your own profile using a <strong>private</strong> GPU.<br />
-                 An L4 costs <strong>US$0.80/h</strong>
-             </p>
-             <p class="actions custom-color">
-                 <a href="https://huggingface.co/spaces/{os.environ['SPACE_ID']}?duplicate=true">
-                     <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-lg-dark.svg" alt="Duplicate this Space" />
-                 </a>
-                 to start generate your talking head
-             </p>
-         </div>
-         ''', elem_id="warning-duplicate")
-     gr.Markdown("# Demo for Hallo: Hierarchical Audio-Driven Visual Synthesis for Portrait Image Animation")
-     gr.Markdown("Generate talking head avatars driven from audio. **5 seconds of audio takes >10 minutes to generate on an L4** - duplicate the space for private use or try for free on Google Colab")
-     gr.Markdown("""
-     Hallo has a few simple requirements for input data:
-
-     For the source image:
-
-     1. It should be cropped into squares.
-     2. The face should be the main focus, making up 50%-70% of the image.
-     3. The face should be facing forward, with a rotation angle of less than 30° (no side profiles).
-
-     For the driving audio:
-
-     1. It must be in WAV format.
-     2. It must be in English since our training datasets are only in this language.
-     3. Ensure the vocals are clear; background music is acceptable.
-
-     We have provided some [samples](https://huggingface.co/datasets/fudan-generative-ai/hallo_inference_samples) for your reference.
-     """)
-     with gr.Row():
-         with gr.Column():
-             avatar_face = gr.Image(type="filepath", label="Face")
-             driving_audio = gr.Audio(type="filepath", label="Driving audio")
-             generate = gr.Button("Generate")
-         with gr.Column():
-             output_video = gr.Video(label="Your talking head")
-
-     generate.click(
-         fn=run_inference,
-         inputs=[avatar_face, driving_audio],
-         outputs=output_video
     )
-
-     demo.launch(show_error=True)

 import shutil
 from huggingface_hub import snapshot_download
 import gradio as gr
+ from gradio_client import Client, handle_file
+ from mutagen.mp3 import MP3
+ from pydub import AudioSegment
+ from PIL import Image
 os.chdir(os.path.dirname(os.path.abspath(__file__)))
 from scripts.inference import inference_process
 import argparse
 
 if(not is_shared_ui):
     hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")
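The new imports pull in libraries the previous app did not use. gradio_client and Pillow already ship as Gradio dependencies, but mutagen and pydub do not, and pydub relies on ffmpeg to decode MP3 input. The Space's dependency files are not shown in this diff, but they would presumably need additions along these lines (assumed, not part of this commit):

```text
# requirements.txt (assumed)
mutagen
pydub

# packages.txt (assumed) - system package used by pydub for MP3 decoding
ffmpeg
```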
 
+ def is_mp3(file_path):
+     try:
+         audio = MP3(file_path)
+         return True
+     except Exception as e:
+         return False
+ 
+ def convert_mp3_to_wav(mp3_file_path, wav_file_path):
+     # Load the MP3 file
+     audio = AudioSegment.from_mp3(mp3_file_path)
+     # Export as WAV file
+     audio.export(wav_file_path, format="wav")
+     return wav_file_path
+ 
+ 
+ def trim_audio(file_path, output_path, max_duration=4000):
+     # Load the audio file
+     audio = AudioSegment.from_wav(file_path)
+ 
+     # Check the length of the audio in milliseconds
+     audio_length = len(audio)
+ 
+     # If the audio is longer than the maximum duration, trim it
+     if audio_length > max_duration:
+         trimmed_audio = audio[:max_duration]
+     else:
+         trimmed_audio = audio
+ 
+     # Export the trimmed audio to a new file
+     trimmed_audio.export(output_path, format="wav")
+ 
+     return output_path
+ 
+ 
+ def add_silence_to_wav(wav_file_path, duration_s=1):
+     # Load the WAV file
+     audio = AudioSegment.from_wav(wav_file_path)
+     # Create 1 second of silence
+     silence = AudioSegment.silent(duration=duration_s * 1000)  # duration is in milliseconds
+     # Add silence to the end of the audio file
+     audio_with_silence = audio + silence
+     # Export the modified audio
+     audio_with_silence.export(wav_file_path, format="wav")
+     return wav_file_path
+ 
+ def check_mp3(file_path):
+ 
+     if is_mp3(file_path):
+         wav_file_path = os.path.splitext(file_path)[0] + '.wav'
+         converted_audio = convert_mp3_to_wav(file_path, wav_file_path)
+         print(f"File converted to {wav_file_path}")
+ 
+         return converted_audio
+     else:
+         print("The file is not an MP3 file.")
+ 
+         return file_path
+ 
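Taken together, these helpers normalize whatever audio the user provides: an MP3 is converted to WAV, the clip is cut to at most 4 seconds, and one second of silence is appended at the end. A minimal usage sketch, assuming a local speech.mp3 (the file names are placeholders, not from the commit):

```python
# Illustrative only - file names are placeholders, not part of this commit.
wav_path = check_mp3("speech.mp3")                    # MP3 input -> "speech.wav"; other formats are returned unchanged
wav_path = trim_audio(wav_path, "trimmed_audio.wav")  # keep at most 4000 ms
wav_path = add_silence_to_wav(wav_path)               # append 1 s of silence in place
```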
+ def convert_webp_to_png(webp_file):
+ 
+     # Open the WebP image
+     webp_image = Image.open(webp_file)
+ 
+     # Convert and save as PNG
+     webp_image.save("png_converted_image.png", "PNG")
+ 
+     return "png_converted_image.png"
+ 
+ def generate_portrait(prompt_image):
+     if prompt_image is None or prompt_image == "":
+         raise gr.Error("Can't generate a portrait without a prompt !")
+     client = Client("AP123/SDXL-Lightning")
+     result = client.predict(
+         prompt_image,
+         "4-Step",
+         api_name="/generate_image"
+     )
+     print(result)
+ 
+     return result
+ 
+ def generate_voice(prompt_audio, voice_description):
+     if prompt_audio is None or prompt_audio == "":
+         raise gr.Error("Can't generate a voice without text to synthesize!")
+     if voice_description is None or voice_description == "":
+         gr.Info(
+             "For better control, you may want to provide a voice character description next time.",
+             duration = 10,
+             visible = True
+         )
+     client = Client("parler-tts/parler_tts_mini")
+     result = client.predict(
+         text=prompt_audio,
+         description=voice_description,
+         api_name="/gen_tts"
+     )
+     print(result)
+     return result
+ 
+ def get_whisperspeech(prompt_audio_whisperspeech, audio_to_clone):
+     client = Client("collabora/WhisperSpeech")
+     result = client.predict(
+         multilingual_text=prompt_audio_whisperspeech,
+         speaker_audio=handle_file(audio_to_clone),
+         speaker_url="",
+         cps=14,
+         api_name="/whisper_speech_demo"
+     )
+     print(result)
+     return result
+ 
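generate_portrait, generate_voice and get_whisperspeech all follow the same gradio_client pattern: connect to a public Space and call one of its named endpoints. Since those endpoints belong to third-party Spaces and can change, a quick way to confirm the expected parameters is view_api (a sketch, not part of the commit):

```python
# Sketch: list a remote Space's endpoints before relying on a hard-coded api_name.
from gradio_client import Client

Client("parler-tts/parler_tts_mini").view_api()  # should list /gen_tts and its parameters
Client("collabora/WhisperSpeech").view_api()     # should list /whisper_speech_demo
```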
+ def run_hallo(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)):
     if is_shared_ui:
         raise gr.Error("This Space only works in duplicated instances")
 
     inference_process(args)
     return f'output-{unique_id}.mp4'
 
+ def generate_talking_portrait(portrait, voice):
+ 
+     if portrait is None:
+         raise gr.Error("Please provide a portrait to animate.")
+     if voice is None:
+         raise gr.Error("Please provide audio (4 seconds max).")
+ 
+     # trim audio
+     input_file = voice
+     trimmed_output_file = "trimmed_audio.wav"
+     trimmed_output_file = trim_audio(input_file, trimmed_output_file)
+     voice = trimmed_output_file
+ 
+     ready_audio = add_silence_to_wav(voice)
+     print(f"1 second of silence added to {voice}")
+ 
+     # call hallo
+     talking_portrait_vid = run_hallo(portrait, ready_audio)
+     return talking_portrait_vid
+ 
 
 css = '''
+ #col-container {
+     margin: 0 auto;
+ }
+ #main-group {
+     background-color: none;
+ }
+ .tabs {
+     background-color: unset;
+ }
+ #image-block {
+     flex: 1;
+ }
+ #video-block {
+     flex: 9;
+ }
+ #audio-block, #audio-clone-elm {
+     flex: 1;
+ }
+ #text-synth, #voice-desc, #text-synth-wsp {
+     height: 180px;
+ }
+ #audio-column, #result-column {
+     display: flex;
+ }
+ #gen-voice-btn {
+     flex: 1;
+ }
+ #parler-tab, #whisperspeech-tab {
+     padding: 0;
+ }
+ #main-submit {
+     flex: 1;
+ }
 div#warning-ready {
     background-color: #ecfdf5;
     padding: 0 16px 16px;
 
 '''
 
 with gr.Blocks(css=css) as demo:
+ with gr.Column(elem_id="col-container"):
244
+ gr.Markdown("""
245
+ # Parler X Hallo
246
+ Generate talking portraits
247
+ """)
248
+ with gr.Group(elem_id="main-group"):
249
+ with gr.Row():
250
+ with gr.Column():
251
+ portrait = gr.Image(
252
+ sources=["upload"],
253
+ type="filepath",
254
+ format="png",
255
+ elem_id="image-block"
256
+ )
257
+
258
+ prompt_image = gr.Textbox(
259
+ label="Generate image",
260
+ lines=3
261
+ )
262
+
263
+ gen_image_btn = gr.Button("Generate portrait (optional)")
264
+
265
+ with gr.Column(elem_id="audio-column"):
266
+ voice = gr.Audio(
267
+ type="filepath",
268
+ max_length=4000,
269
+ elem_id="audio-block"
270
+ )
271
+
272
+ with gr.Tab("Parler TTS", elem_id="parler-tab"):
273
+
274
+ prompt_audio = gr.Textbox(
275
+ label="Text to synthetize",
276
+ lines=4,
277
+ max_lines=4,
278
+ elem_id="text-synth"
279
+ )
280
+
281
+ voice_description = gr.Textbox(
282
+ label="Voice description",
283
+ lines=4,
284
+ max_lines=4,
285
+ elem_id="voice-desc"
286
+ )
287
+
288
+ gen_voice_btn = gr.Button("Generate voice (optional)")
289
+
290
+ with gr.Tab("WhisperSpeech", elem_id="whisperspeech-tab"):
291
+ prompt_audio_whisperspeech = gr.Textbox(
292
+ label="Text to synthetize",
293
+ lines=4,
294
+ max_lines=4,
295
+ elem_id="text-synth-wsp"
296
+ )
297
+ audio_to_clone = gr.Audio(
298
+ label="Voice to clone",
299
+ type="filepath",
300
+ elem_id="audio-clone-elm"
301
+ )
302
+ gen_wsp_voice_btn = gr.Button("Generate voice clone (optional)")
303
+
304
+ with gr.Column(elem_id="result-column"):
305
+ result = gr.Video(
306
+ elem_id="video-block"
307
+ )
308
+
309
+ submit_btn = gr.Button("Submit", elem_id="main-submit")
310
+
311
+ voice.upload(
312
+ fn = check_mp3,
313
+ inputs = [voice],
314
+ outputs = [voice],
315
+ queue = False,
316
+ show_api = False
317
  )
+ 
+         gen_image_btn.click(
+             fn = generate_portrait,
+             inputs = [prompt_image],
+             outputs = [portrait],
+             queue=False,
+             show_api = False
+         )
+ 
+         gen_voice_btn.click(
+             fn = generate_voice,
+             inputs = [prompt_audio, voice_description],
+             outputs = [voice],
+             queue=False,
+             show_api = False
+         )
+ 
+         gen_wsp_voice_btn.click(
+             fn = get_whisperspeech,
+             inputs = [prompt_audio_whisperspeech, audio_to_clone],
+             outputs = [voice],
+             queue=False,
+             show_api = False
+         )
+ 
+         submit_btn.click(
+             fn = generate_talking_portrait,
+             inputs = [portrait, voice],
+             outputs = [result],
+             show_api = False
+         )
+ 
+ 
+ demo.queue(max_size=2).launch(show_error=True, show_api=False)
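For a quick smoke test outside the UI, the new pipeline can also be driven directly. This only makes sense on a duplicated instance or a local machine with the downloaded pretrained models and a GPU, since run_hallo raises an error on the shared Space; portrait.png and speech.wav below are placeholder file names:

```python
# Placeholder file names; requires is_shared_ui == False and the pretrained_models snapshot.
video_path = generate_talking_portrait("portrait.png", "speech.wav")
print(video_path)  # expected to be something like output-<unique_id>.mp4
```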