fffiloni committed
Commit dcda854
1 Parent(s): 28bc4f4

tts x hallo ui integration

Files changed (1): app.py (+211, -88)
app.py CHANGED
@@ -13,9 +13,15 @@ import uuid
 
 is_shared_ui = True if "fudan-generative-ai/hallo" in os.environ['SPACE_ID'] else False
 
+AUDIO_MAX_DURATION = 4000
+
 if(not is_shared_ui):
     hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")
 
+#############
+# UTILITIES #
+#############
+
 def is_mp3(file_path):
     try:
         audio = MP3(file_path)
@@ -31,7 +37,7 @@ def convert_mp3_to_wav(mp3_file_path, wav_file_path):
     return wav_file_path
 
 
-def trim_audio(file_path, output_path, max_duration=4000):
+def trim_audio(file_path, output_path, max_duration):
     # Load the audio file
     audio = AudioSegment.from_wav(file_path)
 
@@ -72,100 +78,140 @@ def check_mp3(file_path):
     else:
         print("The file is not an MP3 file.")
 
-    return file_path
-
-def convert_webp_to_png(webp_file):
-
-    # Open the WebP image
-    webp_image = Image.open(webp_file)
+    return file_path, gr.update(value=file_path, visible=True)
 
-    # Convert and save as PNG
-    webp_image.save("png_converted_image.png", "PNG")
+def check_and_convert_webp_to_png(input_path, output_path):
+    try:
+        # Open the image file
+        with Image.open(input_path) as img:
+            # Check if the image is in WebP format
+            if img.format == 'WEBP':
+                # Convert and save as PNG
+                img.save(output_path, 'PNG')
+                print(f"Converted {input_path} to {output_path}")
+                return output_path
+            else:
+                print(f"The file {input_path} is not in WebP format.")
+                return input_path
+    except IOError:
+        print(f"Cannot open {input_path}. The file might not exist or is not an image.")
+
+def clear_audio_elms():
+    return gr.update(value=None, visible=False)
 
-    return "png_converted_image.png"
+#######################################################
+# Gradio APIs for optional image and voice generation #
+#######################################################
 
 def generate_portrait(prompt_image):
     if prompt_image is None or prompt_image == "":
         raise gr.Error("Can't generate a portrait without a prompt !")
-    client = Client("AP123/SDXL-Lightning")
-    result = client.predict(
-        prompt_image,
-        "4-Step",
-        api_name="/generate_image"
-    )
-    print(result)
+
+    try:
+        client = Client("ByteDance/SDXL-Lightning")
+    except:
+        raise gr.Error(f"ByteDance/SDXL-Lightning space's api might not be ready, please wait, or upload an image instead.")
+
+    try:
+        result = client.predict(
+            prompt = prompt_image,
+            ckpt = "4-Step",
+            api_name = "/generate_image"
+        )
+        print(result)
 
-    return result
+    # convert to png if necessary
+    input_file = result
+    output_file = "converted_to_png_portrait.png"
+    ready_png = check_and_convert_webp_to_png(input_file, output_file)
+    print(f"PORTRAIT PNG FILE: {ready_png}")
 
-def generate_voice(prompt_audio, voice_description):
+    return ready_png
+
+def generate_voice_with_parler(prompt_audio, voice_description):
     if prompt_audio is None or prompt_audio == "" :
-        raise gr.Error("Can't generate a voice without text to synthetize !")
+        raise gr.Error(f"Can't generate a voice without text to synthetize !")
     if voice_description is None or voice_description == "":
         gr.Info(
             "For better control, You may want to provide a voice character description next time.",
             duration = 10,
             visible = True
         )
-    client = Client("parler-tts/parler_tts_mini")
+    try:
+        client = Client("parler-tts/parler_tts_mini")
+    except:
+        raise gr.Error(f"parler-tts/parler_tts_mini space's api might not be ready, please wait, or upload an audio instead.")
+
     result = client.predict(
-        text=prompt_audio,
-        description=voice_description,
-        api_name="/gen_tts"
+        text = prompt_audio,
+        description = voice_description,
+        api_name = "/gen_tts"
     )
     print(result)
-    return result
+    return result, gr.update(value=result, visible=True)
 
 def get_whisperspeech(prompt_audio_whisperspeech, audio_to_clone):
-    client = Client("collabora/WhisperSpeech")
+    try:
+        client = Client("collabora/WhisperSpeech")
+    except:
+        raise gr.Error(f"collabora/WhisperSpeech space's api might not be ready, please wait, or upload an audio instead.")
+
     result = client.predict(
-        multilingual_text=prompt_audio_whisperspeech,
-        speaker_audio=handle_file(audio_to_clone),
-        speaker_url="",
-        cps=14,
-        api_name="/whisper_speech_demo"
+        multilingual_text = prompt_audio_whisperspeech,
+        speaker_audio = handle_file(audio_to_clone),
+        speaker_url = "",
+        cps = 14,
+        api_name = "/whisper_speech_demo"
    )
    print(result)
-    return result
+    return result, gr.update(value=result, visible=True)
+
+
+########################
+# TALKING PORTRAIT GEN #
+########################
 
 def run_hallo(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)):
-    if is_shared_ui:
-        raise gr.Error("This Space only works in duplicated instances")
-
+
     unique_id = uuid.uuid4()
 
     args = argparse.Namespace(
-        config='configs/inference/default.yaml',
-        source_image=source_image,
-        driving_audio=driving_audio,
-        output=f'output-{unique_id}.mp4',
-        pose_weight=1.0,
-        face_weight=1.0,
-        lip_weight=1.0,
-        face_expand_ratio=1.2,
-        checkpoint=None
+        config = 'configs/inference/default.yaml',
+        source_image = source_image,
+        driving_audio = driving_audio,
+        output = f'output-{unique_id}.mp4',
+        pose_weight = 1.0,
+        face_weight = 1.0,
+        lip_weight = 1.0,
+        face_expand_ratio = 1.2,
+        checkpoint = None
    )
 
    inference_process(args)
    return f'output-{unique_id}.mp4'
 
-def generate_talking_portrait(portrait, voice):
+def generate_talking_portrait(portrait, voice, progress=gr.Progress(track_tqdm=True)):
 
     if portrait is None:
         raise gr.Error("Please provide a portrait to animate.")
+
     if voice is None:
         raise gr.Error("Please provide audio (4 seconds max).")
 
-    # trim audio
-    input_file = voice
-    trimmed_output_file = "trimmed_audio.wav"
-    trimmed_output_file = trim_audio(input_file, trimmed_output_file)
-    voice = trimmed_output_file
-
+    if is_shared_ui :
+        # Trim audio to AUDIO_MAX_DURATION for better shared experience with community
+        input_file = voice
+        trimmed_output_file = "trimmed_audio.wav"
+        trimmed_output_file = trim_audio(input_file, trimmed_output_file, AUDIO_MAX_DURATION)
+        voice = trimmed_output_file
+
+    # Add 1 second of silence at the end to avoid last word being cut by hallo
     ready_audio = add_silence_to_wav(voice)
     print(f"1 second of silence added to {voice}")
 
-    # call hallo
+    # Call hallo
     talking_portrait_vid = run_hallo(portrait, ready_audio)
+
    return talking_portrait_vid
 
 
@@ -173,6 +219,9 @@ css = '''
 #col-container {
     margin: 0 auto;
 }
+#column-names {
+    margin-top: 50px;
+}
 #main-group {
     background-color: none;
 }
@@ -188,8 +237,17 @@ css = '''
 #audio-block, #audio-clone-elm {
     flex: 1;
 }
-#text-synth, #voice-desc, #text-synth-wsp{
-    height: 180px;
+div#audio-clone-elm > .audio-container > button {
+    height: 180px!important;
+}
+div#audio-clone-elm > .audio-container > button > .wrap {
+    font-size: 0.9em;
+}
+#text-synth, #voice-desc{
+    height: 130px;
+}
+#text-synth-wsp {
+    height: 120px;
 }
 #audio-column, #result-column {
     display: flex;
@@ -203,6 +261,9 @@ css = '''
 #main-submit{
     flex: 1;
 }
+#pro-tips {
+    margin-top: 50px;
+}
 div#warning-ready {
     background-color: #ecfdf5;
     padding: 0 16px 16px;
@@ -242,76 +303,138 @@ div#warning-duplicate .actions a {
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
         gr.Markdown("""
-        # Parler X Hallo
-        Generate talking portraits
+        # TTS x Hallo Talking Portrait Generator
+
+        This demo allows you to generate a talking portrait with the help of several open-source projects: SDXL Lightning | Parler TTS | WhisperSpeech | Hallo
+
+        To let the community try and enjoy this demo, video length is limited to 4 seconds audio maximum.
+
+        Duplicate this space to skip the queue and get unlimited video duration. 4-5 seconds of audio will take ~5 minutes per inference, please be patient.
         """)
+        with gr.Row(elem_id="column-names"):
+            gr.Markdown("## 1. Load Portrait")
+            gr.Markdown("## 2. Load Voice")
+            gr.Markdown("## 3. Result")
         with gr.Group(elem_id="main-group"):
             with gr.Row():
                 with gr.Column():
+
                     portrait = gr.Image(
-                        sources=["upload"],
-                        type="filepath",
-                        format="png",
-                        elem_id="image-block"
+                        sources = ["upload"],
+                        type = "filepath",
+                        format = "png",
+                        elem_id = "image-block"
                    )
 
                    prompt_image = gr.Textbox(
-                        label="Generate image",
-                        lines=3
+                        label = "Generate image",
+                        lines = 2,
+                        max_lines = 2
                    )
 
                    gen_image_btn = gr.Button("Generate portrait (optional)")
 
                with gr.Column(elem_id="audio-column"):
+
                    voice = gr.Audio(
-                        type="filepath",
-                        max_length=4000,
-                        elem_id="audio-block"
+                        type = "filepath",
+                        elem_id = "audio-block"
                    )
 
+                    preprocess_audio_file = gr.File(visible=False)
+
+
                    with gr.Tab("Parler TTS", elem_id="parler-tab"):
 
                        prompt_audio = gr.Textbox(
-                            label="Text to synthetize",
-                            lines=4,
-                            max_lines=4,
-                            elem_id="text-synth"
+                            label = "Text to synthetize",
+                            lines = 3,
+                            max_lines = 3,
+                            elem_id = "text-synth"
                        )
 
                        voice_description = gr.Textbox(
-                            label="Voice description",
-                            lines=4,
-                            max_lines=4,
-                            elem_id="voice-desc"
+                            label = "Voice description",
+                            lines = 3,
+                            max_lines = 3,
+                            elem_id = "voice-desc"
                        )
 
                        gen_voice_btn = gr.Button("Generate voice (optional)")
 
                    with gr.Tab("WhisperSpeech", elem_id="whisperspeech-tab"):
                        prompt_audio_whisperspeech = gr.Textbox(
-                            label="Text to synthetize",
-                            lines=4,
-                            max_lines=4,
-                            elem_id="text-synth-wsp"
+                            label = "Text to synthetize",
+                            lines = 2,
+                            max_lines = 2,
+                            elem_id = "text-synth-wsp"
                        )
                        audio_to_clone = gr.Audio(
-                            label="Voice to clone",
-                            type="filepath",
-                            elem_id="audio-clone-elm"
+                            label = "Voice to clone",
+                            type = "filepath",
+                            elem_id = "audio-clone-elm"
                        )
                        gen_wsp_voice_btn = gr.Button("Generate voice clone (optional)")
 
                with gr.Column(elem_id="result-column"):
+
                    result = gr.Video(
                        elem_id="video-block"
                    )
 
-                    submit_btn = gr.Button("Submit", elem_id="main-submit")
+                    submit_btn = gr.Button("Go talking Portrait !", elem_id="main-submit")
+
+        with gr.Row(elem_id="pro-tips"):
+            gr.Markdown("""
+            # Hallo Pro Tips:
+
+            Hallo has a few simple requirements for input data:
+
+            For the source image:
+
+            1. It should be cropped into squares.
+            2. The face should be the main focus, making up 50%-70% of the image.
+            3. The face should be facing forward, with a rotation angle of less than 30° (no side profiles).
+
+            For the driving audio:
+
+            1. It must be in WAV format.
+            2. It must be in English since our training datasets are only in this language.
+            3. Ensure the vocals are clear; background music is acceptable.
+
+
+            """)
+
+            gr.Markdown("""
+            # TTS Pro Tips:
+
+            For Parler TTS:
+
+            - Include the term "very clear audio" to generate the highest quality audio, and "very noisy audio" for high levels of background noise
+            - Punctuation can be used to control the prosody of the generations, e.g. use commas to add small breaks in speech
+            - The remaining speech features (gender, speaking rate, pitch and reverberation) can be controlled directly through the prompt
+
+            For WhisperSpeech:
+
+            WhisperSpeech is able to quickly clone a voice from an audio sample.
+
+            - Upload a voice sample in the WhisperSpeech tab
+            - Add text to synthetize, hit Generate voice clone button
+
+            """)
 
     voice.upload(
         fn = check_mp3,
         inputs = [voice],
-        outputs = [voice],
+        outputs = [voice, preprocess_audio_file],
+        queue = False,
+        show_api = False
+    )
+
+    voice.clear(
+        fn = clear_audio_elms,
+        inputs = None,
+        outputs = [preprocess_audio_file],
         queue = False,
         show_api = False
    )
@@ -320,23 +443,23 @@ with gr.Blocks(css=css) as demo:
         fn = generate_portrait,
         inputs = [prompt_image],
         outputs = [portrait],
-        queue=False,
+        queue = False,
         show_api = False
    )
 
    gen_voice_btn.click(
-        fn = generate_voice,
+        fn = generate_voice_with_parler,
         inputs = [prompt_audio, voice_description],
-        outputs = [voice],
-        queue=False,
+        outputs = [voice, preprocess_audio_file],
+        queue = False,
         show_api = False
    )
 
    gen_wsp_voice_btn.click(
        fn = get_whisperspeech,
        inputs = [prompt_audio_whisperspeech, audio_to_clone],
-        outputs = [voice],
-        queue=False,
+        outputs = [voice, preprocess_audio_file],
+        queue = False,
        show_api = False
    )
 
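For reference, the optional generators in this commit call other public Spaces through `gradio_client`, and client construction is wrapped in try/except so a sleeping or rebuilding upstream Space surfaces as a `gr.Error` instead of a stack trace. A minimal standalone sketch of that call pattern, reusing the Parler TTS endpoint and keyword arguments exactly as the diff does (upstream availability is not guaranteed):

```python
from gradio_client import Client

try:
    # Connecting fails if the upstream Space is asleep, rebuilding, or rate-limited
    client = Client("parler-tts/parler_tts_mini")
except Exception as e:
    raise RuntimeError("parler-tts/parler_tts_mini is not reachable right now") from e

# Same keyword arguments and api_name the commit uses
audio_path = client.predict(
    text="Hello from a talking portrait!",
    description="A clear, friendly voice with very clear audio.",
    api_name="/gen_tts",
)
print(audio_path)  # local path to the audio file returned by the Space
```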
 
 
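Note the recurring return shape in the updated handlers: each one now returns the audio path twice, once for the `voice` player and once as `gr.update(value=..., visible=True)` for the hidden `preprocess_audio_file` component, which `voice.clear` hides again via `clear_audio_elms`. A stripped-down sketch of that show/hide wiring (component and function names here are illustrative, not the app's):

```python
import gradio as gr

def process(audio_path):
    # Placeholder for the real preprocessing (MP3 check, conversion, etc.)
    out_path = audio_path
    # First value refreshes the player; second reveals the hidden download slot
    return out_path, gr.update(value=out_path, visible=True)

def hide_download():
    return gr.update(value=None, visible=False)

with gr.Blocks() as demo:
    voice = gr.Audio(type="filepath")
    download = gr.File(visible=False)

    voice.upload(process, inputs=[voice], outputs=[voice, download])
    voice.clear(hide_download, inputs=None, outputs=[download])

demo.launch()
```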
 
 
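Finally, `run_hallo` drives Hallo's inference entry point in-process by hand-building an `argparse.Namespace` rather than shelling out to the CLI. The same trick works for any script whose main function takes parsed args; a self-contained toy example (the `render` function below is hypothetical, standing in for Hallo's `inference_process`):

```python
import argparse

def render(args: argparse.Namespace) -> str:
    # Stand-in for an entry point that normally receives parser.parse_args()
    print(f"Rendering {args.output} from {args.source_image} + {args.driving_audio}")
    return args.output

# Build the namespace directly instead of going through sys.argv
args = argparse.Namespace(
    source_image="portrait.png",
    driving_audio="voice.wav",
    output="output.mp4",
)
render(args)
```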