M4xjunior committed on
Commit
26a9ffe
·
verified ·
1 Parent(s): 1384004

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -202
app.py CHANGED
@@ -144,13 +144,13 @@ with gr.Blocks(css=custom_css) as app:
144
  step=0.01,
145
  info="Defina a duração do cross-fade entre os clipes de áudio.",
146
  )
147
- sentence_count_slider = gr.Slider(
148
- label="Número de Sentenças por Vez",
149
  minimum=1,
150
  maximum=10,
151
  value=1,
152
  step=1,
153
- info="Selecione quantas sentenças serão geradas por vez.",
154
  )
155
 
156
  audio_output = gr.Audio(label="Áudio Sintetizado")
@@ -159,26 +159,31 @@ with gr.Blocks(css=custom_css) as app:
159
  analyzer = SentenceAnalyzer()
160
 
161
  @gpu_decorator
162
- def basic_tts(
163
  ref_audio_input,
164
  ref_text_input,
165
  gen_text_input,
166
  remove_silence,
167
  cross_fade_duration_slider,
168
  speed_slider,
169
- sentence_count_slider,
170
  ):
171
- # Divida o texto em sentenças
172
  sentences = analyzer.split_into_sentences(gen_text_input)
173
- num_sentences = min(len(sentences), sentence_count_slider)
174
 
175
- # Gere áudio para o número selecionado de sentenças
 
 
 
 
 
 
176
  audio_segments = []
177
- for i in range(num_sentences):
178
  audio_out, spectrogram_path, ref_text_out = infer(
179
  ref_audio_input,
180
  ref_text_input,
181
- sentences[i],
182
  remove_silence,
183
  cross_fade_duration_slider,
184
  speed_slider,
@@ -186,7 +191,7 @@ with gr.Blocks(css=custom_css) as app:
186
  sr, audio_data = audio_out
187
  audio_segments.append(audio_data)
188
 
189
- # Concatene os segmentos de áudio
190
  if audio_segments:
191
  final_audio_data = np.concatenate(audio_segments)
192
  return (sr, final_audio_data), spectrogram_path, gr.update(value=ref_text_out)
@@ -195,7 +200,7 @@ with gr.Blocks(css=custom_css) as app:
195
  return None, None, gr.update(value=ref_text_out)
196
 
197
  generate_btn.click(
198
- basic_tts,
199
  inputs=[
200
  ref_audio_input,
201
  ref_text_input,
@@ -203,201 +208,13 @@ with gr.Blocks(css=custom_css) as app:
203
  remove_silence,
204
  cross_fade_duration_slider,
205
  speed_slider,
206
- sentence_count_slider,
207
  ],
208
  outputs=[audio_output, spectrogram_output],
209
  )
210
 
211
 
212
- with gr.Tab("Multi-Speech"):
213
- gr.Markdown("# Geração Multi-Speech com F5-TTS")
214
- with gr.Row():
215
- with gr.Column():
216
- regular_name = gr.Textbox(value="Regular", label="Speech Type Name")
217
- regular_insert = gr.Button("Insert Label", variant="secondary")
218
- regular_audio = gr.Audio(label="Regular Reference Audio", type="filepath")
219
- regular_ref_text = gr.Textbox(label="Reference Text (Regular)", lines=2)
220
- # Regular speech type (max 100)
221
- max_speech_types = 100
222
- speech_type_rows = [] # 99
223
- speech_type_names = [regular_name] # 100
224
- speech_type_audios = [regular_audio] # 100
225
- speech_type_ref_texts = [regular_ref_text] # 100
226
- speech_type_delete_btns = [] # 99
227
- speech_type_insert_btns = [regular_insert] # 100
228
- # Additional speech types (99 more)
229
- for i in range(max_speech_types - 1):
230
- with gr.Row(visible=False) as row:
231
- with gr.Column():
232
- name_input = gr.Textbox(label="Speech Type Name")
233
- delete_btn = gr.Button("Delete Type", variant="secondary")
234
- insert_btn = gr.Button("Insert Label", variant="secondary")
235
- audio_input = gr.Audio(label="Reference Audio", type="filepath")
236
- ref_text_input = gr.Textbox(label="Reference Text", lines=2)
237
- speech_type_rows.append(row)
238
- speech_type_names.append(name_input)
239
- speech_type_audios.append(audio_input)
240
- speech_type_ref_texts.append(ref_text_input)
241
- speech_type_delete_btns.append(delete_btn)
242
- speech_type_insert_btns.append(insert_btn)
243
- # Button to add speech type
244
- add_speech_type_btn = gr.Button("Add Speech Type")
245
- # Keep track of current number of speech types
246
- speech_type_count = gr.State(value=1)
247
- # Function to add a speech type
248
- def add_speech_type_fn(speech_type_count):
249
- if speech_type_count < max_speech_types:
250
- speech_type_count += 1
251
- # Prepare updates for the rows
252
- row_updates = []
253
- for i in range(1, max_speech_types):
254
- if i < speech_type_count:
255
- row_updates.append(gr.update(visible=True))
256
- else:
257
- row_updates.append(gr.update())
258
- else:
259
- # Optionally, show a warning
260
- row_updates = [gr.update() for _ in range(1, max_speech_types)]
261
- return [speech_type_count] + row_updates
262
- add_speech_type_btn.click(
263
- add_speech_type_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows
264
- )
265
- # Function to delete a speech type
266
- def make_delete_speech_type_fn(index):
267
- def delete_speech_type_fn(speech_type_count):
268
- # Prepare updates
269
- row_updates = []
270
- for i in range(1, max_speech_types):
271
- if i == index:
272
- row_updates.append(gr.update(visible=False))
273
- else:
274
- row_updates.append(gr.update())
275
- speech_type_count = max(1, speech_type_count)
276
- return [speech_type_count] + row_updates
277
- return delete_speech_type_fn
278
- # Update delete button clicks
279
- for i, delete_btn in enumerate(speech_type_delete_btns):
280
- delete_fn = make_delete_speech_type_fn(i)
281
- delete_btn.click(delete_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows)
282
- # Text input for the prompt
283
- gen_text_input_multistyle = gr.Textbox(
284
- label="Text to Generate",
285
- lines=10,
286
- placeholder="Enter the script with speaker names (or emotion types) at the start of each block, e.g.:\n\n{Regular} Hello, I'd like to order a sandwich please.\n{Surprised} What do you mean you're out of bread?\n{Sad} I really wanted a sandwich though...\n{Angry} You know what, darn you and your little shop!\n{Whisper} I'll just go back home and cry now.\n{Shouting} Why me?!",
287
- )
288
- def make_insert_speech_type_fn(index):
289
- def insert_speech_type_fn(current_text, speech_type_name):
290
- current_text = current_text or ""
291
- speech_type_name = speech_type_name or "None"
292
- updated_text = current_text + f"{{{speech_type_name}}} "
293
- return gr.update(value=updated_text)
294
- return insert_speech_type_fn
295
- for i, insert_btn in enumerate(speech_type_insert_btns):
296
- insert_fn = make_insert_speech_type_fn(i)
297
- insert_btn.click(
298
- insert_fn,
299
- inputs=[gen_text_input_multistyle, speech_type_names[i]],
300
- outputs=gen_text_input_multistyle,
301
- )
302
- with gr.Accordion("Advanced Settings", open=False):
303
- remove_silence_multistyle = gr.Checkbox(
304
- label="Remove Silences",
305
- value=True,
306
- )
307
- # Generate button
308
- generate_multistyle_btn = gr.Button("Generate Multi-Style Speech", variant="primary")
309
- # Output audio
310
- audio_output_multistyle = gr.Audio(label="Synthesized Audio")
311
- @gpu_decorator
312
- def generate_multistyle_speech(
313
- gen_text,
314
- *args,
315
- ):
316
- speech_type_names_list = args[:max_speech_types]
317
- speech_type_audios_list = args[max_speech_types : 2 * max_speech_types]
318
- speech_type_ref_texts_list = args[2 * max_speech_types : 3 * max_speech_types]
319
- remove_silence = args[3 * max_speech_types]
320
- # Collect the speech types and their audios into a dict
321
- speech_types = OrderedDict()
322
- ref_text_idx = 0
323
- for name_input, audio_input, ref_text_input in zip(
324
- speech_type_names_list, speech_type_audios_list, speech_type_ref_texts_list
325
- ):
326
- if name_input and audio_input:
327
- speech_types[name_input] = {"audio": audio_input, "ref_text": ref_text_input}
328
- else:
329
- speech_types[f"@{ref_text_idx}@"] = {"audio": "", "ref_text": ""}
330
- ref_text_idx += 1
331
- # Parse the gen_text into segments
332
- segments = parse_speechtypes_text(gen_text)
333
- # For each segment, generate speech
334
- generated_audio_segments = []
335
- current_style = "Regular"
336
- for segment in segments:
337
- style = segment["style"]
338
- text = segment["text"]
339
- if style in speech_types:
340
- current_style = style
341
- else:
342
- # If style not available, default to Regular
343
- current_style = "Regular"
344
- ref_audio = speech_types[current_style]["audio"]
345
- ref_text = speech_types[current_style].get("ref_text", "")
346
- # Generate speech for this segment
347
- audio_out, _, ref_text_out = infer(
348
- ref_audio, ref_text, text, remove_silence, 0, show_info=print
349
- ) # show_info=print no pull to top when generating
350
- sr, audio_data = audio_out
351
- generated_audio_segments.append(audio_data)
352
- speech_types[current_style]["ref_text"] = ref_text_out
353
- # Concatenate all audio segments
354
- if generated_audio_segments:
355
- final_audio_data = np.concatenate(generated_audio_segments)
356
- return [(sr, final_audio_data)] + [
357
- gr.update(value=speech_types[style]["ref_text"]) for style in speech_types
358
- ]
359
- else:
360
- gr.Warning("No audio generated.")
361
- return [None] + [gr.update(value=speech_types[style]["ref_text"]) for style in speech_types]
362
- generate_multistyle_btn.click(
363
- generate_multistyle_speech,
364
- inputs=[
365
- gen_text_input_multistyle,
366
- ]
367
- + speech_type_names
368
- + speech_type_audios
369
- + speech_type_ref_texts
370
- + [
371
- remove_silence_multistyle,
372
- ],
373
- outputs=[audio_output_multistyle] + speech_type_ref_texts,
374
- )
375
- # Validation function to disable Generate button if speech types are missing
376
- def validate_speech_types(gen_text, regular_name, *args):
377
- speech_type_names_list = args[:max_speech_types]
378
- # Collect the speech types names
379
- speech_types_available = set()
380
- if regular_name:
381
- speech_types_available.add(regular_name)
382
- for name_input in speech_type_names_list:
383
- if name_input:
384
- speech_types_available.add(name_input)
385
- # Parse the gen_text to get the speech types used
386
- segments = parse_speechtypes_text(gen_text)
387
- speech_types_in_text = set(segment["style"] for segment in segments)
388
- # Check if all speech types in text are available
389
- missing_speech_types = speech_types_in_text - speech_types_available
390
- if missing_speech_types:
391
- # Disable the generate button
392
- return gr.update(interactive=False)
393
- else:
394
- # Enable the generate button
395
- return gr.update(interactive=True)
396
- gen_text_input_multistyle.change(
397
- validate_speech_types,
398
- inputs=[gen_text_input_multistyle, regular_name] + speech_type_names,
399
- outputs=generate_multistyle_btn,
400
- )
401
 
402
  @click.command()
403
  @click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
 
144
  step=0.01,
145
  info="Defina a duração do cross-fade entre os clipes de áudio.",
146
  )
147
+ chunk_size_slider = gr.Slider(
148
+ label="Número de Sentenças por Chunk",
149
  minimum=1,
150
  maximum=10,
151
  value=1,
152
  step=1,
153
+ info="Defina quantas sentenças serão processadas em cada chunk.",
154
  )
155
 
156
  audio_output = gr.Audio(label="Áudio Sintetizado")
 
159
  analyzer = SentenceAnalyzer()
160
 
161
  @gpu_decorator
162
+ def process_chunks(
163
  ref_audio_input,
164
  ref_text_input,
165
  gen_text_input,
166
  remove_silence,
167
  cross_fade_duration_slider,
168
  speed_slider,
169
+ chunk_size_slider,
170
  ):
171
+ # Dividir o texto em sentenças
172
  sentences = analyzer.split_into_sentences(gen_text_input)
 
173
 
174
+ # Agrupar sentenças em chunks
175
+ chunks = [
176
+ " ".join(sentences[i : i + chunk_size_slider])
177
+ for i in range(0, len(sentences), chunk_size_slider)
178
+ ]
179
+
180
+ # Processar cada chunk
181
  audio_segments = []
182
+ for chunk in chunks:
183
  audio_out, spectrogram_path, ref_text_out = infer(
184
  ref_audio_input,
185
  ref_text_input,
186
+ chunk, # Passa o chunk atual
187
  remove_silence,
188
  cross_fade_duration_slider,
189
  speed_slider,
 
191
  sr, audio_data = audio_out
192
  audio_segments.append(audio_data)
193
 
194
+ # Concatenar os segmentos de áudio gerados
195
  if audio_segments:
196
  final_audio_data = np.concatenate(audio_segments)
197
  return (sr, final_audio_data), spectrogram_path, gr.update(value=ref_text_out)
 
200
  return None, None, gr.update(value=ref_text_out)
201
 
202
  generate_btn.click(
203
+ process_chunks,
204
  inputs=[
205
  ref_audio_input,
206
  ref_text_input,
 
208
  remove_silence,
209
  cross_fade_duration_slider,
210
  speed_slider,
211
+ chunk_size_slider,
212
  ],
213
  outputs=[audio_output, spectrogram_output],
214
  )
215
 
216
 
217
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
 
219
  @click.command()
220
  @click.option("--port", "-p", default=None, type=int, help="Port to run the app on")