Update app.py
app.py
CHANGED
@@ -144,13 +144,13 @@ with gr.Blocks(css=custom_css) as app:
             step=0.01,
             info="Defina a duração do cross-fade entre os clipes de áudio.",
         )
-        sentence_count_slider = gr.Slider(
-            label="Número de Sentenças por
+        chunk_size_slider = gr.Slider(
+            label="Número de Sentenças por Chunk",
             minimum=1,
             maximum=10,
             value=1,
             step=1,
-            info="
+            info="Defina quantas sentenças serão processadas em cada chunk.",
         )
 
     audio_output = gr.Audio(label="Áudio Sintetizado")
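The new chunk_size_slider is a standard Gradio slider whose numeric value is handed straight to the click callback. A minimal self-contained sketch of that flow (component names mirror the diff; the echo wiring is invented for illustration):

    import gradio as gr

    # Sketch only: with step=1 the slider delivers a whole number to the
    # callback, so it can be used directly as a chunk size / slice step.
    with gr.Blocks() as demo:
        size = gr.Slider(label="Número de Sentenças por Chunk",
                         minimum=1, maximum=10, value=1, step=1,
                         info="Defina quantas sentenças serão processadas em cada chunk.")
        out = gr.Textbox()
        btn = gr.Button("Echo")  # hypothetical button, not in the diff
        btn.click(lambda n: f"chunk size = {int(n)}", inputs=size, outputs=out)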
@@ -159,26 +159,31 @@ with gr.Blocks(css=custom_css) as app:
     analyzer = SentenceAnalyzer()
 
     @gpu_decorator
-    def
+    def process_chunks(
         ref_audio_input,
         ref_text_input,
         gen_text_input,
         remove_silence,
         cross_fade_duration_slider,
         speed_slider,
-        sentence_count_slider,
+        chunk_size_slider,
     ):
-        #
+        # Dividir o texto em sentenças
         sentences = analyzer.split_into_sentences(gen_text_input)
-        num_sentences = min(len(sentences), sentence_count_slider)
 
-        #
+        # Agrupar sentenças em chunks
+        chunks = [
+            " ".join(sentences[i : i + chunk_size_slider])
+            for i in range(0, len(sentences), chunk_size_slider)
+        ]
+
+        # Processar cada chunk
         audio_segments = []
-        for
+        for chunk in chunks:
             audio_out, spectrogram_path, ref_text_out = infer(
                 ref_audio_input,
                 ref_text_input,
-
+                chunk,  # Passa o chunk atual
                 remove_silence,
                 cross_fade_duration_slider,
                 speed_slider,
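The added chunking is a plain stride-based list comprehension over the sentence list. A standalone sketch of how it groups sentences (the sentence list is made up; chunk_size stands in for the slider value):

    # Sketch of the grouping logic added above, outside Gradio.
    sentences = ["Primeira frase.", "Segunda frase.", "Terceira frase."]
    chunk_size = 2
    chunks = [
        " ".join(sentences[i : i + chunk_size])
        for i in range(0, len(sentences), chunk_size)
    ]
    assert chunks == ["Primeira frase. Segunda frase.", "Terceira frase."]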
@@ -186,7 +191,7 @@ with gr.Blocks(css=custom_css) as app:
             sr, audio_data = audio_out
             audio_segments.append(audio_data)
 
-        #
+        # Concatenar os segmentos de áudio gerados
         if audio_segments:
             final_audio_data = np.concatenate(audio_segments)
             return (sr, final_audio_data), spectrogram_path, gr.update(value=ref_text_out)
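The concatenation assumes every chunk comes back at the same sample rate: np.concatenate joins the waveforms end to end, and the last chunk's sr is reused for the final clip. A self-contained sketch with synthetic mono audio (shapes, dtype, and sample rate are assumptions, since infer's output format is not shown in this diff):

    import numpy as np

    sr = 24000  # assumed sample rate; the real value comes from infer()
    seg_a = np.zeros(sr // 2, dtype=np.float32)  # 0.5 s stand-in for chunk 1
    seg_b = np.ones(sr // 4, dtype=np.float32)   # 0.25 s stand-in for chunk 2
    final = np.concatenate([seg_a, seg_b])       # samples joined end to end
    assert final.shape[0] == seg_a.shape[0] + seg_b.shape[0]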
@@ -195,7 +200,7 @@ with gr.Blocks(css=custom_css) as app:
         return None, None, gr.update(value=ref_text_out)
 
     generate_btn.click(
-
+        process_chunks,
         inputs=[
             ref_audio_input,
             ref_text_input,
@@ -203,201 +208,13 @@ with gr.Blocks(css=custom_css) as app:
             remove_silence,
             cross_fade_duration_slider,
             speed_slider,
-            sentence_count_slider,
+            chunk_size_slider,
         ],
         outputs=[audio_output, spectrogram_output],
     )
 
 
-
-    gr.Markdown("# Geração Multi-Speech com F5-TTS")
-    with gr.Row():
-        with gr.Column():
-            regular_name = gr.Textbox(value="Regular", label="Speech Type Name")
-            regular_insert = gr.Button("Insert Label", variant="secondary")
-        regular_audio = gr.Audio(label="Regular Reference Audio", type="filepath")
-        regular_ref_text = gr.Textbox(label="Reference Text (Regular)", lines=2)
-    # Regular speech type (max 100)
-    max_speech_types = 100
-    speech_type_rows = []  # 99
-    speech_type_names = [regular_name]  # 100
-    speech_type_audios = [regular_audio]  # 100
-    speech_type_ref_texts = [regular_ref_text]  # 100
-    speech_type_delete_btns = []  # 99
-    speech_type_insert_btns = [regular_insert]  # 100
-    # Additional speech types (99 more)
-    for i in range(max_speech_types - 1):
-        with gr.Row(visible=False) as row:
-            with gr.Column():
-                name_input = gr.Textbox(label="Speech Type Name")
-                delete_btn = gr.Button("Delete Type", variant="secondary")
-                insert_btn = gr.Button("Insert Label", variant="secondary")
-            audio_input = gr.Audio(label="Reference Audio", type="filepath")
-            ref_text_input = gr.Textbox(label="Reference Text", lines=2)
-        speech_type_rows.append(row)
-        speech_type_names.append(name_input)
-        speech_type_audios.append(audio_input)
-        speech_type_ref_texts.append(ref_text_input)
-        speech_type_delete_btns.append(delete_btn)
-        speech_type_insert_btns.append(insert_btn)
-    # Button to add speech type
-    add_speech_type_btn = gr.Button("Add Speech Type")
-    # Keep track of current number of speech types
-    speech_type_count = gr.State(value=1)
-    # Function to add a speech type
-    def add_speech_type_fn(speech_type_count):
-        if speech_type_count < max_speech_types:
-            speech_type_count += 1
-            # Prepare updates for the rows
-            row_updates = []
-            for i in range(1, max_speech_types):
-                if i < speech_type_count:
-                    row_updates.append(gr.update(visible=True))
-                else:
-                    row_updates.append(gr.update())
-        else:
-            # Optionally, show a warning
-            row_updates = [gr.update() for _ in range(1, max_speech_types)]
-        return [speech_type_count] + row_updates
-    add_speech_type_btn.click(
-        add_speech_type_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows
-    )
-    # Function to delete a speech type
-    def make_delete_speech_type_fn(index):
-        def delete_speech_type_fn(speech_type_count):
-            # Prepare updates
-            row_updates = []
-            for i in range(1, max_speech_types):
-                if i == index:
-                    row_updates.append(gr.update(visible=False))
-                else:
-                    row_updates.append(gr.update())
-            speech_type_count = max(1, speech_type_count)
-            return [speech_type_count] + row_updates
-        return delete_speech_type_fn
-    # Update delete button clicks
-    for i, delete_btn in enumerate(speech_type_delete_btns):
-        delete_fn = make_delete_speech_type_fn(i)
-        delete_btn.click(delete_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows)
-    # Text input for the prompt
-    gen_text_input_multistyle = gr.Textbox(
-        label="Text to Generate",
-        lines=10,
-        placeholder="Enter the script with speaker names (or emotion types) at the start of each block, e.g.:\n\n{Regular} Hello, I'd like to order a sandwich please.\n{Surprised} What do you mean you're out of bread?\n{Sad} I really wanted a sandwich though...\n{Angry} You know what, darn you and your little shop!\n{Whisper} I'll just go back home and cry now.\n{Shouting} Why me?!",
-    )
-    def make_insert_speech_type_fn(index):
-        def insert_speech_type_fn(current_text, speech_type_name):
-            current_text = current_text or ""
-            speech_type_name = speech_type_name or "None"
-            updated_text = current_text + f"{{{speech_type_name}}} "
-            return gr.update(value=updated_text)
-        return insert_speech_type_fn
-    for i, insert_btn in enumerate(speech_type_insert_btns):
-        insert_fn = make_insert_speech_type_fn(i)
-        insert_btn.click(
-            insert_fn,
-            inputs=[gen_text_input_multistyle, speech_type_names[i]],
-            outputs=gen_text_input_multistyle,
-        )
-    with gr.Accordion("Advanced Settings", open=False):
-        remove_silence_multistyle = gr.Checkbox(
-            label="Remove Silences",
-            value=True,
-        )
-    # Generate button
-    generate_multistyle_btn = gr.Button("Generate Multi-Style Speech", variant="primary")
-    # Output audio
-    audio_output_multistyle = gr.Audio(label="Synthesized Audio")
-    @gpu_decorator
-    def generate_multistyle_speech(
-        gen_text,
-        *args,
-    ):
-        speech_type_names_list = args[:max_speech_types]
-        speech_type_audios_list = args[max_speech_types : 2 * max_speech_types]
-        speech_type_ref_texts_list = args[2 * max_speech_types : 3 * max_speech_types]
-        remove_silence = args[3 * max_speech_types]
-        # Collect the speech types and their audios into a dict
-        speech_types = OrderedDict()
-        ref_text_idx = 0
-        for name_input, audio_input, ref_text_input in zip(
-            speech_type_names_list, speech_type_audios_list, speech_type_ref_texts_list
-        ):
-            if name_input and audio_input:
-                speech_types[name_input] = {"audio": audio_input, "ref_text": ref_text_input}
-            else:
-                speech_types[f"@{ref_text_idx}@"] = {"audio": "", "ref_text": ""}
-            ref_text_idx += 1
-        # Parse the gen_text into segments
-        segments = parse_speechtypes_text(gen_text)
-        # For each segment, generate speech
-        generated_audio_segments = []
-        current_style = "Regular"
-        for segment in segments:
-            style = segment["style"]
-            text = segment["text"]
-            if style in speech_types:
-                current_style = style
-            else:
-                # If style not available, default to Regular
-                current_style = "Regular"
-            ref_audio = speech_types[current_style]["audio"]
-            ref_text = speech_types[current_style].get("ref_text", "")
-            # Generate speech for this segment
-            audio_out, _, ref_text_out = infer(
-                ref_audio, ref_text, text, remove_silence, 0, show_info=print
-            )  # show_info=print no pull to top when generating
-            sr, audio_data = audio_out
-            generated_audio_segments.append(audio_data)
-            speech_types[current_style]["ref_text"] = ref_text_out
-        # Concatenate all audio segments
-        if generated_audio_segments:
-            final_audio_data = np.concatenate(generated_audio_segments)
-            return [(sr, final_audio_data)] + [
-                gr.update(value=speech_types[style]["ref_text"]) for style in speech_types
-            ]
-        else:
-            gr.Warning("No audio generated.")
-            return [None] + [gr.update(value=speech_types[style]["ref_text"]) for style in speech_types]
-    generate_multistyle_btn.click(
-        generate_multistyle_speech,
-        inputs=[
-            gen_text_input_multistyle,
-        ]
-        + speech_type_names
-        + speech_type_audios
-        + speech_type_ref_texts
-        + [
-            remove_silence_multistyle,
-        ],
-        outputs=[audio_output_multistyle] + speech_type_ref_texts,
-    )
-    # Validation function to disable Generate button if speech types are missing
-    def validate_speech_types(gen_text, regular_name, *args):
-        speech_type_names_list = args[:max_speech_types]
-        # Collect the speech types names
-        speech_types_available = set()
-        if regular_name:
-            speech_types_available.add(regular_name)
-        for name_input in speech_type_names_list:
-            if name_input:
-                speech_types_available.add(name_input)
-        # Parse the gen_text to get the speech types used
-        segments = parse_speechtypes_text(gen_text)
-        speech_types_in_text = set(segment["style"] for segment in segments)
-        # Check if all speech types in text are available
-        missing_speech_types = speech_types_in_text - speech_types_available
-        if missing_speech_types:
-            # Disable the generate button
-            return gr.update(interactive=False)
-        else:
-            # Enable the generate button
-            return gr.update(interactive=True)
-    gen_text_input_multistyle.change(
-        validate_speech_types,
-        inputs=[gen_text_input_multistyle, regular_name] + speech_type_names,
-        outputs=generate_multistyle_btn,
-    )
+
 
 @click.command()
 @click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
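The removed multi-style section leaned on parse_speechtypes_text, which is defined elsewhere in the file; this hunk only shows it being consumed via segment["style"] and segment["text"]. A hypothetical stand-in illustrating the segment shape the removed loop expected (the regex and function name below are assumptions, not the file's actual parser):

    import re

    # Hypothetical stand-in for parse_speechtypes_text (not shown in this diff).
    # It must yield dicts with "style" and "text" keys, since the removed loop
    # reads segment["style"] and segment["text"].
    def parse_speechtypes_text_sketch(gen_text):
        tokens = re.split(r"\{(.*?)\}", gen_text)  # split on {Style} markers
        segments, style = [], "Regular"
        for i, tok in enumerate(tokens):
            if i % 2 == 1:           # odd indices hold the style names
                style = tok.strip()
            elif tok.strip():        # even indices hold the spoken text
                segments.append({"style": style, "text": tok.strip()})
        return segments

    print(parse_speechtypes_text_sketch("{Regular} Hello. {Sad} Bye."))
    # [{'style': 'Regular', 'text': 'Hello.'}, {'style': 'Sad', 'text': 'Bye.'}]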