import subprocess
import os
import json
import gradio as gr
from pydub import AudioSegment
from pydub.playback import play
from header import badges, description
from pydub.silence import split_on_silence
from get_voices import get_voices
import asyncio
from pathlib import Path
import pysrt
from tqdm import tqdm
import shutil

# When True, the temporary folder holding per-subtitle audio segments is
# deleted after the final SRT audio has been merged.
srt_temp_deleta = True

# Preferred position of the default language inside the dropdowns.
# NOTE: the original code indexed available_languages[52] unconditionally,
# which raises IndexError when voices.json lists 52 or fewer languages;
# _default_language() clamps the index instead.
DEFAULT_LANGUAGE_INDEX = 52


def load_voices():
    """Load the voice catalog (language -> list of voice dicts) from voices.json."""
    with open('voices.json', 'r', encoding='utf-8') as f:
        return json.load(f)


def get_voice_options(language, voices_data):
    """Return dropdown labels ("name | gender") for every voice of *language*.

    Returns an empty list when the language is absent from the catalog.
    """
    if language in voices_data:
        return [f"{voice['name']} | {voice['gender']}" for voice in voices_data[language]]
    return []


def extract_voice_name(formatted_voice):
    """Recover the raw edge-tts voice name from a "name | gender" label."""
    return formatted_voice.split(" | ")[0]


def _default_language(languages):
    """Pick the default dropdown language, clamping DEFAULT_LANGUAGE_INDEX.

    Fixes the original unconditional `languages[52]`, which crashed whenever
    the catalog had fewer entries. Returns None for an empty catalog.
    """
    if not languages:
        return None
    return languages[min(DEFAULT_LANGUAGE_INDEX, len(languages) - 1)]


def _signed(value, suffix):
    """Format a numeric offset the way edge-tts expects (e.g. "+10%", "-5Hz")."""
    return f"+{value}{suffix}" if value >= 0 else f"{value}{suffix}"


def update_voice_options(language):
    """Gradio callback: rebuild the voice dropdown when the language changes."""
    voices_data = load_voices()
    voice_options = get_voice_options(language, voices_data)
    if voice_options:
        return gr.Dropdown(choices=voice_options, value=voice_options[0])
    return gr.Dropdown(choices=[], value=None)


def update_voices_and_refresh():
    """Re-download the voice catalog and rebuild both dropdowns.

    Returns (language dropdown, voice dropdown) updates; falls back to the
    first language after a refresh.
    """
    get_voices()  # refreshes voices.json on disk
    voices_data = load_voices()
    available_languages = list(voices_data.keys())
    initial_voices = (
        get_voice_options(available_languages[0], voices_data)
        if available_languages else []
    )
    return (
        gr.Dropdown(choices=available_languages,
                    value=available_languages[0] if available_languages else None),
        gr.Dropdown(choices=initial_voices,
                    value=initial_voices[0] if initial_voices else None),
    )


def remove_silence(input_file, output_file):
    """Strip silent stretches (>= 500 ms below -40 dBFS) from an audio file.

    The original implementation used AudioSegment.from_wav / format="wav",
    but the files fed to it by the controllers are mp3 (generate_audio writes
    new_audio.mp3), so the forced wav format mislabeled the input. We let
    pydub/ffmpeg auto-detect the input and derive the export format from the
    output file's extension.
    """
    audio = AudioSegment.from_file(input_file)
    segments = split_on_silence(audio, min_silence_len=500, silence_thresh=-40)
    non_silent_audio = AudioSegment.silent(duration=0)
    for segment in segments:
        non_silent_audio += segment
    fmt = os.path.splitext(output_file)[1].lstrip(".").lower() or "wav"
    non_silent_audio.export(output_file, format=fmt)


def controlador_generate_audio(audio_input, voice_model_input, speed_input,
                               pitch_input, volume_input, checkbox_cortar_silencio):
    """UI handler for the text tab: synthesize, then optionally trim silence.

    Returns the generated file path, or None when synthesis failed.
    """
    audio_file = generate_audio(audio_input, voice_model_input, speed_input,
                                pitch_input, volume_input)
    if audio_file:
        if checkbox_cortar_silencio:
            remove_silence(audio_file, audio_file)
    else:
        print("Erro ao gerar áudio.")
    return audio_file


def generate_audio(texto, modelo_de_voz, velocidade, tom, volume):
    """Synthesize *texto* via the edge-tts CLI.

    velocidade/volume are percentage offsets, tom is a Hz offset. Returns the
    mp3 path (always output/new_audio.mp3) or None when the CLI fails.
    """
    actual_voice = extract_voice_name(modelo_de_voz)
    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, "new_audio.mp3")
    cmd = [
        "edge-tts",
        "--rate=" + _signed(velocidade, "%"),
        "--pitch=" + _signed(tom, "Hz"),
        "--volume=" + _signed(volume, "%"),
        "-v", actual_voice,
        "-t", texto,
        "--write-media", output_file,
    ]
    print("Gerando áudio...")
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        print("Erro ao gerar áudio:", e)
        return None
    print("Áudio gerado com sucesso!")
    return output_file


def generate_audio_from_file(file_path, modelo_de_voz, velocidade, tom, volume):
    """Synthesize the contents of a text file via the edge-tts CLI (-f).

    Same conventions as generate_audio; returns the mp3 path or None.
    """
    actual_voice = extract_voice_name(modelo_de_voz)
    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, "new_audio.mp3")
    cmd = [
        "edge-tts",
        "-f", file_path,
        "--rate=" + _signed(velocidade, "%"),
        "--pitch=" + _signed(tom, "Hz"),
        "--volume=" + _signed(volume, "%"),
        "-v", actual_voice,
        "--write-media", output_file,
    ]
    print("Gerando áudio do arquivo...")
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        print("Erro ao gerar áudio:", e)
        return None
    print("Áudio gerado com sucesso!")
    return output_file


def controlador_generate_audio_from_file(file, voice_model_input, speed_input,
                                         pitch_input, volume_input,
                                         checkbox_cortar_silencio):
    """UI handler for the txt-file tab: synthesize, then optionally trim silence."""
    if file is None:
        return None
    temp_file_path = file
    audio_file = generate_audio_from_file(temp_file_path, voice_model_input,
                                          speed_input, pitch_input, volume_input)
    if audio_file:
        print("Áudio gerado com sucesso:", audio_file)
        if checkbox_cortar_silencio:
            print("Cortando silêncio...")
            remove_silence(audio_file, audio_file)
            print("Silêncio removido com sucesso!")
    else:
        print("Erro ao gerar áudio.")
    return audio_file


def timetoms(time_obj):
    """Convert a pysrt time object (hours/minutes/seconds/milliseconds) to ms."""
    return (time_obj.hours * 3600000 + time_obj.minutes * 60000
            + time_obj.seconds * 1000 + time_obj.milliseconds)


async def merge_audio_files(output_folder, srt_file):
    """Concatenate the per-subtitle mp3 segments into one timeline-aligned track.

    Silence is inserted before the first cue and between cues so each segment
    starts at its subtitle's start time. Missing segment files are replaced by
    silence of the cue's duration. Returns the path of the exported mp3.
    """
    subs = pysrt.open(str(srt_file))
    final_audio = AudioSegment.silent(duration=0)
    base_name = Path(srt_file).stem
    audio_dir = Path(output_folder)
    total_files = len(subs)
    additional_silence_duration = 1000  # trailing pad so playback doesn't cut off
    with tqdm(total=total_files, desc=f"Mesclando áudios para {base_name}",
              unit="segmento") as pbar:
        current_time = 0
        for i, sub in enumerate(subs, start=1):
            start_time = timetoms(sub.start)
            end_time = timetoms(sub.end)
            audio_file = audio_dir / f"{sub.index:02d}.mp3"
            if audio_file.exists():
                audio_segment = AudioSegment.from_mp3(str(audio_file))
            else:
                print(f"\nArquivo de áudio não encontrado: {audio_file}")
                audio_segment = AudioSegment.silent(duration=end_time - start_time)
            pbar.update(1)
            # Lead-in silence before the very first cue.
            if i == 1 and start_time > 0:
                final_audio += AudioSegment.silent(duration=start_time)
                current_time = start_time
            # Gap between the previous cue's end and this cue's start.
            if start_time > current_time:
                final_audio += AudioSegment.silent(duration=start_time - current_time)
            final_audio += audio_segment
            current_time = end_time
    final_audio += AudioSegment.silent(duration=additional_silence_duration)
    srt_output_dir = Path("output/srt_output")
    srt_output_dir.mkdir(parents=True, exist_ok=True)
    output_file = srt_output_dir / f"{base_name}_final.mp3"
    final_audio.export(str(output_file), format="mp3")
    print(f"\nÁudio final salvo em: {output_file}\n")
    return str(output_file)


async def adjust_audio_speed(input_file, output_file, target_duration_ms):
    """Time-stretch an mp3 so it lasts exactly *target_duration_ms*.

    Speed-up uses pydub's speedup; slow-down resamples via a frame-rate
    override (NOTE: that also lowers the pitch — kept for behavior parity).
    The result is then truncated or padded with silence to the exact target.
    """
    audio = AudioSegment.from_mp3(input_file)
    original_duration_ms = len(audio)
    if original_duration_ms == 0:
        print(f"Erro: Áudio em {input_file} tem duração zero.")
        return audio
    if target_duration_ms <= 0:
        # Guard: the original divided by target_duration_ms and crashed on
        # zero-length subtitle cues; export the audio unchanged instead.
        audio.export(output_file, format="mp3")
        return audio
    speed_factor = original_duration_ms / target_duration_ms
    if speed_factor > 1:
        adjusted_audio = audio.speedup(playback_speed=speed_factor)
    else:
        adjusted_audio = audio._spawn(
            audio.raw_data,
            overrides={"frame_rate": int(audio.frame_rate * speed_factor)})
    if len(adjusted_audio) > target_duration_ms:
        adjusted_audio = adjusted_audio[:target_duration_ms]
    elif len(adjusted_audio) < target_duration_ms:
        adjusted_audio += AudioSegment.silent(
            duration=target_duration_ms - len(adjusted_audio))
    adjusted_audio.export(output_file, format="mp3")
    return adjusted_audio


async def process_srt_file(srt_file, voice, output_dir, pitch, volume):
    """Generate one fitted mp3 per subtitle cue, then merge them.

    Cues are synthesized two at a time with edge_tts.Communicate, each segment
    is stretched to its cue duration, and merge_audio_files assembles the
    final track. The temp folder is removed when srt_temp_deleta is True.
    """
    from edge_tts import Communicate as EdgeTTS
    subs = pysrt.open(srt_file)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    total_indices = len(subs)
    # Process cues in pairs to limit concurrent TTS requests.
    batches = [list(range(i, min(i + 2, total_indices)))
               for i in range(0, total_indices, 2)]
    pitch_str = _signed(pitch, "Hz")
    volume_str = _signed(volume, "%")
    with tqdm(total=total_indices,
              desc="Gerando e ajustando áudios com EdgeTTS",
              unit="segmento") as pbar:
        for batch in batches:
            tasks = []
            for i in batch:
                sub = subs[i]
                output_file = output_dir / f"{sub.index:02d}.mp3"
                temp_file = output_dir / f"{sub.index:02d}_temp.mp3"
                # Skip cues whose final audio already exists and is non-empty.
                if not output_file.exists() or output_file.stat().st_size == 0:
                    tts = EdgeTTS(text=sub.text, voice=voice,
                                  pitch=pitch_str, volume=volume_str)
                    tasks.append(tts.save(str(temp_file)))
            if tasks:
                await asyncio.gather(*tasks)
            for i in batch:
                sub = subs[i]
                temp_file = output_dir / f"{sub.index:02d}_temp.mp3"
                output_file = output_dir / f"{sub.index:02d}.mp3"
                target_duration_ms = timetoms(sub.end) - timetoms(sub.start)
                if temp_file.exists():
                    await adjust_audio_speed(temp_file, output_file,
                                             target_duration_ms)
                    os.remove(temp_file)
                pbar.update(1)
    final_audio = await merge_audio_files(output_dir, srt_file)
    if srt_temp_deleta:
        shutil.rmtree(output_dir, ignore_errors=True)
        print(f"Pasta temporária {output_dir} apagada.")
    else:
        print(f"Pasta temporária {output_dir} mantida.")
    return final_audio


def controlador_process_srt_file(srt_file, voice_model_input, pitch_input, volume_input):
    """UI handler for the SRT tab: run the async pipeline to completion."""
    if srt_file is None:
        return None
    actual_voice = extract_voice_name(voice_model_input)
    output_dir = "output/srt_temp"
    audio_file = asyncio.run(
        process_srt_file(srt_file, actual_voice, output_dir,
                         pitch_input, volume_input))
    return audio_file


def listar_audios():
    """List generated audio files in output/srt_output (never returns [])."""
    try:
        srt_output_dir = "output/srt_output"
        if not os.path.exists(srt_output_dir):
            os.makedirs(srt_output_dir, exist_ok=True)
            return ["Nenhum áudio gerado ainda"]
        arquivos = [f for f in os.listdir(srt_output_dir)
                    if f.endswith(('.mp3', '.wav'))]
        return arquivos if arquivos else ["Nenhum áudio gerado ainda"]
    except Exception as e:
        print(f"Erro ao listar áudios: {e}")
        return ["Erro ao listar arquivos"]


def tocar_audio(arquivo):
    """Resolve a dropdown selection to a playable path, ignoring sentinels.

    The original only excluded the "no audio yet" placeholder; the error
    placeholder would have produced a bogus file path.
    """
    if arquivo and arquivo not in ("Nenhum áudio gerado ainda",
                                   "Erro ao listar arquivos"):
        return f"output/srt_output/{arquivo}"
    return None


with gr.Blocks(theme=gr.themes.Default(primary_hue="green", secondary_hue="blue"),
               title="QuickTTS") as iface:
    gr.Markdown(badges)
    gr.Markdown(description)
    voices_data = load_voices()
    available_languages = list(voices_data.keys())
    default_language = _default_language(available_languages)
    with gr.Tabs():
        with gr.TabItem("Edge-TTS"):
            gr.Markdown("É ilimitado, podendo até mesmo colocar um livro inteiro, mas claro, tem a questão de tempo, quanto maior o texto, mais demorado é.")
            with gr.Row():
                language_input = gr.Dropdown(
                    choices=available_languages,
                    label="Idioma",
                    value=default_language
                )
                initial_voices = get_voice_options(default_language, voices_data) if default_language else []
                voice_model_input = gr.Dropdown(
                    choices=initial_voices,
                    label="Modelo de Voz",
                    value=initial_voices[0] if initial_voices else None
                )
            language_input.change(
                fn=update_voice_options,
                inputs=[language_input],
                outputs=[voice_model_input]
            )
            audio_input = gr.Textbox(label="Texto", value='Texto de exemplo!', interactive=True)
            with gr.Row():
                with gr.Column():
                    speed_input = gr.Slider(minimum=-200, maximum=200, label="Velocidade (%)", value=0, interactive=True)
                with gr.Column():
                    pitch_input = gr.Slider(minimum=-100, maximum=100, label="Tom (Hz)", value=0, interactive=True)
                with gr.Column():
                    volume_input = gr.Slider(minimum=-99, maximum=100, label="Volume (%)", value=0, interactive=True)
            checkbox_cortar_silencio = gr.Checkbox(label="Cortar Silencio", interactive=True)
            audio_output = gr.Audio(label="Resultado", type="filepath", interactive=False)
            with gr.Row():
                edgetts_button = gr.Button(value="Falar")
                edgetts_button.click(
                    controlador_generate_audio,
                    inputs=[audio_input, voice_model_input, speed_input,
                            pitch_input, volume_input, checkbox_cortar_silencio],
                    outputs=[audio_output]
                )
                clear_button = gr.ClearButton(audio_input, value='Limpar')
            update_voices_btn = gr.Button(value="Atualizar Lista de Vozes")
            update_voices_btn.click(
                fn=update_voices_and_refresh,
                inputs=[],
                outputs=[language_input, voice_model_input]
            )
            gr.Markdown("Agradecimentos a rany2 pelo Edge-TTS")
        with gr.TabItem("Lote (Arquivo txt)"):
            gr.Markdown("Carregar texto de um arquivo")
            with gr.Row():
                language_input_file = gr.Dropdown(
                    choices=available_languages,
                    label="Idioma",
                    value=default_language
                )
                initial_voices = get_voice_options(default_language, voices_data) if default_language else []
                voice_model_input_file = gr.Dropdown(
                    choices=initial_voices,
                    label="Modelo de Voz",
                    value=initial_voices[0] if initial_voices else None
                )
            language_input_file.change(
                fn=update_voice_options,
                inputs=[language_input_file],
                outputs=[voice_model_input_file]
            )
            gr.Markdown("O programa vai ler linha por linha e entregar em um único áudio")
            file_input = gr.File(label="Arquivo de Texto", file_types=[".txt"], type="filepath")
            with gr.Row():
                with gr.Column():
                    speed_input_file = gr.Slider(minimum=-200, maximum=200, label="Velocidade (%)", value=0, interactive=True)
                with gr.Column():
                    pitch_input_file = gr.Slider(minimum=-100, maximum=100, label="Tom (Hz)", value=0, interactive=True)
                with gr.Column():
                    volume_input_file = gr.Slider(minimum=-99, maximum=100, label="Volume (%)", value=0, interactive=True)
            checkbox_cortar_silencio_file = gr.Checkbox(label="Cortar Silencio", interactive=True)
            audio_output_file = gr.Audio(label="Resultado", type="filepath", interactive=False)
            with gr.Row():
                edgetts_button_file = gr.Button(value="Falar")
                edgetts_button_file.click(
                    controlador_generate_audio_from_file,
                    inputs=[file_input, voice_model_input_file, speed_input_file,
                            pitch_input_file, volume_input_file,
                            checkbox_cortar_silencio_file],
                    outputs=[audio_output_file]
                )
                clear_button_file = gr.ClearButton(file_input, value='Limpar')
            gr.Markdown("Agradecimentos a rany2 pelo Edge-TTS")
        with gr.TabItem("Ler .SRT"):
            gr.Markdown("""Carregar um arquivo SRT e gerenciar áudios sincronizados com os tempos das legendas.

Se você precisa de dublagem por IA para seus vídeos do YouTube, cursos e outros projetos, entre em contato comigo:
https://www.instagram.com/rafael.godoy.ebert/
Este é apenas um teste para brincar e explorar a funcionalidade básica. Tenho uma versão mais completa e personalizada que pode atender às suas necessidades específicas, incluindo clone de voz, entonação na fala e outras funcionalidades.""")
            with gr.Tabs():
                with gr.TabItem("Gerar áudio"):
                    gr.Markdown("A velocidade é ajustada automaticamente para cada legenda.")
                    with gr.Row():
                        language_input_srt = gr.Dropdown(
                            choices=available_languages,
                            label="Idioma",
                            value=default_language
                        )
                        initial_voices = get_voice_options(default_language, voices_data) if default_language else []
                        voice_model_input_srt = gr.Dropdown(
                            choices=initial_voices,
                            label="Modelo de Voz",
                            value=initial_voices[0] if initial_voices else None
                        )
                    language_input_srt.change(
                        fn=update_voice_options,
                        inputs=[language_input_srt],
                        outputs=[voice_model_input_srt]
                    )
                    srt_input = gr.File(label="Arquivo SRT", file_types=[".srt"], type="filepath")
                    with gr.Row():
                        with gr.Column():
                            pitch_input_srt = gr.Slider(minimum=-100, maximum=100, label="Tom (Hz)", value=0, interactive=True)
                        with gr.Column():
                            volume_input_srt = gr.Slider(minimum=-99, maximum=200, label="Volume (%)", value=0, interactive=True)
                    audio_output_srt = gr.Audio(label="Resultado", type="filepath", interactive=False)
                    with gr.Row():
                        srt_button = gr.Button(value="Gerar Áudio")
                        clear_button_srt = gr.ClearButton(srt_input, value='Limpar')

                    def generate_and_update_list(srt_file, voice_model_input, pitch_input, volume_input):
                        # Runs the SRT pipeline, then refreshes the file list.
                        audio_file = controlador_process_srt_file(
                            srt_file, voice_model_input, pitch_input, volume_input)
                        updated_list = listar_audios()
                        return audio_file, updated_list

                    srt_button.click(
                        fn=generate_and_update_list,
                        inputs=[srt_input, voice_model_input_srt,
                                pitch_input_srt, volume_input_srt],
                        outputs=[audio_output_srt, gr.Dropdown(visible=False)],
                        queue=True
                    )
                    gr.Markdown("Agradecimentos a rany2 pelo Edge-TTS")
                with gr.TabItem("Arquivos gerados"):
                    gr.Markdown("Lista de arquivos de áudio gerados na pasta 'output/srt_output'.")
                    audio_list = gr.Dropdown(
                        label="Arquivos de áudio",
                        choices=listar_audios(),
                        value=None,
                        interactive=True,
                        allow_custom_value=True
                    )
                    play_button = gr.Button(value="Tocar")
                    audio_player = gr.Audio(label="Reproduzir", type="filepath", interactive=False)
                    status_message = gr.Textbox(label="Status", interactive=False, visible=True)

                    def update_audio_list():
                        # Refresh the dropdown; listar_audios() never returns [].
                        arquivos = listar_audios()
                        return (gr.update(choices=arquivos, value=None),
                                "Lista atualizada com sucesso" if "Erro" not in arquivos[0]
                                else "Erro ao atualizar lista")

                    refresh_button = gr.Button(value="Atualizar Lista")
                    refresh_button.click(
                        fn=update_audio_list,
                        inputs=[],
                        outputs=[audio_list, status_message],
                        queue=True
                    )
                    play_button.click(
                        fn=tocar_audio,
                        inputs=[audio_list],
                        outputs=[audio_player],
                        queue=True
                    )
    gr.Markdown("""
    Desenvolvido por Rafael Godoy
    Apoie o projeto pelo https://nubank.com.br/pagar/1ls6a4/0QpSSbWBSq, qualquer valor é bem vindo.
    """)

iface.launch()