import gradio as gr
import whisper
from translate import Translator
from TTS.api import TTS
import uuid
import os
from pathlib import Path
import gc
import torch

os.environ["COQUI_TOS_AGREED"] = "1"

# Load the speech-to-text and text-to-speech models once at startup
model = whisper.load_model("base")
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")

output_dir = "output_audio"
os.makedirs(output_dir, exist_ok=True)


def transcribeaudio(audiofile):
    print("Transcribing audio...")
    tresult = model.transcribe(audiofile)
    if "text" not in tresult:
        print("Transcription failed.")
        return {"status": "error", "error": "Transcription failed"}

    # Detect the spoken language from the (padded/trimmed) first 30 seconds of audio
    audio = whisper.load_audio(audiofile)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    _, probs = model.detect_language(mel)
    detected_language = max(probs, key=probs.get)
    print(f"Detected language: {detected_language}")

    return {"text": tresult["text"], "language": detected_language}


def translatetext(text, source_lang, target_lang):
    try:
        translator = Translator(from_lang=source_lang, to_lang=target_lang)
        translated_text = translator.translate(text)
        print(f"Translated text: {translated_text}")
        return translated_text
    except Exception as e:
        print(f"Error translating to {target_lang}: {str(e)}")
        return f"Error: Could not translate to {target_lang}"


def readtranslation(text, audiofile, language):
    # Clone the speaker's voice from the original recording and speak the translation
    output_path = os.path.join(output_dir, f"{language}_{uuid.uuid4()}.wav")
    print(f"Generating TTS for text: {text}")
    tts.tts_to_file(text=text, file_path=output_path, speaker_wav=audiofile, language=language)
    print(f"Generated audio file at: {output_path}")
    return output_path


def v2vtranslate(audiofile, selected_lang, COQUI_TOS_AGREED, progress=gr.Progress()):
    if COQUI_TOS_AGREED:
        progress(0, desc="Starting process...")
        try:
            progress(0.2, desc="Transcribing audio...")
            transcription_result = transcribeaudio(audiofile)
            if isinstance(transcription_result, dict) and transcription_result.get("status") == "error":
                raise gr.Error(transcription_result["error"])

            text = transcription_result["text"]
            detected_language = transcription_result["language"]

            progress(0.4, desc="Translating text...")
            translated_text = translatetext(text, detected_language, selected_lang)

            progress(0.7, desc="Generating audio...")
            audio_path = readtranslation(translated_text, audiofile, selected_lang)

            progress(1.0, desc="Process complete!")
            return audio_path, translated_text
        except Exception as e:
            raise gr.Error(f"An error occurred: {str(e)}")
        finally:
            cleanup_memory()
    else:
        gr.Warning("Please accept the Terms & Conditions!")
        # One value per output component (translated audio, translated text)
        return None, None


with gr.Blocks() as demo:
    gr.Markdown("## Record yourself in any language and immediately receive voice translations.")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                sources=["microphone"],
                type="filepath",
                show_download_button=True,
                max_length=15,
                label="Record your voice",
                waveform_options=gr.WaveformOptions(
                    waveform_color="#01C6FF",
                    waveform_progress_color="#0066B4",
                    skip_length=2,
                    show_controls=False,
                ),
            )
            language_gr = gr.Dropdown(
                label="Language",
                info="Select an output language for the synthesised speech",
                choices=[
                    "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru",
                    "nl", "cs", "ar", "zh-cn", "ja", "ko", "hu", "hi",
                ],
                max_choices=1,
                value="es",
            )
            tos_gr = gr.Checkbox(
                label="Agree",
                value=False,
                info="I agree to the terms of the CPML: https://coqui.ai/cpml",
            )
            submit = gr.Button("Submit", variant="primary")
            reset = gr.Button("Reset")
    with gr.Row():
        output_audio = gr.Audio(label="Translated Audio", interactive=False)
        output_text = gr.Markdown()
    output_components = [output_audio, output_text]

    submit.click(
        fn=v2vtranslate,
        inputs=[audio_input, language_gr, tos_gr],
        outputs=output_components,
        show_progress=True,
    )
    # Clear both outputs and the recorded audio; return one None per output component
    reset.click(
        fn=lambda: (None, None, None),
        inputs=None,
        outputs=output_components + [audio_input],
    )


def cleanup_memory():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    print("Memory cleaned up")


if __name__ == "__main__":
    demo.launch()