"""Gradio demo for Auralis TTS: voice cloning with optional speech enhancement."""

import tempfile
import time
from io import BytesIO

import gradio as gr
from auralis import TTS, TTSRequest, TTSOutput
from auralis.common.definitions.requests import AudioPreprocessingConfig

# Initialize the TTS model once at import time so it is shared by all requests.
tts = TTS().from_pretrained("AstraMindAI/xttsv2", gpt_model="AstraMindAI/xtts2-gpt")


def _generate_speech(text, speaker_audio, audio_config=None):
    """Synthesize ``text`` in the voice of the reference ``speaker_audio``.

    Shared implementation for the enhanced and non-enhanced entry points.

    Args:
        text: Text to convert to speech.
        speaker_audio: Filesystem path to the reference speaker audio clip.
        audio_config: Optional ``AudioPreprocessingConfig``; when provided it
            is attached to the request (used for speech enhancement).

    Returns:
        ``(wav_path, status_message)`` on success, or ``(None, error_message)``
        when the reference audio is missing or unreadable.
    """
    if not speaker_audio:
        return None, "Error: No speaker audio provided."

    start_time = time.time()  # Start timing

    # Read the reference speaker audio into memory.
    try:
        with open(speaker_audio, "rb") as f:
            audio_data = f.read()
    except Exception as e:
        return None, f"Error reading speaker audio: {e}"

    # Build the TTS request; only pass audio_config when one was supplied so
    # the non-enhanced path matches a plain TTSRequest exactly.
    request_kwargs = {"text": text, "speaker_files": [BytesIO(audio_data)]}
    if audio_config is not None:
        request_kwargs["audio_config"] = audio_config
    request = TTSRequest(**request_kwargs)

    # Generate the speech.
    output = tts.generate_speech(request)
    generation_time = time.time() - start_time  # Calculate generation time

    # Persist the result to a temp file Gradio can serve. Close the handle
    # before saving: output.save reopens the path itself, and the still-open
    # handle would leak a descriptor (and fail outright on Windows).
    # NOTE(review): delete=False means these files accumulate until the OS
    # temp dir is cleaned — acceptable for a demo Space, but worth confirming.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    temp_file.close()
    output.save(temp_file.name, format="wav")

    # Get audio duration.
    audio_duration = output.get_info()[2]  # Duration in seconds

    return (
        temp_file.name,
        f"Generated in {generation_time:.2f} seconds, Audio Length: {audio_duration:.2f} seconds",
    )


def generate_speech_with_enhancement(text, speaker_audio):
    """Generate speech with enhancement applied to the reference audio."""
    audio_config = AudioPreprocessingConfig(
        normalize=True,
        trim_silence=True,
        enhance_speech=True,
        enhance_amount=1.5,  # Adjust enhancement strength as needed
    )
    return _generate_speech(text, speaker_audio, audio_config)


def generate_speech_without_enhancement(text, speaker_audio):
    """Generate speech without any audio preprocessing/enhancement."""
    return _generate_speech(text, speaker_audio)


# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Auralis Hugging Face Space")

    text_input = gr.Textbox(label="Text to Convert to Speech")
    audio_input = gr.Audio(label="Reference Speaker Audio", type="filepath")
    enhance_checkbox = gr.Checkbox(label="Enhance Audio", value=False)
    output_audio = gr.Audio(label="Generated Speech", type="filepath")
    timing_info = gr.Textbox(label="Timing Info", interactive=False)
    generate_button = gr.Button("Generate Speech")

    # Route to the enhanced or plain pipeline based on the checkbox.
    def generate_speech_router(text, speaker_audio, enhance_audio):
        """Dispatch to the enhanced or non-enhanced generation function."""
        if enhance_audio:
            return generate_speech_with_enhancement(text, speaker_audio)
        return generate_speech_without_enhancement(text, speaker_audio)

    generate_button.click(
        fn=generate_speech_router,
        inputs=[text_input, audio_input, enhance_checkbox],
        outputs=[output_audio, timing_info],
    )

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()