# drewThomasson's picture
# Update app.py
# 6879295 verified
import gradio as gr
from auralis import TTS, TTSRequest, TTSOutput
from auralis.common.definitions.requests import AudioPreprocessingConfig
from io import BytesIO
import tempfile
import time
# Load the pretrained model once at import time; both generation functions
# below share this single instance.
tts = TTS().from_pretrained(
    "AstraMindAI/xttsv2",
    gpt_model="AstraMindAI/xtts2-gpt",
)
def generate_speech_with_enhancement(text, speaker_audio):
    """Generate speech for ``text`` with reference-audio enhancement enabled.

    The request carries an ``AudioPreprocessingConfig`` with ``normalize``,
    ``trim_silence`` and ``enhance_speech`` switched on, and is passed to the
    module-level ``tts`` model.

    Args:
        text: Text to convert to speech.
        speaker_audio: Filesystem path of the reference speaker recording.

    Returns:
        A ``(wav_path, status)`` tuple. On success ``wav_path`` is the path of
        a temporary ``.wav`` file containing the generated speech and
        ``status`` reports the generation time and audio length. When the
        speaker audio is missing or unreadable, or the text is blank,
        ``wav_path`` is ``None`` and ``status`` is an error message.
    """
    if not speaker_audio:
        return None, "Error: No speaker audio provided."
    # Reject blank text up front instead of sending an empty request to the model.
    if not text or not text.strip():
        return None, "Error: No text provided."

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()

    # Read the reference clip into memory so it can be passed as a BytesIO.
    try:
        with open(speaker_audio, "rb") as f:
            audio_data = f.read()
    except Exception as e:
        return None, f"Error reading speaker audio: {e}"

    audio_config = AudioPreprocessingConfig(
        normalize=True,
        trim_silence=True,
        enhance_speech=True,
        enhance_amount=1.5,  # Adjust enhancement strength as needed
    )

    request = TTSRequest(
        text=text,
        speaker_files=[BytesIO(audio_data)],
        audio_config=audio_config,
    )

    output = tts.generate_speech(request)
    generation_time = time.perf_counter() - start_time

    # Reserve a path for the result and close the handle immediately: with
    # delete=False the file persists after closing, and leaving the handle
    # open would leak a descriptor on every call (and prevents re-opening
    # the path by name on Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
        output_path = temp_file.name
    output.save(output_path, format="wav")

    # The third element of get_info() is taken to be the duration in seconds.
    audio_duration = output.get_info()[2]

    return output_path, f"Generated in {generation_time:.2f} seconds, Audio Length: {audio_duration:.2f} seconds"
def generate_speech_without_enhancement(text, speaker_audio):
    """Generate speech for ``text`` without an explicit enhancement config.

    No ``audio_config`` is passed, so whatever ``TTSRequest`` uses by default
    applies. The request is passed to the module-level ``tts`` model.

    Args:
        text: Text to convert to speech.
        speaker_audio: Filesystem path of the reference speaker recording.

    Returns:
        A ``(wav_path, status)`` tuple. On success ``wav_path`` is the path of
        a temporary ``.wav`` file containing the generated speech and
        ``status`` reports the generation time and audio length. When the
        speaker audio is missing or unreadable, or the text is blank,
        ``wav_path`` is ``None`` and ``status`` is an error message.
    """
    if not speaker_audio:
        return None, "Error: No speaker audio provided."
    # Reject blank text up front instead of sending an empty request to the model.
    if not text or not text.strip():
        return None, "Error: No text provided."

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()

    # Read the reference clip into memory so it can be passed as a BytesIO.
    try:
        with open(speaker_audio, "rb") as f:
            audio_data = f.read()
    except Exception as e:
        return None, f"Error reading speaker audio: {e}"

    request = TTSRequest(
        text=text,
        speaker_files=[BytesIO(audio_data)],
    )

    output = tts.generate_speech(request)
    generation_time = time.perf_counter() - start_time

    # Reserve a path for the result and close the handle immediately: with
    # delete=False the file persists after closing, and leaving the handle
    # open would leak a descriptor on every call (and prevents re-opening
    # the path by name on Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
        output_path = temp_file.name
    output.save(output_path, format="wav")

    # The third element of get_info() is taken to be the duration in seconds.
    audio_duration = output.get_info()[2]

    return output_path, f"Generated in {generation_time:.2f} seconds, Audio Length: {audio_duration:.2f} seconds"
# Build the Gradio UI: inputs, outputs and the button that triggers synthesis.
with gr.Blocks() as demo:
    gr.Markdown("# Auralis Hugging Face Space")

    # Inputs
    text_input = gr.Textbox(label="Text to Convert to Speech")
    audio_input = gr.Audio(
        label="Reference Speaker Audio",
        type="filepath",
    )
    enhance_checkbox = gr.Checkbox(label="Enhance Audio", value=False)

    # Outputs
    output_audio = gr.Audio(label="Generated Speech", type="filepath")
    timing_info = gr.Textbox(label="Timing Info", interactive=False)

    generate_button = gr.Button("Generate Speech")

    def generate_speech_router(text, speaker_audio, enhance_audio):
        """Dispatch to the enhanced or plain generator based on the checkbox."""
        synthesize = (
            generate_speech_with_enhancement
            if enhance_audio
            else generate_speech_without_enhancement
        )
        return synthesize(text, speaker_audio)

    # Wire the button to the router; results fill the audio player and status box.
    generate_button.click(
        fn=generate_speech_router,
        inputs=[text_input, audio_input, enhance_checkbox],
        outputs=[output_audio, timing_info],
    )
# Start the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()