|
import gradio as gr |
|
from auralis import TTS, TTSRequest, TTSOutput |
|
from auralis.common.definitions.requests import AudioPreprocessingConfig |
|
from io import BytesIO |
|
import tempfile |
|
import time |
|
|
|
|
|
# Load the XTTS-v2 model (plus its GPT component) once at module import so a
# single TTS instance is shared by every Gradio request handler below.
tts = TTS().from_pretrained("AstraMindAI/xttsv2", gpt_model="AstraMindAI/xtts2-gpt")
|
|
|
|
|
def generate_speech_with_enhancement(text, speaker_audio):
    """Synthesize *text* in the voice of *speaker_audio*, with audio
    preprocessing (normalization, silence trimming, speech enhancement)
    applied to the reference clip.

    Args:
        text: Text to convert to speech.
        speaker_audio: Filesystem path to the reference speaker recording.

    Returns:
        Tuple of (wav_path, status_message). ``wav_path`` is ``None`` and the
        message describes the failure when anything goes wrong.
    """
    if not speaker_audio:
        return None, "Error: No speaker audio provided."

    start_time = time.time()

    try:
        with open(speaker_audio, "rb") as f:
            audio_data = f.read()
    except Exception as e:
        return None, f"Error reading speaker audio: {e}"

    # Preprocessing applied to the reference clip before voice cloning.
    audio_config = AudioPreprocessingConfig(
        normalize=True,
        trim_silence=True,
        enhance_speech=True,
        enhance_amount=1.5,
    )

    request = TTSRequest(
        text=text,
        speaker_files=[BytesIO(audio_data)],
        audio_config=audio_config,
    )

    # Report synthesis failures through the status textbox, consistent with
    # the file-read error handling above, instead of raising into Gradio.
    try:
        output = tts.generate_speech(request)
    except Exception as e:
        return None, f"Error generating speech: {e}"
    generation_time = time.time() - start_time

    # Close the handle before the library writes to the path: an open
    # NamedTemporaryFile cannot be reopened on Windows and would otherwise
    # leak a file descriptor. delete=False keeps the file for Gradio to serve.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    temp_file.close()
    output.save(temp_file.name, format="wav")

    # get_info()[2] is the audio duration in seconds — confirm against the
    # Auralis TTSOutput API.
    audio_duration = output.get_info()[2]

    return temp_file.name, f"Generated in {generation_time:.2f} seconds, Audio Length: {audio_duration:.2f} seconds"
|
|
|
|
|
def generate_speech_without_enhancement(text, speaker_audio):
    """Synthesize *text* in the voice of *speaker_audio* with no audio
    preprocessing applied to the reference clip.

    Args:
        text: Text to convert to speech.
        speaker_audio: Filesystem path to the reference speaker recording.

    Returns:
        Tuple of (wav_path, status_message). ``wav_path`` is ``None`` and the
        message describes the failure when anything goes wrong.
    """
    if not speaker_audio:
        return None, "Error: No speaker audio provided."

    start_time = time.time()

    try:
        with open(speaker_audio, "rb") as f:
            audio_data = f.read()
    except Exception as e:
        return None, f"Error reading speaker audio: {e}"

    request = TTSRequest(
        text=text,
        speaker_files=[BytesIO(audio_data)],
    )

    # Report synthesis failures through the status textbox, consistent with
    # the file-read error handling above, instead of raising into Gradio.
    try:
        output = tts.generate_speech(request)
    except Exception as e:
        return None, f"Error generating speech: {e}"
    generation_time = time.time() - start_time

    # Close the handle before the library writes to the path: an open
    # NamedTemporaryFile cannot be reopened on Windows and would otherwise
    # leak a file descriptor. delete=False keeps the file for Gradio to serve.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    temp_file.close()
    output.save(temp_file.name, format="wav")

    # get_info()[2] is the audio duration in seconds — confirm against the
    # Auralis TTSOutput API.
    audio_duration = output.get_info()[2]

    return temp_file.name, f"Generated in {generation_time:.2f} seconds, Audio Length: {audio_duration:.2f} seconds"
|
|
|
|
|
with gr.Blocks() as demo:
    # UI components — creation order fixes the on-page layout, so keep it.
    gr.Markdown("# Auralis Hugging Face Space")
    text_input = gr.Textbox(label="Text to Convert to Speech")
    audio_input = gr.Audio(label="Reference Speaker Audio", type="filepath")
    enhance_checkbox = gr.Checkbox(label="Enhance Audio", value=False)
    output_audio = gr.Audio(label="Generated Speech", type="filepath")
    timing_info = gr.Textbox(label="Timing Info", interactive=False)

    generate_button = gr.Button("Generate Speech")

    def generate_speech_router(text, speaker_audio, enhance_audio):
        """Dispatch to the enhanced or plain generation path per the checkbox."""
        handler = (
            generate_speech_with_enhancement
            if enhance_audio
            else generate_speech_without_enhancement
        )
        return handler(text, speaker_audio)

    # Wire the button: (text, reference audio, enhance flag) in,
    # (generated audio, timing summary) out.
    generate_button.click(
        fn=generate_speech_router,
        inputs=[text_input, audio_input, enhance_checkbox],
        outputs=[output_audio, timing_info],
    )
|
|
|
|
|
if __name__ == "__main__":
    # Start the Gradio server only when run as a script, not on import.
    demo.launch()
|
|