|
import gradio as gr |
|
import base64 |
|
import numpy as np |
|
from scipy.io import wavfile |
|
from voice_processing import tts, get_model_names, voice_mapping |
|
from io import BytesIO |
|
import asyncio |
|
|
|
async def convert_tts(model_name, tts_text, selected_voice, slang_rate, use_uploaded_voice, voice_upload):
    """Synthesize speech for ``tts_text`` and return it as a base64 WAV data URI.

    Args:
        model_name: Model identifier, forwarded unchanged to ``tts``.
        tts_text: Text to synthesize, forwarded unchanged to ``tts``.
        selected_voice: Key looked up in ``voice_mapping``; the mapped value
            is the voice passed to ``tts``.
        slang_rate: Numeric rate, forwarded unchanged to ``tts``.
        use_uploaded_voice: When truthy and ``voice_upload`` is not ``None``,
            the uploaded file's bytes are read and passed to ``tts``.
        voice_upload: Uploaded file, either an object exposing its path as
            ``.name`` or a plain path; may be ``None``.

    Returns:
        A ``(payload, data_uri)`` pair. On success ``payload`` is
        ``{"info": info}`` and ``data_uri`` is a
        ``data:audio/wav;base64,...`` string. When the voice is unknown or
        ``tts`` yields no audio, ``payload`` carries an ``"error"`` message
        and ``data_uri`` is ``None``.
    """
    default_sample_rate = 40000

    edge_tts_voice = voice_mapping.get(selected_voice)
    if not edge_tts_voice:
        return {"error": f"Invalid voice '{selected_voice}'."}, None

    voice_upload_file = None
    if use_uploaded_voice and voice_upload is not None:
        # Accept both an upload object carrying its path in ``.name`` and a
        # bare filesystem path.
        upload_path = getattr(voice_upload, "name", voice_upload)
        with open(upload_path, "rb") as f:
            voice_upload_file = f.read()

    # Only the info message and the audio payload are used here; the two
    # path-like results of ``tts`` are ignored.
    info, _, tts_output_data, _ = await tts(
        model_name, tts_text, edge_tts_voice, slang_rate, use_uploaded_voice, voice_upload_file
    )

    # A failed synthesis must surface as an error payload instead of crashing
    # on tuple unpacking or inside ``base64.b64encode``.
    if tts_output_data is None:
        return {"error": "TTS produced no audio.", "info": info}, None

    sample_rate, audio_output = tts_output_data
    if audio_output is None:
        return {"error": "TTS produced no audio.", "info": info}, None

    if isinstance(audio_output, np.ndarray):
        # NOTE(review): the first element of ``tts_output_data`` is presumably
        # the sample rate of ``audio_output`` - confirm against
        # ``voice_processing.tts``. Anything that is not a positive integer
        # falls back to the previously hard-coded 40000 Hz.
        if not isinstance(sample_rate, (int, np.integer)) or sample_rate <= 0:
            sample_rate = default_sample_rate
        byte_io = BytesIO()
        wavfile.write(byte_io, int(sample_rate), audio_output)
        audio_bytes = byte_io.getvalue()
    else:
        # Non-array output is passed through as-is; it must be bytes-like for
        # the base64 encoding below.
        audio_bytes = audio_output

    audio_data_uri = f"data:audio/wav;base64,{base64.b64encode(audio_bytes).decode('utf-8')}"
    return {"info": info}, audio_data_uri
|
|
|
def get_models():
    """Return whatever ``get_model_names`` reports as the available models."""
    model_names = get_model_names()
    return model_names
|
|
|
def get_voices():
    """Return the keys of ``voice_mapping`` as a new list."""
    return [*voice_mapping.keys()]
|
|
|
# Input components, listed in the same order as convert_tts's parameters.
_input_components = [
    gr.Dropdown(choices=get_models(), label="Model", interactive=True),
    gr.Textbox(label="Text", placeholder="Enter text here"),
    gr.Dropdown(choices=get_voices(), label="Voice", interactive=True),
    gr.Slider(minimum=0, maximum=1, step=0.01, label="Slang Rate"),
    gr.Checkbox(label="Use Uploaded Voice"),
    gr.File(label="Voice File"),
]

# Output components, matching the two values convert_tts returns:
# the info/error payload and the audio data URI.
_output_components = [
    gr.JSON(label="Info"),
    gr.Textbox(label="Audio URI"),
]

iface = gr.Interface(
    fn=convert_tts,
    inputs=_input_components,
    outputs=_output_components,
    title="Text-to-Speech Conversion",
)

# Runs whenever this module is executed; there is no __main__ guard.
iface.launch()
|
|