Text-to-Voice / app.py
ruslanmv's picture
First commit
d07996d
raw
history blame
No virus
3.33 kB
import streamlit as st
import numpy as np
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import soundfile as sf
from io import StringIO
# Heavy model objects are created by a cached loader so that the script does
# not have to rebuild them inside every conversion call.
# NOTE(review): st.cache is reported as deprecated in newer Streamlit releases
# (st.cache_resource is the suggested replacement) - confirm against the
# installed Streamlit version before switching.
@st.cache(allow_output_mutation=True)
def load_models():
    """Load the SpeechT5 text-to-speech model, its processor and the vocoder.

    Returns:
        tuple: ``(model, processor, vocoder)``, in that order, each created
        with ``from_pretrained`` from the corresponding ``microsoft/...``
        checkpoint name.
    """
    tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    text_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    hifigan_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    return tts_model, text_processor, hifigan_vocoder


# Module-level handles used by text_to_speech().
model, processor, vocoder = load_models()
# Speaker embeddings are read from disk through a cached loader.
# NOTE(review): st.cache is reported as deprecated in newer Streamlit releases
# - confirm against the installed version before changing the decorator.
@st.cache(allow_output_mutation=True)
def get_speaker_embeddings():
    """Read the speaker-embedding array from its ``.npy`` file as a tensor.

    Returns:
        torch.Tensor: The loaded array converted to a tensor, with one extra
        leading dimension of size 1 added via ``unsqueeze(0)``.
    """
    raw_embedding = np.load("cmu_us_clb_arctic-wav-arctic_a0144.npy")
    embedding_tensor = torch.tensor(raw_embedding)
    return embedding_tensor.unsqueeze(0)


# Module-level handle used by text_to_speech().
speaker_embeddings = get_speaker_embeddings()
# Improved Styling (assuming style.css is present)
def local_css(file_name):
    """Inject the contents of a CSS file into the page inside a <style> tag.

    Args:
        file_name: Path of the CSS file to read.
    """
    with open(file_name) as css_file:
        css_rules = css_file.read()
    # unsafe_allow_html is required so the <style> element is emitted as HTML.
    style_tag = f'<style>{css_rules}</style>'
    st.markdown(style_tag, unsafe_allow_html=True)


local_css("style.css")  # Apply custom CSS styles
# Streamlit Layout: page title followed by a one-line description of the app.
# These calls run at module level, in this order, on every script run.
st.title("Text-to-Voice Conversion")
st.markdown("Convert your text to speech using advanced AI models.")
# Function to convert text to speech
def text_to_speech(text, max_length=100):
    """Synthesize ``text`` into one WAV file per chunk and return their paths.

    The text is split into chunks of at most ``max_length`` characters at
    whitespace boundaries, so a word is not cut in half between two audio
    segments (only a single word longer than ``max_length`` is broken).
    Newlines and tabs are treated as spaces by the splitter.

    Each chunk is passed through the module-level ``processor``, ``model``
    and ``vocoder`` together with ``speaker_embeddings`` and written to
    ``speech_segment_<i>.wav`` in the current working directory with a
    sample rate of 16000.

    Args:
        text: Text to synthesize. ``None`` is treated like an empty string.
        max_length: Maximum number of characters per chunk; must be > 0.

    Returns:
        list[str]: Paths of the written segment files, in reading order.
        The list is empty when there is nothing to synthesize, or when
        synthesis fails (the failure is reported through ``st.error``).

    Raises:
        ValueError: If ``max_length`` is not a positive width.
    """
    import textwrap  # local import: keeps this block usable on its own

    # Word-aware chunking; yields [] for empty or whitespace-only input so the
    # models are never invoked on blank text.
    segments = textwrap.wrap(text or "", width=max_length)
    if not segments:
        return []

    try:
        audio_paths = []
        for i, segment in enumerate(segments):
            inputs = processor(text=segment, return_tensors="pt")
            # Inference only: no gradients are needed for either model call.
            with torch.no_grad():
                spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
                speech = vocoder(spectrogram)
            # NOTE: file names are fixed, so a later call overwrites the
            # segments written by an earlier one.
            audio_path = f"speech_segment_{i}.wav"
            sf.write(audio_path, speech.numpy(), samplerate=16000)
            audio_paths.append(audio_path)
        return audio_paths
    except Exception as e:
        # UI boundary: surface the problem to the user instead of crashing the
        # app, and signal failure to the caller with an empty list.
        st.error(f"Error in text-to-speech conversion: {e}")
        return []
# Function to combine audio segments
def combine_audio_segments(paths, output_path="combined_speech.wav"):
    """Concatenate the audio files in ``paths`` into a single WAV file.

    Args:
        paths: Audio file paths, in playback order.
        output_path: File to write the combined audio to. Defaults to
            ``combined_speech.wav`` in the current working directory.

    Returns:
        str: ``output_path``.

    Raises:
        ValueError: If ``paths`` is empty, since there is no audio and no
            sample rate to write.
    """
    paths = list(paths)
    if not paths:
        # Previously this fell through to sf.write with an unbound
        # ``samplerate`` and failed with an UnboundLocalError.
        raise ValueError("No audio segments to combine.")

    chunks = []
    samplerate = None
    for path in paths:
        # The sample rate of the last file read is the one used for the
        # output; all segments are expected to share the same rate.
        data, samplerate = sf.read(path)
        chunks.append(data)

    # Join whole arrays at once instead of growing a Python list one sample
    # at a time.
    sf.write(output_path, np.concatenate(chunks), samplerate)
    return output_path
# Shared by both conversion buttons below.
def _convert_and_play(source_text, empty_message):
    """Synthesize ``source_text`` and render an audio player for the result.

    Args:
        source_text: Text to convert to speech.
        empty_message: Error shown when ``source_text`` is empty or contains
            only whitespace.
    """
    if not source_text or not source_text.strip():
        st.error(empty_message)
        return

    audio_paths = text_to_speech(source_text)
    if not audio_paths:
        # text_to_speech reports its own failures via st.error and returns an
        # empty list; there is nothing to combine or play in that case.
        return

    combined_audio_path = combine_audio_segments(audio_paths)
    # Context manager so the file handle is closed on every script run.
    with open(combined_audio_path, 'rb') as audio_file:
        audio_bytes = audio_file.read()
    st.audio(audio_bytes, format='audio/wav')


# Text Input and Conversion Button
text = st.text_area("Type your text here.")
if st.button("Convert"):
    _convert_and_play(text, "Please enter some text to convert.")

# File Uploader and Conversion Button
uploaded_file = st.file_uploader("Upload a text file here", type=['txt'])
if uploaded_file is not None:
    try:
        # ``text`` is rebound to the uploaded content, as before.
        text = uploaded_file.getvalue().decode("utf-8")
    except UnicodeDecodeError:
        # A .txt file in another encoding must not crash the whole app.
        st.error("The uploaded file is not valid UTF-8 text.")
    else:
        st.write(text)
        if st.button("Convert Uploaded File", key="upload"):
            _convert_and_play(text, "The uploaded file contains no text to convert.")