Text-to-Voice / app.py
Nag189's picture
Update app.py
18d8a13
raw
history blame
2.34 kB
import streamlit as st
import time
from datetime import datetime
from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan, SpeechT5ForTextToSpeech
import numpy as np
import torch
from io import StringIO
import soundfile as sf
# Improved Styling
def local_css(file_name):
    """Inject the contents of a CSS file into the Streamlit page.

    The file is read in full and emitted inside a ``<style>`` element via
    ``st.markdown``.

    Args:
        file_name: Path of the stylesheet to read.

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, so the stylesheet is read the same way on every host.
    with open(file_name, encoding="utf-8") as css_file:
        css = css_file.read()
    # The handle is already closed here; only the text is needed for rendering.
    st.markdown(f'<style>{css}</style>', unsafe_allow_html=True)
# Apply the custom stylesheet before any widgets are created.
local_css("style.css")  # Relative path: 'style.css' must be reachable from the working directory
# Streamlined Layout: page title followed by a one-line description.
st.title("Text-to-Voice Conversion")
st.markdown("Convert your text to speech using advanced AI models.")
# Load models outside of function calls for efficiency
@st.cache_resource
def load_models():
    """Load the SpeechT5 text-to-speech model, its processor and the vocoder.

    All three objects are created with ``from_pretrained`` from the
    ``microsoft/speecht5_tts`` and ``microsoft/speecht5_hifigan`` checkpoints.

    The function is cached with ``st.cache_resource`` rather than
    ``st.cache_data``: ``cache_data`` is intended for serializable data and
    hands back a fresh copy of the return value on every call, whereas
    ``cache_resource`` keeps a single shared instance, which is what heavy,
    read-only model objects need.

    Returns:
        tuple: ``(model, processor, vocoder)`` in that order.
    """
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    return model, processor, vocoder
# Bind the three objects at module level; text_to_speech reads them as globals.
model, processor, vocoder = load_models()
# Load speaker embeddings
@st.cache_data
def get_speaker_embeddings():
    """Read the speaker embedding from disk and return it as a batched tensor.

    The array stored in ``cmu_us_slt_arctic-wav-arctic_a0508.npy`` is loaded
    with NumPy, converted to a ``torch`` tensor, and given a new leading
    dimension of size 1.

    Returns:
        torch.Tensor: The loaded embedding with an extra axis at position 0.
    """
    raw_embedding = np.load("cmu_us_slt_arctic-wav-arctic_a0508.npy")
    embedding_tensor = torch.tensor(raw_embedding)
    # Prepend an axis of size 1 (presumably the batch axis the model expects).
    return embedding_tensor.unsqueeze(0)
# Module-level speaker embedding; text_to_speech passes it to model.generate_speech.
speaker_embeddings = get_speaker_embeddings()
# Text Input: free-form text box whose value is read by the "Convert" button handler.
text = st.text_area("Type your text or upload a text file below.")
# Function to convert text to speech
def text_to_speech(text):
    """Synthesize ``text`` and write the audio to ``speech.wav``.

    The text is tokenized with the module-level ``processor``, turned into a
    spectrogram by ``model.generate_speech`` using the module-level
    ``speaker_embeddings``, and converted to a waveform by ``vocoder``.

    Args:
        text: The text to synthesize.

    Returns:
        str: The path of the written file, always ``"speech.wav"``.
    """
    # NOTE(review): the output path is fixed, so every call overwrites the
    # same file — confirm this is acceptable when several sessions run at once.
    output_path = "speech.wav"

    encoded = processor(text=text, return_tensors="pt")
    token_ids = encoded["input_ids"]
    mel = model.generate_speech(token_ids, speaker_embeddings)

    # Gradients are not needed for inference through the vocoder.
    with torch.no_grad():
        waveform = vocoder(mel)

    sf.write(output_path, waveform.numpy(), samplerate=16000)
    return output_path
# Convert Button
if st.button("Convert"):
    # Treat whitespace-only input like empty input instead of synthesizing it.
    if text and text.strip():
        audio_path = text_to_speech(text)
        # Read the generated file inside a context manager so the handle is
        # closed on every run rather than left open.
        with open(audio_path, 'rb') as audio_file:
            audio_bytes = audio_file.read()
        st.audio(audio_bytes, format='audio/wav')
    else:
        st.error("Please enter some text to convert.")
# File Uploader
uploaded_file = st.file_uploader("Upload your text file here", type=['txt'])
if uploaded_file is not None:
    try:
        text = uploaded_file.getvalue().decode("utf-8")
    except UnicodeDecodeError:
        # A .txt extension does not guarantee UTF-8 content; report the
        # problem in the page instead of failing with a traceback.
        st.error("The uploaded file could not be read as UTF-8 text.")
    else:
        if text.strip():
            audio_path = text_to_speech(text)
            # Context manager closes the audio file handle after reading.
            with open(audio_path, 'rb') as audio_file:
                audio_bytes = audio_file.read()
            st.audio(audio_bytes, format='audio/wav')
        else:
            # Nothing to synthesize in an empty or whitespace-only file.
            st.error("The uploaded file does not contain any text to convert.")