# Spaces: Sleeping  (page-header residue from the hosting site; not part of the script)
import streamlit as st | |
from transformers import VitsModel, AutoTokenizer | |
import torch | |
# Page header: the app title followed by a one-line usage hint.
PAGE_TITLE = "Text-to-Speech with VitsModel"
PAGE_INTRO = "Enter some English text, and I'll generate audio for you!"

st.title(PAGE_TITLE)
st.write(PAGE_INTRO)
# Load Model and Tokenizer
# Streamlit re-executes the whole script on every widget interaction, so the
# loader is wrapped in st.cache_resource: the checkpoint is loaded once per
# server process and the same objects are reused on later reruns.
@st.cache_resource
def load_tts_model():
    """Load the ``facebook/mms-tts-eng`` VITS model and its tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple: the model from
        ``VitsModel.from_pretrained`` and the tokenizer from
        ``AutoTokenizer.from_pretrained`` for the same checkpoint.
    """
    checkpoint = "facebook/mms-tts-eng"
    model = VitsModel.from_pretrained(checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return model, tokenizer
model, tokenizer = load_tts_model()

# User Input
user_text = st.text_input("Enter your text here:")

# Generate Audio on Button Click
if st.button("Generate Speech"):
    # Treat whitespace-only input the same as empty input.
    text = (user_text or "").strip()
    if not text:
        st.warning("Please enter some text.")
    else:
        inputs = tokenizer(text, return_tensors="pt")
        # Inference only: skip gradient tracking.
        with torch.no_grad():
            waveform = model(**inputs).waveform

        # Read the sample rate from the model config instead of hard-coding
        # it, so playback speed stays correct if the checkpoint changes.
        sample_rate = model.config.sampling_rate

        # Play the samples straight from memory. The earlier version also
        # wrote "temp_audio.wav" through an `sf` module that was never
        # imported (NameError), and rendered two players for the same clip.
        st.audio(waveform[0].cpu().numpy(), sample_rate=sample_rate)