Spaces:
Runtime error
Runtime error
File size: 2,390 Bytes
19bdfd0 00b0d1a 00f303b 19bdfd0 00f303b 00b0d1a 00f303b 00b0d1a 00f303b 00b0d1a 00f303b 00b0d1a 19bdfd0 00b0d1a 19bdfd0 00b0d1a 19bdfd0 00b0d1a 00f303b 19bdfd0 00b0d1a 19bdfd0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
import streamlit as st
import time
from datetime import datetime
from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan, SpeechT5ForTextToSpeech
import numpy as np
import torch
from io import StringIO
import soundfile as sf
# Custom styling
def local_css(file_name):
    """Inject the contents of a CSS file into the Streamlit page.

    The file is read as text and emitted inside a ``<style>`` element through
    ``st.markdown`` with ``unsafe_allow_html=True``.

    Args:
        file_name: Path of the stylesheet to load.

    Raises:
        OSError: If the file cannot be opened (for example, it is missing).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Use an explicit encoding so the decoded CSS does not depend on the
    # platform's default locale encoding.
    with open(file_name, encoding="utf-8") as css_file:
        css = css_file.read()
    st.markdown(f'<style>{css}</style>', unsafe_allow_html=True)


# The path is relative, so 'style.css' is looked up from the working directory.
local_css("style.css")
# Page header: the headline followed by a one-line description of the app.
st.title(body="Text-to-Voice Conversion")
st.markdown(body="Convert your text to speech using advanced AI models.")
def _cache_resource(func):
    """Wrap ``func`` so its result is cached instead of rebuilt on every call.

    ``st.cache`` has been deprecated since Streamlit 1.18 in favour of
    ``st.cache_resource`` for objects such as models. The legacy decorator is
    kept only as a fallback so the app still starts on Streamlit releases
    that predate ``st.cache_resource``.
    """
    if hasattr(st, "cache_resource"):
        return st.cache_resource(func)
    return st.cache(allow_output_mutation=True)(func)


# Load models outside of the conversion function so they are cached and not
# rebuilt each time the script body runs.
@_cache_resource
def load_models():
    """Load the text-to-speech model, its processor and the vocoder.

    Returns:
        A ``(model, processor, vocoder)`` tuple created from the
        ``microsoft/speecht5_tts`` (model and processor) and
        ``microsoft/speecht5_hifigan`` (vocoder) checkpoints.
    """
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    return model, processor, vocoder


model, processor, vocoder = load_models()


# Load speaker embeddings
@_cache_resource
def get_speaker_embeddings():
    """Load the speaker embedding used for synthesis.

    Returns:
        The array stored in ``cmu_us_slt_arctic-wav-arctic_a0508.npy`` as a
        ``torch`` tensor with an extra leading dimension of size 1.

    Raises:
        OSError: If the ``.npy`` file cannot be opened.
    """
    speaker_embeddings = np.load("cmu_us_slt_arctic-wav-arctic_a0508.npy")
    # NOTE(review): the leading dimension is presumably the batch axis
    # expected by ``generate_speech`` - confirm against the model docs.
    return torch.tensor(speaker_embeddings).unsqueeze(0)


speaker_embeddings = get_speaker_embeddings()
# Free-form text entry; whatever the user typed is kept in ``text``.
text = st.text_area(label="Type your text or upload a text file below.")
# Function to convert text to speech
def text_to_speech(text, output_path="speech.wav", sample_rate=16000):
    """Synthesise ``text`` and write the resulting audio to a WAV file.

    Args:
        text: The text to convert to speech.
        output_path: File the audio is written to. Defaults to
            ``"speech.wav"``, the path that used to be hard-coded.
        sample_rate: Sample rate passed to ``sf.write``. Defaults to 16000,
            the value that used to be hard-coded (presumably the rate of the
            vocoder output - confirm before changing it).

    Returns:
        ``output_path``, so callers can open the file that was written.

    Note:
        With the default ``output_path`` every call writes to the same file,
        so overlapping calls would clobber each other's output. Pass a
        distinct path when that matters.
    """
    inputs = processor(text=text, return_tensors="pt")
    # Pure inference: run both stages without gradient tracking so that no
    # autograd state is recorded and ``.numpy()`` below is always valid.
    with torch.no_grad():
        spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
        speech = vocoder(spectrogram)
    sf.write(output_path, speech.numpy(), samplerate=sample_rate)
    return output_path
def _play_speech(source_text):
    """Synthesise ``source_text`` and embed the resulting audio in the page."""
    audio_path = text_to_speech(source_text)
    # Read through a context manager so the file handle is closed promptly
    # instead of being left open after each conversion.
    with open(audio_path, 'rb') as audio_file:
        audio_bytes = audio_file.read()
    st.audio(audio_bytes, format='audio/wav')


# Convert Button
if st.button("Convert"):
    # Whitespace-only input is treated like empty input.
    if text and text.strip():
        _play_speech(text)
    else:
        st.error("Please enter some text to convert.")

# File Uploader
uploaded_file = st.file_uploader("Upload your text file here", type=['txt'])
if uploaded_file is not None:
    try:
        # "utf-8-sig" decodes ordinary UTF-8 and additionally drops a leading
        # byte-order mark, which would otherwise reach the synthesiser.
        text = uploaded_file.getvalue().decode("utf-8-sig")
    except UnicodeDecodeError:
        # Report undecodable uploads in the page instead of crashing the script.
        st.error("The uploaded file is not valid UTF-8 text.")
    else:
        if text.strip():
            _play_speech(text)
        else:
            st.error("The uploaded file does not contain any text.")
|