import streamlit as st
import numpy as np
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from io import StringIO
import soundfile as sf


# Load models outside of function calls for efficiency
@st.cache_resource  # cache_resource keeps the (non-serializable) models as shared singletons
def load_models():
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    return model, processor, vocoder

model, processor, vocoder = load_models()


# Load speaker embeddings
@st.cache_data
def get_speaker_embeddings():
    speaker_embeddings = np.load("cmu_us_clb_arctic-wav-arctic_a0144.npy")
    return torch.tensor(speaker_embeddings).unsqueeze(0)

speaker_embeddings = get_speaker_embeddings()


# Improved Styling
def local_css(file_name):
    with open(file_name) as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)

local_css("style.css")

# Streamlined Layout
st.title("Text-to-Voice Conversion")
st.markdown("Convert your text to speech using advanced AI models.")


# Function to convert text to speech
def text_to_speech(text):
    try:
        # Segment the text if it's too long for a single pass through the model
        max_length = 100  # rough per-segment character limit, per the model's capability
        segments = [text[i:i + max_length] for i in range(0, len(text), max_length)]
        audio_paths = []
        for segment in segments:
            inputs = processor(text=segment, return_tensors="pt")
            spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
            with torch.no_grad():
                speech = vocoder(spectrogram)
            audio_path = f"speech_segment_{len(audio_paths)}.wav"
            sf.write(audio_path, speech.numpy(), samplerate=16000)
            audio_paths.append(audio_path)
        return audio_paths
    except Exception as e:
        st.error(f"Error in text-to-speech conversion: {e}")
        return []


# Function to combine audio segments
def combine_audio_segments(paths):
    combined_speech = []
    samplerate = 16000  # matches the rate used when writing the segments
    for path in paths:
        data, samplerate = sf.read(path)
        combined_speech.extend(data)
    sf.write("combined_speech.wav", np.array(combined_speech), samplerate)
    return "combined_speech.wav"


# Text Input
text = st.text_area("Type your text or upload a text file below.")

# Convert Button
if st.button("Convert"):
    if text:
        audio_paths = text_to_speech(text)
        if audio_paths:  # skip playback if conversion failed
            combined_audio_path = combine_audio_segments(audio_paths)
            with open(combined_audio_path, "rb") as audio_file:
                audio_bytes = audio_file.read()
            st.audio(audio_bytes, format="audio/wav")
    else:
        st.error("Please enter some text to convert.")

# File Uploader
uploaded_file = st.file_uploader("Upload your text file here", type=["txt"])
if uploaded_file is not None:
    stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
    text = stringio.read()
    st.write(text)
    if st.button("Convert Uploaded File", key=1):
        audio_paths = text_to_speech(text)
        if audio_paths:
            combined_audio_path = combine_audio_segments(audio_paths)
            with open(combined_audio_path, "rb") as audio_file:
                audio_bytes = audio_file.read()
            st.audio(audio_bytes, format="audio/wav")
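
# --- Usage notes (assumptions, not part of the original listing) ---
# Run the app with:  streamlit run app.py   (assuming this file is saved as app.py).
# Two local files are expected next to the script:
#   * style.css                              -- custom CSS injected via local_css()
#   * cmu_us_clb_arctic-wav-arctic_a0144.npy -- a 512-dim x-vector speaker embedding
# A minimal sketch for producing such an embedding file, assuming the
# "Matthijs/cmu-arctic-xvectors" dataset on the Hugging Face Hub; the index used
# below is hypothetical, so pick one that matches the speaker you want:
#
#   from datasets import load_dataset
#   import numpy as np
#   xvectors = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
#   np.save("cmu_us_clb_arctic-wav-arctic_a0144.npy",
#           np.array(xvectors[7306]["xvector"], dtype=np.float32))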