import streamlit as st
import numpy as np
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import soundfile as sf
from io import StringIO

# Load models once and cache them across reruns for efficiency.
# st.cache_resource is the current API for caching global resources such as
# models; st.cache(allow_output_mutation=True) is deprecated.
@st.cache_resource
def load_models():
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    return model, processor, vocoder

model, processor, vocoder = load_models()

# Load speaker embeddings (a precomputed x-vector selecting the target voice).
@st.cache_resource
def get_speaker_embeddings():
    speaker_embeddings = np.load("cmu_us_clb_arctic-wav-arctic_a0144.npy")
    return torch.tensor(speaker_embeddings).unsqueeze(0)

speaker_embeddings = get_speaker_embeddings()

# Improved styling (assumes style.css is present alongside the app)
def local_css(file_name):
    with open(file_name) as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)

local_css("style.css")  # Apply custom CSS styles

# Streamlit layout
st.title("Text-to-Voice Conversion")
st.markdown("Convert your text to speech using advanced AI models.")

# Convert text to speech, one fixed-size segment at a time.
def text_to_speech(text):
    try:
        max_length = 100  # Max characters per segment, within the model's input limits
        segments = [text[i:i + max_length] for i in range(0, len(text), max_length)]
        audio_paths = []
        for i, segment in enumerate(segments):
            inputs = processor(text=segment, return_tensors="pt")
            with torch.no_grad():
                # generate_speech returns a mel spectrogram when no vocoder
                # is passed; the HiFi-GAN vocoder then renders the waveform.
                spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
                speech = vocoder(spectrogram)
            audio_path = f"speech_segment_{i}.wav"
            sf.write(audio_path, speech.numpy(), samplerate=16000)
            audio_paths.append(audio_path)
        return audio_paths
    except Exception as e:
        st.error(f"Error in text-to-speech conversion: {e}")
        return []

# Concatenate the per-segment WAV files into a single file.
def combine_audio_segments(paths):
    combined_speech = []
    samplerate = 16000  # Default; matches the rate used when writing segments
    for path in paths:
        data, samplerate = sf.read(path)
        combined_speech.extend(data)
    sf.write("combined_speech.wav", np.array(combined_speech), samplerate)
    return "combined_speech.wav"

# Text input and conversion button
text = st.text_area("Type your text here.")
if st.button("Convert"):
    if text:
        audio_paths = text_to_speech(text)
        if audio_paths:  # Skip playback if conversion failed
            combined_audio_path = combine_audio_segments(audio_paths)
            with open(combined_audio_path, "rb") as f:
                audio_bytes = f.read()
            st.audio(audio_bytes, format="audio/wav")
    else:
        st.error("Please enter some text to convert.")

# File uploader and conversion button
uploaded_file = st.file_uploader("Upload a text file here", type=["txt"])
if uploaded_file is not None:
    stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
    text = stringio.read()
    st.write(text)
    if st.button("Convert Uploaded File", key="upload"):
        audio_paths = text_to_speech(text)
        if audio_paths:
            combined_audio_path = combine_audio_segments(audio_paths)
            with open(combined_audio_path, "rb") as f:
                audio_bytes = f.read()
            st.audio(audio_bytes, format="audio/wav")
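
# --- Optional design note: in-memory concatenation (a minimal sketch) ---
# The pipeline above round-trips every segment through a WAV file on disk and
# then re-reads each one to combine them. A sketch of an alternative, assuming
# the same processor/model/vocoder/speaker_embeddings globals defined above:
# keep each segment's waveform as a NumPy array and write the file once.
# synthesize_in_memory is a hypothetical helper, not part of the app above.
def synthesize_in_memory(text, max_length=100):
    waveforms = []
    for i in range(0, len(text), max_length):
        inputs = processor(text=text[i:i + max_length], return_tensors="pt")
        with torch.no_grad():
            spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
            waveforms.append(vocoder(spectrogram).numpy())
    # One concatenation and one write, instead of one file per segment.
    combined = np.concatenate(waveforms) if waveforms else np.array([], dtype=np.float32)
    sf.write("combined_speech.wav", combined, samplerate=16000)
    return "combined_speech.wav"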