File size: 2,390 Bytes
19bdfd0
 
 
00b0d1a
00f303b
 
19bdfd0
 
00f303b
00b0d1a
 
 
 
00f303b
00b0d1a
00f303b
00b0d1a
 
 
00f303b
00b0d1a
 
 
 
 
 
 
19bdfd0
00b0d1a
19bdfd0
00b0d1a
 
 
 
 
19bdfd0
00b0d1a
 
 
 
 
 
 
00f303b
19bdfd0
 
 
 
00b0d1a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19bdfd0
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import streamlit as st
import time
from datetime import datetime
from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan, SpeechT5ForTextToSpeech
import numpy as np
import torch
from io import StringIO
import soundfile as sf

# Improved Styling
def local_css(file_name):
    """Inject a local CSS file into the Streamlit page as an inline <style> tag.

    Parameters
    ----------
    file_name : str
        Path to the CSS file to embed.
    """
    # Read as UTF-8 explicitly so styling does not break on platforms
    # whose default locale encoding is not UTF-8 (original relied on the
    # locale default).
    with open(file_name, encoding="utf-8") as f:
        st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)

local_css("style.css")  # Assuming a CSS file named 'style.css' in the same directory

# Streamlined Layout: page title and a one-line description of the app.
st.title("Text-to-Voice Conversion")
st.markdown("Convert your text to speech using advanced AI models.")

# Load models outside of function calls for efficiency.
# NOTE(review): st.cache(allow_output_mutation=True) was deprecated and has
# been removed from current Streamlit releases; st.cache_resource is the
# modern cache for unserializable resources such as model weights.
@st.cache_resource
def load_models():
    """Load and cache the SpeechT5 TTS model, its processor, and the vocoder.

    Returns
    -------
    tuple
        (model, processor, vocoder) — pretrained Hugging Face objects:
        SpeechT5ForTextToSpeech, SpeechT5Processor, SpeechT5HifiGan.
    """
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    return model, processor, vocoder

model, processor, vocoder = load_models()

# Load speaker embeddings.
# NOTE(review): st.cache(allow_output_mutation=True) is deprecated/removed in
# current Streamlit; st.cache_resource is the supported replacement.
@st.cache_resource
def get_speaker_embeddings():
    """Load the speaker x-vector from disk and add a batch dimension.

    Returns
    -------
    torch.Tensor
        Tensor of shape (1, embedding_dim) — assumes the .npy file holds a
        single 1-D embedding vector; TODO confirm the file layout.
    """
    speaker_embeddings = np.load("cmu_us_slt_arctic-wav-arctic_a0508.npy")
    return torch.tensor(speaker_embeddings).unsqueeze(0)

speaker_embeddings = get_speaker_embeddings()

# Text Input: free-form text box; `text` may be replaced by the file
# uploader's contents further down the script.
text = st.text_area("Type your text or upload a text file below.")

# Function to convert text to speech
def text_to_speech(text):
    """Synthesize `text` into a 16 kHz WAV file and return its path.

    Parameters
    ----------
    text : str
        The text to synthesize.

    Returns
    -------
    str
        Path of the written WAV file ("speech.wav", overwritten each call).
    """
    inputs = processor(text=text, return_tensors="pt")
    # Run the entire pipeline without gradient tracking: this is pure
    # inference, and the original built an autograd graph for
    # generate_speech, wasting memory and time.
    with torch.no_grad():
        spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
        speech = vocoder(spectrogram)
    sf.write("speech.wav", speech.numpy(), samplerate=16000)
    return "speech.wav"

# Convert Button: synthesize the text-area contents on demand.
if st.button("Convert"):
    if text:
        audio_path = text_to_speech(text)
        # Context manager guarantees the handle is closed — the original
        # opened the file and never closed it (resource leak).
        with open(audio_path, 'rb') as audio_file:
            audio_bytes = audio_file.read()
        st.audio(audio_bytes, format='audio/wav')
    else:
        st.error("Please enter some text to convert.")

# File Uploader: synthesize an uploaded .txt file immediately on upload.
uploaded_file = st.file_uploader("Upload your text file here", type=['txt'])
if uploaded_file is not None:
    text = uploaded_file.getvalue().decode("utf-8")
    audio_path = text_to_speech(text)
    # Context manager guarantees the handle is closed — the original
    # opened the file and never closed it (resource leak).
    with open(audio_path, 'rb') as audio_file:
        audio_bytes = audio_file.read()
    st.audio(audio_bytes, format='audio/wav')