IndriVoice / app.py
skriller18's picture
Made it private
c370778
raw
history blame
2.32 kB
import torch
import torchaudio
import streamlit as st
import requests
import streamlit.components.v1 as components
import asyncio
import io
import os
from dotenv import load_dotenv
# Load environment settings (python-dotenv) before BASE_URL is read below.
load_dotenv()

# Model / task identifiers. Nothing in the visible part of this script reads
# them; they are kept as module-level names.
model_id = "11mlabs/indri-0.1-124m-tts"
task = "indri-tts"

# Page header.
st.title("Indri")
st.subheader("Ultrafast multi-modal AI")

# Root URL of the TTS backend ("/tts" is appended when a request is made).
# This is None when the BASE_URL environment variable is not set.
baseUrl = os.environ.get("BASE_URL")
# Speaker token -> human-readable label ("<flag> <person emoji> <description>").
# The keys, in insertion order, are the dropdown options; the label is what the
# user sees and is also the value sent as "speaker" in the TTS request.
# The labels were previously stored as mojibake (UTF-8 emoji bytes decoded with
# a single-byte code page); they are restored to the intended emoji here.
speakers = {
    "[spkr_63]": "🇬🇧 👨 book reader",
    "[spkr_67]": "🇺🇸 👨 influencer",
    "[spkr_68]": "🇮🇳 👨 book reader",
    "[spkr_69]": "🇮🇳 👨 book reader",
    "[spkr_70]": "🇮🇳 👨 motivational speaker",
    "[spkr_62]": "🇮🇳 👨 book reader heavy",
    "[spkr_53]": "🇮🇳 👩 recipe reciter",
    "[spkr_60]": "🇮🇳 👩 book reader",
    "[spkr_74]": "🇺🇸 👨 book reader",
    "[spkr_75]": "🇮🇳 👨 entrepreneur",
    "[spkr_76]": "🇬🇧 👨 nature lover",
    "[spkr_77]": "🇮🇳 👨 influencer",
    "[spkr_66]": "🇮🇳 👨 politician",
}
# Input widgets: the voice picker first, then the text box.
with st.container():
    st.markdown("### Speaker Selection")
    # Options are the speaker tokens; each one is displayed via its label.
    speaker_id = st.selectbox(
        "Select a speaker:",
        options=list(speakers),
        format_func=lambda token: speakers[token],
    )

    st.markdown("### Text Input")
    # The widget itself enforces the 200-character limit.
    text_input = st.text_area(
        "Enter text for TTS (max 200 characters):",
        max_chars=200,
    )
# Upper bound, in seconds, on a single TTS request so a stalled backend cannot
# hang the Streamlit session indefinitely.
_TTS_REQUEST_TIMEOUT_S = 60


async def generate_audio():
    """Request speech for the current inputs and render an audio player.

    Reads the module-level ``text_input``, ``speaker_id``, ``speakers`` and
    ``baseUrl``. POSTs the text and speaker to ``{baseUrl}/tts``; on a
    successful response the body is decoded with torchaudio, re-encoded as WAV
    and passed to ``st.audio``.

    Missing text, a missing base URL, network failures and non-2xx responses
    are reported with a Streamlit message and the function returns early.
    Errors raised while decoding or encoding the audio are not caught.

    The function contains no ``await``; it is a coroutine only because the
    caller drives it with ``asyncio.run``. The HTTP call itself is blocking.
    """
    import tempfile  # local import: only this function needs it

    # Reject empty and whitespace-only input before hitting the network.
    if not text_input or not text_input.strip():
        st.warning("Please enter text to generate audio.")
        return

    # Without this check an unset BASE_URL would yield the URL "None/tts".
    if not baseUrl:
        st.error("BASE_URL is not configured; cannot reach the TTS service.")
        return

    # NOTE(review): the request carries the display label from `speakers`,
    # not the "[spkr_NN]" token itself - confirm this is what the API expects.
    speaker_name = speakers[speaker_id]

    try:
        response = requests.post(
            f"{baseUrl}/tts",
            json={
                "text": text_input,
                "speaker": speaker_name,
            },
            headers={
                "accept": "application/json",
                "Content-Type": "application/json",
            },
            timeout=_TTS_REQUEST_TIMEOUT_S,
        )
    except requests.RequestException as exc:
        # Connection errors and timeouts become a message, not a traceback.
        st.error(f"Could not reach the TTS service: {exc}")
        return

    if not response.ok:
        st.warning(f"Received invalid response format. Status Code: {response.status_code}, Response: {response.text}")
        return

    audio_tensor, sample_rate = torchaudio.load(io.BytesIO(response.content))

    # Re-encode in a private scratch directory instead of a fixed "output.wav"
    # in the working directory, so concurrent sessions cannot overwrite each
    # other's audio. The bytes are read back before the directory is removed.
    with tempfile.TemporaryDirectory() as scratch_dir:
        wav_path = os.path.join(scratch_dir, "output.wav")
        torchaudio.save(wav_path, audio_tensor, sample_rate=sample_rate)
        with open(wav_path, "rb") as wav_file:
            wav_bytes = wav_file.read()

    st.audio(wav_bytes, format="audio/wav")
# Run one synthesis request on the rerun in which the button was clicked.
generate_clicked = st.button("Generate Audio", key="generate_audio")
if generate_clicked:
    asyncio.run(generate_audio())