"""Streamlit front end for the Indri text-to-speech service.

The page lets the user pick a speaker and enter up to 200 characters of text,
POSTs both to ``{BASE_URL}/tts`` and plays back the audio the service returns.
``BASE_URL`` is read from the environment (optionally via a ``.env`` file).
"""
import asyncio
import io
import os
import tempfile

import requests
import streamlit as st
import streamlit.components.v1 as components
import torch
import torchaudio
from dotenv import load_dotenv

load_dotenv()

model_id = '11mlabs/indri-0.1-124m-tts'
task = 'indri-tts'

# (connect, read) timeouts in seconds for the TTS request. Without a timeout
# `requests` waits forever and a stalled backend would hang the session.
REQUEST_TIMEOUT = (10, 120)

st.title("Indri")
st.subheader("Ultrafast multi-modal AI")

baseUrl = os.getenv("BASE_URL")

# Maps the speaker token to the human-readable label shown in the select box.
speakers = {
    "[spkr_63]" : "🇬🇧 👨 book reader",
    "[spkr_67]" : "🇺🇸 👨 influencer",
    "[spkr_68]" : "🇮🇳 👨 book reader",
    "[spkr_69]" : "🇮🇳 👨 book reader",
    "[spkr_70]" : "🇮🇳 👨 motivational speaker",
    "[spkr_62]" : "🇮🇳 👨 book reader heavy",
    "[spkr_53]" : "🇮🇳 👩 recipe reciter",
    "[spkr_60]" : "🇮🇳 👩 book reader",
    "[spkr_74]" : "🇺🇸 👨 book reader",
    "[spkr_75]" : "🇮🇳 👨 entrepreneur",
    "[spkr_76]" : "🇬🇧 👨 nature lover",
    "[spkr_77]" : "🇮🇳 👨 influencer",
    "[spkr_66]" : "🇮🇳 👨 politician"
}

with st.container():
    st.markdown("### Speaker Selection")
    speaker_id = st.selectbox(
        "Select a speaker:",
        options=list(speakers.keys()),
        format_func=lambda x: speakers[x],
    )

    st.markdown("### Text Input")
    text_input = st.text_area(
        "Enter text for TTS (max 200 characters):", max_chars=200
    )


async def generate_audio():
    """Request speech for the current text/speaker and render an audio player.

    Reads the module-level ``text_input``, ``speaker_id`` and ``baseUrl``.
    Every failure path is reported in the page (``st.warning`` / ``st.error``)
    instead of raising, so the user never sees a raw traceback.

    The HTTP call itself is blocking; the coroutine form is kept only so the
    existing ``asyncio.run(generate_audio())`` entry point stays unchanged.
    """
    # Treat whitespace-only input the same as empty input.
    if not text_input or not text_input.strip():
        st.warning("Please enter text to generate audio.")
        return

    # Without BASE_URL the request URL would become "None/tts".
    if not baseUrl:
        st.error(
            "BASE_URL is not configured. "
            "Set it in the environment or in a .env file."
        )
        return

    # The request carries the human-readable label, not the "[spkr_NN]" key.
    # NOTE(review): confirm this is the value the /tts endpoint expects.
    speaker_name = speakers[speaker_id]

    try:
        response = requests.post(
            f"{baseUrl}/tts",
            json={
                "text": text_input,
                "speaker": speaker_name
            },
            headers={
                "accept": "application/json",
                "Content-Type": "application/json"
            },
            timeout=REQUEST_TIMEOUT,
        )
    except requests.RequestException as exc:
        st.error(f"Could not reach the TTS service: {exc}")
        return

    if not response.ok:
        st.warning(f"Received invalid response format. Status Code: {response.status_code}, Response: {response.text}")
        return

    try:
        audio_tensor, sample_rate = torchaudio.load(io.BytesIO(response.content))
        # Re-encode as WAV in a private temporary directory: a fixed
        # "output.wav" in the working directory is shared by every session,
        # so concurrent users could overwrite each other's audio.
        with tempfile.TemporaryDirectory() as tmp_dir:
            wav_path = os.path.join(tmp_dir, "output.wav")
            torchaudio.save(wav_path, audio_tensor, sample_rate=sample_rate)
            with open(wav_path, "rb") as wav_file:
                wav_bytes = wav_file.read()
    except (RuntimeError, OSError, ValueError) as exc:
        st.error(f"Could not decode the audio returned by the TTS service: {exc}")
        return

    st.audio(wav_bytes, format="audio/wav")


if st.button("Generate Audio", key="generate_audio"):
    asyncio.run(generate_audio())