import os
import base64
import numpy as np
import streamlit as st
import streamlit.components.v1 as components
from streamlit_mic_recorder import mic_recorder
from utils import load_model, generate_response, bytes_to_array, start_server
def audio_llm():
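    """Render the MERaLiON-AudioLLM demo page: sidebar generation settings,
    three audio input paths (record, upload, bundled samples), and a chat loop
    that streams responses from the model."""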
    with st.sidebar:
        st.divider()
        st.markdown("""
            """, unsafe_allow_html=True)

        st.slider(label='Temperature', min_value=0.0, max_value=2.0, value=0.7, key='temperature')
        st.slider(label='Top P', min_value=0.0, max_value=1.0, value=1.0, key='top_p')
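
    # The sidebar button resets the conversation and any audio loaded so far.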
    if st.sidebar.button('Clear History'):
        st.session_state.update(messages=[],
                                on_upload=False,
                                on_record=False,
                                on_select=False,
                                audio_array=np.array([]))
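
    # One-time session-state setup: backend server, model client, audio buffers,
    # and the default example instruction shown for bundled samples.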
if "server" not in st.session_state:
st.session_state.server = start_server()
if "client" not in st.session_state or 'model_name' not in st.session_state:
st.session_state.client, st.session_state.model_name = load_model()
if "audio_array" not in st.session_state:
st.session_state.audio_base64 = ''
st.session_state.audio_array = np.array([])
if "default_instruction" not in st.session_state:
st.session_state.default_instruction = ""
st.markdown("MERaLiON-AudioLLM ChatBot 🤖
", unsafe_allow_html=True)
st.markdown(
"""This demo is based on [MERaLiON-AudioLLM](https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION),
developed by I2R, A*STAR, in collaboration with AISG, Singapore.
It is tailored for Singapore’s multilingual and multicultural landscape."""
)
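
    # Three ways to provide audio, laid out side by side: bundled samples (col1),
    # microphone recording (col2), and file upload (col3).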
    col1, col2, col3 = st.columns(3)

    with col2:
        st.markdown("**Record Audio:**")

        recording = mic_recorder(
            format="wav",
            use_container_width=True,
            callback=lambda: st.session_state.update(on_record=True, messages=[]),
            key='record')

        if recording and st.session_state.on_record:
            audio_bytes = recording["bytes"]
            st.session_state.audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
            st.session_state.audio_array = bytes_to_array(audio_bytes)
    with col3:
        st.markdown("**Upload Audio:**")

        uploaded_file = st.file_uploader(
            label="**Upload Audio:**",
            label_visibility="collapsed",
            type=['wav', 'mp3'],
            on_change=lambda: st.session_state.update(on_upload=True, messages=[]),
            key='upload'
        )

        if uploaded_file and st.session_state.on_upload:
            audio_bytes = uploaded_file.read()
            st.session_state.audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
            st.session_state.audio_array = bytes_to_array(audio_bytes)
    with col1:
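        # Bundled audio samples mapped to example instructions that suit them
        # (ASR, speech translation, spoken QA, dialogue summarization, paralinguistic tasks).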
        audio_samples_w_instruct = {
            '1_ASR_IMDA_PART1_ASR_v2_141': "- Turn the spoken language into a text format.\n\n- Please translate the content into Chinese.",
            '7_ASR_IMDA_PART3_30_ASR_v2_2269': "- Need this talk written down, please.",
            '17_ASR_IMDA_PART6_30_ASR_v2_1413': "- Record the spoken word in text form.",
            '25_ST_COVOST2_ZH-CN_EN_ST_V2_4567': "- Please translate the given speech to English.",
            '26_ST_COVOST2_EN_ZH-CN_ST_V2_5422': "- Please translate the given speech to Chinese.",
            '30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454': "- Please follow the instruction in the speech.",
            '32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572': "- What does the man think the woman should do at 4:00?",
            '33_SQA_IMDA_PART3_30_SQA_V2_2310': "- Does Speaker2's wife cook for Speaker2 when they are at home?",
            '34_SQA_IMDA_PART3_30_SQA_V2_3621': "- Does the phrase \"#gai-gai#\" have a meaning in Chinese or Hokkien language?",
            '35_SQA_IMDA_PART3_30_SQA_V2_4062': "- What is the color of the vase mentioned in the dialogue?",
            '36_DS_IMDA_PART4_30_DS_V2_849': "- Condense the dialogue into a concise summary highlighting major topics and conclusions.",
            '39_Paralingual_IEMOCAP_ER_V2_91': "- Based on the speaker's speech patterns, what do you think they are feeling?",
            '40_Paralingual_IEMOCAP_ER_V2_567': "- Based on the speaker's speech patterns, what do you think they are feeling?",
            '42_Paralingual_IEMOCAP_GR_V2_320': "- Is it possible for you to identify whether the speaker in this recording is male or female?",
            '43_Paralingual_IEMOCAP_GR_V2_129': "- Is it possible for you to identify whether the speaker in this recording is male or female?",
            '45_Paralingual_IMDA_PART3_30_GR_V2_12312': "- So, who's speaking in the second part of the clip? \n\n- So, who's speaking in the first part of the clip?",
            '47_Paralingual_IMDA_PART3_30_NR_V2_10479': "- Can you guess which ethnic group this person is from based on their accent?",
            '49_Paralingual_MELD_ER_V2_676': "- What emotions do you think the speaker is expressing?",
            '50_Paralingual_MELD_ER_V2_692': "- Based on the speaker's speech patterns, what do you think they are feeling?",
            '51_Paralingual_VOXCELEB1_GR_V2_2148': "- May I know the gender of the speaker?",
            '53_Paralingual_VOXCELEB1_NR_V2_2286': "- What's the nationality identity of the speaker?",
            '55_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_2': "- What impact would the growth of the healthcare sector have on the country's economy in terms of employment and growth?",
            '56_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_415': "- Based on the statement, can you summarize the speaker's position on the recent controversial issues in Singapore?",
            '57_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_460': "- How does the author respond to parents' worries about masks in schools?",
            '2_ASR_IMDA_PART1_ASR_v2_2258': "- Turn the spoken language into a text format.\n\n- Please translate the content into Chinese.",
            '3_ASR_IMDA_PART1_ASR_v2_2265': "- Turn the spoken language into a text format.",
            '4_ASR_IMDA_PART2_ASR_v2_999': "- Translate the spoken words into text format.",
            '5_ASR_IMDA_PART2_ASR_v2_2241': "- Translate the spoken words into text format.",
            '6_ASR_IMDA_PART2_ASR_v2_3409': "- Translate the spoken words into text format.",
            '8_ASR_IMDA_PART3_30_ASR_v2_1698': "- Need this talk written down, please.",
            '9_ASR_IMDA_PART3_30_ASR_v2_2474': "- Need this talk written down, please.",
            '11_ASR_IMDA_PART4_30_ASR_v2_3771': "- Write out the dialogue as text.",
            '12_ASR_IMDA_PART4_30_ASR_v2_103': "- Write out the dialogue as text.",
            '10_ASR_IMDA_PART4_30_ASR_v2_1527': "- Write out the dialogue as text.",
            '13_ASR_IMDA_PART5_30_ASR_v2_1446': "- Translate this vocal recording into a textual format.",
            '14_ASR_IMDA_PART5_30_ASR_v2_2281': "- Translate this vocal recording into a textual format.",
            '15_ASR_IMDA_PART5_30_ASR_v2_4388': "- Translate this vocal recording into a textual format.",
            '16_ASR_IMDA_PART6_30_ASR_v2_576': "- Record the spoken word in text form.",
            '18_ASR_IMDA_PART6_30_ASR_v2_2834': "- Record the spoken word in text form.",
            '19_ASR_AIShell_zh_ASR_v2_5044': "- Transform the oral presentation into a text document.",
            '20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833': "- Please provide a written transcription of the speech.",
            '27_ST_COVOST2_EN_ZH-CN_ST_V2_6697': "- Please translate the given speech to Chinese.",
            '28_SI_ALPACA-GPT4-AUDIO_SI_V2_299': "- Please follow the instruction in the speech.",
            '29_SI_ALPACA-GPT4-AUDIO_SI_V2_750': "- Please follow the instruction in the speech.",
        }
        audio_sample_names = list(audio_samples_w_instruct.keys())

        st.markdown("**Select Audio From Examples:**")

        sample_name = st.selectbox(
            label="**Select Audio:**",
            label_visibility="collapsed",
            options=audio_sample_names,
            index=None,
            placeholder="Select an audio sample:",
            on_change=lambda: st.session_state.update(on_select=True, messages=[]),
            key='select')

        if sample_name and st.session_state.on_select:
            with open(f"audio_samples/{sample_name}.wav", "rb") as f:
                audio_bytes = f.read()
            st.session_state.default_instruction = audio_samples_w_instruct[sample_name]
            st.session_state.audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
            st.session_state.audio_array = bytes_to_array(audio_bytes)
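
    # Play back whichever audio is currently loaded, then clear the input flags
    # so the handlers above are not re-triggered on the next rerun.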
    st.audio(st.session_state.audio_array, format="audio/wav", sample_rate=16000)
    st.session_state.update(on_upload=False, on_record=False, on_select=False)

    if st.session_state.default_instruction:
        st.write("**Example Instructions:**")
        st.write(st.session_state.default_instruction)
    st.markdown(
        """
        """,
        unsafe_allow_html=True,
    )
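
    # Chat state: the full message history plus a flag that disables the input
    # box while a response is being generated.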
if "messages" not in st.session_state:
st.session_state.messages = []
if 'disprompt' not in st.session_state:
st.session_state.disprompt = False
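
    # Only the most recent user/assistant exchange is re-rendered on each run.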
    for message in st.session_state.messages[-2:]:
        with st.chat_message(message["role"]):
            st.write(message["content"])
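
    # Chat loop: the input box locks itself on submit and is re-enabled once the
    # model response has been streamed and stored.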
    if prompt := st.chat_input(
        placeholder="Type Your Instruction Here",
        disabled=st.session_state.disprompt,
        on_submit=lambda: st.session_state.update(disprompt=True)
    ):
        with st.chat_message("user"):
            st.write(prompt)
            st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("assistant"):
            if not st.session_state.audio_base64:
                response = "Please specify audio first!"
                st.write(response)
            else:
                with st.spinner("Thinking..."):
                    try:
                        stream = generate_response(prompt)
                        response = st.write_stream(stream)
                    except Exception as e:
                        response = f"Caught Exception: {repr(e)}. Please contact the administrator to restart this space."
                        st.write(response)
                        raise e

        st.session_state.messages.append({"role": "assistant", "content": response})
        st.session_state.disprompt = False
        st.rerun()