# MERaLiON-AudioLLM ChatBot 🤖

import copy
import base64

import numpy as np
import streamlit as st
from streamlit_mic_recorder import mic_recorder

from utils import (
    GENERAL_INSTRUCTIONS, 
    AUDIO_SAMPLES_W_INSTRUCT,
    NoAudioException,
    TunnelNotRunningException,
    retry_generate_response,
    load_model,
    bytes_to_array, 
    start_server, 
)

# Baseline value for every per-session dialogue key.
# - default_instruction / audio_base64 / audio_array describe the selected audio.
# - disprompt disables the chat input and the example-instruction buttons.
# - new_prompt holds an instruction waiting to be processed.
# - on_* flags are raised by widget callbacks and cleared after being handled.
DEFAULT_DIALOGUE_STATES = {
    "default_instruction": [],
    "audio_base64": "",
    "audio_array": np.array([]),
    "disprompt": False,
    "new_prompt": "",
    "messages": [],
    "on_select": False,
    "on_upload": False,
    "on_record": False,
    "on_click_button": False,
}

@st.fragment
def sidebar_fragment():
    """Render the sidebar content: supported-task list and generation sliders.

    Two sliders are created with the widget keys ``temperature`` and
    ``top_p``. The function takes no arguments and returns nothing; it is
    meant to be called inside a ``with st.sidebar:`` block.
    """
    # Static, hand-written HTML only (no user input is interpolated), so
    # enabling raw HTML here is safe.
    st.markdown("""<div class="sidebar-intro">
                <p><strong>📌 Supported Tasks</strong></p>
                <p>Automatic Speech Recognition</p>
                <p>Speech Translation</p>
                <p>Spoken Question Answering</p>
                <p>Spoken Dialogue Summarization</p>
                <p>Speech Instruction</p>
                <p>Paralinguistics</p>
                <br>
                <p><strong>📎 Generation Config</strong></p>
                </div>""", unsafe_allow_html=True)

    st.slider(label='Temperature', min_value=0.0, max_value=2.0, value=0.7, key='temperature')

    st.slider(label='Top P', min_value=0.0, max_value=1.0, value=1.0, key='top_p')

def _store_audio(audio_bytes, instructions):
    """Save a newly chosen audio clip and its example instructions in session state."""
    st.session_state.default_instruction = instructions
    st.session_state.audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
    st.session_state.audio_array = bytes_to_array(audio_bytes)


@st.fragment
def specify_audio_fragment():
    """Let the user pick, upload or record audio, then preview it.

    Three columns offer the three audio sources. Each widget raises its own
    ``on_select`` / ``on_upload`` / ``on_record`` flag from a callback, and
    the audio in session state is only replaced when the matching flag is
    set, i.e. when that widget actually changed. All three flags are cleared
    once the columns have been processed.

    When audio is present it is shown in a user chat bubble together with
    one button per example instruction. Clicking a button stores the
    instruction as ``new_prompt`` and triggers a full-app rerun so that code
    outside this fragment can handle it.
    """
    # The preview and the length check both treat the array as 16 kHz samples.
    # NOTE(review): presumably bytes_to_array resamples to 16 kHz — confirm.
    sample_rate = 16000
    max_seconds = 30.0

    col1, col2, col3 = st.columns([3.5, 4, 1.5])

    with col1:
        audio_sample_names = list(AUDIO_SAMPLES_W_INSTRUCT)

        st.markdown("**Select Audio From Examples:**")

        sample_name = st.selectbox(
            label="**Select Audio:**",
            label_visibility="collapsed",
            options=audio_sample_names,
            index=None,
            placeholder="Select an audio sample:",
            on_change=lambda: st.session_state.update(on_select=True),
            key='select')

        if sample_name and st.session_state.on_select:
            # Context manager so the file handle is closed deterministically.
            with open(f"audio_samples/{sample_name}.wav", "rb") as sample_file:
                audio_bytes = sample_file.read()
            _store_audio(audio_bytes, AUDIO_SAMPLES_W_INSTRUCT[sample_name])

    with col2:
        st.markdown("or **Upload Audio:**")

        uploaded_file = st.file_uploader(
            label="**Upload Audio:**",
            label_visibility="collapsed",
            type=['wav', 'mp3'],
            on_change=lambda: st.session_state.update(on_upload=True),
            key='upload'
        )

        if uploaded_file and st.session_state.on_upload:
            _store_audio(uploaded_file.read(), GENERAL_INSTRUCTIONS)

    with col3:
        st.markdown("or **Record Audio:**")

        recording = mic_recorder(
            start_prompt="▶ start recording",
            stop_prompt="🔴 stop recording",
            format="wav",
            use_container_width=True,
            callback=lambda: st.session_state.update(on_record=True),
            key='record')

        if recording and st.session_state.on_record:
            _store_audio(recording["bytes"], GENERAL_INSTRUCTIONS)

    # The flags are one-shot: clear them so a later rerun of this fragment
    # does not reload the same audio again.
    st.session_state.update(on_upload=False, on_record=False, on_select=False)

    if st.session_state.audio_array.size:
        with st.chat_message("user"):
            if st.session_state.audio_array.shape[0] / sample_rate > max_seconds:
                st.warning("MERaLiON-AudioLLM can only process audio for up to 30 seconds. Audio longer than that will be truncated.")

            st.audio(st.session_state.audio_array, format="audio/wav", sample_rate=sample_rate)

            for i, inst in enumerate(st.session_state.default_instruction):
                # The instruction is passed through ``args`` rather than
                # captured by the lambda, so each button keeps its own text.
                st.button(
                    f"**Example Instruction {i+1}**: {inst}",
                    args=(inst,),
                    disabled=st.session_state.disprompt,
                    on_click=lambda p: st.session_state.update(disprompt=True, new_prompt=p, on_click_button=True)
                )

    if st.session_state.on_click_button:
        # A button click only reruns this fragment; escalate to a full-app
        # rerun so the rest of the script sees the new prompt.
        st.session_state.on_click_button = False
        st.rerun(scope="app")


def dialogue_section():
    """Show the conversation and answer a pending instruction, if any.

    Stored messages are replayed first (error, then warnings, then content).
    An instruction typed into the chat box is copied to ``new_prompt``. When
    ``new_prompt`` is non-empty the history is reset, the instruction is sent
    to ``retry_generate_response`` and the outcome (response, warnings or an
    error text) is stored as the assistant message. The input is then
    re-enabled and the whole app is rerun.
    """
    state = st.session_state

    # Replay whatever has been stored so far.
    for entry in state.messages:
        with st.chat_message(entry["role"]):
            if entry.get("error"):
                st.error(entry["error"])
            for warning_text in entry.get("warnings", []):
                st.warning(warning_text)
            if entry.get("content"):
                st.write(entry["content"])

    def _lock_input():
        # Disable further input while the current instruction is handled.
        st.session_state.update(disprompt=True)

    typed = st.chat_input(
        placeholder="Type Your Instruction Here",
        disabled=state.disprompt,
        on_submit=_lock_input,
    )
    if typed:
        state.new_prompt = typed

    prompt = state.new_prompt
    if not prompt:
        return

    # Consume the pending prompt and start a fresh single-turn history.
    state.update(new_prompt="", messages=[])

    with st.chat_message("user"):
        st.write(prompt)
    state.messages.append({"role": "user", "content": prompt})

    with st.chat_message("assistant"), st.spinner("Thinking..."):
        error_msg = ""
        warnings = []
        response = ""
        try:
            response, warnings = retry_generate_response(prompt)
        except NoAudioException:
            error_msg = "Please specify audio first!"
        except TunnelNotRunningException:
            error_msg = "Internet connection cannot be established. Please contact the administrator."
        except Exception as e:
            error_msg = f"Caught Exception: {repr(e)}. Please contact the administrator."

    assistant_entry = {
        "role": "assistant",
        "error": error_msg,
        "warnings": warnings,
        "content": response,
    }
    state.messages.append(assistant_entry)

    # Re-enable input, then rerun so the stored messages are drawn by the
    # replay loop at the top.
    state.disprompt = False
    st.rerun(scope="app")


def audio_llm():
    """Build the full chatbot page.

    On the first run of a session the result of ``start_server()`` is stored
    under ``server`` and the pair returned by ``load_model()`` under
    ``client`` / ``model_name``. Every key of ``DEFAULT_DIALOGUE_STATES``
    that is missing from session state is then filled in. After that the
    sidebar, the page header, the audio selector and the dialogue section
    are rendered in that order.
    """
    if "server" not in st.session_state:
        st.session_state.server = start_server()

    if "client" not in st.session_state or 'model_name' not in st.session_state:
        st.session_state.client, st.session_state.model_name = load_model()

    for key, value in DEFAULT_DIALOGUE_STATES.items():
        if key not in st.session_state:
            # Deep-copy so the session never shares the mutable defaults
            # (lists, array) with the module-level template.
            st.session_state[key] = copy.deepcopy(value)

    with st.sidebar:
        sidebar_fragment()

    if st.sidebar.button('Clear History'):
        # Reset with fresh copies, matching the initialisation above;
        # otherwise the session would hold the template's own list/array
        # objects and in-place changes would leak into the defaults.
        st.session_state.update(copy.deepcopy(DEFAULT_DIALOGUE_STATES))

    st.markdown("<h1 style='text-align: center; color: black;'>MERaLiON-AudioLLM ChatBot 🤖</h1>", unsafe_allow_html=True)
    st.markdown(
        """This demo is based on [MERaLiON-AudioLLM](https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION), 
        developed by I2R, A*STAR, in collaboration with AISG, Singapore. 
        It is tailored for Singapore’s multilingual and multicultural landscape."""
    )

    specify_audio_fragment()
    dialogue_section()