Create pages.py
pages.py
ADDED
@@ -0,0 +1,245 @@
import base64

import numpy as np
import streamlit as st
import streamlit.components.v1 as components
from streamlit_mic_recorder import mic_recorder

from utils import load_model, generate_response, bytes_to_array

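# The helpers imported from utils.py are not part of this commit. For
# reference, a minimal sketch of the behavior this page assumes from two of
# them; the server URL, model id, and librosa-based decoding below are
# hypothetical, not the actual utils.py implementation.

def _load_model_sketch():
    # Assumed shape: returns an OpenAI-compatible client plus the served
    # model's name, which audio_llm() stores in session state.
    from openai import OpenAI
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    return client, "MERaLiON-AudioLLM"


def _bytes_to_array_sketch(audio_bytes: bytes) -> np.ndarray:
    # Assumed shape: decodes wav/mp3 bytes into a mono float array resampled
    # to 16 kHz, matching the sample_rate passed to st.audio() below.
    import io
    import librosa
    audio, _ = librosa.load(io.BytesIO(audio_bytes), sr=16000, mono=True)
    return audio
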
def home_page():
    ## Set up home page Title
    col1, col2 = st.columns([1, 4])
    custom_html = """
        <div class="banner">
            <img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRhB2e_AhOe11wKxnnwOmOVg9E7J1MBgiTeYzzFAESwcCP5IbBAc2X8BwGChMfJzwqtVg&usqp=CAU" alt="Banner Image">
        </div>
        <style>
            .banner {
                width: 100%;
                height: 200px;
                overflow: visible;
            }
            .banner img {
                width: 100%;
                object-fit: cover;
            }
        </style>
        """
    with col1:
        components.html(custom_html)
    with col2:
        st.write("# Welcome to Merlion AI - AudioLLMs 🤖")

    ## Set up home page other information
    st.markdown('')

def audio_llm():
    with st.sidebar:
        st.divider()
        st.markdown("""<div class="sidebar-intro">
            <p><strong>Purpose</strong>: Complex Audio Understanding</p>
            <p><strong>Name</strong>: MERaLion-AudioLLM-Experimental-Stage-1</p>
            <p><strong>Version</strong>: 0.0.1, Oct. 21, 2024</p>
            </div>""", unsafe_allow_html=True)

    if st.sidebar.button('Clear History'):
        st.session_state.update(messages=[],
                                on_upload=False,
                                on_record=False,
                                on_select=False,
                                audio_array=np.array([]))

    if "client" not in st.session_state or "model_name" not in st.session_state:
        st.session_state.client, st.session_state.model_name = load_model()

    if "audio_array" not in st.session_state:
        st.session_state.audio_base64 = ''
        st.session_state.audio_array = np.array([])

    if "default_instruction" not in st.session_state:
        st.session_state.default_instruction = ""

    col1, col2, col3 = st.columns(3)

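    # The three columns below are alternative audio sources (microphone,
    # file upload, bundled samples). Each widget's callback sets its own
    # on_* flag and clears the chat history, so the newest input is decoded
    # exactly once; all three flags are reset together after the preview
    # further down.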
    with col1:
        st.markdown("**Record Audio:**")

        recording = mic_recorder(
            format="wav",
            use_container_width=True,
            callback=lambda: st.session_state.update(on_record=True, messages=[]),
            key='record')

        if recording and st.session_state.on_record:
            audio_bytes = recording["bytes"]
            st.session_state.audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
            st.session_state.audio_array = bytes_to_array(audio_bytes)

    with col2:
        uploaded_file = st.file_uploader(
            label="**Upload Audio:**",
            type=['wav', 'mp3'],
            on_change=lambda: st.session_state.update(on_upload=True, messages=[]),
            key='upload'
        )

        if uploaded_file and st.session_state.on_upload:
            audio_bytes = uploaded_file.read()
            st.session_state.audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
            st.session_state.audio_array = bytes_to_array(audio_bytes)

    with col3:
        audio_samples_w_instruct = {
            '1_ASR_IMDA_PART1_ASR_v2_141' : "Example Instruction:\n\n- Turn the spoken language into a text format.\n\n- Please translate the content into Chinese.",
            '2_ASR_IMDA_PART1_ASR_v2_2258': "Example Instruction:\n\n- Turn the spoken language into a text format.\n\n- Please translate the content into Chinese.",
            '3_ASR_IMDA_PART1_ASR_v2_2265': "Example Instruction:\n\n- Turn the spoken language into a text format.",

            '4_ASR_IMDA_PART2_ASR_v2_999' : "Example Instruction:\n\n- Translate the spoken words into text format.",
            '5_ASR_IMDA_PART2_ASR_v2_2241': "Example Instruction:\n\n- Translate the spoken words into text format.",
            '6_ASR_IMDA_PART2_ASR_v2_3409': "Example Instruction:\n\n- Translate the spoken words into text format.",

            '7_ASR_IMDA_PART3_30_ASR_v2_2269': "Example Instruction:\n\n- Need this talk written down, please.",
            '8_ASR_IMDA_PART3_30_ASR_v2_1698': "Example Instruction:\n\n- Need this talk written down, please.",
            '9_ASR_IMDA_PART3_30_ASR_v2_2474': "Example Instruction:\n\n- Need this talk written down, please.",

            '10_ASR_IMDA_PART4_30_ASR_v2_1527': "Example Instruction:\n\n- Write out the dialogue as text.",
            '11_ASR_IMDA_PART4_30_ASR_v2_3771': "Example Instruction:\n\n- Write out the dialogue as text.",
            '12_ASR_IMDA_PART4_30_ASR_v2_103' : "Example Instruction:\n\n- Write out the dialogue as text.",

            '13_ASR_IMDA_PART5_30_ASR_v2_1446': "Example Instruction:\n\n- Translate this vocal recording into a textual format.",
            '14_ASR_IMDA_PART5_30_ASR_v2_2281': "Example Instruction:\n\n- Translate this vocal recording into a textual format.",
            '15_ASR_IMDA_PART5_30_ASR_v2_4388': "Example Instruction:\n\n- Translate this vocal recording into a textual format.",

            '16_ASR_IMDA_PART6_30_ASR_v2_576' : "Example Instruction:\n\n- Record the spoken word in text form.",
            '17_ASR_IMDA_PART6_30_ASR_v2_1413': "Example Instruction:\n\n- Record the spoken word in text form.",
            '18_ASR_IMDA_PART6_30_ASR_v2_2834': "Example Instruction:\n\n- Record the spoken word in text form.",

            '19_ASR_AIShell_zh_ASR_v2_5044': "Example Instruction:\n\n- Transform the oral presentation into a text document.",

            '20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833': "Example Instruction:\n\n- Please provide a written transcription of the speech.",

            '21_ASR_LIBRISPEECH_OTHER_ASR_V2_656': "Example Instruction:\n\n- Can you make this audio into text?",

            '22_ASR_MEDIACORP_ASR_V2_35': "Example Instruction:\n\n- Transform the audio speech into a written transcript.",
            '23_ASR_MEDIACORP_ASR_V2_6' : "Example Instruction:\n\n- Transform the audio speech into a written transcript.",

            '24_ASR_PEOPLES_SPEECH_ASR_V2_21376': "Example Instruction:\n\n- Need this audio turned into a written piece.",

            '25_ST_COVOST2_ZH-CN_EN_ST_V2_4567': "Example Instruction:\n\n- Please translate the given speech to English.",

            '26_ST_COVOST2_EN_ZH-CN_ST_V2_5422': "Example Instruction:\n\n- Please translate the given speech to Chinese.",
            '27_ST_COVOST2_EN_ZH-CN_ST_V2_6697': "Example Instruction:\n\n- Please translate the given speech to Chinese.",

            '28_SI_ALPACA-GPT4-AUDIO_SI_V2_299' : "Example Instruction:\n\n- Please follow the instruction in the speech.",
            '29_SI_ALPACA-GPT4-AUDIO_SI_V2_750' : "Example Instruction:\n\n- Please follow the instruction in the speech.",
            '30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454': "Example Instruction:\n\n- Please follow the instruction in the speech.",
            '31_SI_OPENHERMES-AUDIO_SI_V2_673'  : "Example Instruction:\n\n- Please follow the instruction in the speech.",

            '32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572': "Example Instruction:\n\n- What does the man think the woman should do at 4:00?",

            '33_SQA_IMDA_PART3_30_SQA_V2_2310': "Example Instruction:\n\n- Does Speaker2's wife cook for Speaker2 when they are at home?",
            '34_SQA_IMDA_PART3_30_SQA_V2_3621': "Example Instruction:\n\n- Does the phrase \"#gai-gai#\" have a meaning in Chinese or Hokkien language?",
            '35_SQA_IMDA_PART3_30_SQA_V2_4062': "Example Instruction:\n\n- What is the color of the vase mentioned in the dialogue?",

            '36_DS_IMDA_PART4_30_DS_V2_849': "Example Instruction:\n\n- Condense the dialogue into a concise summary highlighting major topics and conclusions.",

            '39_Paralingual_IEMOCAP_ER_V2_91' : "Example Instruction:\n\n- Based on the speaker's speech patterns, what do you think they are feeling?",
            '40_Paralingual_IEMOCAP_ER_V2_567': "Example Instruction:\n\n- Based on the speaker's speech patterns, what do you think they are feeling?",
            '41_Paralingual_IEMOCAP_ER_V2_468': "Example Instruction:\n\n- Based on the speaker's speech patterns, what do you think they are feeling?",

            '42_Paralingual_IEMOCAP_GR_V2_320': "Example Instruction:\n\n- Is it possible for you to identify whether the speaker in this recording is male or female?",
            '43_Paralingual_IEMOCAP_GR_V2_129': "Example Instruction:\n\n- Is it possible for you to identify whether the speaker in this recording is male or female?",
            '44_Paralingual_IEMOCAP_GR_V2_213': "Example Instruction:\n\n- Is it possible for you to identify whether the speaker in this recording is male or female?",

            '45_Paralingual_IMDA_PART3_30_GR_V2_12312': "Example Instruction:\n\n- So, who's speaking in the second part of the clip?\n\n- So, who's speaking in the first part of the clip?",
            '46_Paralingual_IMDA_PART3_30_GR_V2_1442' : "Example Instruction:\n\n- Who starts the conversation in the dialogue?",

            '47_Paralingual_IMDA_PART3_30_NR_V2_10479': "Example Instruction:\n\n- Can you guess which ethnic group this person is from based on their accent?",
            '48_Paralingual_IMDA_PART3_30_NR_V2_15735': "Example Instruction:\n\n- In an analysis of the audio recording, determine the ethnic backgrounds of the speakers based on the accents used.",

            '49_Paralingual_MELD_ER_V2_676': "Example Instruction:\n\n- What emotions do you think the speaker is expressing?",
            '50_Paralingual_MELD_ER_V2_692': "Example Instruction:\n\n- Based on the speaker's speech patterns, what do you think they are feeling?",

            '51_Paralingual_VOXCELEB1_GR_V2_2148': "Example Instruction:\n\n- May I know the gender of the speaker?",
            '52_Paralingual_VOXCELEB1_GR_V2_3282': "Example Instruction:\n\n- I'd appreciate knowing the gender of the speaker, if possible.",

            '53_Paralingual_VOXCELEB1_NR_V2_2286': "Example Instruction:\n\n- What's the nationality identity of the speaker?",
            '54_Paralingual_VOXCELEB1_NR_V2_2742': "Example Instruction:\n\n- I'm intrigued by the speaker's nationality, could you enlighten me?",

            '55_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_2'  : "Example Instruction:\n\n- What impact would the growth of the healthcare sector have on the country's economy in terms of employment and growth?",
            '56_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_415': "Example Instruction:\n\n- Based on the statement, can you summarize the speaker's position on the recent controversial issues in Singapore?",
            '57_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_460': "Example Instruction:\n\n- How does the author respond to parents' worries about masks in schools?",
        }

        audio_sample_names = list(audio_samples_w_instruct.keys())

        sample_name = st.selectbox(
            label="**Select Audio:**",
            options=audio_sample_names,
            index=None,
            placeholder="Select an audio sample:",
            on_change=lambda: st.session_state.update(on_select=True, messages=[]),
            key='select')

        if sample_name and st.session_state.on_select:
            with open(f"audio_samples/{sample_name}.wav", "rb") as audio_file:
                audio_bytes = audio_file.read()
            st.session_state.default_instruction = audio_samples_w_instruct[sample_name]
            st.session_state.audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
            st.session_state.audio_array = bytes_to_array(audio_bytes)

    st.write(st.session_state.default_instruction)
    st.audio(st.session_state.audio_array, format="audio/wav", sample_rate=16000)
    st.session_state.update(on_upload=False, on_record=False, on_select=False)

    # Right-align user chat bubbles. Note: this targets a generated emotion
    # class name, which is specific to the installed Streamlit version.
    st.markdown(
        """
        <style>
        .st-emotion-cache-1c7y2kd {
            flex-direction: row-reverse;
            text-align: right;
        }
        </style>
        """,
        unsafe_allow_html=True,
    )

    if "messages" not in st.session_state:
        st.session_state.messages = []

    if prompt := st.chat_input():
        with st.chat_message("user"):
            st.write(prompt)
        st.session_state.messages.append({"role": "user", "content": prompt})

        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                stream = generate_response(prompt, st.session_state.audio_base64)
                response = st.write_stream(stream)
        st.session_state.messages.append({"role": "assistant", "content": response})
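
# utils.generate_response is likewise not part of this commit. A minimal
# sketch of what the call above assumes: a streaming chat completion against
# the OpenAI-compatible client from load_model(), with the base64 wav passed
# as an audio_url data URI (a common convention for audio models served via
# vLLM; the exact message format here is an assumption, not the actual
# utils.py code).

def _generate_response_sketch(text, audio_base64):
    # Returns the completion stream; recent Streamlit versions let
    # st.write_stream consume an OpenAI stream directly, as done above.
    return st.session_state.client.chat.completions.create(
        model=st.session_state.model_name,
        messages=[{
            "role": "user",
            "content": [
                {"type": "audio_url",
                 "audio_url": {"url": f"data:audio/wav;base64,{audio_base64}"}},
                {"type": "text", "text": text},
            ],
        }],
        stream=True,
    )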