YingxuHe committed
Commit e531eef · verified · 1 Parent(s): 35c9b59

Create pages.py

Files changed (1):
  1. pages.py +245 -0
pages.py ADDED
@@ -0,0 +1,245 @@
+ import base64
+
+ import numpy as np
+ import streamlit as st
+ import streamlit.components.v1 as components  # needed for components.html() in home_page()
+ from streamlit_mic_recorder import mic_recorder
+
+ from utils import load_model, generate_response, bytes_to_array
+
+
+ def home_page():
+     ## Set up home page Title
+     col1, col2 = st.columns([1, 4])
+     custom_html = """
+     <div class="banner">
+         <img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRhB2e_AhOe11wKxnnwOmOVg9E7J1MBgiTeYzzFAESwcCP5IbBAc2X8BwGChMfJzwqtVg&usqp=CAU" alt="Banner Image">
+     </div>
+     <style>
+         .banner {
+             width: 100%;
+             height: 200px;
+             overflow: visible;
+         }
+         .banner img {
+             width: 100%;
+             object-fit: cover;
+         }
+     </style>
+     """
+     with col1:
+         components.html(custom_html)
+     with col2:
+         st.write("# Welcome to Merlion AI - AudioLLMs 🤖")
+
+     ## Set up home page other information
+     st.markdown('')
+
+
+ def audio_llm():
+     with st.sidebar:
+         st.divider()
+         st.markdown("""<div class="sidebar-intro">
+             <p><strong>Purpose</strong>: Complex Audio Understanding</p>
+             <p><strong>Name</strong>: MERaLion-AudioLLM-Experimental-Stage-1</p>
+             <p><strong>Version</strong>: 0.0.1, Oct. 21, 2024</p>
+             </div>""", unsafe_allow_html=True)
+
+     if st.sidebar.button('Clear History'):
+         st.session_state.update(messages=[],
+                                 on_upload=False,
+                                 on_record=False,
+                                 on_select=False,
+                                 audio_array=np.array([]))
+
+     # Lazily initialise the model client and the per-session audio state.
+     if "client" not in st.session_state or 'model_name' not in st.session_state:
+         st.session_state.client, st.session_state.model_name = load_model()
+
+     if "audio_array" not in st.session_state:
+         st.session_state.audio_base64 = ''
+         st.session_state.audio_array = np.array([])
+
+     if "default_instruction" not in st.session_state:
+         st.session_state.default_instruction = ""
+
+     col1, col2, col3 = st.columns(3)
+
+     with col1:
+         st.markdown("**Record Audio:**")
+
+         recording = mic_recorder(
+             format="wav",
+             use_container_width=True,
+             callback=lambda: st.session_state.update(on_record=True, messages=[]),
+             key='record')
+
+         if recording and st.session_state.on_record:
+             audio_bytes = recording["bytes"]
+             st.session_state.audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+             st.session_state.audio_array = bytes_to_array(audio_bytes)
+
+     with col2:
+         uploaded_file = st.file_uploader(
+             label="**Upload Audio:**",
+             type=['wav', 'mp3'],
+             on_change=lambda: st.session_state.update(on_upload=True, messages=[]),
+             key='upload'
+         )
+
+         if uploaded_file and st.session_state.on_upload:
+             audio_bytes = uploaded_file.read()
+             st.session_state.audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+             st.session_state.audio_array = bytes_to_array(audio_bytes)
+
+     with col3:
+         audio_samples_w_instruct = {
+             '1_ASR_IMDA_PART1_ASR_v2_141' : "Example Instruction:\n\n- Turn the spoken language into a text format.\n\n- Please translate the content into Chinese.",
+             '2_ASR_IMDA_PART1_ASR_v2_2258': "Example Instruction:\n\n- Turn the spoken language into a text format.\n\n- Please translate the content into Chinese.",
+             '3_ASR_IMDA_PART1_ASR_v2_2265': "Example Instruction:\n\n- Turn the spoken language into a text format.",
+
+             '4_ASR_IMDA_PART2_ASR_v2_999' : "Example Instruction:\n\n- Translate the spoken words into text format.",
+             '5_ASR_IMDA_PART2_ASR_v2_2241': "Example Instruction: \n\n- Translate the spoken words into text format.",
+             '6_ASR_IMDA_PART2_ASR_v2_3409': "Example Instruction: \n\n- Translate the spoken words into text format.",
+
+             '7_ASR_IMDA_PART3_30_ASR_v2_2269': "Example Instruction:\n\n- Need this talk written down, please.",
+             '8_ASR_IMDA_PART3_30_ASR_v2_1698': "Example Instruction: \n\n- Need this talk written down, please.",
+             '9_ASR_IMDA_PART3_30_ASR_v2_2474': "Example Instruction: \n\n- Need this talk written down, please.",
+
+             '10_ASR_IMDA_PART4_30_ASR_v2_1527': "Example Instruction:\n\n- Write out the dialogue as text.",
+             '11_ASR_IMDA_PART4_30_ASR_v2_3771': "Example Instruction: \n\n- Write out the dialogue as text.",
+             '12_ASR_IMDA_PART4_30_ASR_v2_103' : "Example Instruction: \n\n- Write out the dialogue as text.",
+
+             '13_ASR_IMDA_PART5_30_ASR_v2_1446': "Example Instruction: \n\n- Translate this vocal recording into a textual format.",
+             '14_ASR_IMDA_PART5_30_ASR_v2_2281': "Example Instruction: \n\n- Translate this vocal recording into a textual format.",
+             '15_ASR_IMDA_PART5_30_ASR_v2_4388': "Example Instruction: \n\n- Translate this vocal recording into a textual format.",
+
+             '16_ASR_IMDA_PART6_30_ASR_v2_576': "Example Instruction: \n\n- Record the spoken word in text form.",
+             '17_ASR_IMDA_PART6_30_ASR_v2_1413': "Example Instruction: \n\n- Record the spoken word in text form.",
+             '18_ASR_IMDA_PART6_30_ASR_v2_2834': "Example Instruction: \n\n- Record the spoken word in text form.",
+
+             '19_ASR_AIShell_zh_ASR_v2_5044': "Example Instruction: \n\n- Transform the oral presentation into a text document.",
+
+             '20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833': "Example Instruction: \n\n- Please provide a written transcription of the speech.",
+
+             '21_ASR_LIBRISPEECH_OTHER_ASR_V2_656': "Example Instruction: \n\n- Can you make this audio into text?",
+
+             '22_ASR_MEDIACORP_ASR_V2_35': "Example Instruction: \n\n- Transform the audio speech into a written transcript.",
+
+             '23_ASR_MEDIACORP_ASR_V2_6': "Example Instruction: \n\n- Transform the audio speech into a written transcript.",
+
+             '24_ASR_PEOPLES_SPEECH_ASR_V2_21376': "Example Instruction: \n\n- Need this audio turned into a written piece.",
+
+             '25_ST_COVOST2_ZH-CN_EN_ST_V2_4567': "Example Instruction: \n\n- Please translate the given speech to English.",
+
+             '26_ST_COVOST2_EN_ZH-CN_ST_V2_5422': "Example Instruction: \n\n- Please translate the given speech to Chinese.",
+
+             '27_ST_COVOST2_EN_ZH-CN_ST_V2_6697': "Example Instruction: \n\n- Please translate the given speech to Chinese.",
+
+             '28_SI_ALPACA-GPT4-AUDIO_SI_V2_299': "Example Instruction: \n\n- Please follow the instruction in the speech.",
+
+             '29_SI_ALPACA-GPT4-AUDIO_SI_V2_750': "Example Instruction: \n\n- Please follow the instruction in the speech.",
+
+             '30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454': "Example Instruction: \n\n- Please follow the instruction in the speech.",
+
+             '31_SI_OPENHERMES-AUDIO_SI_V2_673': "Example Instruction: \n\n- Please follow the instruction in the speech.",
+
+             '32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572': "Example Instruction: \n\n- What does the man think the woman should do at 4:00?",
+
+             '33_SQA_IMDA_PART3_30_SQA_V2_2310': "Example Instruction: \n\n- Does Speaker2's wife cook for Speaker2 when they are at home?",
+
+             '34_SQA_IMDA_PART3_30_SQA_V2_3621': "Example Instruction: \n\n- Does the phrase \"#gai-gai#\" have a meaning in Chinese or Hokkien language?",
+
+             '35_SQA_IMDA_PART3_30_SQA_V2_4062': "Example Instruction: \n\n- What is the color of the vase mentioned in the dialogue?",
+
+             '36_DS_IMDA_PART4_30_DS_V2_849': "Example Instruction: \n\n- Condense the dialogue into a concise summary highlighting major topics and conclusions.",
+
+             '39_Paralingual_IEMOCAP_ER_V2_91': "Example Instruction: \n\n- Based on the speaker's speech patterns, what do you think they are feeling?",
+
+             '40_Paralingual_IEMOCAP_ER_V2_567': "Example Instruction: \n\n- Based on the speaker's speech patterns, what do you think they are feeling?",
+
+             '41_Paralingual_IEMOCAP_ER_V2_468': "Example Instruction: \n\n- Based on the speaker's speech patterns, what do you think they are feeling?",
+
+             '42_Paralingual_IEMOCAP_GR_V2_320': "Example Instruction: \n\n- Is it possible for you to identify whether the speaker in this recording is male or female?",
+
+             '43_Paralingual_IEMOCAP_GR_V2_129': "Example Instruction: \n\n- Is it possible for you to identify whether the speaker in this recording is male or female?",
+
+             '44_Paralingual_IEMOCAP_GR_V2_213': "Example Instruction: \n\n- Is it possible for you to identify whether the speaker in this recording is male or female?",
+
+             '45_Paralingual_IMDA_PART3_30_GR_V2_12312': "Example Instruction: \n\n- So, who's speaking in the second part of the clip? \n\n- So, who's speaking in the first part of the clip?",
+
+             '46_Paralingual_IMDA_PART3_30_GR_V2_1442': "Example Instruction: \n\n- Who starts the conversation in the dialogue?",
+
+             '47_Paralingual_IMDA_PART3_30_NR_V2_10479': "Example Instruction: \n\n- Can you guess which ethnic group this person is from based on their accent?",
+
+             '48_Paralingual_IMDA_PART3_30_NR_V2_15735': "Example Instruction: \n\n- In an analysis of the audio recording, determine the ethnic backgrounds of the speakers based on the accents used.",
+
+             '49_Paralingual_MELD_ER_V2_676': "Example Instruction: \n\n- What emotions do you think the speaker is expressing?",
+
+             '50_Paralingual_MELD_ER_V2_692': "Example Instruction: \n\n- Based on the speaker's speech patterns, what do you think they are feeling?",
+
+             '51_Paralingual_VOXCELEB1_GR_V2_2148': "Example Instruction: \n\n- May I know the gender of the speaker?",
+
+             '52_Paralingual_VOXCELEB1_GR_V2_3282': "Example Instruction: \n\n- I'd appreciate knowing the gender of the speaker, if possible.",
+
+             '53_Paralingual_VOXCELEB1_NR_V2_2286': "Example Instruction: \n\n- What's the nationality identity of the speaker?",
+
+             '54_Paralingual_VOXCELEB1_NR_V2_2742': "Example Instruction: \n\n- I'm intrigued by the speaker's nationality, could you enlighten me?",
+
+             '55_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_2': "Example Instruction: \n\n- What impact would the growth of the healthcare sector have on the country's economy in terms of employment and growth?",
+
+             '56_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_415': "Example Instruction: \n\n- Based on the statement, can you summarize the speaker's position on the recent controversial issues in Singapore?",
+
+             '57_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_460': "Example Instruction: \n\n- How does the author respond to parents' worries about masks in schools?"
+         }
+
+         audio_sample_names = [audio_sample_name for audio_sample_name in audio_samples_w_instruct.keys()]
+
+         sample_name = st.selectbox(
+             label="**Select Audio:**",
+             options=audio_sample_names,
+             index=None,
+             placeholder="Select an audio sample:",
+             on_change=lambda: st.session_state.update(on_select=True, messages=[]),
+             key='select')
+
+         if sample_name and st.session_state.on_select:
+             audio_bytes = open(f"audio_samples/{sample_name}.wav", "rb").read()
+             st.session_state.default_instruction = audio_samples_w_instruct[sample_name]
+             st.session_state.audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+             st.session_state.audio_array = bytes_to_array(audio_bytes)
+
+     st.write(st.session_state.default_instruction)
+     st.audio(st.session_state.audio_array, format="audio/wav", sample_rate=16000)
+     st.session_state.update(on_upload=False, on_record=False, on_select=False)
+
+     # Right-align the user's chat bubbles (targets a Streamlit-generated CSS class,
+     # which may change between Streamlit versions).
+     st.markdown(
+         """
+         <style>
+         .st-emotion-cache-1c7y2kd {
+             flex-direction: row-reverse;
+             text-align: right;
+         }
+         </style>
+         """,
+         unsafe_allow_html=True,
+     )
+
+     if "messages" not in st.session_state:
+         st.session_state.messages = []
+
+     if prompt := st.chat_input():
+         with st.chat_message("user"):
+             st.write(prompt)
+         st.session_state.messages.append({"role": "user", "content": prompt})
+
+         with st.chat_message("assistant"):
+             with st.spinner("Thinking..."):
+                 stream = generate_response(prompt, st.session_state.audio_base64)
+                 response = st.write_stream(stream)
+         st.session_state.messages.append({"role": "assistant", "content": response})
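
Note on dependencies: pages.py imports load_model, generate_response, and bytes_to_array from a utils module that is not part of this commit. The sketch below is a minimal, hypothetical version of that interface, written only to make the data flow above easier to follow. The OpenAI-compatible client, the endpoint URL, the model name, the librosa-based decoding, and the audio_url message format are all assumptions for illustration, not the repository's actual implementation.

    # utils.py -- hypothetical sketch of the interface that pages.py imports.
    # Everything below (client type, endpoint, model name, message format) is an
    # illustrative assumption, not the repository's actual implementation.
    import io

    import librosa
    import numpy as np
    import streamlit as st
    from openai import OpenAI


    def load_model():
        # Assumed: an OpenAI-compatible client pointed at the model's serving endpoint.
        client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
        model_name = "MERaLion-AudioLLM-Experimental-Stage-1"
        return client, model_name


    def bytes_to_array(audio_bytes: bytes) -> np.ndarray:
        # Decode WAV/MP3 bytes into a 16 kHz mono float array for st.audio.
        audio_array, _ = librosa.load(io.BytesIO(audio_bytes), sr=16000)
        return audio_array


    def generate_response(prompt: str, audio_base64: str):
        # Assumed: stream a chat completion that pairs the text prompt with the
        # base64-encoded audio, yielding text chunks for st.write_stream.
        stream = st.session_state.client.chat.completions.create(
            model=st.session_state.model_name,
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "audio_url",
                     "audio_url": {"url": f"data:audio/wav;base64,{audio_base64}"}},
                ],
            }],
            stream=True,
        )
        for chunk in stream:
            delta = chunk.choices[0].delta.content
            if delta:
                yield delta

Under these assumptions, st.write_stream(generate_response(...)) in audio_llm() renders the streamed text chunks incrementally, and bytes_to_array feeds st.audio(..., sample_rate=16000) with a 16 kHz mono waveform regardless of whether the audio came from the recorder, an upload, or a bundled sample.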