YingxuHe committed
Commit 59b0ce3 · verified · 1 Parent(s): 56c10d7

Upload 3 files

Files changed (3)
  1. pages/audiollm.py +216 -0
  2. pages/nav.py +7 -0
  3. pages/utils.py +70 -0
pages/audiollm.py ADDED
@@ -0,0 +1,216 @@
+ import base64
+
+ import numpy as np
+ import streamlit as st
+ from streamlit_mic_recorder import mic_recorder
+
+ from pages.nav import Navbar
+ from pages.utils import load_model, generate_response, bytes_to_array
+
+ # set up the Streamlit page layout
+ st.set_page_config(page_title="AudioLLM", layout='wide', page_icon="💬")
+ Navbar()
+
+ with st.sidebar:
+     st.divider()
+     st.markdown("""<div class="sidebar-intro">
+         <p><strong>Purpose</strong>: Complex Audio Understanding</p>
+         <p><strong>Name</strong>: MERaLion-AudioLLM-Experimental-Stage-1</p>
+         <p><strong>Version</strong>: 0.0.1, Oct. 21, 2024</p>
+         </div>""", unsafe_allow_html=True)
+
+ if st.sidebar.button('Clear History'):
+     st.session_state.update(messages=[],
+                             on_upload=False,
+                             on_record=False,
+                             on_select=False,
+                             audio_array=np.array([]))
+
+ if "client" not in st.session_state or 'model_name' not in st.session_state:
+     st.session_state.client, st.session_state.model_name = load_model()
+
+ if "audio_array" not in st.session_state:
+     st.session_state.audio_base64 = ''
+     st.session_state.audio_array = np.array([])
+
+ if "default_instruction" not in st.session_state:
+     st.session_state.default_instruction = ""
+
+ # make sure the one-shot flags always exist before they are read below
+ for flag in ("on_record", "on_upload", "on_select"):
+     st.session_state.setdefault(flag, False)
+
+ # Three ways to provide audio: record from the mic, upload a file,
+ # or pick one of the bundled samples.
+ col1, col2, col3 = st.columns(3)
+
+ with col1:
+     st.markdown("**Record Audio:**")
+
+     recording = mic_recorder(format="wav",
+                              use_container_width=True,
+                              callback=lambda: st.session_state.update(on_record=True, messages=[]),
+                              key='record')
+     if recording and st.session_state.on_record:
+         audio_bytes = recording["bytes"]
+         st.session_state.audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+         st.session_state.audio_array = bytes_to_array(audio_bytes)
+
+ with col2:
+     uploaded_file = st.file_uploader(label="**Upload Audio:**",
+                                      type=['wav', 'mp3'],
+                                      on_change=lambda: st.session_state.update(on_upload=True, messages=[]),
+                                      key='upload')
+     if uploaded_file and st.session_state.on_upload:
+         audio_bytes = uploaded_file.read()
+         st.session_state.audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+         st.session_state.audio_array = bytes_to_array(audio_bytes)
+
+ with col3:
+     # Bundled samples, each paired with one or more example instructions.
+     audio_samples_w_instruct = {
+         '1_ASR_IMDA_PART1_ASR_v2_141': "Example Instruction:\n\n- Turn the spoken language into a text format.\n\n- Please translate the content into Chinese.",
+         '2_ASR_IMDA_PART1_ASR_v2_2258': "Example Instruction:\n\n- Turn the spoken language into a text format.\n\n- Please translate the content into Chinese.",
+         '3_ASR_IMDA_PART1_ASR_v2_2265': "Example Instruction:\n\n- Turn the spoken language into a text format.",
+
+         '4_ASR_IMDA_PART2_ASR_v2_999': "Example Instruction:\n\n- Translate the spoken words into text format.",
+         '5_ASR_IMDA_PART2_ASR_v2_2241': "Example Instruction:\n\n- Translate the spoken words into text format.",
+         '6_ASR_IMDA_PART2_ASR_v2_3409': "Example Instruction:\n\n- Translate the spoken words into text format.",
+
+         '7_ASR_IMDA_PART3_30_ASR_v2_2269': "Example Instruction:\n\n- Need this talk written down, please.",
+         '8_ASR_IMDA_PART3_30_ASR_v2_1698': "Example Instruction:\n\n- Need this talk written down, please.",
+         '9_ASR_IMDA_PART3_30_ASR_v2_2474': "Example Instruction:\n\n- Need this talk written down, please.",
+
+         '10_ASR_IMDA_PART4_30_ASR_v2_1527': "Example Instruction:\n\n- Write out the dialogue as text.",
+         '11_ASR_IMDA_PART4_30_ASR_v2_3771': "Example Instruction:\n\n- Write out the dialogue as text.",
+         '12_ASR_IMDA_PART4_30_ASR_v2_103': "Example Instruction:\n\n- Write out the dialogue as text.",
+
+         '13_ASR_IMDA_PART5_30_ASR_v2_1446': "Example Instruction:\n\n- Translate this vocal recording into a textual format.",
+         '14_ASR_IMDA_PART5_30_ASR_v2_2281': "Example Instruction:\n\n- Translate this vocal recording into a textual format.",
+         '15_ASR_IMDA_PART5_30_ASR_v2_4388': "Example Instruction:\n\n- Translate this vocal recording into a textual format.",
+
+         '16_ASR_IMDA_PART6_30_ASR_v2_576': "Example Instruction:\n\n- Record the spoken word in text form.",
+         '17_ASR_IMDA_PART6_30_ASR_v2_1413': "Example Instruction:\n\n- Record the spoken word in text form.",
+         '18_ASR_IMDA_PART6_30_ASR_v2_2834': "Example Instruction:\n\n- Record the spoken word in text form.",
+
+         '19_ASR_AIShell_zh_ASR_v2_5044': "Example Instruction:\n\n- Transform the oral presentation into a text document.",
+         '20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833': "Example Instruction:\n\n- Please provide a written transcription of the speech.",
+         '21_ASR_LIBRISPEECH_OTHER_ASR_V2_656': "Example Instruction:\n\n- Can you make this audio into text?",
+         '22_ASR_MEDIACORP_ASR_V2_35': "Example Instruction:\n\n- Transform the audio speech into a written transcript.",
+         '23_ASR_MEDIACORP_ASR_V2_6': "Example Instruction:\n\n- Transform the audio speech into a written transcript.",
+         '24_ASR_PEOPLES_SPEECH_ASR_V2_21376': "Example Instruction:\n\n- Need this audio turned into a written piece.",
+
+         '25_ST_COVOST2_ZH-CN_EN_ST_V2_4567': "Example Instruction:\n\n- Please translate the given speech to English.",
+         '26_ST_COVOST2_EN_ZH-CN_ST_V2_5422': "Example Instruction:\n\n- Please translate the given speech to Chinese.",
+         '27_ST_COVOST2_EN_ZH-CN_ST_V2_6697': "Example Instruction:\n\n- Please translate the given speech to Chinese.",
+
+         '28_SI_ALPACA-GPT4-AUDIO_SI_V2_299': "Example Instruction:\n\n- Please follow the instruction in the speech.",
+         '29_SI_ALPACA-GPT4-AUDIO_SI_V2_750': "Example Instruction:\n\n- Please follow the instruction in the speech.",
+         '30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454': "Example Instruction:\n\n- Please follow the instruction in the speech.",
+         '31_SI_OPENHERMES-AUDIO_SI_V2_673': "Example Instruction:\n\n- Please follow the instruction in the speech.",
+
+         '32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572': "Example Instruction:\n\n- What does the man think the woman should do at 4:00?",
+         '33_SQA_IMDA_PART3_30_SQA_V2_2310': "Example Instruction:\n\n- Does Speaker2's wife cook for Speaker2 when they are at home?",
+         '34_SQA_IMDA_PART3_30_SQA_V2_3621': "Example Instruction:\n\n- Does the phrase \"#gai-gai#\" have a meaning in Chinese or Hokkien language?",
+         '35_SQA_IMDA_PART3_30_SQA_V2_4062': "Example Instruction:\n\n- What is the color of the vase mentioned in the dialogue?",
+
+         '36_DS_IMDA_PART4_30_DS_V2_849': "Example Instruction:\n\n- Condense the dialogue into a concise summary highlighting major topics and conclusions.",
+
+         '39_Paralingual_IEMOCAP_ER_V2_91': "Example Instruction:\n\n- Based on the speaker's speech patterns, what do you think they are feeling?",
+         '40_Paralingual_IEMOCAP_ER_V2_567': "Example Instruction:\n\n- Based on the speaker's speech patterns, what do you think they are feeling?",
+         '41_Paralingual_IEMOCAP_ER_V2_468': "Example Instruction:\n\n- Based on the speaker's speech patterns, what do you think they are feeling?",
+         '42_Paralingual_IEMOCAP_GR_V2_320': "Example Instruction:\n\n- Is it possible for you to identify whether the speaker in this recording is male or female?",
+         '43_Paralingual_IEMOCAP_GR_V2_129': "Example Instruction:\n\n- Is it possible for you to identify whether the speaker in this recording is male or female?",
+         '44_Paralingual_IEMOCAP_GR_V2_213': "Example Instruction:\n\n- Is it possible for you to identify whether the speaker in this recording is male or female?",
+
+         '45_Paralingual_IMDA_PART3_30_GR_V2_12312': "Example Instruction:\n\n- So, who's speaking in the second part of the clip?\n\n- So, who's speaking in the first part of the clip?",
+         '46_Paralingual_IMDA_PART3_30_GR_V2_1442': "Example Instruction:\n\n- Who starts the conversation in the dialogue?",
+         '47_Paralingual_IMDA_PART3_30_NR_V2_10479': "Example Instruction:\n\n- Can you guess which ethnic group this person is from based on their accent?",
+         '48_Paralingual_IMDA_PART3_30_NR_V2_15735': "Example Instruction:\n\n- In an analysis of the audio recording, determine the ethnic backgrounds of the speakers based on the accents used.",
+
+         '49_Paralingual_MELD_ER_V2_676': "Example Instruction:\n\n- What emotions do you think the speaker is expressing?",
+         '50_Paralingual_MELD_ER_V2_692': "Example Instruction:\n\n- Based on the speaker's speech patterns, what do you think they are feeling?",
+
+         '51_Paralingual_VOXCELEB1_GR_V2_2148': "Example Instruction:\n\n- May I know the gender of the speaker?",
+         '52_Paralingual_VOXCELEB1_GR_V2_3282': "Example Instruction:\n\n- I'd appreciate knowing the gender of the speaker, if possible.",
+         '53_Paralingual_VOXCELEB1_NR_V2_2286': "Example Instruction:\n\n- What's the nationality identity of the speaker?",
+         '54_Paralingual_VOXCELEB1_NR_V2_2742': "Example Instruction:\n\n- I'm intrigued by the speaker's nationality, could you enlighten me?",
+
+         '55_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_2': "Example Instruction:\n\n- What impact would the growth of the healthcare sector have on the country's economy in terms of employment and growth?",
+         '56_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_415': "Example Instruction:\n\n- Based on the statement, can you summarize the speaker's position on the recent controversial issues in Singapore?",
+         '57_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_460': "Example Instruction:\n\n- How does the author respond to parents' worries about masks in schools?"
+     }
+
+     audio_sample_names = list(audio_samples_w_instruct.keys())
+
+     sample_name = st.selectbox(label="**Select Audio:**",
+                                options=audio_sample_names,
+                                index=None,
+                                placeholder="Select an audio sample:",
+                                on_change=lambda: st.session_state.update(on_select=True, messages=[]),
+                                key='select')
+
+     if sample_name and st.session_state.on_select:
+         with open(f"audio_samples/{sample_name}.wav", "rb") as f:
+             audio_bytes = f.read()
+         st.session_state.default_instruction = audio_samples_w_instruct[sample_name]
+         st.session_state.audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+         st.session_state.audio_array = bytes_to_array(audio_bytes)
+
+ st.write(st.session_state.default_instruction)
+ st.audio(st.session_state.audio_array, format="audio/wav", sample_rate=16000)
+ # Reset the one-shot flags so the same audio isn't reprocessed on rerun.
+ st.session_state.update(on_upload=False, on_record=False, on_select=False)
+
+ st.markdown(
+     """
+     <style>
+     .st-emotion-cache-1c7y2kd {
+         flex-direction: row-reverse;
+         text-align: right;
+     }
+     </style>
+     """,
+     unsafe_allow_html=True,
+ )
+
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+
+ if prompt := st.chat_input():
+     with st.chat_message("user"):
+         st.write(prompt)
+     st.session_state.messages.append({"role": "user", "content": prompt})
+
+     with st.chat_message("assistant"):
+         with st.spinner("Thinking..."):
+             stream = generate_response(prompt, st.session_state.audio_base64)
+             response = st.write_stream(stream)
+     st.session_state.messages.append({"role": "assistant", "content": response})
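For reference, st.write_stream above renders the OpenAI-style stream returned by generate_response chunk by chunk as it arrives. A minimal sketch of the manual equivalent, assuming a stream produced exactly as in pages/utils.py below:

    # Minimal sketch: consume the chat-completion stream by hand
    # instead of via st.write_stream.
    chunks = []
    for chunk in stream:
        delta = chunk.choices[0].delta.content  # may be None for some chunks
        if delta:
            chunks.append(delta)
    response = "".join(chunks)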
pages/nav.py ADDED
@@ -0,0 +1,7 @@
+ import streamlit as st
+
+
+ def Navbar():
+     with st.sidebar:
+         st.page_link('app.py', label='Home', icon='🔥')
+         st.page_link('pages/audiollm.py', label='MERaLion-AudioLLM', icon='🛡️')
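Navbar links to an app.py entry page that is not part of this commit. A minimal sketch of what such an entry page might look like (hypothetical; the actual app.py may differ):

    import streamlit as st

    from pages.nav import Navbar

    # Hypothetical home page for the multipage app; the real app.py
    # is not included in this commit.
    st.set_page_config(page_title="Home", layout='wide', page_icon='🔥')
    Navbar()

    st.title("MERaLion-AudioLLM Demo")
    st.write("Pick a page from the sidebar to get started.")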
pages/utils.py ADDED
@@ -0,0 +1,70 @@
+ import io
+
+ import librosa
+ import paramiko
+ import streamlit as st
+ from openai import OpenAI
+ from sshtunnel import SSHTunnelForwarder
+
+
+ @st.cache_resource()
+ def load_model():
+     # vLLM exposes an OpenAI-compatible API; the key is unused but must be set
+     openai_api_key = "EMPTY"
+     openai_api_base = "http://localhost:8000/v1"
+
+     client = OpenAI(
+         api_key=openai_api_key,
+         base_url=openai_api_base,
+     )
+
+     # the server hosts a single model; take its id from the model list
+     models = client.models.list()
+     model_name = models.data[0].id
+
+     return client, model_name
+
+
+ def generate_response(text_input, audio_input):
+     # audio_input is base64-encoded audio, passed inline as a data URL
+     stream = st.session_state.client.chat.completions.create(
+         messages=[{
+             "role": "user",
+             "content": [
+                 {
+                     "type": "text",
+                     "text": f"Text instruction: {text_input}"
+                 },
+                 {
+                     "type": "audio_url",
+                     "audio_url": {
+                         "url": f"data:audio/ogg;base64,{audio_input}"
+                     },
+                 },
+             ],
+         }],
+         model=st.session_state.model_name,
+         max_completion_tokens=512,
+         stream=True,
+     )
+
+     return stream
+
+
+ def bytes_to_array(audio_bytes):
+     # decode the raw bytes and resample to the 16 kHz the model expects
+     audio_array, _ = librosa.load(
+         io.BytesIO(audio_bytes),
+         sr=16000
+     )
+     return audio_array
+
+
+ def start_server(ssh_key, dns_name):
+     # forward a local port to the vLLM server running on the remote host
+     pkey = paramiko.RSAKey.from_private_key(io.StringIO(ssh_key))
+
+     server = SSHTunnelForwarder(
+         ssh_address_or_host=dns_name,
+         ssh_username="ec2-user",
+         ssh_pkey=pkey,
+         local_bind_address=("127.0.0.1", 8080),
+         remote_bind_address=("127.0.0.1", 8000)
+     )
+
+     server.start()
+     return server  # return the forwarder so callers can stop the tunnel
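For completeness, a sketch of how start_server and load_model might be wired together when the vLLM server runs on a remote EC2 host; the key path and DNS name below are placeholders. Note that start_server binds the tunnel to local port 8080 while load_model targets http://localhost:8000/v1, so one of the two ports has to be adjusted to match your setup:

    from pages.utils import start_server, load_model

    # Placeholders: substitute your own key file and instance DNS name.
    with open("my-key.pem") as f:
        ssh_key = f.read()

    tunnel = start_server(ssh_key, "ec2-0-0-0-0.compute-1.amazonaws.com")

    # Works once an OpenAI-compatible endpoint is reachable on the
    # port that load_model points at.
    client, model_name = load_model()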