# MERaLiON-AudioLLM / utils.py
import io
import os
import re

import librosa
import paramiko
import streamlit as st
from openai import OpenAI
from sshtunnel import SSHTunnelForwarder

# Local port used as the near end of the SSH tunnel; LOCAL_PORT must be set
# in the environment.
local_port = int(os.getenv('LOCAL_PORT'))

# General-purpose prompts that apply to any audio sample.
GENERAL_INSTRUCTIONS = [
    "Please transcribe this speech.",
    "Please summarise this speech."
]

# Demo audio samples, keyed by sample ID, each mapped to the instruction
# prompts suggested for that clip.
AUDIO_SAMPLES_W_INSTRUCT = {
    '1_ASR_IMDA_PART1_ASR_v2_141': ["Turn the spoken language into a text format.", "Please translate the content into Chinese."],
    '7_ASR_IMDA_PART3_30_ASR_v2_2269': ["Need this talk written down, please."],
    '17_ASR_IMDA_PART6_30_ASR_v2_1413': ["Record the spoken word in text form."],
    '25_ST_COVOST2_ZH-CN_EN_ST_V2_4567': ["Please translate the given speech to English."],
    '26_ST_COVOST2_EN_ZH-CN_ST_V2_5422': ["Please translate the given speech to Chinese."],
    '30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454': ["Please follow the instruction in the speech."],
    '32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572': ["What does the man think the woman should do at 4:00."],
    '33_SQA_IMDA_PART3_30_SQA_V2_2310': ["Does Speaker2's wife cook for Speaker2 when they are at home."],
    '34_SQA_IMDA_PART3_30_SQA_V2_3621': ["Does the phrase \"#gai-gai#\" have a meaning in Chinese or Hokkien language."],
    '35_SQA_IMDA_PART3_30_SQA_V2_4062': ["What is the color of the vase mentioned in the dialogue."],
    '36_DS_IMDA_PART4_30_DS_V2_849': ["Condense the dialogue into a concise summary highlighting major topics and conclusions."],
    '39_Paralingual_IEMOCAP_ER_V2_91': ["Based on the speaker's speech patterns, what do you think they are feeling."],
    '40_Paralingual_IEMOCAP_ER_V2_567': ["Based on the speaker's speech patterns, what do you think they are feeling."],
    '42_Paralingual_IEMOCAP_GR_V2_320': ["Is it possible for you to identify whether the speaker in this recording is male or female."],
    '43_Paralingual_IEMOCAP_GR_V2_129': ["Is it possible for you to identify whether the speaker in this recording is male or female."],
    '45_Paralingual_IMDA_PART3_30_GR_V2_12312': ["So, who's speaking in the second part of the clip?", "So, who's speaking in the first part of the clip?"],
    '47_Paralingual_IMDA_PART3_30_NR_V2_10479': ["Can you guess which ethnic group this person is from based on their accent."],
    '49_Paralingual_MELD_ER_V2_676': ["What emotions do you think the speaker is expressing."],
    '50_Paralingual_MELD_ER_V2_692': ["Based on the speaker's speech patterns, what do you think they are feeling."],
    '51_Paralingual_VOXCELEB1_GR_V2_2148': ["May I know the gender of the speaker."],
    '53_Paralingual_VOXCELEB1_NR_V2_2286': ["What's the nationality identity of the speaker."],
    '55_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_2': ["What impact would the growth of the healthcare sector have on the country's economy in terms of employment and growth."],
    '56_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_415': ["Based on the statement, can you summarize the speaker's position on the recent controversial issues in Singapore."],
    '57_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_460': ["How does the author respond to parents' worries about masks in schools."],
    '2_ASR_IMDA_PART1_ASR_v2_2258': ["Turn the spoken language into a text format.", "Please translate the content into Chinese."],
    '3_ASR_IMDA_PART1_ASR_v2_2265': ["Turn the spoken language into a text format."],
    '4_ASR_IMDA_PART2_ASR_v2_999': ["Translate the spoken words into text format."],
    '5_ASR_IMDA_PART2_ASR_v2_2241': ["Translate the spoken words into text format."],
    '6_ASR_IMDA_PART2_ASR_v2_3409': ["Translate the spoken words into text format."],
    '8_ASR_IMDA_PART3_30_ASR_v2_1698': ["Need this talk written down, please."],
    '9_ASR_IMDA_PART3_30_ASR_v2_2474': ["Need this talk written down, please."],
    '11_ASR_IMDA_PART4_30_ASR_v2_3771': ["Write out the dialogue as text."],
    '12_ASR_IMDA_PART4_30_ASR_v2_103': ["Write out the dialogue as text."],
    '10_ASR_IMDA_PART4_30_ASR_v2_1527': ["Write out the dialogue as text."],
    '13_ASR_IMDA_PART5_30_ASR_v2_1446': ["Translate this vocal recording into a textual format."],
    '14_ASR_IMDA_PART5_30_ASR_v2_2281': ["Translate this vocal recording into a textual format."],
    '15_ASR_IMDA_PART5_30_ASR_v2_4388': ["Translate this vocal recording into a textual format."],
    '16_ASR_IMDA_PART6_30_ASR_v2_576': ["Record the spoken word in text form."],
    '18_ASR_IMDA_PART6_30_ASR_v2_2834': ["Record the spoken word in text form."],
    '19_ASR_AIShell_zh_ASR_v2_5044': ["Transform the oral presentation into a text document."],
    '20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833': ["Please provide a written transcription of the speech."],
    '27_ST_COVOST2_EN_ZH-CN_ST_V2_6697': ["Please translate the given speech to Chinese."],
    '28_SI_ALPACA-GPT4-AUDIO_SI_V2_299': ["Please follow the instruction in the speech."],
    '29_SI_ALPACA-GPT4-AUDIO_SI_V2_750': ["Please follow the instruction in the speech."],
}
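
# A minimal lookup sketch (an assumption about how the demo page offers
# prompts; the selectbox label is illustrative):
#
#     sample_name = st.selectbox("Audio sample", list(AUDIO_SAMPLES_W_INSTRUCT))
#     suggestions = AUDIO_SAMPLES_W_INSTRUCT[sample_name] + GENERAL_INSTRUCTIONS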


class NoAudioException(Exception):
    """Raised when a response is requested before any audio has been loaded."""
    pass


@st.cache_resource()
def start_server():
    """Open an SSH tunnel forwarding 127.0.0.1:local_port to port 8000 on the
    remote inference host. Cached so only one tunnel is created per session."""
    pkey = paramiko.RSAKey.from_private_key(io.StringIO(os.getenv('PRIVATE_KEY')))
    server = SSHTunnelForwarder(
        ssh_address_or_host=os.getenv('SERVER_DNS_NAME'),
        ssh_username="ec2-user",
        ssh_pkey=pkey,
        local_bind_address=("127.0.0.1", local_port),
        remote_bind_address=("127.0.0.1", 8000)
    )
    server.start()
    return server


@st.cache_resource()
def load_model():
    """Build an OpenAI-compatible client pointed at the tunnelled endpoint and
    return it together with the name of the first (and only) served model."""
    openai_api_key = os.getenv('API_KEY')
    openai_api_base = f"http://localhost:{local_port}/v1"
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )
    models = client.models.list()
    model_name = models.data[0].id
    return client, model_name
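
# A minimal wiring sketch (an assumption about the calling app, which is not
# part of this module; the st.session_state keys match those read below):
#
#     server = start_server()            # the tunnel must be up first
#     client, model_name = load_model()  # so the client can reach localhost
#     st.session_state.client = client
#     st.session_state.model_name = model_name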


def generate_response(text_input):
    """Stream a chat completion for the current audio clip plus a text
    instruction. Returns the response stream and a list of warning strings."""
    if not st.session_state.audio_base64:
        raise NoAudioException("audio is empty.")

    warnings = []
    # Warn on task types the model is not designed for; matched
    # case-insensitively so capitalised keywords are also caught.
    if re.search("tool|code|python|java|math|calculate", text_input, flags=re.IGNORECASE):
        warnings.append(
            "WARNING: MERaLiON-AudioLLM is not intended for use in "
            "tool calling, math, and coding tasks."
        )
    # Detect CJK characters and nudge the user towards English prompts.
    if re.search(r'[\u4e00-\u9fff]+', text_input):
        warnings.append("NOTE: Please try to prompt in English for the best performance.")

    stream = st.session_state.client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": f"Text instruction: {text_input}"
                },
                {
                    "type": "audio_url",
                    "audio_url": {
                        "url": f"data:audio/ogg;base64,{st.session_state.audio_base64}"
                    },
                },
            ],
        }],
        model=st.session_state.model_name,
        max_completion_tokens=512,
        temperature=st.session_state.temperature,
        top_p=st.session_state.top_p,
        stream=True,
    )
    return stream, warnings
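
# A minimal consumption sketch (an assumption about the calling page; the
# user_prompt name is illustrative):
#
#     stream, warnings = generate_response(user_prompt)
#     for message in warnings:
#         st.warning(message)
#     st.write_stream(stream)  # renders OpenAI streaming chunks incrementally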


def bytes_to_array(audio_bytes):
    """Decode raw audio bytes into a mono float array resampled to 16 kHz."""
    audio_array, _ = librosa.load(
        io.BytesIO(audio_bytes),
        sr=16000
    )
    return audio_array
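
# A minimal sketch of how st.session_state.audio_base64 gets populated (an
# assumption about the calling app; the uploader variable is illustrative):
#
#     import base64
#
#     uploaded = st.file_uploader("Upload audio", type=["wav", "mp3", "ogg"])
#     if uploaded is not None:
#         audio_bytes = uploaded.read()
#         st.session_state.audio_base64 = base64.b64encode(audio_bytes).decode()
#         waveform = bytes_to_array(audio_bytes)  # e.g. to preview with st.audio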