YingxuHe commited on
Commit
8a5a187
·
verified ·
1 Parent(s): 4bd0ef1

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +112 -40
utils.py CHANGED
@@ -1,11 +1,12 @@
1
  import io
2
  import os
3
  import re
 
4
 
5
  import librosa
6
  import paramiko
7
  import streamlit as st
8
- from openai import OpenAI
9
  from sshtunnel import SSHTunnelForwarder
10
 
11
  local_port = int(os.getenv('LOCAL_PORT'))
@@ -20,12 +21,9 @@ GENERAL_INSTRUCTIONS = [
20
  AUDIO_SAMPLES_W_INSTRUCT = {
21
  '1_ASR_IMDA_PART1_ASR_v2_141' : ["Turn the spoken language into a text format.", "Please translate the content into Chinese."],
22
  '7_ASR_IMDA_PART3_30_ASR_v2_2269': ["Need this talk written down, please."],
 
23
  '17_ASR_IMDA_PART6_30_ASR_v2_1413': ["Record the spoken word in text form."],
24
 
25
- '25_ST_COVOST2_ZH-CN_EN_ST_V2_4567': ["Please translate the given speech to English."],
26
- '26_ST_COVOST2_EN_ZH-CN_ST_V2_5422': ["Please translate the given speech to Chinese."],
27
- '30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454': ["Please follow the instruction in the speech."],
28
-
29
  '32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572': ["What does the man think the woman should do at 4:00."],
30
  '33_SQA_IMDA_PART3_30_SQA_V2_2310': ["Does Speaker2's wife cook for Speaker2 when they are at home."],
31
  '34_SQA_IMDA_PART3_30_SQA_V2_3621': ["Does the phrase \"#gai-gai#\" have a meaning in Chinese or Hokkien language."],
@@ -61,7 +59,6 @@ AUDIO_SAMPLES_W_INSTRUCT = {
61
  '12_ASR_IMDA_PART4_30_ASR_v2_103' : ["Write out the dialogue as text."],
62
  '10_ASR_IMDA_PART4_30_ASR_v2_1527': ["Write out the dialogue as text."],
63
 
64
- '13_ASR_IMDA_PART5_30_ASR_v2_1446': ["Translate this vocal recording into a textual format."],
65
  '14_ASR_IMDA_PART5_30_ASR_v2_2281': ["Translate this vocal recording into a textual format."],
66
  '15_ASR_IMDA_PART5_30_ASR_v2_4388': ["Translate this vocal recording into a textual format."],
67
 
@@ -71,9 +68,13 @@ AUDIO_SAMPLES_W_INSTRUCT = {
71
  '19_ASR_AIShell_zh_ASR_v2_5044': ["Transform the oral presentation into a text document."],
72
  '20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833': ["Please provide a written transcription of the speech."],
73
 
 
 
 
74
  '27_ST_COVOST2_EN_ZH-CN_ST_V2_6697': ["Please translate the given speech to Chinese."],
75
  '28_SI_ALPACA-GPT4-AUDIO_SI_V2_299': ["Please follow the instruction in the speech."],
76
  '29_SI_ALPACA-GPT4-AUDIO_SI_V2_750': ["Please follow the instruction in the speech."],
 
77
  }
78
 
79
 
@@ -81,22 +82,64 @@ class NoAudioException(Exception):
81
  pass
82
 
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  @st.cache_resource()
85
  def start_server():
86
- pkey = paramiko.RSAKey.from_private_key(io.StringIO(os.getenv('PRIVATE_KEY')))
87
-
88
- server = SSHTunnelForwarder(
89
- ssh_address_or_host=os.getenv('SERVER_DNS_NAME'),
90
- ssh_username="ec2-user",
91
- ssh_pkey=pkey,
92
- local_bind_address=("127.0.0.1", local_port),
93
- remote_bind_address=("127.0.0.1", 8000)
94
- )
95
  server.start()
96
  return server
97
 
98
-
99
- @st.cache_resource()
100
  def load_model():
101
  openai_api_key = os.getenv('API_KEY')
102
  openai_api_base = f"http://localhost:{local_port}/v1"
@@ -122,34 +165,63 @@ def generate_response(text_input):
122
 
123
  if re.search(r'[\u4e00-\u9fff]+', text_input):
124
  warnings.append("NOTE: Please try to prompt in English for the best performance.")
125
-
126
- stream = st.session_state.client.chat.completions.create(
127
- messages=[{
128
- "role":
129
- "user",
130
- "content": [
131
- {
132
- "type": "text",
133
- "text": f"Text instruction: {text_input}"
134
- },
135
- {
136
- "type": "audio_url",
137
- "audio_url": {
138
- "url": f"data:audio/ogg;base64,{st.session_state.audio_base64}"
139
  },
140
- },
141
- ],
142
- }],
143
- model=st.session_state.model_name,
144
- max_completion_tokens=512,
145
- temperature=st.session_state.temperature,
146
- top_p=st.session_state.top_p,
147
- stream=True,
148
- )
 
 
 
 
 
 
 
 
 
149
 
150
  return stream, warnings
151
 
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  def bytes_to_array(audio_bytes):
154
  audio_array, _ = librosa.load(
155
  io.BytesIO(audio_bytes),
 
1
  import io
2
  import os
3
  import re
4
+ import time
5
 
6
  import librosa
7
  import paramiko
8
  import streamlit as st
9
+ from openai import OpenAI, APIConnectionError
10
  from sshtunnel import SSHTunnelForwarder
11
 
12
  local_port = int(os.getenv('LOCAL_PORT'))
 
21
  AUDIO_SAMPLES_W_INSTRUCT = {
22
  '1_ASR_IMDA_PART1_ASR_v2_141' : ["Turn the spoken language into a text format.", "Please translate the content into Chinese."],
23
  '7_ASR_IMDA_PART3_30_ASR_v2_2269': ["Need this talk written down, please."],
24
+ '13_ASR_IMDA_PART5_30_ASR_v2_1446': ["Translate this vocal recording into a textual format."],
25
  '17_ASR_IMDA_PART6_30_ASR_v2_1413': ["Record the spoken word in text form."],
26
 
 
 
 
 
27
  '32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572': ["What does the man think the woman should do at 4:00."],
28
  '33_SQA_IMDA_PART3_30_SQA_V2_2310': ["Does Speaker2's wife cook for Speaker2 when they are at home."],
29
  '34_SQA_IMDA_PART3_30_SQA_V2_3621': ["Does the phrase \"#gai-gai#\" have a meaning in Chinese or Hokkien language."],
 
59
  '12_ASR_IMDA_PART4_30_ASR_v2_103' : ["Write out the dialogue as text."],
60
  '10_ASR_IMDA_PART4_30_ASR_v2_1527': ["Write out the dialogue as text."],
61
 
 
62
  '14_ASR_IMDA_PART5_30_ASR_v2_2281': ["Translate this vocal recording into a textual format."],
63
  '15_ASR_IMDA_PART5_30_ASR_v2_4388': ["Translate this vocal recording into a textual format."],
64
 
 
68
  '19_ASR_AIShell_zh_ASR_v2_5044': ["Transform the oral presentation into a text document."],
69
  '20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833': ["Please provide a written transcription of the speech."],
70
 
71
+ '25_ST_COVOST2_ZH-CN_EN_ST_V2_4567': ["Please translate the given speech to English."],
72
+ '26_ST_COVOST2_EN_ZH-CN_ST_V2_5422': ["Please translate the given speech to Chinese."],
73
+
74
  '27_ST_COVOST2_EN_ZH-CN_ST_V2_6697': ["Please translate the given speech to Chinese."],
75
  '28_SI_ALPACA-GPT4-AUDIO_SI_V2_299': ["Please follow the instruction in the speech."],
76
  '29_SI_ALPACA-GPT4-AUDIO_SI_V2_750': ["Please follow the instruction in the speech."],
77
+ '30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454': ["Please follow the instruction in the speech."],
78
  }
79
 
80
 
 
82
  pass
83
 
84
 
85
class TunnelNotRunningException(Exception):
    """Raised when an API call fails because the SSH tunnel is not up."""
87
+
88
+
89
class SSHTunnelManager:
    """Wraps an SSHTunnelForwarder and tracks whether it is starting or running.

    The tunnel forwards 127.0.0.1:local_port to 127.0.0.1:8000 on the remote
    host named by SERVER_DNS_NAME, authenticating with the RSA key in the
    PRIVATE_KEY environment variable.
    """

    def __init__(self):
        # Key material comes from the environment; StringIO adapts it to
        # paramiko's file-like API.
        pkey = paramiko.RSAKey.from_private_key(io.StringIO(os.getenv('PRIVATE_KEY')))

        self.server = SSHTunnelForwarder(
            ssh_address_or_host=os.getenv('SERVER_DNS_NAME'),
            ssh_username="ec2-user",
            ssh_pkey=pkey,
            local_bind_address=("127.0.0.1", local_port),
            remote_bind_address=("127.0.0.1", 8000)
        )

        self._is_starting = False
        self._is_running = False

    def update_status(self):
        """Refresh the cached running state by probing the tunnel."""
        if not self._is_starting:
            self.server.check_tunnels()
            # any() is equivalent to checking the single configured tunnel,
            # and does not raise if no tunnels are registered yet.
            self._is_running = any(self.server.tunnel_is_up.values())
        else:
            # While a (re)start is in flight, report not-running.
            self._is_running = False

    def is_starting(self):
        """Return True while a start/restart is in progress."""
        self.update_status()
        return self._is_starting

    def is_running(self):
        """Return True when the tunnel is confirmed up."""
        self.update_status()
        return self._is_running

    def is_down(self):
        """Return True when the tunnel is neither running nor starting."""
        self.update_status()
        return (not self._is_running) and (not self._is_starting)

    def start(self, *args, **kwargs):
        """Start the tunnel unless a start is already in progress."""
        if not self._is_starting:
            self._is_starting = True
            try:
                self.server.start(*args, **kwargs)
            finally:
                # Clear the flag even on failure; otherwise a failed start
                # would leave _is_starting stuck True and block restart().
                self._is_starting = False

    def restart(self, *args, **kwargs):
        """Restart the tunnel unless a start is already in progress."""
        if not self._is_starting:
            self._is_starting = True
            try:
                self.server.restart(*args, **kwargs)
            finally:
                # Same guarantee as start(): never leave the flag stuck.
                self._is_starting = False
134
+
135
+
136
@st.cache_resource()
def start_server():
    """Build the cached SSH tunnel manager and bring the tunnel up.

    st.cache_resource ensures a single shared tunnel per Streamlit process.
    """
    manager = SSHTunnelManager()
    manager.start()
    return manager
141
 
142
+
 
143
  def load_model():
144
  openai_api_key = os.getenv('API_KEY')
145
  openai_api_base = f"http://localhost:{local_port}/v1"
 
165
 
166
  if re.search(r'[\u4e00-\u9fff]+', text_input):
167
  warnings.append("NOTE: Please try to prompt in English for the best performance.")
168
+
169
+ try:
170
+ stream = st.session_state.client.chat.completions.create(
171
+ messages=[{
172
+ "role":
173
+ "user",
174
+ "content": [
175
+ {
176
+ "type": "text",
177
+ "text": f"Text instruction: {text_input}"
 
 
 
 
178
  },
179
+ {
180
+ "type": "audio_url",
181
+ "audio_url": {
182
+ "url": f"data:audio/ogg;base64,{st.session_state.audio_base64}"
183
+ },
184
+ },
185
+ ],
186
+ }],
187
+ model=st.session_state.model_name,
188
+ max_completion_tokens=512,
189
+ temperature=st.session_state.temperature,
190
+ top_p=st.session_state.top_p,
191
+ stream=True,
192
+ )
193
+ except APIConnectionError as e:
194
+ if not st.session_state.server.is_running():
195
+ raise TunnelNotRunningException()
196
+ raise e
197
 
198
  return stream, warnings
199
 
200
 
201
def retry_generate_response(retry=3):
    """Stream a model response, retrying when the SSH tunnel has dropped.

    On TunnelNotRunningException, attempts to restart the tunnel (or waits
    if it is already restarting) and tries again, up to `retry` times.
    Re-raises the exception once the retry budget is exhausted.
    """
    response, warnings = "", []

    while True:
        try:
            stream, warnings = generate_response(st.session_state.prompt)
            for warning_msg in warnings:
                st.warning(warning_msg)
            response = st.write_stream(stream)
        except TunnelNotRunningException as e:
            if retry == 0:
                raise e

            st.error(f"Internet connection is down. Trying to re-establish connection ({retry}).")

            # Kick the tunnel back up, or give an in-flight restart time to finish.
            if st.session_state.server.is_down():
                st.session_state.server.restart()
            elif st.session_state.server.is_starting():
                time.sleep(2)

            retry -= 1
            continue

        return response, warnings
223
+
224
+
225
  def bytes_to_array(audio_bytes):
226
  audio_array, _ = librosa.load(
227
  io.BytesIO(audio_bytes),