|
import json
import logging
import os

import gradio as gr
import requests
|
|
|
def is_file_larger_than_30mb(file_path):
    """Return True if the file at *file_path* is larger than 30 MiB.

    Args:
        file_path: Path of the file to measure, passed to
            ``os.path.getsize``.

    Returns:
        True when the file size is strictly greater than
        30 * 1024 * 1024 bytes. False when the file is at or below that
        limit, and also when the size cannot be read at all (missing file,
        no permission, or a value that is not a usable path such as None).
    """
    size_limit_bytes = 30 * 1024 * 1024
    try:
        return os.path.getsize(file_path) > size_limit_bytes
    except (OSError, TypeError, ValueError):
        # OSError covers FileNotFoundError and PermissionError; TypeError and
        # ValueError cover arguments that are not valid paths (e.g. None or
        # a string with an embedded NUL). An unmeasurable file is reported
        # as "not too large" rather than raising.
        return False
|
|
|
_UPLOAD_LOG = logging.getLogger(__name__)


def _upload_audio_to(upload_url, audio_path):
    """POST the file at *audio_path* to *upload_url* as ``audio_file``.

    Shared implementation behind ``upload_audio`` and ``upload_audio_13b``,
    which differ only in the server they talk to.

    Args:
        upload_url: Full URL of the upload endpoint.
        audio_path: Local path of the audio file; may be None or empty when
            no file was provided.

    Returns:
        ``'size'`` if ``is_file_larger_than_30mb`` reports the file as too
        large; the ``"path"`` field of the server's JSON reply when the
        server answers with HTTP 200; ``None`` in every other case (no file
        given, file cannot be opened, request failed, non-200 status, or a
        reply without a usable ``"path"`` field).
    """
    if not audio_path:
        # Nothing to upload. This is an ordinary situation (no file was
        # provided), so it is not logged as an error.
        return None

    try:
        if is_file_larger_than_30mb(audio_path):
            return 'size'

        # NOTE(review): no timeout is passed, so this call waits for as long
        # as the server takes to respond.
        with open(audio_path, 'rb') as audio_file:
            response = requests.post(upload_url, files={'audio_file': audio_file})

        if response.status_code == 200:
            return response.json()["path"]

        _UPLOAD_LOG.warning(
            "Audio upload to %s returned HTTP %s", upload_url, response.status_code
        )
        return None
    except Exception:
        # Best-effort by design: callers treat None as "upload failed".
        # Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # SystemExit propagate, and the failure is logged instead of being
        # silently discarded.
        _UPLOAD_LOG.exception("Audio upload to %s failed", upload_url)
        return None


def upload_audio(audio_path):
    """Upload *audio_path* to the sls-titan-6 server.

    Returns ``'size'`` for files over 30MB, the server-reported ``"path"``
    on success, and ``None`` on any failure.
    """
    return _upload_audio_to('http://sls-titan-6.csail.mit.edu:8080/upload/', audio_path)


def upload_audio_13b(audio_path):
    """Upload *audio_path* to the sls-titan-5 server.

    Returns ``'size'`` for files over 30MB, the server-reported ``"path"``
    on success, and ``None`` on any failure.
    """
    return _upload_audio_to('http://sls-titan-5.csail.mit.edu:8080/upload/', audio_path)
|
|
|
def predict(audio_path, question, model):
    """Upload the audio, send the question to the selected server, return its answer.

    Args:
        audio_path: Local path of the audio file (None when nothing was
            provided).
        question: Question text to ask about the audio.
        model: Model label; ``'7B (Default)'`` selects the sls-titan-6
            server and ``'13B (Beta)'`` selects the sls-titan-5 server.

    Returns:
        The ``'output'`` field of the server's JSON reply, or one of the
        fixed user-facing messages when the upload fails, the file is over
        30MB, or the question is empty. Returns ``None`` when *model* is not
        one of the two known labels.

    Raises:
        Whatever ``requests.put`` or ``json.loads`` raise, and ``KeyError``
        if the reply has no ``'output'`` field; these are not handled here.
    """
    # Both models follow exactly the same steps; only the upload function
    # and the inference endpoint differ.
    if model == '7B (Default)':
        uploader = upload_audio
        endpoint = 'http://sls-titan-6.csail.mit.edu:8080/items/0'
    elif model == '13B (Beta)':
        uploader = upload_audio_13b
        endpoint = 'http://sls-titan-5.csail.mit.edu:8080/items/0'
    else:
        return None

    upload_status = uploader(audio_path)
    if upload_status is None:
        return 'Please upload an audio file.'
    if upload_status == 'size':
        return 'This demo does not support audio file size larger than 30MB.'
    if question == '':
        return 'Please ask a question.'

    print(audio_path, question)

    # The upload result is used only as a status check; the request carries
    # the original local audio_path, not the path returned by the upload.
    response = requests.put(endpoint, json={
        'audio_path': audio_path, 'question': question
    })
    answer = json.loads(response.content)
    return answer['output']
|
|
|
if __name__ == '__main__':
    # Link targets and anchor texts interpolated into the HTML description.
    link, text = "https://github.com/YuanGongND/ltu", "[Github]"
    paper_link, paper_text = "https://arxiv.org/pdf/2309.14405.pdf", "[ASRU Paper]"

    # Defined here but not referenced by the interface built below.
    sample_audio_link = "https://drive.google.com/drive/folders/17yeBevX0LIS1ugt0DZDOoJolwxvncMja?usp=sharing"
    sample_audio_text = "[sample audios from AudioSet evaluation set]"

    # Input widgets, in the order predict() receives their values:
    # audio file path, question text, model label.
    audio_input = gr.Audio(type="filepath")
    question_input = gr.Textbox(
        value='What can be inferred from the spoken text and sounds? Why?',
        label='Edit the textbox to ask your own questions!',
    )
    model_input = gr.Radio(
        ["7B (Default)", "13B (Beta)"],
        value='7B (Default)',
        label="LLM size",
        info="All experiments in the ASRU paper are 7B LLM.",
    )
    answer_output = gr.Textbox(label="LTU-AS-Output")

    # HTML/Markdown blurb shown under the title, assembled from its pieces.
    description_html = "".join([
        "LTU-AS an improved version of LTU. LTU-AS is stronger in spoken text understanding and music understanding. ",
        f"<a href='{paper_link}'>{paper_text}</a> <br>",
        "LTU-AS is authored by Yuan Gong, Alexander H. Liu, Hongyin Luo, Leonid Karlinsky, and James Glass (MIT & MIT-IBM Watson AI Lab). <br>",
        "Input should be wav file sampled at 16kHz. This demo trims input audio to 10 seconds. <br>",
        "Code of LTU-AS will be available soon at ",
        f"<a href='{link}'>{text}</a> <br>",
        "**Research Demo, Not for Commercial Use (Due to license of LLaMA).**",
    ])

    demo = gr.Interface(
        fn=predict,
        inputs=[audio_input, question_input, model_input],
        outputs=[answer_output],
        cache_examples=True,
        title="Demo of LTU-AS",
        description=description_html,
    )
    demo.launch(debug=False, share=False)