import gradio as gr
import numpy as np
from huggingface_hub import InferenceClient
import os
import requests
import scipy.io.wavfile
import io
import time

client = InferenceClient(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    token=os.getenv('hf_token')
)


def process_audio(audio_data):
    if audio_data is None:
        return "No audio provided.", ""

    # gr.Audio with type="numpy" delivers a (sample_rate, data) tuple
    if isinstance(audio_data, tuple):
        sample_rate, data = audio_data
    else:
        return "Invalid audio data format.", ""

    # Convert the audio data to WAV format in memory
    buf = io.BytesIO()
    scipy.io.wavfile.write(buf, sample_rate, data)
    wav_bytes = buf.getvalue()
    buf.close()

    API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v2"
    headers = {"Authorization": f"Bearer {os.getenv('hf_token')}"}

    def query(wav_data):
        response = requests.post(API_URL, headers=headers, data=wav_data)
        return response.json()

    # Call the Inference API to transcribe the audio
    output = query(wav_bytes)
    print(output)  # Check output in console (logs in HF Space)

    # Check the API response
    if 'text' in output:
        recognized_text = output['text']
        return recognized_text, recognized_text
    else:
        recognized_text = "The ASR module is still loading, please press the button again!"
        return recognized_text, ""


# Disable the button and show the loading indicator
def disable_components():
    # Tell the user that processing is underway
    recognized_text_update = gr.update(value='Processing, please wait...')
    # Disable process_button
    process_button_update = gr.update(interactive=False)
    # Show the loading animation
    loading_animation_update = gr.update(visible=True)
    return recognized_text_update, process_button_update, loading_animation_update


# Re-enable the button and hide the loading indicator
def enable_components(recognized_text):
    process_button_update = gr.update(interactive=True)
    # Hide the loading animation
    loading_animation_update = gr.update(visible=False)
    return recognized_text, process_button_update, loading_animation_update


# Flag set by respond() once the LLM has finished; polled by update_response_display()
llama_responded = 0


def respond(message, history: list[tuple[str, str]]):
    global llama_responded
    system_message = (
        "You are a helpful chatbot that answers questions. "
        "Give any answer within 50 words."
    )
    messages = [{"role": "system", "content": system_message}]

    # Replay the chat history as alternating user/assistant turns
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    messages.append({"role": "user", "content": message})

    response = ""
    for chunk in client.chat_completion(messages, stream=True):
        token = chunk.choices[0].delta.content
        if token:  # the final chunk of a stream may carry no content
            response += token

    llama_responded = 1
    return response


# Block until the LLM has responded (runs in the submit event chain)
def update_response_display():
    while not llama_responded:
        time.sleep(1)


def bot(history):
    global llama_responded
    # Append a canned audio reply (hard-coded local sample path) and reset the flag
    history.append([None, gr.Audio("/home/yxpeng/Projects/RAGHack/Exodia/voice_sample/trump1.wav")])
    llama_responded = 0
    return history


def create_interface():
    with gr.Blocks() as demo:
        # Title
        gr.Markdown("# Exodia AI Assistant")

        # Audio input section
        with gr.Row():
            audio_input = gr.Audio(
                sources="microphone",
                type="numpy",  # Get audio data and sample rate
                label="Say Something..."
            )
            recognized_text = gr.Textbox(label="Recognized Text", interactive=False)

        # Process audio button
        process_button = gr.Button("Process Audio")

        # Loading indicator shown while the ASR request runs
        loading_animation = gr.HTML(
            value='ASR Model is running...',
            visible=False
        )

        chatbot_custom = gr.Chatbot(height=500)  # Set height to 500 pixels

        # Chat interface using the custom chatbot instance
        chatbot = gr.ChatInterface(
            fn=respond,
            chatbot=chatbot_custom,
            submit_btn="Start Chatting"
        )

        # When the user submits a message, wait for the LLM to finish responding...
        user_start = chatbot.textbox.submit(
            fn=update_response_display,
            inputs=[],
            outputs=[],
        )
        # ...then append the audio reply to the chat history
        user_start.then(
            fn=bot,
            inputs=[chatbot_custom],
            outputs=chatbot_custom,
        )

        # On click: disable the controls, run ASR, then re-enable the controls
        process_button.click(
            fn=disable_components,
            inputs=[],
            outputs=[recognized_text, process_button, loading_animation]
        ).then(
            fn=process_audio,
            inputs=[audio_input],
            outputs=[recognized_text, chatbot.textbox]
        ).then(
            fn=enable_components,
            inputs=[recognized_text],
            outputs=[recognized_text, process_button, loading_animation]
        )

    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch()