"""Gradio demo: speak or type a message and stream a GPT-4o reply.

Microphone audio is transcribed with the ``openai/whisper-base.en`` pipeline
and the resulting text is sent to the OpenAI chat completions API using the
key entered in the UI. A typed message submitted with the button skips the
transcription step.
"""

from openai import OpenAI
import gradio as gr
from transformers import pipeline
import numpy as np

# Both pipelines are built once at import time and shared by every request.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

# Text returned by transcribe() when there is nothing to transcribe.
NO_AUDIO_MESSAGE = "No audio recorded."


def predict(message, history, api_key, include_history=False):
    """Stream a chat completion for ``message``.

    Args:
        message: The user's text for this turn.
        history: Iterable of ``(user_text, assistant_text)`` pairs from
            earlier turns. Only read when ``include_history`` is true.
        api_key: OpenAI API key used to build the client.
        include_history: When true, earlier turns are sent to the model ahead
            of ``message``. Defaults to ``False``, which matches the previous
            hard-wired ``if 0:`` switch (only the current message is sent).

    Yields:
        The reply accumulated so far, once per streamed chunk that carries
        text.
    """
    print('in predict')
    client = OpenAI(api_key=api_key)

    history_openai_format = []
    if include_history:
        for human, assistant in history:
            history_openai_format.append({"role": "user", "content": human})
            history_openai_format.append({"role": "assistant", "content": assistant})
    history_openai_format.append({"role": "user", "content": message})

    response = client.chat.completions.create(
        model='gpt-4o',
        messages=history_openai_format,
        temperature=1.0,
        stream=True,
    )

    partial_message = ""
    for chunk in response:
        delta = chunk.choices[0].delta.content
        if delta:
            print(111, delta)
            partial_message += delta
            yield partial_message


def chat_with_api_key(api_key, message, history):
    """Stream the reply to ``message`` and record the exchange in ``history``.

    Args:
        api_key: OpenAI API key forwarded to ``predict``.
        message: The user's text for this turn.
        history: Mutable list of ``(user_text, assistant_text)`` pairs. It is
            extended in place with exactly one pair per call, after the
            stream has finished, so partial replies never pile up in it.

    Yields:
        ``(reply_so_far, history)`` for every streamed update, followed by one
        final pair whose ``history`` contains the completed exchange. Nothing
        is yielded or recorded if the model returns no text.
    """
    print('in chat_with_api_key')
    accumulated_message = ""
    for accumulated_message in predict(message, history, api_key):
        yield accumulated_message, history

    if accumulated_message:
        history.append((message, accumulated_message))
        yield accumulated_message, history


def transcribe(audio):
    """Transcribe a recording to text.

    Args:
        audio: ``None`` or a ``(sampling_rate, samples)`` tuple where
            ``samples`` is a NumPy array, either 1-D or 2-D with the
            channels on axis 1 (assumed from Gradio's ``type="numpy"``
            audio format).

    Returns:
        The text produced by the speech-recognition pipeline, or
        ``"No audio recorded."`` when there is no audio or it has no samples.
    """
    if audio is None:
        return NO_AUDIO_MESSAGE

    sr, y = audio
    if y.size == 0:
        # np.max() raises on an empty array; treat it like a missing clip.
        return NO_AUDIO_MESSAGE

    if y.ndim > 1:
        # Down-mix multi-channel audio to a single channel.
        y = y.mean(axis=1)

    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        # Peak-normalise; skipped for pure silence to avoid dividing by zero
        # and feeding NaNs to the model.
        y /= peak

    return transcriber({"sampling_rate": sr, "raw": y})["text"]


def answer(transcription):
    """Run the extractive question-answering pipeline on ``transcription``.

    The question is answered against a fixed one-sentence context string.

    Returns:
        The ``'answer'`` field of the pipeline result.
    """
    context = "You are chatbot answering general questions"
    print(transcription)
    result = qa_model(question=transcription, context=context)
    print(result)
    return result['answer']


def clear_all():
    """Return reset values for the audio input and the two text boxes."""
    return None, "", ""


with gr.Blocks() as demo:
    with gr.Row():
        api_key = gr.Textbox(label="API Key", placeholder="Enter your API key", type="password")
        message = gr.Textbox(label="Message")

    gr.Markdown("# Audio Transcription and Question Answering")

    with gr.Row():
        audio_input = gr.Audio(label="Audio Input", sources=["microphone"], type="numpy")
        with gr.Column():
            # NOTE(review): no event handler writes to this box; it is only
            # reset by the Clear button.
            transcription_output = gr.Textbox(label="Transcription")
            clear_button = gr.Button("Clear")

    state = gr.State([])
    output = gr.Textbox(label="Output", lines=10)

    def update_output(api_key, audio_input, state):
        """Turn the user's input into a message and stream the model reply.

        Args:
            api_key: OpenAI API key from the UI.
            audio_input: Either a typed message (``str``, from the Submit
                button) or a recording (from the microphone), which is
                transcribed first.
            state: Conversation history list kept in ``gr.State``.

        Yields:
            ``(reply_so_far, history)`` pairs for the output box and state.
        """
        print('in update_output')
        if isinstance(audio_input, str):
            # The Submit button is wired to the Message textbox, so the value
            # is already text and must not go through transcribe().
            message = audio_input
        else:
            message = transcribe(audio_input)

        for response, updated_state in chat_with_api_key(api_key, message, state):
            yield response, updated_state

    btn = gr.Button("Submit")
    btn.click(update_output, inputs=[api_key, message, state], outputs=[output, state])

    audio_input.stop_recording(
        fn=update_output,
        inputs=[api_key, audio_input, state],
        outputs=[output, state],
    )

    clear_button.click(
        fn=clear_all,
        inputs=[],
        outputs=[audio_input, transcription_output, output],
    )

demo.launch()