import time
import torch
import spaces
import numpy as np
import gradio as gr
from gtts import gTTS
from transformers import pipeline
from huggingface_hub import InferenceClient
# Model names
ASR_MODEL_NAME = "openai/whisper-small"
LLM_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
# Initial system prompt
system_prompt = """"<s>[INST] You are Friday, a helpful and conversational AI assistant, and you respond with one to two sentences. [/INST] Hello there! I'm Friday, how can I help you?</s>"""
# Global variables for history
instruct_history = system_prompt
formatted_history = ""
# Create inference client for text generation
client = InferenceClient(LLM_MODEL_NAME)
# Set device for the ASR pipeline (CUDA device index 0 if available, otherwise CPU)
device = 0 if torch.cuda.is_available() else "cpu"
# ASR pipeline
pipe = pipeline(
task="automatic-speech-recognition",
model=ASR_MODEL_NAME,
device=device,
)
def generate(instruct_history, temperature=0.1, max_new_tokens=128, top_p=0.95, repetition_penalty=1.0):
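    """Generate the assistant's next turn from the full running instruct history."""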
temperature = float(temperature)
if temperature < 1e-2:
temperature = 1e-2
top_p = float(top_p)
generate_kwargs = dict(
temperature=temperature,
max_new_tokens=max_new_tokens,
top_p=top_p,
repetition_penalty=repetition_penalty,
do_sample=True,
seed=42,
)
    output = client.text_generation(
        instruct_history,
        **generate_kwargs,
        stream=False,
        details=False,
        return_full_text=False,
    )
    return output
@spaces.GPU(duration=60)
def transcribe(audio, past_history):
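    """Transcribe the recorded audio, ask the LLM for a reply, and speak it.

    `past_history` (the textbox value) is accepted from the Gradio wiring, but the
    conversation is tracked in the module-level globals, so it is not re-appended.
    Returns the path to the synthesized mp3 and the updated history text.
    """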
global instruct_history, formatted_history
    # Brief delay so the recorded audio is fully flushed before decoding
    time.sleep(1)

    sr, y = audio
    # Down-mix to mono (microphone input may be stereo) and normalise to [-1, 1]
    y = y.astype(np.float32)
    if y.ndim > 1:
        y = y.mean(axis=1)
    peak = np.max(np.abs(y))
    if peak > 0:  # avoid division by zero on a silent clip
        y /= peak

    transcribed_user_audio = pipe({"sampling_rate": sr, "raw": y})["text"]
    # Conversation state accumulates in the module-level globals
    formatted_history += f"😃 Human: {transcribed_user_audio}\n\n"
    instruct_history += f"<s>[INST] {transcribed_user_audio} [/INST] "
# Generate LLM response
llm_response = generate(instruct_history)
instruct_history += f" {llm_response}</s>"
    formatted_history += f"🤖 Friday: {llm_response}\n\n"
# Convert AI response to audio
audio_response = gTTS(llm_response)
audio_response.save("response.mp3")
print("Formatted History: ", formatted_history)
# Return the full conversation history
return "response.mp3", formatted_history
def clear_history():
    """Reset both histories to the initial system prompt and clear the textbox."""
    global instruct_history, formatted_history
    instruct_history = system_prompt
    formatted_history = ""
    return formatted_history
with gr.Blocks() as demo:
gr.HTML("<center><h1>Friday: AI Virtual Assistant πŸ€–</h1><center>")
with gr.Row():
        audio_input = gr.Audio(label="Human", sources=["microphone"])
output_audio = gr.Audio(label="Friday", type="filepath", interactive=False, autoplay=True, elem_classes="audio")
with gr.Row():
        send_btn = gr.Button("🚀 Send")
        clear_btn = gr.Button("🗑️ Clear")
# Textbox to display the full conversation history
transcription_box = gr.Textbox(label="Transcription", lines=10, placeholder="Conversation History...")
send_btn.click(fn=transcribe, inputs=[audio_input, transcription_box], outputs=[output_audio, transcription_box])
    clear_btn.click(fn=clear_history, inputs=None, outputs=[transcription_box])
if __name__ == "__main__":
demo.queue()
demo.launch()