|
import gradio as gr |
|
from transformers import pipeline |
|
|
|
|
|
# Speech-to-text stage: transcribes recorded audio (placed on device 0).
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="canary/asr-small-librispeech",
    device=0,
)

# Question-answering stage: picks an answer for a question out of a context
# passage. No device argument is passed, so the pipeline default applies.
qa_pipeline = pipeline(
    "question-answering",
    model="LLAMA/llama3-base-qa",
    tokenizer="LLAMA/llama3-base-qa",
)

# Text-to-speech stage: synthesizes the spoken reply (placed on device 0).
tts_pipeline = pipeline(
    "text-to-speech",
    model="patrickvonplaten/vits-large",
    device=0,
)
|
|
|
|
|
def _normalize_transcript(text):
    """Lower-case *text*, replace punctuation with spaces, collapse whitespace."""
    cleaned = "".join(
        ch if ch.isalnum() or ch.isspace() else " " for ch in text.lower()
    )
    return " ".join(cleaned.split())


def capture_audio(wake_phrase="hey alex", chunk_length_s=3.0):
    """Listen on the microphone until the wake phrase is heard.

    Audio is recorded in fixed-length chunks, each chunk is transcribed with
    ``asr_pipeline``, and the first chunk whose transcript contains the wake
    phrase is returned.

    Args:
        wake_phrase: Phrase to listen for. Matching ignores case and
            punctuation, so "Hey, Alex!" matches "hey alex".
        chunk_length_s: Length in seconds of each recorded chunk.

    Returns:
        A dict ``{"raw": samples, "sampling_rate": rate}`` holding the chunk
        in which the wake phrase was recognised. This is one of the input
        formats the speech-recognition pipeline accepts.

    Raises:
        ValueError: If ``wake_phrase`` contains no letters or digits.
        RuntimeError: If the microphone stream ends before the wake phrase
            is heard.
    """
    target = _normalize_transcript(wake_phrase)
    if not target:
        # An empty phrase is a substring of every transcript and would
        # trigger on the very first chunk.
        raise ValueError("wake_phrase must contain at least one letter or digit")

    # Local import: the helper ships with transformers (already a dependency
    # of this file) but needs the ffmpeg binary at call time, so it is only
    # pulled in when microphone capture is actually requested.
    from transformers.pipelines.audio_utils import ffmpeg_microphone_live

    # Record at the rate the ASR model's feature extractor expects.
    sampling_rate = asr_pipeline.feature_extractor.sampling_rate
    microphone = ffmpeg_microphone_live(
        sampling_rate=sampling_rate,
        chunk_length_s=chunk_length_s,
    )

    print("Say, 'Hey, Alex'")
    for chunk in microphone:
        # The stream may emit partially filled chunks; wait for a full one.
        if chunk.get("partial"):
            continue

        audio_input = {
            "raw": chunk["raw"],
            "sampling_rate": chunk["sampling_rate"],
        }
        # Hand the pipeline its own dict so the returned one stays intact
        # even if the pipeline consumes keys from its input.
        transcript = asr_pipeline(dict(audio_input))["text"]

        if target in _normalize_transcript(transcript):
            print("I hear you!")
            print("Listening...")
            return audio_input

        print("Say, 'Hey, Alex'")

    raise RuntimeError("Microphone stream ended before the wake phrase was heard")
|
|
|
|
|
# Passage the question-answering model searches when no context is supplied.
_DEFAULT_CONTEXT = (
    "Friends is a popular American sitcom that aired from 1994 to 2004. "
    "The show revolves around a group of six friends living in New York "
    "City—Ross, Rachel, Chandler, Monica, Joey, and Phoebe—as they navigate "
    "various aspects of their personal and professional lives. "
    "Friends is known for its humor, memorable characters, and iconic "
    "catchphrases, making it a beloved and enduring cultural phenomenon."
)


def ai_assistant(audio_input, context=None):
    """Answer a spoken question with synthesized speech.

    The audio is transcribed with ``asr_pipeline``, the transcript is used as
    the question for ``qa_pipeline`` against ``context``, and the extracted
    answer is spoken by ``tts_pipeline``.

    Args:
        audio_input: Recorded question, passed unchanged to the
            speech-recognition pipeline (for example a path to an audio
            file). ``None`` means nothing was recorded.
        context: Passage to extract the answer from. Defaults to the built-in
            passage about the sitcom "Friends".

    Returns:
        A ``(sampling_rate, waveform)`` tuple for the spoken answer, or
        ``None`` when there is no audio, no recognisable question, or no
        answer text.
    """
    if audio_input is None:
        return None
    if context is None:
        context = _DEFAULT_CONTEXT

    # For a single input the ASR pipeline returns a dict keyed by "text".
    question = asr_pipeline(audio_input)["text"].strip()
    if not question:
        # The question-answering pipeline rejects an empty question.
        return None

    answer = qa_pipeline(question=question, context=context)
    answer_text = answer["answer"].strip()
    if not answer_text:
        return None

    # For a single input the TTS pipeline returns a dict with "audio" and
    # "sampling_rate".
    tts_output = tts_pipeline(answer_text)
    waveform = tts_output["audio"]
    # NOTE(review): some TTS models appear to return a leading batch axis of
    # size 1; drop singleton axes so the result is a plain sample sequence.
    # Confirm against the model actually deployed.
    if getattr(waveform, "ndim", 1) > 1:
        waveform = waveform.squeeze()

    return tts_output["sampling_rate"], waveform
|
|
|
if __name__ == "__main__":
    # Build and launch the web UI around ai_assistant.
    #
    # The legacy gr.inputs / gr.outputs namespaces no longer exist in current
    # Gradio releases; the unified gr.Audio component is used for both sides.
    # Gradio records in the browser and has no Python-side capture hook, so
    # the unsupported `capture=` keyword is not passed.
    demo = gr.Interface(
        fn=ai_assistant,
        # "filepath" hands the handler a path to the recorded clip.
        inputs=gr.Audio(
            sources=["microphone"],
            type="filepath",
            label="Speak Here",
        ),
        # "numpy" expects the handler to return (sampling_rate, waveform).
        outputs=gr.Audio(type="numpy", label="Assistant's Response"),
        title="Alexander the Great AI Assistant",
        description="An AI Assistant. Say 'Hey Alex' to speak to Alexander",
    )
    demo.launch(inbrowser=True)
|
|
|
|