File size: 2,457 Bytes
ea225b5
4710fcc
 
ea225b5
25a5cc8
 
 
 
 
 
4710fcc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25a5cc8
 
 
 
 
 
4710fcc
 
25a5cc8
 
 
 
 
 
 
 
 
 
4710fcc
 
25a5cc8
4710fcc
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import gradio as gr
import sounddevice as sd
import soundfile as sf
from transformers import pipeline

# Build the three Hugging Face pipelines the assistant chains together:
# speech recognition (ASR) -> extractive question answering (QA) -> speech
# synthesis (TTS). The ASR and TTS pipelines are pinned to device index 0;
# adjust that value to match your hardware.
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="canary/asr-small-librispeech",
    device=0,
)
qa_pipeline = pipeline(
    "question-answering",
    model="LLAMA/llama3-base-qa",
    tokenizer="LLAMA/llama3-base-qa",
)
tts_pipeline = pipeline(
    "text-to-speech",
    model="patrickvonplaten/vits-large",
    device=0,
)

# Function to capture audio
def capture_audio(duration=10, filename="temp.wav", *, samplerate=16000,
                  trigger_phrase="hey alex"):
    """Block until the trigger phrase is heard on the default microphone.

    Records fixed-length mono clips in a loop, writes each clip to
    ``filename``, transcribes it with the module-level ASR pipeline, and
    stops as soon as the transcript contains ``trigger_phrase``.

    Args:
        duration: Length of each recorded clip, in seconds.
        filename: Path of the WAV file each clip is written to (overwritten
            on every iteration).
        samplerate: Recording and file sample rate in Hz.
        trigger_phrase: Phrase to look for in the transcript; the match is
            case-insensitive.

    Returns:
        None. The function only returns once the trigger phrase was detected.
    """
    wanted = trigger_phrase.lower()
    frames_per_clip = int(duration * samplerate)

    print("Listening for trigger word...")
    # Listen for trigger word ("Hey, Alexander")
    while True:
        # sd.rec() returns the recording buffer itself (not a tuple) and
        # records in the background; sd.wait() blocks until it is filled.
        audio_input = sd.rec(frames_per_clip, samplerate=samplerate,
                             channels=1, dtype="int16")
        sd.wait()
        sf.write(filename, audio_input, samplerate)

        # A single-input ASR call returns one dict keyed by "text".
        transcript = asr_pipeline(filename)["text"]
        if wanted in transcript.lower():
            print("Hi! I'm listening...")
            break
    print("Listening...")

# AI assistant function
def ai_assistant(audio_input):
    """Answer a spoken question with synthesized speech.

    The audio is transcribed by the ASR pipeline, the transcript is used as
    the question for the QA pipeline against a fixed context paragraph, and
    the extracted answer is converted to speech by the TTS pipeline.

    Args:
        audio_input: Audio to transcribe, in any form the ASR pipeline
            accepts (for example a path to an audio file).

    Returns:
        A ``(sampling_rate, waveform)`` tuple: the sampling rate reported by
        the TTS pipeline and the generated audio samples. This is the form
        Gradio audio outputs expect for NumPy audio.
    """
    # Perform automatic speech recognition (ASR).
    # A single-input ASR call returns one dict keyed by "text", not a list.
    transcribed_text = asr_pipeline(audio_input)["text"]

    # Perform question answering (QA) against a fixed context paragraph.
    question = transcribed_text
    context = (
        "Friends is a popular American sitcom that aired from 1994 to 2004. "
        "The show revolves around a group of six friends living in New York "
        "City—Ross, Rachel, Chandler, Monica, Joey, and Phoebe—as they navigate "
        "various aspects of their personal and professional lives. "
        "Friends is known for its humor, memorable characters, and iconic "
        "catchphrases, making it a beloved and enduring cultural phenomenon."
    )
    answer = qa_pipeline(question=question, context=context)

    # Convert the answer to speech using text-to-speech (TTS).
    # The TTS pipeline returns a dict with "audio" and "sampling_rate".
    tts_output = tts_pipeline(answer["answer"])

    # NOTE(review): some TTS models appear to return audio with a leading
    # batch axis, shape (1, samples) -- squeeze so the player receives a flat
    # sample array; confirm against the model actually used.
    waveform = tts_output["audio"].squeeze()

    # The sampling rate must accompany the samples for correct playback.
    return tts_output["sampling_rate"], waveform

if __name__ == "__main__":
    # Create and launch the Gradio interface: microphone audio in,
    # synthesized speech out.
    #
    # The legacy gr.inputs.Audio component has no `capture` hook, so the
    # recording comes from the browser microphone and is passed to
    # ai_assistant as a file path, which the ASR pipeline can read directly.
    # The output uses the "numpy" type, i.e. a (sample_rate, samples) tuple.
    # NOTE(review): capture_audio (wake-phrase detection) is consequently not
    # wired into the UI; the description text below still mentions it.
    interface = gr.Interface(
        ai_assistant,
        inputs=gr.inputs.Audio(source="microphone", type="filepath", label="Speak Here"),
        outputs=gr.outputs.Audio(type="numpy", label="Assistant's Response"),
        title="Alexander the Great AI Assistant",
        description="An AI Assistant. Say 'Hi Alexander' to speak to Alex",
    )
    interface.launch(inbrowser=True)