Update app.py
Browse files
app.py
CHANGED
@@ -1,20 +1,19 @@
|
|
1 |
import gradio as gr
|
2 |
from transformers import pipeline
|
3 |
|
4 |
-
#
|
5 |
-
asr_pipeline = pipeline("automatic-speech-recognition", model="
|
6 |
qa_pipeline = pipeline("question-answering", model="LLAMA/llama3-base-qa", tokenizer="LLAMA/llama3-base-qa")
|
7 |
-
tts_pipeline = pipeline("text-to-speech", model="patrickvonplaten/vits-large", device=0)
|
8 |
|
9 |
# Function to capture audio using Canary ASR
|
10 |
def capture_audio():
|
|
|
11 |
while True:
|
12 |
-
print("Say, 'Hey, Alex'")
|
13 |
-
# Use Canary ASR pipeline to capture audio
|
14 |
audio_input = asr_pipeline(None)[0]['input_values']
|
15 |
transcript = asr_pipeline(audio_input)[0]['transcription']
|
16 |
-
if "hey
|
17 |
-
print("
|
18 |
break
|
19 |
print("Listening...")
|
20 |
return audio_input
|
@@ -22,18 +21,14 @@ def capture_audio():
|
|
22 |
# AI assistant function
|
23 |
def ai_assistant(audio_input):
|
24 |
# Perform automatic speech recognition (ASR)
|
25 |
-
|
26 |
|
27 |
# Perform question answering (QA)
|
28 |
-
question =
|
29 |
-
# Provide the context for the question answering model
|
30 |
-
context = "Friends is a popular American sitcom that aired from 1994 to 2004. The show revolves around a group of six friends living in New York City—Ross, Rachel, Chandler, Monica, Joey, and Phoebe—as they navigate various aspects of their personal and professional lives. Friends is known for its humor, memorable characters, and iconic catchphrases, making it a beloved and enduring cultural phenomenon."
|
31 |
-
answer = qa_pipeline(question=question, context=context)
|
32 |
|
33 |
-
# Convert the
|
34 |
-
tts_output = tts_pipeline(
|
35 |
|
36 |
-
# Output the speech
|
37 |
return tts_output[0]['audio']
|
38 |
|
39 |
if __name__ == "__main__":
|
@@ -41,6 +36,6 @@ if __name__ == "__main__":
|
|
41 |
gr.Interface(ai_assistant,
|
42 |
inputs=gr.inputs.Audio(capture= capture_audio, label="Speak Here"),
|
43 |
outputs=gr.outputs.Audio(type="audio", label="Assistant's Response"),
|
44 |
-
title="
|
45 |
-
description="An AI Assistant
|
46 |
-
|
|
|
1 |
import gradio as gr
|
2 |
from transformers import pipeline
|
3 |
|
4 |
+
# Build the three Hugging Face pipelines the assistant chains together:
# speech recognition (ASR) -> extractive question answering (QA) ->
# text-to-speech (TTS). They are created once at import time and shared by
# the functions below.
# NOTE(review): the model ids are kept exactly as found; confirm that each of
# them exists on the Hub and matches its task before deploying.
# NOTE(review): device=0 presumably targets the first GPU; the QA pipeline is
# left on its default device, as in the original.
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="canary/asr-small-librispeech",
    device=0,
)
qa_pipeline = pipeline(
    "question-answering",
    model="LLAMA/llama3-base-qa",
    tokenizer="LLAMA/llama3-base-qa",
)
tts_pipeline = pipeline(
    "text-to-speech",
    model="patrickvonplaten/vits-large",
    device=0,
)
|
8 |
|
9 |
# Function to capture audio using Canary ASR
def capture_audio():
    """Block until the cue phrase "hey canary" is heard, then return that audio.

    Microphone audio is read in fixed-length chunks through transformers'
    ffmpeg-based helper (the ``ffmpeg`` binary must be installed), each
    complete chunk is transcribed with the module-level ``asr_pipeline``, and
    the first chunk whose transcript contains the cue phrase is returned.

    Returns:
        dict: ``{"raw": <float waveform>, "sampling_rate": <int>}`` for the
        chunk in which the cue phrase was detected. This is a format the ASR
        pipeline accepts directly as input.

    Raises:
        RuntimeError: If the microphone stream ends before the cue phrase
            is heard.
    """
    # Local import: keeps the file's top-level import block untouched.
    from transformers.pipelines.audio_utils import ffmpeg_microphone_live

    # Record at the rate the ASR model expects so no resampling is needed.
    sampling_rate = asr_pipeline.feature_extractor.sampling_rate

    print("Listening for cue words...")
    microphone = ffmpeg_microphone_live(
        sampling_rate=sampling_rate,
        chunk_length_s=5.0,
    )

    for chunk in microphone:
        # The live stream also yields still-growing chunks; wait for a
        # complete one before spending an ASR call on it.
        if chunk.get("partial"):
            continue

        audio_input = {
            "raw": chunk["raw"],
            "sampling_rate": chunk["sampling_rate"],
        }
        # The pipeline pops keys from dict inputs, so give it a copy and keep
        # ``audio_input`` intact for the caller.
        transcript = asr_pipeline(dict(audio_input))["text"]

        if "hey canary" in transcript.lower():
            print("Cue word detected!")
            print("Listening...")
            return audio_input

    raise RuntimeError("Microphone stream ended before the cue phrase was heard.")
|
|
|
21 |
# AI assistant function
def ai_assistant(audio_input, context="Insert your context here"):
    """Answer a spoken question with synthesized speech.

    The module-level pipelines are run in sequence: ``asr_pipeline``
    transcribes the audio, ``qa_pipeline`` extracts an answer to the
    transcript from ``context``, and ``tts_pipeline`` speaks that answer.

    Args:
        audio_input: The recorded question. Accepted forms are a file path,
            a ``{"raw": ..., "sampling_rate": ...}`` dict, or a
            ``(sampling_rate, samples)`` tuple as produced by a Gradio Audio
            component in numpy mode. ``None`` (nothing recorded) is allowed.
        context: Passage the QA model extracts its answer from. The default
            is the placeholder text the original code passed; replace it
            with real reference text.

    Returns:
        A ``(sampling_rate, waveform)`` tuple suitable for a Gradio Audio
        output, or ``None`` when there is no audio, nothing was transcribed,
        or the QA model returned an empty answer.
    """
    if audio_input is None:
        return None

    if isinstance(audio_input, tuple):
        # Gradio numpy mode: integer PCM, possibly multi-channel. The ASR
        # pipeline wants a mono float waveform plus its sampling rate.
        sampling_rate, samples = audio_input
        if samples.ndim > 1:
            samples = samples.mean(axis=1)
        samples = samples.astype("float32")
        peak = abs(samples).max() if samples.size else 0.0
        if peak > 0:
            samples = samples / peak
        audio_input = {"sampling_rate": sampling_rate, "raw": samples}
    elif isinstance(audio_input, dict):
        # The pipeline pops keys from dict inputs; do not mutate the caller's.
        audio_input = dict(audio_input)

    # Perform automatic speech recognition (ASR). A single input yields a
    # dict whose transcript is stored under "text".
    transcript = asr_pipeline(audio_input)["text"].strip()
    if not transcript:
        return None

    # Perform question answering (QA)
    qa_result = qa_pipeline(question=transcript, context=context)
    answer = qa_result["answer"]
    if not answer:
        return None

    # Convert the QA result to speech using text-to-speech (TTS). The result
    # is a dict holding the waveform and its sampling rate.
    tts_output = tts_pipeline(answer)
    # squeeze() drops a leading batch axis if the model emits one
    # (presumably shape (1, n) for VITS -- harmless when already 1-D).
    waveform = tts_output["audio"].squeeze()
    return tts_output["sampling_rate"], waveform
|
33 |
|
34 |
if __name__ == "__main__":
    # Build and serve the web UI. Audio is recorded or uploaded in the
    # browser by the Audio component itself; the component has no ``capture``
    # hook, and the old ``gr.inputs`` / ``gr.outputs`` namespaces are gone
    # from current Gradio releases, so the top-level ``gr.Audio`` is used for
    # both ends.
    demo = gr.Interface(
        fn=ai_assistant,
        # "filepath" hands the handler a path the ASR pipeline can read.
        inputs=gr.Audio(type="filepath", label="Speak Here"),
        # The handler returns (sampling_rate, waveform), which the default
        # Audio output type accepts.
        outputs=gr.Audio(label="Assistant's Response"),
        title="AI Assistant",
        description="An AI Assistant that answers questions based on your speech input.",
    )
    demo.launch()
|