import os
import google.generativeai as genai
import speech_recognition as sr
from gtts import gTTS  # Using gTTS (instead of pyttsx3) for text-to-speech
from dotenv import load_dotenv
import gradio as gr
import tempfile
from pydub import AudioSegment  # Importing for audio conversion

# Load environment variables
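# The Gemini key is read from a .env file next to this script; an assumed
# minimal layout (not part of this file) would be a single line such as:
#   MY_API_KEY=your-gemini-api-key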
load_dotenv()

def speak_and_save(text):
    """Use gTTS to speak the given text and save it as an audio file."""
    tts = gTTS(text)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
        audio_path = fp.name
    tts.save(audio_path)
    return audio_path
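
# Optional tweak (sketch): gTTS defaults to English, so a language code could
# be passed explicitly, e.g. gTTS(text, lang="en"). Note that the MP3 files
# created here use delete=False and are not removed automatically.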

def recognize_speech_from_audio(audio_file):
    """Capture and recognize speech from the audio file."""
    recognizer = sr.Recognizer()
    
    # Log the incoming audio path for debugging
    print(f"Processing audio file: {audio_file}")
    
    with sr.AudioFile(audio_file) as source:
        audio = recognizer.record(source)
    try:
        command = recognizer.recognize_google(audio)
        print(f"You said: {command}")
        return command
    except sr.UnknownValueError:
        print("Could not understand audio.")
        return None
    except sr.RequestError:
        print("Error with the speech recognition service.")
        return None
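
# Note: recognizer.recognize_google() sends the audio to Google's free Web
# Speech API, so recognition requires an internet connection; the RequestError
# branch above covers network and quota failures.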

def process_command(command):
    """Generate a response based on the voice command using the AI model."""
    if command:
        response = model.generate_content([command])
        reply = response.text.strip()
        print(f"AI Response: {reply}")
        return reply

# Convert any audio file to WAV format to ensure compatibility with speech_recognition
def convert_to_wav(audio_path):
    """Convert any audio file to WAV format."""
    audio = AudioSegment.from_file(audio_path)
    # Mirror the NamedTemporaryFile pattern used in speak_and_save instead of
    # the deprecated tempfile.mktemp()
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
        wav_path = fp.name
    audio.export(wav_path, format="wav")
    return wav_path
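
# Note: pydub needs ffmpeg (or libav) installed on the system to decode
# non-WAV inputs such as the recordings Gradio hands over from the browser.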

# Main Code with Generative AI Setup
api_key = os.getenv("MY_API_KEY")

if api_key is None:
    raise ValueError("API key not found in environment variables")

# Configure the AI model
genai.configure(api_key=api_key)

generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}

model = genai.GenerativeModel(
    model_name="gemini-1.5-flash-8b-exp-0827",
    generation_config=generation_config,
)
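
# Quick sanity check (sketch, assuming the API key and the experimental model
# name above are still valid):
#   print(model.generate_content(["Say hello"]).text)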

wake_word = "hello"

def assistant(audio):
    # Check if the audio file path is provided
    print(f"Audio file received: {audio}")
    
    # Check if the file exists before processing
    if not audio or not os.path.exists(audio):
        print(f"Audio file does not exist or is not provided: {audio}")
        return "No audio provided.", None

    # Convert to WAV format before processing
    audio_wav = convert_to_wav(audio)
    
    # Process the speech from the audio
    command = recognize_speech_from_audio(audio_wav)
    
    if command and wake_word in command.lower():
        response_text = process_command(command)
        audio_response = speak_and_save(response_text)
        return response_text, audio_response
    else:
        return "Wake word not detected.", None

# Gradio Interface
gr.Interface(
    fn=assistant,  # Callback invoked for each submitted recording
    inputs=gr.Audio(type="filepath"),  # Audio input (microphone or upload), passed to the function as a file path
    outputs=[gr.Textbox(), gr.Audio(type="filepath", label="Response Audio")],  # Text reply plus the spoken reply
    title="Sema Voice Assistant"
).launch(share=True)
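
# Note: share=True publishes a temporary public *.gradio.live URL in addition
# to the local server, which makes the assistant reachable from other devices
# while testing.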