import os
import google.generativeai as genai
import speech_recognition as sr
from gtts import gTTS # Replacing pyttsx3 with gTTS for text-to-speech
from dotenv import load_dotenv
import gradio as gr
import tempfile
from pydub import AudioSegment # Importing for audio conversion
# Load environment variables
load_dotenv()
def speak_and_save(text):
    """Use gTTS to speak the given text and save it as an audio file."""
    tts = gTTS(text)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
        audio_path = fp.name
    tts.save(audio_path)
    return audio_path
def recognize_speech_from_audio(audio_file):
    """Capture and recognize speech from the audio file."""
    recognizer = sr.Recognizer()
    # Debug print to check if audio file exists
    print(f"Processing audio file: {audio_file}")
    with sr.AudioFile(audio_file) as source:
        audio = recognizer.record(source)
    try:
        command = recognizer.recognize_google(audio)
        print(f"You said: {command}")
        return command
    except sr.UnknownValueError:
        print("Could not understand audio.")
        return None
    except sr.RequestError:
        print("Error with the speech recognition service.")
        return None
def process_command(command):
    """Generate a response based on the voice command using the AI model."""
    if command:
        response = model.generate_content([command])
        reply = response.text.strip()
        print(f"AI Response: {reply}")
        return reply
    return None
# Convert any audio file to WAV format to ensure compatibility with speech_recognition
def convert_to_wav(audio_path):
    """Convert any audio file to a WAV format."""
    audio = AudioSegment.from_file(audio_path)
    wav_path = tempfile.mktemp(suffix=".wav")
    audio.export(wav_path, format="wav")
    return wav_path
# Main Code with Generative AI Setup
api_key = os.getenv("MY_API_KEY")
if api_key is None:
    raise ValueError("API key not found in environment variables")
# Configure the AI model
genai.configure(api_key=api_key)
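# Sampling settings for the Gemini model: temperature/top_p/top_k control response randomness,
# max_output_tokens caps the reply length, and response_mime_type requests plain-text output.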
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash-8b-exp-0827",
    generation_config=generation_config,
)
wake_word = "hello"
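# The assistant only replies when this wake word appears in the transcribed command.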
def assistant(audio):
    # Check if the audio file path is provided
    print(f"Audio file received: {audio}")
    # Check if the file exists before processing
    if not audio or not os.path.exists(audio):
        print(f"Audio file does not exist or is not provided: {audio}")
        return "No audio provided.", None
    # Convert to WAV format before processing
    audio_wav = convert_to_wav(audio)
    # Process the speech from the audio
    command = recognize_speech_from_audio(audio_wav)
    if command and wake_word in command.lower():
        response_text = process_command(command)
        audio_response = speak_and_save(response_text)
        return response_text, audio_response
    else:
        return "Wake word not detected.", None
# Gradio Interface
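# launch(share=True) serves the app locally and also creates a temporary public Gradio link.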
gr.Interface(
    fn=assistant,  # Function to call when the interface is run
    inputs=gr.Audio(type="filepath"),  # Audio input, expecting a file path from the microphone
    outputs=[gr.Textbox(), gr.Audio(type="filepath", label="Response Audio")],  # Text reply and the spoken response audio
    title="Sema Voice Assistant"
).launch(share=True)