# Origin: commit 8021f64 ("audio stream error" fix by prashant-garg)
# --- Environment setup: silence noisy audio backends before they load ---
import warnings
warnings.filterwarnings("ignore")
import os
os.environ["SDL_AUDIODRIVER"] = "dummy" # For SDL-based libraries
os.environ["PYGAME_HIDE_SUPPORT_PROMPT"] = "1" # Optional: Hide pygame welcome message
import streamlit as st
import numpy as np
import torch
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
import sys
import os  # NOTE(review): redundant — `os` is already imported above; harmless but removable
# Suppress ALSA warnings emitted on stderr while pyaudio initializes.
# NOTE(review): the devnull file handle opened here is never closed — it
# leaks one descriptor per process start; consider contextlib.redirect_stderr.
sys.stderr = open(os.devnull, 'w')
import pyaudio
sys.stderr = sys.__stderr__
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Define audio stream parameters (must match what the wav2vec2 model expects: 16 kHz mono)
FORMAT = pyaudio.paInt16 # 16-bit resolution
CHANNELS = 1 # Mono audio
RATE = 16000 # 16kHz sampling rate
CHUNK = 1024 # Number of frames per buffer
# Load Model and Feature Extractor
@st.cache_resource
def load_model():
    """
    Fetch and cache the wav2vec2 gender-recognition model.

    Cached by Streamlit so the download/instantiation happens once per
    process. The model is switched to eval mode so inference is
    deterministic (dropout disabled).

    Returns:
        tuple: (feature_extractor, model) ready for inference.
    """
    checkpoint = "alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech"
    extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
    classifier = AutoModelForAudioClassification.from_pretrained(checkpoint)
    classifier.eval()
    return extractor, classifier
# Transient status message while the (cached) model loads, then the page header.
status = st.empty()
status.text("Loading model...")
feature_extractor, model = load_model()
status.text("Model loaded!")
st.title("Real-Time Gender Detection from Voice :microphone:")
st.write("Click 'Start' to detect gender in real-time.")
status.empty()
# Seed session-state keys on first run so later reads never raise KeyError.
_defaults = {'listening': False, 'prediction': ""}
for _key, _value in _defaults.items():
    if _key not in st.session_state:
        st.session_state[_key] = _value
# Function to stop listening
def stop_listening():
    """Stop the audio stream, release PyAudio, and mark the session idle.

    The stream/audio handles are removed from ``st.session_state`` with
    ``pop`` as they are closed, so a second click of 'Stop' becomes a
    harmless no-op. (The original left the keys in place, making a repeat
    call close an already-closed stream, which raises inside PyAudio.)
    """
    stream = st.session_state.pop('stream', None)
    if stream is not None:
        logging.info("Stopping stream")
        stream.stop_stream()
        stream.close()
    audio = st.session_state.pop('audio', None)
    if audio is not None:
        logging.info("Stopping audio")
        audio.terminate()
    st.session_state['listening'] = False
    st.session_state['prediction'] = "Stopped listening, click 'Start Listening' to start again."
    # Force an immediate rerun so the UI reflects the stopped state.
    st.rerun()
def _record_window(stream, seconds=1.5):
    """Read roughly *seconds* of audio from *stream*.

    Returns a 1-D float32 numpy array normalized to [-1.0, 1.0). Chunks
    are collected in a list and concatenated once at the end — the
    original called ``np.concatenate`` per chunk, which is quadratic in
    the window length.
    """
    chunks = []
    for _ in range(int(RATE / CHUNK * seconds)):
        # exception_on_overflow=False: drop overflowed frames instead of raising.
        raw = stream.read(CHUNK, exception_on_overflow=False)
        # int16 PCM -> float32 in [-1, 1)
        chunks.append(np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32768.0)
    return np.concatenate(chunks)


def start_listening():
    """Open the microphone and classify speaker gender in ~1.5 s windows.

    Loops until ``st.session_state['listening']`` turns False or an error
    occurs. The audio device is always released in the ``finally`` block —
    the original leaked the stream on exception because its cleanup call
    was commented out.
    """
    placeholder = st.empty()
    audio = None
    stream = None
    try:
        audio = pyaudio.PyAudio()
        stream = audio.open(format=FORMAT,
                            channels=CHANNELS,
                            rate=RATE,
                            input=True,
                            frames_per_buffer=CHUNK,
                            )
        st.session_state['stream'] = stream
        st.session_state['audio'] = audio
        st.session_state['listening'] = True
        st.session_state['prediction'] = "Listening........................"
        placeholder.write("Listening for audio...")
        while st.session_state['listening']:
            audio_data = _record_window(stream)
            # Only run the model when the window holds significant sound.
            if np.max(np.abs(audio_data)) > 0.05:  # Threshold for detecting sound
                inputs = feature_extractor(audio_data, sampling_rate=RATE, return_tensors="pt", padding=True)
                with torch.no_grad():
                    logits = model(**inputs).logits
                predicted_ids = torch.argmax(logits, dim=-1)
                # Map predicted IDs to labels
                predicted_label = model.config.id2label[predicted_ids.item()]
                # Update the UI only when the label changes, to avoid flicker.
                if predicted_label != st.session_state['prediction']:
                    st.session_state['prediction'] = predicted_label
                    placeholder.write(f"Detected Gender: {predicted_label}")
            else:
                st.session_state['prediction'] = "---- No significant sound detected, skipping prediction. ----"
                placeholder.empty()
        placeholder.empty()
    except Exception as e:
        logging.error(f"An error occurred: {e}")
        st.error(f"An error occurred: {e}")
    finally:
        # Always release the device, whether the loop exited normally or
        # via exception; pop the handles so stop_listening is a no-op.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        st.session_state.pop('stream', None)
        if audio is not None:
            audio.terminate()
        st.session_state.pop('audio', None)
# Side-by-side Start/Stop controls.
start_col, stop_col = st.columns(2)
with start_col:
    if st.button("Start"):
        start_listening()
with stop_col:
    if st.button("Stop"):
        stop_listening()