# NOTE: non-code artifact from the Hugging Face Spaces page export
# (Space status: Running) — kept as a comment so the file stays valid Python.
import warnings

warnings.filterwarnings("ignore")

import os

os.environ["SDL_AUDIODRIVER"] = "dummy"  # For SDL-based libraries (headless audio)
os.environ["PYGAME_HIDE_SUPPORT_PROMPT"] = "1"  # Optional: Hide pygame welcome message

import logging
import sys

import numpy as np
import streamlit as st
import torch
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

# Suppress ALSA warnings: temporarily silence stderr while importing pyaudio.
# Fixes from review: the devnull handle was previously leaked (never closed),
# stderr was not restored if the import raised, and `os` was imported twice.
_devnull = open(os.devnull, "w")
sys.stderr = _devnull
try:
    import pyaudio
finally:
    sys.stderr = sys.__stderr__
    _devnull.close()

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define audio stream parameters
FORMAT = pyaudio.paInt16  # 16-bit resolution
CHANNELS = 1              # Mono audio
RATE = 16000              # 16kHz sampling rate
CHUNK = 1024              # Number of frames per buffer
# Load Model and Feature Extractor
@st.cache_resource
def load_model():
    """
    Load the wav2vec2 model and feature extractor for gender recognition.

    Decorated with st.cache_resource so the model is downloaded/loaded only
    once per server process instead of on every Streamlit script rerun.

    Returns:
        tuple: (feature_extractor, model), with the model in eval mode.
    """
    model_path = "alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech"
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_path)
    model = AutoModelForAudioClassification.from_pretrained(model_path)
    model.eval()  # inference only: disable dropout / training-mode layers
    return feature_extractor, model
# Show a transient status message while the (possibly large) model loads.
status = st.empty()
status.text("Loading model...")
feature_extractor, model = load_model()
status.text("Model loaded!")

st.title("Real-Time Gender Detection from Voice :microphone:")
st.write("Click 'Start' to detect gender in real-time.")
status.empty()

# Initialize session state with defaults on first run only.
for _key, _default in (("listening", False), ("prediction", "")):
    if _key not in st.session_state:
        st.session_state[_key] = _default
# Function to stop listening
def stop_listening():
    """Stop the audio stream, release PyAudio, and update session state.

    Safe to call more than once: the stream/audio handles are *removed* from
    session state after being closed, so a second Stop click cannot call
    stop_stream()/terminate() on already-closed objects (previous bug).
    """
    stream = st.session_state.pop('stream', None)
    if stream is not None:
        logging.info("Stopping stream")
        try:
            stream.stop_stream()
            stream.close()
        except OSError as e:
            # Stream may already be dead (e.g. device unplugged); finish cleanup.
            logging.warning("Error while closing stream: %s", e)
    audio = st.session_state.pop('audio', None)
    if audio is not None:
        logging.info("Stopping audio")
        audio.terminate()
    st.session_state['listening'] = False
    st.session_state['prediction'] = "Stopped listening, click 'Start Listening' to start again."
    st.rerun()
def start_listening():
    """Open a PyAudio input stream and run the real-time detection loop.

    Records ~1.5 s windows of mono 16 kHz audio; when a window contains
    significant sound (peak amplitude > 0.05), runs the wav2vec2 classifier
    and shows the predicted gender label. Loops until
    st.session_state['listening'] becomes False.

    Fixes from review: on error the stream and PyAudio instance are now
    released (cleanup was previously commented out, leaking the audio
    device), and window audio is assembled with one concatenate instead of
    a quadratic per-chunk np.concatenate.
    """
    placeholder = st.empty()
    audio = None
    stream = None
    try:
        audio = pyaudio.PyAudio()
        stream = audio.open(format=FORMAT,
                            channels=CHANNELS,
                            rate=RATE,
                            input=True,
                            frames_per_buffer=CHUNK,
                            )
        st.session_state['stream'] = stream
        st.session_state['audio'] = audio
        st.session_state['listening'] = True
        st.session_state['prediction'] = "Listening........................"
        placeholder.write("Listening for audio...")
        # Number of CHUNK-sized reads that make up a ~1.5 second window.
        reads_per_window = int(RATE / CHUNK * 1.5)
        while st.session_state['listening']:
            chunks = []
            for _ in range(reads_per_window):
                # Read audio chunk from the stream
                data = stream.read(CHUNK, exception_on_overflow=False)
                # Convert int16 PCM bytes to float32 in [-1, 1)
                chunks.append(np.frombuffer(data, dtype=np.int16).astype(np.float32) / 32768.0)
            audio_data = np.concatenate(chunks) if chunks else np.array([], dtype=np.float32)
            # Check if there is significant sound (threshold on peak amplitude)
            if audio_data.size and np.max(np.abs(audio_data)) > 0.05:
                # Process the audio data
                inputs = feature_extractor(audio_data, sampling_rate=RATE, return_tensors="pt", padding=True)
                # Perform inference (no gradients needed)
                with torch.no_grad():
                    logits = model(**inputs).logits
                predicted_ids = torch.argmax(logits, dim=-1)
                # Map predicted IDs to labels
                predicted_label = model.config.id2label[predicted_ids.item()]
                if predicted_label != st.session_state['prediction']:
                    st.session_state['prediction'] = predicted_label
                    placeholder.write(f"Detected Gender: {predicted_label}")
            else:
                st.session_state['prediction'] = "---- No significant sound detected, skipping prediction. ----"
                placeholder.empty()
        placeholder.empty()
    except Exception as e:
        logging.error(f"An error occurred: {e}")
        st.error(f"An error occurred: {e}")
        # Release the audio device on failure so it is not leaked.
        if stream is not None:
            try:
                stream.stop_stream()
                stream.close()
            except OSError:
                pass
        if audio is not None:
            audio.terminate()
# Two side-by-side controls: Start begins the capture loop, Stop ends it.
start_col, stop_col = st.columns(2)
if start_col.button("Start"):
    start_listening()
if stop_col.button("Stop"):
    stop_listening()