import warnings
warnings.filterwarnings("ignore")

import os
os.environ["SDL_AUDIODRIVER"] = "dummy"  # For SDL-based libraries
os.environ["PYGAME_HIDE_SUPPORT_PROMPT"] = "1"  # Optional: hide the pygame welcome message

import logging
import sys

import numpy as np
import streamlit as st
import torch
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

# Suppress ALSA warnings written to stderr while PyAudio loads
_devnull = open(os.devnull, 'w')
sys.stderr = _devnull
import pyaudio
sys.stderr = sys.__stderr__
_devnull.close()

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define audio stream parameters
FORMAT = pyaudio.paInt16  # 16-bit resolution
CHANNELS = 1              # Mono audio
RATE = 16000              # 16 kHz sampling rate (what the model expects)
CHUNK = 1024              # Number of frames per buffer


# Load model and feature extractor
@st.cache_resource
def load_model():
    """Load the wav2vec2 model and feature extractor for gender recognition."""
    model_path = "alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech"
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_path)
    model = AutoModelForAudioClassification.from_pretrained(model_path)
    model.eval()
    return feature_extractor, model


placeholder = st.empty()
placeholder.text("Loading model...")
feature_extractor, model = load_model()
placeholder.text("Model loaded!")

st.title("Real-Time Gender Detection from Voice :microphone:")
st.write("Click 'Start' to detect gender in real-time.")
placeholder.empty()

# Initialize session state
if 'listening' not in st.session_state:
    st.session_state['listening'] = False
if 'prediction' not in st.session_state:
    st.session_state['prediction'] = ""


def stop_listening():
    """Stop the audio stream and update session state to stop listening."""
    if 'stream' in st.session_state:
        logging.info("Stopping stream")
        st.session_state['stream'].stop_stream()
        st.session_state['stream'].close()
        del st.session_state['stream']  # Drop the stale handle so a second click is safe
    if 'audio' in st.session_state:
        logging.info("Terminating PyAudio")
        st.session_state['audio'].terminate()
        del st.session_state['audio']
    st.session_state['listening'] = False
    st.session_state['prediction'] = "Stopped listening, click 'Start' to start again."
    st.rerun()


def start_listening():
    """Start the audio stream and continuously process audio for gender detection."""
    try:
        placeholder = st.empty()
        audio = pyaudio.PyAudio()
        stream = audio.open(
            format=FORMAT,
            channels=CHANNELS,
            rate=RATE,
            input=True,
            frames_per_buffer=CHUNK,
        )
        st.session_state['stream'] = stream
        st.session_state['audio'] = audio
        st.session_state['listening'] = True
        st.session_state['prediction'] = "Listening..."
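        # Capture-window arithmetic: int(RATE / CHUNK * 1.5) = int(16000 / 1024 * 1.5)
        # = 23 reads of 1024 frames each, i.e. 23552 samples or about 1.47 s of audio
        # per inference window.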
placeholder.write("Listening for audio...") while st.session_state['listening']: audio_data = np.array([], dtype=np.float32) for _ in range(int(RATE / CHUNK * 1.5)): # Read audio chunk from the stream data = stream.read(CHUNK, exception_on_overflow=False) # Convert byte data to numpy array and normalize chunk_data = np.frombuffer(data, dtype=np.int16).astype(np.float32) / 32768.0 audio_data = np.concatenate((audio_data, chunk_data)) # Check if there is significant sound if np.max(np.abs(audio_data)) > 0.05: # Threshold for detecting sound # Process the audio data inputs = feature_extractor(audio_data, sampling_rate=RATE, return_tensors="pt", padding=True) # Perform inference with torch.no_grad(): logits = model(**inputs).logits predicted_ids = torch.argmax(logits, dim=-1) # Map predicted IDs to labels predicted_label = model.config.id2label[predicted_ids.item()] if predicted_label != st.session_state['prediction']: st.session_state['prediction'] = predicted_label # st.write(f"Detected Gender: {predicted_label}") placeholder.write(f"Detected Gender: {predicted_label}") else: st.session_state['prediction'] = "---- No significant sound detected, skipping prediction. ----" placeholder.empty() placeholder.empty() except Exception as e: logging.error(f"An error occurred: {e}") st.error(f"An error occurred: {e}") # stop_listening() col1, col2 = st.columns(2) with col1: if st.button("Start"): start_listening() with col2: if st.button("Stop"): stop_listening()