Spaces:
Running
Running
File size: 4,945 Bytes
6a3ed8c 96e12ac ab02fe1 3b0b181 622a5d0 6a3ed8c 622a5d0 6a3ed8c 3b0b181 4d31d09 3b0b181 4d31d09 6a3ed8c 622a5d0 6a3ed8c 8021f64 6a3ed8c 15d8269 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
import warnings
warnings.filterwarnings("ignore")
import os
os.environ["SDL_AUDIODRIVER"] = "dummy" # For SDL-based libraries
os.environ["PYGAME_HIDE_SUPPORT_PROMPT"] = "1" # Optional: Hide pygame welcome message
import streamlit as st
import numpy as np
import torch
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
import sys
import os
# Suppress ALSA warnings
sys.stderr = open(os.devnull, 'w')
import pyaudio
sys.stderr = sys.__stderr__
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Define audio stream parameters for the PyAudio capture stream.
FORMAT = pyaudio.paInt16 # 16-bit signed integer samples
CHANNELS = 1 # Mono audio (model expects a single channel)
RATE = 16000 # 16 kHz sampling rate (matches the wav2vec2 model's expected rate)
CHUNK = 1024 # Number of frames read from the stream per buffer
# Load Model and Feature Extractor
@st.cache_resource
def load_model():
    """
    Load and cache the pretrained wav2vec2 gender-recognition model.

    Returns:
        tuple: ``(feature_extractor, model)`` — the Hugging Face feature
        extractor and the audio-classification model, with the model
        switched to eval mode. Cached by Streamlit so the download and
        construction happen only once per server process.
    """
    checkpoint = "alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech"
    extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
    classifier = AutoModelForAudioClassification.from_pretrained(checkpoint)
    classifier.eval()
    return extractor, classifier
# Show a transient status message while the (possibly slow) model load runs.
placeholder = st.empty()
placeholder.text("Loading model...")
# Module-level globals used by start_listening() below.
feature_extractor, model = load_model()
placeholder.text("Model loaded!")
st.title("Real-Time Gender Detection from Voice :microphone:")
st.write("Click 'Start' to detect gender in real-time.")
placeholder.empty()
# Initialize session state so keys exist on the first script run;
# Streamlit re-executes this file on every interaction, so guard with `not in`.
if 'listening' not in st.session_state:
    st.session_state['listening'] = False
if 'prediction' not in st.session_state:
    st.session_state['prediction'] = ""
# Function to stop listening
def stop_listening():
    """Stop the audio stream, release PyAudio, and reset session state.

    Removes the stream/PyAudio handles from ``st.session_state`` as they
    are closed, so calling this twice (or after a failed start) cannot
    double-close an already-closed stream. Finally triggers a Streamlit
    rerun so the UI reflects the stopped state.
    """
    # pop() makes cleanup idempotent: a second call finds no handle.
    stream = st.session_state.pop('stream', None)
    if stream is not None:
        logging.info("Stopping stream")
        stream.stop_stream()
        stream.close()
    audio = st.session_state.pop('audio', None)
    if audio is not None:
        logging.info("Stopping audio")
        audio.terminate()
    st.session_state['listening'] = False
    st.session_state['prediction'] = "Stopped listening, click 'Start Listening' to start again."
    st.rerun()
def _read_window(stream):
    """Read ~1.5 seconds of audio from *stream*, normalized to float32.

    Collects chunks in a list and concatenates once at the end instead of
    growing the array with np.concatenate per chunk (which is O(n^2)).
    Returns a 1-D float32 array with samples in [-1.0, 1.0).
    """
    chunks = []
    for _ in range(int(RATE / CHUNK * 1.5)):
        data = stream.read(CHUNK, exception_on_overflow=False)
        # 16-bit PCM bytes -> float32 in [-1, 1)
        chunks.append(np.frombuffer(data, dtype=np.int16).astype(np.float32) / 32768.0)
    return np.concatenate(chunks)

def _classify(audio_data):
    """Run the gender-classification model on one audio window and return its label."""
    inputs = feature_extractor(audio_data, sampling_rate=RATE, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return model.config.id2label[predicted_ids.item()]

def start_listening():
    """Open a PyAudio input stream and run the real-time detection loop.

    Captures ~1.5 s windows of 16 kHz mono audio; windows with no
    significant sound (peak amplitude <= 0.05) are skipped, others are
    classified and the predicted gender label is written to the UI.
    The stream and PyAudio handles are stored in ``st.session_state`` so
    ``stop_listening()`` can release them.
    """
    try:
        placeholder = st.empty()
        audio = pyaudio.PyAudio()
        stream = audio.open(format=FORMAT,
                            channels=CHANNELS,
                            rate=RATE,
                            input=True,
                            frames_per_buffer=CHUNK,
                            )
        st.session_state['stream'] = stream
        st.session_state['audio'] = audio
        st.session_state['listening'] = True
        st.session_state['prediction'] = "Listening........................"
        placeholder.write("Listening for audio...")
        while st.session_state['listening']:
            audio_data = _read_window(stream)
            # Threshold gates inference so silence does not produce labels.
            if np.max(np.abs(audio_data)) > 0.05:
                predicted_label = _classify(audio_data)
                # Only update the UI when the label actually changes.
                if predicted_label != st.session_state['prediction']:
                    st.session_state['prediction'] = predicted_label
                    placeholder.write(f"Detected Gender: {predicted_label}")
            else:
                st.session_state['prediction'] = "---- No significant sound detected, skipping prediction. ----"
                placeholder.empty()
        placeholder.empty()
    except Exception as e:
        logging.error(f"An error occurred: {e}")
        st.error(f"An error occurred: {e}")
        # stop_listening()  # intentionally disabled in the original; it would rerun the script
# Side-by-side Start/Stop controls. Each button click reruns the script;
# the handlers update st.session_state and (for Stop) trigger a rerun.
col1, col2 = st.columns(2)
with col1:
    if st.button("Start"):
        start_listening()
with col2:
    if st.button("Stop"):
        stop_listening()