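"""Real-time gender detection from voice.

Captures microphone audio with PyAudio, classifies ~1.5-second windows with a
wav2vec2 gender-recognition model, and shows the predicted label in a
Streamlit UI.
"""
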
import warnings
warnings.filterwarnings("ignore")

import os
os.environ["SDL_AUDIODRIVER"] = "dummy"  # For SDL-based libraries
os.environ["PYGAME_HIDE_SUPPORT_PROMPT"] = "1"  # Optional: Hide pygame welcome message

import streamlit as st
import numpy as np
import torch
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

import sys
# Temporarily redirect stderr to silence ALSA warnings printed when pyaudio loads
sys.stderr = open(os.devnull, 'w')
import pyaudio
sys.stderr = sys.__stderr__

import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define audio stream parameters
FORMAT = pyaudio.paInt16  # 16-bit resolution
CHANNELS = 1              # Mono audio
RATE = 16000              # 16kHz sampling rate
CHUNK = 1024              # Number of frames per buffer
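# Each CHUNK covers 1024 / 16000 = 64 ms of audio; the capture loop below reads
# int(RATE / CHUNK * 1.5) = 23 chunks, i.e. roughly 1.47 s per inference window.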

# Load Model and Feature Extractor
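# @st.cache_resource keeps the loaded model in memory across Streamlit reruns,
# so the checkpoint is downloaded and initialized only once per server process.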
@st.cache_resource
def load_model():
    """
    Load the wav2vec2 model and feature extractor for gender recognition.
    """
    model_path = "alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech"
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_path)
    model = AutoModelForAudioClassification.from_pretrained(model_path)
    model.eval()
    return feature_extractor, model

placeholder = st.empty()
placeholder.text("Loading model...")
feature_extractor, model = load_model()
placeholder.text("Model loaded!")    

st.title("Real-Time Gender Detection from Voice :microphone:")
st.write("Click 'Start' to detect gender in real-time.")
placeholder.empty()

# Initialize session state
if 'listening' not in st.session_state:
    st.session_state['listening'] = False
if 'prediction' not in st.session_state:
    st.session_state['prediction'] = ""

# Function to stop listening
def stop_listening():
    """Stop the audio stream, release PyAudio, and update session state."""
    if 'stream' in st.session_state:
        logging.info("Stopping stream")
        st.session_state['stream'].stop_stream()
        st.session_state['stream'].close()
        # Remove the handle so a second Stop click cannot touch a closed stream
        del st.session_state['stream']
    if 'audio' in st.session_state:
        logging.info("Terminating PyAudio")
        st.session_state['audio'].terminate()
        del st.session_state['audio']
    st.session_state['listening'] = False
    st.session_state['prediction'] = "Stopped listening, click 'Start' to start again."
    st.rerun()

def start_listening():
    """Start the audio stream and continuously process audio for gender detection."""
    try:
        placeholder = st.empty()
        audio = pyaudio.PyAudio()
        stream = audio.open(format=FORMAT,
                            channels=CHANNELS,
                            rate=RATE,
                            input=True,
                            frames_per_buffer=CHUNK,
                        )
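        # The call above uses the system default input device; pass
        # input_device_index to audio.open to select a specific microphone.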

        st.session_state['stream'] = stream
        st.session_state['audio'] = audio
        st.session_state['listening'] = True
        st.session_state['prediction'] = "Listening..."
        placeholder.write("Listening for audio...")

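        # Note: this loop blocks the current Streamlit script run. Clicking
        # Stop queues a rerun, which interrupts this run at its next Streamlit
        # call and then executes stop_listening() on the new run.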
        while st.session_state['listening']:
            # Gather ~1.5 s of audio before each inference pass
            chunks = []

            for _ in range(int(RATE / CHUNK * 1.5)):
                # Read one buffer of raw samples from the stream
                data = stream.read(CHUNK, exception_on_overflow=False)

                # Convert int16 bytes to float32 in [-1.0, 1.0) (full scale is 32768)
                chunks.append(np.frombuffer(data, dtype=np.int16).astype(np.float32) / 32768.0)

            audio_data = np.concatenate(chunks)

            # Run inference only when the window contains significant sound
            if np.max(np.abs(audio_data)) > 0.05:  # Peak-amplitude threshold (fraction of full scale)
                # Process the audio data
                inputs = feature_extractor(audio_data, sampling_rate=RATE, return_tensors="pt", padding=True)
                # Perform inference
                with torch.no_grad():
                    logits = model(**inputs).logits
                    predicted_ids = torch.argmax(logits, dim=-1)

                    # Map predicted IDs to labels
                    predicted_label = model.config.id2label[predicted_ids.item()]

                    # Update the display only when the predicted label changes
                    if predicted_label != st.session_state['prediction']:
                        st.session_state['prediction'] = predicted_label
                        placeholder.write(f"Detected Gender: {predicted_label}")
            else:
                st.session_state['prediction'] = "No significant sound detected, skipping prediction."
                placeholder.empty()
        placeholder.empty()
    except Exception as e:
        logging.error(f"An error occurred: {e}")
        st.error(f"An error occurred: {e}")
        # Release the stream and reset state on failure
        stop_listening()


col1, col2 = st.columns(2)
with col1: 
    if st.button("Start"):
        start_listening()
with col2:
    if st.button("Stop"):
        stop_listening()
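
# To launch the app (assuming this file is saved as app.py):
#   streamlit run app.py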