import os
import subprocess
import streamlit as st
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
import wave
import json
from vosk import Model, KaldiRecognizer
from transformers import pipeline
from huggingface_hub import snapshot_download
from pydub import AudioSegment
import noisereduce as nr
import plotly.graph_objects as go
import plotly.express as px

# 🎨 Apply Custom Dark Mode CSS
st.markdown(
    """
    """,
    unsafe_allow_html=True
)

# ✅ Auto-Download Vosk Model (Speech-to-Text)
VOSK_MODEL = "vosk-model-small-en-us-0.15"
if not os.path.exists(VOSK_MODEL):
    st.write("📥 Downloading Vosk Model...")
    subprocess.run(["wget", "-O", "vosk.zip", "https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip"])
    subprocess.run(["unzip", "vosk.zip"])
    subprocess.run(["rm", "vosk.zip"])

# Load Vosk model
model = Model(VOSK_MODEL)

# ✅ Auto-Download Wav2Vec2 Model (Emotion Detection)
WAV2VEC_MODEL = "facebook/wav2vec2-large-xlsr-53"
if not os.path.exists(WAV2VEC_MODEL):
    st.write(f"📥 Downloading {WAV2VEC_MODEL}...")
    snapshot_download(repo_id=WAV2VEC_MODEL, local_dir=WAV2VEC_MODEL)

# Load emotion detection model
emotion_model = pipeline("audio-classification", model=WAV2VEC_MODEL)
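# A minimal sketch (assuming a recent Streamlit release with st.cache_resource): wrapping
# the model setup in a cached loader avoids re-initialising Vosk and the transformers
# pipeline on every Streamlit rerun. The helper name load_models is illustrative only.
#
#     @st.cache_resource
#     def load_models():
#         return Model(VOSK_MODEL), pipeline("audio-classification", model=WAV2VEC_MODEL)
#
#     model, emotion_model = load_models()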
# ✅ Streamlit UI
st.markdown("<h1>🎙️ Speech Detection System</h1>", unsafe_allow_html=True)
st.markdown(
    "<p>🔍 Upload an audio file for speech-to-text, noise filtering, and emotion analysis.</p>",
    unsafe_allow_html=True
)

uploaded_file = st.file_uploader("Upload an MP3/WAV file", type=["mp3", "wav"])

if uploaded_file:
    # Save the upload to a temp folder
    file_path = f"temp/{uploaded_file.name}"
    os.makedirs("temp", exist_ok=True)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    # Convert MP3 to WAV if needed
    if file_path.endswith(".mp3"):
        wav_path = file_path.replace(".mp3", ".wav")
        audio = AudioSegment.from_mp3(file_path)
        audio.export(wav_path, format="wav")
        file_path = wav_path

    # Load audio
    y, sr = librosa.load(file_path, sr=16000)
    # 🎵 Display waveform using Plotly
    st.markdown("<h3>🎼 Interactive Audio Waveform:</h3>", unsafe_allow_html=True)
    time_axis = np.linspace(0, len(y) / sr, num=len(y))
    fig_waveform = go.Figure()
    fig_waveform.add_trace(go.Scatter(
        x=time_axis,
        y=y,
        mode='lines',
        line=dict(color='cyan'),
        name="Waveform"
    ))
    fig_waveform.update_layout(
        title="Audio Waveform",
        xaxis_title="Time (seconds)",
        yaxis_title="Amplitude",
        template="plotly_dark"
    )
    st.plotly_chart(fig_waveform)
    # ✅ Noise Reduction
    st.markdown("<h3>🔇 Applying Noise Reduction...</h3>", unsafe_allow_html=True)
    y_denoised = nr.reduce_noise(y=y, sr=sr)
    denoised_path = file_path.replace(".wav", "_denoised.wav")
    sf.write(denoised_path, y_denoised, sr)
    # ✅ Spectrogram using Plotly
    st.markdown("<h3>🎤 Spectrogram (Frequency Analysis):</h3>", unsafe_allow_html=True)
    S = librosa.stft(y)
    S_db = librosa.amplitude_to_db(np.abs(S), ref=np.max)
    fig_spectrogram = px.imshow(
        S_db,
        aspect='auto',
        origin='lower',
        labels={"x": "Time (frames)", "y": "Frequency (bins)", "color": "Intensity (dB)"},
        color_continuous_scale="plasma"
    )
    fig_spectrogram.update_layout(
        title="Spectrogram",
        template="plotly_dark"
    )
    st.plotly_chart(fig_spectrogram)
    # ✅ MFCC using Plotly
    st.markdown("<h3>🎵 MFCC Feature Extraction:</h3>", unsafe_allow_html=True)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    fig_mfcc = px.imshow(
        mfccs,
        aspect='auto',
        origin='lower',
        labels={"x": "Time (frames)", "y": "MFCC Coefficients", "color": "Magnitude"},
        color_continuous_scale="viridis"
    )
    fig_mfcc.update_layout(
        title="Mel-Frequency Cepstral Coefficients (MFCC)",
        template="plotly_dark"
    )
    st.plotly_chart(fig_mfcc)

    # ✅ Speech-to-Text using Vosk
    def transcribe_audio(audio_path):
        wf = wave.open(audio_path, "rb")
        rec = KaldiRecognizer(model, wf.getframerate())
        results = []
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                # Collect each finalized segment instead of returning on the first one
                results.append(json.loads(rec.Result())["text"])
        # Flush the recognizer to capture the last partial segment
        results.append(json.loads(rec.FinalResult())["text"])
        wf.close()
        return " ".join(part for part in results if part)

    transcription = transcribe_audio(file_path)
    st.markdown("<h3>📝 Transcribed Text:</h3>", unsafe_allow_html=True)
    st.markdown(f"<div>{transcription}</div>", unsafe_allow_html=True)
    # ✅ Emotion Detection
    st.markdown("<h3>😊 Emotion Analysis:</h3>", unsafe_allow_html=True)
    emotion_result = emotion_model(file_path)
    emotion_labels = {
        "LABEL_0": "Neutral",
        "LABEL_1": "Happy",
        "LABEL_2": "Sad",
        "LABEL_3": "Angry",
        "LABEL_4": "Surprised"
    }
    top_emotion = max(emotion_result, key=lambda x: x["score"])
    emotion_name = emotion_labels.get(top_emotion["label"], "Unknown")
    emotion_score = top_emotion["score"]
    st.markdown(
        f"<div>{emotion_name} ({emotion_score:.2%} confidence)</div>",
        unsafe_allow_html=True
    )

    # ✅ Play Original & Denoised Audio
    st.audio(file_path, format="audio/wav")
    st.audio(denoised_path, format="audio/wav")