import os
import subprocess
import streamlit as st
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
import wave
import json
from vosk import Model, KaldiRecognizer
from transformers import pipeline
from huggingface_hub import snapshot_download
from pydub import AudioSegment
import noisereduce as nr
import plotly.graph_objects as go
import plotly.express as px
# 🎨 Apply Custom Dark Mode CSS
st.markdown(
"""
""",
unsafe_allow_html=True
)
# ✅ Auto-Download Vosk Model (Speech-to-Text)
VOSK_MODEL = "vosk-model-small-en-us-0.15"
if not os.path.exists(VOSK_MODEL):
    st.write("📥 Downloading Vosk Model...")
    subprocess.run(["wget", "-O", "vosk.zip", "https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip"], check=True)
    subprocess.run(["unzip", "vosk.zip"], check=True)
    subprocess.run(["rm", "vosk.zip"], check=True)
# Load Vosk model
model = Model(VOSK_MODEL)
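# NOTE: the model is reloaded on every Streamlit rerun; wrapping this load in a function
# decorated with st.cache_resource would avoid repeated loading (assumption: a Streamlit
# version that provides st.cache_resource is installed).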
# ✅ Auto-Download Wav2Vec2 Model (Emotion Detection)
WAV2VEC_MODEL = "facebook/wav2vec2-large-xlsr-53"
if not os.path.exists(WAV2VEC_MODEL):
    st.write(f"📥 Downloading {WAV2VEC_MODEL}...")
    snapshot_download(repo_id=WAV2VEC_MODEL, local_dir=WAV2VEC_MODEL)
# Load emotion detection model
emotion_model = pipeline("audio-classification", model=WAV2VEC_MODEL)
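# NOTE: facebook/wav2vec2-large-xlsr-53 is a self-supervised base checkpoint with no
# emotion-classification head, so the audio-classification pipeline attaches a freshly
# initialized head and the LABEL_0..LABEL_4 mapping below is only meaningful with a
# checkpoint fine-tuned for emotion recognition; swap such a checkpoint in for real predictions.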
# ✅ Streamlit UI
st.markdown("# 🎙️ Speech Detection System")
uploaded_file = st.file_uploader("Upload an MP3/WAV file", type=["mp3", "wav"])
if uploaded_file:
    # Convert MP3 to WAV if needed
    file_path = f"temp/{uploaded_file.name}"
    os.makedirs("temp", exist_ok=True)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    if file_path.endswith(".mp3"):
        wav_path = file_path.replace(".mp3", ".wav")
        audio = AudioSegment.from_mp3(file_path)
        audio.export(wav_path, format="wav")
        file_path = wav_path
    # Load audio
    y, sr = librosa.load(file_path, sr=16000)
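    # NOTE: librosa.load resamples to 16 kHz mono here, but the Vosk transcription below
    # reads file_path directly with the wave module, so the uploaded/converted WAV is
    # assumed to already be 16-bit mono PCM; writing the resampled signal back out with
    # soundfile and transcribing that file would be safer for arbitrary uploads.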
    # 🎵 Display waveform using Plotly
    st.markdown("### 🎵 Waveform")
    time_axis = np.linspace(0, len(y) / sr, num=len(y))
    fig_waveform = go.Figure()
    fig_waveform.add_trace(go.Scatter(
        x=time_axis,
        y=y,
        mode='lines',
        line=dict(color='cyan'),
        name="Waveform"
    ))
    fig_waveform.update_layout(
        title="Audio Waveform",
        xaxis_title="Time (seconds)",
        yaxis_title="Amplitude",
        template="plotly_dark"
    )
    st.plotly_chart(fig_waveform)
    # ✅ Noise Reduction
    st.markdown("### 🔇 Noise Reduction")
    y_denoised = nr.reduce_noise(y=y, sr=sr)
    denoised_path = file_path.replace(".wav", "_denoised.wav")
    sf.write(denoised_path, y_denoised, sr)
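    # NOTE: the denoised signal is only written to disk and played back at the bottom of the
    # page; passing denoised_path to transcribe_audio instead of file_path is a possible
    # variation for noisy recordings (assumption: not part of the original flow).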
    # ✅ Spectrogram using Plotly
    st.markdown("### 📊 Spectrogram")
    S = librosa.stft(y)
    S_db = librosa.amplitude_to_db(np.abs(S), ref=np.max)
    fig_spectrogram = px.imshow(
        S_db,
        aspect='auto',
        origin='lower',
        labels={"x": "Time (frames)", "y": "Frequency (bins)", "color": "Intensity (dB)"},
        color_continuous_scale="plasma"
    )
    fig_spectrogram.update_layout(
        title="Spectrogram",
        template="plotly_dark"
    )
    st.plotly_chart(fig_spectrogram)
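    # NOTE: S_db is the magnitude STFT converted to decibels relative to the peak, so the
    # y-axis is FFT bin index (0 .. n_fft/2), not Hz; with librosa's default n_fft=2048 at
    # 16 kHz each bin spans roughly 7.8 Hz.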
    # ✅ MFCC using Plotly
    st.markdown("### 🎼 MFCC")
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    fig_mfcc = px.imshow(
        mfccs,
        aspect='auto',
        origin='lower',
        labels={"x": "Time (frames)", "y": "MFCC Coefficients", "color": "Magnitude"},
        color_continuous_scale="viridis"
    )
    fig_mfcc.update_layout(
        title="Mel-Frequency Cepstral Coefficients (MFCC)",
        template="plotly_dark"
    )
    st.plotly_chart(fig_mfcc)
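    # NOTE: each column holds 13 coefficients summarizing the spectral envelope of one
    # analysis frame; MFCCs are a standard compact speech feature and are shown here for
    # inspection only, they are not fed into the Vosk or wav2vec2 models.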
    # ✅ Speech-to-Text using Vosk
    def transcribe_audio(audio_path):
        # Vosk expects 16-bit mono PCM; the file's own sample rate is passed to the recognizer
        wf = wave.open(audio_path, "rb")
        rec = KaldiRecognizer(model, wf.getframerate())
        segments = []
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                segments.append(json.loads(rec.Result()).get("text", ""))
        # Flush the recognizer to pick up the final partial segment
        segments.append(json.loads(rec.FinalResult()).get("text", ""))
        wf.close()
        return " ".join(s for s in segments if s)

    transcription = transcribe_audio(file_path)
    st.markdown("### 📝 Transcription")
    st.write(transcription)
    # ✅ Emotion Detection
    st.markdown("### 🎭 Emotion Detection")
    emotion_result = emotion_model(file_path)
    emotion_labels = {
        "LABEL_0": "Neutral",
        "LABEL_1": "Happy",
        "LABEL_2": "Sad",
        "LABEL_3": "Angry",
        "LABEL_4": "Surprised"
    }
    top_emotion = max(emotion_result, key=lambda x: x["score"])
    emotion_name = emotion_labels.get(top_emotion["label"], "Unknown")
    emotion_score = top_emotion["score"]
    st.markdown(f"**{emotion_name}** ({emotion_score:.2%} confidence)")
    # ✅ Play Original & Denoised Audio
    st.audio(file_path, format="audio/wav")
    st.audio(denoised_path, format="audio/wav")
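# Usage: launch the app with `streamlit run app.py` (assumption: this file is saved as
# app.py; any filename works as long as the same name is passed to streamlit).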