import gradio as gr
import torch
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
import librosa
import numpy as np
import plotly.graph_objects as go
import warnings

warnings.filterwarnings('ignore')


def extract_prosodic_features(waveform, sr):
    """Extract prosodic features from audio"""
    try:
        features = {}

        # 1. Pitch (F0) features: keep the strongest pitch candidate per frame
        pitches, magnitudes = librosa.piptrack(y=waveform, sr=sr)
        f0_contour = []
        for t in range(pitches.shape[1]):
            pitch_index = magnitudes[:, t].argmax()
            f0_contour.append(pitches[pitch_index, t])
        f0_contour = np.array(f0_contour)
        f0_contour = f0_contour[f0_contour > 0]  # Remove unvoiced (zero-pitch) frames

        if len(f0_contour) > 0:
            features['pitch_mean'] = np.mean(f0_contour)
            features['pitch_std'] = np.std(f0_contour)
            features['pitch_range'] = np.ptp(f0_contour)
        else:
            features['pitch_mean'] = 0
            features['pitch_std'] = 0
            features['pitch_range'] = 0

        # 2. Energy/intensity features (frame-wise RMS)
        rms = librosa.feature.rms(y=waveform)[0]
        features['energy_mean'] = np.mean(rms)
        features['energy_std'] = np.std(rms)
        features['energy_range'] = np.ptp(rms)

        # 3. Rhythm features (global tempo estimate from onset strength)
        onset_env = librosa.onset.onset_strength(y=waveform, sr=sr)
        tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)
        features['tempo'] = tempo[0]

        # 4. Voice quality features
        spectral_centroids = librosa.feature.spectral_centroid(y=waveform, sr=sr)[0]
        features['spectral_centroid_mean'] = np.mean(spectral_centroids)

        spectral_rolloff = librosa.feature.spectral_rolloff(y=waveform, sr=sr)[0]
        features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)

        # 5. MFCC features (mean and std of the first 13 coefficients)
        mfccs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=13)
        for i in range(13):
            features[f'mfcc_{i}_mean'] = np.mean(mfccs[i])
            features[f'mfcc_{i}_std'] = np.std(mfccs[i])

        return features
    except Exception as e:
        print(f"Error in extract_prosodic_features: {str(e)}")
        return None


def create_feature_plots(features):
    """Create visualizations for audio features"""
    try:
        fig = go.Figure()

        # 1. Pitch features
        pitch_data = {
            'Mean': features['pitch_mean'],
            'Std Dev': features['pitch_std'],
            'Range': features['pitch_range'],
        }
        fig.add_trace(go.Bar(
            name='Pitch Features',
            x=list(pitch_data.keys()),
            y=list(pitch_data.values()),
            marker_color='blue'
        ))

        # 2. Energy features
        energy_data = {
            'Mean': features['energy_mean'],
            'Std Dev': features['energy_std'],
            'Range': features['energy_range'],
        }
        fig.add_trace(go.Bar(
            name='Energy Features',
            x=[f"Energy {k}" for k in energy_data.keys()],
            y=list(energy_data.values()),
            marker_color='red'
        ))

        # 3. MFCC plot
        mfcc_means = [features[f'mfcc_{i}_mean'] for i in range(13)]
        fig.add_trace(go.Scatter(
            name='MFCC Coefficients',
            y=mfcc_means,
            mode='lines+markers',
            marker_color='green'
        ))

        fig.update_layout(
            title='Voice Feature Analysis',
            showlegend=True,
            height=600,
            barmode='group'
        )

        return fig.to_html(include_plotlyjs=True)
    except Exception as e:
        print(f"Error in create_feature_plots: {str(e)}")
        return None


def load_models():
    """Initialize and load all required models"""
    global processor, whisper_model, emotion_tokenizer, emotion_model
    try:
        print("Loading Whisper model...")
        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
        whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

        print("Loading emotion model...")
        emotion_tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
        emotion_model = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base")

        whisper_model.to("cpu")
        emotion_model.to("cpu")

        print("Models loaded successfully!")
        return True
    except Exception as e:
        print(f"Error loading models: {str(e)}")
        return False


def create_emotion_plot(emotions):
    """Create emotion analysis visualization"""
    try:
        fig = go.Figure(data=[
            go.Bar(
                x=list(emotions.keys()),
                y=list(emotions.values()),
                marker_color='rgb(55, 83, 109)'
            )
        ])

        fig.update_layout(
            title='Emotion Analysis',
            xaxis_title='Emotion',
            yaxis_title='Score',
            yaxis_range=[0, 1],
            template='plotly_white',
            height=400
        )

        return fig.to_html(include_plotlyjs=True)
    except Exception as e:
        print(f"Error creating emotion plot: {str(e)}")
        return None


def analyze_audio(audio_input):
    """Main function to analyze audio input"""
    try:
        if audio_input is None:
            return "Please provide an audio input", None, None

        print(f"Processing audio input: {type(audio_input)}")

        # With type="filepath" Gradio passes a path string; handle a tuple defensively
        if isinstance(audio_input, tuple):
            audio_path = audio_input[0]
        else:
            audio_path = audio_input

        print(f"Loading audio from path: {audio_path}")

        # Load audio resampled to 16 kHz, the rate Whisper expects
        waveform, sr = librosa.load(audio_path, sr=16000)
        print(f"Audio loaded: {waveform.shape}, SR: {sr}")

        # Extract voice features
        print("Extracting voice features...")
        features = extract_prosodic_features(waveform, sr)
        if features is None:
            return "Error extracting voice features", None, None

        # Create feature plots
        print("Creating feature visualizations...")
        feature_viz = create_feature_plots(features)
        if feature_viz is None:
            return "Error creating feature visualizations", None, None

        # Transcribe audio
        print("Transcribing audio...")
        inputs = processor(waveform, sampling_rate=sr, return_tensors="pt").input_features
        with torch.no_grad():
            predicted_ids = whisper_model.generate(inputs)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        # Analyze emotions in the transcription
        print("Analyzing emotions...")
        emotion_inputs = emotion_tokenizer(
            transcription,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        )
        with torch.no_grad():
            emotion_outputs = emotion_model(**emotion_inputs)
            emotions = torch.nn.functional.softmax(emotion_outputs.logits, dim=-1)

        # Read label names from the model config so labels and scores stay aligned
        emotion_labels = [emotion_model.config.id2label[i] for i in range(emotions.shape[-1])]
        emotion_scores = {
            label: float(score)
            for label, score in zip(emotion_labels, emotions[0].cpu().numpy())
        }

        # Create emotion visualization
        emotion_viz = create_emotion_plot(emotion_scores)
        if emotion_viz is None:
            return "Error creating emotion visualization", None, None

        # Create analysis summary
        summary = f"""Voice Analysis Summary:

Speech Content:
{transcription}

Voice Characteristics:
- Average Pitch: {features['pitch_mean']:.2f} Hz
- Pitch Variation: {features['pitch_std']:.2f} Hz
- Speech Rate (Tempo): {features['tempo']:.2f} BPM
- Voice Energy: {features['energy_mean']:.4f}

Dominant Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
"""
        return summary, emotion_viz, feature_viz
    except Exception as e:
        error_msg = f"Error in audio analysis: {str(e)}"
        print(error_msg)
        return error_msg, None, None


# Load models at startup
print("Initializing application...")
if not load_models():
    raise RuntimeError("Failed to load required models")

# Create Gradio interface
demo = gr.Interface(
    fn=analyze_audio,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",
        label="Audio Input"
    ),
    outputs=[
        gr.Textbox(label="Analysis Summary", lines=10),
        gr.HTML(label="Emotion Analysis"),
        gr.HTML(label="Voice Feature Analysis")
    ],
    title="Voice Analysis System",
    description="""
    This application analyzes voice recordings to extract various characteristics:

    1. Voice Features:
       - Pitch analysis
       - Energy patterns
       - Speech rate
       - Voice quality

    2. Emotional Content:
       - Emotion detection
       - Emotional intensity

    3. Speech Content:
       - Text transcription

    Upload an audio file or record directly through your microphone.
    """,
    examples=None,
    cache_examples=False
)

if __name__ == "__main__":
    demo.launch(share=True)
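
# Quick smoke test (sketch): running this file directly starts the Gradio UI; open the
# local URL it prints, and with share=True a temporary public link is also created.
# To exercise the pipeline without the UI, something like the following works once the
# module has loaded its models (the WAV path is illustrative, not part of the app):
#
#     summary, emotion_html, feature_html = analyze_audio("path/to/sample.wav")
#     print(summary)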