invincible-jha committed on
Commit: 8aec16e
Parent(s): d1af0d7

Upload app.py

Files changed (1)
  1. app.py +175 -209
app.py CHANGED
@@ -4,18 +4,122 @@ from transformers import WhisperProcessor, WhisperForConditionalGeneration, Auto
  import librosa
  import numpy as np
  import plotly.graph_objects as go
- from plotly.subplots import make_subplots
  import warnings
  import os
- import pandas as pd
  from scipy.stats import kurtosis, skew
  warnings.filterwarnings('ignore')
  
- # Global variables for models
- processor = None
- whisper_model = None
- emotion_tokenizer = None
- emotion_model = None
  
  def load_models():
      """Initialize and load all required models"""
@@ -30,7 +134,6 @@ def load_models():
          emotion_tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
          emotion_model = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
  
-         # Move models to CPU explicitly
          whisper_model.to("cpu")
          emotion_model.to("cpu")
  
@@ -40,180 +143,66 @@ def load_models():
          print(f"Error loading models: {str(e)}")
          return False
  
- def extract_voice_features(waveform, sr):
-     """Extract comprehensive voice features for health analysis"""
-     features = {}
- 
-     try:
-         # 1. Fundamental Frequency (F0) Statistics
-         f0, voiced_flag, _ = librosa.pyin(waveform,
-                                           fmin=librosa.note_to_hz('C2'),
-                                           fmax=librosa.note_to_hz('C7'))
-         f0_valid = f0[voiced_flag]
-         features['f0_mean'] = np.mean(f0_valid)
-         features['f0_std'] = np.std(f0_valid)
-         features['f0_range'] = np.ptp(f0_valid)
- 
-         # 2. Jitter (F0 Variation)
-         if len(f0_valid) > 1:
-             f0_diff = np.diff(f0_valid)
-             features['jitter'] = np.mean(np.abs(f0_diff))
-             features['jitter_percent'] = (features['jitter'] / features['f0_mean']) * 100
- 
-         # 3. Shimmer (Amplitude Variation)
-         amplitude_envelope = np.abs(librosa.stft(waveform))
-         features['shimmer'] = np.mean(np.std(amplitude_envelope, axis=1))
- 
-         # 4. Spectral Features
-         spectral_centroids = librosa.feature.spectral_centroid(y=waveform, sr=sr)[0]
-         features['spectral_centroid_mean'] = np.mean(spectral_centroids)
-         features['spectral_centroid_std'] = np.std(spectral_centroids)
- 
-         spectral_rolloff = librosa.feature.spectral_rolloff(y=waveform, sr=sr)[0]
-         features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)
- 
-         # 5. Voice Quality Measures
-         mfccs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=13)
-         features['mfcc_means'] = np.mean(mfccs, axis=1)
-         features['mfcc_stds'] = np.std(mfccs, axis=1)
- 
-         # 6. Rhythm and Timing
-         tempo, _ = librosa.beat.beat_track(y=waveform, sr=sr)
-         features['speech_rate'] = tempo
- 
-         # 7. Energy Features
-         rms = librosa.feature.rms(y=waveform)[0]
-         features['energy_mean'] = np.mean(rms)
-         features['energy_std'] = np.std(rms)
-         features['energy_kurtosis'] = kurtosis(rms)
-         features['energy_skewness'] = skew(rms)
- 
-         # 8. Pause Analysis
-         silence_threshold = 0.01
-         is_silence = rms < silence_threshold
-         silence_regions = librosa.effects.split(waveform, top_db=20)
-         features['pause_count'] = len(silence_regions)
-         features['average_pause_duration'] = np.mean([r[1] - r[0] for r in silence_regions]) / sr
- 
-         return features, True
-     except Exception as e:
-         print(f"Error extracting voice features: {str(e)}")
-         return {}, False
- 
- def create_voice_analysis_plots(features):
-     """Create comprehensive visualization of voice analysis"""
      try:
-         # Create subplot figure
-         fig = make_subplots(
-             rows=2, cols=2,
-             subplot_titles=(
-                 'Fundamental Frequency Analysis',
-                 'Voice Quality Measures',
-                 'Energy and Rhythm Analysis',
-                 'MFCC Analysis'
-             )
-         )
- 
-         # 1. F0 Analysis Plot
-         f0_metrics = {
-             'Mean F0': features['f0_mean'],
-             'F0 Std Dev': features['f0_std'],
-             'F0 Range': features['f0_range'],
-             'Jitter %': features['jitter_percent']
-         }
-         fig.add_trace(
              go.Bar(
-                 x=list(f0_metrics.keys()),
-                 y=list(f0_metrics.values()),
-                 name='F0 Metrics'
-             ),
-             row=1, col=1
-         )
- 
-         # 2. Voice Quality Plot
-         quality_metrics = {
-             'Shimmer': features['shimmer'],
-             'Spectral Centroid': features['spectral_centroid_mean'] / 1000, # Scale for visibility
-             'Spectral Rolloff': features['spectral_rolloff_mean'] / 1000 # Scale for visibility
-         }
-         fig.add_trace(
-             go.Bar(
-                 x=list(quality_metrics.keys()),
-                 y=list(quality_metrics.values()),
-                 name='Voice Quality'
-             ),
-             row=1, col=2
-         )
- 
-         # 3. Energy and Rhythm Plot
-         energy_metrics = {
-             'Energy Mean': features['energy_mean'],
-             'Energy Std': features['energy_std'],
-             'Speech Rate': features['speech_rate'] / 10, # Scale for visibility
-             'Pause Count': features['pause_count']
-         }
-         fig.add_trace(
-             go.Bar(
-                 x=list(energy_metrics.keys()),
-                 y=list(energy_metrics.values()),
-                 name='Energy & Rhythm'
-             ),
-             row=2, col=1
-         )
- 
-         # 4. MFCC Analysis Plot
-         fig.add_trace(
-             go.Scatter(
-                 y=features['mfcc_means'],
-                 mode='lines+markers',
-                 name='MFCC Coefficients'
-             ),
-             row=2, col=2
-         )
  
-         # Update layout
          fig.update_layout(
-             height=800,
-             showlegend=False,
-             title_text="Comprehensive Voice Analysis",
          )
  
          return fig.to_html(include_plotlyjs=True)
      except Exception as e:
-         print(f"Error creating voice analysis plots: {str(e)}")
-         return "Error creating visualizations"
  
  def analyze_audio(audio_input):
      """Main function to analyze audio input"""
      try:
          if audio_input is None:
-             print("No audio input provided")
-             return "No audio file provided", "Please provide an audio file", ""
  
-         print(f"Received audio input: {audio_input}")
  
-         # Load and process audio
          if isinstance(audio_input, tuple):
-             audio_path = audio_input[0]
          else:
              audio_path = audio_input
  
-         # Load audio with original sampling rate
-         waveform, sr = librosa.load(audio_path, sr=None)
  
          # Extract voice features
-         voice_features, success = extract_voice_features(waveform, sr)
-         if not success:
-             return "Error extracting voice features", "Analysis failed", ""
  
-         # Create voice analysis visualization
-         voice_analysis_html = create_voice_analysis_plots(voice_features)
  
          # Transcribe audio
          print("Transcribing audio...")
-         # Resample for Whisper model
-         waveform_16k = librosa.resample(waveform, orig_sr=sr, target_sr=16000)
-         inputs = processor(waveform_16k, sampling_rate=16000, return_tensors="pt").input_features
  
          with torch.no_grad():
              predicted_ids = whisper_model.generate(inputs)
@@ -221,7 +210,7 @@ def analyze_audio(audio_input):
  
          # Analyze emotions
          print("Analyzing emotions...")
-         inputs = emotion_tokenizer(
              transcription,
              return_tensors="pt",
              padding=True,
@@ -230,8 +219,8 @@ def analyze_audio(audio_input):
          )
  
          with torch.no_grad():
-             outputs = emotion_model(**inputs)
-             emotions = torch.nn.functional.softmax(outputs.logits, dim=-1)
  
          emotion_labels = ['anger', 'fear', 'joy', 'neutral', 'sadness', 'surprise']
          emotion_scores = {
@@ -241,36 +230,30 @@ def analyze_audio(audio_input):
  
          # Create emotion visualization
          emotion_viz = create_emotion_plot(emotion_scores)
  
-         # Generate analysis summary
          summary = f"""Voice Analysis Summary:
- 
- Speech Characteristics:
- - Fundamental Frequency (Pitch): {voice_features['f0_mean']:.2f} Hz (average)
- - Jitter: {voice_features['jitter_percent']:.2f}% (voice stability)
- - Speech Rate: {voice_features['speech_rate']:.2f} BPM
- - Number of Pauses: {voice_features['pause_count']}
- - Average Pause Duration: {voice_features['average_pause_duration']:.2f} seconds
- 
- Voice Quality Indicators:
- - Shimmer: {voice_features['shimmer']:.4f} (amplitude variation)
- - Energy Distribution: {voice_features['energy_skewness']:.2f} (skewness)
- - Spectral Centroid: {voice_features['spectral_centroid_mean']:.2f} Hz
- 
- Emotional Content:
- - Primary Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
- - Emotional Variability: {np.std(list(emotion_scores.values())):.2f}
  
  Speech Content:
  {transcription}
  """
  
-         return summary, emotion_viz, voice_analysis_html
  
      except Exception as e:
-         error_msg = f"Error analyzing audio: {str(e)}"
          print(error_msg)
-         return error_msg, "Error in analysis", ""
  
  # Load models at startup
  print("Initializing application...")
@@ -287,45 +270,28 @@ demo = gr.Interface(
      ),
      outputs=[
          gr.Textbox(label="Analysis Summary", lines=10),
-         gr.HTML(label="Emotional Analysis"),
-         gr.HTML(label="Voice Biomarker Analysis")
      ],
-     title="Comprehensive Vocal Biomarker Analysis",
      description="""
-     This application performs comprehensive analysis of voice recordings to extract potential health-related biomarkers:
  
-     1. Speech Characteristics:
-     - Fundamental frequency analysis
-     - Voice stability measures (jitter, shimmer)
-     - Speech rate and rhythm
- 
-     2. Voice Quality Analysis:
-     - Spectral features
-     - Energy distribution
-     - MFCC analysis
  
-     3. Emotional Content:
      - Emotion detection
-     - Emotional stability analysis
  
-     4. Speech Content:
      - Text transcription
-     - Pause analysis
  
      Upload an audio file or record directly through your microphone.
      """,
-     article="""
-     ### About Vocal Biomarkers
-     Vocal biomarkers are measurable indicators in the human voice that can potentially indicate various health conditions.
-     This analysis focuses on several key aspects:
- 
-     - **Voice Quality**: Changes in voice quality can indicate respiratory or neurological conditions
-     - **Prosody**: Speech rhythm and timing can be indicators of cognitive function
-     - **Emotional Content**: Emotional patterns can be relevant to mental health assessment
-     - **Acoustic Features**: Specific acoustic patterns may correlate with various health conditions
- 
-     Note: This is a demonstration tool and should not be used for medical diagnosis.
-     """,
      examples=None,
      cache_examples=False
  )
 
app.py (after changes):

  import librosa
  import numpy as np
  import plotly.graph_objects as go
  import warnings
  import os
  from scipy.stats import kurtosis, skew
  warnings.filterwarnings('ignore')
  
+ def extract_prosodic_features(waveform, sr):
+     """Extract prosodic features from audio"""
+     try:
+         features = {}
+ 
+         # 1. Pitch (F0) Features
+         pitches, magnitudes = librosa.piptrack(y=waveform, sr=sr)
+         f0_contour = []
+         for t in range(pitches.shape[1]):
+             pitches_at_t = pitches[:, t]
+             mags = magnitudes[:, t]
+             pitch_index = mags.argmax()
+             f0_contour.append(pitches[pitch_index, t])
+         f0_contour = np.array(f0_contour)
+         f0_contour = f0_contour[f0_contour > 0] # Remove zero pitches
+ 
+         if len(f0_contour) > 0:
+             features['pitch_mean'] = np.mean(f0_contour)
+             features['pitch_std'] = np.std(f0_contour)
+             features['pitch_range'] = np.ptp(f0_contour)
+         else:
+             features['pitch_mean'] = 0
+             features['pitch_std'] = 0
+             features['pitch_range'] = 0
+ 
+         # 2. Energy/Intensity Features
+         rms = librosa.feature.rms(y=waveform)[0]
+         features['energy_mean'] = np.mean(rms)
+         features['energy_std'] = np.std(rms)
+         features['energy_range'] = np.ptp(rms)
+ 
+         # 3. Rhythm Features
+         onset_env = librosa.onset.onset_strength(y=waveform, sr=sr)
+         tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)
+         features['tempo'] = tempo[0]
+ 
+         # 4. Voice Quality Features
+         spectral_centroids = librosa.feature.spectral_centroid(y=waveform, sr=sr)[0]
+         features['spectral_centroid_mean'] = np.mean(spectral_centroids)
+ 
+         spectral_rolloff = librosa.feature.spectral_rolloff(y=waveform, sr=sr)[0]
+         features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)
+ 
+         # 5. MFCC Features
+         mfccs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=13)
+         for i in range(13):
+             features[f'mfcc_{i}_mean'] = np.mean(mfccs[i])
+             features[f'mfcc_{i}_std'] = np.std(mfccs[i])
+ 
+         return features
+ 
+     except Exception as e:
+         print(f"Error in extract_prosodic_features: {str(e)}")
+         return None
+ 
+ def create_feature_plots(features):
+     """Create visualizations for audio features"""
+     try:
+         # Create main figure with subplots
+         fig = go.Figure()
+ 
+         # 1. Pitch Features
+         pitch_data = {
+             'Mean': features['pitch_mean'],
+             'Std Dev': features['pitch_std'],
+             'Range': features['pitch_range']
+         }
+ 
+         fig.add_trace(go.Bar(
+             name='Pitch Features',
+             x=list(pitch_data.keys()),
+             y=list(pitch_data.values()),
+             marker_color='blue'
+         ))
+ 
+         # 2. Energy Features
+         energy_data = {
+             'Mean': features['energy_mean'],
+             'Std Dev': features['energy_std'],
+             'Range': features['energy_range']
+         }
+ 
+         fig.add_trace(go.Bar(
+             name='Energy Features',
+             x=[f"Energy {k}" for k in energy_data.keys()],
+             y=list(energy_data.values()),
+             marker_color='red'
+         ))
+ 
+         # 3. MFCC Plot
+         mfcc_means = [features[f'mfcc_{i}_mean'] for i in range(13)]
+         fig.add_trace(go.Scatter(
+             name='MFCC Coefficients',
+             y=mfcc_means,
+             mode='lines+markers',
+             marker_color='green'
+         ))
+ 
+         # Update layout
+         fig.update_layout(
+             title='Voice Feature Analysis',
+             showlegend=True,
+             height=600,
+             barmode='group'
+         )
+ 
+         return fig.to_html(include_plotlyjs=True)
+ 
+     except Exception as e:
+         print(f"Error in create_feature_plots: {str(e)}")
+         return None
  
  def load_models():
      """Initialize and load all required models"""
  
          emotion_tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
          emotion_model = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
  
          whisper_model.to("cpu")
          emotion_model.to("cpu")
  
          print(f"Error loading models: {str(e)}")
          return False
  
+ def create_emotion_plot(emotions):
+     """Create emotion analysis visualization"""
      try:
+         fig = go.Figure(data=[
              go.Bar(
+                 x=list(emotions.keys()),
+                 y=list(emotions.values()),
+                 marker_color='rgb(55, 83, 109)'
+             )
+         ])
  
          fig.update_layout(
+             title='Emotion Analysis',
+             xaxis_title='Emotion',
+             yaxis_title='Score',
+             yaxis_range=[0, 1],
+             template='plotly_white',
+             height=400
          )
  
          return fig.to_html(include_plotlyjs=True)
      except Exception as e:
+         print(f"Error creating emotion plot: {str(e)}")
+         return None
  
  def analyze_audio(audio_input):
      """Main function to analyze audio input"""
      try:
          if audio_input is None:
+             return "Please provide an audio input", None, None
  
+         print(f"Processing audio input: {type(audio_input)}")
  
+         # Handle audio input
          if isinstance(audio_input, tuple):
+             audio_path = audio_input[0] # Get file path from tuple
          else:
              audio_path = audio_input
  
+         print(f"Loading audio from path: {audio_path}")
+ 
+         # Load audio
+         waveform, sr = librosa.load(audio_path, sr=16000)
+         print(f"Audio loaded: {waveform.shape}, SR: {sr}")
  
          # Extract voice features
+         print("Extracting voice features...")
+         features = extract_prosodic_features(waveform, sr)
+         if features is None:
+             return "Error extracting voice features", None, None
  
+         # Create feature plots
+         print("Creating feature visualizations...")
+         feature_viz = create_feature_plots(features)
+         if feature_viz is None:
+             return "Error creating feature visualizations", None, None
  
          # Transcribe audio
          print("Transcribing audio...")
+         inputs = processor(waveform, sampling_rate=sr, return_tensors="pt").input_features
  
          with torch.no_grad():
              predicted_ids = whisper_model.generate(inputs)
  
          # Analyze emotions
          print("Analyzing emotions...")
+         emotion_inputs = emotion_tokenizer(
              transcription,
              return_tensors="pt",
              padding=True,
          )
  
          with torch.no_grad():
+             emotion_outputs = emotion_model(**emotion_inputs)
+             emotions = torch.nn.functional.softmax(emotion_outputs.logits, dim=-1)
  
          emotion_labels = ['anger', 'fear', 'joy', 'neutral', 'sadness', 'surprise']
          emotion_scores = {
  
          # Create emotion visualization
          emotion_viz = create_emotion_plot(emotion_scores)
+         if emotion_viz is None:
+             return "Error creating emotion visualization", None, None
  
+         # Create analysis summary
          summary = f"""Voice Analysis Summary:
  
  Speech Content:
  {transcription}
+ 
+ Voice Characteristics:
+ - Average Pitch: {features['pitch_mean']:.2f} Hz
+ - Pitch Variation: {features['pitch_std']:.2f} Hz
+ - Speech Rate (Tempo): {features['tempo']:.2f} BPM
+ - Voice Energy: {features['energy_mean']:.4f}
+ 
+ Dominant Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
  """
  
+         return summary, emotion_viz, feature_viz
  
      except Exception as e:
+         error_msg = f"Error in audio analysis: {str(e)}"
          print(error_msg)
+         return error_msg, None, None
  
  # Load models at startup
  print("Initializing application...")
 
      ),
      outputs=[
          gr.Textbox(label="Analysis Summary", lines=10),
+         gr.HTML(label="Emotion Analysis"),
+         gr.HTML(label="Voice Feature Analysis")
      ],
+     title="Voice Analysis System",
      description="""
+     This application analyzes voice recordings to extract various characteristics:
  
+     1. Voice Features:
+     - Pitch analysis
+     - Energy patterns
+     - Speech rate
+     - Voice quality
  
+     2. Emotional Content:
      - Emotion detection
+     - Emotional intensity
  
+     3. Speech Content:
      - Text transcription
  
      Upload an audio file or record directly through your microphone.
      """,
      examples=None,
      cache_examples=False
  )
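
For quick reference, the prosodic measurements introduced in this commit (a pitch contour from librosa.piptrack, frame-level RMS energy, and a global tempo estimate) can be reproduced outside the Gradio app. The snippet below is an illustrative sketch, not part of the commit: it re-implements just those three steps with librosa, and "sample.wav" is a placeholder path for any mono speech recording.

# Illustrative sketch (not part of the commit): reproduce the pitch, energy,
# and tempo measurements used by extract_prosodic_features() in app.py.
# "sample.wav" is a placeholder path for any mono speech recording.
import librosa
import numpy as np

waveform, sr = librosa.load("sample.wav", sr=16000)

# Pitch contour: strongest piptrack candidate per frame, unvoiced frames dropped
pitches, magnitudes = librosa.piptrack(y=waveform, sr=sr)
f0 = pitches[magnitudes.argmax(axis=0), np.arange(pitches.shape[1])]
f0 = f0[f0 > 0]

# Frame-level energy and a global tempo estimate
rms = librosa.feature.rms(y=waveform)[0]
onset_env = librosa.onset.onset_strength(y=waveform, sr=sr)
tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)[0]

if f0.size:
    print(f"Mean pitch:  {f0.mean():.2f} Hz (std {f0.std():.2f})")
print(f"Mean energy: {rms.mean():.4f}")
print(f"Tempo:       {tempo:.2f} BPM")

This mirrors the per-frame peak-picking loop in the committed function, just vectorized with NumPy instead of a Python loop.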