invincible-jha committed • Commit 8aec16e • 1 Parent(s): d1af0d7
Upload app.py

app.py CHANGED
@@ -4,18 +4,122 @@ from transformers import WhisperProcessor, WhisperForConditionalGeneration, Auto
  import librosa
  import numpy as np
  import plotly.graph_objects as go
- from plotly.subplots import make_subplots
  import warnings
  import os
- import pandas as pd
  from scipy.stats import kurtosis, skew
  warnings.filterwarnings('ignore')

-
-
-
-
-

  def load_models():
      """Initialize and load all required models"""
@@ -30,7 +134,6 @@ def load_models():
          emotion_tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
          emotion_model = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base")

-         # Move models to CPU explicitly
          whisper_model.to("cpu")
          emotion_model.to("cpu")

@@ -40,180 +143,66 @@ def load_models():
          print(f"Error loading models: {str(e)}")
          return False

- def …
-     """…
-     features = {}
-
-     try:
-         # 1. Fundamental Frequency (F0) Statistics
-         f0, voiced_flag, _ = librosa.pyin(waveform,
-                                           fmin=librosa.note_to_hz('C2'),
-                                           fmax=librosa.note_to_hz('C7'))
-         f0_valid = f0[voiced_flag]
-         features['f0_mean'] = np.mean(f0_valid)
-         features['f0_std'] = np.std(f0_valid)
-         features['f0_range'] = np.ptp(f0_valid)
-
-         # 2. Jitter (F0 Variation)
-         if len(f0_valid) > 1:
-             f0_diff = np.diff(f0_valid)
-             features['jitter'] = np.mean(np.abs(f0_diff))
-             features['jitter_percent'] = (features['jitter'] / features['f0_mean']) * 100
-
-         # 3. Shimmer (Amplitude Variation)
-         amplitude_envelope = np.abs(librosa.stft(waveform))
-         features['shimmer'] = np.mean(np.std(amplitude_envelope, axis=1))
-
-         # 4. Spectral Features
-         spectral_centroids = librosa.feature.spectral_centroid(y=waveform, sr=sr)[0]
-         features['spectral_centroid_mean'] = np.mean(spectral_centroids)
-         features['spectral_centroid_std'] = np.std(spectral_centroids)
-
-         spectral_rolloff = librosa.feature.spectral_rolloff(y=waveform, sr=sr)[0]
-         features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)
-
-         # 5. Voice Quality Measures
-         mfccs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=13)
-         features['mfcc_means'] = np.mean(mfccs, axis=1)
-         features['mfcc_stds'] = np.std(mfccs, axis=1)
-
-         # 6. Rhythm and Timing
-         tempo, _ = librosa.beat.beat_track(y=waveform, sr=sr)
-         features['speech_rate'] = tempo
-
-         # 7. Energy Features
-         rms = librosa.feature.rms(y=waveform)[0]
-         features['energy_mean'] = np.mean(rms)
-         features['energy_std'] = np.std(rms)
-         features['energy_kurtosis'] = kurtosis(rms)
-         features['energy_skewness'] = skew(rms)
-
-         # 8. Pause Analysis
-         silence_threshold = 0.01
-         is_silence = rms < silence_threshold
-         silence_regions = librosa.effects.split(waveform, top_db=20)
-         features['pause_count'] = len(silence_regions)
-         features['average_pause_duration'] = np.mean([r[1] - r[0] for r in silence_regions]) / sr
-
-         return features, True
-     except Exception as e:
-         print(f"Error extracting voice features: {str(e)}")
-         return {}, False
-
- def create_voice_analysis_plots(features):
-     """Create comprehensive visualization of voice analysis"""
      try:
-
-         fig = make_subplots(
-             rows=2, cols=2,
-             subplot_titles=(
-                 'Fundamental Frequency Analysis',
-                 'Voice Quality Measures',
-                 'Energy and Rhythm Analysis',
-                 'MFCC Analysis'
-             )
-         )
-
-         # 1. F0 Analysis Plot
-         f0_metrics = {
-             'Mean F0': features['f0_mean'],
-             'F0 Std Dev': features['f0_std'],
-             'F0 Range': features['f0_range'],
-             'Jitter %': features['jitter_percent']
-         }
-         fig.add_trace(
              go.Bar(
-                 x=list(…
-                 y=list(…
-
-             )
-
-         )
-
-         # 2. Voice Quality Plot
-         quality_metrics = {
-             'Shimmer': features['shimmer'],
-             'Spectral Centroid': features['spectral_centroid_mean'] / 1000,  # Scale for visibility
-             'Spectral Rolloff': features['spectral_rolloff_mean'] / 1000  # Scale for visibility
-         }
-         fig.add_trace(
-             go.Bar(
-                 x=list(quality_metrics.keys()),
-                 y=list(quality_metrics.values()),
-                 name='Voice Quality'
-             ),
-             row=1, col=2
-         )
-
-         # 3. Energy and Rhythm Plot
-         energy_metrics = {
-             'Energy Mean': features['energy_mean'],
-             'Energy Std': features['energy_std'],
-             'Speech Rate': features['speech_rate'] / 10,  # Scale for visibility
-             'Pause Count': features['pause_count']
-         }
-         fig.add_trace(
-             go.Bar(
-                 x=list(energy_metrics.keys()),
-                 y=list(energy_metrics.values()),
-                 name='Energy & Rhythm'
-             ),
-             row=2, col=1
-         )
-
-         # 4. MFCC Analysis Plot
-         fig.add_trace(
-             go.Scatter(
-                 y=features['mfcc_means'],
-                 mode='lines+markers',
-                 name='MFCC Coefficients'
-             ),
-             row=2, col=2
-         )

-         # Update layout
          fig.update_layout(
-             …
-             …
-             …
          )

          return fig.to_html(include_plotlyjs=True)
      except Exception as e:
-         print(f"Error creating …
-         return …

  def analyze_audio(audio_input):
      """Main function to analyze audio input"""
      try:
          if audio_input is None:
-             …
-             return "No audio file provided", "Please provide an audio file", ""

-         print(f"…

-         # …
          if isinstance(audio_input, tuple):
-             audio_path = audio_input[0]
          else:
              audio_path = audio_input

-         …
-         …

          # Extract voice features
-         …
-         …
-         …

-         # Create …
-         …

          # Transcribe audio
          print("Transcribing audio...")
-         …
-         waveform_16k = librosa.resample(waveform, orig_sr=sr, target_sr=16000)
-         inputs = processor(waveform_16k, sampling_rate=16000, return_tensors="pt").input_features

          with torch.no_grad():
              predicted_ids = whisper_model.generate(inputs)
@@ -221,7 +210,7 @@ def analyze_audio(audio_input):

          # Analyze emotions
          print("Analyzing emotions...")
-         …
              transcription,
              return_tensors="pt",
              padding=True,
@@ -230,8 +219,8 @@ def analyze_audio(audio_input):
          )

          with torch.no_grad():
-             …
-             emotions = torch.nn.functional.softmax(…

          emotion_labels = ['anger', 'fear', 'joy', 'neutral', 'sadness', 'surprise']
          emotion_scores = {
@@ -241,36 +230,30 @@ def analyze_audio(audio_input):

          # Create emotion visualization
          emotion_viz = create_emotion_plot(emotion_scores)

-         # …
          summary = f"""Voice Analysis Summary:
-
-         Speech Characteristics:
-         - Fundamental Frequency (Pitch): {voice_features['f0_mean']:.2f} Hz (average)
-         - Jitter: {voice_features['jitter_percent']:.2f}% (voice stability)
-         - Speech Rate: {voice_features['speech_rate']:.2f} BPM
-         - Number of Pauses: {voice_features['pause_count']}
-         - Average Pause Duration: {voice_features['average_pause_duration']:.2f} seconds
-
-         Voice Quality Indicators:
-         - Shimmer: {voice_features['shimmer']:.4f} (amplitude variation)
-         - Energy Distribution: {voice_features['energy_skewness']:.2f} (skewness)
-         - Spectral Centroid: {voice_features['spectral_centroid_mean']:.2f} Hz
-
-         Emotional Content:
-         - Primary Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
-         - Emotional Variability: {np.std(list(emotion_scores.values())):.2f}

          Speech Content:
          {transcription}
          """

-         return summary, emotion_viz, …

      except Exception as e:
-         error_msg = f"Error …
          print(error_msg)
-         return error_msg, …

  # Load models at startup
  print("Initializing application...")
@@ -287,45 +270,28 @@ demo = gr.Interface(
      ),
      outputs=[
          gr.Textbox(label="Analysis Summary", lines=10),
-         gr.HTML(label="…
-         gr.HTML(label="Voice …
      ],
-     title="…
      description="""
-     This application …

-     1. …
-     - …
-     - …
-     - Speech rate
-
-     2. Voice Quality Analysis:
-     - Spectral features
-     - Energy distribution
-     - MFCC analysis

-     …
      - Emotion detection
-     - Emotional …

-     …
      - Text transcription
-     - Pause analysis

      Upload an audio file or record directly through your microphone.
      """,
-     article="""
-     ### About Vocal Biomarkers
-     Vocal biomarkers are measurable indicators in the human voice that can potentially indicate various health conditions.
-     This analysis focuses on several key aspects:
-
-     - **Voice Quality**: Changes in voice quality can indicate respiratory or neurological conditions
-     - **Prosody**: Speech rhythm and timing can be indicators of cognitive function
-     - **Emotional Content**: Emotional patterns can be relevant to mental health assessment
-     - **Acoustic Features**: Specific acoustic patterns may correlate with various health conditions
-
-     Note: This is a demonstration tool and should not be used for medical diagnosis.
-     """,
      examples=None,
      cache_examples=False
  )
app.py after the change (added lines are marked with +):

  import librosa
  import numpy as np
  import plotly.graph_objects as go
  import warnings
  import os
  from scipy.stats import kurtosis, skew
  warnings.filterwarnings('ignore')

+ def extract_prosodic_features(waveform, sr):
+     """Extract prosodic features from audio"""
+     try:
+         features = {}
+
+         # 1. Pitch (F0) Features
+         pitches, magnitudes = librosa.piptrack(y=waveform, sr=sr)
+         f0_contour = []
+         for t in range(pitches.shape[1]):
+             pitches_at_t = pitches[:, t]
+             mags = magnitudes[:, t]
+             pitch_index = mags.argmax()
+             f0_contour.append(pitches[pitch_index, t])
+         f0_contour = np.array(f0_contour)
+         f0_contour = f0_contour[f0_contour > 0]  # Remove zero pitches
+
+         if len(f0_contour) > 0:
+             features['pitch_mean'] = np.mean(f0_contour)
+             features['pitch_std'] = np.std(f0_contour)
+             features['pitch_range'] = np.ptp(f0_contour)
+         else:
+             features['pitch_mean'] = 0
+             features['pitch_std'] = 0
+             features['pitch_range'] = 0
+
+         # 2. Energy/Intensity Features
+         rms = librosa.feature.rms(y=waveform)[0]
+         features['energy_mean'] = np.mean(rms)
+         features['energy_std'] = np.std(rms)
+         features['energy_range'] = np.ptp(rms)
+
+         # 3. Rhythm Features
+         onset_env = librosa.onset.onset_strength(y=waveform, sr=sr)
+         tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)
+         features['tempo'] = tempo[0]
+
+         # 4. Voice Quality Features
+         spectral_centroids = librosa.feature.spectral_centroid(y=waveform, sr=sr)[0]
+         features['spectral_centroid_mean'] = np.mean(spectral_centroids)
+
+         spectral_rolloff = librosa.feature.spectral_rolloff(y=waveform, sr=sr)[0]
+         features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)
+
+         # 5. MFCC Features
+         mfccs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=13)
+         for i in range(13):
+             features[f'mfcc_{i}_mean'] = np.mean(mfccs[i])
+             features[f'mfcc_{i}_std'] = np.std(mfccs[i])
+
+         return features
+
+     except Exception as e:
+         print(f"Error in extract_prosodic_features: {str(e)}")
+         return None
+
+ def create_feature_plots(features):
+     """Create visualizations for audio features"""
+     try:
+         # Create main figure with subplots
+         fig = go.Figure()
+
+         # 1. Pitch Features
+         pitch_data = {
+             'Mean': features['pitch_mean'],
+             'Std Dev': features['pitch_std'],
+             'Range': features['pitch_range']
+         }
+
+         fig.add_trace(go.Bar(
+             name='Pitch Features',
+             x=list(pitch_data.keys()),
+             y=list(pitch_data.values()),
+             marker_color='blue'
+         ))
+
+         # 2. Energy Features
+         energy_data = {
+             'Mean': features['energy_mean'],
+             'Std Dev': features['energy_std'],
+             'Range': features['energy_range']
+         }
+
+         fig.add_trace(go.Bar(
+             name='Energy Features',
+             x=[f"Energy {k}" for k in energy_data.keys()],
+             y=list(energy_data.values()),
+             marker_color='red'
+         ))
+
+         # 3. MFCC Plot
+         mfcc_means = [features[f'mfcc_{i}_mean'] for i in range(13)]
+         fig.add_trace(go.Scatter(
+             name='MFCC Coefficients',
+             y=mfcc_means,
+             mode='lines+markers',
+             marker_color='green'
+         ))
+
+         # Update layout
+         fig.update_layout(
+             title='Voice Feature Analysis',
+             showlegend=True,
+             height=600,
+             barmode='group'
+         )
+
+         return fig.to_html(include_plotlyjs=True)
+
+     except Exception as e:
+         print(f"Error in create_feature_plots: {str(e)}")
+         return None

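As a smoke test for the two helpers added above (not part of the commit, and assuming the installed librosa still provides librosa.beat.tempo, which extract_prosodic_features relies on), they can be run on a synthetic tone:

import librosa

sr = 16000
test_tone = librosa.tone(220, sr=sr, duration=2.0)   # 2-second synthetic signal standing in for speech
feats = extract_prosodic_features(test_tone, sr)     # dict of prosodic features, or None on error
if feats is not None:
    print(f"pitch_mean={feats['pitch_mean']:.1f} Hz, tempo={feats['tempo']:.1f} BPM")
    html = create_feature_plots(feats)               # standalone Plotly HTML, or None on error
    if html is not None:
        with open("feature_plots.html", "w") as f:   # hypothetical output path
            f.write(html)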
  def load_models():
      """Initialize and load all required models"""
…
          emotion_tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
          emotion_model = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base")

          whisper_model.to("cpu")
          emotion_model.to("cpu")
…
          print(f"Error loading models: {str(e)}")
          return False

+ def create_emotion_plot(emotions):
+     """Create emotion analysis visualization"""
      try:
+         fig = go.Figure(data=[
              go.Bar(
+                 x=list(emotions.keys()),
+                 y=list(emotions.values()),
+                 marker_color='rgb(55, 83, 109)'
+             )
+         ])

          fig.update_layout(
+             title='Emotion Analysis',
+             xaxis_title='Emotion',
+             yaxis_title='Score',
+             yaxis_range=[0, 1],
+             template='plotly_white',
+             height=400
          )

          return fig.to_html(include_plotlyjs=True)
      except Exception as e:
+         print(f"Error creating emotion plot: {str(e)}")
+         return None

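create_emotion_plot can be exercised on its own with hand-made scores; the values below are illustrative, not app output:

sample_scores = {'anger': 0.05, 'fear': 0.03, 'joy': 0.70,
                 'neutral': 0.15, 'sadness': 0.05, 'surprise': 0.02}
emotion_html = create_emotion_plot(sample_scores)    # standalone Plotly HTML, or None on error
if emotion_html is not None:
    with open("emotion_plot.html", "w") as f:        # hypothetical output path
        f.write(emotion_html)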
  def analyze_audio(audio_input):
      """Main function to analyze audio input"""
      try:
          if audio_input is None:
+             return "Please provide an audio input", None, None

+         print(f"Processing audio input: {type(audio_input)}")

+         # Handle audio input
          if isinstance(audio_input, tuple):
+             audio_path = audio_input[0]  # Get file path from tuple
          else:
              audio_path = audio_input

+         print(f"Loading audio from path: {audio_path}")
+
+         # Load audio
+         waveform, sr = librosa.load(audio_path, sr=16000)
+         print(f"Audio loaded: {waveform.shape}, SR: {sr}")

          # Extract voice features
+         print("Extracting voice features...")
+         features = extract_prosodic_features(waveform, sr)
+         if features is None:
+             return "Error extracting voice features", None, None

+         # Create feature plots
+         print("Creating feature visualizations...")
+         feature_viz = create_feature_plots(features)
+         if feature_viz is None:
+             return "Error creating feature visualizations", None, None

          # Transcribe audio
          print("Transcribing audio...")
+         inputs = processor(waveform, sampling_rate=sr, return_tensors="pt").input_features

          with torch.no_grad():
              predicted_ids = whisper_model.generate(inputs)
…

          # Analyze emotions
          print("Analyzing emotions...")
+         emotion_inputs = emotion_tokenizer(
              transcription,
              return_tensors="pt",
              padding=True,
…
          )

          with torch.no_grad():
+             emotion_outputs = emotion_model(**emotion_inputs)
+             emotions = torch.nn.functional.softmax(emotion_outputs.logits, dim=-1)

          emotion_labels = ['anger', 'fear', 'joy', 'neutral', 'sadness', 'surprise']
          emotion_scores = {
…

          # Create emotion visualization
          emotion_viz = create_emotion_plot(emotion_scores)
+         if emotion_viz is None:
+             return "Error creating emotion visualization", None, None

+         # Create analysis summary
          summary = f"""Voice Analysis Summary:

          Speech Content:
          {transcription}
+
+         Voice Characteristics:
+         - Average Pitch: {features['pitch_mean']:.2f} Hz
+         - Pitch Variation: {features['pitch_std']:.2f} Hz
+         - Speech Rate (Tempo): {features['tempo']:.2f} BPM
+         - Voice Energy: {features['energy_mean']:.4f}
+
+         Dominant Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
          """

+         return summary, emotion_viz, feature_viz

      except Exception as e:
+         error_msg = f"Error in audio analysis: {str(e)}"
          print(error_msg)
+         return error_msg, None, None

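Because the Gradio interface below simply forwards the uploaded or recorded file to analyze_audio, the function can also be driven from a plain script; the path is a placeholder, and this assumes the module-level load_models() call has already populated the Whisper and emotion models:

summary, emotion_html, feature_html = analyze_audio("example_recording.wav")  # placeholder path
print(summary)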
  # Load models at startup
  print("Initializing application...")
…
      ),
      outputs=[
          gr.Textbox(label="Analysis Summary", lines=10),
+         gr.HTML(label="Emotion Analysis"),
+         gr.HTML(label="Voice Feature Analysis")
      ],
+     title="Voice Analysis System",
      description="""
+     This application analyzes voice recordings to extract various characteristics:

+     1. Voice Features:
+        - Pitch analysis
+        - Energy patterns
+        - Speech rate
+        - Voice quality

+     2. Emotional Content:
         - Emotion detection
+        - Emotional intensity

+     3. Speech Content:
         - Text transcription

      Upload an audio file or record directly through your microphone.
      """,
      examples=None,
      cache_examples=False
  )
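The diff stops at the gr.Interface(...) constructor, so the launch call is not visible in this commit; on a typical Space the entry point would look roughly like this (an assumption, not shown in the diff):

if __name__ == "__main__":
    demo.launch()   # assumed entry point; not part of the visible diff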