Spaces:
Runtime error
Commit · c023426
Parent(s): 427cb0b
Updated visualisations
app.py CHANGED
@@ -70,6 +70,7 @@ def get_syllables_per_second(audio_bytes):
     # Perform inference
     with torch.no_grad():
         logits = model(input_values).logits
+        probabilities = torch.softmax(logits, dim=-1)
         predicted_ids = torch.argmax(logits, dim=-1)
         transcription = processor.batch_decode(predicted_ids, output_char_offsets=True)
         offsets = transcription["char_offsets"]
@@ -138,12 +139,197 @@ def get_syllables_per_second(audio_bytes):
     plt.savefig('mel_spectrogram.png')
     plt.close()
 
-    #
-
-
-
-
-
+    # Calculate evenness and distinctness metrics
+    syllable_stats = {}
+    for syllable in ['p', 't', 'k']:
+        syllable_times = [offset for offset in syllable_offsets if offset['char'] == syllable]
+
+        if len(syllable_times) > 1:
+            intervals = [(syllable_times[i+1]['start_offset'] - syllable_times[i]['start_offset']) * 0.02
+                         for i in range(len(syllable_times)-1)]
+
+            mean_interval = np.mean(intervals)
+            std_interval = np.std(intervals)
+            cv = (std_interval / mean_interval) if mean_interval > 0 else 0
+
+            # Debug prints for confidence calculation
+            syllable_idx = processor.tokenizer.convert_tokens_to_ids(syllable)
+            print(f"\nProcessing syllable: {syllable} (token_id: {syllable_idx})")
+            confidence_scores = []
+
+            # Only look at time windows where this syllable occurs
+            for offset in syllable_times:
+                # Convert time offset to model timestep index
+                time_idx = int(offset['start_offset'])
+                prob = probabilities[0][time_idx]
+
+                # Get top 5 predictions and their indices
+                top_k_values, top_k_indices = torch.topk(prob, k=5)
+
+                print(f"\nTimestep {time_idx} (time: {time_idx * 0.02:.3f}s):")
+                print(f"Top-5 indices: {top_k_indices.tolist()}")
+                print(f"Top-5 values: {top_k_values.tolist()}")
+
+                if syllable_idx in top_k_indices:
+                    syllable_prob = prob[syllable_idx]
+                    relative_confidence = syllable_prob / top_k_values.sum()
+                    print(f"Syllable probability: {syllable_prob:.4f}")
+                    print(f"Relative confidence: {relative_confidence:.4f}")
+                    confidence_scores.append(float(relative_confidence))
+                else:
+                    confidence_scores.append(0.0)
+                    print("Syllable not in top-5")
+
+            # Calculate mean confidence only from timesteps where syllable occurs
+            mean_confidence = np.mean(confidence_scores) if confidence_scores else 0.0
+            print(f"\nFinal confidence scores for {syllable}:")
+            print(f"Scores at syllable timestamps: {confidence_scores}")
+            print(f"Mean confidence: {mean_confidence:.4f}")
+
+            syllable_stats[syllable] = {
+                'count': len(syllable_times),
+                'mean_interval': mean_interval,
+                'std_interval': std_interval,
+                'cv': cv,
+                'mean_confidence': mean_confidence,
+                'intervals': intervals,
+                'confidence_scores': confidence_scores
+            }
+
+    # Create visualization for evenness and distinctness
+    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))
+
+    # Color scheme
+    colors = {
+        'p': '#2E86C1',  # Blue
+        't': '#28B463',  # Green
+        'k': '#E74C3C'   # Red
+    }
+
+    # Plot 1: Evenness Analysis
+    for syllable, stats in syllable_stats.items():
+        if len(stats['intervals']) > 0:
+            # Calculate normalized intervals (deviation from mean)
+            mean_interval = stats['mean_interval']
+            normalized_intervals = [(interval - mean_interval) / mean_interval * 100
+                                    for interval in stats['intervals']]
+
+            # Plot normalized intervals
+            x = range(len(normalized_intervals))
+            ax1.plot(x, normalized_intervals, 'o-',
+                     label=f'{syllable} (CV={stats["cv"]:.2f})',
+                     color=colors[syllable], linewidth=2, markersize=8)
+
+            # Add individual point annotations
+            for i, val in enumerate(normalized_intervals):
+                ax1.annotate(f'{val:.1f}%',
+                             (i, val),
+                             xytext=(0, 10),
+                             textcoords='offset points',
+                             ha='center',
+                             fontsize=8)
+
+    # Add reference zones for evenness
+    ax1.axhspan(-10, 10, color='#2ECC71', alpha=0.2, label='Highly Regular (±10%)')
+    ax1.axhspan(-30, -10, color='#F1C40F', alpha=0.2, label='Moderately Regular')
+    ax1.axhspan(10, 30, color='#F1C40F', alpha=0.2)
+    ax1.axhspan(-50, -30, color='#E74C3C', alpha=0.2, label='Irregular')
+    ax1.axhspan(30, 50, color='#E74C3C', alpha=0.2)
+
+    ax1.set_xlabel('Repetition Number', fontsize=12)
+    ax1.set_ylabel('Deviation from Mean Interval (%)', fontsize=12)
+    ax1.set_title('Timing Evenness Analysis\n(Deviation from Mean Interval)', fontsize=14, pad=20)
+    ax1.grid(True, linestyle='--', alpha=0.7)
+    ax1.legend(loc='upper right', bbox_to_anchor=(1.15, 1))
+    ax1.set_ylim(-50, 50)
+
+    # Plot 2: Distinctness Analysis
+    for syllable, stats in syllable_stats.items():
+        if len(stats['confidence_scores']) > 0:
+            x = range(len(stats['confidence_scores']))
+
+            # Create gradient colors based on confidence scores
+            colors_array = []
+            for score in stats['confidence_scores']:
+                if score > 0.7:
+                    colors_array.append('#2ECC71')  # Green for high confidence
+                elif score > 0.4:
+                    colors_array.append('#F1C40F')  # Yellow for medium confidence
+                else:
+                    colors_array.append('#E74C3C')  # Red for low confidence
+
+            # Plot bars with gradient colors
+            bars = ax2.bar(x, stats['confidence_scores'],
+                           label=f'{syllable} (mean={stats["mean_confidence"]:.2f})',
+                           color=colors_array, alpha=0.7)
+
+            # Add value labels on top of bars
+            for bar in bars:
+                height = bar.get_height()
+                ax2.text(bar.get_x() + bar.get_width()/2., height,
+                         f'{height:.2f}',
+                         ha='center', va='bottom', fontsize=8)
+
+    # Add reference lines for distinctness
+    ax2.axhline(y=0.7, color='#2ECC71', linestyle='--', alpha=0.5, label='High Distinctness')
+    ax2.axhline(y=0.4, color='#F1C40F', linestyle='--', alpha=0.5, label='Moderate Distinctness')
+
+    ax2.set_xlabel('Syllable Occurrence', fontsize=12)
+    ax2.set_ylabel('Articulation Distinctness Score', fontsize=12)
+    ax2.set_title('Articulation Distinctness Analysis\n(Higher Score = Clearer Articulation)', fontsize=14, pad=20)
+    ax2.grid(True, linestyle='--', alpha=0.7)
+    ax2.legend(loc='upper right', bbox_to_anchor=(1.15, 1))
+    ax2.set_ylim(0, 1)
+
+    # Overall layout adjustments
+    plt.tight_layout()
+    plt.subplots_adjust(right=0.85)  # Make room for legends
+    plt.savefig('articulation_analysis.png', dpi=300, bbox_inches='tight')
+    plt.close()
+
+    # Update results text with new metrics
+    results_text = f"""Syllables per Second Analysis
+Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+
+SPEED MEASUREMENTS
+----------------
+- Overall syllables per second: {syllables_per_second:.2f}
+- Total number of syllables: {syllable_count}
+- Total duration: {audio_duration:.2f} seconds
+
+Detailed Analysis by Syllable:"""
+
+    for syllable, stats in syllable_stats.items():
+        results_text += f"""
+
+{syllable.upper()} Syllable Analysis:
+Count: {stats['count']} occurrences
+
+EVENNESS MEASUREMENTS (timing regularity)
+--------------------------------
+- Mean interval between repetitions: {stats['mean_interval']:.3f} seconds
+- Variation in intervals (std dev): {stats['std_interval']:.3f} seconds
+- Coefficient of variation: {stats['cv']:.3f}
+(Lower CV = more even timing, Higher CV = more irregular timing)
+* CV < 0.1: Highly regular
+* CV 0.1-0.3: Moderately regular
+* CV > 0.3: Irregular
+
+DISTINCTNESS MEASUREMENTS (articulation clarity)
+------------------------------------
+- Mean articulation confidence: {stats['mean_confidence']:.3f}
+(Higher values indicate clearer articulation)
+* Values closer to 1.0 indicate very distinct pronunciation
+* Values closer to 0.0 indicate less distinct pronunciation
+- Confidence scores for each occurrence: {stats['confidence_scores']}
+
+RAW MEASUREMENTS
+--------------
+- All intervals between repetitions (seconds): {stats['intervals']}"""
+
+    # Print the results text to verify
+    print("\nFinal Results Text:")
+    print(results_text)
 
     # Create results directory if it doesn't exist
     os.makedirs('results', exist_ok=True)
@@ -154,12 +340,6 @@ def get_syllables_per_second(audio_bytes):
         f.write(audio_bytes)
 
     # Save syllables per second to text file
-    results_text = f"""Syllables per Second Analysis
-Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
-Syllables per second: {syllables_per_second:.2f}
-Number of syllables: {syllable_count}
-Total duration: {audio_duration:.2f} seconds
-"""
    with open('results/analysis_results.txt', 'w') as f:
         f.write(results_text)
 
@@ -168,6 +348,7 @@ Total duration: {audio_duration:.2f} seconds
     with zipfile.ZipFile(zip_path, 'w') as zipf:
         zipf.write('syllables_per_second.png')
         zipf.write('mel_spectrogram.png')
+        zipf.write('articulation_analysis.png')
         zipf.write(audio_path)
         zipf.write('results/analysis_results.txt')
 
@@ -181,6 +362,23 @@ Total duration: {audio_duration:.2f} seconds
         help="Download a zip file containing the audio, visualizations, and analysis results"
     )
 
+    # Display all visualizations
+    col1, col2 = st.columns(2)
+    with col1:
+        st.image('syllables_per_second.png')
+        st.image('articulation_analysis.png')
+    with col2:
+        st.image('mel_spectrogram.png')
+
+    # Display detailed metrics
+    st.write("### Detailed Analysis")
+    for syllable, stats in syllable_stats.items():
+        st.write(f"\n**{syllable.upper()} Syllable:**")
+        st.write(f"- Count: {stats['count']}")
+        st.write(f"- Mean interval: {stats['mean_interval']:.3f} seconds")
+        st.write(f"- Coefficient of variation: {stats['cv']:.3f}")
+        st.write(f"- Mean articulation confidence: {stats['mean_confidence']:.3f}")
+
     return syllables_per_second
 
 
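Note on the evenness metric: the intervals in this diff are computed by treating each start_offset returned by processor.batch_decode(..., output_char_offsets=True) as a CTC frame index and multiplying by 0.02, which assumes a Wav2Vec2-style output stride of roughly 20 ms per frame. A minimal standalone sketch of the same interval / coefficient-of-variation calculation (the interval_cv helper and the example offsets below are illustrative, not part of app.py):

import numpy as np

FRAME_STRIDE_S = 0.02  # assumed CTC frame stride (~20 ms per output step)

def interval_cv(char_offsets, char):
    """Coefficient of variation of the gaps between repetitions of `char`.

    `char_offsets` is a list of {'char', 'start_offset', 'end_offset'} dicts
    as produced with output_char_offsets=True. Lower CV = more even timing.
    Returns 0.0 when there are fewer than two occurrences.
    """
    starts = [o["start_offset"] * FRAME_STRIDE_S for o in char_offsets if o["char"] == char]
    if len(starts) < 2:
        return 0.0
    intervals = np.diff(starts)
    mean = intervals.mean()
    return float(intervals.std() / mean) if mean > 0 else 0.0

# Hypothetical offsets: 'p' articulated at frames 10, 60, 112 (0.20 s, 1.20 s, 2.24 s)
example = [{"char": "p", "start_offset": f, "end_offset": f + 3} for f in (10, 60, 112)]
print(f"CV for 'p': {interval_cv(example, 'p'):.3f}")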
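Note on the distinctness metric: the confidence added in this commit is a relative score, the softmax probability of the syllable's token at the frame where the syllable starts divided by the sum of the top-5 probabilities at that frame, with 0.0 assigned when the token is not in the top-5. A self-contained sketch of that logic (the distinctness_score helper and the toy probability vector are illustrative, not part of app.py):

import torch

def distinctness_score(frame_probs, token_id, k=5):
    """Relative-confidence score mirroring the commit's distinctness logic.

    `frame_probs` is the softmaxed logit vector for a single CTC frame
    (shape [vocab_size]). Returns p(token) / sum(top-k probs) when the token
    is among the top-k predictions for that frame, else 0.0.
    """
    top_values, top_indices = torch.topk(frame_probs, k=k)
    if token_id not in top_indices:
        return 0.0
    return float(frame_probs[token_id] / top_values.sum())

# Hypothetical frame: a 32-token vocabulary where token 7 clearly dominates
probs = torch.full((32,), 0.01)
probs[7] = 0.69
probs = probs / probs.sum()
print(f"distinctness: {distinctness_score(probs, token_id=7):.3f}")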