birgermoell committed on
Commit
c023426
·
1 Parent(s): 427cb0b

Updated visualisations

Browse files
Files changed (1) hide show
  1. app.py +210 -12
app.py CHANGED
@@ -70,6 +70,7 @@ def get_syllables_per_second(audio_bytes):
70
  # Perform inference
71
  with torch.no_grad():
72
  logits = model(input_values).logits
 
73
  predicted_ids = torch.argmax(logits, dim=-1)
74
  transcription = processor.batch_decode(predicted_ids, output_char_offsets=True)
75
  offsets = transcription["char_offsets"]
@@ -138,12 +139,197 @@ def get_syllables_per_second(audio_bytes):
138
  plt.savefig('mel_spectrogram.png')
139
  plt.close()
140
 
141
- # Display both visualizations side by side
142
- col1, col2 = st.columns(2)
143
- with col1:
144
- st.image('syllables_per_second.png')
145
- with col2:
146
- st.image('mel_spectrogram.png')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
  # Create results directory if it doesn't exist
149
  os.makedirs('results', exist_ok=True)
@@ -154,12 +340,6 @@ def get_syllables_per_second(audio_bytes):
154
  f.write(audio_bytes)
155
 
156
  # Save syllables per second to text file
157
- results_text = f"""Syllables per Second Analysis
158
- Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
159
- Syllables per second: {syllables_per_second:.2f}
160
- Number of syllables: {syllable_count}
161
- Total duration: {audio_duration:.2f} seconds
162
- """
163
  with open('results/analysis_results.txt', 'w') as f:
164
  f.write(results_text)
165
 
@@ -168,6 +348,7 @@ Total duration: {audio_duration:.2f} seconds
168
  with zipfile.ZipFile(zip_path, 'w') as zipf:
169
  zipf.write('syllables_per_second.png')
170
  zipf.write('mel_spectrogram.png')
 
171
  zipf.write(audio_path)
172
  zipf.write('results/analysis_results.txt')
173
 
@@ -181,6 +362,23 @@ Total duration: {audio_duration:.2f} seconds
181
  help="Download a zip file containing the audio, visualizations, and analysis results"
182
  )
183
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  return syllables_per_second
185
 
186
 
 
70
  # Perform inference
71
  with torch.no_grad():
72
  logits = model(input_values).logits
73
+ probabilities = torch.softmax(logits, dim=-1)
74
  predicted_ids = torch.argmax(logits, dim=-1)
75
  transcription = processor.batch_decode(predicted_ids, output_char_offsets=True)
76
  offsets = transcription["char_offsets"]
 
139
  plt.savefig('mel_spectrogram.png')
140
  plt.close()
141
 
142
+ # Calculate evenness and distinctness metrics
143
+ syllable_stats = {}
144
+ for syllable in ['p', 't', 'k']:
145
+ syllable_times = [offset for offset in syllable_offsets if offset['char'] == syllable]
146
+
147
+ if len(syllable_times) > 1:
148
+ intervals = [(syllable_times[i+1]['start_offset'] - syllable_times[i]['start_offset']) * 0.02
149
+ for i in range(len(syllable_times)-1)]
150
+
151
+ mean_interval = np.mean(intervals)
152
+ std_interval = np.std(intervals)
153
+ cv = (std_interval / mean_interval) if mean_interval > 0 else 0
154
+
155
+ # Debug prints for confidence calculation
156
+ syllable_idx = processor.tokenizer.convert_tokens_to_ids(syllable)
157
+ print(f"\nProcessing syllable: {syllable} (token_id: {syllable_idx})")
158
+ confidence_scores = []
159
+
160
+ # Only look at time windows where this syllable occurs
161
+ for offset in syllable_times:
162
+ # Convert time offset to model timestep index
163
+ time_idx = int(offset['start_offset'])
164
+ prob = probabilities[0][time_idx]
165
+
166
+ # Get top 5 predictions and their indices
167
+ top_k_values, top_k_indices = torch.topk(prob, k=5)
168
+
169
+ print(f"\nTimestep {time_idx} (time: {time_idx * 0.02:.3f}s):")
170
+ print(f"Top-5 indices: {top_k_indices.tolist()}")
171
+ print(f"Top-5 values: {top_k_values.tolist()}")
172
+
173
+ if syllable_idx in top_k_indices:
174
+ syllable_prob = prob[syllable_idx]
175
+ relative_confidence = syllable_prob / top_k_values.sum()
176
+ print(f"Syllable probability: {syllable_prob:.4f}")
177
+ print(f"Relative confidence: {relative_confidence:.4f}")
178
+ confidence_scores.append(float(relative_confidence))
179
+ else:
180
+ confidence_scores.append(0.0)
181
+ print("Syllable not in top-5")
182
+
183
+ # Calculate mean confidence only from timesteps where syllable occurs
184
+ mean_confidence = np.mean(confidence_scores) if confidence_scores else 0.0
185
+ print(f"\nFinal confidence scores for {syllable}:")
186
+ print(f"Scores at syllable timestamps: {confidence_scores}")
187
+ print(f"Mean confidence: {mean_confidence:.4f}")
188
+
189
+ syllable_stats[syllable] = {
190
+ 'count': len(syllable_times),
191
+ 'mean_interval': mean_interval,
192
+ 'std_interval': std_interval,
193
+ 'cv': cv,
194
+ 'mean_confidence': mean_confidence,
195
+ 'intervals': intervals,
196
+ 'confidence_scores': confidence_scores
197
+ }
198
+
199
+ # Create visualization for evenness and distinctness
200
+ fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))
201
+
202
+ # Color scheme
203
+ colors = {
204
+ 'p': '#2E86C1', # Blue
205
+ 't': '#28B463', # Green
206
+ 'k': '#E74C3C' # Red
207
+ }
208
+
209
+ # Plot 1: Evenness Analysis
210
+ for syllable, stats in syllable_stats.items():
211
+ if len(stats['intervals']) > 0:
212
+ # Calculate normalized intervals (deviation from mean)
213
+ mean_interval = stats['mean_interval']
214
+ normalized_intervals = [(interval - mean_interval) / mean_interval * 100
215
+ for interval in stats['intervals']]
216
+
217
+ # Plot normalized intervals
218
+ x = range(len(normalized_intervals))
219
+ ax1.plot(x, normalized_intervals, 'o-',
220
+ label=f'{syllable} (CV={stats["cv"]:.2f})',
221
+ color=colors[syllable], linewidth=2, markersize=8)
222
+
223
+ # Add individual point annotations
224
+ for i, val in enumerate(normalized_intervals):
225
+ ax1.annotate(f'{val:.1f}%',
226
+ (i, val),
227
+ xytext=(0, 10),
228
+ textcoords='offset points',
229
+ ha='center',
230
+ fontsize=8)
231
+
232
+ # Add reference zones for evenness
233
+ ax1.axhspan(-10, 10, color='#2ECC71', alpha=0.2, label='Highly Regular (±10%)')
234
+ ax1.axhspan(-30, -10, color='#F1C40F', alpha=0.2, label='Moderately Regular')
235
+ ax1.axhspan(10, 30, color='#F1C40F', alpha=0.2)
236
+ ax1.axhspan(-50, -30, color='#E74C3C', alpha=0.2, label='Irregular')
237
+ ax1.axhspan(30, 50, color='#E74C3C', alpha=0.2)
238
+
239
+ ax1.set_xlabel('Repetition Number', fontsize=12)
240
+ ax1.set_ylabel('Deviation from Mean Interval (%)', fontsize=12)
241
+ ax1.set_title('Timing Evenness Analysis\n(Deviation from Mean Interval)', fontsize=14, pad=20)
242
+ ax1.grid(True, linestyle='--', alpha=0.7)
243
+ ax1.legend(loc='upper right', bbox_to_anchor=(1.15, 1))
244
+ ax1.set_ylim(-50, 50)
245
+
246
+ # Plot 2: Distinctness Analysis
247
+ for syllable, stats in syllable_stats.items():
248
+ if len(stats['confidence_scores']) > 0:
249
+ x = range(len(stats['confidence_scores']))
250
+
251
+ # Create gradient colors based on confidence scores
252
+ colors_array = []
253
+ for score in stats['confidence_scores']:
254
+ if score > 0.7:
255
+ colors_array.append('#2ECC71') # Green for high confidence
256
+ elif score > 0.4:
257
+ colors_array.append('#F1C40F') # Yellow for medium confidence
258
+ else:
259
+ colors_array.append('#E74C3C') # Red for low confidence
260
+
261
+ # Plot bars with gradient colors
262
+ bars = ax2.bar(x, stats['confidence_scores'],
263
+ label=f'{syllable} (mean={stats["mean_confidence"]:.2f})',
264
+ color=colors_array, alpha=0.7)
265
+
266
+ # Add value labels on top of bars
267
+ for bar in bars:
268
+ height = bar.get_height()
269
+ ax2.text(bar.get_x() + bar.get_width()/2., height,
270
+ f'{height:.2f}',
271
+ ha='center', va='bottom', fontsize=8)
272
+
273
+ # Add reference lines for distinctness
274
+ ax2.axhline(y=0.7, color='#2ECC71', linestyle='--', alpha=0.5, label='High Distinctness')
275
+ ax2.axhline(y=0.4, color='#F1C40F', linestyle='--', alpha=0.5, label='Moderate Distinctness')
276
+
277
+ ax2.set_xlabel('Syllable Occurrence', fontsize=12)
278
+ ax2.set_ylabel('Articulation Distinctness Score', fontsize=12)
279
+ ax2.set_title('Articulation Distinctness Analysis\n(Higher Score = Clearer Articulation)', fontsize=14, pad=20)
280
+ ax2.grid(True, linestyle='--', alpha=0.7)
281
+ ax2.legend(loc='upper right', bbox_to_anchor=(1.15, 1))
282
+ ax2.set_ylim(0, 1)
283
+
284
+ # Overall layout adjustments
285
+ plt.tight_layout()
286
+ plt.subplots_adjust(right=0.85) # Make room for legends
287
+ plt.savefig('articulation_analysis.png', dpi=300, bbox_inches='tight')
288
+ plt.close()
289
+
290
+ # Update results text with new metrics
291
+ results_text = f"""Syllables per Second Analysis
292
+ Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
293
+
294
+ SPEED MEASUREMENTS
295
+ ----------------
296
+ - Overall syllables per second: {syllables_per_second:.2f}
297
+ - Total number of syllables: {syllable_count}
298
+ - Total duration: {audio_duration:.2f} seconds
299
+
300
+ Detailed Analysis by Syllable:"""
301
+
302
+ for syllable, stats in syllable_stats.items():
303
+ results_text += f"""
304
+
305
+ {syllable.upper()} Syllable Analysis:
306
+ Count: {stats['count']} occurrences
307
+
308
+ EVENNESS MEASUREMENTS (timing regularity)
309
+ --------------------------------
310
+ - Mean interval between repetitions: {stats['mean_interval']:.3f} seconds
311
+ - Variation in intervals (std dev): {stats['std_interval']:.3f} seconds
312
+ - Coefficient of variation: {stats['cv']:.3f}
313
+ (Lower CV = more even timing, Higher CV = more irregular timing)
314
+ * CV < 0.1: Highly regular
315
+ * CV 0.1-0.3: Moderately regular
316
+ * CV > 0.3: Irregular
317
+
318
+ DISTINCTNESS MEASUREMENTS (articulation clarity)
319
+ ------------------------------------
320
+ - Mean articulation confidence: {stats['mean_confidence']:.3f}
321
+ (Higher values indicate clearer articulation)
322
+ * Values closer to 1.0 indicate very distinct pronunciation
323
+ * Values closer to 0.0 indicate less distinct pronunciation
324
+ - Confidence scores for each occurrence: {stats['confidence_scores']}
325
+
326
+ RAW MEASUREMENTS
327
+ --------------
328
+ - All intervals between repetitions (seconds): {stats['intervals']}"""
329
+
330
+ # Print the results text to verify
331
+ print("\nFinal Results Text:")
332
+ print(results_text)
333
 
334
  # Create results directory if it doesn't exist
335
  os.makedirs('results', exist_ok=True)
 
340
  f.write(audio_bytes)
341
 
342
  # Save syllables per second to text file
 
 
 
 
 
 
343
  with open('results/analysis_results.txt', 'w') as f:
344
  f.write(results_text)
345
 
 
348
  with zipfile.ZipFile(zip_path, 'w') as zipf:
349
  zipf.write('syllables_per_second.png')
350
  zipf.write('mel_spectrogram.png')
351
+ zipf.write('articulation_analysis.png')
352
  zipf.write(audio_path)
353
  zipf.write('results/analysis_results.txt')
354
 
 
362
  help="Download a zip file containing the audio, visualizations, and analysis results"
363
  )
364
 
365
+ # Display all visualizations
366
+ col1, col2 = st.columns(2)
367
+ with col1:
368
+ st.image('syllables_per_second.png')
369
+ st.image('articulation_analysis.png')
370
+ with col2:
371
+ st.image('mel_spectrogram.png')
372
+
373
+ # Display detailed metrics
374
+ st.write("### Detailed Analysis")
375
+ for syllable, stats in syllable_stats.items():
376
+ st.write(f"\n**{syllable.upper()} Syllable:**")
377
+ st.write(f"- Count: {stats['count']}")
378
+ st.write(f"- Mean interval: {stats['mean_interval']:.3f} seconds")
379
+ st.write(f"- Coefficient of variation: {stats['cv']:.3f}")
380
+ st.write(f"- Mean articulation confidence: {stats['mean_confidence']:.3f}")
381
+
382
  return syllables_per_second
383
 
384