Spaces:
Runtime error
Commit · c023426
Parent(s): 427cb0b
Updated visualisations
app.py CHANGED
@@ -70,6 +70,7 @@ def get_syllables_per_second(audio_bytes):
     # Perform inference
     with torch.no_grad():
         logits = model(input_values).logits
+        probabilities = torch.softmax(logits, dim=-1)
         predicted_ids = torch.argmax(logits, dim=-1)
         transcription = processor.batch_decode(predicted_ids, output_char_offsets=True)
         offsets = transcription["char_offsets"]
@@ -138,12 +139,197 @@ def get_syllables_per_second(audio_bytes):
     plt.savefig('mel_spectrogram.png')
     plt.close()
 
-    #
-
-
-
-
-
+    # Calculate evenness and distinctness metrics
+    syllable_stats = {}
+    for syllable in ['p', 't', 'k']:
+        syllable_times = [offset for offset in syllable_offsets if offset['char'] == syllable]
+
+        if len(syllable_times) > 1:
+            intervals = [(syllable_times[i+1]['start_offset'] - syllable_times[i]['start_offset']) * 0.02
+                         for i in range(len(syllable_times)-1)]
+
+            mean_interval = np.mean(intervals)
+            std_interval = np.std(intervals)
+            cv = (std_interval / mean_interval) if mean_interval > 0 else 0
+
+            # Debug prints for confidence calculation
+            syllable_idx = processor.tokenizer.convert_tokens_to_ids(syllable)
+            print(f"\nProcessing syllable: {syllable} (token_id: {syllable_idx})")
+            confidence_scores = []
+
+            # Only look at time windows where this syllable occurs
+            for offset in syllable_times:
+                # Convert time offset to model timestep index
+                time_idx = int(offset['start_offset'])
+                prob = probabilities[0][time_idx]
+
+                # Get top 5 predictions and their indices
+                top_k_values, top_k_indices = torch.topk(prob, k=5)
+
+                print(f"\nTimestep {time_idx} (time: {time_idx * 0.02:.3f}s):")
+                print(f"Top-5 indices: {top_k_indices.tolist()}")
+                print(f"Top-5 values: {top_k_values.tolist()}")
+
+                if syllable_idx in top_k_indices:
+                    syllable_prob = prob[syllable_idx]
+                    relative_confidence = syllable_prob / top_k_values.sum()
+                    print(f"Syllable probability: {syllable_prob:.4f}")
+                    print(f"Relative confidence: {relative_confidence:.4f}")
+                    confidence_scores.append(float(relative_confidence))
+                else:
+                    confidence_scores.append(0.0)
+                    print("Syllable not in top-5")
+
+            # Calculate mean confidence only from timesteps where syllable occurs
+            mean_confidence = np.mean(confidence_scores) if confidence_scores else 0.0
+            print(f"\nFinal confidence scores for {syllable}:")
+            print(f"Scores at syllable timestamps: {confidence_scores}")
+            print(f"Mean confidence: {mean_confidence:.4f}")
+
+            syllable_stats[syllable] = {
+                'count': len(syllable_times),
+                'mean_interval': mean_interval,
+                'std_interval': std_interval,
+                'cv': cv,
+                'mean_confidence': mean_confidence,
+                'intervals': intervals,
+                'confidence_scores': confidence_scores
+            }
+
+    # Create visualization for evenness and distinctness
+    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))
+
+    # Color scheme
+    colors = {
+        'p': '#2E86C1',  # Blue
+        't': '#28B463',  # Green
+        'k': '#E74C3C'   # Red
+    }
+
+    # Plot 1: Evenness Analysis
+    for syllable, stats in syllable_stats.items():
+        if len(stats['intervals']) > 0:
+            # Calculate normalized intervals (deviation from mean)
+            mean_interval = stats['mean_interval']
+            normalized_intervals = [(interval - mean_interval) / mean_interval * 100
+                                    for interval in stats['intervals']]
+
+            # Plot normalized intervals
+            x = range(len(normalized_intervals))
+            ax1.plot(x, normalized_intervals, 'o-',
+                     label=f'{syllable} (CV={stats["cv"]:.2f})',
+                     color=colors[syllable], linewidth=2, markersize=8)
+
+            # Add individual point annotations
+            for i, val in enumerate(normalized_intervals):
+                ax1.annotate(f'{val:.1f}%',
+                             (i, val),
+                             xytext=(0, 10),
+                             textcoords='offset points',
+                             ha='center',
+                             fontsize=8)
+
+    # Add reference zones for evenness
+    ax1.axhspan(-10, 10, color='#2ECC71', alpha=0.2, label='Highly Regular (±10%)')
+    ax1.axhspan(-30, -10, color='#F1C40F', alpha=0.2, label='Moderately Regular')
+    ax1.axhspan(10, 30, color='#F1C40F', alpha=0.2)
+    ax1.axhspan(-50, -30, color='#E74C3C', alpha=0.2, label='Irregular')
+    ax1.axhspan(30, 50, color='#E74C3C', alpha=0.2)
+
+    ax1.set_xlabel('Repetition Number', fontsize=12)
+    ax1.set_ylabel('Deviation from Mean Interval (%)', fontsize=12)
+    ax1.set_title('Timing Evenness Analysis\n(Deviation from Mean Interval)', fontsize=14, pad=20)
+    ax1.grid(True, linestyle='--', alpha=0.7)
+    ax1.legend(loc='upper right', bbox_to_anchor=(1.15, 1))
+    ax1.set_ylim(-50, 50)
+
+    # Plot 2: Distinctness Analysis
+    for syllable, stats in syllable_stats.items():
+        if len(stats['confidence_scores']) > 0:
+            x = range(len(stats['confidence_scores']))
+
+            # Create gradient colors based on confidence scores
+            colors_array = []
+            for score in stats['confidence_scores']:
+                if score > 0.7:
+                    colors_array.append('#2ECC71')  # Green for high confidence
+                elif score > 0.4:
+                    colors_array.append('#F1C40F')  # Yellow for medium confidence
+                else:
+                    colors_array.append('#E74C3C')  # Red for low confidence
+
+            # Plot bars with gradient colors
+            bars = ax2.bar(x, stats['confidence_scores'],
+                           label=f'{syllable} (mean={stats["mean_confidence"]:.2f})',
+                           color=colors_array, alpha=0.7)
+
+            # Add value labels on top of bars
+            for bar in bars:
+                height = bar.get_height()
+                ax2.text(bar.get_x() + bar.get_width()/2., height,
+                         f'{height:.2f}',
+                         ha='center', va='bottom', fontsize=8)
+
+    # Add reference lines for distinctness
+    ax2.axhline(y=0.7, color='#2ECC71', linestyle='--', alpha=0.5, label='High Distinctness')
+    ax2.axhline(y=0.4, color='#F1C40F', linestyle='--', alpha=0.5, label='Moderate Distinctness')
+
+    ax2.set_xlabel('Syllable Occurrence', fontsize=12)
+    ax2.set_ylabel('Articulation Distinctness Score', fontsize=12)
+    ax2.set_title('Articulation Distinctness Analysis\n(Higher Score = Clearer Articulation)', fontsize=14, pad=20)
+    ax2.grid(True, linestyle='--', alpha=0.7)
+    ax2.legend(loc='upper right', bbox_to_anchor=(1.15, 1))
+    ax2.set_ylim(0, 1)
+
+    # Overall layout adjustments
+    plt.tight_layout()
+    plt.subplots_adjust(right=0.85)  # Make room for legends
+    plt.savefig('articulation_analysis.png', dpi=300, bbox_inches='tight')
+    plt.close()
+
+    # Update results text with new metrics
+    results_text = f"""Syllables per Second Analysis
+Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+
+SPEED MEASUREMENTS
+----------------
+- Overall syllables per second: {syllables_per_second:.2f}
+- Total number of syllables: {syllable_count}
+- Total duration: {audio_duration:.2f} seconds
+
+Detailed Analysis by Syllable:"""
+
+    for syllable, stats in syllable_stats.items():
+        results_text += f"""
+
+{syllable.upper()} Syllable Analysis:
+Count: {stats['count']} occurrences
+
+EVENNESS MEASUREMENTS (timing regularity)
+--------------------------------
+- Mean interval between repetitions: {stats['mean_interval']:.3f} seconds
+- Variation in intervals (std dev): {stats['std_interval']:.3f} seconds
+- Coefficient of variation: {stats['cv']:.3f}
+(Lower CV = more even timing, Higher CV = more irregular timing)
+* CV < 0.1: Highly regular
+* CV 0.1-0.3: Moderately regular
+* CV > 0.3: Irregular
+
+DISTINCTNESS MEASUREMENTS (articulation clarity)
+------------------------------------
+- Mean articulation confidence: {stats['mean_confidence']:.3f}
+(Higher values indicate clearer articulation)
+* Values closer to 1.0 indicate very distinct pronunciation
+* Values closer to 0.0 indicate less distinct pronunciation
+- Confidence scores for each occurrence: {stats['confidence_scores']}
+
+RAW MEASUREMENTS
+--------------
+- All intervals between repetitions (seconds): {stats['intervals']}"""
+
+    # Print the results text to verify
+    print("\nFinal Results Text:")
+    print(results_text)
 
     # Create results directory if it doesn't exist
     os.makedirs('results', exist_ok=True)
@@ -154,12 +340,6 @@ def get_syllables_per_second(audio_bytes):
         f.write(audio_bytes)
 
     # Save syllables per second to text file
-    results_text = f"""Syllables per Second Analysis
-Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
-Syllables per second: {syllables_per_second:.2f}
-Number of syllables: {syllable_count}
-Total duration: {audio_duration:.2f} seconds
-"""
    with open('results/analysis_results.txt', 'w') as f:
         f.write(results_text)
 
@@ -168,6 +348,7 @@ Total duration: {audio_duration:.2f} seconds
     with zipfile.ZipFile(zip_path, 'w') as zipf:
         zipf.write('syllables_per_second.png')
         zipf.write('mel_spectrogram.png')
+        zipf.write('articulation_analysis.png')
         zipf.write(audio_path)
         zipf.write('results/analysis_results.txt')
 
@@ -181,6 +362,23 @@ Total duration: {audio_duration:.2f} seconds
         help="Download a zip file containing the audio, visualizations, and analysis results"
     )
 
+    # Display all visualizations
+    col1, col2 = st.columns(2)
+    with col1:
+        st.image('syllables_per_second.png')
+        st.image('articulation_analysis.png')
+    with col2:
+        st.image('mel_spectrogram.png')
+
+    # Display detailed metrics
+    st.write("### Detailed Analysis")
+    for syllable, stats in syllable_stats.items():
+        st.write(f"\n**{syllable.upper()} Syllable:**")
+        st.write(f"- Count: {stats['count']}")
+        st.write(f"- Mean interval: {stats['mean_interval']:.3f} seconds")
+        st.write(f"- Coefficient of variation: {stats['cv']:.3f}")
+        st.write(f"- Mean articulation confidence: {stats['mean_confidence']:.3f}")
+
     return syllables_per_second
 
 
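Note on the evenness metric: the intervals in this diff are computed by treating each start_offset returned by processor.batch_decode(..., output_char_offsets=True) as a CTC frame index and multiplying by 0.02, which assumes a Wav2Vec2-style output stride of roughly 20 ms per frame. A minimal standalone sketch of the same interval / coefficient-of-variation calculation (the interval_cv helper and the example offsets below are illustrative, not part of app.py):

import numpy as np

FRAME_STRIDE_S = 0.02  # assumed CTC frame stride (~20 ms per output step)

def interval_cv(char_offsets, char):
    """Coefficient of variation of the gaps between repetitions of `char`.

    `char_offsets` is a list of {'char', 'start_offset', 'end_offset'} dicts
    as produced with output_char_offsets=True. Lower CV = more even timing.
    Returns 0.0 when there are fewer than two occurrences.
    """
    starts = [o["start_offset"] * FRAME_STRIDE_S for o in char_offsets if o["char"] == char]
    if len(starts) < 2:
        return 0.0
    intervals = np.diff(starts)
    mean = intervals.mean()
    return float(intervals.std() / mean) if mean > 0 else 0.0

# Hypothetical offsets: 'p' articulated at frames 10, 60, 112 (0.20 s, 1.20 s, 2.24 s)
example = [{"char": "p", "start_offset": f, "end_offset": f + 3} for f in (10, 60, 112)]
print(f"CV for 'p': {interval_cv(example, 'p'):.3f}")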
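Note on the distinctness metric: the confidence added in this commit is a relative score, the softmax probability of the syllable's token at the frame where the syllable starts divided by the sum of the top-5 probabilities at that frame, with 0.0 assigned when the token is not in the top-5. A self-contained sketch of that logic (the distinctness_score helper and the toy probability vector are illustrative, not part of app.py):

import torch

def distinctness_score(frame_probs, token_id, k=5):
    """Relative-confidence score mirroring the commit's distinctness logic.

    `frame_probs` is the softmaxed logit vector for a single CTC frame
    (shape [vocab_size]). Returns p(token) / sum(top-k probs) when the token
    is among the top-k predictions for that frame, else 0.0.
    """
    top_values, top_indices = torch.topk(frame_probs, k=k)
    if token_id not in top_indices:
        return 0.0
    return float(frame_probs[token_id] / top_values.sum())

# Hypothetical frame: a 32-token vocabulary where token 7 clearly dominates
probs = torch.full((32,), 0.01)
probs[7] = 0.69
probs = probs / probs.sum()
print(f"distinctness: {distinctness_score(probs, token_id=7):.3f}")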