mj-new committed
Commit
32c749f
1 Parent(s): 60f84a1

updated leaderboard - added poleval test sets

Files changed (2)
  1. app.py +296 -5
  2. constants.py +2 -0
app.py CHANGED
@@ -1,7 +1,7 @@
  import os
  import streamlit as st
  import pandas as pd
- from constants import BIGOS_INFO, PELCRA_INFO, ANALYSIS_INFO, ABOUT_INFO, INSPECTION_INFO, COMPARISON_INFO
+ from constants import BIGOS_INFO, PELCRA_INFO, POLEVAL_INFO, ANALYSIS_INFO, ABOUT_INFO, INSPECTION_INFO, COMPARISON_INFO
  from utils import read_latest_results, basic_stats_per_dimension, retrieve_asr_systems_meta_from_the_catalog, box_plot_per_dimension,box_plot_per_dimension_subsets, box_plot_per_dimension_with_colors, get_total_audio_duration, check_impact_of_normalization, calculate_wer_per_meta_category, calculate_wer_per_audio_feature
  from app_utils import calculate_height_to_display, filter_dataframe
  import matplotlib.pyplot as plt
@@ -22,7 +22,9 @@ if hf_token is None:
  # select the dataset to display results
  datasets_secret = [
  "amu-cai/pl-asr-bigos-v2-secret",
- "pelcra/pl-asr-pelcra-for-bigos-secret"]
+ "pelcra/pl-asr-pelcra-for-bigos-secret",
+ "michaljunczyk/test_A_poleval_24",
+ "michaljunczyk/test_B_poleval_24"]

  datasets_public = []
  #["amu-cai/pl-asr-bigos-synth-med"]
@@ -30,7 +32,7 @@ datasets_public = []

  st.set_page_config(layout="wide")

- about, lead_bigos, lead_pelcra, analysis, interactive_comparison = st.tabs(["About", "ASR Leaderboard - BIGOS corpora", "ASR Leaderboard - PELCRA corpora", "ASR evaluation scenarios", "Interactive comparison of ASR accuracy"])
+ about, lead_bigos, lead_pelcra, lead_poleval_a, lead_poleval_b, analysis, interactive_comparison = st.tabs(["About", "BIGOS", "PELCRA", "PolEval test-A", "PolEval test-B", "Evaluation scenarios", "Interactive dashboard"])
  # "Results inspection""Results inspection"
  # inspection
  # acknowledgements, changelog, faq, todos = st.columns(4)
@@ -366,7 +368,7 @@ with lead_bigos:
  if metric == 'Average':
  ax.set_title('Average normalization impact on all metrics')
  ax.set_xlabel('Normalization Type')
- ax.set_ylabel(f'Difference in {metric}')
+ ax.set_ylabel(f'Difference in {metric} [pp]')
  ax.grid(True)
  ax.set_xticklabels(diff_in_metrics.index, rotation=45, ha='right')
  min_val = diff_in_metrics[metric].min()
@@ -500,7 +502,292 @@ with lead_pelcra:
  if metric == 'Average':
  ax.set_title('Average normalization impact on all metrics')
  ax.set_xlabel('Normalization Type')
- ax.set_ylabel(f'Difference in {metric}')
+ ax.set_ylabel(f'Difference in {metric} [pp]')
+ ax.grid(True)
+ ax.set_xticklabels(diff_in_metrics.index, rotation=45, ha='right')
+ min_val = diff_in_metrics[metric].min()
+ ax.set_ylim([min_val * 1.1, diff_in_metrics[metric].max() * 1.1])
+
+ for bar in bars:
+ height = bar.get_height()
+ ax.annotate(f'{height:.2f}',
+ xy=(bar.get_x() + bar.get_width() / 2, height),
+ xytext=(0, -12), # 12 points vertical offset
+ textcoords="offset points",
+ ha='center', va='bottom')
+
+ # Display the plot in Streamlit
+ st.pyplot(fig)
+
+ ##################### APPENDIX #########################
+ st.header("Appendix - Full evaluation results per subset for all evaluated systems")
+ # select only the columns we want to plot
+ df_per_dataset_selected_cols = df_per_dataset_all[cols_to_select_all]
+ st.dataframe(df_per_dataset_selected_cols, hide_index=True, use_container_width=False)
+
+ with lead_poleval_a:
+ st.title("PolEval test A Leaderboard")
+ st.markdown(POLEVAL_INFO, unsafe_allow_html=True)
+
+ # configuration for tab
+ dataset = "michaljunczyk/test_A_poleval_24"
+ dataset_short_name = "PolEval test A"
+ dataset_version = "V1"
+ eval_date = "November 2024"
+ split = "test"
+ norm_type = "all"
+ ref_type = "orig"
+
+ # common, reusable part for all tabs presenting leaderboards for specific datasets
+ #### DATA LOADING AND AUGMENTATION ####
+ df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)
+
+
+ # filter only the ref_type and norm_type we want to analyze
+ df_per_sample = df_per_sample_all[(df_per_sample_all["ref_type"] == ref_type) & (df_per_sample_all["norm_type"] == norm_type)]
+ # filter only the ref_type and norm_type we want to analyze
+ df_per_dataset = df_per_dataset_all[(df_per_dataset_all["ref_type"] == ref_type) & (df_per_dataset_all["norm_type"] == norm_type)]
+
+ ##### PARAMETERS CALCULATION ####
+ evaluated_systems_list = df_per_sample["system"].unique()
+ no_of_evaluated_systems = len(evaluated_systems_list)
+ no_of_eval_subsets = len(df_per_dataset["subset"].unique())
+ no_of_test_cases = len(df_per_sample)
+ no_of_unique_recordings = len(df_per_sample["id"].unique())
+ total_audio_duration_hours = get_total_audio_duration(df_per_sample)
+ no_of_unique_speakers = len(df_per_sample["speaker_id"].unique())
+
+ df_per_dataset_with_asr_systems_meta = pd.merge(df_per_dataset, df_evaluated_systems, how="left", left_on="system", right_on="Shortname")
+
+ # MOST IMPORTANT RESULTS
+ analysis_dim = "system"
+ metric = "WER"
+ st.subheader("Leaderboard - Median {} per ASR {} across all subsets of {} dataset".format(metric, analysis_dim, dataset_short_name))
+ fig = box_plot_per_dimension_with_colors(df_per_dataset_with_asr_systems_meta, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + "[%]","System", "Type")
+ st.pyplot(fig, clear_figure=True, use_container_width=True)
+
+ st.header("Benchmark details")
+ st.markdown("**Evaluation date:** {}".format(eval_date))
+ st.markdown("**Number of evaluated system-model variants:** {}".format(no_of_evaluated_systems))
+ st.markdown("**Number of evaluated subsets:** {}".format(no_of_eval_subsets))
+ st.markdown("**Number of evaluated system-model-subsets combinations**: {}".format(len(df_per_dataset)))
+ st.markdown("**Number of unique speakers**: {}".format(no_of_unique_speakers))
+ st.markdown("**Number of unique recordings used for evaluation:** {}".format(no_of_unique_recordings))
+ st.markdown("**Total size of the dataset:** {:.2f} hours".format(total_audio_duration_hours))
+ st.markdown("**Total number of test cases (audio-hypothesis pairs):** {}".format(no_of_test_cases))
+ st.markdown("**Dataset:** {}".format(dataset))
+ st.markdown("**Dataset version:** {}".format(dataset_version))
+ st.markdown("**Split:** {}".format(split))
+ st.markdown("**Text reference type:** {}".format(ref_type))
+ st.markdown("**Normalization steps:** {}".format(norm_type))
+
+ ########### RESULTS ################
+ st.header("WER (Word Error Rate) analysis")
+ st.subheader("Average WER for the whole dataset")
+ df_wer_avg = basic_stats_per_dimension(df_per_dataset, "WER", "dataset")
+ st.dataframe(df_wer_avg)
+
+ st.subheader("Comparison of average WER for free and commercial systems")
+ df_wer_avg_free_commercial = basic_stats_per_dimension(df_per_dataset_with_asr_systems_meta, "WER", "Type")
+ st.dataframe(df_wer_avg_free_commercial)
+
+ ##################### PER SYSTEM ANALYSIS #########################
+ analysis_dim = "system"
+ metric = "WER"
+ metric2 = "CER"
+
+ st.subheader("Table showing {} and {}".format(metric, metric2))
+ df_wer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric, analysis_dim)
+ df_cer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric2, analysis_dim)
+ # merge the two dataframes, keep only one column for each metric with average values
+ df_wer_cer_per_system_from_per_dataset = pd.merge(df_wer_per_system_from_per_dataset, df_cer_per_system_from_per_dataset, on='system')
+ # drop top level of the column index
+ df_wer_cer_per_system_from_per_dataset = df_wer_cer_per_system_from_per_dataset.reset_index()
+
+ # keep columns system, avg_WER and avg_CER only
+ df_wer_cer_per_system_from_per_dataset = df_wer_cer_per_system_from_per_dataset[['system', 'avg_WER', 'avg_CER']]
+ h_df_per_system_per_dataset = calculate_height_to_display(df_wer_cer_per_system_from_per_dataset)
+
+ st.dataframe(df_wer_cer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )
+
+ ##################### PER SUBSET ANALYSIS #########################
+ analysis_dim = "subset"
+ metric = "WER"
+
+ st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
+ fig = box_plot_per_dimension_subsets(df_per_dataset, metric, analysis_dim, "{} per {} for dataset {}".format(metric, analysis_dim, dataset_short_name), analysis_dim +' of dataset ' + dataset_short_name , metric + " (%)", "system")
+ st.pyplot(fig, clear_figure=True, use_container_width=False)
+
+ ### IMPACT OF NORMALIZATION ON ERROR RATES #####
+ # Calculate the average impact of various norm_types for all datasets and systems
+ df_per_dataset_selected_cols = df_per_dataset_all[cols_to_select_all]
+ diff_in_metrics = check_impact_of_normalization(df_per_dataset_selected_cols)
+ st.subheader("Impact of normalization on WER")
+ st.dataframe(diff_in_metrics, use_container_width=False)
+
+ # Visualizing the differences in metrics graphically with data labels
+ fig, axs = plt.subplots(3, 2, figsize=(12, 12))
+ fig.subplots_adjust(hspace=0.6, wspace=0.6)
+
+ # remove the sixth subplot
+ fig.delaxes(axs[2,1])
+
+ metrics = ['SER', 'WER', 'MER', 'CER', "Average"]
+ colors = ['blue', 'orange', 'green', 'red', 'purple']
+
+ for ax, metric, color in zip(axs.flatten(), metrics, colors):
+ bars = ax.bar(diff_in_metrics.index, diff_in_metrics[metric], color=color)
+ ax.set_title(f'Normalization impact on {metric}')
+ if metric == 'Average':
+ ax.set_title('Average normalization impact on all metrics')
+ ax.set_xlabel('Normalization Type')
+ ax.set_ylabel(f'Difference in {metric} [pp]')
+ ax.grid(True)
+ ax.set_xticklabels(diff_in_metrics.index, rotation=45, ha='right')
+ min_val = diff_in_metrics[metric].min()
+ ax.set_ylim([min_val * 1.1, diff_in_metrics[metric].max() * 1.1])
+
+ for bar in bars:
+ height = bar.get_height()
+ ax.annotate(f'{height:.2f}',
+ xy=(bar.get_x() + bar.get_width() / 2, height),
+ xytext=(0, -12), # 12 points vertical offset
+ textcoords="offset points",
+ ha='center', va='bottom')
+
+ # Display the plot in Streamlit
+ st.pyplot(fig)
+
+ ##################### APPENDIX #########################
+ st.header("Appendix - Full evaluation results per subset for all evaluated systems")
+ # select only the columns we want to plot
+ df_per_dataset_selected_cols = df_per_dataset_all[cols_to_select_all]
+ st.dataframe(df_per_dataset_selected_cols, hide_index=True, use_container_width=False)
+
+ with lead_poleval_b:
+ st.title("PolEval test B Leaderboard")
+ st.markdown(POLEVAL_INFO, unsafe_allow_html=True)
+
+ # configuration for tab
+ dataset = "michaljunczyk/test_B_poleval_24"
+ dataset_short_name = "PolEval test B"
+ dataset_version = "V1"
+ eval_date = "November 2024"
+ split = "test"
+ norm_type = "all"
+ ref_type = "orig"
+
+ # common, reusable part for all tabs presenting leaderboards for specific datasets
+ #### DATA LOADING AND AUGMENTATION ####
+ df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)
+
+
+ # filter only the ref_type and norm_type we want to analyze
+ df_per_sample = df_per_sample_all[(df_per_sample_all["ref_type"] == ref_type) & (df_per_sample_all["norm_type"] == norm_type)]
+ # filter only the ref_type and norm_type we want to analyze
+ df_per_dataset = df_per_dataset_all[(df_per_dataset_all["ref_type"] == ref_type) & (df_per_dataset_all["norm_type"] == norm_type)]
+
+ ##### PARAMETERS CALCULATION ####
+ evaluated_systems_list = df_per_sample["system"].unique()
+ no_of_evaluated_systems = len(evaluated_systems_list)
+ no_of_eval_subsets = len(df_per_dataset["subset"].unique())
+ no_of_test_cases = len(df_per_sample)
+ no_of_unique_recordings = len(df_per_sample["id"].unique())
+ total_audio_duration_hours = get_total_audio_duration(df_per_sample)
+ no_of_unique_speakers = len(df_per_sample["speaker_id"].unique())
+
+ df_per_dataset_with_asr_systems_meta = pd.merge(df_per_dataset, df_evaluated_systems, how="left", left_on="system", right_on="Shortname")
+
+ # MOST IMPORTANT RESULTS
+ analysis_dim = "system"
+ metric = "WER"
+ st.subheader("Leaderboard - Median {} per ASR {} across all subsets of {} dataset".format(metric, analysis_dim, dataset_short_name))
+ fig = box_plot_per_dimension_with_colors(df_per_dataset_with_asr_systems_meta, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + "[%]","System", "Type")
+ st.pyplot(fig, clear_figure=True, use_container_width=True)
+
+ st.header("Benchmark details")
+ st.markdown("**Evaluation date:** {}".format(eval_date))
+ st.markdown("**Number of evaluated system-model variants:** {}".format(no_of_evaluated_systems))
+ st.markdown("**Number of evaluated subsets:** {}".format(no_of_eval_subsets))
+ st.markdown("**Number of evaluated system-model-subsets combinations**: {}".format(len(df_per_dataset)))
+ st.markdown("**Number of unique speakers**: {}".format(no_of_unique_speakers))
+ st.markdown("**Number of unique recordings used for evaluation:** {}".format(no_of_unique_recordings))
+ st.markdown("**Total size of the dataset:** {:.2f} hours".format(total_audio_duration_hours))
+ st.markdown("**Total number of test cases (audio-hypothesis pairs):** {}".format(no_of_test_cases))
+ st.markdown("**Dataset:** {}".format(dataset))
+ st.markdown("**Dataset version:** {}".format(dataset_version))
+ st.markdown("**Split:** {}".format(split))
+ st.markdown("**Text reference type:** {}".format(ref_type))
+ st.markdown("**Normalization steps:** {}".format(norm_type))
+
+ ########### RESULTS ################
+ st.header("WER (Word Error Rate) analysis")
+ st.subheader("Average WER for the whole dataset")
+ df_wer_avg = basic_stats_per_dimension(df_per_dataset, "WER", "dataset")
+ st.dataframe(df_wer_avg)
+
+ st.subheader("Comparison of average WER for free and commercial systems")
+ df_wer_avg_free_commercial = basic_stats_per_dimension(df_per_dataset_with_asr_systems_meta, "WER", "Type")
+ st.dataframe(df_wer_avg_free_commercial)
+
+ ##################### PER SYSTEM ANALYSIS #########################
+ analysis_dim = "system"
+ metric = "WER"
+ metric2 = "CER"
+
+ st.subheader("Table showing {} and {}".format(metric, metric2))
+ df_wer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric, analysis_dim)
+ df_cer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric2, analysis_dim)
+ # merge the two dataframes, keep only one column for each metric with average values
+ df_wer_cer_per_system_from_per_dataset = pd.merge(df_wer_per_system_from_per_dataset, df_cer_per_system_from_per_dataset, on='system')
+ # drop top level of the column index
+ df_wer_cer_per_system_from_per_dataset = df_wer_cer_per_system_from_per_dataset.reset_index()
+
+ # keep columns system, avg_WER and avg_CER only
+ df_wer_cer_per_system_from_per_dataset = df_wer_cer_per_system_from_per_dataset[['system', 'avg_WER', 'avg_CER']]
+ h_df_per_system_per_dataset = calculate_height_to_display(df_wer_cer_per_system_from_per_dataset)
+
+ st.dataframe(df_wer_cer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )
+
+
+ ##################### PER SUBSET ANALYSIS #########################
+ analysis_dim = "subset"
+ metric = "WER"
+ st.subheader("Table showing {} per {} sorted by median values".format(metric, analysis_dim))
+ df_wer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric, analysis_dim)
+ h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
+ st.dataframe(df_wer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )
+
+ st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
+ fig = box_plot_per_dimension_subsets(df_per_dataset, metric, analysis_dim, "{} per {} for dataset {}".format(metric, analysis_dim, dataset_short_name), analysis_dim +' of dataset ' + dataset_short_name , metric + " (%)", "system")
+ st.pyplot(fig, clear_figure=True, use_container_width=True)
+
+ ### IMPACT OF NORMALIZATION ON ERROR RATES #####
+ # Calculate the average impact of various norm_types for all datasets and systems
+ df_per_dataset_selected_cols = df_per_dataset_all[cols_to_select_all]
+ diff_in_metrics = check_impact_of_normalization(df_per_dataset_selected_cols)
+ st.subheader("Impact of normalization on WER")
+ st.dataframe(diff_in_metrics, use_container_width=False)
+
+ # Visualizing the differences in metrics graphically with data labels
+ fig, axs = plt.subplots(3, 2, figsize=(12, 12))
+ fig.subplots_adjust(hspace=0.6, wspace=0.6)
+
+ # remove the sixth subplot
+ fig.delaxes(axs[2,1])
+
+ metrics = ['SER', 'WER', 'MER', 'CER', "Average"]
+ colors = ['blue', 'orange', 'green', 'red', 'purple']
+
+ for ax, metric, color in zip(axs.flatten(), metrics, colors):
+ bars = ax.bar(diff_in_metrics.index, diff_in_metrics[metric], color=color)
+ ax.set_title(f'Normalization impact on {metric}')
+ if metric == 'Average':
+ ax.set_title('Average normalization impact on all metrics')
+ ax.set_xlabel('Normalization Type')
+ ax.set_ylabel(f'Difference in {metric} [pp]')
  ax.grid(True)
  ax.set_xticklabels(diff_in_metrics.index, rotation=45, ha='right')
  min_val = diff_in_metrics[metric].min()
@@ -532,6 +819,10 @@ with analysis:
  dataset_short_name = "BIGOS"
  elif dataset == "pelcra/pl-asr-pelcra-for-bigos-secret":
  dataset_short_name = "PELCRA"
+ elif dataset == "michaljunczyk/test_A_poleval_24":
+ dataset_short_name = "PolEval test A"
+ elif dataset == "michaljunczyk/test_B_poleval_24":
+ dataset_short_name = "PolEval test B"
  else:
  dataset_short_name = "UNKNOWN"

constants.py CHANGED
@@ -17,6 +17,8 @@ Learn more [here](https://huggingface.co/datasets/amu-cai/pl-asr-bigos-v2)"
  PELCRA_INFO = "PELCRA for BIGOS is the subset of speech corpora created by the [PELCRA group](http://pelcra.pl/new/), curated for the BIGOS benchmark by the [AMU-CAI team](https://huggingface.co/amu-cai). \
  Learn more [here](https://huggingface.co/datasets/pelcra/pl-asr-pelcra-for-bigos)"

+ POLEVAL_INFO = "PolEval is the test set used for the Polish ASR challenge. It consists of recordings from the BIGOS and PELCRA datasets. For details see: [PolEval 2024 - Task 3 - ASR](https://poleval.pl/tasks/task3)"
+
  ANALYSIS_INFO = "Here we examine ASR accuracy depending on the system type, model size, audio duration, speaking rate and speaker characteristics (age and gender)"

  INSPECTION_INFO = "Here you can inspect the performance of specific ASR systems on the specific audio samples"
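
The two new PolEval tabs added to app.py repeat essentially the same leaderboard body that the BIGOS and PELCRA tabs already contain (load the latest results, filter by reference and normalization type, compute summary statistics, draw the WER box plot). Below is a minimal sketch of how that shared body could be factored into one helper; the function name render_leaderboard_tab and its signature are illustrative assumptions and not part of this commit, while read_latest_results, basic_stats_per_dimension, box_plot_per_dimension_with_colors, get_total_audio_duration, df_evaluated_systems and codename_to_shortname_mapping are the helpers and module-level objects app.py already uses.

import streamlit as st
import pandas as pd

from utils import (read_latest_results, basic_stats_per_dimension,
                   box_plot_per_dimension_with_colors, get_total_audio_duration)


def render_leaderboard_tab(tab, dataset, dataset_short_name, info_text,
                           df_evaluated_systems, codename_to_shortname_mapping,
                           split="test", norm_type="all", ref_type="orig"):
    # Hypothetical helper (not in this commit): renders one leaderboard tab the
    # same way the per-dataset blocks in app.py do.
    with tab:
        st.title("{} Leaderboard".format(dataset_short_name))
        st.markdown(info_text, unsafe_allow_html=True)

        # load the latest results and keep only the chosen reference/normalization type
        df_per_sample_all, df_per_dataset_all = read_latest_results(
            dataset, split, codename_to_shortname_mapping)
        df_per_sample = df_per_sample_all[
            (df_per_sample_all["ref_type"] == ref_type)
            & (df_per_sample_all["norm_type"] == norm_type)]
        df_per_dataset = df_per_dataset_all[
            (df_per_dataset_all["ref_type"] == ref_type)
            & (df_per_dataset_all["norm_type"] == norm_type)]

        # attach system metadata (e.g. free vs. commercial) for coloring the box plot
        df_meta = pd.merge(df_per_dataset, df_evaluated_systems, how="left",
                           left_on="system", right_on="Shortname")

        st.subheader("Leaderboard - Median WER per ASR system across all subsets of {}".format(dataset_short_name))
        fig = box_plot_per_dimension_with_colors(
            df_meta, "WER", "system", "WER per system", "system", "WER [%]", "System", "Type")
        st.pyplot(fig, clear_figure=True, use_container_width=True)

        st.header("Benchmark details")
        st.markdown("**Total size of the dataset:** {:.2f} hours".format(
            get_total_audio_duration(df_per_sample)))
        st.subheader("Average WER for the whole dataset")
        st.dataframe(basic_stats_per_dimension(df_per_dataset, "WER", "dataset"))


# Usage with the objects defined in app.py:
# render_leaderboard_tab(lead_poleval_a, "michaljunczyk/test_A_poleval_24", "PolEval test A",
#                        POLEVAL_INFO, df_evaluated_systems, codename_to_shortname_mapping)
# render_leaderboard_tab(lead_poleval_b, "michaljunczyk/test_B_poleval_24", "PolEval test B",
#                        POLEVAL_INFO, df_evaluated_systems, codename_to_shortname_mapping)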