Spaces:

amu-cai
/

pl-asr-survey

Running

App Files Files Community

mj-new committed on Mar 9, 2024

Commit

e283f70

1 Parent(s): d5cbb7a

Alpha version with tabs

Browse files

Files changed (8) hide show

.python-version +1 -0
__pycache__/app_utils.cpython-310.pyc +0 -0
__pycache__/contants.cpython-310.pyc +0 -0
__pycache__/utils.cpython-310.pyc +0 -0
app.py +98 -65
app_utils.py +7 -2
contants.py +16 -4
utils.py +18 -2

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ streamlit

__pycache__/app_utils.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/app_utils.cpython-310.pyc and b/__pycache__/app_utils.cpython-310.pyc differ

__pycache__/contants.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/contants.cpython-310.pyc and b/__pycache__/contants.cpython-310.pyc differ

__pycache__/utils.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/utils.cpython-310.pyc and b/__pycache__/utils.cpython-310.pyc differ

app.py CHANGED Viewed

@@ -2,100 +2,133 @@ import pandas as pd
 import streamlit as st
 from app_utils import filter_dataframe, calculate_height_to_display
-from contants import WELCOME_TEXT, CITATION_TEXT
 from utils import BASE_SUMMARY_METRICS
-from utils import  load_catalog, load_taxonomy
 from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
 import matplotlib.pyplot as plt
 import seaborn as sns
-st.set_page_config(layout="wide")
-st.title("Polish Speech Datasets Catalog and Survey analysis")
-st.write(WELCOME_TEXT)
-st.write(CITATION_TEXT)
 # Cache the dataframe so it's only loaded once
-df_cat = load_catalog()
-df_tax = load_taxonomy()
 # Filter out non available datasets
-df_cat_available = df_cat[df_cat['Available online'] == 'yes']
 # Available and free
-df_cat_available_free = df_cat[(df_cat['Available online'] == 'yes') & (df_cat['Price - non-commercial usage'] == 'free')]
 # Available and paid
-df_cat_available_paid = df_cat[(df_cat['Available online'] == 'yes') & (df_cat['Price - non-commercial usage'] != 'free')]
-# Display catalog contents
-st.dataframe(filter_dataframe(df_cat), hide_index=True, use_container_width=True)
-# Display taxonomy contents
-# Display summary statistics
-st.header("Polish ASR speech datasets summary statistics")
-df_summary_metrics = catalog_summary_statistics(df_cat)
-df_basic_stats = df_summary_metrics.loc[BASE_SUMMARY_METRICS[0:5]]
-st.dataframe(df_basic_stats, use_container_width=False)
-st.header("Speech data available across Polish ASR speech datasets")
-df_stats_audio_available = df_summary_metrics.loc[BASE_SUMMARY_METRICS[5:10]]
-st.dataframe(df_stats_audio_available, use_container_width=False)
-st.header("Transcribed data available across Polish ASR speech datasets")
-df_stats_transcribed_available = df_summary_metrics.loc[BASE_SUMMARY_METRICS[10:15]]
-st.dataframe(df_stats_transcribed_available, use_container_width=False)
-# Display distribution of datasets created per year
-st.header("Polish ASR speech datasets created in 1997-2023")
-col_groupby = ['Creation year']
-df_datasets_per_speech_type = datasets_count_and_size(df_cat, col_groupby, col_sort=col_groupby, col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
-st.dataframe(df_datasets_per_speech_type, use_container_width=False)
-st.header("Institutions contributing Polish ASR speech dataset")
-col_groupby = ['Publisher']
-df_datasets_per_publisher = datasets_count_and_size(df_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
-st.dataframe(df_datasets_per_publisher, use_container_width=False)
-st.header("Repositories hosting Polish ASR speech datasets")
-col_groupby = ['Repository']
-df_datasets_per_repo = datasets_count_and_size(df_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
-st.dataframe(df_datasets_per_repo, use_container_width=False)
-st.header("Public domain Polish ASR speech datasets")
-col_groupby = ['License', "Dataset ID"]
-df_datasets_public = datasets_count_and_size(df_cat_available_free, col_groupby, col_sort='License', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = [])
-st.dataframe(df_datasets_public, use_container_width=False)
-st.header("Commercialy available Polish ASR speech datasets")
-col_groupby = ['License', "Dataset ID"]
-df_datasets_paid = datasets_count_and_size(df_cat_available_paid, col_groupby, col_sort='License', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = [])
-st.dataframe(df_datasets_paid, use_container_width=False)
-st.header("Coverage of metadata across Polish ASR speech datasets")
-df_meta_all_flat, df_meta_all_pivot = metadata_coverage(df_cat, df_cat_available_free, df_cat_available_paid)
-st.dataframe(df_meta_all_pivot, use_container_width=False)
-# Display distribution of datasets for various speech types
-st.header("Datasets per speech type")
-col_groupby = ['Speech type']
-df_datasets_per_speech_type = datasets_count_and_size(df_cat, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
-st.dataframe(df_datasets_per_speech_type, use_container_width=False)
-# Display distribution of datasets for various speech types
-st.header("Distribution of available speech data per audio device - Public domain datasets")
-col_groupby = ['Audio device']
-df_datasets_per_device = datasets_count_and_size(df_cat_available_free, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
-st.dataframe(df_datasets_per_device, use_container_width=False)
-# Display distribution of datasets for various speech types
-st.header("Distribution of available speech data per audio device - Commercial datasets")
-col_groupby = ['Audio device']
-df_datasets_per_device = datasets_count_and_size(df_cat_available_paid, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
-st.dataframe(df_datasets_per_device, use_container_width=False)

 import streamlit as st
 from app_utils import filter_dataframe, calculate_height_to_display
+from contants import INFO_CATALOG, INFO_BENCHMARK, INFO_SURVEY, CITATION_CATALOG, CITATION_BENCHMARK, CITATION_SURVEY
 from utils import BASE_SUMMARY_METRICS
+from utils import  load_data_catalog, load_data_taxonomy, load_bench_catalog, load_bench_taxonomy
 from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
 import matplotlib.pyplot as plt
 import seaborn as sns
+st.set_page_config(layout="wide")
+# Load PL ASR data survey data
 # Cache the dataframe so it's only loaded once
+df_data_cat = load_data_catalog()
+df_data_tax = load_data_taxonomy()
 # Filter out non available datasets
+df_data_cat_available = df_data_cat[df_data_cat['Available online'] == 'yes']
 # Available and free
+df_data_cat_available_free = df_data_cat[(df_data_cat['Available online'] == 'yes') & (df_data_cat['Price - non-commercial usage'] == 'free')]
 # Available and paid
+df_data_cat_available_paid = df_data_cat[(df_data_cat['Available online'] == 'yes') & (df_data_cat['Price - non-commercial usage'] != 'free')]
+# Load PL ASR benchmarks survey data
+df_bench_cat = load_bench_catalog()
+df_bench_tax = load_bench_taxonomy()
+data_cat, data_survey, bench_cat, bench_survey = st.tabs(["PL ASR speech datasets catalog", "PL ASR speech data survey", "PL ASR benchmarks catalog", "PL ASR benchmarks survey"])
+with data_cat:
+    st.title("Polish ASR Speech Datasets Catalog")
+    st.markdown(INFO_CATALOG, unsafe_allow_html=True)
+    st.header("How to use?")
+    #    sent = st.text_area("Text", WELCOME_TEXT, height = 275)
+    st.header("How to cite?")
+    st.code(CITATION_CATALOG)
+    # Display catalog contents
+    st.dataframe(filter_dataframe(df_data_cat, "datasets"), hide_index=True, use_container_width=True)
+    # Display taxonomy contents
+with data_survey:
+    # Display summary statistics
+    st.title("Polish ASR Speech Datasets Survey")
+    st.header("Polish ASR speech datasets summary statistics")
+    df_summary_metrics = catalog_summary_statistics(df_data_cat)
+    df_basic_stats = df_summary_metrics.loc[BASE_SUMMARY_METRICS[0:5]]
+    st.dataframe(df_basic_stats, use_container_width=False)
+    st.header("Speech data available across Polish ASR speech datasets")
+    df_stats_audio_available = df_summary_metrics.loc[BASE_SUMMARY_METRICS[5:10]]
+    st.dataframe(df_stats_audio_available, use_container_width=False)
+    st.header("Transcribed data available across Polish ASR speech datasets")
+    df_stats_transcribed_available = df_summary_metrics.loc[BASE_SUMMARY_METRICS[10:15]]
+    st.dataframe(df_stats_transcribed_available, use_container_width=False)
+    # Display distribution of datasets created per year
+    st.header("Polish ASR speech datasets created in 1997-2023")
+    col_groupby = ['Creation year']
+    df_datasets_per_speech_type = datasets_count_and_size(df_data_cat, col_groupby, col_sort=col_groupby, col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
+    st.dataframe(df_datasets_per_speech_type, use_container_width=False)
+    st.header("Institutions contributing Polish ASR speech dataset")
+    col_groupby = ['Publisher']
+    df_datasets_per_publisher = datasets_count_and_size(df_data_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
+    st.dataframe(df_datasets_per_publisher, use_container_width=False)
+    st.header("Repositories hosting Polish ASR speech datasets")
+    col_groupby = ['Repository']
+    df_datasets_per_repo = datasets_count_and_size(df_data_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
+    st.dataframe(df_datasets_per_repo, use_container_width=False)
+    st.header("Public domain Polish ASR speech datasets")
+    col_groupby = ['License', "Dataset ID"]
+    df_datasets_public = datasets_count_and_size(df_data_cat_available_free, col_groupby, col_sort='License', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = [])
+    st.dataframe(df_datasets_public, use_container_width=False)
+    st.header("Commercially available Polish ASR speech datasets")
+    col_groupby = ['License', "Dataset ID"]
+    df_datasets_paid = datasets_count_and_size(df_data_cat_available_paid, col_groupby, col_sort='License', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = [])
+    st.dataframe(df_datasets_paid, use_container_width=False)
+    st.header("Coverage of metadata across Polish ASR speech datasets")
+    df_meta_all_flat, df_meta_all_pivot = metadata_coverage(df_data_cat, df_data_cat_available_free, df_data_cat_available_paid)
+    st.dataframe(df_meta_all_pivot, use_container_width=False)
+    # Display distribution of datasets for various speech types
+    st.header("Datasets per speech type")
+    col_groupby = ['Speech type']
+    df_datasets_per_speech_type = datasets_count_and_size(df_data_cat, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
+    st.dataframe(df_datasets_per_speech_type, use_container_width=False)
+    # Display distribution of datasets for various speech types
+    st.header("Distribution of available speech data per audio device - Public domain datasets")
+    col_groupby = ['Audio device']
+    df_datasets_per_device = datasets_count_and_size(df_data_cat_available_free, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
+    st.dataframe(df_datasets_per_device, use_container_width=False)
+    # Display distribution of datasets for various speech types
+    st.header("Distribution of available speech data per audio device - Commercial datasets")
+    col_groupby = ['Audio device']
+    df_datasets_per_device = datasets_count_and_size(df_data_cat_available_paid, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
+    st.dataframe(df_datasets_per_device, use_container_width=False)
+with bench_cat:
+    st.write("Benchmarks catalog")
+    # TODO - load and display benchmarks catalog
+    st.title("Polish ASR Benchmarks Catalog")
+    st.write(WELCOME_TEXT)
+    st.write(CITATION_TEXT)
+    # Display catalog contents
+    st.dataframe(filter_dataframe(df_bench_cat, "benchmarks"), hide_index=True, use_container_width=True)
+    # Display taxonomy contents

app_utils.py CHANGED Viewed

@@ -18,7 +18,7 @@ def calculate_height_to_display(df):
     return calculated_height
-def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """
     Adds a UI on top of a dataframe to let viewers filter columns
@@ -28,7 +28,12 @@ def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     Returns:
         pd.DataFrame: Filtered dataframe
     """
-    modify = st.checkbox("Use filters on speech data catalog")
     if not modify:
         return df

     return calculated_height
+def filter_dataframe(df: pd.DataFrame, target) -> pd.DataFrame:
     """
     Adds a UI on top of a dataframe to let viewers filter columns
     Returns:
         pd.DataFrame: Filtered dataframe
     """
+    if(target == "datasets"):
+        modify = st.checkbox("Use filters on speech data catalog")
+    elif(target == "benchmarks"):
+        modify = st.checkbox("Use filters on benchmarks catalog")
+    else:
+        print("Invalid target")
     if not modify:
         return df

contants.py CHANGED Viewed

@@ -1,5 +1,17 @@
-WELCOME_TEXT = "This dashboard complements [Polish Speech Datasets Catalog](https://github.com/goodmike31/pl-asr-speech-data-survey) with:\n \
-a. Dynamic filtering of catalog content\n \
-b. Summary statistics about Polish ASR speech datasets\n"
-CITATION_TEXT="Please cite this work as: TODO\n"

+INFO_CATALOG = "This dashboard complements *Polish ASR Speech Datasets Catalog* available on [GitHub](https://github.com/goodmike31/pl-asr-speech-data-survey) and [Google Sheets](https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=0) by providing:<br> \
+* More convenient browsing of the catalog content (see the *How to use?* section below) <br>\
+* Analysis of datasets utility for the purpose of ASR evaluation (see the *Dataset Utility Index* tab) <br>\
+* Analysis of the state of Polish ASR speech data (see the *Polish ASR Speech Data Survey* tab) <br> \
+IMPORTANT - Please share your feedback [HERE](https://forms.gle/EWJ6YfbJJTyEzQs66). <br>\
+Your feedback will help to assess the state of Polish ASR speech data from the community perspective.<br>\
+Each response is granted 50 PLN for the charity of choice."
+INFO_BENCHMARK = "TODO"
+INFO_SURVEY = "This dashboard complements [Polish Speech Datasets Survey]"
+CITATION_CATALOG="Please cite this work as: TODO"
+CITATION_BENCHMARK="Please cite this work as: TODO"
+CITATION_SURVEY="Please cite this work as: TODO"

utils.py CHANGED Viewed

@@ -43,19 +43,35 @@ def download_tsv_from_google_sheet(sheet_url):
         return None
 @st.cache_data
-def load_catalog():
     print("Reading speech data catalog")
     catalog_url="https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=0"
     df_catalog = download_tsv_from_google_sheet(catalog_url)
     return(df_catalog)
 @st.cache_data
-def load_taxonomy():
     print("Reading speech data survey taxonomy")
     taxonomy_url="https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057"
     df_taxonomy = download_tsv_from_google_sheet(taxonomy_url)
     return(df_taxonomy)
 def datasets_count_and_size(df_cat, col_groupby, col_sort=None, col_percent=None, col_sum=['Size audio transcribed [hours]'], col_count=['Dataset ID']):
     """
     Function to generate a summary view of datasets by speech type and other relevant metrics.

         return None
 @st.cache_data
+def load_data_catalog():
     print("Reading speech data catalog")
     catalog_url="https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=0"
     df_catalog = download_tsv_from_google_sheet(catalog_url)
     return(df_catalog)
 @st.cache_data
+def load_data_taxonomy():
     print("Reading speech data survey taxonomy")
     taxonomy_url="https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057"
     df_taxonomy = download_tsv_from_google_sheet(taxonomy_url)
     return(df_taxonomy)
+@st.cache_data
+def load_bench_catalog():
+    print("Reading ASR benchmarks catalog")
+    catalog_url="https://docs.google.com/spreadsheets/d/1fVsE98Ulmt-EIEe4wx8sUdo7RLigDdAVjQxNpAJIrH8/edit#gid=0"
+    df_catalog = download_tsv_from_google_sheet(catalog_url)
+    return(df_catalog)
+@st.cache_data
+def load_bench_taxonomy():
+    print("Reading ASR benchmarks survey taxonomy")
+    taxonomy_url="https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057"
+    df_taxonomy = download_tsv_from_google_sheet(taxonomy_url)
+    return(df_taxonomy)
 def datasets_count_and_size(df_cat, col_groupby, col_sort=None, col_percent=None, col_sum=['Size audio transcribed [hours]'], col_count=['Dataset ID']):
     """
     Function to generate a summary view of datasets by speech type and other relevant metrics.