Spaces:
Running
Running
mj-new
committed on
Commit
·
e283f70
1
Parent(s):
d5cbb7a
Alpha version with tabs
Browse files- .python-version +1 -0
- __pycache__/app_utils.cpython-310.pyc +0 -0
- __pycache__/contants.cpython-310.pyc +0 -0
- __pycache__/utils.cpython-310.pyc +0 -0
- app.py +98 -65
- app_utils.py +7 -2
- contants.py +16 -4
- utils.py +18 -2
.python-version
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
streamlit
|
__pycache__/app_utils.cpython-310.pyc
CHANGED
Binary files a/__pycache__/app_utils.cpython-310.pyc and b/__pycache__/app_utils.cpython-310.pyc differ
|
|
__pycache__/contants.cpython-310.pyc
CHANGED
Binary files a/__pycache__/contants.cpython-310.pyc and b/__pycache__/contants.cpython-310.pyc differ
|
|
__pycache__/utils.cpython-310.pyc
CHANGED
Binary files a/__pycache__/utils.cpython-310.pyc and b/__pycache__/utils.cpython-310.pyc differ
|
|
app.py
CHANGED
@@ -2,100 +2,133 @@ import pandas as pd
|
|
2 |
import streamlit as st
|
3 |
|
4 |
from app_utils import filter_dataframe, calculate_height_to_display
|
5 |
-
from contants import
|
6 |
from utils import BASE_SUMMARY_METRICS
|
7 |
-
from utils import
|
8 |
from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
|
9 |
|
10 |
import matplotlib.pyplot as plt
|
11 |
import seaborn as sns
|
12 |
|
13 |
-
st.set_page_config(layout="wide")
|
14 |
-
|
15 |
-
st.title("Polish Speech Datasets Catalog and Survey analysis")
|
16 |
|
17 |
-
st.
|
18 |
|
19 |
-
st.write(CITATION_TEXT)
|
20 |
|
|
|
21 |
# Cache the dataframe so it's only loaded once
|
22 |
-
|
23 |
-
|
24 |
|
25 |
# Filter out non available datasets
|
26 |
-
|
27 |
# Available and free
|
28 |
-
|
29 |
|
30 |
# Available and paid
|
31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
-
|
34 |
-
|
|
|
35 |
|
36 |
-
# Display taxonomy contents
|
37 |
|
38 |
-
# Display
|
39 |
-
st.header("Polish ASR speech datasets
|
40 |
-
|
|
|
41 |
|
42 |
-
|
43 |
-
st.dataframe(df_basic_stats, use_container_width=False)
|
44 |
|
45 |
-
st.header("
|
46 |
-
|
47 |
-
|
|
|
48 |
|
49 |
-
st.header("
|
50 |
-
|
51 |
-
|
|
|
52 |
|
|
|
|
|
|
|
|
|
53 |
|
54 |
-
|
55 |
-
|
56 |
-
col_groupby = ['
|
57 |
-
|
58 |
|
59 |
-
st.
|
|
|
|
|
60 |
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
|
|
65 |
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
|
|
70 |
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
|
|
75 |
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
st.
|
80 |
|
81 |
-
st.
|
82 |
-
df_meta_all_flat, df_meta_all_pivot = metadata_coverage(df_cat, df_cat_available_free, df_cat_available_paid)
|
83 |
-
st.dataframe(df_meta_all_pivot, use_container_width=False)
|
84 |
|
85 |
-
|
86 |
-
st.header("Datasets per speech type")
|
87 |
-
col_groupby = ['Speech type']
|
88 |
-
df_datasets_per_speech_type = datasets_count_and_size(df_cat, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
89 |
-
st.dataframe(df_datasets_per_speech_type, use_container_width=False)
|
90 |
|
91 |
-
# Display
|
92 |
-
st.
|
93 |
-
col_groupby = ['Audio device']
|
94 |
-
df_datasets_per_device = datasets_count_and_size(df_cat_available_free, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
95 |
-
st.dataframe(df_datasets_per_device, use_container_width=False)
|
96 |
|
97 |
-
# Display
|
98 |
-
st.header("Distribution of available speech data per audio device - Commercial datasets")
|
99 |
-
col_groupby = ['Audio device']
|
100 |
-
df_datasets_per_device = datasets_count_and_size(df_cat_available_paid, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
101 |
-
st.dataframe(df_datasets_per_device, use_container_width=False)
|
|
|
2 |
import streamlit as st
|
3 |
|
4 |
from app_utils import filter_dataframe, calculate_height_to_display
|
5 |
+
from contants import INFO_CATALOG, INFO_BENCHMARK, INFO_SURVEY, CITATION_CATALOG, CITATION_BENCHMARK, CITATION_SURVEY
|
6 |
from utils import BASE_SUMMARY_METRICS
|
7 |
+
from utils import load_data_catalog, load_data_taxonomy, load_bench_catalog, load_bench_taxonomy
|
8 |
from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
|
9 |
|
10 |
import matplotlib.pyplot as plt
|
11 |
import seaborn as sns
|
12 |
|
|
|
|
|
|
|
13 |
|
14 |
+
st.set_page_config(layout="wide")
|
15 |
|
|
|
16 |
|
17 |
+
# Load PL ASR data survey data
|
18 |
# Cache the dataframe so it's only loaded once
|
19 |
+
df_data_cat = load_data_catalog()
|
20 |
+
df_data_tax = load_data_taxonomy()
|
21 |
|
22 |
# Filter out non available datasets
|
23 |
+
df_data_cat_available = df_data_cat[df_data_cat['Available online'] == 'yes']
|
24 |
# Available and free
|
25 |
+
df_data_cat_available_free = df_data_cat[(df_data_cat['Available online'] == 'yes') & (df_data_cat['Price - non-commercial usage'] == 'free')]
|
26 |
|
27 |
# Available and paid
|
28 |
+
df_data_cat_available_paid = df_data_cat[(df_data_cat['Available online'] == 'yes') & (df_data_cat['Price - non-commercial usage'] != 'free')]
|
29 |
+
|
30 |
+
|
31 |
+
# Load PL ASR benchmarks survey data
|
32 |
+
df_bench_cat = load_bench_catalog()
|
33 |
+
df_bench_tax = load_bench_taxonomy()
|
34 |
+
|
35 |
+
data_cat, data_survey, bench_cat, bench_survey = st.tabs(["PL ASR speech datasets catalog", "PL ASR speech data survey", "PL ASR benchmarks catalog", "PL ASR benchmarks survey"])
|
36 |
+
|
37 |
+
|
38 |
+
with data_cat:
|
39 |
+
st.title("Polish ASR Speech Datasets Catalog")
|
40 |
+
|
41 |
+
st.markdown(INFO_CATALOG, unsafe_allow_html=True)
|
42 |
+
|
43 |
+
st.header("How to use?")
|
44 |
+
# sent = st.text_area("Text", WELCOME_TEXT, height = 275)
|
45 |
+
|
46 |
+
st.header("How to cite?")
|
47 |
+
st.code(CITATION_CATALOG)
|
48 |
+
|
49 |
+
# Display catalog contents
|
50 |
+
st.dataframe(filter_dataframe(df_data_cat, "datasets"), hide_index=True, use_container_width=True)
|
51 |
+
|
52 |
+
# Display taxonomy contents
|
53 |
+
|
54 |
+
with data_survey:
|
55 |
+
# Display summary statistics
|
56 |
+
st.title("Polish ASR Speech Datasets Survey")
|
57 |
+
|
58 |
+
st.header("Polish ASR speech datasets summary statistics")
|
59 |
+
df_summary_metrics = catalog_summary_statistics(df_data_cat)
|
60 |
+
|
61 |
+
df_basic_stats = df_summary_metrics.loc[BASE_SUMMARY_METRICS[0:5]]
|
62 |
+
st.dataframe(df_basic_stats, use_container_width=False)
|
63 |
+
|
64 |
+
st.header("Speech data available across Polish ASR speech datasets")
|
65 |
+
df_stats_audio_available = df_summary_metrics.loc[BASE_SUMMARY_METRICS[5:10]]
|
66 |
+
st.dataframe(df_stats_audio_available, use_container_width=False)
|
67 |
|
68 |
+
st.header("Transcribed data available across Polish ASR speech datasets")
|
69 |
+
df_stats_transcribed_available = df_summary_metrics.loc[BASE_SUMMARY_METRICS[10:15]]
|
70 |
+
st.dataframe(df_stats_transcribed_available, use_container_width=False)
|
71 |
|
|
|
72 |
|
73 |
+
# Display distribution of datasets created per year
|
74 |
+
st.header("Polish ASR speech datasets created in 1997-2023")
|
75 |
+
col_groupby = ['Creation year']
|
76 |
+
df_datasets_per_speech_type = datasets_count_and_size(df_data_cat, col_groupby, col_sort=col_groupby, col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
77 |
|
78 |
+
st.dataframe(df_datasets_per_speech_type, use_container_width=False)
|
|
|
79 |
|
80 |
+
st.header("Institutions contributing Polish ASR speech datasets")
|
81 |
+
col_groupby = ['Publisher']
|
82 |
+
df_datasets_per_publisher = datasets_count_and_size(df_data_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
83 |
+
st.dataframe(df_datasets_per_publisher, use_container_width=False)
|
84 |
|
85 |
+
st.header("Repositories hosting Polish ASR speech datasets")
|
86 |
+
col_groupby = ['Repository']
|
87 |
+
df_datasets_per_repo = datasets_count_and_size(df_data_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
88 |
+
st.dataframe(df_datasets_per_repo, use_container_width=False)
|
89 |
|
90 |
+
st.header("Public domain Polish ASR speech datasets")
|
91 |
+
col_groupby = ['License', "Dataset ID"]
|
92 |
+
df_datasets_public = datasets_count_and_size(df_data_cat_available_free, col_groupby, col_sort='License', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = [])
|
93 |
+
st.dataframe(df_datasets_public, use_container_width=False)
|
94 |
|
95 |
+
st.header("Commercially available Polish ASR speech datasets")
|
96 |
+
col_groupby = ['License', "Dataset ID"]
|
97 |
+
df_datasets_paid = datasets_count_and_size(df_data_cat_available_paid, col_groupby, col_sort='License', col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = [])
|
98 |
+
st.dataframe(df_datasets_paid, use_container_width=False)
|
99 |
|
100 |
+
st.header("Coverage of metadata across Polish ASR speech datasets")
|
101 |
+
df_meta_all_flat, df_meta_all_pivot = metadata_coverage(df_data_cat, df_data_cat_available_free, df_data_cat_available_paid)
|
102 |
+
st.dataframe(df_meta_all_pivot, use_container_width=False)
|
103 |
|
104 |
+
# Display distribution of datasets for various speech types
|
105 |
+
st.header("Datasets per speech type")
|
106 |
+
col_groupby = ['Speech type']
|
107 |
+
df_datasets_per_speech_type = datasets_count_and_size(df_data_cat, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
108 |
+
st.dataframe(df_datasets_per_speech_type, use_container_width=False)
|
109 |
|
110 |
+
# Display distribution of datasets for various speech types
|
111 |
+
st.header("Distribution of available speech data per audio device - Public domain datasets")
|
112 |
+
col_groupby = ['Audio device']
|
113 |
+
df_datasets_per_device = datasets_count_and_size(df_data_cat_available_free, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
114 |
+
st.dataframe(df_datasets_per_device, use_container_width=False)
|
115 |
|
116 |
+
# Display distribution of datasets for various speech types
|
117 |
+
st.header("Distribution of available speech data per audio device - Commercial datasets")
|
118 |
+
col_groupby = ['Audio device']
|
119 |
+
df_datasets_per_device = datasets_count_and_size(df_data_cat_available_paid, col_groupby, col_sort=col_groupby, col_percent = ['Size audio transcribed [hours]'], col_sum = ['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
|
120 |
+
st.dataframe(df_datasets_per_device, use_container_width=False)
|
121 |
|
122 |
+
with bench_cat:
|
123 |
+
st.write("Benchmarks catalog")
|
124 |
+
# TODO - load and display benchmarks catalog
|
125 |
+
st.title("Polish ASR Benchmarks Catalog")
|
126 |
|
127 |
+
st.write(WELCOME_TEXT)
|
|
|
|
|
128 |
|
129 |
+
st.write(CITATION_TEXT)
|
|
|
|
|
|
|
|
|
130 |
|
131 |
+
# Display catalog contents
|
132 |
+
st.dataframe(filter_dataframe(df_bench_cat, "benchmarks"), hide_index=True, use_container_width=True)
|
|
|
|
|
|
|
133 |
|
134 |
+
# Display taxonomy contents
|
|
|
|
|
|
|
|
app_utils.py
CHANGED
@@ -18,7 +18,7 @@ def calculate_height_to_display(df):
|
|
18 |
|
19 |
return calculated_height
|
20 |
|
21 |
-
def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
22 |
"""
|
23 |
Adds a UI on top of a dataframe to let viewers filter columns
|
24 |
|
@@ -28,7 +28,12 @@ def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
|
28 |
Returns:
|
29 |
pd.DataFrame: Filtered dataframe
|
30 |
"""
|
31 |
-
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
if not modify:
|
34 |
return df
|
|
|
18 |
|
19 |
return calculated_height
|
20 |
|
21 |
+
def filter_dataframe(df: pd.DataFrame, target) -> pd.DataFrame:
|
22 |
"""
|
23 |
Adds a UI on top of a dataframe to let viewers filter columns
|
24 |
|
|
|
28 |
Returns:
|
29 |
pd.DataFrame: Filtered dataframe
|
30 |
"""
|
31 |
+
if(target == "datasets"):
|
32 |
+
modify = st.checkbox("Use filters on speech data catalog")
|
33 |
+
elif(target == "benchmarks"):
|
34 |
+
modify = st.checkbox("Use filters on benchmarks catalog")
|
35 |
+
else:
|
36 |
+
print("Invalid target")
|
37 |
|
38 |
if not modify:
|
39 |
return df
|
contants.py
CHANGED
@@ -1,5 +1,17 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
|
|
|
|
|
|
|
|
4 |
|
5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
INFO_CATALOG = "This dashboard complements *Polish ASR Speech Datasets Catalog* available on [GitHub](https://github.com/goodmike31/pl-asr-speech-data-survey) and [Google Sheets](https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=0) by providing:<br> \
|
2 |
+
* More convenient browsing of the catalog content (see the *How to use?* section below) <br>\
|
3 |
+
* Analysis of datasets utility for the purpose of ASR evaluation (see the *Dataset Utility Index* tab) <br>\
|
4 |
+
* Analysis of the state of Polish ASR speech data (see the *Polish ASR Speech Data Survey* tab) <br> \
|
5 |
+
IMPORTANT - Please share your feedback [HERE](https://forms.gle/EWJ6YfbJJTyEzQs66). <br>\
|
6 |
+
Your feedback will help to assess the state of Polish ASR speech data from the community perspective.<br>\
|
7 |
+
Each response is granted 50 PLN for the charity of choice."
|
8 |
|
9 |
+
INFO_BENCHMARK = "TODO"
|
10 |
+
|
11 |
+
INFO_SURVEY = "This dashboard complements [Polish Speech Datasets Survey]"
|
12 |
+
|
13 |
+
CITATION_CATALOG="Please cite this work as: TODO"
|
14 |
+
|
15 |
+
CITATION_BENCHMARK="Please cite this work as: TODO"
|
16 |
+
|
17 |
+
CITATION_SURVEY="Please cite this work as: TODO"
|
utils.py
CHANGED
@@ -43,19 +43,35 @@ def download_tsv_from_google_sheet(sheet_url):
|
|
43 |
return None
|
44 |
|
45 |
@st.cache_data
|
46 |
-
def
|
47 |
print("Reading speech data catalog")
|
48 |
catalog_url="https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=0"
|
49 |
df_catalog = download_tsv_from_google_sheet(catalog_url)
|
50 |
return(df_catalog)
|
51 |
|
52 |
@st.cache_data
|
53 |
-
def
|
54 |
print("Reading speech data survey taxonomy")
|
55 |
taxonomy_url="https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057"
|
56 |
df_taxonomy = download_tsv_from_google_sheet(taxonomy_url)
|
57 |
return(df_taxonomy)
|
58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
def datasets_count_and_size(df_cat, col_groupby, col_sort=None, col_percent=None, col_sum=['Size audio transcribed [hours]'], col_count=['Dataset ID']):
|
60 |
"""
|
61 |
Function to generate a summary view of datasets by speech type and other relevant metrics.
|
|
|
43 |
return None
|
44 |
|
45 |
@st.cache_data
|
46 |
+
def load_data_catalog():
|
47 |
print("Reading speech data catalog")
|
48 |
catalog_url="https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=0"
|
49 |
df_catalog = download_tsv_from_google_sheet(catalog_url)
|
50 |
return(df_catalog)
|
51 |
|
52 |
@st.cache_data
|
53 |
+
def load_data_taxonomy():
|
54 |
print("Reading speech data survey taxonomy")
|
55 |
taxonomy_url="https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057"
|
56 |
df_taxonomy = download_tsv_from_google_sheet(taxonomy_url)
|
57 |
return(df_taxonomy)
|
58 |
|
59 |
+
|
60 |
+
@st.cache_data
|
61 |
+
def load_bench_catalog():
|
62 |
+
print("Reading ASR benchmarks catalog")
|
63 |
+
catalog_url="https://docs.google.com/spreadsheets/d/1fVsE98Ulmt-EIEe4wx8sUdo7RLigDdAVjQxNpAJIrH8/edit#gid=0"
|
64 |
+
df_catalog = download_tsv_from_google_sheet(catalog_url)
|
65 |
+
return(df_catalog)
|
66 |
+
|
67 |
+
@st.cache_data
|
68 |
+
def load_bench_taxonomy():
|
69 |
+
print("Reading ASR benchmarks survey taxonomy")
|
70 |
+
taxonomy_url="https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057"
|
71 |
+
df_taxonomy = download_tsv_from_google_sheet(taxonomy_url)
|
72 |
+
return(df_taxonomy)
|
73 |
+
|
74 |
+
|
75 |
def datasets_count_and_size(df_cat, col_groupby, col_sort=None, col_percent=None, col_sum=['Size audio transcribed [hours]'], col_count=['Dataset ID']):
|
76 |
"""
|
77 |
Function to generate a summary view of datasets by speech type and other relevant metrics.
|