Spaces:
Running
Running
mj-new
committed on
Commit
·
d5cbb7a
1
Parent(s):
2901944
Alpha version of the dataset catalog
Browse files- README.md +4 -4
- __pycache__/app_utils.cpython-310.pyc +0 -0
- __pycache__/contants.cpython-310.pyc +0 -0
- __pycache__/utils.cpython-310.pyc +0 -0
- analysis-playground.ipynb +0 -0
- app.py +101 -0
- app_utils.py +94 -0
- contants.py +5 -0
- requirements.txt +3 -0
- utils.py +276 -0
README.md
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
---
|
2 |
-
title: Pl Asr Survey
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
colorTo: red
|
6 |
sdk: streamlit
|
7 |
-
sdk_version: 1.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: cc-by-sa-4.0
|
|
|
1 |
---
|
2 |
+
title: Pl Asr Speech Data Survey
|
3 |
+
emoji: 🏃
|
4 |
+
colorFrom: pink
|
5 |
colorTo: red
|
6 |
sdk: streamlit
|
7 |
+
sdk_version: 1.31.1
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: cc-by-sa-4.0
|
__pycache__/app_utils.cpython-310.pyc
ADDED
Binary file (2.28 kB). View file
|
|
__pycache__/contants.cpython-310.pyc
ADDED
Binary file (482 Bytes). View file
|
|
__pycache__/utils.cpython-310.pyc
ADDED
Binary file (7.26 kB). View file
|
|
analysis-playground.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
app.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import streamlit as st
|
3 |
+
|
4 |
+
from app_utils import filter_dataframe, calculate_height_to_display
|
5 |
+
from contants import WELCOME_TEXT, CITATION_TEXT
|
6 |
+
from utils import BASE_SUMMARY_METRICS
|
7 |
+
from utils import load_catalog, load_taxonomy
|
8 |
+
from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
|
9 |
+
|
10 |
+
import matplotlib.pyplot as plt
|
11 |
+
import seaborn as sns
|
12 |
+
|
13 |
+
# Streamlit entry point: renders the catalog table followed by a series of
# summary tables computed from the Polish ASR speech datasets catalog.
st.set_page_config(layout="wide")

st.title("Polish Speech Datasets Catalog and Survey analysis")

st.write(WELCOME_TEXT)

st.write(CITATION_TEXT)

# Both loaders are wrapped in st.cache_data in utils.py, so the sheets are
# only downloaded once rather than on every Streamlit rerun.
df_cat = load_catalog()
df_tax = load_taxonomy()

# Columns summed in every summary table below.
SUM_COLUMNS = ['Size audio transcribed [hours]', 'Audio recordings', 'Speakers']

# Row masks shared by the availability subsets.
is_available = df_cat['Available online'] == 'yes'
is_free = df_cat['Price - non-commercial usage'] == 'free'

# Datasets that can actually be obtained online
df_cat_available = df_cat[is_available]
# Available and free
df_cat_available_free = df_cat[is_available & is_free]
# Available and paid (anything whose non-commercial price is not 'free')
df_cat_available_paid = df_cat[is_available & ~is_free]

# Display catalog contents (optionally filtered by the viewer)
st.dataframe(filter_dataframe(df_cat), hide_index=True, use_container_width=True)

# TODO: display taxonomy contents (df_tax is loaded but not shown yet)

# Display summary statistics; BASE_SUMMARY_METRICS is sliced into three
# consecutive groups of five metrics, one table per group.
st.header("Polish ASR speech datasets summary statistics")
df_summary_metrics = catalog_summary_statistics(df_cat)

df_basic_stats = df_summary_metrics.loc[BASE_SUMMARY_METRICS[0:5]]
st.dataframe(df_basic_stats, use_container_width=False)

st.header("Speech data available across Polish ASR speech datasets")
df_stats_audio_available = df_summary_metrics.loc[BASE_SUMMARY_METRICS[5:10]]
st.dataframe(df_stats_audio_available, use_container_width=False)

st.header("Transcribed data available across Polish ASR speech datasets")
df_stats_transcribed_available = df_summary_metrics.loc[BASE_SUMMARY_METRICS[10:15]]
st.dataframe(df_stats_transcribed_available, use_container_width=False)

# Display distribution of datasets created per year
st.header("Polish ASR speech datasets created in 1997-2023")
col_groupby = ['Creation year']
df_datasets_per_year = datasets_count_and_size(
    df_cat, col_groupby, col_sort=col_groupby, col_percent=None,
    col_sum=SUM_COLUMNS, col_count=['Dataset ID'])
st.dataframe(df_datasets_per_year, use_container_width=False)

# Display datasets per publishing institution
st.header("Institutions contributing Polish ASR speech dataset")
col_groupby = ['Publisher']
df_datasets_per_publisher = datasets_count_and_size(
    df_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None,
    col_sum=SUM_COLUMNS, col_count=['Dataset ID'])
st.dataframe(df_datasets_per_publisher, use_container_width=False)

# Display datasets per hosting repository
st.header("Repositories hosting Polish ASR speech datasets")
col_groupby = ['Repository']
df_datasets_per_repo = datasets_count_and_size(
    df_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None,
    col_sum=SUM_COLUMNS, col_count=['Dataset ID'])
st.dataframe(df_datasets_per_repo, use_container_width=False)

# Display free datasets, one row per (license, dataset)
st.header("Public domain Polish ASR speech datasets")
col_groupby = ['License', "Dataset ID"]
df_datasets_public = datasets_count_and_size(
    df_cat_available_free, col_groupby, col_sort='License', col_percent=None,
    col_sum=SUM_COLUMNS, col_count=[])
st.dataframe(df_datasets_public, use_container_width=False)

# Display paid datasets, one row per (license, dataset)
st.header("Commercially available Polish ASR speech datasets")
col_groupby = ['License', "Dataset ID"]
df_datasets_paid = datasets_count_and_size(
    df_cat_available_paid, col_groupby, col_sort='License', col_percent=None,
    col_sum=SUM_COLUMNS, col_count=[])
st.dataframe(df_datasets_paid, use_container_width=False)

# Display metadata coverage, free vs paid side by side
st.header("Coverage of metadata across Polish ASR speech datasets")
df_meta_all_flat, df_meta_all_pivot = metadata_coverage(df_cat, df_cat_available_free, df_cat_available_paid)
st.dataframe(df_meta_all_pivot, use_container_width=False)

# Display distribution of datasets for various speech types
st.header("Datasets per speech type")
col_groupby = ['Speech type']
df_datasets_per_speech_type = datasets_count_and_size(
    df_cat, col_groupby, col_sort=col_groupby,
    col_percent=['Size audio transcribed [hours]'],
    col_sum=SUM_COLUMNS, col_count=['Dataset ID'])
st.dataframe(df_datasets_per_speech_type, use_container_width=False)

# Display distribution of free datasets per audio device
st.header("Distribution of available speech data per audio device - Public domain datasets")
col_groupby = ['Audio device']
df_datasets_per_device = datasets_count_and_size(
    df_cat_available_free, col_groupby, col_sort=col_groupby,
    col_percent=['Size audio transcribed [hours]'],
    col_sum=SUM_COLUMNS, col_count=['Dataset ID'])
st.dataframe(df_datasets_per_device, use_container_width=False)

# Display distribution of paid datasets per audio device
st.header("Distribution of available speech data per audio device - Commercial datasets")
col_groupby = ['Audio device']
df_datasets_per_device = datasets_count_and_size(
    df_cat_available_paid, col_groupby, col_sort=col_groupby,
    col_percent=['Size audio transcribed [hours]'],
    col_sum=SUM_COLUMNS, col_count=['Dataset ID'])
st.dataframe(df_datasets_per_device, use_container_width=False)
|
app_utils.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import streamlit as st
|
3 |
+
|
4 |
+
from pandas.api.types import (
|
5 |
+
is_categorical_dtype,
|
6 |
+
is_datetime64_any_dtype,
|
7 |
+
is_numeric_dtype,
|
8 |
+
is_object_dtype,
|
9 |
+
)
|
10 |
+
|
11 |
+
def calculate_height_to_display(df, row_height=25, header_height=50, padding=20):
    """Estimate the pixel height needed to display every row of a dataframe.

    Args:
        df: Object exposing ``shape`` (e.g. a pandas DataFrame); only the
            number of rows, ``df.shape[0]``, is used.
        row_height: Estimated height of one row in pixels; adjust to the
            layout/theme in use.
        header_height: Estimated height of the header in pixels.
        padding: Extra padding in pixels.

    Returns:
        ``rows * row_height + header_height + padding``.
    """
    num_rows = df.shape[0]
    return num_rows * row_height + header_height + padding
|
20 |
+
|
21 |
+
def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds a UI on top of a dataframe to let viewers filter columns

    A checkbox toggles filtering. When it is unchecked the input dataframe is
    returned untouched. Otherwise the viewer picks columns to filter on and
    gets one widget per column, chosen by the column's dtype:

    - categorical dtype or fewer than 10 unique values: multiselect of values
    - numeric: range slider
    - datetime: date range picker
    - anything else: substring / regex text box

    Args:
        df (pd.DataFrame): Original dataframe

    Returns:
        pd.DataFrame: Filtered dataframe (a copy; the input is not modified)
    """
    import re  # local import: only needed to recognise invalid user regexes

    modify = st.checkbox("Use filters on speech data catalog")

    if not modify:
        return df

    df = df.copy()

    # Try to convert datetimes into a standard format (datetime, no timezone)
    for col in df.columns:
        if is_object_dtype(df[col]):
            try:
                df[col] = pd.to_datetime(df[col])
            except (ValueError, TypeError, OverflowError):
                # Best effort: the column does not hold dates, keep it as is.
                pass

        if is_datetime64_any_dtype(df[col]):
            df[col] = df[col].dt.tz_localize(None)

    modification_container = st.container()

    with modification_container:
        to_filter_columns = st.multiselect("Filter dataframe on", df.columns)
        for column in to_filter_columns:
            # Narrow left column acts as an indent for the filter widget.
            left, right = st.columns((1, 20))
            # Treat columns with < 10 unique values as categorical.
            # isinstance() replaces the deprecated is_categorical_dtype().
            is_categorical = isinstance(df[column].dtype, pd.CategoricalDtype)
            if is_categorical or df[column].nunique() < 10:
                user_cat_input = right.multiselect(
                    f"Values for {column}",
                    df[column].unique(),
                    default=list(df[column].unique()),
                )
                df = df[df[column].isin(user_cat_input)]
            elif is_numeric_dtype(df[column]):
                _min = float(df[column].min())
                _max = float(df[column].max())
                # At least 10 distinct values reach this branch, so _max > _min
                # and the step is non-zero.
                step = (_max - _min) / 100
                user_num_input = right.slider(
                    f"Values for {column}",
                    min_value=_min,
                    max_value=_max,
                    value=(_min, _max),
                    step=step,
                )
                df = df[df[column].between(*user_num_input)]
            elif is_datetime64_any_dtype(df[column]):
                user_date_input = right.date_input(
                    f"Values for {column}",
                    value=(
                        df[column].min(),
                        df[column].max(),
                    ),
                )
                # The picker returns a single date until the range is complete.
                if len(user_date_input) == 2:
                    user_date_input = tuple(map(pd.to_datetime, user_date_input))
                    start_date, end_date = user_date_input
                    df = df.loc[df[column].between(start_date, end_date)]
            else:
                user_text_input = right.text_input(
                    f"Substring or regex in {column}",
                )
                if user_text_input:
                    values_as_text = df[column].astype(str)
                    try:
                        mask = values_as_text.str.contains(user_text_input)
                    except re.error:
                        # The text is not a valid regex (e.g. a lone "(" while
                        # typing): match it as a plain substring instead of
                        # crashing the app.
                        mask = values_as_text.str.contains(user_text_input, regex=False)
                    df = df[mask]

    return df
|
contants.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Intro text shown at the top of the dashboard (app.py passes it to st.write).
# Built with implicit string concatenation so no source indentation leaks into
# the text. Each list line ends with two spaces: in Markdown that forces a hard
# line break, so the "a." / "b." items stay on separate lines instead of being
# merged into one paragraph.
WELCOME_TEXT = (
    "This dashboard complements [Polish Speech Datasets Catalog]"
    "(https://github.com/goodmike31/pl-asr-speech-data-survey) with:  \n"
    "a. Dynamic filtering of catalog content  \n"
    "b. Summary statistics about Polish ASR speech datasets\n"
)

# Placeholder until the citation is available.
CITATION_TEXT = "Please cite this work as: TODO\n"
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
seaborn
|
2 |
+
matplotlib
|
3 |
+
pandas
|
utils.py
ADDED
@@ -0,0 +1,276 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import pandas as pd
|
3 |
+
import streamlit as st
|
4 |
+
|
5 |
+
# Date reported as "Catalog last update date". It is the date this module was
# imported, not the date the catalog was edited.
# TODO - extract from the catalog name
catalog_last_update_date = pd.to_datetime('today').strftime('%Y-%m-%d')

# Row labels of the summary table, in display order. The three groups of five
# line up with the [0:5], [5:10] and [10:15] slices taken in app.py, and the
# overall order must match the value list in catalog_summary_statistics.
_GENERAL_METRICS = [
    "Catalog last update date",
    "Unique Polish speech datasets producers",
    "Identified datasets reported in the public domain",
    "Datasets available to the public (free and paid)",
    "Fraction of reported datasets available to the public [%]",
]
_AUDIO_METRICS = [
    "Speech data reported in the public domain [hours]",
    "Speech data available total [hours]",
    "Speech data available free of charge [hours]",
    "Speech data available commercially [hours]",
    "Reported vs available speech data ratio [%]",
]
_TRANSCRIBED_METRICS = [
    "Transcribed speech data reported in the public domain [hours]",
    "Transcribed speech data available total [hours]",
    "Transcribed speech data available free of charge [hours]",
    "Transcribed speech data available commercially [hours]",
    "Reported vs available transcribed speech data ratio [%]",
]

BASE_SUMMARY_METRICS = [*_GENERAL_METRICS, *_AUDIO_METRICS, *_TRANSCRIBED_METRICS]
|
26 |
+
|
27 |
+
def download_tsv_from_google_sheet(sheet_url, timeout=30):
    """Download one tab of a Google Sheet as a pandas DataFrame.

    Args:
        sheet_url (str): Browser URL of the sheet tab, in the
            ``.../edit#gid=<tab id>`` form. It is rewritten into the
            ``.../export?format=tsv&gid=<tab id>`` export URL.
        timeout (float): Seconds to wait for the server before giving up, so
            a stalled connection cannot hang the app indefinitely.

    Returns:
        pd.DataFrame with the sheet contents, or None when the server answers
        with a status other than 200.

    Raises:
        requests.RequestException: On connection errors or timeout.
    """
    from io import BytesIO

    # Modify the Google Sheet URL to export it as TSV
    tsv_url = sheet_url.replace('/edit#gid=', '/export?format=tsv&gid=')

    # Send a GET request to download the TSV file
    response = requests.get(tsv_url, timeout=timeout)

    # Check if the request was successful
    if response.status_code != 200:
        print("Failed to download the TSV file.")
        return None

    # Parse the raw bytes as UTF-8 instead of response.text, whose decoding
    # depends on the charset requests infers from the response headers.
    return pd.read_csv(BytesIO(response.content), sep='\t', encoding='utf-8')
|
44 |
+
|
45 |
+
@st.cache_data
def load_catalog():
    """Return the speech data catalog sheet (tab gid=0) as a DataFrame.

    The result is cached by Streamlit via st.cache_data. Yields whatever
    download_tsv_from_google_sheet returns, which is None on a failed download.
    """
    print("Reading speech data catalog")
    return download_tsv_from_google_sheet(
        "https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=0"
    )
|
51 |
+
|
52 |
+
@st.cache_data
def load_taxonomy():
    """Return the survey taxonomy sheet (tab gid=2015613057) as a DataFrame.

    The result is cached by Streamlit via st.cache_data. Yields whatever
    download_tsv_from_google_sheet returns, which is None on a failed download.
    """
    print("Reading speech data survey taxonomy")
    return download_tsv_from_google_sheet(
        "https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057"
    )
|
58 |
+
|
59 |
+
def datasets_count_and_size(df_cat, col_groupby, col_sort=None, col_percent=None, col_sum=None, col_count=None):
    """
    Function to generate a summary view of datasets grouped by the given column(s).

    The input dataframe is not modified; all conversions happen on a copy.

    Args:
    - df_cat (pd.DataFrame): The base dataframe containing dataset information.
    - col_groupby (str or list): The column(s) to group the datasets by. They
      become the index of the result.
    - col_sort (str or list): Column(s) or index level(s) to sort the result
      by, descending. Defaults to col_groupby.
    - col_percent (str or list): The column(s) whose row-wise sum is expressed
      as a percentage of the grand total in 'Percent of total'. None skips it.
    - col_sum (str or list): The column(s) to sum. Values that are not numeric
      count as 0. Defaults to ['Size audio transcribed [hours]'].
    - col_count (str or list): The column(s) to count (non-null values). Each
      is reported as 'Count <column>'. Defaults to ['Dataset ID'].

    Returns:
    - pd.DataFrame: One row per group with the count column(s), the summed
      column(s) and, when requested, 'Percent of total'. Sums equal to 0 are
      shown as 'no-info'.
    """
    def _as_list(value):
        return value if isinstance(value, list) else [value]

    # None sentinels instead of mutable list defaults
    col_sum = ['Size audio transcribed [hours]'] if col_sum is None else _as_list(col_sum)
    col_count = ['Dataset ID'] if col_count is None else _as_list(col_count)
    col_groupby = _as_list(col_groupby)
    if col_percent is not None:
        col_percent = _as_list(col_percent)

    # Work on a copy so the caller's dataframe keeps its original values
    df_work = df_cat.copy()

    # Coerce summed columns to numbers; anything unparsable counts as 0
    for col in col_sum:
        df_work[col] = pd.to_numeric(df_work[col], errors='coerce').fillna(0)

    # Aggregating datasets by provided column type
    aggregations = {
        **{col: 'sum' for col in col_sum},
        **{col: 'count' for col in col_count},
    }
    summary = df_work.groupby(col_groupby).agg(aggregations).reset_index()

    col_name_percent = 'Percent of total'
    if col_percent is not None:
        # Calculating the percentage
        total = summary[col_percent].sum(axis=1)
        summary[col_name_percent] = (total / total.sum() * 100).round(2)

    # Pre-sort by the first summed column, then index by the groupby column(s)
    if col_sum:
        summary = summary.sort_values(by=col_sum[0], ascending=False)
    summary = summary.reset_index(drop=True).set_index(col_groupby)

    # Rename every count column to a more descriptive name
    count_names = {col: 'Count ' + col for col in col_count}
    summary = summary.rename(columns=count_names)
    for name in count_names.values():
        summary[name] = summary[name].astype(int)

    # Column order: counts, sums, then 'Percent of total'
    ordered_columns = list(count_names.values()) + col_sum
    if col_percent is not None:
        ordered_columns.append(col_name_percent)
    summary = summary[ordered_columns]

    # Sort by the provided column col_sort
    col_sort = col_groupby if col_sort is None else col_sort
    summary = summary.sort_values(by=col_sort, ascending=False)

    # Replace 0 with no-info in columns with sum
    for col in col_sum:
        summary[col] = summary[col].replace(0, 'no-info')

    return summary
|
137 |
+
|
138 |
+
|
139 |
+
def datasets_count_and_size_standard(df_cat, col_groupby):
    """Summarise df_cat by col_groupby with the standard set of columns.

    Delegates to datasets_count_and_size: counts 'Dataset ID', sums transcribed
    hours, audio recordings and speakers, adds the percentage of transcribed
    hours, and sorts by the groupby column(s).
    """
    transcribed_hours = 'Size audio transcribed [hours]'
    return datasets_count_and_size(
        df_cat,
        col_groupby,
        col_sort=col_groupby,
        col_percent=[transcribed_hours],
        col_sum=[transcribed_hours, 'Audio recordings', 'Speakers'],
        col_count=['Dataset ID'],
    )
|
141 |
+
|
142 |
+
def metadata_coverage(df_cat, df_cat_available_free, df_cat_available_paid):
    """
    Function to compare metadata coverage between free and paid datasets.

    For every metadata column the datasets are summarised with
    datasets_count_and_size_standard and the row for the value 'yes' is kept
    (all zeros when no dataset has 'yes').

    Args:
    - df_cat (pd.DataFrame): Full catalog. Kept for interface compatibility;
      coverage over the full catalog is not part of the result, so it is no
      longer computed.
    - df_cat_available_free (pd.DataFrame): Datasets available free of charge.
    - df_cat_available_paid (pd.DataFrame): Datasets available commercially.

    Returns:
    - tuple: (df_meta_all_flat, df_meta_all_pivot). The flat frame stacks the
      free and paid rows, indexed by metadata column with a 'Type' column of
      'Free' / 'Paid'. The pivot frame has one row per metadata column and the
      count, transcribed size and percentage side by side per type.
    """
    #TODO - add number of speakers and recordings

    # Not covered yet: 'Speaker id info', 'Part of speech annotation',
    # 'Named entity annotation', 'Emotion annotation'
    meta_data_cols = ['Gender info', 'Age info', 'Accent info', 'Nativity info', 'Time alignement annotation']

    col_name_sum_size = 'Size audio transcribed [hours]'
    col_name_count = 'Count Dataset ID'
    col_name_percent = 'Percent of total'

    def _yes_coverage(df_subset):
        # One row per metadata column: the summary of datasets answering 'yes'.
        coverage = {}
        for meta_data_col in meta_data_cols:
            per_value = datasets_count_and_size_standard(df_subset, meta_data_col)
            if 'yes' in per_value.index:
                coverage[meta_data_col] = per_value.loc['yes']
            else:
                coverage[meta_data_col] = {col_name_sum_size: 0, col_name_count: 0, col_name_percent: 0}
        df_coverage = pd.DataFrame.from_dict(coverage, orient='index')
        df_coverage[col_name_count] = df_coverage[col_name_count].astype(int)
        return df_coverage

    # Merge free and paid dataframes
    df_meta_free = _yes_coverage(df_cat_available_free)
    df_meta_paid = _yes_coverage(df_cat_available_paid)

    df_meta_free['Type'] = 'Free'
    df_meta_paid['Type'] = 'Paid'
    df_meta_all_flat = pd.concat([df_meta_free, df_meta_paid])

    # Transform to compare free and paid column by column
    df_meta_all_pivot = df_meta_all_flat.reset_index()
    df_meta_all_pivot = df_meta_all_pivot.rename(columns={'index': 'Metadata'})
    df_meta_all_pivot = df_meta_all_pivot.pivot(index='Metadata', columns='Type', values=[col_name_count, col_name_sum_size, col_name_percent])
    df_meta_all_pivot[col_name_count] = df_meta_all_pivot[col_name_count].astype(int)

    return (df_meta_all_flat, df_meta_all_pivot)
|
198 |
+
|
199 |
+
|
200 |
+
def catalog_summary_statistics(df_cat):
    """
    Function to generate summary statistics for the speech data catalog.

    The input dataframe is not modified; numeric conversion happens on a copy.

    Args:
    - df_cat (pd.DataFrame): The base dataframe containing dataset information.

    Returns:
    - pd.DataFrame: A dataframe summarizing the speech data catalog, indexed by
      "Metric" (the labels in BASE_SUMMARY_METRICS) with a single "Value" column.
    """
    col_name_transcribed = 'Size audio transcribed [hours]'
    col_name_audio = 'Size audio total [hours]'

    def _percent(part, whole):
        # Share of `part` in `whole` as a percentage; 0.0 when `whole` is zero
        # (e.g. empty catalog) instead of dividing by zero.
        return round((part / whole) * 100, 2) if whole else 0.0

    # Work on a copy so the caller's dataframe keeps its original values
    df_cat = df_cat.copy()

    # Convert numerical fields to numeric type (unparsable values become NaN
    # and are skipped by the sums below)
    df_cat[col_name_audio] = pd.to_numeric(df_cat[col_name_audio], errors='coerce')
    df_cat[col_name_transcribed] = pd.to_numeric(df_cat[col_name_transcribed], errors='coerce')

    is_available = df_cat['Available online'] == 'yes'
    is_free = df_cat['Price - non-commercial usage'] == 'free'

    # Filter out non-available datasets
    df_cat_available = df_cat[is_available]
    # Available and free
    df_cat_available_free = df_cat[is_available & is_free]
    # Available and paid (anything whose non-commercial price is not 'free')
    df_cat_available_paid = df_cat[is_available & ~is_free]

    # Basic Calculations
    identified_datasets_count = df_cat.shape[0]
    accessible_datasets_count = df_cat_available.shape[0]
    unique_producers_count = df_cat['Publisher'].nunique()
    accessible_datasets_fraction = _percent(accessible_datasets_count, identified_datasets_count)

    # Total audio available and other dependent calculations
    audio_reported = round(df_cat[col_name_audio].sum(), 2)
    audio_accessible = round(df_cat_available[col_name_audio].sum(), 2)
    audio_accessible_free = round(df_cat_available_free[col_name_audio].sum(), 2)
    audio_accessible_paid = round(df_cat_available_paid[col_name_audio].sum(), 2)

    transcribed_audio_reported = round(df_cat[col_name_transcribed].sum(), 2)
    transcribed_audio_accessible = round(df_cat_available[col_name_transcribed].sum(), 2)
    transcribed_audio_accessible_free = round(df_cat_available_free[col_name_transcribed].sum(), 2)
    transcribed_audio_accessible_paid = round(df_cat_available_paid[col_name_transcribed].sum(), 2)

    # Available vs reported speech material ratio
    accessible_vs_reported_audio_ratio = _percent(audio_accessible, audio_reported)
    accessible_vs_reported_transcribed_ratio = _percent(transcribed_audio_accessible, transcribed_audio_reported)

    # Finalizing the metrics dictionary; values are listed in the same order
    # as the labels in BASE_SUMMARY_METRICS
    metrics_dict = {
        "Metric": BASE_SUMMARY_METRICS,
        "Value": [
            catalog_last_update_date,
            unique_producers_count,
            identified_datasets_count,
            accessible_datasets_count,
            accessible_datasets_fraction,
            audio_reported,
            audio_accessible,
            audio_accessible_free,
            audio_accessible_paid,
            accessible_vs_reported_audio_ratio,
            transcribed_audio_reported,
            transcribed_audio_accessible,
            transcribed_audio_accessible_free,
            transcribed_audio_accessible_paid,
            accessible_vs_reported_transcribed_ratio,
        ]
    }

    # Convert the dictionary into a DataFrame indexed by metric name
    metrics_df = pd.DataFrame(metrics_dict)
    metrics_df.reset_index(drop=True, inplace=True)
    metrics_df.set_index("Metric", inplace=True)
    return metrics_df
|