mj-new committed on
Commit
d5cbb7a
·
1 Parent(s): 2901944

Alpha version of the dataset catalog

Browse files
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: Pl Asr Survey
3
- emoji: 💻
4
- colorFrom: yellow
5
  colorTo: red
6
  sdk: streamlit
7
- sdk_version: 1.32.0
8
  app_file: app.py
9
  pinned: false
10
  license: cc-by-sa-4.0
 
1
  ---
2
+ title: Pl Asr Speech Data Survey
3
+ emoji: 🏃
4
+ colorFrom: pink
5
  colorTo: red
6
  sdk: streamlit
7
+ sdk_version: 1.31.1
8
  app_file: app.py
9
  pinned: false
10
  license: cc-by-sa-4.0
__pycache__/app_utils.cpython-310.pyc ADDED
Binary file (2.28 kB). View file
 
__pycache__/contants.cpython-310.pyc ADDED
Binary file (482 Bytes). View file
 
__pycache__/utils.cpython-310.pyc ADDED
Binary file (7.26 kB). View file
 
analysis-playground.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import streamlit as st
3
+
4
+ from app_utils import filter_dataframe, calculate_height_to_display
5
+ from contants import WELCOME_TEXT, CITATION_TEXT
6
+ from utils import BASE_SUMMARY_METRICS
7
+ from utils import load_catalog, load_taxonomy
8
+ from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
9
+
10
+ import matplotlib.pyplot as plt
11
+ import seaborn as sns
12
+
13
# Streamlit dashboard: shows the filterable speech data catalog followed by a
# series of summary tables computed from it.
st.set_page_config(layout="wide")

st.title("Polish Speech Datasets Catalog and Survey analysis")

st.write(WELCOME_TEXT)

st.write(CITATION_TEXT)

# Both loaders are decorated with st.cache_data in utils.py, so the sheets are
# only downloaded on the first run.
df_cat = load_catalog()
if df_cat is None:
    # The loader returns None when the download fails. Drop that cached result
    # so the next rerun retries, and stop instead of crashing on df_cat[...].
    load_catalog.clear()
    st.error("Could not download the speech data catalog. Please try again later.")
    st.stop()

# TODO: the taxonomy is loaded but not displayed yet.
df_tax = load_taxonomy()

# Column names used repeatedly below.
COL_AVAILABLE = 'Available online'
COL_PRICE = 'Price - non-commercial usage'
COL_SIZE_TRANSCRIBED = 'Size audio transcribed [hours]'
SIZE_COLUMNS = [COL_SIZE_TRANSCRIBED, 'Audio recordings', 'Speakers']

# Subsets are taken before any summary function runs, as in the original flow.
is_available = df_cat[COL_AVAILABLE] == 'yes'
# Available and free
df_cat_available_free = df_cat[is_available & (df_cat[COL_PRICE] == 'free')]
# Available and paid (everything available whose price is not exactly 'free')
df_cat_available_paid = df_cat[is_available & (df_cat[COL_PRICE] != 'free')]

# Display catalog contents
st.dataframe(filter_dataframe(df_cat), hide_index=True, use_container_width=True)

# Display summary statistics
st.header("Polish ASR speech datasets summary statistics")
df_summary_metrics = catalog_summary_statistics(df_cat)

# BASE_SUMMARY_METRICS is sliced in groups of five: basic, audio, transcribed.
df_basic_stats = df_summary_metrics.loc[BASE_SUMMARY_METRICS[0:5]]
st.dataframe(df_basic_stats, use_container_width=False)

st.header("Speech data available across Polish ASR speech datasets")
df_stats_audio_available = df_summary_metrics.loc[BASE_SUMMARY_METRICS[5:10]]
st.dataframe(df_stats_audio_available, use_container_width=False)

st.header("Transcribed data available across Polish ASR speech datasets")
df_stats_transcribed_available = df_summary_metrics.loc[BASE_SUMMARY_METRICS[10:15]]
st.dataframe(df_stats_transcribed_available, use_container_width=False)

# Display distribution of datasets created per year
st.header("Polish ASR speech datasets created in 1997-2023")
col_groupby = ['Creation year']
df_datasets_per_year = datasets_count_and_size(
    df_cat, col_groupby, col_sort=col_groupby, col_percent=None,
    col_sum=SIZE_COLUMNS, col_count=['Dataset ID'],
)
st.dataframe(df_datasets_per_year, use_container_width=False)

# Display datasets per publishing institution
st.header("Institutions contributing Polish ASR speech dataset")
col_groupby = ['Publisher']
df_datasets_per_publisher = datasets_count_and_size(
    df_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None,
    col_sum=SIZE_COLUMNS, col_count=['Dataset ID'],
)
st.dataframe(df_datasets_per_publisher, use_container_width=False)

# Display datasets per hosting repository
st.header("Repositories hosting Polish ASR speech datasets")
col_groupby = ['Repository']
df_datasets_per_repo = datasets_count_and_size(
    df_cat, col_groupby, col_sort='Count Dataset ID', col_percent=None,
    col_sum=SIZE_COLUMNS, col_count=['Dataset ID'],
)
st.dataframe(df_datasets_per_repo, use_container_width=False)

# Display available free datasets, one row per (license, dataset)
st.header("Public domain Polish ASR speech datasets")
col_groupby = ['License', "Dataset ID"]
df_datasets_public = datasets_count_and_size(
    df_cat_available_free, col_groupby, col_sort='License', col_percent=None,
    col_sum=SIZE_COLUMNS, col_count=[],
)
st.dataframe(df_datasets_public, use_container_width=False)

# Display available paid datasets, one row per (license, dataset)
st.header("Commercially available Polish ASR speech datasets")
col_groupby = ['License', "Dataset ID"]
df_datasets_paid = datasets_count_and_size(
    df_cat_available_paid, col_groupby, col_sort='License', col_percent=None,
    col_sum=SIZE_COLUMNS, col_count=[],
)
st.dataframe(df_datasets_paid, use_container_width=False)

# Display metadata coverage, free vs paid side by side
st.header("Coverage of metadata across Polish ASR speech datasets")
df_meta_all_flat, df_meta_all_pivot = metadata_coverage(df_cat, df_cat_available_free, df_cat_available_paid)
st.dataframe(df_meta_all_pivot, use_container_width=False)

# Display distribution of datasets for various speech types
st.header("Datasets per speech type")
col_groupby = ['Speech type']
df_datasets_per_speech_type = datasets_count_and_size(
    df_cat, col_groupby, col_sort=col_groupby, col_percent=[COL_SIZE_TRANSCRIBED],
    col_sum=SIZE_COLUMNS, col_count=['Dataset ID'],
)
st.dataframe(df_datasets_per_speech_type, use_container_width=False)

# Display distribution of free datasets per audio device
st.header("Distribution of available speech data per audio device - Public domain datasets")
col_groupby = ['Audio device']
df_datasets_per_device_free = datasets_count_and_size(
    df_cat_available_free, col_groupby, col_sort=col_groupby, col_percent=[COL_SIZE_TRANSCRIBED],
    col_sum=SIZE_COLUMNS, col_count=['Dataset ID'],
)
st.dataframe(df_datasets_per_device_free, use_container_width=False)

# Display distribution of paid datasets per audio device
st.header("Distribution of available speech data per audio device - Commercial datasets")
col_groupby = ['Audio device']
df_datasets_per_device_paid = datasets_count_and_size(
    df_cat_available_paid, col_groupby, col_sort=col_groupby, col_percent=[COL_SIZE_TRANSCRIBED],
    col_sum=SIZE_COLUMNS, col_count=['Dataset ID'],
)
st.dataframe(df_datasets_per_device_paid, use_container_width=False)
app_utils.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import streamlit as st
3
+
4
+ from pandas.api.types import (
5
+ is_categorical_dtype,
6
+ is_datetime64_any_dtype,
7
+ is_numeric_dtype,
8
+ is_object_dtype,
9
+ )
10
+
11
def calculate_height_to_display(df, row_height=25, header_height=50, padding=20):
    """Estimate the pixel height needed to display every row of ``df``.

    Args:
        df: Object with a ``shape`` attribute whose first item is the row
            count (e.g. a pandas DataFrame).
        row_height: Estimated height of one row in pixels; adjust to the
            layout/theme in use.
        header_height: Estimated height of the header in pixels.
        padding: Extra padding in pixels.

    Returns:
        ``rows * row_height + header_height + padding``.
    """
    num_rows = df.shape[0]
    return num_rows * row_height + header_height + padding
20
+
21
def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds a UI on top of a dataframe to let viewers filter columns

    A checkbox enables filtering; when it is unchecked the input is returned
    unchanged. Otherwise the viewer picks columns and, per column, gets a
    widget chosen by dtype: a multiselect (categorical or fewer than 10 unique
    values), a range slider (numeric), a date range (datetime) or a
    substring/regex text box (anything else).

    Args:
        df (pd.DataFrame): Original dataframe

    Returns:
        pd.DataFrame: Filtered dataframe
    """
    import re  # local import: only used to validate the viewer's pattern

    modify = st.checkbox("Use filters on speech data catalog")

    if not modify:
        return df

    # Work on a copy so the dtype conversions below do not touch the input.
    df = df.copy()

    # Try to convert datetimes into a standard format (datetime, no timezone)
    for col in df.columns:
        if is_object_dtype(df[col]):
            try:
                df[col] = pd.to_datetime(df[col])
            except Exception:
                # Deliberate best-effort: a column that does not parse as
                # dates simply keeps its original dtype.
                pass

        if is_datetime64_any_dtype(df[col]):
            df[col] = df[col].dt.tz_localize(None)

    modification_container = st.container()

    with modification_container:
        to_filter_columns = st.multiselect("Filter dataframe on", df.columns)
        for column in to_filter_columns:
            # The narrow first column only indents the widget.
            _, right = st.columns((1, 20))
            # isinstance check replaces the deprecated is_categorical_dtype.
            is_categorical = isinstance(df[column].dtype, pd.CategoricalDtype)
            # Treat columns with < 10 unique values as categorical
            if is_categorical or df[column].nunique() < 10:
                options = list(df[column].unique())
                user_cat_input = right.multiselect(
                    f"Values for {column}",
                    options,
                    default=options,
                )
                df = df[df[column].isin(user_cat_input)]
            elif is_numeric_dtype(df[column]):
                _min = float(df[column].min())
                _max = float(df[column].max())
                step = (_max - _min) / 100
                user_num_input = right.slider(
                    f"Values for {column}",
                    min_value=_min,
                    max_value=_max,
                    value=(_min, _max),
                    step=step,
                )
                df = df[df[column].between(*user_num_input)]
            elif is_datetime64_any_dtype(df[column]):
                user_date_input = right.date_input(
                    f"Values for {column}",
                    value=(
                        df[column].min(),
                        df[column].max(),
                    ),
                )
                # Only filter once both ends of the range have been picked.
                if len(user_date_input) == 2:
                    start_date, end_date = map(pd.to_datetime, user_date_input)
                    df = df.loc[df[column].between(start_date, end_date)]
            else:
                user_text_input = right.text_input(
                    f"Substring or regex in {column}",
                )
                if user_text_input:
                    # The text is used as a regex; a half-typed or invalid
                    # pattern should show a message rather than crash the app.
                    try:
                        re.compile(user_text_input)
                    except re.error as err:
                        right.error(f"Invalid regular expression: {err}")
                    else:
                        df = df[df[column].astype(str).str.contains(user_text_input)]

    return df
contants.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# Markdown shown at the top of the dashboard.
# Adjacent string literals are used instead of backslash continuations inside
# one literal, so the text no longer depends on how the source is indented.
# NOTE(review): any leading whitespace the old continuation lines carried into
# the string is dropped here - confirm the rendered text is unchanged.
WELCOME_TEXT = (
    "This dashboard complements [Polish Speech Datasets Catalog](https://github.com/goodmike31/pl-asr-speech-data-survey) with:\n"
    " a. Dynamic filtering of catalog content\n"
    " b. Summary statistics about Polish ASR speech datasets\n"
)

# Citation note shown below the welcome text; the reference is still a TODO.
CITATION_TEXT = "Please cite this work as: TODO\n"
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ seaborn
2
+ matplotlib
3
+ pandas
utils.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import pandas as pd
3
+ import streamlit as st
4
+
5
# Value reported as "Catalog last update date": the current date, formatted
# as YYYY-MM-DD, evaluated once when this module is imported.
# TODO - extract from the catalog name
catalog_last_update_date = f"{pd.to_datetime('today'):%Y-%m-%d}"

# Metric labels, kept in three groups of five.
_BASIC_METRICS = [
    "Catalog last update date",
    "Unique Polish speech datasets producers",
    "Identified datasets reported in the public domain",
    "Datasets available to the public (free and paid)",
    "Fraction of reported datasets available to the public [%]",
]

_AUDIO_METRICS = [
    "Speech data reported in the public domain [hours]",
    "Speech data available total [hours]",
    "Speech data available free of charge [hours]",
    "Speech data available commercially [hours]",
    "Reported vs available speech data ratio [%]",
]

_TRANSCRIBED_METRICS = [
    "Transcribed speech data reported in the public domain [hours]",
    "Transcribed speech data available total [hours]",
    "Transcribed speech data available free of charge [hours]",
    "Transcribed speech data available commercially [hours]",
    "Reported vs available transcribed speech data ratio [%]",
]

# Flat, ordered list of all metric labels (basic, then audio, then transcribed).
BASE_SUMMARY_METRICS = [*_BASIC_METRICS, *_AUDIO_METRICS, *_TRANSCRIBED_METRICS]
26
+
27
def download_tsv_from_google_sheet(sheet_url, timeout=30):
    """Download one Google Sheets tab as TSV and parse it into a DataFrame.

    Args:
        sheet_url (str): Sheet URL containing ``/edit#gid=<tab id>``; that
            part is rewritten to the TSV export endpoint.
        timeout (float): Seconds to wait for the server before giving up, so
            an unresponsive host cannot hang the app indefinitely.

    Returns:
        pd.DataFrame or None: The parsed sheet, or None when the request
        fails or the server answers with a status other than 200.
    """
    from io import StringIO

    # Modify the Google Sheet URL to export it as TSV
    tsv_url = sheet_url.replace('/edit#gid=', '/export?format=tsv&gid=')

    # Send a GET request to download the TSV file. Network errors follow the
    # same "print and return None" contract as a bad status code.
    try:
        response = requests.get(tsv_url, timeout=timeout)
    except requests.RequestException as err:
        print(f"Failed to download the TSV file: {err}")
        return None

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to download the TSV file (HTTP status {response.status_code}).")
        return None

    # Read the TSV content into a pandas DataFrame
    return pd.read_csv(StringIO(response.text), sep='\t')
44
+
45
@st.cache_data
def load_catalog():
    """Download the speech data catalog tab (gid=0) of the survey spreadsheet.

    The result is cached by ``st.cache_data``. Returns whatever
    ``download_tsv_from_google_sheet`` returns for that tab.
    """
    print("Reading speech data catalog")
    return download_tsv_from_google_sheet(
        "https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=0"
    )
51
+
52
@st.cache_data
def load_taxonomy():
    """Download the taxonomy tab (gid=2015613057) of the survey spreadsheet.

    The result is cached by ``st.cache_data``. Returns whatever
    ``download_tsv_from_google_sheet`` returns for that tab.
    """
    print("Reading speech data survey taxonomy")
    return download_tsv_from_google_sheet(
        "https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057"
    )
58
+
59
def datasets_count_and_size(df_cat, col_groupby, col_sort=None, col_percent=None, col_sum=None, col_count=None):
    """
    Function to generate a summary view of datasets grouped by the given column(s).

    The input dataframe is not modified; numeric coercion happens on a copy.

    Args:
    - df_cat (pd.DataFrame): The base dataframe containing dataset information.
    - col_groupby (str or list): The column(s) to group the datasets by.
    - col_sort (str or list): The column(s) of the result to sort by, descending.
      Defaults to the group-by column(s).
    - col_percent (str or list): The column(s) whose summed values are used to
      calculate 'Percent of total'. None skips the percentage column.
    - col_sum (str or list): The column(s) to sum. Values that cannot be parsed
      as numbers count as 0. Defaults to ['Size audio transcribed [hours]'].
    - col_count (str or list): The column(s) to count; each appears in the result
      as 'Count <column>'. Defaults to ['Dataset ID']; pass [] for no counts.

    Returns:
    - pd.DataFrame: Indexed by the group-by column(s), with columns ordered as
      count column(s), sum column(s), then 'Percent of total' (if requested).
      Sums equal to 0 are shown as 'no-info'.
    """
    # None stands in for the default lists so no mutable default is shared.
    if col_sum is None:
        col_sum = ['Size audio transcribed [hours]']
    if col_count is None:
        col_count = ['Dataset ID']

    # Convert scalar column arguments to lists
    if not isinstance(col_sum, list):
        col_sum = [col_sum]
    if not isinstance(col_count, list):
        col_count = [col_count]
    if not isinstance(col_groupby, list):
        col_groupby = [col_groupby]
    # A plain string here would select a Series, which has no axis=1 to sum over.
    if col_percent is not None and not isinstance(col_percent, list):
        col_percent = [col_percent]

    # Coerce the summed columns to numbers on a copy, so the caller's dataframe
    # keeps its original values (e.g. textual 'no-info' markers).
    df_cat = df_cat.copy()
    for col in col_sum:
        df_cat[col] = pd.to_numeric(df_cat[col], errors='coerce').fillna(0)

    # Aggregating datasets by provided column type
    summary = df_cat.groupby(col_groupby).agg({
        **{col: 'sum' for col in col_sum},
        **{col: 'count' for col in col_count}
    }).reset_index()

    col_name_percent = 'Percent of total'
    if col_percent is not None:
        # Share of each group in the grand total of the selected column(s)
        total = summary[col_percent].sum(axis=1)
        summary[col_name_percent] = (total / total.sum() * 100).round(2)

    # Pre-sort by the first summed column (skipped when nothing is summed)
    if col_sum:
        summary = summary.sort_values(by=col_sum[0], ascending=False)

    # Replacing index with the groupby column
    summary = summary.reset_index(drop=True).set_index(col_groupby)

    # Rename every count column to a more descriptive 'Count <column>' name
    count_renames = {col: 'Count ' + col for col in col_count}
    summary = summary.rename(columns=count_renames)
    col_names_count = list(count_renames.values())
    for col_name_count in col_names_count:
        summary[col_name_count] = summary[col_name_count].astype(int)

    # Column order: count column(s), summed column(s), 'Percent of total'
    ordered_columns = col_names_count + col_sum
    if col_percent is not None:
        ordered_columns = ordered_columns + [col_name_percent]
    summary = summary[ordered_columns]

    # Sort by the provided column col_sort
    col_sort = col_groupby if col_sort is None else col_sort
    summary = summary.sort_values(by=col_sort, ascending=False)

    # Replace 0 with no-info in columns with sum
    for col in col_sum:
        summary[col] = summary[col].replace(0, 'no-info')

    return summary
137
+
138
+
139
def datasets_count_and_size_standard(df_cat, col_groupby):
    """Call ``datasets_count_and_size`` with the standard set of options.

    Sorts by the group-by column(s), computes the percentage over transcribed
    hours, sums transcribed hours, recordings and speakers, and counts
    'Dataset ID'.
    """
    standard_options = {
        'col_sort': col_groupby,
        'col_percent': ['Size audio transcribed [hours]'],
        'col_sum': ['Size audio transcribed [hours]', 'Audio recordings', 'Speakers'],
        'col_count': ['Dataset ID'],
    }
    return datasets_count_and_size(df_cat, col_groupby, **standard_options)
141
+
142
def metadata_coverage(df_cat, df_cat_available_free, df_cat_available_paid):
    """Summarize how many datasets carry each kind of metadata, free vs paid.

    For every tracked metadata column the datasets are grouped with
    ``datasets_count_and_size_standard`` and the row for the value 'yes' is
    kept; when there is no such row, zeros are used instead.

    Args:
    - df_cat (pd.DataFrame): The full catalog.
    - df_cat_available_free (pd.DataFrame): Catalog rows available for free.
    - df_cat_available_paid (pd.DataFrame): Catalog rows available for a fee.

    Returns:
    - tuple: (flat dataframe with one row per metadata column and 'Type'
      of 'Free' or 'Paid', the same data pivoted to compare the two types
      column by column).
    """
    #TODO - add number of speakers and recordings

    size_col = 'Size audio transcribed [hours]'
    count_col = 'Count Dataset ID'
    percent_col = 'Percent of total'

    # Not tracked yet: 'Speaker id info', 'Part of speech annotation',
    # 'Named entity annotation', 'Emotion annotation'
    tracked_columns = ['Gender info', 'Age info', 'Accent info', 'Nativity info', 'Time alignement annotation']

    def yes_row(df_subset, meta_col):
        # Row of the grouped summary for value "yes"; zeros when it is absent.
        grouped = datasets_count_and_size_standard(df_subset, meta_col)
        if 'yes' in grouped.index:
            return grouped.loc['yes']
        return {size_col: 0, count_col: 0, percent_col: 0}

    coverage_paid = {}
    coverage_all = {}
    coverage_free = {}
    for meta_col in tracked_columns:
        coverage_paid[meta_col] = yes_row(df_cat_available_paid, meta_col)
        # The whole-catalog figures are collected but are not part of either
        # returned dataframe.
        coverage_all[meta_col] = yes_row(df_cat, meta_col)
        coverage_free[meta_col] = yes_row(df_cat_available_free, meta_col)

    # One frame per type, indexed by metadata column name
    free_frame = pd.DataFrame.from_dict(coverage_free, orient='index')
    free_frame[count_col] = free_frame[count_col].astype(int)

    paid_frame = pd.DataFrame.from_dict(coverage_paid, orient='index')
    paid_frame[count_col] = paid_frame[count_col].astype(int)

    free_frame['Type'] = 'Free'
    paid_frame['Type'] = 'Paid'
    flat = pd.concat([free_frame, paid_frame])

    # Pivot so free and paid can be compared column by column
    pivot = (
        flat.reset_index()
        .rename(columns={'index': 'Metadata'})
        .pivot(index='Metadata', columns='Type', values=[count_col, size_col, percent_col])
    )
    pivot[count_col] = pivot[count_col].astype(int)

    return flat, pivot
198
+
199
+
200
def catalog_summary_statistics(df_cat):
    """
    Function to generate summary statistics for the speech data catalog.

    The input dataframe is not modified; numeric conversion happens on a copy.

    Args:
    - df_cat (pd.DataFrame): The base dataframe containing dataset information.

    Returns:
    - pd.DataFrame: Indexed by "Metric" (the labels in BASE_SUMMARY_METRICS,
      in that order) with a single "Value" column. Percentages are NaN when
      their denominator is zero.
    """

    col_name_transcribed = 'Size audio transcribed [hours]'
    col_name_audio = 'Size audio total [hours]'

    def _percent(part, whole):
        # Percentage rounded to 2 decimals; NaN instead of a division by zero
        # (e.g. for an empty catalog).
        if whole == 0:
            return float('nan')
        return round((part / whole) * 100, 2)

    # Convert numerical fields to numeric type on a copy, so the caller's
    # dataframe keeps its original values.
    df_cat = df_cat.copy()
    df_cat[col_name_audio] = pd.to_numeric(df_cat[col_name_audio], errors='coerce')
    df_cat[col_name_transcribed] = pd.to_numeric(df_cat[col_name_transcribed], errors='coerce')

    is_available = df_cat['Available online'] == 'yes'
    is_free = df_cat['Price - non-commercial usage'] == 'free'

    # Filter out non-available datasets
    df_cat_available = df_cat[is_available]
    # Available and free
    df_cat_available_free = df_cat[is_available & is_free]
    # Available and paid (any price value other than exactly 'free')
    df_cat_available_paid = df_cat[is_available & ~is_free]

    # Basic Calculations
    identified_datasets_count = df_cat.shape[0]
    accessible_datasets_count = df_cat_available.shape[0]
    unique_producers_count = df_cat['Publisher'].nunique()
    accessible_datasets_fraction = _percent(accessible_datasets_count, identified_datasets_count)

    # Total audio available and other dependent calculations
    audio_reported = round(df_cat[col_name_audio].sum(), 2)
    audio_accessible = round(df_cat_available[col_name_audio].sum(), 2)
    audio_accessible_free = round(df_cat_available_free[col_name_audio].sum(), 2)
    audio_accessible_paid = round(df_cat_available_paid[col_name_audio].sum(), 2)

    transcribed_audio_reported = round(df_cat[col_name_transcribed].sum(), 2)
    transcribed_audio_accessible = round(df_cat_available[col_name_transcribed].sum(), 2)
    transcribed_audio_accessible_free = round(df_cat_available_free[col_name_transcribed].sum(), 2)
    transcribed_audio_accessible_paid = round(df_cat_available_paid[col_name_transcribed].sum(), 2)

    # available vs Reported Speech Material Ratio
    accessible_vs_reported_audio_ratio = _percent(audio_accessible, audio_reported)
    accessible_vs_reported_transcribed_ratio = _percent(transcribed_audio_accessible, transcribed_audio_reported)

    # Finalizing the metrics dictionary. The values must stay in the same
    # order as the labels in BASE_SUMMARY_METRICS.
    metrics_dict = {
        "Metric": BASE_SUMMARY_METRICS,
        "Value": [
            catalog_last_update_date,
            unique_producers_count,
            identified_datasets_count,
            accessible_datasets_count,
            accessible_datasets_fraction,
            audio_reported,
            audio_accessible,
            audio_accessible_free,
            audio_accessible_paid,
            accessible_vs_reported_audio_ratio,
            transcribed_audio_reported,
            transcribed_audio_accessible,
            transcribed_audio_accessible_free,
            transcribed_audio_accessible_paid,
            accessible_vs_reported_transcribed_ratio,
        ]
    }

    # Convert the dictionary into a DataFrame indexed by metric label
    return pd.DataFrame(metrics_dict).set_index("Metric")