Spaces:
Running
Running
mj-new
committed on
Commit
·
3533dd6
1
Parent(s):
25f0e74
Added support for datasets without secret test split
Browse files- app.py +101 -33
- reports/amu-cai/pl-asr-bigos-v2-diagnostic/dataset_contents.json +0 -0
- reports/amu-cai/pl-asr-bigos-v2-diagnostic/dataset_statistics.json +1 -0
- reports/amu-cai/pl-asr-bigos-v2/dataset_contents.json +1 -1
- reports/amu-cai/pl-asr-bigos-v2/dataset_statistics.json +2 -2
- reports/pelcra/pl-asr-pelcra-for-bigos/dataset_contents.json +1 -1
- reports/pelcra/pl-asr-pelcra-for-bigos/dataset_statistics.json +2 -2
- run-analysis.py +14 -6
- utils.py +64 -18
app.py
CHANGED
@@ -18,8 +18,17 @@ from datasets import get_dataset_config_names
|
|
18 |
# TODO - compare the datasets
|
19 |
|
20 |
st.set_page_config(layout="wide")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
-
|
|
|
23 |
#analysis_bigos_diagnostic
|
24 |
#########################################BIGOS################################################
|
25 |
with about:
|
@@ -30,7 +39,6 @@ with about:
|
|
30 |
|
31 |
with analysis_bigos:
|
32 |
dataset_name = "amu-cai/pl-asr-bigos-v2"
|
33 |
-
#dataset_secret = "amu-cai/pl-asr-bigos-v2-secret"
|
34 |
dataset_short_name = "BIGOS"
|
35 |
dataset_version = "V2"
|
36 |
|
@@ -51,30 +59,93 @@ with analysis_bigos:
|
|
51 |
|
52 |
|
53 |
st.header("Dataset level metrics")
|
54 |
-
metrics_size = ["samples", "audio[h]", "speakers", "words", "chars"]
|
55 |
df_sum_stats_agg = extract_stats_to_agg(df_multindex_for_agg, metrics_size)
|
56 |
|
57 |
# split dataframe into separate dataframes for easier analysis and visualization
|
58 |
-
st.subheader("
|
59 |
-
df_sum_stats_audio = df_sum_stats_agg[
|
60 |
st.dataframe(df_sum_stats_audio)
|
61 |
|
62 |
-
st.subheader("
|
63 |
-
df_sum_stats_text = df_sum_stats_agg[
|
64 |
st.dataframe(df_sum_stats_text)
|
65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
|
67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
|
69 |
df_sum_stats_all_splits = extract_stats_all_splits(df_multindex_all_splits, metrics_features)
|
70 |
|
71 |
-
st.subheader("
|
72 |
-
df_sum_stats_feats_text = df_sum_stats_all_splits[
|
73 |
st.dataframe(df_sum_stats_feats_text)
|
74 |
|
75 |
-
st.subheader("
|
76 |
-
|
77 |
-
st.dataframe(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
79 |
st.header("BIGOS subsets (source datasets) cards")
|
80 |
for subset in dataset_configs:
|
@@ -84,14 +155,11 @@ with analysis_bigos:
|
|
84 |
df_metrics_subset_features = extract_stats_for_dataset_card(df_multindex_for_agg, subset, metrics_features, add_total=False)
|
85 |
st.dataframe(df_metrics_subset_features)
|
86 |
|
87 |
-
|
88 |
|
89 |
#########################################PELCRA################################################
|
90 |
with analysis_bigos_pelcra:
|
91 |
|
92 |
dataset_name = "pelcra/pl-asr-pelcra-for-bigos"
|
93 |
-
#dataset_secret = "pelcra/pl-asr-pelcra-for-bigos-secret"
|
94 |
-
|
95 |
dataset_short_name = "PELCRA"
|
96 |
|
97 |
# local version with granted gated access
|
@@ -113,35 +181,35 @@ with analysis_bigos_pelcra:
|
|
113 |
# extract metrics from dictionary and convert to various dataframes for easier analysis and visualization
|
114 |
#st.header("Summary statistics")
|
115 |
|
116 |
-
|
117 |
st.header("Dataset level metrics")
|
118 |
-
metrics_size = ["samples", "audio[h]", "speakers", "words", "chars"]
|
119 |
df_sum_stats_agg = extract_stats_to_agg(df_multindex_for_agg, metrics_size)
|
120 |
|
121 |
-
#st.dataframe(df_sum_stats_agg)
|
122 |
-
#print(df_sum_stats.columns)
|
123 |
-
|
124 |
# split dataframe into separate dataframes for easier analysis and visualization
|
125 |
-
st.subheader("
|
126 |
-
df_sum_stats_audio = df_sum_stats_agg[
|
127 |
st.dataframe(df_sum_stats_audio)
|
128 |
|
129 |
-
st.subheader("
|
130 |
-
df_sum_stats_text = df_sum_stats_agg[
|
131 |
st.dataframe(df_sum_stats_text)
|
132 |
|
133 |
-
|
134 |
-
metrics_features = ["utts_unique", "words_unique", "chars_unique", "words_per_sec", "chars_per_sec", "average_audio_duration[s]"]
|
135 |
-
|
136 |
df_sum_stats_all_splits = extract_stats_all_splits(df_multindex_all_splits, metrics_features)
|
137 |
|
138 |
-
st.subheader("
|
139 |
-
df_sum_stats_feats_text = df_sum_stats_all_splits[
|
140 |
st.dataframe(df_sum_stats_feats_text)
|
141 |
|
142 |
-
st.subheader("
|
143 |
-
|
144 |
-
st.dataframe(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
|
146 |
st.header("BIGOS subsets (source datasets) cards")
|
147 |
for subset in dataset_configs:
|
|
|
18 |
# TODO - compare the datasets
|
19 |
|
20 |
st.set_page_config(layout="wide")
|
21 |
+
metrics_size_audio = ["samples", "audio[h]", "speakers"]
|
22 |
+
metrics_size_text = ["samples", "words", "chars"]
|
23 |
+
metrics_size = metrics_size_audio + metrics_size_text
|
24 |
+
metrics_features_text_uniq = ["utts_unique", "words_unique", "chars_unique"]
|
25 |
+
metrics_features_speech_rate = ["words_per_sec", "chars_per_sec"]
|
26 |
+
metrics_features_duration = ["average_audio_duration[s]", "average_utterance_length[words]", "average_utterance_length[chars]"]
|
27 |
+
metrics_features_meta = ["meta_cov_sex", "meta_cov_age"]
|
28 |
+
metrics_features = metrics_features_text_uniq + metrics_features_speech_rate + metrics_features_duration + metrics_features_meta
|
29 |
|
30 |
+
|
31 |
+
about, analysis_bigos, analysis_bigos_diagnostic, analysis_bigos_pelcra = st.tabs(["About BIGOS datasets", "BIGOS V2 analysis", "BIGOS V2 diagnostic", "PELCRA for BIGOS analysis"])
|
32 |
#analysis_bigos_diagnostic
|
33 |
#########################################BIGOS################################################
|
34 |
with about:
|
|
|
39 |
|
40 |
with analysis_bigos:
|
41 |
dataset_name = "amu-cai/pl-asr-bigos-v2"
|
|
|
42 |
dataset_short_name = "BIGOS"
|
43 |
dataset_version = "V2"
|
44 |
|
|
|
59 |
|
60 |
|
61 |
st.header("Dataset level metrics")
|
|
|
62 |
df_sum_stats_agg = extract_stats_to_agg(df_multindex_for_agg, metrics_size)
|
63 |
|
64 |
# split dataframe into separate dataframes for easier analysis and visualization
|
65 |
+
st.subheader("Audio content size")
|
66 |
+
df_sum_stats_audio = df_sum_stats_agg[metrics_size_audio]
|
67 |
st.dataframe(df_sum_stats_audio)
|
68 |
|
69 |
+
st.subheader("Text content size")
|
70 |
+
df_sum_stats_text = df_sum_stats_agg[metrics_size_text]
|
71 |
st.dataframe(df_sum_stats_text)
|
72 |
|
73 |
+
df_sum_stats_all_splits = extract_stats_all_splits(df_multindex_all_splits, metrics_features)
|
74 |
+
|
75 |
+
st.subheader("Utterances, vocabulary and alphabet space")
|
76 |
+
df_sum_stats_feats_text = df_sum_stats_all_splits[metrics_features_text_uniq]
|
77 |
+
st.dataframe(df_sum_stats_feats_text)
|
78 |
+
|
79 |
+
st.subheader("Speech rates")
|
80 |
+
df_sum_stats_feats_speech_rate= df_sum_stats_all_splits[metrics_features_speech_rate]
|
81 |
+
st.dataframe(df_sum_stats_feats_speech_rate)
|
82 |
+
|
83 |
+
st.subheader("Average utterance lengths and audio duration")
|
84 |
+
df_sum_stats_feats_durations = df_sum_stats_all_splits[metrics_features_duration]
|
85 |
+
st.dataframe(df_sum_stats_feats_durations)
|
86 |
+
|
87 |
+
st.subheader("Metadata coverage")
|
88 |
+
df_sum_stats_feats_meta = df_sum_stats_all_splits[metrics_features_meta]
|
89 |
+
st.dataframe(df_sum_stats_feats_meta)
|
90 |
+
|
91 |
+
st.header("BIGOS subsets (source datasets) cards")
|
92 |
+
for subset in dataset_configs:
|
93 |
+
st.subheader("Dataset card for: {}".format(subset))
|
94 |
+
df_metrics_subset_size = extract_stats_for_dataset_card(df_multindex_for_agg, subset, metrics_size, add_total=True)
|
95 |
+
st.dataframe(df_metrics_subset_size)
|
96 |
+
df_metrics_subset_features = extract_stats_for_dataset_card(df_multindex_for_agg, subset, metrics_features, add_total=False)
|
97 |
+
st.dataframe(df_metrics_subset_features)
|
98 |
+
|
99 |
+
with analysis_bigos_diagnostic:
|
100 |
+
dataset_name = "amu-cai/pl-asr-bigos-v2-diagnostic"
|
101 |
+
dataset_short_name = "BIGOS diagnostic"
|
102 |
+
dataset_version = "V2"
|
103 |
+
|
104 |
+
dataset_configs = get_dataset_config_names(dataset_name,trust_remote_code=True)
|
105 |
+
# remove "all" subset, which is always the last config type
|
106 |
+
dataset_configs.pop()
|
107 |
+
print(dataset_configs)
|
108 |
+
# read the reports for the public dataset (this dataset has no secret test split)
|
109 |
+
[stats_dict_public, contents_dict_public] = read_reports(dataset_name)
|
110 |
+
|
111 |
+
# no secret dataset here: test split metrics are not updated from secret statistics (the call below is disabled)
|
112 |
+
#stats_dict_public = add_test_split_stats_from_secret_dataset(stats_dict_public, stats_dict_secret)
|
113 |
+
df_multindex_for_agg = dict_to_multindex_df(stats_dict_public, all_splits=False)
|
114 |
+
df_multindex_all_splits = dict_to_multindex_df(stats_dict_public, all_splits=True)
|
115 |
+
|
116 |
+
# extract metrics from dictionary and convert to various dataframes for easier analysis and visualization
|
117 |
+
#st.header("Summary statistics")
|
118 |
|
119 |
+
|
120 |
+
st.header("Dataset level metrics")
|
121 |
+
df_sum_stats_agg = extract_stats_to_agg(df_multindex_for_agg, metrics_size)
|
122 |
+
|
123 |
+
# split dataframe into separate dataframes for easier analysis and visualization
|
124 |
+
st.subheader("Audio content size")
|
125 |
+
df_sum_stats_audio = df_sum_stats_agg[metrics_size_audio]
|
126 |
+
st.dataframe(df_sum_stats_audio)
|
127 |
+
|
128 |
+
st.subheader("Text content size")
|
129 |
+
df_sum_stats_text = df_sum_stats_agg[metrics_size_text]
|
130 |
+
st.dataframe(df_sum_stats_text)
|
131 |
|
132 |
df_sum_stats_all_splits = extract_stats_all_splits(df_multindex_all_splits, metrics_features)
|
133 |
|
134 |
+
st.subheader("Utterances, vocabulary and alphabet space")
|
135 |
+
df_sum_stats_feats_text = df_sum_stats_all_splits[metrics_features_text_uniq]
|
136 |
st.dataframe(df_sum_stats_feats_text)
|
137 |
|
138 |
+
st.subheader("Speech rates")
|
139 |
+
df_sum_stats_feats_speech_rate= df_sum_stats_all_splits[metrics_features_speech_rate]
|
140 |
+
st.dataframe(df_sum_stats_feats_speech_rate)
|
141 |
+
|
142 |
+
st.subheader("Average utterance lengths and audio duration")
|
143 |
+
df_sum_stats_feats_durations = df_sum_stats_all_splits[metrics_features_duration]
|
144 |
+
st.dataframe(df_sum_stats_feats_durations)
|
145 |
+
|
146 |
+
st.subheader("Metadata coverage")
|
147 |
+
df_sum_stats_feats_meta = df_sum_stats_all_splits[metrics_features_meta]
|
148 |
+
st.dataframe(df_sum_stats_feats_meta)
|
149 |
|
150 |
st.header("BIGOS subsets (source datasets) cards")
|
151 |
for subset in dataset_configs:
|
|
|
155 |
df_metrics_subset_features = extract_stats_for_dataset_card(df_multindex_for_agg, subset, metrics_features, add_total=False)
|
156 |
st.dataframe(df_metrics_subset_features)
|
157 |
|
|
|
158 |
|
159 |
#########################################PELCRA################################################
|
160 |
with analysis_bigos_pelcra:
|
161 |
|
162 |
dataset_name = "pelcra/pl-asr-pelcra-for-bigos"
|
|
|
|
|
163 |
dataset_short_name = "PELCRA"
|
164 |
|
165 |
# local version with granted gated access
|
|
|
181 |
# extract metrics from dictionary and convert to various dataframes for easier analysis and visualization
|
182 |
#st.header("Summary statistics")
|
183 |
|
|
|
184 |
st.header("Dataset level metrics")
|
|
|
185 |
df_sum_stats_agg = extract_stats_to_agg(df_multindex_for_agg, metrics_size)
|
186 |
|
|
|
|
|
|
|
187 |
# split dataframe into separate dataframes for easier analysis and visualization
|
188 |
+
st.subheader("Audio content size")
|
189 |
+
df_sum_stats_audio = df_sum_stats_agg[metrics_size_audio]
|
190 |
st.dataframe(df_sum_stats_audio)
|
191 |
|
192 |
+
st.subheader("Text content size")
|
193 |
+
df_sum_stats_text = df_sum_stats_agg[metrics_size_text]
|
194 |
st.dataframe(df_sum_stats_text)
|
195 |
|
|
|
|
|
|
|
196 |
df_sum_stats_all_splits = extract_stats_all_splits(df_multindex_all_splits, metrics_features)
|
197 |
|
198 |
+
st.subheader("Utterances, vocabulary and alphabet space")
|
199 |
+
df_sum_stats_feats_text = df_sum_stats_all_splits[metrics_features_text_uniq]
|
200 |
st.dataframe(df_sum_stats_feats_text)
|
201 |
|
202 |
+
st.subheader("Speech rates")
|
203 |
+
df_sum_stats_feats_speech_rate= df_sum_stats_all_splits[metrics_features_speech_rate]
|
204 |
+
st.dataframe(df_sum_stats_feats_speech_rate)
|
205 |
+
|
206 |
+
st.subheader("Average utterance lengths and audio duration")
|
207 |
+
df_sum_stats_feats_durations = df_sum_stats_all_splits[metrics_features_duration]
|
208 |
+
st.dataframe(df_sum_stats_feats_durations)
|
209 |
+
|
210 |
+
st.subheader("Metadata coverage")
|
211 |
+
df_sum_stats_feats_meta = df_sum_stats_all_splits[metrics_features_meta]
|
212 |
+
st.dataframe(df_sum_stats_feats_meta)
|
213 |
|
214 |
st.header("BIGOS subsets (source datasets) cards")
|
215 |
for subset in dataset_configs:
|
reports/amu-cai/pl-asr-bigos-v2-diagnostic/dataset_contents.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
reports/amu-cai/pl-asr-bigos-v2-diagnostic/dataset_statistics.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"pjatk-clarin_mobile-15": {"samples": {"test": 45, "train": 47, "validation": 45, "all_splits": 137}, "audio[h]": {"test": 0.16, "train": 0.16, "validation": 0.16, "all_splits": 0.48}, "speakers": {"test": 11, "train": 34, "validation": 10, "all_splits": 55}, "words": {"test": 1194, "train": 1268, "validation": 1203, "all_splits": 3665}, "chars": {"test": 8027, "train": 8258, "validation": 8109, "all_splits": 24394}, "utts_unique": {"test": 45, "train": 47, "validation": 45, "all_splits": 137}, "words_unique": {"test": 809, "train": 856, "validation": 809, "all_splits": 2161}, "chars_unique": {"test": 33, "train": 33, "validation": 33, "all_splits": 32}, "average_utterance_length[words]": {"test": 26.53, "train": 26.98, "validation": 26.73, "all_splits": 26.75}, "average_utterance_length[chars]": {"test": 178.38, "train": 175.7, "validation": 180.2, "all_splits": 178.06}, "samples_per_spk_stats": {"test": {"average": 4.09, "std": 2.07, "median": 4.0, "min": 1, "max": 7}, "train": {"average": 1.38, "std": 0.59, "median": 1.0, "min": 1, "max": 3}, "validation": {"average": 4.5, "std": 2.29, "median": 4.5, "min": 1, "max": 9}, "all_splits": {"average": 2.49, "std": 2.01, "median": 2.0, "min": 1, "max": 9}}, "words_per_sec": {"test": 2.06, "train": 2.17, "validation": 2.05, "all_splits": 2.09}, "chars_per_sec": {"test": 11.79, "train": 11.95, "validation": 11.78, "all_splits": 11.84}, "average_audio_duration[s]": {"test": 12.88, "train": 12.45, "validation": 13.03, "all_splits": 12.78}, "meta_cov_sex": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_cov_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_sex": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}}, "pjatk-clarin_studio-15": {"samples": {"test": 42, "train": 44, "validation": 40, "all_splits": 126}, "audio[h]": {"test": 
0.16, "train": 0.17, "validation": 0.17, "all_splits": 0.5}, "speakers": {"test": 32, "train": 43, "validation": 31, "all_splits": 106}, "words": {"test": 1660, "train": 1798, "validation": 1756, "all_splits": 5214}, "chars": {"test": 6549, "train": 6937, "validation": 6896, "all_splits": 20382}, "utts_unique": {"test": 42, "train": 44, "validation": 40, "all_splits": 126}, "words_unique": {"test": 593, "train": 636, "validation": 618, "all_splits": 1638}, "chars_unique": {"test": 34, "train": 34, "validation": 35, "all_splits": 35}, "average_utterance_length[words]": {"test": 39.52, "train": 40.86, "validation": 43.9, "all_splits": 41.38}, "average_utterance_length[chars]": {"test": 155.93, "train": 157.66, "validation": 172.4, "all_splits": 161.76}, "samples_per_spk_stats": {"test": {"average": 1.31, "std": 0.58, "median": 1.0, "min": 1, "max": 3}, "train": {"average": 1.02, "std": 0.15, "median": 1.0, "min": 1, "max": 2}, "validation": {"average": 1.29, "std": 0.52, "median": 1.0, "min": 1, "max": 3}, "all_splits": {"average": 1.19, "std": 0.46, "median": 1.0, "min": 1, "max": 3}}, "words_per_sec": {"test": 2.81, "train": 3.02, "validation": 2.96, "all_splits": 2.93}, "chars_per_sec": {"test": 8.29, "train": 8.63, "validation": 8.65, "all_splits": 8.52}, "average_audio_duration[s]": {"test": 14.04, "train": 13.54, "validation": 14.86, "all_splits": 14.12}, "meta_cov_sex": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_cov_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_sex": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}}, "mailabs-corpus_librivox-19": {"samples": {"test": 42, "train": 48, "validation": 47, "all_splits": 137}, "audio[h]": {"test": 0.09, "train": 0.1, "validation": 0.09, "all_splits": 0.28}, "speakers": {"test": 32, "train": 34, "validation": 35, 
"all_splits": 101}, "words": {"test": 687, "train": 798, "validation": 721, "all_splits": 2206}, "chars": {"test": 4416, "train": 5230, "validation": 4691, "all_splits": 14337}, "utts_unique": {"test": 42, "train": 48, "validation": 47, "all_splits": 137}, "words_unique": {"test": 524, "train": 560, "validation": 539, "all_splits": 1412}, "chars_unique": {"test": 66, "train": 61, "validation": 69, "all_splits": 71}, "average_utterance_length[words]": {"test": 16.36, "train": 16.62, "validation": 15.34, "all_splits": 16.1}, "average_utterance_length[chars]": {"test": 105.14, "train": 108.96, "validation": 99.81, "all_splits": 104.65}, "samples_per_spk_stats": {"test": {"average": 1.31, "std": 0.53, "median": 1.0, "min": 1, "max": 3}, "train": {"average": 1.41, "std": 0.65, "median": 1.0, "min": 1, "max": 3}, "validation": {"average": 1.34, "std": 0.67, "median": 1.0, "min": 1, "max": 4}, "all_splits": {"average": 1.36, "std": 0.62, "median": 1.0, "min": 1, "max": 4}}, "words_per_sec": {"test": 2.22, "train": 2.18, "validation": 2.16, "all_splits": 2.18}, "chars_per_sec": {"test": 12.04, "train": 12.12, "validation": 11.87, "all_splits": 12.01}, "average_audio_duration[s]": {"test": 7.37, "train": 7.62, "validation": 7.12, "all_splits": 7.37}, "meta_cov_sex": {"test": 100.0, "train": 100.0, "validation": 100.0, "all_splits": 100.0}, "meta_cov_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_sex": {"test": {"male": 0.02, "female": 0.0}, "train": {"male": 0.0, "female": 0.0}, "validation": {"male": 0.0, "female": 0.09}, "all_splits": {"male": 0.2, "female": 0.8}}, "meta_dist_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}}, "pwr-azon_read-20": {"samples": {"test": 77, "train": 83, "validation": 95, "all_splits": 255}, "audio[h]": {"test": 0.17, "train": 0.17, "validation": 0.16, "all_splits": 0.5}, "speakers": {"test": 6, "train": 17, "validation": 4, "all_splits": 27}, "words": {"test": 780, 
"train": 805, "validation": 875, "all_splits": 2460}, "chars": {"test": 6744, "train": 6778, "validation": 7516, "all_splits": 21038}, "utts_unique": {"test": 73, "train": 82, "validation": 93, "all_splits": 239}, "words_unique": {"test": 589, "train": 618, "validation": 657, "all_splits": 1612}, "chars_unique": {"test": 33, "train": 33, "validation": 33, "all_splits": 32}, "average_utterance_length[words]": {"test": 10.13, "train": 9.7, "validation": 9.21, "all_splits": 9.65}, "average_utterance_length[chars]": {"test": 87.58, "train": 81.66, "validation": 79.12, "all_splits": 82.5}, "samples_per_spk_stats": {"test": {"average": 12.83, "std": 1.57, "median": 13.0, "min": 11, "max": 15}, "train": {"average": 4.88, "std": 2.0, "median": 5.0, "min": 1, "max": 8}, "validation": {"average": 23.75, "std": 6.61, "median": 23.5, "min": 15, "max": 33}, "all_splits": {"average": 9.44, "std": 7.45, "median": 7.0, "min": 1, "max": 33}}, "words_per_sec": {"test": 1.31, "train": 1.35, "validation": 1.48, "all_splits": 1.38}, "chars_per_sec": {"test": 10.04, "train": 10.0, "validation": 11.24, "all_splits": 10.42}, "average_audio_duration[s]": {"test": 7.72, "train": 7.2, "validation": 6.22, "all_splits": 6.99}, "meta_cov_sex": {"test": 100.0, "train": 100.0, "validation": 100.0, "all_splits": 100.0}, "meta_cov_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_sex": {"test": {"male": 0.14, "female": 0.86}, "train": {"male": 0.28, "female": 0.72}, "validation": {"male": 0.51, "female": 0.49}, "all_splits": {"male": 0.32, "female": 0.68}}, "meta_dist_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}}, "pwr-azon_spont-20": {"samples": {"test": 39, "train": 30, "validation": 31, "all_splits": 100}, "audio[h]": {"test": 0.17, "train": 0.16, "validation": 0.16, "all_splits": 0.49}, "speakers": {"test": 2, "train": 12, "validation": 2, "all_splits": 16}, "words": {"test": 1305, "train": 1190, "validation": 1711, 
"all_splits": 4206}, "chars": {"test": 8491, "train": 7791, "validation": 10497, "all_splits": 26779}, "utts_unique": {"test": 39, "train": 30, "validation": 31, "all_splits": 100}, "words_unique": {"test": 598, "train": 651, "validation": 724, "all_splits": 1646}, "chars_unique": {"test": 33, "train": 33, "validation": 33, "all_splits": 32}, "average_utterance_length[words]": {"test": 33.46, "train": 39.67, "validation": 55.19, "all_splits": 42.06}, "average_utterance_length[chars]": {"test": 217.72, "train": 259.7, "validation": 338.61, "all_splits": 267.79}, "samples_per_spk_stats": {"test": {"average": 19.5, "std": 7.5, "median": 19.5, "min": 12, "max": 27}, "train": {"average": 2.5, "std": 1.5, "median": 2.0, "min": 1, "max": 5}, "validation": {"average": 15.5, "std": 6.5, "median": 15.5, "min": 9, "max": 22}, "all_splits": {"average": 6.25, "std": 7.56, "median": 4.0, "min": 1, "max": 27}}, "words_per_sec": {"test": 2.18, "train": 2.03, "validation": 2.88, "all_splits": 2.36}, "chars_per_sec": {"test": 11.98, "train": 11.24, "validation": 14.81, "all_splits": 12.68}, "average_audio_duration[s]": {"test": 15.38, "train": 19.57, "validation": 19.13, "all_splits": 17.8}, "meta_cov_sex": {"test": 100.0, "train": 100.0, "validation": 100.0, "all_splits": 100.0}, "meta_cov_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_sex": {"test": {"male": 0.31, "female": 0.69}, "train": {"male": 0.53, "female": 0.47}, "validation": {"male": 1.0, "female": 0.0}, "all_splits": {"male": 0.59, "female": 0.41}}, "meta_dist_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}}, "pwr-maleset-unk": {"samples": {"test": 116, "train": 111, "validation": 118, "all_splits": 345}, "audio[h]": {"test": 0.16, "train": 0.16, "validation": 0.16, "all_splits": 0.48}, "speakers": {"test": 1, "train": 1, "validation": 1, "all_splits": 3}, "words": {"test": 1030, "train": 957, "validation": 972, "all_splits": 2959}, "chars": 
{"test": 7223, "train": 6779, "validation": 6467, "all_splits": 20469}, "utts_unique": {"test": 116, "train": 111, "validation": 117, "all_splits": 340}, "words_unique": {"test": 750, "train": 710, "validation": 692, "all_splits": 1866}, "chars_unique": {"test": 41, "train": 46, "validation": 49, "all_splits": 52}, "average_utterance_length[words]": {"test": 8.88, "train": 8.62, "validation": 8.24, "all_splits": 8.58}, "average_utterance_length[chars]": {"test": 62.27, "train": 61.07, "validation": 54.81, "all_splits": 59.33}, "samples_per_spk_stats": {"test": {"average": 116.0, "std": 0.0, "median": 116.0, "min": 116, "max": 116}, "train": {"average": 111.0, "std": 0.0, "median": 111.0, "min": 111, "max": 111}, "validation": {"average": 118.0, "std": 0.0, "median": 118.0, "min": 118, "max": 118}, "all_splits": {"average": 115.0, "std": 2.94, "median": 116.0, "min": 111, "max": 118}}, "words_per_sec": {"test": 1.73, "train": 1.68, "validation": 1.73, "all_splits": 1.71}, "chars_per_sec": {"test": 10.43, "train": 10.21, "validation": 9.79, "all_splits": 10.15}, "average_audio_duration[s]": {"test": 5.12, "train": 5.14, "validation": 4.76, "all_splits": 5.0}, "meta_cov_sex": {"test": 100.0, "train": 100.0, "validation": 100.0, "all_splits": 100.0}, "meta_cov_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_sex": {"test": {"male": 1.0, "female": 0.0}, "train": {"male": 1.0, "female": 0.0}, "validation": {"male": 1.0, "female": 0.0}, "all_splits": {"male": 1.0, "female": 0.0}}, "meta_dist_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}}, "pwr-shortwords-unk": {"samples": {"test": 86, "train": 102, "validation": 82, "all_splits": 270}, "audio[h]": {"test": 0.13, "train": 0.16, "validation": 0.12, "all_splits": 0.41000000000000003}, "speakers": {"test": 1, "train": 1, "validation": 1, "all_splits": 3}, "words": {"test": 822, "train": 984, "validation": 755, "all_splits": 2561}, "chars": {"test": 
5649, "train": 6727, "validation": 5224, "all_splits": 17600}, "utts_unique": {"test": 84, "train": 92, "validation": 78, "all_splits": 232}, "words_unique": {"test": 585, "train": 644, "validation": 543, "all_splits": 1458}, "chars_unique": {"test": 37, "train": 47, "validation": 41, "all_splits": 48}, "average_utterance_length[words]": {"test": 9.56, "train": 9.65, "validation": 9.21, "all_splits": 9.49}, "average_utterance_length[chars]": {"test": 65.69, "train": 65.95, "validation": 63.71, "all_splits": 65.19}, "samples_per_spk_stats": {"test": {"average": 86.0, "std": 0.0, "median": 86.0, "min": 86, "max": 86}, "train": {"average": 102.0, "std": 0.0, "median": 102.0, "min": 102, "max": 102}, "validation": {"average": 82.0, "std": 0.0, "median": 82.0, "min": 82, "max": 82}, "all_splits": {"average": 90.0, "std": 8.64, "median": 86.0, "min": 82, "max": 102}}, "words_per_sec": {"test": 1.76, "train": 1.74, "validation": 1.75, "all_splits": 1.75}, "chars_per_sec": {"test": 10.33, "train": 10.13, "validation": 10.34, "all_splits": 10.26}, "average_audio_duration[s]": {"test": 5.43, "train": 5.56, "validation": 5.27, "all_splits": 5.43}, "meta_cov_sex": {"test": 100.0, "train": 100.0, "validation": 100.0, "all_splits": 100.0}, "meta_cov_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_sex": {"test": {"male": 1.0, "female": 0.0}, "train": {"male": 1.0, "female": 0.0}, "validation": {"male": 1.0, "female": 0.0}, "all_splits": {"male": 1.0, "female": 0.0}}, "meta_dist_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}}, "pwr-viu-unk": {"samples": {"test": 247, "train": 382, "validation": 266, "all_splits": 895}, "audio[h]": {"test": 0.09, "train": 0.15, "validation": 0.1, "all_splits": 0.33999999999999997}, "speakers": {"test": 1, "train": 1, "validation": 1, "all_splits": 3}, "words": {"test": 438, "train": 691, "validation": 457, "all_splits": 1586}, "chars": {"test": 2785, "train": 4470, 
"validation": 3012, "all_splits": 10267}, "utts_unique": {"test": 13, "train": 13, "validation": 13, "all_splits": 13}, "words_unique": {"test": 18, "train": 18, "validation": 18, "all_splits": 18}, "chars_unique": {"test": 28, "train": 28, "validation": 28, "all_splits": 27}, "average_utterance_length[words]": {"test": 1.77, "train": 1.81, "validation": 1.72, "all_splits": 1.77}, "average_utterance_length[chars]": {"test": 11.28, "train": 11.7, "validation": 11.32, "all_splits": 11.47}, "samples_per_spk_stats": {"test": {"average": 247.0, "std": 0.0, "median": 247.0, "min": 247, "max": 247}, "train": {"average": 382.0, "std": 0.0, "median": 382.0, "min": 382, "max": 382}, "validation": {"average": 266.0, "std": 0.0, "median": 266.0, "min": 266, "max": 266}, "all_splits": {"average": 298.33, "std": 59.67, "median": 266.0, "min": 247, "max": 382}}, "words_per_sec": {"test": 1.29, "train": 1.27, "validation": 1.25, "all_splits": 1.27}, "chars_per_sec": {"test": 6.92, "train": 6.95, "validation": 7.01, "all_splits": 6.96}, "average_audio_duration[s]": {"test": 1.37, "train": 1.42, "validation": 1.37, "all_splits": 1.39}, "meta_cov_sex": {"test": 100.0, "train": 100.0, "validation": 100.0, "all_splits": 100.0}, "meta_cov_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_sex": {"test": {"male": 1.0, "female": 0.0}, "train": {"male": 1.0, "female": 0.0}, "validation": {"male": 1.0, "female": 0.0}, "all_splits": {"male": 1.0, "female": 0.0}}, "meta_dist_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}}, "google-fleurs-22": {"samples": {"test": 56, "train": 47, "validation": 62, "all_splits": 165}, "audio[h]": {"test": 0.17, "train": 0.16, "validation": 0.16, "all_splits": 0.49}, "speakers": {"test": 1, "train": 1, "validation": 1, "all_splits": 3}, "words": {"test": 1149, "train": 930, "validation": 1100, "all_splits": 3179}, "chars": {"test": 8120, "train": 6479, "validation": 7817, "all_splits": 
22416}, "utts_unique": {"test": 53, "train": 47, "validation": 55, "all_splits": 155}, "words_unique": {"test": 769, "train": 685, "validation": 705, "all_splits": 1908}, "chars_unique": {"test": 49, "train": 47, "validation": 45, "all_splits": 51}, "average_utterance_length[words]": {"test": 20.52, "train": 19.79, "validation": 17.74, "all_splits": 19.27}, "average_utterance_length[chars]": {"test": 145.0, "train": 137.85, "validation": 126.08, "all_splits": 135.85}, "samples_per_spk_stats": {"test": {"average": 56.0, "std": 0.0, "median": 56.0, "min": 56, "max": 56}, "train": {"average": 47.0, "std": 0.0, "median": 47.0, "min": 47, "max": 47}, "validation": {"average": 62.0, "std": 0.0, "median": 62.0, "min": 62, "max": 62}, "all_splits": {"average": 55.0, "std": 6.16, "median": 56.0, "min": 47, "max": 62}}, "words_per_sec": {"test": 1.93, "train": 1.6, "validation": 1.9, "all_splits": 1.81}, "chars_per_sec": {"test": 11.74, "train": 9.54, "validation": 11.63, "all_splits": 10.97}, "average_audio_duration[s]": {"test": 10.61, "train": 12.38, "validation": 9.32, "all_splits": 10.63}, "meta_cov_sex": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_cov_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_sex": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}}, "polyai-minds14-21": {"samples": {"test": 24, "train": 26, "validation": 34, "all_splits": 84}, "audio[h]": {"test": 0.15, "train": 0.15, "validation": 0.16, "all_splits": 0.45999999999999996}, "speakers": {"test": 3, "train": 3, "validation": 3, "all_splits": 9}, "words": {"test": 456, "train": 524, "validation": 605, "all_splits": 1585}, "chars": {"test": 2874, "train": 3315, "validation": 3849, "all_splits": 10038}, "utts_unique": {"test": 24, "train": 26, "validation": 34, "all_splits": 84}, "words_unique": {"test": 253, 
"train": 274, "validation": 294, "all_splits": 551}, "chars_unique": {"test": 44, "train": 47, "validation": 49, "all_splits": 52}, "average_utterance_length[words]": {"test": 19.0, "train": 20.15, "validation": 17.79, "all_splits": 18.87}, "average_utterance_length[chars]": {"test": 119.75, "train": 127.5, "validation": 113.21, "all_splits": 119.5}, "samples_per_spk_stats": {"test": {"average": 8.0, "std": 9.2, "median": 2.0, "min": 1, "max": 21}, "train": {"average": 8.67, "std": 7.32, "median": 4.0, "min": 3, "max": 19}, "validation": {"average": 11.33, "std": 10.37, "median": 4.0, "min": 4, "max": 26}, "all_splits": {"average": 9.33, "std": 10.37, "median": 4.0, "min": 4, "max": 26}}, "words_per_sec": {"test": 0.82, "train": 0.97, "validation": 1.02, "all_splits": 0.94}, "chars_per_sec": {"test": 4.35, "train": 5.19, "validation": 5.47, "all_splits": 5.01}, "average_audio_duration[s]": {"test": 23.19, "train": 20.7, "validation": 17.44, "all_splits": 20.1}, "meta_cov_sex": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_cov_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_sex": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}}, "all": {"samples": {"test": 774, "train": 920, "validation": 820, "all_splits": 2514}, "audio[h]": {"test": 1.45, "train": 1.54, "validation": 1.45, "all_splits": 4.44}, "speakers": {"test": 83, "train": 130, "validation": 71, "all_splits": 284}, "words": {"test": 9521, "train": 9945, "validation": 10155, "all_splits": 29621}, "chars": {"test": 60887, "train": 62773, "validation": 64087, "all_splits": 187747}, "utts_unique": {"test": 531, "train": 540, "validation": 552, "all_splits": 1558}, "words_unique": {"test": 4319, "train": 4502, "validation": 4384, "all_splits": 10565}, "chars_unique": {"test": 80, "train": 75, "validation": 81, 
"all_splits": 88}, "average_utterance_length[words]": {"test": 12.3, "train": 10.81, "validation": 12.38, "all_splits": 11.78}, "average_utterance_length[chars]": {"test": 78.67, "train": 68.23, "validation": 78.15, "all_splits": 74.68}, "samples_per_spk_stats": {"test": {"average": 8.6, "std": 30.0, "median": 1.0, "min": 1, "max": 247}, "train": {"average": 6.26, "std": 33.63, "median": 1.0, "min": 1, "max": 382}, "validation": {"average": 9.21, "std": 32.11, "median": 1.0, "min": 1, "max": 266}, "all_splits": {"average": 7.71, "std": 32.56, "median": 1.0, "min": 1, "max": 382}}, "words_per_sec": {"test": 1.82, "train": 1.8, "validation": 1.94, "all_splits": 1.85}, "chars_per_sec": {"test": 9.83, "train": 9.55, "validation": 10.32, "all_splits": 9.89}, "average_audio_duration[s]": {"test": 6.75, "train": 6.01, "validation": 6.38, "all_splits": 6.36}, "meta_cov_sex": {"test": 78.42, "train": 82.17, "validation": 77.93, "all_splits": 79.63}, "meta_cov_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}, "meta_dist_sex": {"test": {"male": 0.78, "female": 0.15}, "train": {"male": 0.84, "female": 0.1}, "validation": {"male": 0.85, "female": 0.08}, "all_splits": {"male": 0.88, "female": 0.12}}, "meta_dist_age": {"test": "N/A", "train": "N/A", "validation": "N/A", "all_splits": "N/A"}}}
|
reports/amu-cai/pl-asr-bigos-v2/dataset_contents.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 46668863
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:51bc49c43a89556c627ad2b57143bceedc0a5510de81f2ceffc471200cdc7ff2
|
3 |
size 46668863
|
reports/amu-cai/pl-asr-bigos-v2/dataset_statistics.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3914471cf4e90fa7cef52beae12fe5a1162ae90a73b1e2e97cc38f9222ea8831
|
3 |
+
size 26953
|
reports/pelcra/pl-asr-pelcra-for-bigos/dataset_contents.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 95274266
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5ccb4a2a854270fcd53a55ce09443c05392c3f15413013724a7975f7db9019ca
|
3 |
size 95274266
|
reports/pelcra/pl-asr-pelcra-for-bigos/dataset_statistics.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:18ea2da5c59735f8dcd2d78bffcbbda26e414abcdbf607ab4a66e15aff65acac
|
3 |
+
size 33533
|
run-analysis.py
CHANGED
@@ -2,7 +2,7 @@ import os
|
|
2 |
import json
|
3 |
from datasets import load_dataset, get_dataset_config_names, Features, Value
|
4 |
from utils import num_of_samples_per_split, uniq_utts_per_split, words_per_split, uniq_words_per_split, chars_per_split, uniq_chars_per_split
|
5 |
-
from utils import total_audio_duration_per_split, average_audio_duration_per_split, speakers_per_split, meta_cov_per_split
|
6 |
#, uniq_utts_per_speaker
|
7 |
from utils import meta_distribution_text, meta_distribution_violin_plot, recordings_per_speaker, speech_rate_words_per_split, speech_rate_chars_per_split
|
8 |
import argparse
|
@@ -16,14 +16,17 @@ os.makedirs(output_dir_plots, exist_ok=True)
|
|
16 |
# read from command line argument
|
17 |
parser = argparse.ArgumentParser()
|
18 |
parser.add_argument("--dataset", type=str, required=True, help="Name of the dataset to generate reports for")
|
19 |
-
parser.add_argument(
|
20 |
|
21 |
args = parser.parse_args()
|
22 |
|
|
|
23 |
dataset_name = args.dataset
|
24 |
print("Generating reports for dataset: {}".format(dataset_name))
|
25 |
-
if (args.
|
|
|
26 |
dataset_name_secret = str.join("-", [dataset_name, "secret"])
|
|
|
27 |
# check if secret repo exists
|
28 |
print(dataset_name_secret)
|
29 |
try:
|
@@ -31,7 +34,6 @@ if (args.secret_test_split):
|
|
31 |
except:
|
32 |
print("Config for secret dataset {} cannot be retrieved!".format(dataset_name_secret))
|
33 |
|
34 |
-
#dataset_name = "amu-cai/pl-asr-bigos-v2"
|
35 |
output_dir_reports_dataset = os.path.join(output_dir_reports, dataset_name)
|
36 |
os.makedirs(output_dir_reports_dataset, exist_ok=True)
|
37 |
|
@@ -55,8 +57,11 @@ for config_name in dataset_configs:
|
|
55 |
dataset_contents[config_name] = {}
|
56 |
|
57 |
dataset_hf_subset = load_dataset(dataset_name, config_name, features=features_to_load, trust_remote_code=True)
|
58 |
-
|
|
|
59 |
dataset_hf_subset_secret = load_dataset(dataset_name_secret, config_name, features=features_to_load, trust_remote_code=True)
|
|
|
|
|
60 |
|
61 |
#audio content size
|
62 |
dataset_statistics[config_name]["samples"] = num_of_samples_per_split(dataset_hf_subset)
|
@@ -73,6 +78,10 @@ for config_name in dataset_configs:
|
|
73 |
dataset_statistics[config_name]["words_unique"], dataset_contents[config_name]["unique_words"] = uniq_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
74 |
dataset_statistics[config_name]["chars_unique"], dataset_contents[config_name]["unique_chars"] = uniq_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
75 |
|
|
|
|
|
|
|
|
|
76 |
# audio content derived features
|
77 |
dataset_statistics[config_name]["words_per_sec"] = speech_rate_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
78 |
dataset_statistics[config_name]["chars_per_sec"] = speech_rate_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
@@ -86,7 +95,6 @@ for config_name in dataset_configs:
|
|
86 |
dataset_statistics[config_name]["meta_dist_sex"] = meta_distribution_text(dataset_hf_subset, 'speaker_sex')
|
87 |
dataset_statistics[config_name]["meta_dist_age"] = meta_distribution_text(dataset_hf_subset, 'speaker_age')
|
88 |
|
89 |
-
dataset_statistics[config_name]["samples_per_spk"], dataset_contents[config_name]["samples_per_spk"] = recordings_per_speaker(dataset_hf_subset)
|
90 |
# dataset_statistics[config_name] = uniq_utts_per_speaker(dataset_hf_subset)
|
91 |
# number of words per speaker (min, max, med, avg, std)
|
92 |
|
|
|
2 |
import json
|
3 |
from datasets import load_dataset, get_dataset_config_names, Features, Value
|
4 |
from utils import num_of_samples_per_split, uniq_utts_per_split, words_per_split, uniq_words_per_split, chars_per_split, uniq_chars_per_split
|
5 |
+
from utils import total_audio_duration_per_split, average_audio_duration_per_split, average_utterance_length_chars_per_split, average_utterance_length_words_per_split, speakers_per_split, meta_cov_per_split
|
6 |
#, uniq_utts_per_speaker
|
7 |
from utils import meta_distribution_text, meta_distribution_violin_plot, recordings_per_speaker, speech_rate_words_per_split, speech_rate_chars_per_split
|
8 |
import argparse
|
|
|
16 |
# read from command line argument
|
17 |
# Command-line interface of the report generator.
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type=str, required=True, help="Name of the dataset to generate reports for")
# store_true (not store_false): the option must default to False so that the
# `if not (args.no_secret_test_split)` checks further down load the secret
# distribution by default and skip it only when this flag is given.
parser.add_argument('--no_secret_test_split', action='store_true', help="Do not retrieve references for the test split from the secret distribution (use for datasets without a secret test split)")
|
20 |
|
21 |
args = parser.parse_args()
|
22 |
|
23 |
+
|
24 |
dataset_name = args.dataset
|
25 |
print("Generating reports for dataset: {}".format(dataset_name))
|
26 |
+
if not (args.no_secret_test_split):
|
27 |
+
|
28 |
dataset_name_secret = str.join("-", [dataset_name, "secret"])
|
29 |
+
|
30 |
# check if secret repo exists
|
31 |
print(dataset_name_secret)
|
32 |
try:
|
|
|
34 |
except:
|
35 |
print("Config for secret dataset {} cannot be retrieved!".format(dataset_name_secret))
|
36 |
|
|
|
37 |
output_dir_reports_dataset = os.path.join(output_dir_reports, dataset_name)
|
38 |
os.makedirs(output_dir_reports_dataset, exist_ok=True)
|
39 |
|
|
|
57 |
dataset_contents[config_name] = {}
|
58 |
|
59 |
dataset_hf_subset = load_dataset(dataset_name, config_name, features=features_to_load, trust_remote_code=True)
|
60 |
+
|
61 |
+
if not (args.no_secret_test_split):
|
62 |
dataset_hf_subset_secret = load_dataset(dataset_name_secret, config_name, features=features_to_load, trust_remote_code=True)
|
63 |
+
else:
|
64 |
+
dataset_hf_subset_secret = None
|
65 |
|
66 |
#audio content size
|
67 |
dataset_statistics[config_name]["samples"] = num_of_samples_per_split(dataset_hf_subset)
|
|
|
78 |
dataset_statistics[config_name]["words_unique"], dataset_contents[config_name]["unique_words"] = uniq_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
79 |
dataset_statistics[config_name]["chars_unique"], dataset_contents[config_name]["unique_chars"] = uniq_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
80 |
|
81 |
+
dataset_statistics[config_name]["average_utterance_length[words]"] = average_utterance_length_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
82 |
+
dataset_statistics[config_name]["average_utterance_length[chars]"] = average_utterance_length_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
83 |
+
dataset_statistics[config_name]["samples_per_spk_stats"], dataset_contents[config_name]["samples_per_spk"] = recordings_per_speaker(dataset_hf_subset)
|
84 |
+
|
85 |
# audio content derived features
|
86 |
dataset_statistics[config_name]["words_per_sec"] = speech_rate_words_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
87 |
dataset_statistics[config_name]["chars_per_sec"] = speech_rate_chars_per_split(dataset_hf_subset, dataset_hf_subset_secret)
|
|
|
95 |
dataset_statistics[config_name]["meta_dist_sex"] = meta_distribution_text(dataset_hf_subset, 'speaker_sex')
|
96 |
dataset_statistics[config_name]["meta_dist_age"] = meta_distribution_text(dataset_hf_subset, 'speaker_age')
|
97 |
|
|
|
98 |
# dataset_statistics[config_name] = uniq_utts_per_speaker(dataset_hf_subset)
|
99 |
# number of words per speaker (min, max, med, avg, std)
|
100 |
|
utils.py
CHANGED
@@ -77,6 +77,58 @@ def average_audio_duration_per_split(dataset_hf):
|
|
77 |
out_dict["all_splits"] = round(audio_length_total_seconds / samples_all,2)
|
78 |
return out_dict
|
79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
def speakers_per_split(dataset_hf):
|
81 |
# input - huggingface dataset object
|
82 |
# output - dictionary with statistics about audio duration per split
|
@@ -106,7 +158,7 @@ def uniq_utts_per_split(dataset_hf, dataset_hf_secret):
|
|
106 |
utts_all = []
|
107 |
for split in dataset_hf.keys():
|
108 |
# extract speakers from file_id
|
109 |
-
if (split == "test"):
|
110 |
utts_split = dataset_hf_secret[split]["ref_orig"]
|
111 |
else:
|
112 |
utts_split = dataset_hf[split]["ref_orig"]
|
@@ -129,7 +181,7 @@ def words_per_split(dataset_hf, dataset_hf_secret):
|
|
129 |
|
130 |
for split in dataset_hf.keys():
|
131 |
# extract speakers from file_id
|
132 |
-
if (split == "test"):
|
133 |
utts_all = dataset_hf_secret[split]["ref_orig"]
|
134 |
else:
|
135 |
utts_all = dataset_hf[split]["ref_orig"]
|
@@ -153,7 +205,7 @@ def uniq_words_per_split(dataset_hf, dataset_hf_secret):
|
|
153 |
|
154 |
for split in dataset_hf.keys():
|
155 |
# extract speakers from file_id
|
156 |
-
if (split == "test"):
|
157 |
utts_all = dataset_hf_secret[split]["ref_orig"]
|
158 |
else:
|
159 |
utts_all = dataset_hf[split]["ref_orig"]
|
@@ -185,7 +237,7 @@ def chars_per_split(dataset_hf, dataset_hf_secret):
|
|
185 |
|
186 |
for split in dataset_hf.keys():
|
187 |
# extract speakers from file_id
|
188 |
-
if (split=="test"):
|
189 |
utts_all = dataset_hf_secret[split]["ref_orig"]
|
190 |
else:
|
191 |
utts_all = dataset_hf[split]["ref_orig"]
|
@@ -210,7 +262,7 @@ def uniq_chars_per_split(dataset_hf, dataset_hf_secret):
|
|
210 |
|
211 |
for split in dataset_hf.keys():
|
212 |
# extract speakers from file_id
|
213 |
-
if(split == "test"):
|
214 |
utts_all = dataset_hf_secret[split]["ref_orig"]
|
215 |
else:
|
216 |
utts_all = dataset_hf[split]["ref_orig"]
|
@@ -256,17 +308,13 @@ def meta_cov_per_split(dataset_hf, meta_field):
|
|
256 |
out_dict[split] = "N/A"
|
257 |
continue
|
258 |
meta_info_not_null_all += meta_info_not_null_count
|
259 |
-
meta_info_coverage = round(meta_info_not_null_count / meta_info_count, 2)
|
260 |
-
#print(split, meta_info_coverage)
|
261 |
-
|
262 |
-
# add number of samples for all splits
|
263 |
out_dict[split] = meta_info_coverage
|
264 |
|
265 |
-
# add number of samples for all splits
|
266 |
if (meta_info_not_null_all == 0):
|
267 |
out_dict["all_splits"] = "N/A"
|
268 |
else:
|
269 |
-
out_dict["all_splits"] = round(meta_info_not_null_all/meta_info_all,2 )
|
270 |
return out_dict
|
271 |
|
272 |
|
@@ -282,7 +330,7 @@ def speech_rate_words_per_split(dataset_hf, dataset_hf_secret):
|
|
282 |
|
283 |
for split in dataset_hf.keys():
|
284 |
# extract speakers from file_id
|
285 |
-
if (split == "test"):
|
286 |
utts_split = dataset_hf_secret[split]["ref_orig"]
|
287 |
else:
|
288 |
utts_split = dataset_hf[split]["ref_orig"]
|
@@ -292,9 +340,8 @@ def speech_rate_words_per_split(dataset_hf, dataset_hf_secret):
|
|
292 |
audio_split_length_seconds = sum(dataset_hf[split]["audio_duration_seconds"])
|
293 |
audio_total_length_seconds += audio_split_length_seconds
|
294 |
speech_rate = round(words_split_count / audio_split_length_seconds, 2)
|
295 |
-
#print(split, speech_rate)
|
296 |
out_dict[split] = speech_rate
|
297 |
-
|
298 |
out_dict["all_splits"] = round(words_all_count / audio_total_length_seconds, 2)
|
299 |
return out_dict
|
300 |
|
@@ -310,7 +357,7 @@ def speech_rate_chars_per_split(dataset_hf, dataset_hf_secret):
|
|
310 |
|
311 |
for split in dataset_hf.keys():
|
312 |
# extract speakers from file_id
|
313 |
-
if (split == "test"):
|
314 |
utts_split = dataset_hf_secret[split]["ref_orig"]
|
315 |
else:
|
316 |
utts_split = dataset_hf[split]["ref_orig"]
|
@@ -320,9 +367,8 @@ def speech_rate_chars_per_split(dataset_hf, dataset_hf_secret):
|
|
320 |
audio_split_length_seconds = sum(dataset_hf[split]["audio_duration_seconds"])
|
321 |
audio_total_length_seconds += audio_split_length_seconds
|
322 |
speech_rate = round(chars_split_count / audio_split_length_seconds, 2)
|
323 |
-
#print(split, speech_rate)
|
324 |
out_dict[split] = speech_rate
|
325 |
-
|
326 |
out_dict["all_splits"] = round(chars_all_count / audio_total_length_seconds, 2)
|
327 |
return out_dict
|
328 |
|
@@ -362,7 +408,6 @@ def meta_distribution_text(dataset_hf, meta_field):
|
|
362 |
out_dict[split][bucket] = round(values_count/len(meta_info_not_null),2)
|
363 |
#print(split, out_dict[split])
|
364 |
|
365 |
-
# add number of samples for all splits
|
366 |
if (no_meta):
|
367 |
out_dict["all_splits"] = "N/A"
|
368 |
return out_dict
|
@@ -428,6 +473,7 @@ def recordings_per_speaker(dataset_hf):
|
|
428 |
recordings_total += recordings_split
|
429 |
|
430 |
average_recordings_per_speaker = round( recordings_split / speakers_split,2)
|
|
|
431 |
out_dict_stats[split]["average"] = average_recordings_per_speaker
|
432 |
out_dict_stats[split]["std"] = round(np.std(list(recordings_per_speaker_stats_dict_split.values())),2)
|
433 |
out_dict_stats[split]["median"] = np.median(list(recordings_per_speaker_stats_dict_split.values()))
|
|
|
77 |
out_dict["all_splits"] = round(audio_length_total_seconds / samples_all,2)
|
78 |
return out_dict
|
79 |
|
80 |
+
def average_utterance_length_chars_per_split(dataset_hf, dataset_hf_secret):
    """Compute the average utterance length in characters for every split.

    Args:
        dataset_hf: mapping from split name to an object where
            ``dataset_hf[split]["ref_orig"]`` is a sized sequence of
            reference strings.
        dataset_hf_secret: mapping of the same shape, or ``None``. When it is
            not ``None``, the references of the "test" split are read from it
            instead of from ``dataset_hf``.

    Returns:
        dict with one entry per split plus "all_splits". Each value is the
        character count divided by the number of utterances, rounded to two
        decimals, or "N/A" when there are no utterances to average over.
    """
    out_dict = {}
    metric = "average_utterance_length[chars]"
    print("Calculating {}".format(metric))
    chars_all = 0
    samples_all = 0
    for split in dataset_hf.keys():
        # References of the test split may live in the secret distribution.
        if split == "test" and dataset_hf_secret is not None:
            utts_split = dataset_hf_secret[split]["ref_orig"]
        else:
            utts_split = dataset_hf[split]["ref_orig"]

        samples_split = len(utts_split)
        if samples_split == 0:
            # Nothing to average; avoid dividing by zero.
            out_dict[split] = "N/A"
            continue

        # Characters are counted on the space-joined references, so the
        # single separator between consecutive utterances is included
        # (same count as before; only the redundant split/join was removed).
        chars_split_count = len(" ".join(utts_split))
        chars_all += chars_split_count
        samples_all += samples_split
        out_dict[split] = round(chars_split_count / samples_split, 2)

    # Average over all utterances of all splits.
    if samples_all == 0:
        out_dict["all_splits"] = "N/A"
    else:
        out_dict["all_splits"] = round(chars_all / samples_all, 2)
    return out_dict
|
106 |
+
|
107 |
+
def average_utterance_length_words_per_split(dataset_hf, dataset_hf_secret):
    """Compute the average utterance length in words for every split.

    Args:
        dataset_hf: mapping from split name to an object where
            ``dataset_hf[split]["ref_orig"]`` is a sized sequence of
            reference strings.
        dataset_hf_secret: mapping of the same shape, or ``None``. When it is
            not ``None``, the references of the "test" split are read from it
            instead of from ``dataset_hf``.

    Returns:
        dict with one entry per split plus "all_splits". Each value is the
        word count divided by the number of utterances, rounded to two
        decimals, or "N/A" when there are no utterances to average over.
    """
    out_dict = {}
    metric = "average_utterance_length[words]"
    print("Calculating {}".format(metric))
    words_all = 0
    samples_all = 0
    for split in dataset_hf.keys():
        # References of the test split may live in the secret distribution.
        if split == "test" and dataset_hf_secret is not None:
            utts_split = dataset_hf_secret[split]["ref_orig"]
        else:
            utts_split = dataset_hf[split]["ref_orig"]

        samples_split = len(utts_split)
        if samples_split == 0:
            # Skip empty splits: dividing would fail, and "".split(" ")
            # yields [""] which would add one phantom word to the total.
            out_dict[split] = "N/A"
            continue

        # Words are the single-space separated tokens of the joined
        # references; consecutive spaces therefore yield empty tokens.
        # NOTE(review): kept unchanged so the figures do not shift -
        # presumably mirrors the word count used elsewhere; confirm.
        words_split_count = len(" ".join(utts_split).split(" "))
        words_all += words_split_count
        samples_all += samples_split
        out_dict[split] = round(words_split_count / samples_split, 2)

    # Average over all utterances of all splits.
    if samples_all == 0:
        out_dict["all_splits"] = "N/A"
    else:
        out_dict["all_splits"] = round(words_all / samples_all, 2)
    return out_dict
|
131 |
+
|
132 |
def speakers_per_split(dataset_hf):
|
133 |
# input - huggingface dataset object
|
134 |
# output - dictionary with statistics about audio duration per split
|
|
|
158 |
utts_all = []
|
159 |
for split in dataset_hf.keys():
|
160 |
# extract speakers from file_id
|
161 |
+
if (split == "test" and dataset_hf_secret is not None):
|
162 |
utts_split = dataset_hf_secret[split]["ref_orig"]
|
163 |
else:
|
164 |
utts_split = dataset_hf[split]["ref_orig"]
|
|
|
181 |
|
182 |
for split in dataset_hf.keys():
|
183 |
# extract speakers from file_id
|
184 |
+
if (split == "test" and dataset_hf_secret is not None):
|
185 |
utts_all = dataset_hf_secret[split]["ref_orig"]
|
186 |
else:
|
187 |
utts_all = dataset_hf[split]["ref_orig"]
|
|
|
205 |
|
206 |
for split in dataset_hf.keys():
|
207 |
# extract speakers from file_id
|
208 |
+
if (split == "test" and dataset_hf_secret is not None):
|
209 |
utts_all = dataset_hf_secret[split]["ref_orig"]
|
210 |
else:
|
211 |
utts_all = dataset_hf[split]["ref_orig"]
|
|
|
237 |
|
238 |
for split in dataset_hf.keys():
|
239 |
# extract speakers from file_id
|
240 |
+
if (split=="test" and dataset_hf_secret is not None):
|
241 |
utts_all = dataset_hf_secret[split]["ref_orig"]
|
242 |
else:
|
243 |
utts_all = dataset_hf[split]["ref_orig"]
|
|
|
262 |
|
263 |
for split in dataset_hf.keys():
|
264 |
# extract speakers from file_id
|
265 |
+
if(split == "test" and dataset_hf_secret is not None):
|
266 |
utts_all = dataset_hf_secret[split]["ref_orig"]
|
267 |
else:
|
268 |
utts_all = dataset_hf[split]["ref_orig"]
|
|
|
308 |
out_dict[split] = "N/A"
|
309 |
continue
|
310 |
meta_info_not_null_all += meta_info_not_null_count
|
311 |
+
meta_info_coverage = round(meta_info_not_null_count / meta_info_count * 100, 2)
|
|
|
|
|
|
|
312 |
out_dict[split] = meta_info_coverage
|
313 |
|
|
|
314 |
if (meta_info_not_null_all == 0):
|
315 |
out_dict["all_splits"] = "N/A"
|
316 |
else:
|
317 |
+
out_dict["all_splits"] = round(meta_info_not_null_all/meta_info_all * 100,2 )
|
318 |
return out_dict
|
319 |
|
320 |
|
|
|
330 |
|
331 |
for split in dataset_hf.keys():
|
332 |
# extract speakers from file_id
|
333 |
+
if (split == "test" and dataset_hf_secret is not None):
|
334 |
utts_split = dataset_hf_secret[split]["ref_orig"]
|
335 |
else:
|
336 |
utts_split = dataset_hf[split]["ref_orig"]
|
|
|
340 |
audio_split_length_seconds = sum(dataset_hf[split]["audio_duration_seconds"])
|
341 |
audio_total_length_seconds += audio_split_length_seconds
|
342 |
speech_rate = round(words_split_count / audio_split_length_seconds, 2)
|
|
|
343 |
out_dict[split] = speech_rate
|
344 |
+
|
345 |
out_dict["all_splits"] = round(words_all_count / audio_total_length_seconds, 2)
|
346 |
return out_dict
|
347 |
|
|
|
357 |
|
358 |
for split in dataset_hf.keys():
|
359 |
# extract speakers from file_id
|
360 |
+
if (split == "test" and dataset_hf_secret is not None):
|
361 |
utts_split = dataset_hf_secret[split]["ref_orig"]
|
362 |
else:
|
363 |
utts_split = dataset_hf[split]["ref_orig"]
|
|
|
367 |
audio_split_length_seconds = sum(dataset_hf[split]["audio_duration_seconds"])
|
368 |
audio_total_length_seconds += audio_split_length_seconds
|
369 |
speech_rate = round(chars_split_count / audio_split_length_seconds, 2)
|
|
|
370 |
out_dict[split] = speech_rate
|
371 |
+
|
372 |
out_dict["all_splits"] = round(chars_all_count / audio_total_length_seconds, 2)
|
373 |
return out_dict
|
374 |
|
|
|
408 |
out_dict[split][bucket] = round(values_count/len(meta_info_not_null),2)
|
409 |
#print(split, out_dict[split])
|
410 |
|
|
|
411 |
if (no_meta):
|
412 |
out_dict["all_splits"] = "N/A"
|
413 |
return out_dict
|
|
|
473 |
recordings_total += recordings_split
|
474 |
|
475 |
average_recordings_per_speaker = round( recordings_split / speakers_split,2)
|
476 |
+
|
477 |
out_dict_stats[split]["average"] = average_recordings_per_speaker
|
478 |
out_dict_stats[split]["std"] = round(np.std(list(recordings_per_speaker_stats_dict_split.values())),2)
|
479 |
out_dict_stats[split]["median"] = np.median(list(recordings_per_speaker_stats_dict_split.values()))
|