Spaces:
Sleeping
Sleeping
add minimal structure and parsing cv17 results
Browse files- .gitignore +2 -0
- app.py +106 -69
- config.py +88 -0
- parsing.py +56 -0
- requirements.txt +2 -1
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
fair-asr-results
|
2 |
+
__pycache__
|
app.py
CHANGED
@@ -2,93 +2,130 @@ import gradio as gr
|
|
2 |
import pandas as pd
|
3 |
import random
|
4 |
import plotly.express as px
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
|
7 |
-
def
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
data = {
|
13 |
-
"Model": ["Model A", "Model B", "Model C"],
|
14 |
-
"Avg": [0.85, 0.90, 0.88],
|
15 |
-
"Gap Read": [0.05, 0.03, 0.04],
|
16 |
-
"Gap Spontaneous": [0.07, 0.06, 0.05],
|
17 |
-
}
|
18 |
-
|
19 |
-
df = pd.DataFrame(data)
|
20 |
-
return df
|
21 |
-
|
22 |
-
|
23 |
-
def get_language_performance():
|
24 |
-
languages = [
|
25 |
-
"en",
|
26 |
-
"es",
|
27 |
-
"de",
|
28 |
-
"fr",
|
29 |
-
"it",
|
30 |
-
"pt",
|
31 |
-
"nl",
|
32 |
-
"ru",
|
33 |
-
"zh",
|
34 |
-
"ja",
|
35 |
-
"ko",
|
36 |
-
"ar",
|
37 |
-
"hi",
|
38 |
-
"bn",
|
39 |
-
"ur",
|
40 |
-
"tr",
|
41 |
-
"sv",
|
42 |
-
]
|
43 |
-
data = {
|
44 |
-
"Model": ["Model A", "Model B", "Model C"],
|
45 |
-
}
|
46 |
-
|
47 |
-
for lang in languages:
|
48 |
-
data[lang] = [random.uniform(-100, 100) for _ in range(3)]
|
49 |
-
|
50 |
-
df = pd.DataFrame(data)
|
51 |
return df
|
52 |
|
53 |
|
54 |
-
results = get_results_df()
|
55 |
-
|
56 |
with gr.Blocks() as fm_interface:
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
)
|
66 |
-
|
67 |
-
y="Performance",
|
68 |
-
color="Model",
|
69 |
-
title="Language Performance Plot 1",
|
70 |
-
barmode="group",
|
71 |
)
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
x="Language",
|
77 |
-
y="
|
78 |
color="Model",
|
79 |
-
title="Language
|
|
|
|
|
|
|
|
|
|
|
80 |
barmode="group",
|
81 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
|
83 |
-
|
84 |
-
|
|
|
|
|
85 |
|
86 |
tabs = [fm_interface]
|
87 |
titles = ["F-M Setup"]
|
88 |
|
89 |
with gr.Blocks() as demo:
|
90 |
-
gr.Markdown("#
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
gr.TabbedInterface(tabs, titles)
|
92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
if __name__ == "__main__":
|
94 |
demo.launch()
|
|
|
2 |
import pandas as pd
|
3 |
import random
|
4 |
import plotly.express as px
|
5 |
+
from huggingface_hub import snapshot_download
|
6 |
+
import os
|
7 |
+
import logging
|
8 |
+
|
9 |
+
from config import (
|
10 |
+
SETUPS,
|
11 |
+
LOCAL_RESULTS_DIR,
|
12 |
+
CITATION_BUTTON_TEXT,
|
13 |
+
CITATION_BUTTON_LABEL,
|
14 |
+
)
|
15 |
+
from parsing import read_all_configs
|
16 |
+
|
17 |
+
# Set up logging
|
18 |
+
logging.basicConfig(
|
19 |
+
level=logging.INFO,
|
20 |
+
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
21 |
+
handlers=[
|
22 |
+
# logging.FileHandler("app.log"),
|
23 |
+
logging.StreamHandler()
|
24 |
+
],
|
25 |
+
)
|
26 |
+
|
27 |
+
logger = logging.getLogger(__name__)
|
28 |
+
|
29 |
+
|
30 |
+
# Fetch the evaluation results from the Hub at import time; the result files
# are read from LOCAL_RESULTS_DIR when the interface is built.
try:
    print("Saving results locally at:", LOCAL_RESULTS_DIR)
    snapshot_download(
        repo_id="g8a9/fair-asr-results",
        local_dir=LOCAL_RESULTS_DIR,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        # Skip per-sample and transcript artifacts matched by these patterns.
        ignore_patterns=["*samples*", "*transcripts*"],
        token=os.environ.get("TOKEN"),
    )
except Exception:
    # Record the failure with its traceback, then re-raise the same exception
    # (bare ``raise`` keeps the original traceback intact) so startup still
    # fails instead of continuing without data.
    logger.exception("Could not download results into %s", LOCAL_RESULTS_DIR)
    raise
|
43 |
|
44 |
|
45 |
+
def format_dataframe(df, times_100=False):
    """Return a new DataFrame with every numeric cell rendered as a string.

    Args:
        df: DataFrame whose cells are formatted element-wise.
        times_100: When true, numbers are multiplied by 100 and rendered with
            three decimals and a trailing ``%``. Otherwise they are rendered
            with four decimals.

    Returns:
        The formatted DataFrame. Non-numeric cells, including booleans, are
        passed through unchanged.
    """

    def _is_number(value):
        # bool is a subclass of int; keep flags out of the numeric formatting.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    if times_100:

        def _render(value):
            return f"{value * 100:.3f}%" if _is_number(value) else value

    else:

        def _render(value):
            return f"{value:.4f}" if _is_number(value) else value

    # DataFrame.map was added in pandas 2.1; older releases only provide the
    # equivalent DataFrame.applymap.
    elementwise = df.map if hasattr(df, "map") else df.applymap
    return elementwise(_render)
|
51 |
|
52 |
|
|
|
|
|
53 |
with gr.Blocks() as fm_interface:
    # Build the setup identifier from the first entry in SETUPS; it is the
    # key passed to read_all_configs to select which results to load.
    fm = SETUPS[0]
    setup = fm["majority_group"] + "_" + fm["minority_group"]
    results = read_all_configs(setup)

    # One row per model: the sum of absolute per-language gaps, scaled by 100,
    # sorted ascending so the first row holds the smallest total.
    model_results = (
        results.pivot_table(
            index="Model", values="Gap", aggfunc=lambda x: 100 * x.abs().sum()
        )
        .reset_index()
        .sort_values("Gap")
    )
    best_model = model_results.iloc[0]["Model"]
    print("Best model:", best_model)

    gr.Markdown("### Sum of Absolute Gaps ⬇️")
    gr.DataFrame(format_dataframe(model_results))

    gr.Markdown("#### F-M gaps by language")

    # Wide table: one row per model, one column per language.
    lang_results = results.pivot_table(
        index="Model",
        values="Gap",
        columns="Language",
    ).reset_index()
    gr.DataFrame(format_dataframe(lang_results, times_100=True))

    # Plot a scaled copy so the frame returned by read_all_configs is not
    # modified in place.
    plot_data = results.assign(Gap=results["Gap"] * 100)
    fig = px.bar(
        plot_data,
        x="Language",
        y="Gap",
        color="Model",
        title="Gaps by Language and Model",
        labels={
            # Each bar is one model's signed gap for one language, not a sum
            # of absolute gaps, so label the axis accordingly.
            "Gap": "Gap (%)",
            "Language": "Language",
            "Model": "Model",
        },
        barmode="group",
    )
    # Order the x axis by the best model's gap, largest first.
    lang_order = (
        lang_results.set_index("Model")
        .loc[best_model]
        .sort_values(ascending=False)
        .index
    )
    print(lang_order)

    fig.update_layout(xaxis={"categoryorder": "array", "categoryarray": lang_order})
    gr.Plot(fig)
|
108 |
+
# gr.Plot(fig2)
|
109 |
|
110 |
tabs = [fm_interface]
|
111 |
titles = ["F-M Setup"]
|
112 |
|
113 |
# Top-level page: title, dataset list, the tabbed per-setup views and a
# copyable citation box.
with gr.Blocks() as demo:
    gr.Markdown("# Twists, Humps, and Pebbles: ASR Leaderboard")
    gr.Markdown(
        """
        Datasets currently included:
        - **Mozilla Common Voice v17**
        """
    )
    gr.TabbedInterface(tabs, titles)

    # BibTeX entry with a copy button so the results can be cited.
    gr.Textbox(
        value=CITATION_BUTTON_TEXT,
        label=CITATION_BUTTON_LABEL,
        max_lines=6,
        show_copy_button=True,
    )
|
129 |
+
|
130 |
if __name__ == "__main__":
|
131 |
demo.launch()
|
config.py
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Python file to store configuration and info, e.g., which language
|
3 |
+
to use for a particular dataset, or which language a model should be
|
4 |
+
evaluated on.
|
5 |
+
"""
|
6 |
+
|
7 |
+
LOCAL_RESULTS_DIR = "fair-asr-results"
|
8 |
+
SETUPS = [{"majority_group": "male_masculine", "minority_group": "female_feminine"}]
|
9 |
+
|
10 |
+
|
11 |
+
class CVInfo:
|
12 |
+
dataset_id: str = "cv_17"
|
13 |
+
full_name: str = "Mozilla Common Voice v17"
|
14 |
+
|
15 |
+
# fmt: off
|
16 |
+
langs = [
|
17 |
+
"de", "en", "nl", # Germanic
|
18 |
+
"ru", "sr", "cs", "sk", # Slavic
|
19 |
+
"it", "fr", "es", "ca", "pt", "ro", # Romance
|
20 |
+
"sw", # Bantu
|
21 |
+
"yo", # Niger-Congo
|
22 |
+
"ja", # Japonic
|
23 |
+
"hu", "fi", # Uralic
|
24 |
+
"ar" # Semitic
|
25 |
+
]
|
26 |
+
# fmt: on
|
27 |
+
|
28 |
+
|
29 |
+
dataset2info = {"cv_17": CVInfo}
|
30 |
+
|
31 |
+
|
32 |
+
class WhisperInfo:
|
33 |
+
# fmt: off
|
34 |
+
langs = [
|
35 |
+
"de", "en", "nl", # Germanic
|
36 |
+
"ru", "sr", "cs", "sk", # Slavic
|
37 |
+
"it", "fr", "es", "ca", "pt", "ro", # Romance
|
38 |
+
"sw", # Bantu
|
39 |
+
"yo", # Niger-Congo
|
40 |
+
"ja", # Japonic
|
41 |
+
"hu", "fi", # Uralic
|
42 |
+
"ar" # Semitic
|
43 |
+
]
|
44 |
+
# fmt: on
|
45 |
+
|
46 |
+
|
47 |
+
class SeamlessInfo:
|
48 |
+
# fmt: off
|
49 |
+
langs = [
|
50 |
+
"de", "en", "nl", # Germanic
|
51 |
+
"ru", "sr", "cs", "sk", # Slavic
|
52 |
+
"it", "fr", "es", "ca", "pt", "ro", # Romance
|
53 |
+
"sw", # Bantu
|
54 |
+
"yo", # Niger-Congo
|
55 |
+
"ja", # Japonic
|
56 |
+
"hu", "fi", # Uralic
|
57 |
+
"ar" # Semitic
|
58 |
+
]
|
59 |
+
# fmt: on
|
60 |
+
|
61 |
+
|
62 |
+
model2info = {
|
63 |
+
"openai--whisper-large-v3": WhisperInfo,
|
64 |
+
"openai--whisper-large-v3-turbo": WhisperInfo,
|
65 |
+
# "facebook--seamless-m4t-v2-large": SeamlessInfo,
|
66 |
+
}
|
67 |
+
|
68 |
+
|
69 |
+
CITATION_BUTTON_LABEL = "Please use this bibtex to cite these results"
|
70 |
+
CITATION_BUTTON_TEXT = r"""@inproceedings{attanasio-etal-2024-twists,
|
71 |
+
title = "Twists, Humps, and Pebbles: Multilingual Speech Recognition Models Exhibit Gender Performance Gaps",
|
72 |
+
author = "Attanasio, Giuseppe and
|
73 |
+
Savoldi, Beatrice and
|
74 |
+
Fucci, Dennis and
|
75 |
+
Hovy, Dirk",
|
76 |
+
editor = "Al-Onaizan, Yaser and
|
77 |
+
Bansal, Mohit and
|
78 |
+
Chen, Yun-Nung",
|
79 |
+
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
|
80 |
+
month = nov,
|
81 |
+
year = "2024",
|
82 |
+
address = "Miami, Florida, USA",
|
83 |
+
publisher = "Association for Computational Linguistics",
|
84 |
+
url = "https://aclanthology.org/2024.emnlp-main.1188",
|
85 |
+
doi = "10.18653/v1/2024.emnlp-main.1188",
|
86 |
+
pages = "21318--21340",
|
87 |
+
abstract = "Current automatic speech recognition (ASR) models are designed to be used across many languages and tasks without substantial changes. However, this broad language coverage hides performance gaps within languages, for example, across genders. Our study systematically evaluates the performance of two widely used multilingual ASR models on three datasets, encompassing 19 languages from eight language families and two speaking conditions. Our findings reveal clear gender disparities, with the advantaged group varying across languages and models. Surprisingly, those gaps are not explained by acoustic or lexical properties. However, probing internal model states reveals a correlation with gendered performance gap. That is, the easier it is to distinguish speaker gender in a language using probes, the more the gap reduces, favoring female speakers. Our results show that gender disparities persist even in state-of-the-art models. Our findings have implications for the improvement of multilingual ASR systems, underscoring the importance of accessibility to training data and nuanced evaluation to predict and mitigate gender gaps. We release all code and artifacts at https://github.com/g8a9/multilingual-asr-gender-gap.",
|
88 |
+
}"""
|
parsing.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from typing import List
|
3 |
+
from os.path import join as opj
|
4 |
+
import json
|
5 |
+
from config import dataset2info, model2info, LOCAL_RESULTS_DIR
|
6 |
+
|
7 |
+
|
8 |
+
def load_language_results(
    model_id: str, dataset_id: str, lang_ids: List[str], setup: str
) -> dict:
    """Read the mean gap of one model on one dataset for each language.

    For every language in ``lang_ids`` this opens
    ``<LOCAL_RESULTS_DIR>/evaluation/<dataset_id>/results_<model_id>_<dataset_id>_devtest_<lang>_gender_<setup>.json``
    and extracts the value stored under ``"<eval_metric>_diff_mean"``, where
    ``eval_metric`` is itself read from the same file.

    Args:
        model_id: Model identifier as used in the result file names.
        dataset_id: Dataset identifier; also the name of the sub-directory.
        lang_ids: Language codes to load.
        setup: Group-comparison identifier used in the file names.

    Returns:
        Mapping from language code to the gap value found in its file.

    Raises:
        FileNotFoundError: If the result file for a language does not exist.
        KeyError: If a file lacks ``eval_metric`` or the derived gap key.
    """
    results_dir = opj(LOCAL_RESULTS_DIR, "evaluation", dataset_id)
    lang_gaps = {}
    for lang in lang_ids:
        filename = (
            f"results_{model_id}_{dataset_id}_devtest_{lang}_gender_{setup}.json"
        )
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(opj(results_dir, filename), encoding="utf-8") as fp:
            data = json.load(fp)
        lang_gaps[lang] = data[f"{data['eval_metric']}_diff_mean"]
    return lang_gaps
|
24 |
+
|
25 |
+
|
26 |
+
def read_all_configs(setup: str):
    """Collect the per-language gaps of every configured model and dataset.

    Iterates over all keys of ``dataset2info`` and ``model2info`` and loads,
    for each pair, the gaps for the languages listed on the dataset's info
    class.

    Args:
        setup: Group-comparison identifier forwarded to
            ``load_language_results``.

    Returns:
        A long-format DataFrame with the columns ``Model``, ``Language`` and
        ``Gap`` (one row per model/dataset/language; the dataset column is
        dropped before returning).
    """
    all_datasets = dataset2info.keys()
    print("Parsing results datasets:", all_datasets)
    all_models = model2info.keys()
    print("Parsing results models:", all_models)

    rows = []
    for dataset_id in all_datasets:
        for model_id in all_models:
            lang_gaps = load_language_results(
                model_id, dataset_id, dataset2info[dataset_id].langs, setup
            )
            rows.extend(
                {
                    "Model": model_id,
                    "Dataset": dataset_id,
                    "Language": lang,
                    "Gap": gap,
                }
                for lang, gap in lang_gaps.items()
            )

    # Name the columns explicitly: with no rows, a bare DataFrame would have
    # no "Dataset" column and the drop below would raise KeyError.
    results_df = pd.DataFrame(rows, columns=["Model", "Dataset", "Language", "Gap"])
    return results_df.drop(columns=["Dataset"])
|
requirements.txt
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
gradio
|
2 |
-
plotly
|
|
|
|
1 |
gradio
|
2 |
+
plotly
|
3 |
+
pandas
|