File size: 5,126 Bytes
c25e6bb
 
 
 
b06ea15
c25e6bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b06ea15
 
c25e6bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import os
import json
import glob
from collections import defaultdict
import pandas as pd
import gradio as gr
from content import *
from css import *
import glob

# Benchmark identifiers; collect_results() matches them against the task part
# of each key found under "results" in the evaluation files.
AFRIMMLU_DIRECT = "afrimmlu_direct"
AFRIMMLU_TRANSLATE = "afrimmlu_translate"
AFRIXNLI_DIRECT = "afrixnli_direct"
AFRIXNLI_TRANSLATE = "afrixnli_translate"

BENCHMARKS = [
    AFRIMMLU_DIRECT,
    AFRIMMLU_TRANSLATE,
    AFRIXNLI_DIRECT,
    AFRIXNLI_TRANSLATE,
]

# METRICS[i] is the result key read for BENCHMARKS[i].
# NOTE(review): "mc2" for afrixnli_translate differs from the other three
# entries -- confirm that the result files really expose this key.
METRICS = [
    "acc_norm",
    "acc_norm",
    "acc_norm",
    "mc2",
]

# Language code -> display name shown in the leaderboard.
LANG_NAME = dict(
    amh='Amharic',
    eng='English',
    ewe='Ewe',
    fra='French',
    hau='Hausa',
    ibo='Igbo',
    kin='Kinyarwanda',
    lin='Lingala',
    lug='Luganda',
    orm='Oromo',
    sna='Shona',
    sot='Sotho',
    swa='Swahili',
    twi='Twi',
    wol='Wolof',
    xho='Xhosa',
    yor='Yoruba',
    zul='Zulu',
)

# Language codes, in the same order as the LANG_NAME keys.
LANGS = list(LANG_NAME)


def collect_results():
    """Collect benchmark scores from every ``evals/*/*.json`` result file.

    A file is used only if it has both a ``results`` and a ``config`` entry
    and ``config['model_args']`` is a comma-separated string containing
    exactly one ``pretrained=...`` item; other files are skipped. The model
    name is the last ``/``-separated component of that ``pretrained`` value.

    Each key of ``results`` is expected to look like ``<benchmark>_<lang>``,
    where ``<benchmark>`` is one of ``BENCHMARKS``. Because the benchmark
    names themselves contain underscores, the key is split on its *last*
    underscore. Keys that do not resolve to a known benchmark plus a
    language code are ignored, like unusable files are.

    Returns:
        A tuple ``(performance_dict, pretrained_models)``.
        ``performance_dict`` maps ``(model_name, lang)`` to a dict of
        ``{benchmark: score}``, with scores expressed as percentages rounded
        to one decimal. ``pretrained_models`` is the set of model names seen
        in usable files.

    Raises:
        KeyError: if a recognised result entry lacks the metric configured
            for its benchmark in ``METRICS``.
    """
    performance_dict = defaultdict(dict)
    pretrained_models = set()
    for file in glob.glob('evals/*/*.json'):
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if 'results' not in data or 'config' not in data:
            continue
        results = data['results']
        config = data['config']
        if 'model_args' not in config:
            continue

        model_args = config['model_args'].split(',')
        pretrained = [x for x in model_args if x.startswith('pretrained=')]
        if len(pretrained) != 1:
            continue
        # Split on the first '=' only, so a value containing '=' stays intact.
        pretrained = pretrained[0].split('=', 1)[1]
        pretrained = pretrained.split('/')[-1]
        pretrained_models.add(pretrained)

        for lang_task, perfs in results.items():
            # "afrimmlu_direct_amh" -> ("afrimmlu_direct", "amh"); a plain
            # split('_') would produce three parts and fail to unpack.
            task, _, lang = lang_task.rpartition('_')
            if not lang or task not in BENCHMARKS:
                continue

            metric = METRICS[BENCHMARKS.index(task)]
            performance_dict[(pretrained, lang)][task] = round(perfs[metric] * 100, 1)
    return performance_dict, pretrained_models


def get_leaderboard_df(performance_dict, pretrained_models):
    """Build the leaderboard table from the collected scores.

    Args:
        performance_dict: mapping of ``(model_name, lang_code)`` to a dict of
            ``{benchmark: score}``.
        pretrained_models: accepted for interface compatibility; not used.

    Returns:
        A ``pandas.DataFrame`` with the columns in ``COLS``, sorted by
        language and then average score, both descending. Only
        (model, language) pairs that have a score for all four benchmarks
        are included.

    A score counts as missing only when its benchmark key is absent, so a
    genuine score of 0.0 still produces a row. A language code without an
    entry in ``LANG_NAME`` is displayed using the code itself.
    """
    benchmarks = (AFRIMMLU_DIRECT, AFRIMMLU_TRANSLATE, AFRIXNLI_DIRECT, AFRIXNLI_TRANSLATE)

    rows = []
    for (pretrained, lang), perfs in performance_dict.items():
        scores = [perfs.get(benchmark) for benchmark in benchmarks]
        if any(score is None for score in scores):
            continue

        lang_name = LANG_NAME.get(lang, lang)
        avg = round(sum(scores) / len(scores), 1)
        # The notes column is the text that the search box matches against.
        notes = ' '.join([pretrained, lang_name])
        rows.append([pretrained, lang_name, lang, avg, *scores, notes])

    df = pd.DataFrame.from_records(rows, columns=COLS)
    df = df.sort_values(by=[LANG_COL, AVERAGE_COL], ascending=False)
    return df[COLS]


def search_table(df, query):
    """Return the rows of ``df`` whose notes column contains ``query``.

    Args:
        df: leaderboard ``pandas.DataFrame`` that includes ``NOTES_COL``.
        query: text typed into the search box.

    Returns:
        The matching rows. ``df`` itself is returned when the query is empty
        or the table has no rows.

    The query is matched as a literal, case-insensitive substring. It is
    deliberately not treated as a regular expression, so input such as
    ``"("`` or ``"+"`` cannot raise a pattern error. Rows whose notes value
    is missing never match.
    """
    if not query or df.empty:
        return df
    matches = df[NOTES_COL].str.contains(query, case=False, regex=False, na=False)
    return df[matches]



# Header text of each leaderboard column.
MODEL_COL = "Model"
LANG_COL = "Language"
CODE_COL = "Code"
AVERAGE_COL = "Average"
AFRIMMLU_DIRECT_COL = "AfriMMLU Direct (0-Shot)"
AFRIMMLU_TRANSLATE_COL = "AfriMMLU Translate (0-Shot)"
AFRIXNLI_DIRECT_COL = "AfriXNLI Direct (0-Shot)"
AFRIXNLI_TRANSLATE_COL = "AfriXNLI Translate (0-Shot)"
NOTES_COL = "Notes"  # For search only

# (header, datatype) pairs in display order; COLS and TYPES are derived from
# this single list so the two always stay aligned.
_COLUMN_SPECS = [
    (MODEL_COL, "str"),
    (LANG_COL, "str"),
    (CODE_COL, "str"),
    (AVERAGE_COL, "number"),
    (AFRIMMLU_DIRECT_COL, "number"),
    (AFRIMMLU_TRANSLATE_COL, "number"),
    (AFRIXNLI_DIRECT_COL, "number"),
    (AFRIXNLI_TRANSLATE_COL, "number"),
    (NOTES_COL, "str"),
]
COLS = [header for header, _ in _COLUMN_SPECS]
TYPES = [datatype for _, datatype in _COLUMN_SPECS]

# Read the result files and build the leaderboard table once, at start-up.
args = collect_results()
original_df = get_leaderboard_df(*args)

# Settings shared by the visible table and its hidden, unfiltered twin.
table_kwargs = dict(value=original_df, headers=COLS, datatype=TYPES)

with gr.Blocks(css=CUSTOM_CSS) as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
    gr.Markdown(HOW_TO, elem_classes="markdown-text")

    with gr.Group():
        search_bar = gr.Textbox(
            placeholder="Search models and languages...",
            show_label=False,
            elem_id="search-bar",
        )

        # Table shown to the user; it is overwritten with the search results.
        leaderboard_table = gr.components.Dataframe(
            elem_id="leaderboard-table",
            **table_kwargs,
        )

        # Invisible copy of the full table. Every search reads from this copy
        # rather than from the visible (already filtered) table, so deleting
        # characters from the query brings rows back.
        hidden_leaderboard_table_for_search = gr.components.Dataframe(
            visible=False,
            **table_kwargs,
        )

        # Re-filter whenever the text in the search box changes.
        search_bar.change(
            fn=search_table,
            inputs=[hidden_leaderboard_table_for_search, search_bar],
            outputs=leaderboard_table,
        )

    gr.Markdown(CREDIT, elem_classes="markdown-text")
    gr.Markdown(CITATION, elem_classes="markdown-text")

demo.launch()