import gradio as gr
import pandas as pd
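
# hmLeaderboard: a Gradio app that renders NER leaderboard tables from two
# pre-computed CSV files (best configuration vs. best single model).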

title = """
# hmLeaderboard: A Space for tracking and ranking models on Historical NER Datasets

![hmLeaderboard](https://huggingface.co/spaces/hmbench/hmLeaderboard/resolve/main/logo.png)
"""

description = """
## Models

At the moment the following backbone LMs are supported:

* hmBERT: [Historical Multilingual Language Models for Named Entity Recognition](https://huggingface.co/hmbert).
* hmTEAMS: [Historical Multilingual TEAMS Models](https://huggingface.co/hmteams).
* hmByT5: [Historical Multilingual and Monolingual ByT5 Models](https://huggingface.co/hmbyt5).

## Datasets

We test our pretrained language models on various datasets from HIPE-2020, HIPE-2022 and Europeana. The following
table gives an overview of the datasets used.


| Language | Datasets                                                         |
|----------|------------------------------------------------------------------|
| English  | [AjMC] - [TopRes19th]                                            |
| German   | [AjMC] - [NewsEye] - [HIPE-2020]                                 |
| French   | [AjMC] - [ICDAR-Europeana] - [LeTemps] - [NewsEye] - [HIPE-2020] |
| Finnish  | [NewsEye]                                                        |
| Swedish  | [NewsEye]                                                        |
| Dutch    | [ICDAR-Europeana]                                                |

[AjMC]: https://github.com/hipe-eval/HIPE-2022-data/blob/main/documentation/README-ajmc.md
[NewsEye]: https://github.com/hipe-eval/HIPE-2022-data/blob/main/documentation/README-newseye.md
[TopRes19th]: https://github.com/hipe-eval/HIPE-2022-data/blob/main/documentation/README-topres19th.md
[ICDAR-Europeana]: https://github.com/stefan-it/historic-domain-adaptation-icdar
[LeTemps]: https://github.com/hipe-eval/HIPE-2022-data/blob/main/documentation/README-letemps.md
[HIPE-2020]: https://github.com/hipe-eval/HIPE-2022-data/blob/main/documentation/README-hipe2020.md

## Results
"""
footer = "Made in the Bavarian Oberland with ❤️ and 🥨."

model_selection_file_names = {
    "Best Configuration": "best_model_configurations.csv",
    "Best Model": "best_models.csv"
}
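
# Assumed CSV layout (inferred from the code below): the first column holds the
# model name and every remaining column holds per-dataset F1-scores, with
# headers of the form "<Language> <Dataset>", e.g. "German AjMC".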

df_init = pd.read_csv(model_selection_file_names["Best Configuration"])
dataset_names = df_init.columns.values[1:].tolist()
# Sort for a deterministic order in the language checkbox group
languages = sorted({dataset_name.split(" ")[0] for dataset_name in dataset_names})


def perform_evaluation_for_datasets(model_selection, selected_datasets):
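    """Build a results table for the selected datasets.

    Returns the model-name column, the columns of the selected datasets and an
    "Average" column holding the mean F1-score across those datasets.
    """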
    df = pd.read_csv(model_selection_file_names[model_selection])

    # Offset by 1 because the first CSV column holds the model name
    selected_indices = [dataset_names.index(selected_dataset) + 1 for selected_dataset in selected_datasets]

    mean_column = df.iloc[:, selected_indices].mean(axis=1).round(2)

    # Keep the model-name column plus the selected dataset columns; copy to
    # avoid pandas' SettingWithCopyWarning on the following assignment
    result_df = df.iloc[:, [0] + selected_indices].copy()
    result_df["Average"] = mean_column

    return result_df

def perform_evaluation_for_languages(model_selection, selected_languages):
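    """Build a results table for the selected languages.

    Selects every dataset column whose header contains one of the selected
    languages and appends an "Average" column with the mean F1-score.
    """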
    df = pd.read_csv(model_selection_file_names[model_selection])

    # Collect all columns whose header mentions a selected language (case-insensitive)
    selected_indices = []
    for selected_language in selected_languages:
        selected_language = selected_language.lower()
        selected_indices.extend(
            i for i, column_name in enumerate(df.columns) if selected_language in column_name.lower()
        )

    mean_column = df.iloc[:, selected_indices].mean(axis=1).round(2)

    # Keep the model-name column plus the matched dataset columns; copy to
    # avoid pandas' SettingWithCopyWarning on the following assignment
    result_df = df.iloc[:, [0] + selected_indices].copy()
    result_df["Average"] = mean_column

    return result_df

dataset_to_description_mapping = {
    "AjMC": "#### AjMC\nThe AjMC dataset consists of NE-annotated historical commentaries in the field of Classics, and was created in the context of the [Ajax MultiCommentary](https://mromanello.github.io/ajax-multi-commentary/) project.\n\nThe following NEs were annotated: `pers`, `work`, `loc`, `object`, `date` and `scope`.",
    "NewsEye": "#### NewsEye\nThe NewsEye dataset is comprised of diachronic historical newspaper material published between 1850 and 1950 in French, German, Finnish, and Swedish. More information can be found [here](https://dl.acm.org/doi/abs/10.1145/3404835.3463255).\n\nThe following NEs were annotated: `PER`, `LOC`, `ORG` and `HumanProd`.",
    "ICDAR": "#### ICDAR\nThe ICDAR-Europeana NER Dataset is a preprocessed variant of the [Europeana NER Corpora](https://github.com/EuropeanaNewspapers/ner-corpora) for Dutch and French.\n\nThe following NEs were annotated: `PER`, `LOC` and `ORG`.",
    "LeTemps": "#### LeTemps\nThe LeTemps dataset consists of NE-annotated historical French newspaper articles from mid-19C to mid 20C.\n\nThe following NEs were annotated: `loc`, `org` and `pers`.",
    "TopRes19th": "#### TopRes19th\nThe TopRes19th dataset consists of NE-annotated historical English newspaper articles from 19C.\n\nThe following NEs were annotated: `BUILDING`, `LOC` and `STREET`.",
    "HIPE-2020": "#### HIPE-2020\nThe HIPE-2020 dataset is comprised of newspapers from mid 19C to mid 20C. For information can be found [here](https://dl.acm.org/doi/abs/10.1007/978-3-030-58219-7_21).\n\nThe following NEs were annotated: `loc`, `org`, `pers`, `prod`, `time` and `comp`.",
}

configuration_to_description_mapping = {
    "Best Configuration": "The best hyper-parameter configuration for each model is used and average F1-score over runs with different seeds is reported here:",
    "Best Model": "The best hyper-parameter configuration for each model is used, the model with highest F1-score is chosen and its performance is reported here:"
}

with gr.Blocks() as demo:
    gr.Markdown(title)
    gr.Markdown(description)

    with gr.Tab("Overview"):
        gr.Markdown("### Best Configuration")
        gr.Markdown(configuration_to_description_mapping["Best Configuration"])

        df_result = perform_evaluation_for_datasets("Best Configuration", dataset_names)

        gr.Dataframe(value=df_result)

        gr.Markdown("### Best Model")
        gr.Markdown(configuration_to_description_mapping["Best Model"])

        df_result = perform_evaluation_for_datasets("Best Model", dataset_names)

        gr.Dataframe(value=df_result)

    for dataset_name, dataset_description in dataset_to_description_mapping.items():
        with gr.Tab(dataset_name):
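            # Collect all columns whose header contains this dataset's name (any language)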
            selected_datasets = [ds for ds in dataset_names if dataset_name.lower() in ds.lower()]

            gr.Markdown(dataset_description)

            for config in ["Best Configuration", "Best Model"]:
                gr.Markdown(f"##### Results for {config}")
                gr.Markdown(configuration_to_description_mapping[config])
                df_result = perform_evaluation_for_datasets(config, selected_datasets)
                gr.Dataframe(value=df_result)

    with gr.Tab("Filtering"):

        gr.Markdown("### Filtering\nSwiss-knife filtering for single datasets and languages is possible.")

        model_selection = gr.Radio(choices=["Best Configuration", "Best Model"],
                                   label="Model Selection",
                                   info="Defines whether the best configuration or the best model is used for evaluation. With 'Best Configuration', the best hyper-parameter configuration is used and the F1-score averaged over all runs is reported. With 'Best Model', the model with the highest F1-score on the development set (from the best hyper-parameter configuration) is used.",
                                   value="Best Configuration")

        with gr.Tab("Dataset Selection"):
            datasets_selection = gr.CheckboxGroup(
                dataset_names, label="Datasets", info="Select datasets for evaluation"
            )
            output_df = gr.Dataframe()

            evaluation_button = gr.Button("Evaluate")
            evaluation_button.click(fn=perform_evaluation_for_datasets, inputs=[model_selection, datasets_selection], outputs=output_df)


        with gr.Tab("Language Selection"):
            language_selection = gr.CheckboxGroup(
                languages, label="Languages", info="Select languages for evaluation"
            )
            output_df = gr.Dataframe()

            evaluation_button = gr.Button("Evaluate")
            evaluation_button.click(fn=perform_evaluation_for_languages, inputs=[model_selection, language_selection], outputs=output_df)
    gr.Markdown(footer)

demo.launch()