|
from intel_evaluate_extension.evaluation_suite.model_card_suite import ModelCardSuiteResults |
|
|
|
from evaluate.evaluation_suite import SubTask |
|
from evaluate.visualization import radar_plot |
|
|
|
|
|
_HEADER = "GLUE/AdvGlue Evaluation Results" |
|
|
|
_DESCRIPTION = """ |
|
The suite compares the GLUE results with Adversarial GLUE (AdvGLUE), a |
|
multi-task benchmark that tests the vulnerability of modern large-scale |
|
language models againstvarious adversarial attacks.""" |
|
|
|
|
|
class Suite(ModelCardSuiteResults): |
|
def __init__(self, name): |
|
super().__init__(name) |
|
self.result_keys = ["accuracy", "f1"] |
|
self.preprocessor = lambda x: {"text": x["text"].lower()} |
|
self.suite = [ |
|
SubTask( |
|
task_type="text-classification", |
|
data="glue", |
|
subset="sst2", |
|
split="validation[:5]", |
|
args_for_task={ |
|
"metric": "glue", |
|
"input_column": "sentence", |
|
"label_column": "label", |
|
"config_name": "sst2", |
|
"label_mapping": {"LABEL_0": 0.0, "LABEL_1": 1.0}, |
|
}, |
|
), |
|
SubTask( |
|
task_type="text-classification", |
|
data="adv_glue", |
|
subset="adv_sst2", |
|
split="validation[:5]", |
|
args_for_task={ |
|
"metric": "glue", |
|
"input_column": "sentence", |
|
"label_column": "label", |
|
"config_name": "sst2", |
|
"label_mapping": {"LABEL_0": 0.0, "LABEL_1": 1.0}, |
|
}, |
|
), |
|
SubTask( |
|
task_type="text-classification", |
|
data="glue", |
|
subset="qqp", |
|
split="validation[:5]", |
|
args_for_task={ |
|
"metric": "glue", |
|
"input_column": "question1", |
|
"second_input_column": "question2", |
|
"label_column": "label", |
|
"config_name": "qqp", |
|
"label_mapping": {"LABEL_0": 0, "LABEL_1": 1}, |
|
}, |
|
), |
|
SubTask( |
|
task_type="text-classification", |
|
data="adv_glue", |
|
subset="adv_qqp", |
|
split="validation[:5]", |
|
args_for_task={ |
|
"metric": "glue", |
|
"input_column": "question1", |
|
"second_input_column": "question2", |
|
"label_column": "label", |
|
"config_name": "qqp", |
|
"label_mapping": {"LABEL_0": 0, "LABEL_1": 1}, |
|
}, |
|
), |
|
SubTask( |
|
task_type="text-classification", |
|
data="glue", |
|
subset="qnli", |
|
split="validation[:5]", |
|
args_for_task={ |
|
"metric": "glue", |
|
"input_column": "question", |
|
"second_input_column": "sentence", |
|
"label_column": "label", |
|
"config_name": "qnli", |
|
"label_mapping": {"LABEL_0": 0, "LABEL_1": 1}, |
|
}, |
|
), |
|
SubTask( |
|
task_type="text-classification", |
|
data="adv_glue", |
|
subset="adv_qnli", |
|
split="validation[:5]", |
|
args_for_task={ |
|
"metric": "glue", |
|
"input_column": "question", |
|
"second_input_column": "sentence", |
|
"label_column": "label", |
|
"config_name": "qnli", |
|
"label_mapping": {"LABEL_0": 0, "LABEL_1": 1}, |
|
}, |
|
), |
|
SubTask( |
|
task_type="text-classification", |
|
data="glue", |
|
subset="rte", |
|
split="validation[:5]", |
|
args_for_task={ |
|
"metric": "glue", |
|
"input_column": "sentence1", |
|
"second_input_column": "sentence2", |
|
"label_column": "label", |
|
"config_name": "rte", |
|
"label_mapping": {"LABEL_0": 0, "LABEL_1": 1}, |
|
}, |
|
), |
|
SubTask( |
|
task_type="text-classification", |
|
data="adv_glue", |
|
subset="adv_rte", |
|
split="validation[:5]", |
|
args_for_task={ |
|
"metric": "glue", |
|
"input_column": "sentence1", |
|
"second_input_column": "sentence2", |
|
"label_column": "label", |
|
"config_name": "rte", |
|
"label_mapping": {"LABEL_0": 0, "LABEL_1": 1}, |
|
}, |
|
), |
|
SubTask( |
|
task_type="text-classification", |
|
data="glue", |
|
subset="mnli", |
|
split="validation_mismatched[:5]", |
|
args_for_task={ |
|
"metric": "glue", |
|
"input_column": "premise", |
|
"second_input_column": "hypothesis", |
|
"config_name": "mnli", |
|
"label_mapping": {"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2}, |
|
}, |
|
), |
|
SubTask( |
|
task_type="text-classification", |
|
data="adv_glue", |
|
subset="adv_mnli", |
|
split="validation[:5]", |
|
args_for_task={ |
|
"metric": "glue", |
|
"input_column": "premise", |
|
"second_input_column": "hypothesis", |
|
"config_name": "mnli", |
|
"label_mapping": {"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2}, |
|
}, |
|
), |
|
] |
|
|
|
def process_results(self, results): |
|
radar_data = [ |
|
{"accuracy " + result["task_name"].split("/")[-1]: result["accuracy"] for result in results[::2]}, |
|
{ |
|
"accuracy " + result["task_name"].replace("adv_", "").split("/")[-1]: result["accuracy"] |
|
for result in results[1::2] |
|
}, |
|
] |
|
return radar_plot(radar_data, ["GLUE", "AdvGLUE"]) |
|
|
|
def plot_results(self, results, model_or_pipeline): |
|
radar_data = self.process_results(results) |
|
graphic = radar_plot(radar_data, ["GLUE " + model_or_pipeline, "AdvGLUE " + model_or_pipeline]) |
|
return graphic |
|
|