|
from typing import Sequence

import numpy
from sklearn.metrics import roc_curve, auc

import datasets
import evaluate
|
_DESCRIPTION = """
MC-AUROC (Multi-class Area Under the Receiver Operating Characteristic Curve) is a performance metric used in multi-class classification tasks.
It evaluates the ability of a model to distinguish between positive and negative classes across different threshold values.
The curve is generated by plotting the true positive rate (sensitivity) against the false positive rate (1 - specificity) at various threshold settings.
AUROC provides a single scalar value indicating the overall discriminatory power of the model, with higher values suggesting better performance.
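
Concretely, for C classes the score reported here is the unweighted (macro) average of the per-class one-vs-rest values:

    MC-AUROC = (1 / C) * sum_{k=0}^{C-1} AUROC_k

where AUROC_k is computed by treating class k as the positive class and all other classes as the negative class.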
|
""" |
|
|
|
_KWARGS_DESCRIPTION = """
AUROC is a binary classification metric; a one-vs-rest strategy is used here to extend it to multi-class problems.
The multi-class AUROC is calculated by treating each class in turn as the positive class and all remaining classes as the negative class.
The final score is the unweighted average of the per-class AUROC scores.

Args:
    predictions: list-like. Predicted probabilities or decision scores for each class, one sequence per example.
    references: list-like. True labels indicating the actual class memberships (must be integer class indices, starting from 0).
Returns:
    mc_auroc_score: float. Multi-class Area Under the Receiver Operating Characteristic Curve (MC-AUROC) score.
    mc_auroc_ci: Confidence interval for the score (not implemented yet).
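
Example (an illustrative sketch; the file name "mc_auroc.py" is an assumption for loading this script from a local path):
    >>> import evaluate
    >>> mc_auroc = evaluate.load("mc_auroc.py")
    >>> results = mc_auroc.compute(
    ...     predictions=[[0.7, 0.2, 0.1], [0.1, 0.8, 0.1], [0.2, 0.3, 0.5]],
    ...     references=[0, 1, 2],
    ... )
    >>> print(results["mc_auroc_score"])
    1.0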
|
""" |
|
|
|
|
|
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) |
|
class AVG_MULTICLASS_AUROC(evaluate.Metric): |
|
def _info(self): |
|
return evaluate.MetricInfo( |
|
description=_DESCRIPTION, |
|
inputs_description=_KWARGS_DESCRIPTION, |
|
citation="", |
|
features=[ |
|
datasets.Features( |
|
{ |
|
"predictions":datasets.Sequence(datasets.Value("float")), |
|
"references": datasets.Value("int8") |
|
} |
|
), |
|
], |
|
reference_urls=[ |
|
"https://en.wikipedia.org/wiki/Receiver_operating_characteristic" |
|
], |
|
) |
|
|
|
    def _compute(self, predictions: Sequence[Sequence[float]], references: Sequence[int]):
        """
        Computes the average one-vs-rest AUROC score for multi-class classification problems.
        """
        probabilities = predictions
        # Infer the number of classes from the width of the probability vectors.
        n_classes = len(probabilities[0])

        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i in range(n_classes):
            # One-vs-rest: class i is the positive class, all other classes are negative.
            fpr[i], tpr[i], _ = roc_curve(
                y_true=[1 if x == i else 0 for x in references],
                y_score=[prob[i] for prob in probabilities],
            )
            roc_auc[i] = auc(fpr[i], tpr[i])

        # Unweighted (macro) average of the per-class AUROC scores.
        average_auc = numpy.mean(list(roc_auc.values()))

        return {
            "mc_auroc_score": average_auc,
            # Confidence-interval estimation is still a placeholder.
            "mc_auroc_ci": "Not implemented yet.",
        }
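

# A minimal usage sketch, not part of the metric itself: it instantiates the class
# directly and scores a toy batch of three examples over three classes. With
# perfectly separable scores, every one-vs-rest AUROC is 1.0, so the macro
# average is 1.0 as well.
if __name__ == "__main__":
    metric = AVG_MULTICLASS_AUROC()
    result = metric.compute(
        predictions=[[0.7, 0.2, 0.1], [0.1, 0.8, 0.1], [0.2, 0.3, 0.5]],
        references=[0, 1, 2],
    )
    print(result["mc_auroc_score"])  # 1.0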
|
|