import evaluate
from datasets import Features, Value
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_recall_fscore_support,
)

_CITATION = """
@article{scikit-learn,
  title={Scikit-learn: Machine Learning in {P}ython},
  author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
          and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
          and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
          Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
  journal={Journal of Machine Learning Research},
  volume={12},
  pages={2825--2830},
  year={2011}
}
"""

_DESCRIPTION = """
This evaluator computes multiple classification metrics to assess the performance of a model.

Metrics calculated include:

- Accuracy: The proportion of correct predictions among the total number of cases processed.
  Computed as (TP + TN) / (TP + TN + FP + FN), where TP, TN, FP, and FN denote true positives,
  true negatives, false positives, and false negatives respectively.
- Precision, Recall, and F1-Score: Reported as macro averages (unweighted mean of the per-class
  scores) and micro averages (computed globally over the aggregated contributions of all classes).
- Confusion Matrix: A matrix whose entry (i, j) counts the samples of true class i that were
  predicted as class j.
"""

_KWARGS_DESCRIPTION = """
Args:
    predictions (`list` of `str`): Predicted labels.
    references (`list` of `str`): Ground truth labels.

Returns:
    Dict containing:
        accuracy (float): Proportion of correct predictions. Value ranges between 0 (worst) and 1 (best).
        precision_macro (float), recall_macro (float), f1_macro (float): Macro averages of precision,
            recall, and F1-score respectively.
        precision_micro (float), recall_micro (float), f1_micro (float): Micro averages of precision,
            recall, and F1-score respectively.
        confusion_matrix (list of lists): 2D list representing the confusion matrix of the
            classification results.
"""


class ClassificationEvaluator(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=Features(
                {"predictions": Value("string"), "references": Value("string")}
            ),
        )

    def _compute(self, predictions, references):
        # Overall accuracy: fraction of predictions that exactly match the references.
        accuracy = accuracy_score(references, predictions, normalize=True, sample_weight=None)

        # Macro and micro averages for precision, recall, and F1-score.
        precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
            references, predictions, average="macro"
        )
        precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(
            references, predictions, average="micro"
        )

        # Confusion matrix: rows correspond to true labels, columns to predicted
        # labels, with classes ordered by sorted label value.
        conf_matrix = confusion_matrix(references, predictions)

        return {
            "accuracy": float(accuracy),
            "precision_macro": float(precision_macro),
            "recall_macro": float(recall_macro),
            "f1_macro": float(f1_macro),
            "precision_micro": float(precision_micro),
            "recall_micro": float(recall_micro),
            "f1_micro": float(f1_micro),
            "confusion_matrix": conf_matrix.tolist(),
        }
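

# A minimal usage sketch, not part of the metric definition itself. It assumes the
# class can be instantiated directly (the usual alternative is loading this file
# through evaluate.load() with its local path); the example labels below are
# hypothetical.
if __name__ == "__main__":
    metric = ClassificationEvaluator()
    results = metric.compute(
        predictions=["cat", "dog", "cat", "bird"],
        references=["cat", "dog", "bird", "bird"],
    )
    # `results` holds accuracy, the macro/micro precision, recall, and F1 scores,
    # and the confusion matrix as a nested list.
    print(results)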