HalteroXHunter committed
Commit 1a07572
1 Parent(s): e368a57

include new metrics

Files changed (1):
  classification_evaluator.py +34 -15
classification_evaluator.py CHANGED
@@ -1,6 +1,7 @@
 import evaluate
 from datasets import Features, Value
-from sklearn.metrics import accuracy_score
+from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
+
 
 _CITATION = """
 @article{scikit-learn,
@@ -17,13 +18,11 @@ _CITATION = """
 """
 
 _DESCRIPTION = """
-Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
-Accuracy = (TP + TN) / (TP + TN + FP + FN)
-Where:
-TP: True positive
-TN: True negative
-FP: False positive
-FN: False negative
+This evaluator computes multiple classification metrics to assess the performance of a model. Metrics calculated include:
+- Accuracy: The proportion of correct predictions among the total number of cases processed. Computed as (TP + TN) / (TP + TN + FP + FN), where TP, TN, FP, and FN denote true positives, true negatives, false positives, and false negatives respectively.
+- Precision, Recall, and F1-Score: Evaluated for each class individually as well as macro (average across classes) and micro (aggregate contributions of all classes) averages.
+- Confusion Matrix: A matrix representing the classification accuracy for each class combination.
+
 """
 
 _KWARGS_DESCRIPTION = """
@@ -32,8 +31,12 @@ Args:
     references (`list` of `str`): Ground truth labels.
 
 Returns:
-    accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy.
-
+    Returns:
+        Dict containing:
+            accuracy (float): Proportion of correct predictions. Value ranges between 0 (worst) and 1 (best).
+            precision_macro (float), recall_macro (float), f1_macro (float): Macro averages of precision, recall, and F1-score respectively.
+            precision_micro (float), recall_micro (float), f1_micro (float): Micro averages of precision, recall, and F1-score respectively.
+            confusion_matrix (list of lists): 2D list representing the confusion matrix of the classification results.
 """
 
 
@@ -50,10 +53,26 @@ class ClassificationEvaluator(evaluate.Metric):
 
     def _compute(self, predictions, references):
 
+        accuracy = accuracy_score(references, predictions, normalize=True, sample_weight=None)
+
+        # Calculate macro and micro averages for precision, recall, and F1-score
+        precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
+            references, predictions, average='macro'
+        )
+        precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(
+            references, predictions, average='micro'
+        )
+
+        # Calculate the confusion matrix
+        conf_matrix = confusion_matrix(references, predictions)
+
         return {
-            "accuracy": float(
-                accuracy_score(
-                    references, predictions, normalize=True, sample_weight=None
-                )
-            )
+            "accuracy": accuracy,
+            "precision_macro": float(precision_macro),
+            "recall_macro": float(recall_macro),
+            "f1_macro": float(f1_macro),
+            "precision_micro": float(precision_micro),
+            "recall_micro": float(recall_micro),
+            "f1_micro": float(f1_micro),
+            "confusion_matrix": conf_matrix.tolist()
        }
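
As a quick sanity check on the new metrics, the snippet below is a minimal sketch that mirrors the updated _compute body directly with scikit-learn on toy string labels. The label values and variable names are illustrative only, not part of the commit; it shows the shape of the dictionary the evaluator now returns and why the micro-averaged scores coincide with accuracy for single-label classification.

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_recall_fscore_support,
)

# Toy labels; in the evaluator these come from the `predictions` and
# `references` arguments of `_compute`.
references = ["cat", "dog", "dog", "bird", "cat", "bird"]
predictions = ["cat", "dog", "cat", "bird", "cat", "dog"]

accuracy = accuracy_score(references, predictions)

# Macro averaging weights every class equally; micro averaging pools all
# individual decisions, so for single-label inputs it equals accuracy.
precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
    references, predictions, average="macro"
)
precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(
    references, predictions, average="micro"
)

conf_matrix = confusion_matrix(references, predictions)

result = {
    "accuracy": accuracy,
    "precision_macro": float(precision_macro),
    "recall_macro": float(recall_macro),
    "f1_macro": float(f1_macro),
    "precision_micro": float(precision_micro),
    "recall_micro": float(recall_micro),
    "f1_micro": float(f1_micro),
    "confusion_matrix": conf_matrix.tolist(),
}
print(result)

Once the module is available on the Hub, the same dictionary would typically be obtained through the evaluate API, i.e. evaluate.load(...) followed by .compute(predictions=..., references=...); the exact load path depends on where this repository is hosted, so it is not spelled out here.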