import evaluate
from datasets import Features, Value
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_recall_fscore_support,
)

_CITATION = """
@article{scikit-learn,
  title={Scikit-learn: Machine Learning in {P}ython},
  author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
          and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
          and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
          Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
  journal={Journal of Machine Learning Research},
  volume={12},
  pages={2825--2830},
  year={2011}
}
"""

_DESCRIPTION = """
This evaluator computes multiple classification metrics to assess the performance of a model.

Metrics calculated include:

- Accuracy: The proportion of correct predictions among the total number of cases processed.
  Computed as (TP + TN) / (TP + TN + FP + FN), where TP, TN, FP, and FN denote true positives,
  true negatives, false positives, and false negatives respectively.
- Precision, Recall, and F1-Score: Reported as macro averages (unweighted mean of the per-class
  scores) and micro averages (computed globally over the aggregated contributions of all classes).
- Confusion Matrix: A matrix whose entry (i, j) counts the samples of true class i that were
  predicted as class j.
"""

_KWARGS_DESCRIPTION = """
Args:
    predictions (`list` of `str`): Predicted labels.
    references (`list` of `str`): Ground truth labels.

Returns:
    Dict containing:
        accuracy (float): Proportion of correct predictions. Value ranges between 0 (worst) and 1 (best).
        precision_macro (float), recall_macro (float), f1_macro (float): Macro averages of precision,
            recall, and F1-score respectively.
        precision_micro (float), recall_micro (float), f1_micro (float): Micro averages of precision,
            recall, and F1-score respectively.
        confusion_matrix (list of lists): 2D list representing the confusion matrix of the
            classification results.
"""


class ClassificationEvaluator(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=Features(
                {"predictions": Value("string"), "references": Value("string")}
            ),
        )

    def _compute(self, predictions, references):
        # Overall accuracy: fraction of predictions that exactly match the references.
        accuracy = accuracy_score(references, predictions, normalize=True, sample_weight=None)

        # Macro and micro averages for precision, recall, and F1-score.
        precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
            references, predictions, average="macro"
        )
        precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(
            references, predictions, average="micro"
        )

        # Confusion matrix: rows correspond to true labels, columns to predicted
        # labels, with classes ordered by sorted label value.
        conf_matrix = confusion_matrix(references, predictions)

        return {
            "accuracy": float(accuracy),
            "precision_macro": float(precision_macro),
            "recall_macro": float(recall_macro),
            "f1_macro": float(f1_macro),
            "precision_micro": float(precision_micro),
            "recall_micro": float(recall_micro),
            "f1_micro": float(f1_micro),
            "confusion_matrix": conf_matrix.tolist(),
        }
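

# A minimal usage sketch, not part of the metric definition itself. It assumes the
# class can be instantiated directly (the usual alternative is loading this file
# through evaluate.load() with its local path); the example labels below are
# hypothetical.
if __name__ == "__main__":
    metric = ClassificationEvaluator()
    results = metric.compute(
        predictions=["cat", "dog", "cat", "bird"],
        references=["cat", "dog", "bird", "bird"],
    )
    # `results` holds accuracy, the macro/micro precision, recall, and F1 scores,
    # and the confusion matrix as a nested list.
    print(results)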