from typing import Dict, List

import evaluate
from datasets import Features, Sequence, Value
from sklearn.metrics import accuracy_score

from preprocessing import absa_term_preprocess

_CITATION = """
"""

_DESCRIPTION = """
Evaluation metrics for Aspect-Based Sentiment Analysis (ABSA): precision, recall, and F1 score for aspect term extraction and aspect category detection, plus accuracy for the associated polarities.
"""

_KWARGS_DESCRIPTION = """
Computes precision, recall, and F1 score for aspect term extraction and aspect category detection in Aspect-Based Sentiment Analysis (ABSA), along with accuracy for the associated polarities.
Args:
    predictions: List of ABSA predictions with the following structure:
        - 'aspects': Sequence of aspect annotations, each with the following keys:
            - 'term': Aspect term
            - 'polarity': Polarity of the aspect term
        - 'category': Sequence of category annotations, each with the following keys:
            - 'category': Aspect category
            - 'polarity': Polarity of the aspect category
    references: List of ABSA references with the same structure as predictions.
Returns:
    term_extraction_results: Precision, recall, and F1 score for aspect term extraction
    term_polarity_results_accuracy: Accuracy of aspect term polarities
    category_detection_results: Precision, recall, and F1 score for aspect category detection
    category_polarity_results_accuracy: Accuracy of aspect category polarities
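Example (illustrative sketch only; the inputs below are made up and the exact scores
depend on the alignment performed by ``absa_term_preprocess``):
    >>> metric = AbsaEvaluatorTest()
    >>> predictions = [
    ...     {
    ...         "aspects": {"term": ["battery", "screen"], "polarity": ["positive", "negative"]},
    ...         "category": {"category": ["LAPTOP#GENERAL"], "polarity": ["positive"]},
    ...     }
    ... ]
    >>> references = [
    ...     {
    ...         "aspects": {"term": ["battery", "screen"], "polarity": ["positive", "positive"]},
    ...         "category": {"category": ["LAPTOP#GENERAL"], "polarity": ["neutral"]},
    ...     }
    ... ]
    >>> results = metric.compute(predictions=predictions, references=references)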
""" | |


class AbsaEvaluatorTest(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=Features(
                {
                    "predictions": Features(
                        {
                            "aspects": Features(
                                {
                                    "term": Sequence(Value("string")),
                                    "polarity": Sequence(Value("string")),
                                }
                            ),
                            "category": Features(
                                {
                                    "category": Sequence(Value("string")),
                                    "polarity": Sequence(Value("string")),
                                }
                            ),
                        }
                    ),
                    "references": Features(
                        {
                            "aspects": Features(
                                {
                                    "term": Sequence(Value("string")),
                                    "polarity": Sequence(Value("string")),
                                }
                            ),
                            "category": Features(
                                {
                                    "category": Sequence(Value("string")),
                                    "polarity": Sequence(Value("string")),
                                }
                            ),
                        }
                    ),
                }
            ),
        )

    def _compute(self, predictions, references):
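        """
        Evaluate aspect term extraction and aspect category detection with
        precision/recall/F1 (via ``semeval_metric``), and the corresponding
        polarities with accuracy.
        """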
        # preprocess aspect term
        (
            truth_aspect_terms,
            pred_aspect_terms,
            truth_term_polarities,
            pred_term_polarities,
        ) = absa_term_preprocess(
            references=references,
            predictions=predictions,
            subtask_key="aspects",
            subtask_value="term",
        )

        # evaluate
        term_results = self.semeval_metric(
            truth_aspect_terms, pred_aspect_terms
        )
        term_polarity_acc = accuracy_score(
            truth_term_polarities, pred_term_polarities
        )

        # preprocess category detection
        (
            truth_categories,
            pred_categories,
            truth_cat_polarities,
            pred_cat_polarities,
        ) = absa_term_preprocess(
            references=references,
            predictions=predictions,
            subtask_key="category",
            subtask_value="category",
        )

        # evaluate
        category_results = self.semeval_metric(
            truth_categories, pred_categories
        )
        cat_polarity_acc = accuracy_score(
            truth_cat_polarities, pred_cat_polarities
        )

        return {
            "term_extraction_results": term_results,
            "term_polarity_results_accuracy": term_polarity_acc,
            "category_detection_results": category_results,
            "category_polarity_results_accuracy": cat_polarity_acc,
        }

    def semeval_metric(
        self, truths: List[List[str]], preds: List[List[str]]
    ) -> Dict[str, float]:
        """
        Implements evaluation for extraction tasks using precision, recall, and F1 score.

        Parameters:
        - truths: List of lists, where each list contains the ground truth labels for a sample.
        - preds: List of lists, where each list contains the predicted labels for a sample.

        Returns:
        - A dictionary containing the precision, recall, F1 score, and counts of common, retrieved, and relevant labels.

        Reference implementation: https://github.com/davidsbatista/Aspect-Based-Sentiment-Analysis/blob/1d9c8ec1131993d924e96676fa212db6b53cb870/libraries/baselines.py#L387
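
        Example (illustrative numbers only):
            truths = [["battery", "screen"], ["service"]] and
            preds = [["battery"], ["service", "food"]] give common=2,
            retrieved=3, relevant=3, so precision = recall = f1_score = 2/3.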
""" | |
        # beta = 1 weights precision and recall equally, i.e. the standard F1 score
        b = 1
        common, relevant, retrieved = 0.0, 0.0, 0.0
        for truth, pred in zip(truths, preds):
            common += len([a for a in pred if a in truth])
            retrieved += len(pred)
            relevant += len(truth)
        precision = common / retrieved if retrieved > 0 else 0.0
        recall = common / relevant if relevant > 0 else 0.0
        f1 = (
            (1 + (b**2)) * precision * recall / ((precision * b**2) + recall)
            if precision > 0 and recall > 0
            else 0.0
        )
        return {
            "precision": precision,
            "recall": recall,
            "f1_score": f1,
            "common": common,
            "retrieved": retrieved,
            "relevant": relevant,
        }
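

# A minimal, self-contained sanity check for ``semeval_metric`` (a sketch for
# illustration only; the labels below are made up and mirror the worked example
# in the docstring, not a real ABSA dataset).
if __name__ == "__main__":
    metric = AbsaEvaluatorTest()
    truths = [["battery", "screen"], ["service"]]
    preds = [["battery"], ["service", "food"]]
    # Expected: common=2, retrieved=3, relevant=3 -> precision = recall = f1_score = 2/3
    print(metric.semeval_metric(truths, preds))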