# Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. import logging import string from collections import Counter from typing import Callable import regex from rouge import Rouge rouge = Rouge() logger = logging.getLogger(__name__) # Normalization and score functions from SQuAD evaluation script https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/ def normalize_answer(s: str) -> str: def remove_articles(text): return regex.sub(r"\b(a|an|the)\b", " ", text) def white_space_fix(text): return " ".join(text.split()) def remove_punc(text): exclude = set(string.punctuation) return "".join(ch for ch in text if ch not in exclude) def lower(text): return text.lower() return white_space_fix(remove_articles(remove_punc(lower(s)))) def em(prediction, ground_truth, normalize_fn): return float(normalize_fn(prediction) == normalize_fn(ground_truth)) def f1(prediction, ground_truth, normalize_fn): prediction_tokens = normalize_fn(prediction).split() ground_truth_tokens = normalize_fn(ground_truth).split() common = Counter(prediction_tokens) & Counter(ground_truth_tokens) num_same = sum(common.values()) if num_same == 0: return 0 precision = 1.0 * num_same / len(prediction_tokens) recall = 1.0 * num_same / len(ground_truth_tokens) f1 = (2 * precision * recall) / (precision + recall) return f1 def rouge_wrapper(prediction, ground_truth): try: result = rouge.get_scores(prediction, ground_truth, avg=True) return result["rouge-1"]["f"], result["rouge-2"]["f"], result["rouge-l"]["f"] except: return 0.0, 0.0, 0.0 # pred = [p1, p2 ..., pn] gt=[[g1,g2,...,gn]] def f1_score(prediction, ground_truths, normalize_fn: Callable[[str], str] = lambda x: x): return max([f1(prediction, gt, normalize_fn) for gt in ground_truths]) def exact_match_score(prediction, ground_truths, normalize_fn: Callable[[str], str] = lambda x: x): return max([em(prediction, gt, normalize_fn) for gt in ground_truths]) # pred = [p1, p2 ..., pn] gt=[[g1,g2,...,gn]] def rouge_score(prediction, ground_truths): ground_truths = [x for x in ground_truths if len(x) > 0] if ( len(prediction) == 0 or len(ground_truths) == 0 ): # check if empty prediction or if there is no hypothesis with len > 0 return 0.0, 0.0, 0.0 scores = [rouge_wrapper(prediction, gt) for gt in ground_truths] rouge1 = max(s[0] for s in scores) rouge2 = max(s[1] for s in scores) rougel = max(s[2] for s in scores) return rouge1, rouge2, rougel # pred = [p1, p2 ..., pn] gt=[[g1,g2,...,gn]] def bleu_score(prediction, ground_truths): from sacrebleu import BLEU bleu = BLEU() score = bleu.corpus_score(prediction, ground_truths) return score