Spaces:

hallucinations-leaderboard
/

leaderboard

Running on CPU Upgrade

App Files Files Community

pminervini committed on Dec 13, 2023

Commit

46bcca0

1 Parent(s): 05346b7

update

Browse files

Files changed (5) hide show

src/backend/tasks/halueval/halueval_dialogue.yaml +8 -2
src/backend/tasks/halueval/halueval_qa.yaml +8 -2
src/backend/tasks/halueval/halueval_summarization.yaml +8 -2
src/backend/tasks/xsum/utils.py +89 -0
src/backend/tasks/xsum/xsum.yaml +49 -0

src/backend/tasks/halueval/halueval_dialogue.yaml CHANGED Viewed

@@ -4,13 +4,19 @@ task: halueval_dialogue
 dataset_path: pminervini/HaluEval
 dataset_name: dialogue_samples
 output_type: generate_until
-training_split: data
-validation_split: data
 test_split: data
 num_fewshot: 0
 doc_to_text: !function utils.doc_to_text_dialogue
 doc_to_target: !function utils.doc_to_target
 process_results: !function utils.process_results
 metric_list:
   - metric: em
     aggregation: mean

 dataset_path: pminervini/HaluEval
 dataset_name: dialogue_samples
 output_type: generate_until
+training_split: null
+validation_split: null
 test_split: data
 num_fewshot: 0
 doc_to_text: !function utils.doc_to_text_dialogue
 doc_to_target: !function utils.doc_to_target
 process_results: !function utils.process_results
+generation_kwargs:
+  until:
+    - "\n"
+    - "."
+  do_sample: false
+  temperature: 0.0
 metric_list:
   - metric: em
     aggregation: mean

src/backend/tasks/halueval/halueval_qa.yaml CHANGED Viewed

@@ -4,13 +4,19 @@ task: halueval_qa
 dataset_path: pminervini/HaluEval
 dataset_name: qa_samples
 output_type: generate_until
-training_split: data
-validation_split: data
 test_split: data
 num_fewshot: 0
 doc_to_text: !function utils.doc_to_text_qa
 doc_to_target: !function utils.doc_to_target
 process_results: !function utils.process_results
 metric_list:
   - metric: em
     aggregation: mean

 dataset_path: pminervini/HaluEval
 dataset_name: qa_samples
 output_type: generate_until
+training_split: null
+validation_split: null
 test_split: data
 num_fewshot: 0
 doc_to_text: !function utils.doc_to_text_qa
 doc_to_target: !function utils.doc_to_target
 process_results: !function utils.process_results
+generation_kwargs:
+  until:
+    - "\n"
+    - "."
+  do_sample: false
+  temperature: 0.0
 metric_list:
   - metric: em
     aggregation: mean

src/backend/tasks/halueval/halueval_summarization.yaml CHANGED Viewed

@@ -4,13 +4,19 @@ task: halueval_summarization
 dataset_path: pminervini/HaluEval
 dataset_name: summarization_samples
 output_type: generate_until
-training_split: data
-validation_split: data
 test_split: data
 num_fewshot: 0
 doc_to_text: !function utils.doc_to_text_summarization
 doc_to_target: !function utils.doc_to_target
 process_results: !function utils.process_results
 metric_list:
   - metric: em
     aggregation: mean

 dataset_path: pminervini/HaluEval
 dataset_name: summarization_samples
 output_type: generate_until
+training_split: null
+validation_split: null
 test_split: data
 num_fewshot: 0
 doc_to_text: !function utils.doc_to_text_summarization
 doc_to_target: !function utils.doc_to_target
 process_results: !function utils.process_results
+generation_kwargs:
+  until:
+    - "\n"
+    - "."
+  do_sample: false
+  temperature: 0.0
 metric_list:
   - metric: em
     aggregation: mean

src/backend/tasks/xsum/utils.py ADDED Viewed

	@@ -0,0 +1,89 @@

+import sacrebleu
+import numpy as np
+from rouge_score import rouge_scorer, scoring
+def process_results(doc, results):
+    # (Pdb)doc.keys()
+    # dict_keys(['document', 'summary', 'id'])
+    # (Pdb++) results
+    # [' The Welsh Government has announced
+    # breakpoint()
+    completion = results[0]
+    # true_refs, false_refs = doc["correct_answers"], doc["incorrect_answers"]
+    # all_refs = true_refs + false_refs
+    document = doc["document"]
+    true_refs = [doc["summary"]]
+    all_refs = true_refs
+    # ROUGE-N
+    rouge_scores = [rouge([ref], [completion]) for ref in all_refs]
+    # ROUGE-1
+    rouge1_scores = [score["rouge1"] for score in rouge_scores]
+    # ROUGE-2
+    rouge2_scores = [score["rouge2"] for score in rouge_scores]
+    # ROUGE-L
+    rougeL_scores = [score["rougeLsum"] for score in rouge_scores]
+    res = {
+        "rouge1": rouge1_scores[0],
+        "rouge2": rouge2_scores[0],
+        "rougeL": rougeL_scores[0],
+    }
+    return res
+def bleu(refs, preds):
+    """
+    Returns `t5` style BLEU scores. See the related implementation:
+    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41
+    :param refs:
+        A `list` of `list` of reference `str`s.
+    :param preds:
+        A `list` of predicted `str`s.
+    """
+    score = sacrebleu.corpus_bleu(
+        preds,
+        refs,
+        smooth_method="exp",
+        smooth_value=0.0,
+        force=False,
+        lowercase=False,
+        tokenize="intl",
+        use_effective_order=False,
+    ).score
+    return score
+def rouge(refs, preds):
+    """
+    Returns `t5` style ROUGE scores. See the related implementation:
+    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68
+    :param refs:
+        A `list` of reference `strs`.
+    :param preds:
+        A `list` of predicted `strs`.
+    """
+    rouge_types = ["rouge1", "rouge2", "rougeLsum"]
+    scorer = rouge_scorer.RougeScorer(rouge_types)
+    # Add newlines between sentences to correctly compute `rougeLsum`.
+    def _prepare_summary(summary):
+        summary = summary.replace(" . ", ".\n")
+        return summary
+    # Accumulate confidence intervals.
+    aggregator = scoring.BootstrapAggregator()
+    for ref, pred in zip(refs, preds):
+        ref = _prepare_summary(ref)
+        pred = _prepare_summary(pred)
+        aggregator.add_scores(scorer.score(ref, pred))
+    result = aggregator.aggregate()
+    return {type: result[type].mid.fmeasure * 100 for type in rouge_types}

src/backend/tasks/xsum/xsum.yaml ADDED Viewed

	@@ -0,0 +1,49 @@

+task: xsum
+dataset_path: EdinburghNLP/xsum
+dataset_name: xsum
+output_type: generate_until
+training_split: train
+validation_split: validation
+test_split: test
+doc_to_text: "Document: {{document}}\nSummary:"
+doc_to_target: "{{summary}}"
+# process_docs: !function utils.process_docs
+process_results: !function utils.process_results
+should_decontaminate: True
+doc_to_decontamination_query: document
+generation_kwargs:
+  until:
+    - "\n"
+    - "."
+  do_sample: false
+  temperature: 0.0
+# Metric names must match the keys returned by utils.process_results,
+# which are exactly: rouge1, rouge2, rougeL.
+metric_list:
+  - metric: rouge1
+    aggregation: mean
+    higher_is_better: true
+  - metric: rouge2
+    aggregation: mean
+    higher_is_better: true
+  - metric: rougeL
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  - version: 0.0