Spaces:

hallucinations-leaderboard
/

leaderboard

Running on CPU Upgrade

pminervini committed on Dec 12, 2023

Commit

b25a00b

•

1 Parent(s): d265631

update

Files changed (4) hide show

src/backend/tasks/halueval/halueval_dialogue.yaml CHANGED Viewed

@@ -7,8 +7,8 @@ validation_split: data
 test_split: data
 num_fewshot: 0
 doc_to_text: !function utils.doc_to_text_dialogue
-doc_to_target: !function utils.doc_to_target_qa
-process_results: !function utils.process_results_qa
 metric_list:
   - metric: em
     aggregation: mean

 test_split: data
 num_fewshot: 0
 doc_to_text: !function utils.doc_to_text_dialogue
+doc_to_target: !function utils.doc_to_target
+process_results: !function utils.process_results
 metric_list:
   - metric: em
     aggregation: mean

src/backend/tasks/halueval/halueval_qa.yaml CHANGED Viewed

@@ -7,8 +7,8 @@ validation_split: data
 test_split: data
 num_fewshot: 0
 doc_to_text: !function utils.doc_to_text_qa
-doc_to_target: !function utils.doc_to_target_qa
-process_results: !function utils.process_results_qa
 metric_list:
   - metric: em
     aggregation: mean

 test_split: data
 num_fewshot: 0
 doc_to_text: !function utils.doc_to_text_qa
+doc_to_target: !function utils.doc_to_target
+process_results: !function utils.process_results
 metric_list:
   - metric: em
     aggregation: mean

src/backend/tasks/halueval/halueval_summarization.yaml CHANGED Viewed

@@ -7,8 +7,8 @@ validation_split: data
 test_split: data
 num_fewshot: 0
 doc_to_text: !function utils.doc_to_text_summarization
-doc_to_target: !function utils.doc_to_target_qa
-process_results: !function utils.process_results_qa
 metric_list:
   - metric: em
     aggregation: mean

 test_split: data
 num_fewshot: 0
 doc_to_text: !function utils.doc_to_text_summarization
+doc_to_target: !function utils.doc_to_target
+process_results: !function utils.process_results
 metric_list:
   - metric: em
     aggregation: mean

src/backend/tasks/halueval/utils.py CHANGED Viewed

@@ -102,11 +102,11 @@ def doc_to_text_summarization(doc: dict[str, str]) -> str:
     return doc_text
-def doc_to_target_qa(doc: dict[str, str]) -> str:
     return doc['hallucination']
-def compute_metrics_qa(gold_answer: str, prediction: str) -> dict[str, float]:
     is_correct = True
     if ("Yes" in prediction and "No" in prediction) or ("Yes" not in prediction and "No" not in prediction):
@@ -122,13 +122,15 @@ def compute_metrics_qa(gold_answer: str, prediction: str) -> dict[str, float]:
     if is_correct:
         res["em"] = 1.0 if is_exact else 0.0
     return res
-def process_results_qa(doc: dict[str, str], results: list[str]):
     # results is e.g., ['Yes']
-    gold_list = doc_to_target_qa(doc)
     # gold_list is e.g., 'yes'
     prediction = results[0].strip().split("\n")[0]
-    scores = compute_metrics_qa(gold_list, prediction)
     return scores

     return doc_text
+def doc_to_target(doc: dict[str, str]) -> str:
     return doc['hallucination']
+def compute_metrics(gold_answer: str, prediction: str) -> dict[str, float]:
     is_correct = True
     if ("Yes" in prediction and "No" in prediction) or ("Yes" not in prediction and "No" not in prediction):
     if is_correct:
         res["em"] = 1.0 if is_exact else 0.0
+    res["acc"] = 1.0 if (is_correct and is_exact) else 0.0
     return res
+def process_results(doc: dict[str, str], results: list[str]):
     # results is e.g., ['Yes']
+    gold_list = doc_to_target(doc)
     # gold_list is e.g., 'yes'
     prediction = results[0].strip().split("\n")[0]
+    scores = compute_metrics(gold_list, prediction)
     return scores