data_only_hallucination_leaderboard

Runtime error

App Files Files Community

pminervini committed on Dec 7, 2023

Commit

b3fd791

•

1 Parent(s): 7f12787

cleanup

Browse files

Files changed (4) hide show

halueval-cli.py +5 -5
src/backend/tasks/halueval/halueval_dialogue.yaml +1 -1
src/backend/tasks/halueval/halueval_summarization.yaml +1 -1
src/backend/tasks/halueval/utils.py +1 -13

halueval-cli.py CHANGED Viewed

@@ -37,12 +37,12 @@ def main():
     task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)
-    print(f"Selected Tasks: {task_names}")
-    results = evaluator.simple_evaluate(model="hf-auto", model_args=eval_request.get_model_args(), tasks=task_names, num_fewshot=0,
-                                        batch_size=4, device=DEVICE, use_cache=None, limit=8, write_out=True)
-    print('AAA', results)
 if __name__ == "__main__":
     main()

     task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)
+    for task_name in task_names:
+        print(f"Selected Tasks: [{task_name}]")
+        results = evaluator.simple_evaluate(model="hf-auto", model_args=eval_request.get_model_args(), tasks=[task_name], num_fewshot=0,
+                                            batch_size=1, device=DEVICE, use_cache=None, limit=8, write_out=True)
+        print('AAA', results)
 if __name__ == "__main__":
     main()

src/backend/tasks/halueval/halueval_dialogue.yaml CHANGED Viewed

@@ -5,7 +5,7 @@ output_type: generate_until
 training_split: data
 validation_split: data
 doc_to_text: !function utils.doc_to_text_dialogue
-doc_to_target: !function utils.doc_to_target_dialogue
 process_results: !function utils.process_results_qa
 fewshot_delimiter: "\n"
 generation_kwargs:

 training_split: data
 validation_split: data
 doc_to_text: !function utils.doc_to_text_dialogue
+doc_to_target: !function utils.doc_to_target_qa
 process_results: !function utils.process_results_qa
 fewshot_delimiter: "\n"
 generation_kwargs:

src/backend/tasks/halueval/halueval_summarization.yaml CHANGED Viewed

@@ -5,7 +5,7 @@ output_type: generate_until
 training_split: data
 validation_split: data
 doc_to_text: !function utils.doc_to_text_summarization
-doc_to_target: !function utils.doc_to_target_summarization
 process_results: !function utils.process_results_qa
 fewshot_delimiter: "\n"
 generation_kwargs:

 training_split: data
 validation_split: data
 doc_to_text: !function utils.doc_to_text_summarization
+doc_to_target: !function utils.doc_to_target_qa
 process_results: !function utils.process_results_qa
 fewshot_delimiter: "\n"
 generation_kwargs:

src/backend/tasks/halueval/utils.py CHANGED Viewed

@@ -1,5 +1,4 @@
-from itertools import zip_longest
-import transformers.data.metrics.squad_metrics as squad_metrics
 QA_INSTURCTIONS = """I want you act as an answer judge. Given a question and an answer, your objective is to determine if the provided answer contains non-factual or hallucinated information. You SHOULD give your judgement based on the following hallucination types and the world knowledge.
@@ -83,21 +82,18 @@ You should try your best to determine if the summary contains non-factual or hal
 def doc_to_text_qa(doc: dict[str, str]) -> str:
-    # print('XXX doc_to_text_qa')
     # prompt = instruction + "\n\n#Question#: " + question + "\n#Answer#: " + answer + "\n#Your Judgement#:"
     doc_text = QA_INSTURCTIONS + "\n\n#Question#: " + doc["question"] + "\n#Answer#: " + doc["answer"] + "\n#Your Judgement#:"
     return doc_text
 def doc_to_text_dialogue(doc: dict[str, str]) -> str:
-    # print('XXX doc_to_text_dialogue')
     # prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
     doc_text = DIALOGUE_INSTRUCTIONS + "\n\n#Dialogue History#: " + doc["dialogue_history"] + "\n#Response#: " + doc["response"] + "\n#Your Judgement#:"
     return doc_text
 def doc_to_text_summarization(doc: dict[str, str]) -> str:
-    # print('XXX doc_to_text_dialogue')
     # prompt1 = instruction + "\n\n#Document#: " + document
     # prompt2 = "\n#Summary#: " + summary + "\n#Your Judgement#:"
     doc_text_1 = SUMMARIZATION_INSTRUCTIONS + "\n\n#Document#: " + doc["document"]
@@ -106,15 +102,7 @@ def doc_to_text_summarization(doc: dict[str, str]) -> str:
     return doc_text
-def doc_to_text_summarization(doc: dict[str, str]) -> str:
-    # print('XXX doc_to_text_dialogue')
-    # prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
-    doc_text = DIALOGUE_INSTRUCTIONS + "\n\n#Dialogue History#: " + doc["question"] + "\n#Answer#: " + doc["answer"] + "\n#Your Judgement#:"
-    return doc_text
 def doc_to_target_qa(doc: dict[str, str]) -> str:
-    # print('XXX doc_to_target_qa')
     return doc['hallucination']

+# Main reference: https://github.com/RUCAIBox/HaluEval/blob/main/evaluation/evaluate.py
 QA_INSTURCTIONS = """I want you act as an answer judge. Given a question and an answer, your objective is to determine if the provided answer contains non-factual or hallucinated information. You SHOULD give your judgement based on the following hallucination types and the world knowledge.
 def doc_to_text_qa(doc: dict[str, str]) -> str:
     # prompt = instruction + "\n\n#Question#: " + question + "\n#Answer#: " + answer + "\n#Your Judgement#:"
     doc_text = QA_INSTURCTIONS + "\n\n#Question#: " + doc["question"] + "\n#Answer#: " + doc["answer"] + "\n#Your Judgement#:"
     return doc_text
 def doc_to_text_dialogue(doc: dict[str, str]) -> str:
     # prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
     doc_text = DIALOGUE_INSTRUCTIONS + "\n\n#Dialogue History#: " + doc["dialogue_history"] + "\n#Response#: " + doc["response"] + "\n#Your Judgement#:"
     return doc_text
 def doc_to_text_summarization(doc: dict[str, str]) -> str:
     # prompt1 = instruction + "\n\n#Document#: " + document
     # prompt2 = "\n#Summary#: " + summary + "\n#Your Judgement#:"
     doc_text_1 = SUMMARIZATION_INSTRUCTIONS + "\n\n#Document#: " + doc["document"]
     return doc_text
 def doc_to_target_qa(doc: dict[str, str]) -> str:
     return doc['hallucination']