pminervini committed • Commit 3d44a49
1 Parent(s): 21eac98
update
cli/eval-cli.py
CHANGED
@@ -35,7 +35,8 @@ def main():
 # my_task = Task("memo-trap", "acc", "memo-trap", 0)
 # my_task = Task("selfcheckgpt", "avg-selfcheckgpt", "SGPT", 2)
 # my_task = Task("ifeval", "prompt_level_strict_acc", "IFEval", 0)
-my_task = Task("truefalse_cieacf", "acc", "TrueFalse", 5)
+# my_task = Task("truefalse_cieacf", "acc", "TrueFalse", 5)
+my_task = Task("faithdial_hallu", "acc", "FaithDIAL", 2)
 
 eval_logger = utils.eval_logger
 import logging
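
The hunk above retires the 5-shot TrueFalse debug task in favour of 2-shot FaithDial. The Task container is defined elsewhere in the repo; as a hedged sketch inferred only from the call sites above, its fields appear to be the harness task name, the metric to report, a leaderboard display name, and the few-shot count (the dataclass below is hypothetical, not the repo's actual definition):

from dataclasses import dataclass

@dataclass
class Task:
    benchmark: str    # lm-eval-harness task name, e.g. "faithdial_hallu" (assumed field name)
    metric: str       # metric key reported by the task, e.g. "acc" (assumed field name)
    col_name: str     # display name shown in the leaderboard, e.g. "FaithDIAL" (assumed field name)
    num_fewshot: int  # number of few-shot examples prepended to each prompt (assumed field name)

# The commit swaps the default debug task from 5-shot TrueFalse to 2-shot FaithDial:
my_task = Task("faithdial_hallu", "acc", "FaithDIAL", 2)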
src/backend/tasks/faithdial/faithdial.yaml
ADDED
@@ -0,0 +1,16 @@
+group: faithdial
+task: faithdial_hallu
+dataset_path: McGill-NLP/FaithDial
+training_split: train
+validation_split: validation
+test_split: test
+output_type: multiple_choice
+doc_to_text: !function utils.doc_to_text
+doc_to_target: !function utils.doc_to_target
+# process_results: !function utils.process_results
+doc_to_choice: ["false", "true"]
+metric_list:
+  - metric: acc
+    higher_is_better: True
+metadata:
+  version: 0.0
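
faithdial.yaml registers faithdial_hallu as an lm-eval-harness multiple_choice task: the model scores the two completions "false" and "true" from doc_to_choice, and acc records whether the higher-likelihood choice matches doc_to_target. A minimal sketch of running it, assuming a recent lm-eval release where simple_evaluate exists and that this task directory is on the harness's task search path (the model and its arguments are placeholders, for illustration only):

import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=gpt2",  # placeholder model, not what the Space evaluates
    tasks=["faithdial_hallu"],
    num_fewshot=2,
)
print(results["results"]["faithdial_hallu"])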
src/backend/tasks/faithdial/utils.py
ADDED
@@ -0,0 +1,20 @@
+from typing import List, Union
+ValueType = Union[str, List[str]]
+
+
+def doc_to_text(doc: dict[str, ValueType]) -> str:
+    history_str = " ".join([f'[{"Human" if i % 2 == 0 else "Assistant"}] {m}' for i, m in enumerate(doc['history'])])
+    doc_text = f'#Knowledge#: {doc["knowledge"]}\n#Dialogue History#: {history_str}\n#Response#: {doc["response"]}\n#Hallucinated#:'
+    # breakpoint()
+    return doc_text
+
+
+def doc_to_target(doc: dict[str, ValueType]) -> str:
+    res = "true" if "Hallucination" in doc["BEGIN"] else "false"
+    # breakpoint()
+    return res
+
+
+def process_results(doc: dict[str, ValueType], results: List[str]) -> dict[str, float]:
+    # breakpoint()
+    return {"acc": 0.0}
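
utils.py turns a FaithDial row into the #Knowledge#/#Dialogue History#/#Response# prompt, alternating [Human] and [Assistant] tags over the dialogue history, and maps the row's BEGIN annotations to a "true"/"false" target. On an invented record shaped like a McGill-NLP/FaithDial row (field values are illustrative only):

# Hypothetical FaithDial-like record for demonstration; values are made up.
doc = {
    "history": ["Tell me about the Eiffel Tower.", "It is in Paris."],
    "knowledge": "The Eiffel Tower is a wrought-iron lattice tower in Paris.",
    "response": "It was built in 1750 by the Romans.",
    "BEGIN": ["Hallucination"],
}

print(doc_to_text(doc))
# #Knowledge#: The Eiffel Tower is a wrought-iron lattice tower in Paris.
# #Dialogue History#: [Human] Tell me about the Eiffel Tower. [Assistant] It is in Paris.
# #Response#: It was built in 1750 by the Romans.
# #Hallucinated#:

print(doc_to_target(doc))  # -> "true"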
src/backend/tasks/halueval/utils.py
CHANGED
@@ -83,13 +83,13 @@ You should try your best to determine if the summary contains non-factual or hal
 
 def doc_to_text_qa(doc: dict[str, str]) -> str:
     # prompt = instruction + "\n\n#Question#: " + question + "\n#Answer#: " + answer + "\n#Your Judgement#:"
-    doc_text = QA_INSTURCTIONS + "\n\n#Knowledge
+    doc_text = QA_INSTURCTIONS + "\n\n#Knowledge#: " + doc["knowledge"] + "\n#Question#: " + doc["question"] + "\n#Answer#: " + doc["answer"] + "\n#Your Judgement#:"
     return doc_text
 
 
 def doc_to_text_dialogue(doc: dict[str, str]) -> str:
     # prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
-    doc_text = DIALOGUE_INSTRUCTIONS + "\n\n#Knowledge
+    doc_text = DIALOGUE_INSTRUCTIONS + "\n\n#Knowledge#: " + doc["knowledge"] + "\n#Dialogue History#: " + doc["dialogue_history"] + "\n#Response#: " + doc["response"] + "\n#Your Judgement#:"
     return doc_text
 
 
@@ -127,7 +127,7 @@ def compute_metrics(gold_answer: str, prediction: str) -> dict[str, float]:
     return res
 
 
-def process_results(doc: dict[str, str], results: list[str]):
+def process_results(doc: dict[str, str], results: list[str]) -> dict[str, float]:
     # results is e.g., ['Yes']
     gold_list = doc_to_target(doc)
     # gold_list is e.g., 'yes'
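
The second hunk only adds a return annotation to process_results, which compares the model's raw judgement (e.g. ['Yes']) against the gold label from doc_to_target via compute_metrics. A minimal sketch of the exact-match accuracy this reduces to, assuming simple case and whitespace normalization (the repo's actual compute_metrics may handle more cases, such as refusals or malformed outputs):

# Hypothetical helper, not the repo's compute_metrics.
def compute_metrics_sketch(gold_answer: str, prediction: str) -> dict[str, float]:
    # Normalize case/whitespace, then score exact match as accuracy.
    acc = 1.0 if prediction.strip().lower() == gold_answer.strip().lower() else 0.0
    return {"acc": acc}

print(compute_metrics_sketch("yes", "Yes"))  # {'acc': 1.0}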