pminervini committed
Commit a6af742 · Parent(s): efa0391
update

Files changed:
- src/backend/envs.py (+1, -4)
- src/backend/tasks/selfcheckgpt/task.py (+19, -20)
src/backend/envs.py
CHANGED
@@ -17,13 +17,10 @@ class Task:
 
 
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    # task0 = Task("anli_r1", "acc", "ANLI")
-    # task1 = Task("logiqa", "acc_norm", "LogiQA")
     task0 = Task("nq_open", "em", "NQ Open", 64)  # 64, as in the ATLAS paper
     task1 = Task("triviaqa", "em", "TriviaQA", 64)  # 64, as in the ATLAS paper
-    # TruthfulQA is intended as a zero-shot benchmark [5, 47]. https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
 
+    # TruthfulQA is intended as a zero-shot benchmark [5, 47]. https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
     task2 = Task("truthfulqa_gen", "rougeL_acc", "TruthfulQA Gen", 0)
     task3 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1", 0)
     task4 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2", 0)
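For context, Task here is a small config record: the removed comment documents its first three fields (task_key in the json file, metric_key in the json file, name to display in the leaderboard), and the fourth positional argument is the few-shot count (64 as in the ATLAS paper, 0 for zero-shot). A minimal sketch of the shape these entries assume; the field names are illustrative, not taken from the repository:

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str     # task_key in the results json file, e.g. "nq_open"
    metric: str        # metric_key in the json file, e.g. "em"
    col_name: str      # name to display in the leaderboard
    num_fewshot: int   # few-shot count: 64 as in the ATLAS paper, 0 for zero-shot

class Tasks(Enum):
    task0 = Task("nq_open", "em", "NQ Open", 64)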
src/backend/tasks/selfcheckgpt/task.py
CHANGED
@@ -18,6 +18,7 @@ class SelfCheckGpt(Task):
     DATASET_PATH = "potsawee/wiki_bio_gpt3_hallucination"
     DATASET_NAME = None
     OUTPUT_TYPE = 'generate_until'
+
     def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
         super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)
         self.generation_kwargs = {"temperature": 0.0, "do_sample": False}
@@ -54,7 +55,8 @@ class SelfCheckGpt(Task):
         doc_text = doc["wiki_bio_text"]
         doc_text = doc_text.split()
         doc_text = " ".join(doc_text[:5])
-
+        # prompt = f"This is a passage from Wikipedia about {context}:\n\n"
+        doc_text = f"Please generate a Wikipedia passage starting with: {doc_text}\n"
         return doc_text
 
     def doc_to_target(self, doc):
@@ -82,35 +84,32 @@ class SelfCheckGpt(Task):
         sentences = self.selfcheckgpt_nlp(response_temperature_0)
         sentences = [sent.text.strip() for sent in sentences.sents]
         if self.selfcheckgpt_type == 'SelfCheckNgram':
-            selfcheckgpt_scores = self.selfcheckgpt.predict(
-                sentences = sentences,
-                passage = response_temperature_0,
-                sampled_passages = other_responses,
-            )
-            return {'avg-selfcheckgpt': selfcheckgpt_scores['doc_level']['avg_neg_logprob'],
-                    'max-selfcheckgpt': selfcheckgpt_scores['doc_level']['avg_max_neg_logprob']}
+            selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, passage=response_temperature_0, sampled_passages=other_responses)
+            return {
+                'avg-selfcheckgpt': selfcheckgpt_scores['doc_level']['avg_neg_logprob'],
+                'max-selfcheckgpt': selfcheckgpt_scores['doc_level']['avg_max_neg_logprob']
+            }
 
         elif self.selfcheckgpt_type == 'SelfCheckBERTScore':
             selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
         elif self.selfcheckgpt_type == 'SelfCheckMQAG':
             selfcheckgpt_scores = self.selfcheckgpt.predict(
-                sentences = sentences,
-                passage = response_temperature_0,
-                sampled_passages = other_responses,
-                num_questions_per_sent = 5,
-                scoring_method = 'bayes_with_alpha',
-                beta1 = 0.8, beta2 = 0.8,
-            )
+                sentences=sentences,
+                passage=response_temperature_0,
+                sampled_passages=other_responses,
+                num_questions_per_sent=5,  # number of questions to be drawn
+                scoring_method='bayes_with_alpha',  # options = 'counting', 'bayes', 'bayes_with_alpha'
+                beta1=0.8, beta2=0.8)  # additional params depending on scoring_method
         elif self.selfcheckgpt_type == 'SelfCheckNLI':
-            selfcheckgpt_scores = self.selfcheckgpt.predict(
-                sentences = sentences,
-                sampled_passages = other_responses,
-            )
+            selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
 
         if len(selfcheckgpt_scores) == 0:
             self.SelfCheckNLI_error_cnt += 1
             print(f"SelfCheckNLI Warning.SelfCheckNLI_error_cnt:{self.SelfCheckNLI_error_cnt}. This instance is marked as hallucinated with 1.0.")
-            result = {'avg-selfcheckgpt': 1.0, 'max-selfcheckgpt': 1.0}
+            result = {
+                'avg-selfcheckgpt': 1.0,
+                'max-selfcheckgpt': 1.0
+            }
 
         else:
             threshold = 0.5
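For reference, a minimal standalone sketch of the SelfCheckNgram path exercised above, assuming the selfcheckgpt package and spaCy's en_core_web_sm model are installed. The passage and samples are made-up placeholders; the predict keywords and the doc_level keys mirror the calls in this diff:

import spacy
from selfcheckgpt.modeling_selfcheck import SelfCheckNgram

nlp = spacy.load("en_core_web_sm")

# Greedy (temperature 0) response plus stochastic samples for the same prompt.
passage = "John Russell Reynolds was an English physician."        # placeholder
samples = ["John Russell Reynolds was a British doctor.",          # placeholder
           "John Russell Reynolds was an English neurologist."]    # placeholder

# Split the greedy response into sentences, as the task does.
sentences = [sent.text.strip() for sent in nlp(passage).sents]

scorer = SelfCheckNgram(n=1)  # n-gram model fit on the sampled passages
scores = scorer.predict(sentences=sentences, passage=passage, sampled_passages=samples)

# The task reports these two document-level values as its metrics:
print(scores['doc_level']['avg_neg_logprob'])      # -> 'avg-selfcheckgpt'
print(scores['doc_level']['avg_max_neg_logprob'])  # -> 'max-selfcheckgpt'

Higher negative log-probabilities mean the greedy response is poorly supported by the sampled passages, which is the hallucination signal the leaderboard aggregates.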