Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
pminervini
committed on
Merge branch 'main' of https://huggingface.co/spaces/pminervini/hallucinations-leaderboard into main
Browse files
src/backend/tasks/selfcheckgpt/task.py
CHANGED
@@ -21,9 +21,9 @@ class SelfCheckGpt(Task):
|
|
21 |
|
22 |
def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
|
23 |
super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)
|
24 |
-
self.generation_kwargs = {"
|
25 |
self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
|
26 |
-
self.generation_kwargs_sampling = {"temperature":
|
27 |
|
28 |
self.selfcheckgpt_type = os.environ.get('SELFCHECKGPTTYPE', 'SelfCheckNLI')
|
29 |
self.selfcheckgpt_device = os.environ.get('SELFCHECKGPTDEVICE', DEVICE)
|
@@ -38,7 +38,7 @@ class SelfCheckGpt(Task):
|
|
38 |
elif self.selfcheckgpt_type == 'SelfCheckNLI':
|
39 |
self.selfcheckgpt = SelfCheckNLI(device=self.selfcheckgpt_device)
|
40 |
self.SelfCheckNLI_error_cnt = 0
|
41 |
-
|
42 |
def has_training_docs(self):
|
43 |
return False
|
44 |
|
@@ -102,21 +102,21 @@ class SelfCheckGpt(Task):
|
|
102 |
beta1=0.8, beta2=0.8) # additional params depending on scoring_method
|
103 |
elif self.selfcheckgpt_type == 'SelfCheckNLI':
|
104 |
selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
|
105 |
-
|
106 |
if len(selfcheckgpt_scores) == 0:
|
107 |
self.SelfCheckNLI_error_cnt += 1
|
108 |
-
print(f"SelfCheckNLI Warning.SelfCheckNLI_error_cnt:{self.SelfCheckNLI_error_cnt}. This instance is marked as hallucinated with
|
109 |
result = {
|
110 |
-
'avg-selfcheckgpt':
|
111 |
-
'max-selfcheckgpt':
|
112 |
}
|
113 |
|
114 |
else:
|
115 |
-
threshold = 0.
|
116 |
# passage is hallucinated if one sentence is hallucinated. It's very strict.
|
117 |
-
selfcheckgpt_scores_max =
|
118 |
# passage is hallucinated if the average score of all sentences is hallucinated.
|
119 |
-
selfcheckgpt_scores_avg =
|
120 |
result = {'avg-selfcheckgpt': selfcheckgpt_scores_avg, 'max-selfcheckgpt': selfcheckgpt_scores_max}
|
121 |
return result
|
122 |
|
@@ -139,4 +139,4 @@ class SelfCheckGpt(Task):
|
|
139 |
A dictionary where keys are the names of submetrics and values are
|
140 |
whether a higher value of the submetric is better
|
141 |
"""
|
142 |
-
return {k:
|
|
|
21 |
|
22 |
def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
|
23 |
super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)
|
24 |
+
self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>"], "max_length": 512} # these end tokens are hard coded because of the current limitaion of the llm-eval.
|
25 |
self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
|
26 |
+
self.generation_kwargs_sampling = {"temperature": 0.99, "do_sample": True, "until": ["\n\n", "<unk>", "<|im_end|>", "</s>"], "max_length": 512}
|
27 |
|
28 |
self.selfcheckgpt_type = os.environ.get('SELFCHECKGPTTYPE', 'SelfCheckNLI')
|
29 |
self.selfcheckgpt_device = os.environ.get('SELFCHECKGPTDEVICE', DEVICE)
|
|
|
38 |
elif self.selfcheckgpt_type == 'SelfCheckNLI':
|
39 |
self.selfcheckgpt = SelfCheckNLI(device=self.selfcheckgpt_device)
|
40 |
self.SelfCheckNLI_error_cnt = 0
|
41 |
+
|
42 |
def has_training_docs(self):
|
43 |
return False
|
44 |
|
|
|
102 |
beta1=0.8, beta2=0.8) # additional params depending on scoring_method
|
103 |
elif self.selfcheckgpt_type == 'SelfCheckNLI':
|
104 |
selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
|
105 |
+
|
106 |
if len(selfcheckgpt_scores) == 0:
|
107 |
self.SelfCheckNLI_error_cnt += 1
|
108 |
+
print(f"SelfCheckNLI Warning.SelfCheckNLI_error_cnt:{self.SelfCheckNLI_error_cnt}. This instance is marked as hallucinated with 0.0.")
|
109 |
result = {
|
110 |
+
'avg-selfcheckgpt': 0.0,
|
111 |
+
'max-selfcheckgpt': 0.0
|
112 |
}
|
113 |
|
114 |
else:
|
115 |
+
threshold = 0.6 # https://huggingface.co/blog/dhuynh95/automatic-hallucination-detection
|
116 |
# passage is hallucinated if one sentence is hallucinated. It's very strict.
|
117 |
+
selfcheckgpt_scores_max = 0.0 if max(selfcheckgpt_scores) > threshold else 1.0
|
118 |
# passage is hallucinated if the average score of all sentences is hallucinated.
|
119 |
+
selfcheckgpt_scores_avg = 0.0 if sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) > threshold else 1.0
|
120 |
result = {'avg-selfcheckgpt': selfcheckgpt_scores_avg, 'max-selfcheckgpt': selfcheckgpt_scores_max}
|
121 |
return result
|
122 |
|
|
|
139 |
A dictionary where keys are the names of submetrics and values are
|
140 |
whether a higher value of the submetric is better
|
141 |
"""
|
142 |
+
return {k: True for k in ["avg-selfcheckgpt", "max-selfcheckgpt"]}
|