pminervini committed • c323865
Parent(s): 0b755b6
update
src/backend/tasks/selfcheckgpt/README.md
ADDED
@@ -0,0 +1,31 @@
# SelfCheckGPT: Zero-Resource Black-Box Hallucination Detection for Generative Large Language Models

To run the selfcheckgpt evaluation, install the following dependencies:
```
pip install spacy
pip install selfcheckgpt
python -m spacy download en_core_web_sm
```
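
As a quick sanity check that the installation works, the following minimal sketch loads the spaCy model and the default scorer used by the task code (the example sentence is only illustrative):
```
# Minimal check: the spaCy sentence splitter and the default scorer both load.
import spacy
from selfcheckgpt.modeling_selfcheck import SelfCheckNgram

nlp = spacy.load("en_core_web_sm")
scorer = SelfCheckNgram(n=1)
print([s.text for s in nlp("SelfCheckGPT checks consistency. It needs no labels.").sents])
```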

selfcheckgpt supports several evaluation methods: `SelfCheckNgram`, `SelfCheckBERTScore`, `SelfCheckMQAG`, and `SelfCheckNLI`.
The default method in llm-eval-harness is `SelfCheckNgram`. You can switch methods by setting the environment variable:
```
export SELFCHECKGPTTYPE=SelfCheckNgram
```
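
For reference, a condensed sketch of how the backend (see `task.py`) maps this variable to a scorer class; device selection is covered in the next section:
```
import os
from selfcheckgpt.modeling_selfcheck import SelfCheckMQAG, SelfCheckNLI, SelfCheckBERTScore, SelfCheckNgram

# SELFCHECKGPTTYPE picks the scorer; SelfCheckNgram is the default.
selfcheckgpt_type = os.environ.get("SELFCHECKGPTTYPE", "SelfCheckNgram")

if selfcheckgpt_type == "SelfCheckNgram":
    scorer = SelfCheckNgram(n=1)                              # unigram statistics, no extra models
elif selfcheckgpt_type == "SelfCheckBERTScore":
    scorer = SelfCheckBERTScore(rescale_with_baseline=True)
elif selfcheckgpt_type == "SelfCheckMQAG":
    scorer = SelfCheckMQAG(device="cpu")                      # device selection shown below
elif selfcheckgpt_type == "SelfCheckNLI":
    scorer = SelfCheckNLI(device="cpu")
```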
The `SelfCheckBERTScore`, `SelfCheckMQAG`, and `SelfCheckNLI` methods also run Hugging Face models. You can move selfcheckgpt to a GPU by setting the device environment variable:
```
export SELFCHECKGPTDEVICE=cuda
```
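
In the backend, this variable is read the same way and handed to the scorers that load Hugging Face models (a minimal sketch mirroring `task.py`; `SelfCheckNgram` ignores it):
```
import os
from selfcheckgpt.modeling_selfcheck import SelfCheckMQAG, SelfCheckNLI

# SELFCHECKGPTDEVICE defaults to "cpu"; set it to "cuda" to run the underlying models on GPU.
device = os.environ.get("SELFCHECKGPTDEVICE", "cpu")

mqag_scorer = SelfCheckMQAG(device=device)   # question-generation / QA models on the chosen device
nli_scorer = SelfCheckNLI(device=device)     # NLI model on the chosen device
```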

## Citation

```
@misc{manakul2023selfcheckgpt,
      title={SelfCheckGPT: Zero-Resource Black-Box Hallucination Detection for Generative Large Language Models},
      author={Potsawee Manakul and Adian Liusie and Mark J. F. Gales},
      year={2023},
      eprint={2303.08896},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
```
src/backend/tasks/selfcheckgpt/task.py
ADDED
@@ -0,0 +1,145 @@
import os
from typing import Union, List

from lm_eval.api.task import Task
from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_task
from lm_eval.api.metrics import mean

import spacy
from selfcheckgpt.modeling_selfcheck import SelfCheckMQAG, SelfCheckNLI, SelfCheckBERTScore, SelfCheckNgram


@register_task("selfcheckgpt")
class SelfCheckGpt(Task):
    VERSION = 0.0
    DATASET_PATH = "potsawee/wiki_bio_gpt3_hallucination"
    DATASET_NAME = None
    OUTPUT_TYPE = 'generate_until'

    def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
        super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)
        # Greedy decoding for the main response; sampled decoding for the self-consistency responses.
        self.generation_kwargs = {"temperature": 0.0, "do_sample": False}
        self.generation_kwargs_sampling_number = 5  # number of samples drawn for self-consistency
        self.generation_kwargs_sampling = {"temperature": 1.0, "do_sample": True}

        self.selfcheckgpt_type = os.environ.get('SELFCHECKGPTTYPE', 'SelfCheckNgram')
        self.selfcheckgpt_device = os.environ.get('SELFCHECKGPTDEVICE', 'cpu')
        self.selfcheckgpt_nlp = spacy.load("en_core_web_sm")

        if self.selfcheckgpt_type == 'SelfCheckNgram':
            self.selfcheckgpt = SelfCheckNgram(n=1)
        elif self.selfcheckgpt_type == 'SelfCheckBERTScore':
            self.selfcheckgpt = SelfCheckBERTScore(rescale_with_baseline=True)
        elif self.selfcheckgpt_type == 'SelfCheckMQAG':
            self.selfcheckgpt = SelfCheckMQAG(device=self.selfcheckgpt_device)
        elif self.selfcheckgpt_type == 'SelfCheckNLI':
            self.selfcheckgpt = SelfCheckNLI(device=self.selfcheckgpt_device)

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def validation_docs(self):
        return self.dataset["evaluation"]

    def doc_to_text(self, doc):
        # Prompt the model to continue a Wikipedia-style passage from its first five words.
        doc_text = doc["wiki_bio_text"]
        doc_text = doc_text.split()
        doc_text = " ".join(doc_text[:5])
        doc_text = f"Please generate a Wikipedia passage starting with: {doc_text}\n"
        return doc_text

    def doc_to_target(self, doc):
        answer = doc['wiki_bio_text']
        return answer

    def construct_requests(
        self, doc: dict, ctx: str, **kwargs
    ) -> Union[List[Instance], Instance]:
        # One greedy request (idx 0) plus `generation_kwargs_sampling_number` sampled requests.
        arguments = (ctx, self.generation_kwargs)
        request_list = [
            Instance(
                request_type=self.OUTPUT_TYPE,
                doc=doc,
                arguments=arguments,
                idx=0,
                **kwargs
            ),
        ]
        sampling_arguments = (ctx, self.generation_kwargs_sampling)
        request_list.extend([
            Instance(
                request_type=self.OUTPUT_TYPE,
                doc=doc,
                arguments=sampling_arguments,
                idx=idx,
                **kwargs
            )
            for idx in range(1, self.generation_kwargs_sampling_number + 1)
        ])
        return request_list

    def process_results(self, doc, results):
        # results[0] is the greedy response; the remaining entries are the sampled responses.
        response_temperature_0 = results[0]
        other_responses = results[1:]
        passage = self.doc_to_target(doc)

        sentences = self.selfcheckgpt_nlp(response_temperature_0)
        sentences = [sent.text.strip() for sent in sentences.sents]
        if self.selfcheckgpt_type == 'SelfCheckNgram':
            selfcheckgpt_scores = self.selfcheckgpt.predict(
                sentences=sentences,
                passage=response_temperature_0,
                sampled_passages=other_responses,
            )
            return {'avg-selfcheckgpt': selfcheckgpt_scores['doc_level']['avg_neg_logprob'],
                    'max-selfcheckgpt': selfcheckgpt_scores['doc_level']['avg_max_neg_logprob']}

        elif self.selfcheckgpt_type == 'SelfCheckBERTScore':
            selfcheckgpt_scores = self.selfcheckgpt.predict(
                sentences=sentences,
                sampled_passages=other_responses,
            )
        elif self.selfcheckgpt_type == 'SelfCheckMQAG':
            selfcheckgpt_scores = self.selfcheckgpt.predict(
                sentences=sentences,
                passage=response_temperature_0,
                sampled_passages=other_responses,
                num_questions_per_sent=5,  # number of questions drawn per sentence
                scoring_method='bayes_with_alpha',  # options = 'counting', 'bayes', 'bayes_with_alpha'
                beta1=0.8, beta2=0.8,  # additional params depending on scoring_method
            )
        elif self.selfcheckgpt_type == 'SelfCheckNLI':
            selfcheckgpt_scores = self.selfcheckgpt.predict(
                sentences=sentences,
                sampled_passages=other_responses,
            )

        # Per-sentence scores: higher means more likely hallucinated, so report their mean and max.
        selfcheckgpt_scores_avg = sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) if len(selfcheckgpt_scores) > 0 else 0
        selfcheckgpt_scores_max = max(selfcheckgpt_scores)

        return {'avg-selfcheckgpt': selfcheckgpt_scores_avg, 'max-selfcheckgpt': selfcheckgpt_scores_max}

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        return {k: mean for k in ["avg-selfcheckgpt", "max-selfcheckgpt"]}

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {k: False for k in ["avg-selfcheckgpt", "max-selfcheckgpt"]}