Spaces:

hallucinations-leaderboard
/

leaderboard

Running on CPU Upgrade

App Files Files Community

pminervini committed on Jan 26, 2024

Commit

3567246

2 Parent(s): c639c51 3557858

Merge branch 'main' of https://huggingface.co/spaces/pminervini/hallucinations-leaderboard into main

Browse files

Files changed (1) hide show

src/backend/tasks/selfcheckgpt/task.py +11 -11

src/backend/tasks/selfcheckgpt/task.py CHANGED Viewed

@@ -21,9 +21,9 @@ class SelfCheckGpt(Task):
     def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
         super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)
-        self.generation_kwargs = {"temperature": 0.0, "do_sample": False}
         self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
-        self.generation_kwargs_sampling = {"temperature": 1.0, "do_sample": False}
         self.selfcheckgpt_type = os.environ.get('SELFCHECKGPTTYPE', 'SelfCheckNLI')
         self.selfcheckgpt_device = os.environ.get('SELFCHECKGPTDEVICE', DEVICE)
@@ -38,7 +38,7 @@ class SelfCheckGpt(Task):
         elif self.selfcheckgpt_type == 'SelfCheckNLI':
             self.selfcheckgpt = SelfCheckNLI(device=self.selfcheckgpt_device)
         self.SelfCheckNLI_error_cnt = 0
     def has_training_docs(self):
         """Return False unconditionally.

         NOTE(review): presumably tells the evaluation harness that this task
         provides no training split -- confirm against the Task base class.
         """
         return False
@@ -102,21 +102,21 @@ class SelfCheckGpt(Task):
                 beta1=0.8, beta2=0.8)            # additional params depending on scoring_method
         elif self.selfcheckgpt_type == 'SelfCheckNLI':
             selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
             if len(selfcheckgpt_scores) == 0:
                 self.SelfCheckNLI_error_cnt += 1
-                print(f"SelfCheckNLI Warning.SelfCheckNLI_error_cnt:{self.SelfCheckNLI_error_cnt}. This instance is marked as hallucinated with 1.0.")
                 result = {
-                    'avg-selfcheckgpt': 1.0,
-                    'max-selfcheckgpt': 1.0
                 }
             else:
-                threshold = 0.5
                 # The passage is hallucinated if any single sentence's score exceeds the threshold. This is very strict.
-                selfcheckgpt_scores_max = 1.0 if max(selfcheckgpt_scores) > threshold else 0.0
                 # The passage is hallucinated if the average score over all sentences exceeds the threshold.
-                selfcheckgpt_scores_avg = 1.0 if sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) > threshold else 0.0
                 result = {'avg-selfcheckgpt': selfcheckgpt_scores_avg, 'max-selfcheckgpt': selfcheckgpt_scores_max}
             return result
@@ -139,4 +139,4 @@ class SelfCheckGpt(Task):
             A dictionary where keys are the names of submetrics and values are
             whether a higher value of the submetric is better
         """
-        return {k: False for k in ["avg-selfcheckgpt", "max-selfcheckgpt"]}

     def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
         super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)
+        self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>"], "max_length": 512} # these end tokens are hard coded because of the current limitaion of the llm-eval.
         self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
+        self.generation_kwargs_sampling = {"temperature": 0.99, "do_sample": True, "until": ["\n\n", "<unk>", "<|im_end|>", "</s>"], "max_length": 512}
         self.selfcheckgpt_type = os.environ.get('SELFCHECKGPTTYPE', 'SelfCheckNLI')
         self.selfcheckgpt_device = os.environ.get('SELFCHECKGPTDEVICE', DEVICE)
         elif self.selfcheckgpt_type == 'SelfCheckNLI':
             self.selfcheckgpt = SelfCheckNLI(device=self.selfcheckgpt_device)
         self.SelfCheckNLI_error_cnt = 0
     def has_training_docs(self):
         """Return False unconditionally.

         NOTE(review): presumably tells the evaluation harness that this task
         provides no training split -- confirm against the Task base class.
         """
         return False
                 beta1=0.8, beta2=0.8)            # additional params depending on scoring_method
         elif self.selfcheckgpt_type == 'SelfCheckNLI':
             selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
             if len(selfcheckgpt_scores) == 0:
                 self.SelfCheckNLI_error_cnt += 1
+                print(f"SelfCheckNLI Warning.SelfCheckNLI_error_cnt:{self.SelfCheckNLI_error_cnt}. This instance is marked as hallucinated with 0.0.")
                 result = {
+                    'avg-selfcheckgpt': 0.0,
+                    'max-selfcheckgpt': 0.0
                 }
             else:
+                threshold = 0.6 # https://huggingface.co/blog/dhuynh95/automatic-hallucination-detection
                 # The passage is hallucinated if any single sentence's score exceeds the threshold. This is very strict.
+                selfcheckgpt_scores_max = 0.0 if max(selfcheckgpt_scores) > threshold else 1.0
                 # The passage is hallucinated if the average score over all sentences exceeds the threshold.
+                selfcheckgpt_scores_avg = 0.0 if sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) > threshold else 1.0
                 result = {'avg-selfcheckgpt': selfcheckgpt_scores_avg, 'max-selfcheckgpt': selfcheckgpt_scores_max}
             return result
             A dictionary where keys are the names of submetrics and values are
             whether a higher value of the submetric is better
         """
+        return {k: True for k in ["avg-selfcheckgpt", "max-selfcheckgpt"]}