pingnie committed
Commit 69021cc
1 Parent(s): e5e2b84

fix generation bugs

backend-cli.py CHANGED
@@ -94,7 +94,7 @@ def request_to_result_name(request: EvalRequest) -> str:
 
 def process_evaluation(task: Task, eval_request: EvalRequest) -> dict:
     batch_size = "auto"
-
+
     try:
         results = run_evaluation(eval_request=eval_request, task_names=[task.benchmark], num_fewshot=task.num_fewshot,
                                  batch_size=batch_size, device=DEVICE, use_cache=None, limit=LIMIT)
@@ -266,6 +266,20 @@ def process_pending_requests() -> bool:
 if __name__ == "__main__":
     wait = True
     hard_task_lst = None
+    local_debug = True
+    # debug specific task by ping
+    if local_debug:
+        debug_model_names = ['TinyLlama/TinyLlama-1.1B-Chat-v0.6']
+        # debug_task_name = 'ifeval'
+        debug_task_name = 'selfcheckgpt'
+        task_lst = TASKS_HARNESS.copy()
+        for task in task_lst:
+            for debug_model_name in debug_model_names:
+                task_name = task.benchmark
+                if task_name != debug_task_name:
+                    continue
+                eval_request = EvalRequest(model=debug_model_name, private=False, status='', json_filepath='', precision='float16')
+                results = process_evaluation(task, eval_request)
 
     if socket.gethostname() in {'hamburg', 'neuromancer'} or os.path.isdir("/home/pminervi"):
         wait = False
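With local_debug hard-coded to True, every run of backend-cli.py now executes this debug evaluation (TinyLlama-1.1B-Chat on selfcheckgpt) before the normal queue handling below. Condensed to its effect, the added block is equivalent to the following one-off call (all names come from this repo; the rewrite is illustrative only):

# Illustrative condensation of the debug block above; EvalRequest,
# TASKS_HARNESS and process_evaluation are defined in this repository.
eval_request = EvalRequest(model='TinyLlama/TinyLlama-1.1B-Chat-v0.6',
                           private=False, status='', json_filepath='',
                           precision='float16')
for task in TASKS_HARNESS:
    if task.benchmark == 'selfcheckgpt':
        results = process_evaluation(task, eval_request)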
src/backend/manage_requests.py CHANGED
@@ -24,7 +24,6 @@ class EvalRequest:
     likes: Optional[int] = 0
     params: Optional[int] = None
     license: Optional[str] = ""
-
     def get_model_args(self) -> str:
         model_args = f"pretrained={self.model},revision={self.revision},parallelize=True" # ,max_length=4096"
 
@@ -41,7 +40,6 @@ class EvalRequest:
             pass
         else:
             raise Exception(f"Unknown precision {self.precision}.")
-
         return model_args
 
 
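For context on how these requests reach lm-eval, get_model_args() flattens a request into the harness's comma-separated model_args string. A hypothetical call, using the same kwargs as the debug block in backend-cli.py (the dtype suffix comes from the precision branch elided from the hunk above, so the exact output is an assumption):

# Hypothetical usage; the revision default and the dtype suffix are
# assumptions based on the f-string and precision check shown above.
req = EvalRequest(model='TinyLlama/TinyLlama-1.1B-Chat-v0.6', private=False,
                  status='', json_filepath='', precision='float16')
print(req.get_model_args())
# e.g. pretrained=TinyLlama/TinyLlama-1.1B-Chat-v0.6,revision=main,parallelize=True,dtype=float16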
src/backend/run_eval_suite.py CHANGED
@@ -11,6 +11,7 @@ from src.backend.tasks.cnndm.task_v2 import CNNDMv2
 
 from src.backend.tasks.selfcheckgpt.task import SelfCheckGPT
 
+from src.backend.huggingface_generate_until import HFLMwithChatTemplate
 
 def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, use_cache=None, limit=None, max_nb_samples=100) -> dict:
     if limit:
@@ -32,7 +33,7 @@ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_siz
     print(f"Selected Tasks: {task_names}")
     print(f"Eval Request: {eval_request.get_model_args()}")
 
-    results = evaluator.simple_evaluate(model="hf-auto", # "hf-causal-experimental", # "hf-causal"
+    results = evaluator.simple_evaluate(model="hf-chat", # "hf-causal-experimental", # "hf-causal"
                                         model_args=eval_request.get_model_args(),
                                         tasks=task_names,
                                         num_fewshot=num_fewshot,
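Switching model="hf-auto" to model="hf-chat" routes generation through the newly imported HFLMwithChatTemplate, so prompts for generate_until tasks are wrapped in the tokenizer's chat template. The commit does not include src/backend/huggingface_generate_until.py, so the following is only a sketch of how such a wrapper is typically registered with lm-eval-harness (the class body and signatures are assumptions, not the repo's actual code):

# Hypothetical sketch of HFLMwithChatTemplate; assumes lm-eval-harness
# 0.4-style APIs and a transformers tokenizer that has a chat template.
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM


@register_model("hf-chat")
class HFLMwithChatTemplate(HFLM):
    def tok_batch_encode(self, strings, padding_side="left", left_truncate_len=None, truncation=False):
        # Wrap each raw prompt as a single user turn so chat-tuned models
        # (e.g. TinyLlama-1.1B-Chat) see the format they were trained on.
        updated = [
            self.tokenizer.apply_chat_template(
                [{"role": "user", "content": s}],
                tokenize=False,
                add_generation_prompt=True,
            )
            for s in strings
        ]
        return super().tok_batch_encode(updated, padding_side, left_truncate_len, truncation)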
src/backend/tasks/selfcheckgpt/task.py CHANGED
@@ -22,7 +22,7 @@ class SelfCheckGPT(ConfigurableTask):
     def __init__(self):
         super().__init__(config={'metadata': {'version': self.VERSION}})
         # these end tokens are hard coded because of the current limitaion of the llm-eval.
-        self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>"], "max_length": 512}
+        self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
         self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
         self.generation_kwargs_sampling = {"temperature": 0.99, "do_sample": True, "until": ["\n\n", "<unk>", "<|im_end|>", "</s>"], "max_length": 512}
 
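This change is likely part of the "fix generation bugs" the commit message names: for models whose tokenizer emits <|endoftext|> as its end-of-sequence marker, generations previously ran past the intended answer because the stop list lacked that token. Note that generation_kwargs_sampling on the next line still uses the shorter stop list. A simplified illustration of how "until" stop strings truncate a generation (not the harness's exact implementation):

# Simplified stand-in for the harness's stop-string handling: cut the
# generated text at the earliest occurrence of any stop string.
def truncate_at_stops(text: str, stops: list) -> str:
    for stop in stops:
        idx = text.find(stop)
        if idx != -1:
            text = text[:idx]
    return text

stops = ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"]
print(truncate_at_stops("A factual sentence.<|endoftext|>spurious tail", stops))
# -> "A factual sentence."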
src/display/utils.py CHANGED
@@ -45,7 +45,7 @@ class Tasks(Enum):
     halueval_dial = Task("halueval_dialogue", "acc", "HaluDial/Acc")
 
     # XXX include me back at some point
-    # selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
+    selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
 
 
 # These classes are for user facing column names,
@@ -157,7 +157,7 @@ class Precision(Enum):
         if precision in ["GPTQ", "None"]:
             return Precision.qt_GPTQ
         return Precision.Unknown
-
+
 
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
src/leaderboard/read_evals.py CHANGED
@@ -24,7 +24,7 @@ class EvalResult:
     # Also see src.display.utils.AutoEvalColumn for what will be displayed.
     eval_name: str # org_model_precision (uid)
     full_model: str # org/model (path on hub)
-    org: str
+    org: str
     model: str
     revision: str # commit hash, "" if main
     results: dict

(Whitespace-only change: the removed and added org: str lines are otherwise identical.)