fix generation bugs
Changed files:
- backend-cli.py (+15 -1)
- src/backend/manage_requests.py (+0 -2)
- src/backend/run_eval_suite.py (+2 -1)
- src/backend/tasks/selfcheckgpt/task.py (+1 -1)
- src/display/utils.py (+2 -2)
- src/leaderboard/read_evals.py (+1 -1)
backend-cli.py
CHANGED
@@ -94,7 +94,7 @@ def request_to_result_name(request: EvalRequest) -> str:
 
 def process_evaluation(task: Task, eval_request: EvalRequest) -> dict:
     batch_size = "auto"
-
+
     try:
         results = run_evaluation(eval_request=eval_request, task_names=[task.benchmark], num_fewshot=task.num_fewshot,
                                  batch_size=batch_size, device=DEVICE, use_cache=None, limit=LIMIT)
@@ -266,6 +266,20 @@ def process_pending_requests() -> bool:
 if __name__ == "__main__":
     wait = True
     hard_task_lst = None
+    local_debug = True
+    # debug specific task by ping
+    if local_debug:
+        debug_model_names = ['TinyLlama/TinyLlama-1.1B-Chat-v0.6']
+        # debug_task_name = 'ifeval'
+        debug_task_name = 'selfcheckgpt'
+        task_lst = TASKS_HARNESS.copy()
+        for task in task_lst:
+            for debug_model_name in debug_model_names:
+                task_name = task.benchmark
+                if task_name != debug_task_name:
+                    continue
+                eval_request = EvalRequest(model=debug_model_name, private=False, status='', json_filepath='', precision='float16')
+                results = process_evaluation(task, eval_request)
 
     if socket.gethostname() in {'hamburg', 'neuromancer'} or os.path.isdir("/home/pminervi"):
         wait = False
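The new block under `if __name__ == "__main__":` short-circuits the normal request queue: when `local_debug` is set, it builds an `EvalRequest` for each debug model and runs `process_evaluation` directly on the one task whose benchmark name matches `debug_task_name`. As a rough sketch only (same names as in the block above, assumed to be in scope), the selection it performs reduces to:

# Illustrative only: a more direct form of the selection the debug loop performs.
# Assumes TASKS_HARNESS, EvalRequest and process_evaluation as used in backend-cli.py.
debug_task = next(t for t in TASKS_HARNESS if t.benchmark == debug_task_name)
for debug_model_name in debug_model_names:
    eval_request = EvalRequest(model=debug_model_name, private=False, status='',
                               json_filepath='', precision='float16')
    results = process_evaluation(debug_task, eval_request)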
src/backend/manage_requests.py
CHANGED
@@ -24,7 +24,6 @@ class EvalRequest:
     likes: Optional[int] = 0
     params: Optional[int] = None
     license: Optional[str] = ""
-
     def get_model_args(self) -> str:
         model_args = f"pretrained={self.model},revision={self.revision},parallelize=True" # ,max_length=4096"
 
@@ -41,7 +40,6 @@ class EvalRequest:
             pass
         else:
             raise Exception(f"Unknown precision {self.precision}.")
-
         return model_args
 
 
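For context, `get_model_args()` is what the debug path in backend-cli.py ultimately hands to the evaluator: it serialises the request into a comma-separated `model_args` string starting with `pretrained=...,revision=...,parallelize=True`, with the precision branches appending further options and unknown precisions raising an exception. A hypothetical sketch of the resulting string for the debug request; the `dtype` key and `revision=main` default are assumptions about branches not shown in this diff:

# Hypothetical example; the dtype entry and the default revision are assumed,
# only the pretrained/revision/parallelize prefix is confirmed by the diff above.
req = EvalRequest(model='TinyLlama/TinyLlama-1.1B-Chat-v0.6', private=False,
                  status='', json_filepath='', precision='float16')
print(req.get_model_args())
# e.g. "pretrained=TinyLlama/TinyLlama-1.1B-Chat-v0.6,revision=main,parallelize=True,dtype=float16"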
src/backend/run_eval_suite.py
CHANGED
@@ -11,6 +11,7 @@ from src.backend.tasks.cnndm.task_v2 import CNNDMv2
 
 from src.backend.tasks.selfcheckgpt.task import SelfCheckGPT
 
+from src.backend.huggingface_generate_until import HFLMwithChatTemplate
 
 def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, use_cache=None, limit=None, max_nb_samples=100) -> dict:
     if limit:
@@ -32,7 +33,7 @@ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_siz
     print(f"Selected Tasks: {task_names}")
     print(f"Eval Request: {eval_request.get_model_args()}")
 
-    results = evaluator.simple_evaluate(model="hf-
+    results = evaluator.simple_evaluate(model="hf-chat", # "hf-causal-experimental", # "hf-causal"
                                         model_args=eval_request.get_model_args(),
                                         tasks=task_names,
                                         num_fewshot=num_fewshot,
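Switching the evaluator to `model="hf-chat"` routes generation through the newly imported `HFLMwithChatTemplate`, so chat-tuned checkpoints such as TinyLlama-Chat see their prompts wrapped in the tokenizer's chat template instead of being fed raw text. The wrapper itself lives in `src/backend/huggingface_generate_until`; as a hedged sketch of the core idea using only the transformers API (not the repo's actual implementation), assuming the checkpoint ships a chat template:

from transformers import AutoTokenizer

# Sketch of the chat-template idea behind "hf-chat"; the real HFLMwithChatTemplate
# wrapper may differ in how it builds and tokenizes the prompt.
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v0.6")
prompt = "Summarise the passage in one sentence."
chat_prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt}],
    tokenize=False,
    add_generation_prompt=True,  # append the assistant-turn marker before generating
)
print(chat_prompt)  # the raw prompt wrapped in the model's chat markup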
src/backend/tasks/selfcheckgpt/task.py
CHANGED
@@ -22,7 +22,7 @@ class SelfCheckGPT(ConfigurableTask):
     def __init__(self):
         super().__init__(config={'metadata': {'version': self.VERSION}})
         # these end tokens are hard coded because of the current limitaion of the llm-eval.
-        self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>"], "max_length": 512}
+        self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
         self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
         self.generation_kwargs_sampling = {"temperature": 0.99, "do_sample": True, "until": ["\n\n", "<unk>", "<|im_end|>", "</s>"], "max_length": 512}
 
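Adding "<|endoftext|>" to the stop list matters because model families spell their end-of-sequence token differently: Llama-style tokenizers use </s>, ChatML-style models use <|im_end|>, and GPT-2/GPT-NeoX-style models emit <|endoftext|>. Without it, such models can generate past their natural stopping point and pad the SelfCheckGPT samples with noise. A quick illustrative check (model names are examples, not an exhaustive set):

from transformers import AutoTokenizer

# Print the EOS string each tokenizer reports; these are the values the
# "until" list is trying to cover.
for name in ["TinyLlama/TinyLlama-1.1B-Chat-v0.6", "gpt2"]:
    tok = AutoTokenizer.from_pretrained(name)
    print(name, repr(tok.eos_token))
# Expected (roughly): '</s>' for the Llama-style tokenizer, '<|endoftext|>' for gpt2.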
src/display/utils.py
CHANGED
@@ -45,7 +45,7 @@ class Tasks(Enum):
     halueval_dial = Task("halueval_dialogue", "acc", "HaluDial/Acc")
 
     # XXX include me back at some point
-
+    selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
 
 
 # These classes are for user facing column names,
@@ -157,7 +157,7 @@ class Precision(Enum):
         if precision in ["GPTQ", "None"]:
             return Precision.qt_GPTQ
         return Precision.Unknown
-
+
 
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
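The re-enabled `selfcheck` entry follows the same three-field `Task(...)` pattern as the existing rows: the internal benchmark name ("selfcheckgpt"), the metric key to read from the results files ("max-selfcheckgpt"), and the column label shown on the leaderboard ("SelfCheckGPT"). A minimal sketch of that assumed container; the field names are a guess based on how the enum values are constructed, not the repo's exact definition:

from dataclasses import dataclass

# Assumed shape of the Task container used by the Tasks enum.
@dataclass(frozen=True)
class Task:
    benchmark: str   # harness task name, e.g. "selfcheckgpt"
    metric: str      # metric key read from the results JSON, e.g. "max-selfcheckgpt"
    col_name: str    # human-facing column header, e.g. "SelfCheckGPT"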
src/leaderboard/read_evals.py
CHANGED
@@ -24,7 +24,7 @@ class EvalResult:
     # Also see src.display.utils.AutoEvalColumn for what will be displayed.
     eval_name: str # org_model_precision (uid)
     full_model: str # org/model (path on hub)
-    org: str
+    org: str
     model: str
     revision: str # commit hash, "" if main
     results: dict