Spaces:
Sleeping
Sleeping
XufengDuan
commited on
Commit
·
2a968dc
1
Parent(s):
7d83c67
update scripts
Browse files- main_backend.py +106 -35
- src/backend/model_operations.py +48 -14
- src/leaderboard/read_evals.py +1 -0
main_backend.py
CHANGED
@@ -27,6 +27,85 @@ snapshot_download(repo_id=envs.QUEUE_REPO, revision="main",
|
|
27 |
local_dir=envs.EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
|
28 |
# exit()
|
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
def run_auto_eval(args):
|
31 |
if not args.reproduce:
|
32 |
current_pending_status = [PENDING_STATUS]
|
@@ -42,50 +121,43 @@ def run_auto_eval(args):
|
|
42 |
local_dir_results=envs.EVAL_RESULTS_PATH_BACKEND
|
43 |
)
|
44 |
logging.info("Checked completed evals")
|
45 |
-
eval_requests = manage_requests.get_eval_requests(
|
46 |
-
|
47 |
-
|
|
|
|
|
48 |
logging.info("Got eval requests")
|
49 |
eval_requests = sort_queue.sort_models_by_priority(api=envs.API, models=eval_requests)
|
50 |
logging.info("Sorted eval requests")
|
51 |
|
52 |
print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
|
53 |
-
print(eval_requests)
|
54 |
if len(eval_requests) == 0:
|
55 |
print("No eval requests found. Exiting.")
|
56 |
return
|
57 |
|
58 |
-
|
59 |
-
eval_request = manage_requests.EvalRequest(
|
60 |
-
model=args.model,
|
61 |
-
status=PENDING_STATUS,
|
62 |
-
precision=args.precision
|
63 |
-
)
|
64 |
-
pp.pprint(eval_request)
|
65 |
-
else:
|
66 |
-
eval_request = eval_requests[0]
|
67 |
pp.pprint(eval_request)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
|
69 |
-
# manage_requests.set_eval_request(
|
70 |
-
# api=envs.API,
|
71 |
-
# eval_request=eval_request,
|
72 |
-
# new_status=RUNNING_STATUS,
|
73 |
-
# hf_repo=envs.QUEUE_REPO,
|
74 |
-
# local_dir=envs.EVAL_REQUESTS_PATH_BACKEND
|
75 |
-
# )
|
76 |
-
# logging.info("Set eval request to running, now running eval")
|
77 |
-
|
78 |
-
run_eval_suite.run_evaluation(
|
79 |
-
eval_request=eval_request,
|
80 |
-
local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
|
81 |
-
results_repo=envs.RESULTS_REPO,
|
82 |
-
batch_size=1,
|
83 |
-
device=envs.DEVICE,
|
84 |
-
no_cache=True,
|
85 |
-
need_check=not args.publish,
|
86 |
-
write_results=args.update
|
87 |
-
)
|
88 |
-
logging.info("Eval finished, now setting status to finished")
|
89 |
else:
|
90 |
eval_request = manage_requests.EvalRequest(
|
91 |
model=args.model,
|
@@ -106,7 +178,6 @@ def run_auto_eval(args):
|
|
106 |
)
|
107 |
logging.info("Reproducibility eval finished")
|
108 |
|
109 |
-
|
110 |
def main():
|
111 |
parser = argparse.ArgumentParser(description="Run auto evaluation with optional reproducibility feature")
|
112 |
|
@@ -114,7 +185,7 @@ def main():
|
|
114 |
parser.add_argument("--reproduce", type=bool, default=False, help="Reproduce the evaluation results")
|
115 |
parser.add_argument("--model", type=str, default=None, help="Your Model ID")
|
116 |
parser.add_argument("--precision", type=str, default="float16", help="Precision of your model")
|
117 |
-
parser.add_argument("--publish", type=bool, default=
|
118 |
parser.add_argument("--update", type=bool, default=False, help="whether to update google drive files")
|
119 |
|
120 |
args = parser.parse_args()
|
|
|
27 |
local_dir=envs.EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
|
28 |
# exit()
|
29 |
|
30 |
+
# def run_auto_eval(args):
|
31 |
+
# if not args.reproduce:
|
32 |
+
# current_pending_status = [PENDING_STATUS]
|
33 |
+
# print('_________________')
|
34 |
+
# manage_requests.check_completed_evals(
|
35 |
+
# api=envs.API,
|
36 |
+
# checked_status=RUNNING_STATUS,
|
37 |
+
# completed_status=FINISHED_STATUS,
|
38 |
+
# failed_status=FAILED_STATUS,
|
39 |
+
# hf_repo=envs.QUEUE_REPO,
|
40 |
+
# local_dir=envs.EVAL_REQUESTS_PATH_BACKEND,
|
41 |
+
# hf_repo_results=envs.RESULTS_REPO,
|
42 |
+
# local_dir_results=envs.EVAL_RESULTS_PATH_BACKEND
|
43 |
+
# )
|
44 |
+
# logging.info("Checked completed evals")
|
45 |
+
# eval_requests = manage_requests.get_eval_requests(job_status=current_pending_status,
|
46 |
+
# hf_repo=envs.QUEUE_REPO,
|
47 |
+
# local_dir=envs.EVAL_REQUESTS_PATH_BACKEND)
|
48 |
+
# logging.info("Got eval requests")
|
49 |
+
# eval_requests = sort_queue.sort_models_by_priority(api=envs.API, models=eval_requests)
|
50 |
+
# logging.info("Sorted eval requests")
|
51 |
+
#
|
52 |
+
# print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
|
53 |
+
# print(eval_requests)
|
54 |
+
# if len(eval_requests) == 0:
|
55 |
+
# print("No eval requests found. Exiting.")
|
56 |
+
# return
|
57 |
+
#
|
58 |
+
# if args.model is not None:
|
59 |
+
# eval_request = manage_requests.EvalRequest(
|
60 |
+
# model=args.model,
|
61 |
+
# status=PENDING_STATUS,
|
62 |
+
# precision=args.precision
|
63 |
+
# )
|
64 |
+
# pp.pprint(eval_request)
|
65 |
+
# else:
|
66 |
+
# eval_request = eval_requests[0]
|
67 |
+
# pp.pprint(eval_request)
|
68 |
+
#
|
69 |
+
# # manage_requests.set_eval_request(
|
70 |
+
# # api=envs.API,
|
71 |
+
# # eval_request=eval_request,
|
72 |
+
# # new_status=RUNNING_STATUS,
|
73 |
+
# # hf_repo=envs.QUEUE_REPO,
|
74 |
+
# # local_dir=envs.EVAL_REQUESTS_PATH_BACKEND
|
75 |
+
# # )
|
76 |
+
# # logging.info("Set eval request to running, now running eval")
|
77 |
+
#
|
78 |
+
# run_eval_suite.run_evaluation(
|
79 |
+
# eval_request=eval_request,
|
80 |
+
# local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
|
81 |
+
# results_repo=envs.RESULTS_REPO,
|
82 |
+
# batch_size=1,
|
83 |
+
# device=envs.DEVICE,
|
84 |
+
# no_cache=True,
|
85 |
+
# need_check=not args.publish,
|
86 |
+
# write_results=args.update
|
87 |
+
# )
|
88 |
+
# logging.info("Eval finished, now setting status to finished")
|
89 |
+
# else:
|
90 |
+
# eval_request = manage_requests.EvalRequest(
|
91 |
+
# model=args.model,
|
92 |
+
# status=PENDING_STATUS,
|
93 |
+
# precision=args.precision
|
94 |
+
# )
|
95 |
+
# pp.pprint(eval_request)
|
96 |
+
# logging.info("Running reproducibility eval")
|
97 |
+
#
|
98 |
+
# run_eval_suite.run_evaluation(
|
99 |
+
# eval_request=eval_request,
|
100 |
+
# local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
|
101 |
+
# results_repo=envs.RESULTS_REPO,
|
102 |
+
# batch_size=1,
|
103 |
+
# device=envs.DEVICE,
|
104 |
+
# need_check=not args.publish,
|
105 |
+
# write_results=args.update
|
106 |
+
# )
|
107 |
+
# logging.info("Reproducibility eval finished")
|
108 |
+
|
109 |
def run_auto_eval(args):
|
110 |
if not args.reproduce:
|
111 |
current_pending_status = [PENDING_STATUS]
|
|
|
121 |
local_dir_results=envs.EVAL_RESULTS_PATH_BACKEND
|
122 |
)
|
123 |
logging.info("Checked completed evals")
|
124 |
+
eval_requests = manage_requests.get_eval_requests(
|
125 |
+
job_status=current_pending_status,
|
126 |
+
hf_repo=envs.QUEUE_REPO,
|
127 |
+
local_dir=envs.EVAL_REQUESTS_PATH_BACKEND
|
128 |
+
)
|
129 |
logging.info("Got eval requests")
|
130 |
eval_requests = sort_queue.sort_models_by_priority(api=envs.API, models=eval_requests)
|
131 |
logging.info("Sorted eval requests")
|
132 |
|
133 |
print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
|
|
|
134 |
if len(eval_requests) == 0:
|
135 |
print("No eval requests found. Exiting.")
|
136 |
return
|
137 |
|
138 |
+
for eval_request in eval_requests:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
pp.pprint(eval_request)
|
140 |
+
run_eval_suite.run_evaluation(
|
141 |
+
eval_request=eval_request,
|
142 |
+
local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
|
143 |
+
results_repo=envs.RESULTS_REPO,
|
144 |
+
batch_size=1,
|
145 |
+
device=envs.DEVICE,
|
146 |
+
no_cache=True,
|
147 |
+
need_check=not args.publish,
|
148 |
+
write_results=args.update
|
149 |
+
)
|
150 |
+
logging.info(f"Eval finished for model {eval_request.model}, now setting status to finished")
|
151 |
+
|
152 |
+
# Update the status to FINISHED
|
153 |
+
manage_requests.set_eval_request(
|
154 |
+
api=envs.API,
|
155 |
+
eval_request=eval_request,
|
156 |
+
new_status=FINISHED_STATUS,
|
157 |
+
hf_repo=envs.QUEUE_REPO,
|
158 |
+
local_dir=envs.EVAL_REQUESTS_PATH_BACKEND
|
159 |
+
)
|
160 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
161 |
else:
|
162 |
eval_request = manage_requests.EvalRequest(
|
163 |
model=args.model,
|
|
|
178 |
)
|
179 |
logging.info("Reproducibility eval finished")
|
180 |
|
|
|
181 |
def main():
|
182 |
parser = argparse.ArgumentParser(description="Run auto evaluation with optional reproducibility feature")
|
183 |
|
|
|
185 |
parser.add_argument("--reproduce", type=bool, default=False, help="Reproduce the evaluation results")
|
186 |
parser.add_argument("--model", type=str, default=None, help="Your Model ID")
|
187 |
parser.add_argument("--precision", type=str, default="float16", help="Precision of your model")
|
188 |
+
parser.add_argument("--publish", type=bool, default=True, help="whether directly publish the evaluation results on HF")
|
189 |
parser.add_argument("--update", type=bool, default=False, help="whether to update google drive files")
|
190 |
|
191 |
args = parser.parse_args()
|
src/backend/model_operations.py
CHANGED
@@ -173,12 +173,12 @@ class SummaryGenerator:
|
|
173 |
# print(ID, q_ID, prompt_value)
|
174 |
system_prompt = envs.SYSTEM_PROMPT
|
175 |
_user_prompt = prompt_value
|
176 |
-
for ii in range(
|
177 |
# user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
|
178 |
while True:
|
179 |
try:
|
180 |
'''调用'''
|
181 |
-
print('
|
182 |
|
183 |
_response = self.generate_summary(system_prompt, _user_prompt)
|
184 |
# print(f"Finish index {index}")
|
@@ -204,18 +204,46 @@ class SummaryGenerator:
|
|
204 |
break
|
205 |
if i == 5:
|
206 |
print(_response)
|
207 |
-
|
208 |
-
|
209 |
-
|
|
|
|
|
|
|
210 |
try:
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
|
|
|
|
|
|
217 |
else:
|
218 |
-
_response1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
219 |
|
220 |
Experiment_ID.append(ID)
|
221 |
Questions_ID.append(q_column[j])
|
@@ -421,10 +449,16 @@ class SummaryGenerator:
|
|
421 |
# print(result)
|
422 |
from huggingface_hub import InferenceClient
|
423 |
|
424 |
-
client = InferenceClient(self.model_id,api_key=envs.TOKEN)
|
425 |
messages = [{"role": "system", "content": system_prompt},{"role": "user", "content": user_prompt}]
|
426 |
outputs = client.chat_completion(messages, max_tokens=50)
|
427 |
-
result =
|
|
|
|
|
|
|
|
|
|
|
|
|
428 |
|
429 |
return result
|
430 |
# exit()
|
|
|
173 |
# print(ID, q_ID, prompt_value)
|
174 |
system_prompt = envs.SYSTEM_PROMPT
|
175 |
_user_prompt = prompt_value
|
176 |
+
for ii in range(2):
|
177 |
# user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
|
178 |
while True:
|
179 |
try:
|
180 |
'''调用'''
|
181 |
+
print(ID,'-',ii)
|
182 |
|
183 |
_response = self.generate_summary(system_prompt, _user_prompt)
|
184 |
# print(f"Finish index {index}")
|
|
|
204 |
break
|
205 |
if i == 5:
|
206 |
print(_response)
|
207 |
+
|
208 |
+
def extract_responses(text, trigger_words=None):
|
209 |
+
if trigger_words is None:
|
210 |
+
# 如果没有提供特定的触发词列表,则使用默认值
|
211 |
+
trigger_words = ["sure", "okay", "yes"]
|
212 |
+
|
213 |
try:
|
214 |
+
sentences = text.split('\n')
|
215 |
+
|
216 |
+
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
|
217 |
+
|
218 |
+
sentences = [sentence.split(':', 1)[-1].strip() if ':' in sentence else sentence for
|
219 |
+
sentence in sentences]
|
220 |
+
if any(sentences[0].lower().startswith(word) for word in trigger_words):
|
221 |
+
_response1 = sentences[1].strip() if len(sentences) > 1 else None
|
222 |
+
_response2 = sentences[2].strip() if len(sentences) > 2 else None
|
223 |
else:
|
224 |
+
_response1 = sentences[0].strip() if len(sentences) > 0 else None
|
225 |
+
_response2 = sentences[1].strip() if len(sentences) > 1 else None
|
226 |
+
|
227 |
+
except Exception as e:
|
228 |
+
print(f"Error occurred: {e}")
|
229 |
+
_response1, _response2 = None, None
|
230 |
+
|
231 |
+
|
232 |
+
return _response1, _response2
|
233 |
+
|
234 |
+
_response1, _response2 = extract_responses(_response)
|
235 |
+
# if _response == None:
|
236 |
+
# _response1, _response2 = "", ""
|
237 |
+
# else:
|
238 |
+
# try:
|
239 |
+
# import re
|
240 |
+
# _response1,_response2 = re.split(r'\n\s*\n', _response.strip())
|
241 |
+
# except:
|
242 |
+
# _response1 = _response.split('\n\n')
|
243 |
+
# if len(_response) == 2:
|
244 |
+
# _response1, _response2 = _response[0], _response[1]
|
245 |
+
# else:
|
246 |
+
# _response1, _response2 = _response[0], ""
|
247 |
|
248 |
Experiment_ID.append(ID)
|
249 |
Questions_ID.append(q_column[j])
|
|
|
449 |
# print(result)
|
450 |
from huggingface_hub import InferenceClient
|
451 |
|
452 |
+
client = InferenceClient(self.model_id,api_key=envs.TOKEN,headers={"X-use-cache": "false"})
|
453 |
messages = [{"role": "system", "content": system_prompt},{"role": "user", "content": user_prompt}]
|
454 |
outputs = client.chat_completion(messages, max_tokens=50)
|
455 |
+
result = None
|
456 |
+
while result is None:
|
457 |
+
outputs = client.chat_completion(messages, max_tokens=50)
|
458 |
+
result = outputs['choices'][0]['message']['content']
|
459 |
+
|
460 |
+
if result is None:
|
461 |
+
time.sleep(1) # Optional: Add a small delay before retrying
|
462 |
|
463 |
return result
|
464 |
# exit()
|
src/leaderboard/read_evals.py
CHANGED
@@ -173,6 +173,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
|
|
173 |
for model_result_filepath in model_result_filepaths:
|
174 |
# Creation of result
|
175 |
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
|
|
176 |
eval_result.update_with_request_file(requests_path)
|
177 |
|
178 |
# Store results of same eval together
|
|
|
173 |
for model_result_filepath in model_result_filepaths:
|
174 |
# Creation of result
|
175 |
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
176 |
+
print("request_path:",requests_path)
|
177 |
eval_result.update_with_request_file(requests_path)
|
178 |
|
179 |
# Store results of same eval together
|