XufengDuan committed on
Commit
2a968dc
·
1 Parent(s): 7d83c67

update scripts

Browse files
main_backend.py CHANGED
@@ -27,6 +27,85 @@ snapshot_download(repo_id=envs.QUEUE_REPO, revision="main",
27
  local_dir=envs.EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
28
  # exit()
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  def run_auto_eval(args):
31
  if not args.reproduce:
32
  current_pending_status = [PENDING_STATUS]
@@ -42,50 +121,43 @@ def run_auto_eval(args):
42
  local_dir_results=envs.EVAL_RESULTS_PATH_BACKEND
43
  )
44
  logging.info("Checked completed evals")
45
- eval_requests = manage_requests.get_eval_requests(job_status=current_pending_status,
46
- hf_repo=envs.QUEUE_REPO,
47
- local_dir=envs.EVAL_REQUESTS_PATH_BACKEND)
 
 
48
  logging.info("Got eval requests")
49
  eval_requests = sort_queue.sort_models_by_priority(api=envs.API, models=eval_requests)
50
  logging.info("Sorted eval requests")
51
 
52
  print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
53
- print(eval_requests)
54
  if len(eval_requests) == 0:
55
  print("No eval requests found. Exiting.")
56
  return
57
 
58
- if args.model is not None:
59
- eval_request = manage_requests.EvalRequest(
60
- model=args.model,
61
- status=PENDING_STATUS,
62
- precision=args.precision
63
- )
64
- pp.pprint(eval_request)
65
- else:
66
- eval_request = eval_requests[0]
67
  pp.pprint(eval_request)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
- # manage_requests.set_eval_request(
70
- # api=envs.API,
71
- # eval_request=eval_request,
72
- # new_status=RUNNING_STATUS,
73
- # hf_repo=envs.QUEUE_REPO,
74
- # local_dir=envs.EVAL_REQUESTS_PATH_BACKEND
75
- # )
76
- # logging.info("Set eval request to running, now running eval")
77
-
78
- run_eval_suite.run_evaluation(
79
- eval_request=eval_request,
80
- local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
81
- results_repo=envs.RESULTS_REPO,
82
- batch_size=1,
83
- device=envs.DEVICE,
84
- no_cache=True,
85
- need_check=not args.publish,
86
- write_results=args.update
87
- )
88
- logging.info("Eval finished, now setting status to finished")
89
  else:
90
  eval_request = manage_requests.EvalRequest(
91
  model=args.model,
@@ -106,7 +178,6 @@ def run_auto_eval(args):
106
  )
107
  logging.info("Reproducibility eval finished")
108
 
109
-
110
  def main():
111
  parser = argparse.ArgumentParser(description="Run auto evaluation with optional reproducibility feature")
112
 
@@ -114,7 +185,7 @@ def main():
114
  parser.add_argument("--reproduce", type=bool, default=False, help="Reproduce the evaluation results")
115
  parser.add_argument("--model", type=str, default=None, help="Your Model ID")
116
  parser.add_argument("--precision", type=str, default="float16", help="Precision of your model")
117
- parser.add_argument("--publish", type=bool, default=False, help="whether directly publish the evaluation results on HF")
118
  parser.add_argument("--update", type=bool, default=False, help="whether to update google drive files")
119
 
120
  args = parser.parse_args()
 
27
  local_dir=envs.EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
28
  # exit()
29
 
30
+ # def run_auto_eval(args):
31
+ # if not args.reproduce:
32
+ # current_pending_status = [PENDING_STATUS]
33
+ # print('_________________')
34
+ # manage_requests.check_completed_evals(
35
+ # api=envs.API,
36
+ # checked_status=RUNNING_STATUS,
37
+ # completed_status=FINISHED_STATUS,
38
+ # failed_status=FAILED_STATUS,
39
+ # hf_repo=envs.QUEUE_REPO,
40
+ # local_dir=envs.EVAL_REQUESTS_PATH_BACKEND,
41
+ # hf_repo_results=envs.RESULTS_REPO,
42
+ # local_dir_results=envs.EVAL_RESULTS_PATH_BACKEND
43
+ # )
44
+ # logging.info("Checked completed evals")
45
+ # eval_requests = manage_requests.get_eval_requests(job_status=current_pending_status,
46
+ # hf_repo=envs.QUEUE_REPO,
47
+ # local_dir=envs.EVAL_REQUESTS_PATH_BACKEND)
48
+ # logging.info("Got eval requests")
49
+ # eval_requests = sort_queue.sort_models_by_priority(api=envs.API, models=eval_requests)
50
+ # logging.info("Sorted eval requests")
51
+ #
52
+ # print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
53
+ # print(eval_requests)
54
+ # if len(eval_requests) == 0:
55
+ # print("No eval requests found. Exiting.")
56
+ # return
57
+ #
58
+ # if args.model is not None:
59
+ # eval_request = manage_requests.EvalRequest(
60
+ # model=args.model,
61
+ # status=PENDING_STATUS,
62
+ # precision=args.precision
63
+ # )
64
+ # pp.pprint(eval_request)
65
+ # else:
66
+ # eval_request = eval_requests[0]
67
+ # pp.pprint(eval_request)
68
+ #
69
+ # # manage_requests.set_eval_request(
70
+ # # api=envs.API,
71
+ # # eval_request=eval_request,
72
+ # # new_status=RUNNING_STATUS,
73
+ # # hf_repo=envs.QUEUE_REPO,
74
+ # # local_dir=envs.EVAL_REQUESTS_PATH_BACKEND
75
+ # # )
76
+ # # logging.info("Set eval request to running, now running eval")
77
+ #
78
+ # run_eval_suite.run_evaluation(
79
+ # eval_request=eval_request,
80
+ # local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
81
+ # results_repo=envs.RESULTS_REPO,
82
+ # batch_size=1,
83
+ # device=envs.DEVICE,
84
+ # no_cache=True,
85
+ # need_check=not args.publish,
86
+ # write_results=args.update
87
+ # )
88
+ # logging.info("Eval finished, now setting status to finished")
89
+ # else:
90
+ # eval_request = manage_requests.EvalRequest(
91
+ # model=args.model,
92
+ # status=PENDING_STATUS,
93
+ # precision=args.precision
94
+ # )
95
+ # pp.pprint(eval_request)
96
+ # logging.info("Running reproducibility eval")
97
+ #
98
+ # run_eval_suite.run_evaluation(
99
+ # eval_request=eval_request,
100
+ # local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
101
+ # results_repo=envs.RESULTS_REPO,
102
+ # batch_size=1,
103
+ # device=envs.DEVICE,
104
+ # need_check=not args.publish,
105
+ # write_results=args.update
106
+ # )
107
+ # logging.info("Reproducibility eval finished")
108
+
109
  def run_auto_eval(args):
110
  if not args.reproduce:
111
  current_pending_status = [PENDING_STATUS]
 
121
  local_dir_results=envs.EVAL_RESULTS_PATH_BACKEND
122
  )
123
  logging.info("Checked completed evals")
124
+ eval_requests = manage_requests.get_eval_requests(
125
+ job_status=current_pending_status,
126
+ hf_repo=envs.QUEUE_REPO,
127
+ local_dir=envs.EVAL_REQUESTS_PATH_BACKEND
128
+ )
129
  logging.info("Got eval requests")
130
  eval_requests = sort_queue.sort_models_by_priority(api=envs.API, models=eval_requests)
131
  logging.info("Sorted eval requests")
132
 
133
  print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
 
134
  if len(eval_requests) == 0:
135
  print("No eval requests found. Exiting.")
136
  return
137
 
138
+ for eval_request in eval_requests:
 
 
 
 
 
 
 
 
139
  pp.pprint(eval_request)
140
+ run_eval_suite.run_evaluation(
141
+ eval_request=eval_request,
142
+ local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
143
+ results_repo=envs.RESULTS_REPO,
144
+ batch_size=1,
145
+ device=envs.DEVICE,
146
+ no_cache=True,
147
+ need_check=not args.publish,
148
+ write_results=args.update
149
+ )
150
+ logging.info(f"Eval finished for model {eval_request.model}, now setting status to finished")
151
+
152
+ # Update the status to FINISHED
153
+ manage_requests.set_eval_request(
154
+ api=envs.API,
155
+ eval_request=eval_request,
156
+ new_status=FINISHED_STATUS,
157
+ hf_repo=envs.QUEUE_REPO,
158
+ local_dir=envs.EVAL_REQUESTS_PATH_BACKEND
159
+ )
160
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  else:
162
  eval_request = manage_requests.EvalRequest(
163
  model=args.model,
 
178
  )
179
  logging.info("Reproducibility eval finished")
180
 
 
181
  def main():
182
  parser = argparse.ArgumentParser(description="Run auto evaluation with optional reproducibility feature")
183
 
 
185
  parser.add_argument("--reproduce", type=bool, default=False, help="Reproduce the evaluation results")
186
  parser.add_argument("--model", type=str, default=None, help="Your Model ID")
187
  parser.add_argument("--precision", type=str, default="float16", help="Precision of your model")
188
+ parser.add_argument("--publish", type=bool, default=True, help="whether directly publish the evaluation results on HF")
189
  parser.add_argument("--update", type=bool, default=False, help="whether to update google drive files")
190
 
191
  args = parser.parse_args()
src/backend/model_operations.py CHANGED
@@ -173,12 +173,12 @@ class SummaryGenerator:
173
  # print(ID, q_ID, prompt_value)
174
  system_prompt = envs.SYSTEM_PROMPT
175
  _user_prompt = prompt_value
176
- for ii in range(1):
177
  # user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
178
  while True:
179
  try:
180
  '''调用'''
181
- print('开始调用LLM-API')
182
 
183
  _response = self.generate_summary(system_prompt, _user_prompt)
184
  # print(f"Finish index {index}")
@@ -204,18 +204,46 @@ class SummaryGenerator:
204
  break
205
  if i == 5:
206
  print(_response)
207
- if _response == None:
208
- _response1, _response2 = "", ""
209
- else:
 
 
 
210
  try:
211
- import re
212
- _response1,_response2 = re.split(r'\n\s*\n', _response.strip())
213
- except:
214
- _response1 = _response.split('\n\n')
215
- if len(_response) == 2:
216
- _response1, _response2 = _response[0], _response[1]
 
 
 
217
  else:
218
- _response1, _response2 = _response[0], ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
 
220
  Experiment_ID.append(ID)
221
  Questions_ID.append(q_column[j])
@@ -421,10 +449,16 @@ class SummaryGenerator:
421
  # print(result)
422
  from huggingface_hub import InferenceClient
423
 
424
- client = InferenceClient(self.model_id,api_key=envs.TOKEN)
425
  messages = [{"role": "system", "content": system_prompt},{"role": "user", "content": user_prompt}]
426
  outputs = client.chat_completion(messages, max_tokens=50)
427
- result = outputs['choices'][0]['message']['content']
 
 
 
 
 
 
428
 
429
  return result
430
  # exit()
 
173
  # print(ID, q_ID, prompt_value)
174
  system_prompt = envs.SYSTEM_PROMPT
175
  _user_prompt = prompt_value
176
+ for ii in range(2):
177
  # user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
178
  while True:
179
  try:
180
  '''调用'''
181
+ print(ID,'-',ii)
182
 
183
  _response = self.generate_summary(system_prompt, _user_prompt)
184
  # print(f"Finish index {index}")
 
204
  break
205
  if i == 5:
206
  print(_response)
207
+
208
def extract_responses(text, trigger_words=None):
    """Pull the first two answer lines out of a raw model reply.

    The reply is split on newlines and blank lines are dropped. For every
    remaining line that contains a colon, everything up to and including
    the first colon is removed (e.g. a leading ``"A:"`` label). If the
    first cleaned line starts with one of ``trigger_words``
    (case-insensitive on the line), it is treated as a preamble and skipped.

    Args:
        text: Raw reply text. ``None`` or any non-string value is treated
            as "no reply".
        trigger_words: Lowercase prefixes that mark a preamble line.
            Defaults to ``["sure", "okay", "yes"]`` when not provided.

    Returns:
        A ``(_response1, _response2)`` tuple. Each element is the cleaned
        line, or ``None`` when the reply does not contain that many lines.
    """
    if trigger_words is None:
        # No specific trigger-word list was provided, so use the defaults.
        trigger_words = ["sure", "okay", "yes"]

    # A missing reply is an expected case, not an error: handle it
    # explicitly instead of relying on a blanket exception handler.
    if not isinstance(text, str):
        return None, None

    sentences = [line.strip() for line in text.split('\n') if line.strip()]
    # Drop a leading "label:" prefix; lines without a colon are kept as-is.
    sentences = [
        sentence.split(':', 1)[-1].strip() if ':' in sentence else sentence
        for sentence in sentences
    ]
    if not sentences:
        return None, None

    first_line = sentences[0].lower()
    if any(first_line.startswith(word) for word in trigger_words):
        # The first line is only an acknowledgement; answers start after it.
        sentences = sentences[1:]

    _response1 = sentences[0] if len(sentences) > 0 else None
    _response2 = sentences[1] if len(sentences) > 1 else None
    return _response1, _response2
233
+
234
+ _response1, _response2 = extract_responses(_response)
235
+ # if _response == None:
236
+ # _response1, _response2 = "", ""
237
+ # else:
238
+ # try:
239
+ # import re
240
+ # _response1,_response2 = re.split(r'\n\s*\n', _response.strip())
241
+ # except:
242
+ # _response1 = _response.split('\n\n')
243
+ # if len(_response) == 2:
244
+ # _response1, _response2 = _response[0], _response[1]
245
+ # else:
246
+ # _response1, _response2 = _response[0], ""
247
 
248
  Experiment_ID.append(ID)
249
  Questions_ID.append(q_column[j])
 
449
  # print(result)
450
  from huggingface_hub import InferenceClient
451
 
452
+ client = InferenceClient(self.model_id,api_key=envs.TOKEN,headers={"X-use-cache": "false"})
453
  messages = [{"role": "system", "content": system_prompt},{"role": "user", "content": user_prompt}]
454
  outputs = client.chat_completion(messages, max_tokens=50)
455
+ result = None
456
+ while result is None:
457
+ outputs = client.chat_completion(messages, max_tokens=50)
458
+ result = outputs['choices'][0]['message']['content']
459
+
460
+ if result is None:
461
+ time.sleep(1) # Optional: Add a small delay before retrying
462
 
463
  return result
464
  # exit()
src/leaderboard/read_evals.py CHANGED
@@ -173,6 +173,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
173
  for model_result_filepath in model_result_filepaths:
174
  # Creation of result
175
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
 
176
  eval_result.update_with_request_file(requests_path)
177
 
178
  # Store results of same eval together
 
173
  for model_result_filepath in model_result_filepaths:
174
  # Creation of result
175
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
176
+ print("request_path:",requests_path)
177
  eval_result.update_with_request_file(requests_path)
178
 
179
  # Store results of same eval together