app.py CHANGED
@@ -2,6 +2,7 @@
 import os
 import datetime
 import socket
+import base64
 from threading import Thread
 
 import gradio as gr
@@ -20,6 +21,7 @@ from src.display.about import (
     LLM_BENCHMARKS_DETAILS,
     FAQ_TEXT,
     TITLE,
+    ACKNOWLEDGEMENT_TEXT,
 )
 
 from src.display.css_html_js import custom_css
@@ -89,6 +91,17 @@ def init_space():
         EVAL_REQUESTS_PATH, EVAL_COLS
     )
     return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
+
+
+def add_benchmark_columns(shown_columns):
+    benchmark_columns = []
+    for benchmark in BENCHMARK_COLS:
+        if benchmark in shown_columns:
+            for c in COLS:
+                if benchmark in c and benchmark != c:
+                    benchmark_columns.append(c)
+    return benchmark_columns
+
 
 # Searching and filtering
 def update_table(
@@ -96,7 +109,8 @@ def update_table(
 ):
     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query)
     filtered_df = filter_queries(query, filtered_df)
-    df = select_columns(filtered_df, columns)
+    benchmark_columns = add_benchmark_columns(columns)
+    df = select_columns(filtered_df, columns + benchmark_columns)
     return df
 
 
@@ -204,10 +218,21 @@ def load_query(request: gr.Request):
    return query
 
 
+def get_image_html(url, image_path):
+    with open(image_path, "rb") as image_file:
+        encoded_string = base64.b64encode(image_file.read()).decode()
+    return f'<a href="{url}" target="_blank"><img src="data:image/jpg;base64,{encoded_string}" alt="NetMind.AI Logo" style="width:100pt;"></a>'
+
+
+# Prepare the HTML content with the image
+image_html = get_image_html("https://netmind.ai/home", "./src/display/imgs/Netmind.AI_LOGO.jpg")
+
+
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+    gr.HTML(ACKNOWLEDGEMENT_TEXT.format(image_html=image_html))
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("open-moe-llm-leaderboard", elem_id="llm-benchmark-tab-table", id=0):
@@ -270,18 +295,19 @@ with demo:
             # )
 
             # breakpoint()
-
+            benchmark_columns = add_benchmark_columns(shown_columns.value)
             leaderboard_table = gr.components.Dataframe(
                 value=(
                     leaderboard_df[
                         [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
                         + shown_columns.value
+                        + benchmark_columns
                        + [AutoEvalColumn.dummy.name]
                     ]
                     if leaderboard_df.empty is False
                     else leaderboard_df
                 ),
-                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value + benchmark_columns,
                 datatype=TYPES,
                 elem_id="leaderboard-table",
                 interactive=False,
@@ -313,7 +339,7 @@ with demo:
     demo.load(load_query, inputs=[], outputs=[search_bar])
 
     for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size]:
-        selector.select(
+        selector.change(
            update_table,
            [
                hidden_leaderboard_table_for_search,
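
Note (illustrative, not part of the diff): the new add_benchmark_columns helper pulls in the per-benchmark system-metric columns (now hidden by default in src/display/utils.py) whenever their parent benchmark column is shown. A minimal, self-contained sketch, using made-up stand-ins for BENCHMARK_COLS and COLS (in the app they come from src.display.utils):

# Stand-in values for illustration only.
BENCHMARK_COLS = ["MMLU", "GSM8K"]
COLS = ["MMLU", "MMLU E2E(s)", "MMLU GPU", "GSM8K", "GSM8K E2E(s)", "GSM8K T/s", "Model"]

def add_benchmark_columns(shown_columns):
    benchmark_columns = []
    for benchmark in BENCHMARK_COLS:
        if benchmark in shown_columns:
            for c in COLS:
                if benchmark in c and benchmark != c:
                    benchmark_columns.append(c)
    return benchmark_columns

# Selecting GSM8K also selects its related metric columns.
print(add_benchmark_columns(["GSM8K"]))  # ['GSM8K E2E(s)', 'GSM8K T/s']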
backend-cli.py CHANGED
@@ -17,7 +17,7 @@ from src.backend.manage_requests import EvalRequest
 from src.leaderboard.read_evals import EvalResult
 
 from src.envs import QUEUE_REPO, RESULTS_REPO, API, DEBUG_QUEUE_REPO, DEBUG_RESULTS_REPO
-from src.utils import my_snapshot_download, analyze_gpu_stats, parse_nvidia_smi, monitor_gpus
+from src.utils import my_snapshot_download, analyze_gpu_stats, parse_nvidia_smi, monitor_gpus, get_gpu_details
 
 from src.leaderboard.read_evals import get_raw_eval_results
 
@@ -28,6 +28,8 @@ import time
 import pprint
 import logging
 
+from lm_eval.filters.extraction import RegexFilter
+
 
 # Configure the root logger
 logging.basicConfig(
@@ -42,6 +44,20 @@ eval_logger = logging.getLogger("lm-eval")
 # Explicitly set the level for 'lm-eval' logger to WARNING
 eval_logger.setLevel(logging.WARNING)
 
+def tuple_input_decorator(func):
+    def wrapper(self, resps, docs):
+        stripped_resps = [[resp_data[0] for resp_data in group] for group in resps]
+
+        filtered_resps = func(self, stripped_resps, docs)
+
+        combined_resps = []
+        for original_group, new_group in zip(resps, filtered_resps):
+            combined_group = [(new_resp,) + rest_of_data[1:] for new_resp, rest_of_data in zip(new_group, original_group)]
+            combined_resps.append(combined_group)
+
+        return combined_resps
+    return wrapper
+
 
 def my_set_eval_request(api, eval_request, set_to_status, hf_repo, local_dir):
     for i in range(10):
@@ -126,9 +142,6 @@ def request_to_result_name(request: EvalRequest) -> str:
 def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[int] = None) -> dict:
     batch_size = 1
     batch_size = eval_request.batch_size
-
-    if args.debug:
-        RESULTS_REPO = DEBUG_RESULTS_REPO
 
     init_gpu_info = analyze_gpu_stats(parse_nvidia_smi())
     # if init_gpu_info['Mem(M)'] > 500:
@@ -137,6 +150,12 @@ def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[in
     stop_event = threading.Event()
     monitor_thread = threading.Thread(target=monitor_gpus, args=(stop_event, 5, gpu_stats_list))
     monitor_thread.start()
+
+    original_apply = RegexFilter.apply
+    if task.benchmark in ["gsm8k", "gsm8k_cot", "gsm8k_cot_self_consistency", "gsm8k_custom"]:
+        RegexFilter.apply = tuple_input_decorator(RegexFilter.apply)
+    else:
+        RegexFilter.apply = original_apply
 
     try:
         results = run_evaluation(
@@ -198,6 +217,8 @@ def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[in
         repo_id=RESULTS_REPO,
         repo_type="dataset",
     )
+
+    RegexFilter.apply = original_apply
    return results
 
 
@@ -366,21 +387,7 @@ def maybe_refresh_results(thr: int, hard_task_lst: Optional[list[str]] = None) -
 
    return False
 
-
-def get_gpu_details():
-    gpus = GPUtil.getGPUs()
-    gpu = gpus[0]
-    name = gpu.name.replace(" ", "-")
-    # Convert memory from MB to GB and round to nearest whole number
-    memory_gb = round(gpu.memoryTotal / 1024)
-    memory = f"{memory_gb}GB"
-    formatted_name = f"{name}-{memory}"
-    return formatted_name
-
 def process_pending_requests() -> bool:
-    if args.debug:
-        QUEUE_REPO = DEBUG_QUEUE_REPO
-
     sanity_checks()
     print("Processing pending requests")
     current_pending_status = [PENDING_STATUS]
@@ -443,13 +450,14 @@ def get_args():
     parser = argparse.ArgumentParser(description="Run the backend")
     parser.add_argument("--debug", action="store_true", help="Run in debug mode")
     # debug parameters
-    parser.add_argument("--task", type=str, default="selfcheckgpt,mmlu", help="Task to debug")
+    parser.add_argument("--task", type=str, default="selfcheckgpt,mmlu, gsm8k", help="Task to debug")
     parser.add_argument("--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1,mistralai/Mixtral-8x7B-v0.1", help="Model to debug")
     parser.add_argument("--precision", type=str, default="float32,float16,8bit,4bit", help="Precision to debug")
     parser.add_argument("--inference-framework", type=str, default="hf-chat", help="Inference framework to debug")
     parser.add_argument("--limit", type=int, default=None, help="Limit for the number of samples")
     parser.add_argument("--gpu-type", type=str, default="NVIDIA-A100-PCIe-80GB",
                         help="GPU type. NVIDIA-A100-PCIe-80GB; NVIDIA-RTX-A5000-24GB; NVIDIA-H100-PCIe-80GB")
+    parser.add_argument("--debug_repo", action="store_true", help="Use debug repo")
    return parser.parse_args()
 
 
@@ -457,7 +465,7 @@ if __name__ == "__main__":
     args = get_args()
     local_debug = args.debug
     # debug specific task by ping
-    if local_debug:
+    if local_debug and not args.debug_repo:
         # debug_model_names = [args.model] # Use model from arguments
         # debug_task_name = [args.task] # Use task from arguments
         debug_model_names = args.model.split(",")
@@ -465,48 +473,67 @@ if __name__ == "__main__":
         precisions = args.precision.split(",")
         print(f"debug_model_names: {debug_model_names}, debug_task_name: {debug_task_name}, precisions: {precisions}")
         task_lst = TASKS_HARNESS.copy()
+        RESULTS_REPO = DEBUG_RESULTS_REPO
         for precision in precisions:
             for debug_model_name in debug_model_names:
                 for task in task_lst:
                     task_name = task.benchmark
                     if task_name not in debug_task_name:
                         continue
-                    try:
-                        eval_request = EvalRequest(
-                            model=debug_model_name,
-                            private=False,
-                            status="",
-                            json_filepath="",
-                            precision=precision,  # Use precision from arguments
-                            inference_framework=args.inference_framework,  # Use inference framework from arguments
-                            gpu_type=args.gpu_type
-                        )
-                        curr_gpu_type = get_gpu_details()
-                        if eval_request.gpu_type != curr_gpu_type:
-                            print(f"GPU type mismatch: {eval_request.gpu_type} vs {curr_gpu_type}")
-                            raise Exception("GPU type mismatch")
-                        results = process_evaluation(task, eval_request, limit=args.limit)
-                    except Exception as e:
-                        print(f"debug running error: {e}")
-    else:
+                    # try:
+                    eval_request = EvalRequest(
+                        model=debug_model_name,
+                        private=False,
+                        status="",
+                        json_filepath="",
+                        precision=precision,  # Use precision from arguments
+                        inference_framework=args.inference_framework,  # Use inference framework from arguments
+                        gpu_type=args.gpu_type
+                    )
+                    curr_gpu_type = get_gpu_details()
+                    if eval_request.gpu_type != curr_gpu_type:
+                        print(f"GPU type mismatch: {eval_request.gpu_type} vs {curr_gpu_type}")
+                        raise Exception("GPU type mismatch")
+                    results = process_evaluation(task, eval_request, limit=args.limit)
+                    # except Exception as e:
+                    #     print(f"debug running error: {e}")
+    elif local_debug and args.debug_repo:
+        QUEUE_REPO = DEBUG_QUEUE_REPO
+        RESULTS_REPO = DEBUG_RESULTS_REPO
         while True:
             res = False
-
             # if random.randint(0, 10) == 0:
             res = process_pending_requests()
             print(f"waiting for 60 seconds")
             time.sleep(60)
-
             # if res is False:
             #     if random.randint(0, 5) == 0:
             #         res = maybe_refresh_results(100)
             #     else:
             #         res = process_finished_requests(100)
-
             #     time.sleep(60)
-
             # if res is False:
             #     if random.randint(0, 5) == 0:
             #         res = maybe_refresh_results(0)
             #     else:
             #         res = process_finished_requests(0)
+    elif not local_debug and not args.debug_repo:
+        while True:
+            res = False
+            # if random.randint(0, 10) == 0:
+            res = process_pending_requests()
+            print(f"waiting for 60 seconds")
+            time.sleep(60)
+            # if res is False:
+            #     if random.randint(0, 5) == 0:
+            #         res = maybe_refresh_results(100)
+            #     else:
+            #         res = process_finished_requests(100)
+            #     time.sleep(60)
+            # if res is False:
+            #     if random.randint(0, 5) == 0:
+            #         res = maybe_refresh_results(0)
            #     else:
            #         res = process_finished_requests(0)
+    else:
+        raise Exception("Cannot use debug_repo without local debug flag")
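
Note (illustrative, not part of the diff): with this PR, each per-response entry returned by HFLMWithMeasurement.generate_until appears to be a tuple (text, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu), while RegexFilter.apply works on plain strings. tuple_input_decorator strips the measurements before the filter runs and re-attaches them afterwards. A minimal stand-alone sketch with a fake filter class (the real patch targets lm_eval.filters.extraction.RegexFilter.apply for the gsm8k tasks):

def tuple_input_decorator(func):
    def wrapper(self, resps, docs):
        stripped = [[r[0] for r in group] for group in resps]   # keep only the text
        filtered = func(self, stripped, docs)                    # run the original filter
        return [
            [(new,) + old[1:] for new, old in zip(new_group, old_group)]
            for old_group, new_group in zip(resps, filtered)
        ]                                                         # re-attach measurements
    return wrapper

class FakeFilter:
    def apply(self, resps, docs):
        return [[s.upper() for s in group] for group in resps]

FakeFilter.apply = tuple_input_decorator(FakeFilter.apply)
out = FakeFilter().apply([[("answer is 42", 1.2, 0.3, 55.0, 0.31, 0.62)]], docs=[None])
print(out)  # [[('ANSWER IS 42', 1.2, 0.3, 55.0, 0.31, 0.62)]]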
requirements.txt CHANGED
@@ -30,4 +30,5 @@ evaluate
 spacy==3.7.4
 selfcheckgpt
 immutabledict
-gputil
+gputil
+bitsandbytes
src/backend/envs.py CHANGED
@@ -57,6 +57,7 @@ class Tasks(Enum):
 
     # task20 = Task("race", "acc", "RACE", 0)
     task21 = Task("mmlu", "acc", "MMLU", 5)
+    task22 = Task("gsm8k_custom", "em", "GSM8K", 5)
 
 
 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
src/backend/hflm_with_measurement.py CHANGED
@@ -37,6 +37,9 @@ from lm_eval.models.utils import (
     stop_sequences_criteria,
 )
 from lm_eval.models.huggingface import HFLM
+from src.utils import get_gpu_number, get_gpu_details, get_peak_bw, transfer_precision2bytes, get_peak_flops
+from src.submission.check_validity import get_model_size
+from src.envs import API
 
 
 class StopWatch(TextStreamer):
@@ -67,6 +70,9 @@ class StopWatch(TextStreamer):
 class HFLMWithMeasurement(HFLM):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
+        self.pretrained = kwargs.get("pretrained", None)
+        self.revision = kwargs.get("revision", None)
+        self.precision = kwargs.get("dtype", None)
 
     def _loglikelihood_tokens(
         self,
@@ -288,13 +294,15 @@ class HFLMWithMeasurement(HFLM):
 
         return re_ord.get_original(res)
 
-    def _model_generate(self, context, max_length, stop, **generation_kwargs):
+    def _model_generate(self, context, max_tokens, stop, **generation_kwargs):
         # temperature = 0.0 if not set
         # if do_sample is false and temp==0.0:
         # remove temperature, as do_sample=False takes care of this
         # and we don't want a warning from HF
         generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
         do_sample = generation_kwargs.get("do_sample", None)
+
+        # is_gsm8k = generation_kwargs.get("is_gsm8k", False)
 
         # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
         if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
@@ -302,7 +310,21 @@ class HFLMWithMeasurement(HFLM):
 
         if do_sample is False and generation_kwargs.get("temperature") == 0.0:
             generation_kwargs.pop("temperature")
-        # build stopping criteria
+
+        # if is_gsm8k:
+        #     generation_kwargs.pop("is_gsm8k")
+
+        context_length = context.shape[1]
+        model_config = self.model.config
+
+        if not self.precision:
+            if model_config.quantization_config._load_in_4bit:
+                self.precision = "4bit"
+            elif model_config.quantization_config._load_in_8bit:
+                self.precision = "8bit"
+            else:
+                raise ValueError("Unknown precision")
+
         stopping_criteria = stop_sequences_criteria(
             self.tokenizer, stop, context.shape[1], context.shape[0]
         )
@@ -310,7 +332,7 @@ class HFLMWithMeasurement(HFLM):
         start = time()
         res = self.model.generate(
             input_ids=context,
-            max_length=max_length,
+            max_new_tokens=max_tokens,
             stopping_criteria=stopping_criteria,
             pad_token_id=self.tokenizer.pad_token_id,
             use_cache=True,
@@ -321,12 +343,68 @@ class HFLMWithMeasurement(HFLM):
 
         batch_size = context.shape[0]
         output_length = stop_watch.decoding_iterations
+
+        precision_bytes = transfer_precision2bytes(self.precision)
+
+        model_info = API.model_info(repo_id=self.pretrained, revision=self.revision)
+        model_size_param = get_model_size(model_info=model_info, precision=self.precision)
+
+        n_layers = model_config.num_hidden_layers if hasattr(model_config, "num_hidden_layers") else model_config.num_layers
+        d_model = model_config.hidden_size if hasattr(model_config, "hidden_size") else model_config.d_model
+
+        if hasattr(model_config, "num_experts_per_tok"):
+            n_experts_per_tok = model_config.num_experts_per_tok
+        elif hasattr(model_config, "num_selected_experts"):
+            n_experts_per_tok = model_config.num_selected_experts
+        else:
+            n_experts_per_tok = 1
+
+        if hasattr(model_config, "ffn_dim"):
+            d_ff = model_config.ffn_dim
+        elif hasattr(model_config, "intermediate_size"):
+            d_ff = model_config.intermediate_size
+        elif hasattr(model_config, "d_ff"):
+            d_ff = model_config.d_ff
+        else:
+            raise ValueError("Unknown ffn dim model configuration")
+
+        if hasattr(model_config, "num_local_experts"):
+            num_experts = model_config.num_local_experts
+        elif hasattr(model_config, "num_experts"):
+            num_experts = model_config.num_experts
+        else:
+            num_experts = 1
+
+        ffn_params = n_layers * d_ff * 2 * d_model
+
+        shared_params = model_size_param * 1e9 - num_experts * ffn_params
+
+        model_size = shared_params + n_experts_per_tok * ffn_params
+
+        per_token_kv_size = 2 * n_layers * d_model * precision_bytes
+
+        peak_bw_single = get_peak_bw(get_gpu_details())
+        peak_bw = peak_bw_single * get_gpu_number()
+
+        kv_size = (output_length - 1) * per_token_kv_size / 1e9
 
         end_to_end_time = (end - start) / batch_size
         prefilling_time = stop_watch.prefilling_time / batch_size
         decoding_time = stop_watch.decoding_time / batch_size
         token_per_sec = output_length / decoding_time
-        return res, end_to_end_time, prefilling_time, token_per_sec
+        ach_mem_bw = (model_size * precision_bytes / 1e9 + kv_size) * token_per_sec
+
+        flops_per_token = 2 * model_size + 2 * n_layers * context_length * d_model
+        peak_flops_single = get_peak_flops(get_gpu_details(), self.precision)
+        peak_flops = peak_flops_single * get_gpu_number()
+
+        ## TODO only support llama-type decoder only models and moe models of switch transformer and mixtrial
+        mfu = token_per_sec * flops_per_token / peak_flops
+        mbu = ach_mem_bw / peak_bw
+
+        # print(f"mfu: {mfu}, mbu: {mbu}")
+
+        return res, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu
 
     def generate_until(
         self, requests: List[Instance], disable_tqdm: bool = False
@@ -403,11 +481,19 @@ class HFLMWithMeasurement(HFLM):
                     f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
                 )
             # add EOS token to stop sequences
-            eos = self.tok_decode(self.eot_token_id)
+            eos = "<|eot_id|>"
             if not until:
                 until = [eos]
             else:
                 until.append(eos)
+
+            # is_gsm8k = kwargs.get("is_gsm8k", False)
+            # if is_gsm8k:
+            #     until = ["Question:", "Question", "</s>"]
+            #     eos_ids = [self.tokenizer.eos_token_id,
+            #                self.tokenizer.convert_tokens_to_ids("<|eot_id|>")]
+
+
             if "max_gen_toks" in kwargs.keys():
                 max_gen_toks = kwargs.pop("max_gen_toks")
             else:
@@ -427,14 +513,16 @@ class HFLMWithMeasurement(HFLM):
                 left_truncate_len=max_ctx_len,
                 truncation=self.truncation,
             )
+
+            # print("context: ", self.tok_decode(context_enc[0]))
             context_enc = context_enc.to(self.device)
             attn_masks = attn_masks.to(self.device)
 
-            if "max_length" not in kwargs:
-                kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
+            if "max_tokens" not in kwargs:
+                kwargs["max_tokens"] = max_gen_toks
 
             # perform batched generation
-            cont, end_to_end_time, prefilling_time, token_per_sec = self._model_generate(
+            cont, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu = self._model_generate(
                 context=context_enc,
                 attention_mask=attn_masks,
                 stop=until,
@@ -445,18 +533,21 @@ class HFLMWithMeasurement(HFLM):
             for cont_toks, context in zip(cont_toks_list, contexts):
                 # discard context + left-padding toks if using causal decoder-only LM
                 if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+                    # print("After Generation: ", self.tok_decode(cont_toks))
                     cont_toks = cont_toks[context_enc.shape[1] :]
-
+
                 s = self.tok_decode(cont_toks)
 
-                # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
+                # # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
+                # if not is_gsm8k:
                 for term in until:
                     if len(term) > 0:
                         # ignore '' separator,
                         # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
                         s = s.split(term)[0]
-
-                res.append((s, end_to_end_time, prefilling_time, token_per_sec))
+
+                # print(s)
+                res.append((s, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu))
 
                 self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
                 pbar.update(1)
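
Note (illustrative, not part of the diff): to make the new MFU/MBU arithmetic concrete, here is a rough worked example that mirrors the formulas in _model_generate. Every input is an assumed example value (Mixtral-8x7B-like shapes, an A100-SXM-80GB, an invented throughput), not a measurement produced by this PR; the peak numbers come from the dictionaries added to src/utils.py.

# Rough, illustrative arithmetic mirroring the MFU/MBU computation above.
n_layers, d_model, d_ff = 32, 4096, 14336     # assumed Mixtral-8x7B-like shapes
num_experts, n_experts_per_tok = 8, 2
model_size_param = 46.7                       # total parameters in billions (example)
precision_bytes = 2                           # float16 / bfloat16
context_length = 1024                         # assumed prompt length
output_length = 256                           # assumed generated tokens
token_per_sec = 30.0                          # assumed measured decoding throughput
peak_flops = 624e12                           # A100-SXM-80GB fp16 entry in PEAK_FLOPS_DICT
peak_bw = 2039                                # A100-SXM-80GB GB/s entry in MEM_BW_DICT

ffn_params = n_layers * d_ff * 2 * d_model                       # FFN params of one expert stack
shared_params = model_size_param * 1e9 - num_experts * ffn_params
active_params = shared_params + n_experts_per_tok * ffn_params   # params touched per decoded token

flops_per_token = 2 * active_params + 2 * n_layers * context_length * d_model
mfu = token_per_sec * flops_per_token / peak_flops

per_token_kv_size = 2 * n_layers * d_model * precision_bytes
kv_size = (output_length - 1) * per_token_kv_size / 1e9
ach_mem_bw = (active_params * precision_bytes / 1e9 + kv_size) * token_per_sec
mbu = ach_mem_bw / peak_bw

print(f"active params ~ {active_params / 1e9:.1f}B")     # roughly 24B of the 46.7B total
print(f"MFU ~ {mfu * 100:.2f}%  MBU ~ {mbu * 100:.1f}%")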
src/backend/run_eval_suite.py CHANGED
@@ -17,12 +17,16 @@ def process_results_decorator(func):
         end_to_end_time = sum([r[1] for r in results]) / len(results)
         prefilling_time = sum([r[2] for r in results]) / len(results)
         decoding_throughput = sum([r[3] for r in results]) / len(results)
+        mfu = sum([r[4] for r in results]) / len(results)
+        mbu = sum([r[5] for r in results]) / len(results)
         # print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
 
         result_dict = func(self, doc, processed_results, *args, **kwargs)
         result_dict["end_to_end_time"] = end_to_end_time
         result_dict["prefilling_time"] = prefilling_time
         result_dict["decoding_throughput"] = decoding_throughput
+        result_dict["mfu"] = mfu * 100
+        result_dict["mbu"] = mbu * 100
         return result_dict
     return wrapper
 ConfigurableTask.process_results = process_results_decorator(orig_process_results)
@@ -33,6 +37,8 @@ def aggregation_decorator(func):
         aggregation_list["end_to_end_time"] = mean
         aggregation_list["prefilling_time"] = mean
         aggregation_list["decoding_throughput"] = mean
+        aggregation_list["mfu"] = mean
+        aggregation_list["mbu"] = mean
         return aggregation_list
     return wrapper
 ConfigurableTask.aggregation = aggregation_decorator(orig_aggregation)
@@ -43,6 +49,8 @@ def higher_is_better_decorator(func):
         higher_is_better_dict["end_to_end_time"] = False
         higher_is_better_dict["prefilling_time"] = False
         higher_is_better_dict["decoding_throughput"] = True
+        higher_is_better_dict["mfu"] = True
+        higher_is_better_dict["mbu"] = True
         return higher_is_better_dict
     return wrapper
 ConfigurableTask.higher_is_better = higher_is_better_decorator(orig_higher_is_better)
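
Note (illustrative, not part of the diff): the mfu/mbu wiring here (and in src/backend/tasks/measurement_task_utils.py below) reuses the existing monkey-patching pattern: wrap a ConfigurableTask method, average the extra per-sample fields, and inject them into whatever the original method returns. A stripped-down sketch of the pattern with a toy class standing in for lm-eval's ConfigurableTask:

class ToyTask:
    def process_results(self, doc, results):
        return {"acc": 1.0}

orig_process_results = ToyTask.process_results

def process_results_decorator(func):
    def wrapper(self, doc, results, *args, **kwargs):
        # results is a list of per-sample tuples; indices 4 and 5 hold mfu and mbu.
        mfu = sum(r[4] for r in results) / len(results)
        mbu = sum(r[5] for r in results) / len(results)
        result_dict = func(self, doc, [r[0] for r in results], *args, **kwargs)
        result_dict["mfu"] = mfu
        result_dict["mbu"] = mbu
        return result_dict
    return wrapper

ToyTask.process_results = process_results_decorator(orig_process_results)

print(ToyTask().process_results(doc=None, results=[("yes", 1.0, 0.1, 40.0, 0.3, 0.6)]))
# {'acc': 1.0, 'mfu': 0.3, 'mbu': 0.6}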
src/backend/tasks/gsm8k/gsm8k-custom.yaml ADDED
@@ -0,0 +1,47 @@
+group:
+  - math_word_problems
+task: gsm8k_custom
+dataset_path: gsm8k
+dataset_name: main
+output_type: generate_until
+training_split: train
+fewshot_split: train
+test_split: test
+doc_to_text: "Question: {{question}}\nAnswer:"
+doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}"
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: false
+    regexes_to_ignore:
+      - ","
+      - "\\$"
+      - "(?s).*#### "
+      - "\\.$"
+generation_kwargs:
+  until:
+    - "Question:"
+    - "Question"
+    - "</s>"
+    - "<|im_end|>"
+  do_sample: false
+  temperature: 0.0
+  # is_gsm8k: true
+repeats: 1
+num_fewshot: 5
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
+      - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)"
+      - function: "take_first"
+metadata:
+  version: 3.0
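
Note (illustrative, not part of the diff): a small sketch of what the two filters in this config extract from a GSM8K-style completion. The sample completion text is made up; the regexes are the ones defined above.

import re

# Made-up completion in the "#### <answer>" format GSM8K few-shot prompts elicit.
completion = "She sells 16 - 3 - 4 = 9 eggs a day. 9 * 2 = 18. #### 18"

strict = re.search(r"#### (\-?[0-9\.\,]+)", completion)            # "strict-match" filter
flexible = re.findall(r"(-?[$0-9.,]{2,})|(-?[0-9]+)", completion)  # "flexible-extract" filter

print(strict.group(1))                        # 18
last = next(g for g in flexible[-1] if g)     # group_select: -1 keeps the last match
print(last)                                   # 18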
src/backend/tasks/measurement_task_utils.py CHANGED
@@ -12,6 +12,9 @@ def process_results_decorator(func):
         end_to_end_time = sum([r[1] for r in results]) / len(results)
         prefilling_time = sum([r[2] for r in results]) / len(results)
         decoding_throughput = sum([r[3] for r in results]) / len(results)
+        mfu = sum([r[4] for r in results]) / len(results)
+        mbu = sum([r[5] for r in results]) / len(results)
+
         # print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
 
         # Now call the original process_results with the processed results
@@ -19,6 +22,8 @@ def process_results_decorator(func):
         result_dict["end_to_end_time"] = end_to_end_time
         result_dict["prefilling_time"] = prefilling_time
         result_dict["decoding_throughput"] = decoding_throughput
+        result_dict["mfu"] = mfu
+        result_dict["mbu"] = mbu
         return result_dict
     return wrapper
 
@@ -30,6 +35,8 @@ def aggregation_decorator(func):
         aggregation_list["end_to_end_time"] = mean
         aggregation_list["prefilling_time"] = mean
         aggregation_list["decoding_throughput"] = mean
+        aggregation_list["mfu"] = mean
+        aggregation_list["mbu"] = mean
         return aggregation_list
     return wrapper
 
@@ -41,6 +48,8 @@ def higher_is_better_decorator(func):
         higher_is_better_dict["end_to_end_time"] = False
         higher_is_better_dict["prefilling_time"] = False
         higher_is_better_dict["decoding_throughput"] = True
+        higher_is_better_dict["mfu"] = True
+        higher_is_better_dict["mbu"] = True
         return higher_is_better_dict
     return wrapper
 
src/display/about.py CHANGED
@@ -3,7 +3,8 @@ from src.display.utils import ModelType
 TITLE = """<h1 align="center" id="space-title">OPEN-MOE-LLM-LEADERBOARD</h1>"""
 
 INTRODUCTION_TEXT = """
-The OPEN-MOE-LLM-LEADERBOARD is specifically designed to assess the performance and efficiency of various Mixture of Experts (MoE) Large Language Models (LLMs). This initiative, driven by the open-source community, aims to comprehensively evaluate these advanced MoE LLMs. We extend our gratitude to the Huggingface for the GPU community grant that supported the initial debugging process, and to [NetMind.AI](https://netmind.ai/home) for their generous GPU donation, which ensures the continuous operation of the Leaderboard.
+The OPEN-MOE-LLM-LEADERBOARD is specifically designed to assess the performance and efficiency of various Mixture of Experts (MoE) Large Language Models (LLMs).
+This initiative, driven by the open-source community, aims to comprehensively evaluate these advanced MoE LLMs.
 
 The OPEN-MOE-LLM-LEADERBOARD includes generation and multiple choice tasks to measure the performance and efficiency of MOE LLMs.
 
@@ -20,6 +21,15 @@ Columns and Metrics:
 - Precision: The precison of used model.
 
 """
+
+ACKNOWLEDGEMENT_TEXT = """
+<div>
+    <h4>Acknowledgements</h4>
+    {image_html}
+    <p>We express our sincere gratitude to <a href="https://netmind.ai/home">NetMind.AI</a> for their generous donation of GPUs, which plays a crucial role in ensuring the continuous operation of our Leaderboard.</p>
+</div>
+"""
+
 LLM_BENCHMARKS_TEXT = f"""
 
 """
src/display/imgs/Netmind.AI_LOGO.jpg ADDED
src/display/utils.py CHANGED
@@ -18,12 +18,16 @@ GPU_Power = 'Power(W)'
 GPU_Mem = 'Mem(G)'
 GPU_Name = "GPU"
 GPU_Util = 'Util(%)'
+MFU = 'MFU(%)'
+MBU = 'MBU(%)'
 BATCH_SIZE = 'bs'
 PRECISION = "Precision"
 system_metrics_to_name_map = {
     "end_to_end_time": f"{E2Es}",
     "prefilling_time": f"{PREs}",
     "decoding_throughput": f"{TS}",
+    "mfu": f"{MFU}",
+    "mbu": f"{MBU}"
 }
 
 gpu_metrics_to_name_map = {
@@ -75,6 +79,7 @@ class Tasks(Enum):
     # # XXX include me back at some point
     selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
     mmlu = Task("mmlu", "acc", "MMLU") #MMLU/Acc (5-shot)
+    gsm8k = Task("gsm8k_custom", "em", "GSM8K") #GSM8K/EM (8-shot)
 
 
 # These classes are for user facing column names,
@@ -104,16 +109,16 @@ auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnConten
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
     # System performance metrics
-    auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name} {E2Es}", "number", True)])
-    auto_eval_column_dict.append([f"{task.name}_batch_size", ColumnContent, ColumnContent(f"{task.value.col_name} {BATCH_SIZE}", "number", True)])
-    # auto_eval_column_dict.append([f"{task.name}_precision", ColumnContent, ColumnContent(f"{task.value.col_name} {PRECISION}", "str", True)])
-    auto_eval_column_dict.append([f"{task.name}_gpu_mem", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Mem}", "number", True)])
-    auto_eval_column_dict.append([f"{task.name}_gpu", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Name}", "str", True)])
-    auto_eval_column_dict.append([f"{task.name}_gpu_util", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Util}", "number", True)])
+    auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name} {E2Es}", "number", True, hidden=True)])
+    auto_eval_column_dict.append([f"{task.name}_batch_size", ColumnContent, ColumnContent(f"{task.value.col_name} {BATCH_SIZE}", "number", True, hidden=True)])
+    # auto_eval_column_dict.append([f"{task.name}_precision", ColumnContent, ColumnContent(f"{task.value.col_name} {PRECISION}", "str", True, hidden=True)])
+    auto_eval_column_dict.append([f"{task.name}_gpu_mem", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Mem}", "number", True, hidden=True)])
+    auto_eval_column_dict.append([f"{task.name}_gpu", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Name}", "str", True, hidden=True)])
+    auto_eval_column_dict.append([f"{task.name}_gpu_util", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Util}", "number", True, hidden=True)])
     if task.value.benchmark in MULTIPLE_CHOICEs:
         continue
-    # auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name} {PREs}", "number", False)])
-    auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name} {TS}", "number", True)])
+    # auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name} {PREs}", "number", False, hidden=True)])
+    auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name} {TS}", "number", True, hidden=True)])
 
 
 # Model information
@@ -242,8 +247,8 @@ class Precision(Enum):
 
 
 # Column selection
-COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
-TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
+COLS = [c.name for c in fields(AutoEvalColumn)]
+TYPES = [c.type for c in fields(AutoEvalColumn)]
 COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 
src/submission/check_validity.py CHANGED
@@ -74,7 +74,7 @@ def is_model_on_hub(
 
 
 def get_model_size(model_info: ModelInfo, precision: str):
-    size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
+    size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
     try:
         model_size = round(model_info.safetensors["total"] / 1e9, 3)
     except (AttributeError, TypeError):
@@ -130,7 +130,8 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
             continue
         with open(os.path.join(root, file), "r") as f:
             info = json.load(f)
-            file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}_{info['inference_framework']}_{info['gpu_type']}")
+            if not info["status"] == "FINISHED" and not info["status"] == "RUNNING":
+                file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}_{info['inference_framework']}_{info['gpu_type']}")
 
             # Select organisation
             if info["model"].count("/") == 0 or "submitted_time" not in info:
src/utils.py CHANGED
@@ -3,12 +3,54 @@ from huggingface_hub import snapshot_download
 import subprocess
 import re
 import os
+import GPUtil
 
 try:
     from src.display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
 except:
     print("local debug: from display.utils")
     from display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
+
+MEM_BW_DICT ={
+    "NVIDIA-A100-PCIe-80GB": 1935,
+    "NVIDIA-A100-SXM-80GB": 2039,
+    "NVIDIA-H100-PCIe-80GB": 2039,
+    "NVIDIA-RTX-A5000-24GB": 768
+}
+
+PEAK_FLOPS_DICT = {
+    "float32":{
+        "NVIDIA-A100-PCIe-80GB": 312e12,
+        "NVIDIA-A100-SXM-80GB": 312e12,
+        "NVIDIA-H100-PCIe-80GB": 756e12,
+        "NVIDIA-RTX-A5000-24GB": 222.2e12
+    },
+    "float16":{
+        "NVIDIA-A100-PCIe-80GB": 624e12,
+        "NVIDIA-A100-SXM-80GB": 624e12,
+        "NVIDIA-H100-PCIe-80GB": 1513e12,
+        "NVIDIA-RTX-A5000-24GB": 444.4e12
+    },
+    "bfloat16":{
+        "NVIDIA-A100-PCIe-80GB": 624e12,
+        "NVIDIA-A100-SXM-80GB": 624e12,
+        "NVIDIA-H100-PCIe-80GB": 1513e12,
+        "NVIDIA-RTX-A5000-24GB": 444.4e12
+    },
+    "8bit":{
+        "NVIDIA-A100-PCIe-80GB": 1248e12,
+        "NVIDIA-A100-SXM-80GB": 1248e12,
+        "NVIDIA-H100-PCIe-80GB": 3026e12,
+        "NVIDIA-RTX-A5000-24GB": 889e12
+    },
+    "4bit": {
+        "NVIDIA-A100-PCIe-80GB": 2496e12,
+        "NVIDIA-A100-SXM-80GB": 2496e12,
+        "NVIDIA-H100-PCIe-80GB": 6052e12,
+        "NVIDIA-RTX-A5000-24GB": 1778e12
+    }
+
+}
 
 def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers):
     for i in range(10):
@@ -52,11 +94,12 @@ def parse_nvidia_smi():
         print("Failed to query GPU indices.")
         return []
     gpu_indices = result.stdout.strip().split('\n')
-    print(f"gpu_indices: {gpu_indices}")
+    # print(f"gpu_indices: {gpu_indices}")
    gpu_stats = []
 
    gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
-    gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+?\d+GB)')
+    # gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)')
+    gpu_name_pattern = re.compile(r'NVIDIA\s+(RTX\s+)?([A-Z0-9]+)')
 
     gpu_name = ""
     for index in gpu_indices:
@@ -68,7 +111,7 @@ def parse_nvidia_smi():
             name_match = gpu_name_pattern.search(line)
             gpu_info = {}
             if name_match:
-                gpu_name = name_match.group(1).strip()
+                gpu_name = ''.join(filter(None, name_match.groups())).strip()
             if match:
                 temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
                 gpu_info.update({
@@ -80,7 +123,7 @@ def parse_nvidia_smi():
 
             if len(gpu_info) >= 4:
                 gpu_stats.append(gpu_info)
-    print(f"gpu_stats: {gpu_stats}")
+    # print(f"gpu_stats: {gpu_stats}")
    gpu_name = f"{len(gpu_stats)}x{gpu_name}"
    gpu_stats_total = {
        GPU_TEMP: 0,
@@ -131,5 +174,75 @@ def analyze_gpu_stats(stats_list):
 
    return avg_stats
 
+def get_gpu_number():
+    visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
+    if visible_devices is not None:
+        gpu_indices = visible_devices.split(',')
+    else:
+        # Query all GPU indices if CUDA_VISIBLE_DEVICES is not set
+        result = subprocess.run(['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'], capture_output=True, text=True)
+        if result.returncode != 0:
+            print("Failed to query GPU indices.")
+            return []
+        gpu_indices = result.stdout.strip().split('\n')
+    # print(f"gpu_indices: {gpu_indices}")
+    gpu_stats = []
+
+    gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
+
+    for index in gpu_indices:
+        result = subprocess.run(['nvidia-smi', '-i', index], capture_output=True, text=True)
+        output = result.stdout.strip()
+        lines = output.split("\n")
+        for line in lines:
+            match = gpu_info_pattern.search(line)
+            gpu_info = {}
+            if match:
+                temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
+                gpu_info.update({
+                    GPU_TEMP: temp,
+                    GPU_Power: power_usage,
+                    GPU_Mem: round(mem_usage / 1024, 2),
+                    GPU_Util: gpu_util
+                })
+
+            if len(gpu_info) >= 4:
+                gpu_stats.append(gpu_info)
+
+    return len(gpu_stats)
+
+def get_gpu_details():
+    gpus = GPUtil.getGPUs()
+    gpu = gpus[0]
+    name = gpu.name.replace(" ", "-")
+    memory_gb = round(gpu.memoryTotal / 1024)
+    memory = f"{memory_gb}GB"
+
+    for part in name.split('-'):
+        if part.endswith("GB") and part[:-2].isdigit():
+            name = name.replace(f"-{part}", "").replace(part, "")
+
+    formatted_name = f"{name}-{memory}"
+
+    return formatted_name
+
+def get_peak_bw(gpu_name):
+    return MEM_BW_DICT[gpu_name]
+
+def get_peak_flops(gpu_name, precision):
+    return PEAK_FLOPS_DICT[precision][gpu_name]
+
+def transfer_precision2bytes(precision):
+    if precision == "float32":
+        return 4
+    elif precision in ["float16", "bfloat16"]:
+        return 2
+    elif precision == "8bit":
+        return 1
+    elif precision == "4bit":
+        return 0.5
+    else:
+        raise ValueError(f"Unsupported precision: {precision}")
+
 if __name__ == "__main__":
    print(analyze_gpu_stats(parse_nvidia_smi()))
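
Note (hypothetical usage, not part of the diff): a quick sanity check of the new lookup helpers, assuming the Space's dependencies are installed and the interpreter is started from the repository root so that src.utils is importable. The GPU name must be one of the keys defined in MEM_BW_DICT / PEAK_FLOPS_DICT.

from src.utils import get_peak_bw, get_peak_flops, transfer_precision2bytes

gpu = "NVIDIA-A100-PCIe-80GB"
print(get_peak_bw(gpu))                        # 1935 (GB/s)
print(get_peak_flops(gpu, "float16") / 1e12)   # 624.0 (TFLOPS)
print(transfer_precision2bytes("4bit"))        # 0.5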