Quentin Gallouédec committed
Commit 0811d37
1 Parent(s): 6b9db30

works with cartpole!

app.py CHANGED
@@ -1,5 +1,6 @@
 import logging
 from src.logging import configure_root_logger
+
 logging.getLogger("httpx").setLevel(logging.WARNING)
 logging.getLogger("numexpr").setLevel(logging.WARNING)
 logging.getLogger("absl").setLevel(logging.WARNING)
@@ -8,7 +9,7 @@ configure_root_logger()
 from functools import partial
 
 import gradio as gr
-from main_backend_lighteval import run_auto_eval
+from main_backend_harness import run_auto_eval
 from src.display.log_visualizer import log_file_to_html_string
 from src.display.css_html_js import dark_mode_gradio_js
 from src.envs import REFRESH_RATE, REPO_ID, QUEUE_REPO, RESULTS_REPO
@@ -32,6 +33,7 @@ links_md = f"""
 | Results Repo | [{RESULTS_REPO}](https://huggingface.co/datasets/{RESULTS_REPO}) |
 """
 
+
 def button_auto_eval():
     logger.info("Manually triggering Auto Eval")
     run_auto_eval()
@@ -45,7 +47,7 @@ with gr.Blocks(js=dark_mode_gradio_js) as demo:
     output_html = gr.HTML(partial(log_file_to_html_string, reverse=reverse_order_checkbox), every=1)
     with gr.Row():
         download_button = gr.DownloadButton("Download Log File", value=log_file)
-        with gr.Accordion('Log View Configuration', open=False):
+        with gr.Accordion("Log View Configuration", open=False):
             reverse_order_checkbox.render()
     # Add a button that when pressed, triggers run_auto_eval
     button = gr.Button("Manually Run Evaluation")
@@ -56,5 +58,5 @@ with gr.Blocks(js=dark_mode_gradio_js) as demo:
     button.click(fn=button_auto_eval, inputs=[], outputs=[])
 
 
-if __name__ == '__main__':
-    demo.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0", show_error=True, server_port=7860)
+if __name__ == "__main__":
+    demo.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0", show_error=True, server_port=7860)
custom_tasks.py DELETED
@@ -1,90 +0,0 @@
-# ruff: noqa: F405, F403, F401
-"""
-Custom evaluation tasks for lighteval. Copy this file and complete it with the info for your task.
-
-This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
-
-Author:
-"""
-from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import Doc
-from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
-
-
-## EVAL WITH NO SUBSET ##
-# This is how you create a simple tasks (like hellaswag) which has one single subset
-# attached to it, and one evaluation possible.
-task = LightevalTaskConfig(
-    name="myothertask",
-    prompt_function="prompt_fn",  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
-    suite=["community"],
-    hf_repo="",
-    hf_subset="default",
-    hf_avail_splits=[],
-    evaluation_splits=[],
-    few_shots_split="",
-    few_shots_select="",
-    metric=[""],
-)
-
-## EVALS WITH SUBSET
-# This is how you create a subset task (like MMLU), which has several subset
-# each being its own evaluation task.
-
-# fmt: off
-SAMPLE_SUBSETS = []  # list of all the subsets to use for this eval
-# fmt: on
-
-
-class CustomSubsetTask(LightevalTaskConfig):
-    def __init__(
-        self,
-        name,
-        hf_subset,
-    ):
-        super().__init__(
-            name=name,
-            hf_subset=hf_subset,
-            prompt_function="prompt_fn",  # must be defined in the file
-            hf_repo="",
-            metric=[""],
-            hf_avail_splits=[],
-            evaluation_splits=[],
-            few_shots_split="",
-            few_shots_select="",
-            suite=["community"],
-            generation_size=-1,
-            stop_sequence=None,
-            output_regex=None,
-            frozen=False,
-        )
-
-
-## DEFINE YOUR PROMPT FUNCTIONS
-# Define as many as you need for your different tasks
-def prompt_fn(line, task_name: str = None):
-    """Defines how to go from a dataset line to a doc object.
-    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
-    about what this function should do in the README.
-    """
-    return Doc(
-        task_name=task_name,
-        query="",
-        choices="",
-        gold_index=0,
-        instruction="",
-    )
-
-
-## STORE YOUR EVALS
-SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS]
-_TASKS = SUBSET_TASKS + [task]
-
-## MODULE LOGIC
-# You should not need to touch this
-# Convert to dict for lighteval
-TASKS_TABLE = [task.as_dict() for task in _TASKS]
-
-if __name__ == "__main__":
-    print(t["name"] for t in TASKS_TABLE)
-    print(len(TASKS_TABLE))
main_backend_harness.py CHANGED
@@ -5,13 +5,23 @@ from huggingface_hub import snapshot_download
 
 logging.getLogger("openai").setLevel(logging.WARNING)
 
-from backend.run_eval_suite_harness import run_evaluation
+from src.backend.run_eval_suite_harness import run_evaluation
 from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
 from src.backend.sort_queue import sort_models_by_priority
 
-from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, DEVICE, API, LIMIT, TOKEN
+from src.envs import (
+    QUEUE_REPO,
+    EVAL_REQUESTS_PATH_BACKEND,
+    RESULTS_REPO,
+    EVAL_RESULTS_PATH_BACKEND,
+    DEVICE,
+    API,
+    LIMIT,
+    TOKEN,
+)
 from src.about import Tasks, NUM_FEWSHOT
 from src.logging import setup_logger
+
 TASKS_HARNESS = [task.value.benchmark for task in Tasks]
 
 # logging.basicConfig(level=logging.ERROR)
@@ -23,8 +33,23 @@ RUNNING_STATUS = "RUNNING"
 FINISHED_STATUS = "FINISHED"
 FAILED_STATUS = "FAILED"
 
-snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
+snapshot_download(
+    repo_id=RESULTS_REPO,
+    revision="main",
+    local_dir=EVAL_RESULTS_PATH_BACKEND,
+    repo_type="dataset",
+    max_workers=60,
+    token=TOKEN,
+)
+snapshot_download(
+    repo_id=QUEUE_REPO,
+    revision="main",
+    local_dir=EVAL_REQUESTS_PATH_BACKEND,
+    repo_type="dataset",
+    max_workers=60,
+    token=TOKEN,
+)
+
 
 def run_auto_eval():
     current_pending_status = [PENDING_STATUS]
@@ -39,11 +64,13 @@ def run_auto_eval():
         hf_repo=QUEUE_REPO,
         local_dir=EVAL_REQUESTS_PATH_BACKEND,
         hf_repo_results=RESULTS_REPO,
-        local_dir_results=EVAL_RESULTS_PATH_BACKEND
+        local_dir_results=EVAL_RESULTS_PATH_BACKEND,
     )
 
     # Get all eval request that are PENDING, if you want to run other evals, change this parameter
-    eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
+    eval_requests = get_eval_requests(
+        job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
+    )
     # Sort the evals by priority (first submitted first run)
     eval_requests = sort_models_by_priority(api=API, models=eval_requests)
 
@@ -64,17 +91,12 @@ def run_auto_eval():
     )
 
     run_evaluation(
-        eval_request=eval_request,
-        task_names=TASKS_HARNESS,
-        num_fewshot=NUM_FEWSHOT,
+        eval_request=eval_request,
+        task_names=TASKS_HARNESS,
         local_dir=EVAL_RESULTS_PATH_BACKEND,
         results_repo=RESULTS_REPO,
-        batch_size=1,
-        device=DEVICE,
-        no_cache=True,
-        limit=LIMIT
-    )
+    )
 
 
 if __name__ == "__main__":
-    run_auto_eval()
+    run_auto_eval()
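
Note: run_auto_eval() performs a single pass — sync the queue and results datasets, flip finished RUNNING requests, then evaluate the oldest PENDING request. A minimal sketch, not part of this commit, of driving that pass on a timer with APScheduler (which requirements.txt already pins); REFRESH_RATE comes from src.envs, the rest of the wiring is an assumption about deployment.

# Hypothetical scheduling loop (not in this commit): re-run the backend pass
# every REFRESH_RATE seconds using APScheduler's blocking scheduler.
from apscheduler.schedulers.blocking import BlockingScheduler

from main_backend_harness import run_auto_eval
from src.envs import REFRESH_RATE

scheduler = BlockingScheduler()
scheduler.add_job(run_auto_eval, "interval", seconds=REFRESH_RATE)  # one queue pass per tick
scheduler.start()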
main_backend_lighteval.py DELETED
@@ -1,92 +0,0 @@
-import logging
-import pprint
-
-from huggingface_hub import snapshot_download
-
-logging.getLogger("openai").setLevel(logging.WARNING)
-
-from src.backend.run_eval_suite_lighteval import run_evaluation
-from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
-from src.backend.sort_queue import sort_models_by_priority
-
-from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, API, LIMIT, TOKEN, ACCELERATOR, VENDOR, REGION
-from src.about import TASKS_LIGHTEVAL
-from src.logging import setup_logger
-
-logger = setup_logger(__name__)
-
-# logging.basicConfig(level=logging.ERROR)
-pp = pprint.PrettyPrinter(width=80)
-
-PENDING_STATUS = "PENDING"
-RUNNING_STATUS = "RUNNING"
-FINISHED_STATUS = "FINISHED"
-FAILED_STATUS = "FAILED"
-
-snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-
-def run_auto_eval():
-    current_pending_status = [PENDING_STATUS]
-
-    # pull the eval dataset from the hub and parse any eval requests
-    # check completed evals and set them to finished
-    check_completed_evals(
-        api=API,
-        checked_status=RUNNING_STATUS,
-        completed_status=FINISHED_STATUS,
-        failed_status=FAILED_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-        hf_repo_results=RESULTS_REPO,
-        local_dir_results=EVAL_RESULTS_PATH_BACKEND
-    )
-
-    # Get all eval request that are PENDING, if you want to run other evals, change this parameter
-    eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
-    # Sort the evals by priority (first submitted first run)
-    eval_requests = sort_models_by_priority(api=API, models=eval_requests)
-
-    logger.info(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
-
-    if len(eval_requests) == 0:
-        return
-
-    eval_request = eval_requests[0]
-    logger.info(pp.pformat(eval_request))
-
-
-    set_eval_request(
-        api=API,
-        eval_request=eval_request,
-        set_to_status=RUNNING_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-    )
-
-    # This needs to be done
-    #instance_size, instance_type = get_instance_for_model(eval_request)
-    # For GPU
-    # instance_size, instance_type = "small", "g4dn.xlarge"
-    # For CPU
-    instance_size, instance_type = "medium", "c6i"
-    logger.info(f'Starting Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}')
-
-    run_evaluation(
-        eval_request=eval_request,
-        task_names=TASKS_LIGHTEVAL,
-        local_dir=EVAL_RESULTS_PATH_BACKEND,
-        batch_size=1,
-        accelerator=ACCELERATOR,
-        region=REGION,
-        vendor=VENDOR,
-        instance_size=instance_size,
-        instance_type=instance_type,
-        limit=LIMIT
-    )
-
-    logger.info(f'Completed Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}')
-
-
-if __name__ == "__main__":
-    run_auto_eval()
requirements.txt CHANGED
@@ -2,8 +2,9 @@ APScheduler==3.10.1
 black==23.11.0
 click==8.1.3
 datasets==2.14.5
-gradio==4.4.0 # will have to move to 4.19.2
+gradio==4.25.0
 gradio_client
+gymnasium==0.29.1
 huggingface-hub>=0.18.0
 matplotlib==3.7.1
 numpy==1.24.2
@@ -11,16 +12,6 @@ pandas==2.0.0
 python-dateutil==2.8.2
 requests==2.28.2
 tqdm==4.65.0
-transformers
-tokenizers>=0.15.0
-git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
-git+https://github.com/huggingface/lighteval.git#egg=lighteval
-accelerate==0.24.1
-sentencepiece
-
-# Evaluation suites
-lighteval
-lm_eval
 
 # Log Visualizer
 BeautifulSoup4==4.12.2
scripts/create_request_file.py CHANGED
@@ -1,7 +1,6 @@
 import json
 import os
 import pprint
-import re
 from datetime import datetime, timezone
 
 import click
@@ -9,39 +8,16 @@ from colorama import Fore
 from huggingface_hub import HfApi, snapshot_download
 from src.envs import TOKEN, EVAL_REQUESTS_PATH, QUEUE_REPO
 
-precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ", "float32")
-model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
-weight_types = ("Original", "Delta", "Adapter")
-
-
-def get_model_size(model_info, precision: str):
-    size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
-    try:
-        model_size = round(model_info.safetensors["total"] / 1e9, 3)
-    except (AttributeError, TypeError):
-        try:
-            size_match = re.search(size_pattern, model_info.modelId.lower())
-            model_size = size_match.group(0)
-            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
-        except AttributeError:
-            return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
-
-    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
-    model_size = size_factor * model_size
-    return model_size
-
 
 def main():
     api = HfApi()
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN)
+    snapshot_download(
+        repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN
+    )
 
     model_name = click.prompt("Enter model name")
     revision = click.prompt("Enter revision", default="main")
-    precision = click.prompt("Enter precision", default="float16", type=click.Choice(precisions))
-    model_type = click.prompt("Enter model type", type=click.Choice(model_types))
-    weight_type = click.prompt("Enter weight type", default="Original", type=click.Choice(weight_types))
-    base_model = click.prompt("Enter base model", default="")
     status = click.prompt("Enter status", default="FINISHED")
 
     try:
@@ -50,8 +26,6 @@ def main():
         print(f"{Fore.RED}Could not find model info for {model_name} on the Hub\n{e}{Fore.RESET}")
         return 1
 
-    model_size = get_model_size(model_info=model_info, precision=precision)
-
     try:
         license = model_info.cardData["license"]
     except Exception:
@@ -59,16 +33,10 @@ def main():
 
     eval_entry = {
         "model": model_name,
-        "base_model": base_model,
         "revision": revision,
-        "private": False,
-        "precision": precision,
-        "weight_type": weight_type,
        "status": status,
         "submitted_time": current_time,
-        "model_type": model_type,
        "likes": model_info.likes,
-        "params": model_size,
         "license": license,
     }
 
@@ -85,7 +53,7 @@ def main():
 
     out_dir = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(out_dir, exist_ok=True)
-    out_path = f"{out_dir}/{model_path}_eval_request_{False}_{precision}_{weight_type}.json"
+    out_path = f"{out_dir}/{model_path}_eval_request.json"
 
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
scripts/fix_harness_import.py CHANGED
@@ -8,4 +8,4 @@ import lm_eval
 
 if __name__ == "__main__":
     lm_eval_path = lm_eval.__path__[0]
-    os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)
+    os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)
src/about.py CHANGED
@@ -1,6 +1,7 @@
 from dataclasses import dataclass
 from enum import Enum
 
+
 @dataclass
 class Task:
     benchmark: str
@@ -11,14 +12,16 @@ class Task:
 # Change for your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    # task0 = Task("PongNoFrameskip-v4", "episodic_return", "PongNoFrameskip-v4")
+    task1 = Task("BreakoutNoFrameskip-v4", "episodic_return", "BreakoutNoFrameskip-v4")
+    task2 = Task("CartPole-v1", "episodic_return", "CartPole-v1")
+
 
-NUM_FEWSHOT = 0 # Change with your few shot
+NUM_FEWSHOT = 0  # Change with your few shot
 
 TASKS_HARNESS = [task.value.benchmark for task in Tasks]
 # ---------------------------------------------------
 
-TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
-#custom|myothertask|0|0
+TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
+# custom|myothertask|0|0
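
Note: the enum above feeds the module-level TASKS_HARNESS list, so the backend now iterates over plain gymnasium environment IDs. A small worked example of what that list resolves to once the module is imported (the commented-out Pong entry is excluded; the assertion is only illustrative):

# Worked example: the benchmark list derived from the Tasks enum above.
from src.about import TASKS_HARNESS, Tasks

assert TASKS_HARNESS == [task.value.benchmark for task in Tasks]
print(TASKS_HARNESS)  # ['BreakoutNoFrameskip-v4', 'CartPole-v1']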
src/backend/manage_requests.py CHANGED
@@ -9,41 +9,18 @@ from src.logging import setup_logger
 
 logger = setup_logger(__name__)
 
+
 @dataclass
 class EvalRequest:
     model: str
-    private: bool
     status: str
     json_filepath: str
-    weight_type: str = "Original"
-    model_type: str = ""  # pretrained, finetuned, with RL
-    precision: str = ""  # float16, bfloat16
-    base_model: Optional[str] = None  # for adapter models
-    revision: str = "main"  # commit
-    submitted_time: Optional[str] = "2022-05-18T11:40:22.519222"  # random date just so that we can still order requests by date
-    model_type: Optional[str] = None
+    revision: str = "main"  # commit
+    submitted_time: Optional[
+        str
+    ] = "2022-05-18T11:40:22.519222"  # random date just so that we can still order requests by date
     likes: Optional[int] = 0
-    params: Optional[int] = None
     license: Optional[str] = ""
-
-    def get_model_args(self):
-        model_args = f"pretrained={self.model},revision={self.revision}"
-
-        if self.precision in ["float16", "bfloat16", "float32"]:
-            model_args += f",dtype={self.precision}"
-        # Quantized models need some added config, the install of bits and bytes, etc
-        #elif self.precision == "8bit":
-        #    model_args += ",load_in_8bit=True"
-        #elif self.precision == "4bit":
-        #    model_args += ",load_in_4bit=True"
-        #elif self.precision == "GPTQ":
-        # A GPTQ model does not need dtype to be specified,
-        # it will be inferred from the config
-            pass
-        else:
-            raise Exception(f"Unknown precision {self.precision}.")
-
-        return model_args
 
 
 def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str):
@@ -74,7 +51,9 @@ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[Ev
     Returns:
         `list[EvalRequest]`: a list of model info dicts.
     """
-    snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60, token=TOKEN)
+    snapshot_download(
+        repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60, token=TOKEN
+    )
     json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
 
     eval_requests = []
@@ -100,7 +79,14 @@ def check_completed_evals(
     local_dir_results: str,
 ):
     """Checks if the currently running evals are completed, if yes, update their status on the hub."""
-    snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results, repo_type="dataset", max_workers=60, token=TOKEN)
+    snapshot_download(
+        repo_id=hf_repo_results,
+        revision="main",
+        local_dir=local_dir_results,
+        repo_type="dataset",
+        max_workers=60,
+        token=TOKEN,
+    )
 
     running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
 
@@ -114,12 +100,8 @@ def check_completed_evals(
         output_file_exists = len(glob.glob(output_file)) > 0
 
         if output_file_exists:
-            logger.info(
-                f"EXISTS output file exists for {model} setting it to {completed_status}"
-            )
+            logger.info(f"EXISTS output file exists for {model} setting it to {completed_status}")
             set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
         else:
-            logger.info(
-                f"No result file found for {model} setting it to {failed_status}"
-            )
+            logger.info(f"No result file found for {model} setting it to {failed_status}")
             set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
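
Note: with the precision/weight-type/size fields gone, a queue entry only needs the fields the RL backend actually uses. A hypothetical illustration (all values invented, the file path is made up) of how such a JSON entry maps onto the slimmed-down dataclass:

# Hypothetical queue entry; the remaining keys map one-to-one onto the
# EvalRequest fields defined above.
import json

from src.backend.manage_requests import EvalRequest

raw = json.loads(
    '{"model": "someuser/dqn-CartPole-v1", "revision": "main", "status": "PENDING",'
    ' "submitted_time": "2024-03-20T12:00:00Z", "likes": 0, "license": "mit"}'
)
request = EvalRequest(json_filepath="local/path/to/that/entry.json", **raw)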
src/backend/run_eval_suite_harness.py CHANGED
@@ -3,41 +3,76 @@ import os
 import logging
 from datetime import datetime
 
-from lm_eval import tasks, evaluator, utils
-
 from src.envs import RESULTS_REPO, API
 from src.backend.manage_requests import EvalRequest
 from src.logging import setup_logger
+from src.backend.evaluate import run_evaluation
+import fnmatch
+import torch
+from torch import nn
+from huggingface_hub.utils._errors import EntryNotFoundError
+
+import gymnasium as gym
+
+
+import numpy as np
+from typing import List
+from huggingface_hub import hf_hub_download
+from src.backend.manage_requests import EvalRequest
 
 logging.getLogger("openai").setLevel(logging.WARNING)
 logger = setup_logger(__name__)
 
-def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, local_dir: str, results_repo: str, no_cache=True, limit=None):
-    if limit:
-        logger.info(
-            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
-        )
 
-    task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)
+def pattern_match(patterns, source_list):
+    if isinstance(patterns, str):
+        patterns = [patterns]
+
+    task_names = set()
+    for pattern in patterns:
+        for matching in fnmatch.filter(source_list, pattern):
+            task_names.add(matching)
+    return sorted(list(task_names))
+
+
+def run_evaluation(eval_request: EvalRequest, task_names, local_dir: str, results_repo: str):
+    tags = API.model_info(eval_request.model).tags
+    task_names = pattern_match(tags, task_names)
 
     logger.info(f"Selected Tasks: {task_names}")
 
-    results = evaluator.simple_evaluate(
-        model="hf-causal-experimental", # "hf-causal"
-        model_args=eval_request.get_model_args(),
-        tasks=task_names,
-        num_fewshot=num_fewshot,
-        batch_size=batch_size,
-        device=device,
-        no_cache=no_cache,
-        limit=limit,
-        write_out=True,
-        output_base_path="logs"
-    )
+    results = {
+        "config": {
+            "model_name": eval_request.model,
+            "model_sha": eval_request.revision,
+        },
+        "results": {},
+    }
+    try:
+        agent_path = hf_hub_download(repo_id=eval_request.model, filename="agent.pt")
+    except EntryNotFoundError:
+        logger.error("Agent not found")
+        return
+    agent = torch.jit.load(agent_path)
 
-    results["config"]["model_dtype"] = eval_request.precision
-    results["config"]["model_name"] = eval_request.model
-    results["config"]["model_sha"] = eval_request.revision
+    episodic_rewards = []
+    for task_name in task_names:
+        env = gym.make(task_name)
+        for _ in range(10):
+            episodic_reward = 0.0
+            observation, info = env.reset()
+            done = False
+            while not done:
+                torch_observation = torch.from_numpy(np.array([observation]))
+                action = agent(torch_observation).numpy()[0]
+                observation, reward, terminated, truncated, info = env.step(action)
+                done = terminated or truncated
+                episodic_reward += reward
+
+            episodic_rewards.append(episodic_reward)
+
+        mean_reward = np.mean(episodic_rewards)
+        results[task_name] = {"episodic_return": mean_reward}
 
     dumped = json.dumps(results, indent=2)
     logger.info(dumped)
@@ -47,8 +82,6 @@ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_siz
     with open(output_path, "w") as f:
         f.write(dumped)
 
-    logger.info(evaluator.make_table(results))
-
     API.upload_file(
        path_or_fileobj=output_path,
        path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
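
Note: the evaluator above expects the model repository to contain an agent.pt file — a TorchScript module that takes a (batch, obs_dim) float tensor and returns one action per row, since agent(obs).numpy()[0] is passed straight to env.step. A minimal sketch, not from this repo, of exporting a compatible (untrained, purely illustrative) agent for CartPole-v1:

# Minimal sketch of producing an agent.pt the evaluator can load. The network
# and its weights are illustrative placeholders, not a trained policy.
import torch
from torch import nn


class CartPolePolicy(nn.Module):
    def __init__(self):
        super().__init__()
        # CartPole-v1: 4-dimensional observation, 2 discrete actions
        self.net = nn.Sequential(nn.Linear(4, 64), nn.Tanh(), nn.Linear(64, 2))

    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        # Return the greedy action index for each observation in the batch
        return self.net(observations).argmax(dim=-1)


torch.jit.script(CartPolePolicy()).save("agent.pt")
# Upload agent.pt to the model repository that gets submitted to the queue.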
src/backend/run_eval_suite_lighteval.py DELETED
@@ -1,72 +0,0 @@
-import json
-import argparse
-import logging
-from datetime import datetime
-
-from lighteval.main_accelerate import main, EnvConfig, create_model_config, load_model
-
-from src.envs import RESULTS_REPO, CACHE_PATH, TOKEN
-from src.backend.manage_requests import EvalRequest
-from src.logging import setup_logger
-
-logging.getLogger("openai").setLevel(logging.WARNING)
-logger = setup_logger(__name__)
-
-def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int, local_dir: str, accelerator: str, region: str, vendor: str, instance_size: str, instance_type: str, limit=None):
-    if limit:
-        logger.info("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
-
-    args_dict = {
-        # Endpoint parameters
-        "endpoint_model_name":eval_request.model,
-        "accelerator": accelerator,
-        "vendor": vendor,
-        "region": region,
-        "instance_size": instance_size,
-        "instance_type": instance_type,
-        "reuse_existing": False,
-        "model_dtype": eval_request.precision,
-        "revision": eval_request.revision,
-        # Save parameters
-        "push_results_to_hub": True,
-        "save_details": True,
-        "push_details_to_hub": True,
-        "public_run": False,
-        "cache_dir": CACHE_PATH,
-        "results_org": RESULTS_REPO,
-        "output_dir": local_dir,
-        "job_id": str(datetime.now()),
-        # Experiment parameters
-        "override_batch_size": batch_size,
-        "custom_tasks": "custom_tasks.py",
-        "tasks": task_names,
-        "max_samples": limit,
-        "use_chat_template": False,
-        "system_prompt": None,
-        # Parameters which would be set to things by the kwargs if actually using argparse
-        "inference_server_address": None,
-        "model_args": None,
-        "num_fewshot_seeds": None,
-        "delta_weights": False,
-        "adapter_weights": False
-    }
-    args = argparse.Namespace(**args_dict)
-
-    try:
-        results = main(args)
-
-        results["config"]["model_dtype"] = eval_request.precision
-        results["config"]["model_name"] = eval_request.model
-        results["config"]["model_sha"] = eval_request.revision
-
-        dumped = json.dumps(results, indent=2)
-        logger.info(dumped)
-    except Exception as e:  # if eval failed, we force a cleanup
-        env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)
-
-        model_config = create_model_config(args=args, accelerator=accelerator)
-        model, _ = load_model(config=model_config, env_config=env_config)
-        model.cleanup()
-
-
-    return results
src/backend/sort_queue.py CHANGED
@@ -9,20 +9,15 @@ from src.backend.manage_requests import EvalRequest
 @dataclass
 class ModelMetadata:
     likes: int = 0
-    size: int = 15
 
 
 def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
-    private_models = [model for model in models if model.private]
-    public_models = [model for model in models if not model.private]
+    return sort_by_submit_date(models)
 
-    return sort_by_submit_date(private_models) + sort_by_submit_date(public_models)
 
 def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
     return sorted(eval_requests, key=lambda x: x.submitted_time, reverse=False)
 
-def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
-    return sorted(eval_requests, key=lambda x: x.params, reverse=False)
 
 def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
-    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
+    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
src/display/log_visualizer.py CHANGED
@@ -12,8 +12,8 @@ from src.logging import log_file
 
 def log_file_to_html_string(reverse=True):
     with open(log_file, "rt") as f:
-        lines = f.readlines()
-        lines = lines[-NUM_LINES_VISUALIZE:]
+        lines = f.readlines()
+        lines = lines[-NUM_LINES_VISUALIZE:]
 
     if reverse:
         lines = reversed(lines)
@@ -26,12 +26,12 @@ def log_file_to_html_string(reverse=True):
     html_content = console.export_html(inline_styles=True)
 
     # Parse the HTML content using BeautifulSoup
-    soup = BeautifulSoup(html_content, 'lxml')
+    soup = BeautifulSoup(html_content, "lxml")
 
     # Modify the <pre> tag and add custom styles
     pre_tag = soup.pre
-    pre_tag['class'] = 'scrollable'
-    del pre_tag['style']
+    pre_tag["class"] = "scrollable"
+    del pre_tag["style"]
 
     # Add your custom styles and the .scrollable CSS to the <style> tag
     style_tag = soup.style
src/envs.py CHANGED
@@ -4,13 +4,13 @@ from huggingface_hub import HfApi
 
 # Info to change for your repository
 # ----------------------------------
-TOKEN = os.environ.get("TOKEN") # A read/write token for your org
+TOKEN = os.environ.get("TOKEN")  # A read/write token for your org
 
-OWNER = "open-rl-leaderboard" # Change to your org - don't forget to create a results and request file
+OWNER = "open-rl-leaderboard"  # Change to your org - don't forget to create a results and request file
 
 # For harness evaluations
-DEVICE = "cpu" # "cuda:0" if you add compute, for harness evaluations
-LIMIT = 20 # !!!! Should be None for actual evaluations!!!
+DEVICE = "cpu"  # "cuda:0" if you add compute, for harness evaluations
+LIMIT = 20  # !!!! Should be None for actual evaluations!!!
 
 # For lighteval evaluations
 ACCELERATOR = "cpu"
@@ -23,7 +23,7 @@ QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
 # If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
+CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
@@ -35,4 +35,3 @@ REFRESH_RATE = 1 * 60  # 1 min
 NUM_LINES_VISUALIZE = 300
 
 API = HfApi(token=TOKEN)
-
src/logging.py CHANGED
@@ -3,7 +3,7 @@ from pathlib import Path
 
 proj_dir = Path(__file__).parents[1]
 
-log_file = proj_dir/"output.log"
+log_file = proj_dir / "output.log"
 
 
 import logging
@@ -13,7 +13,7 @@ def setup_logger(name: str):
     logger = logging.getLogger(name)
     logger.setLevel(logging.INFO)
 
-    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 
     # Create a file handler to write logs to a file
     file_handler = logging.FileHandler(log_file)
@@ -29,10 +29,10 @@ def configure_root_logger():
     logging.basicConfig(level=logging.INFO)
     root_logger = logging.getLogger()
 
-    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 
     file_handler = logging.FileHandler(log_file)
     file_handler.setLevel(logging.INFO)
     file_handler.setFormatter(formatter)
 
-    root_logger.addHandler(file_handler)
+    root_logger.addHandler(file_handler)