Clémentine committed on
Commit
1ffc326
·
1 Parent(s): 943f952

now with a functioning backend

Browse files
.gitignore CHANGED
@@ -6,10 +6,8 @@ __pycache__/
6
  *ipynb
7
  .vscode/
8
 
9
- gpt_4_evals/
10
- human_evals/
11
  eval-queue/
12
  eval-results/
13
- auto_evals/
14
-
15
- src/assets/model_counts.html
 
6
  *ipynb
7
  .vscode/
8
 
 
 
9
  eval-queue/
10
  eval-results/
11
+ eval-queue-bk/
12
+ eval-results-bk/
13
+ logs/
README.md CHANGED
@@ -12,7 +12,7 @@ license: apache-2.0
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14
 
15
- Most of the variables to change for a default leaderboard are in env (replace the path for your leaderboard) and src/display/about.
16
 
17
  Results files should have the following format:
18
  ```
@@ -33,4 +33,8 @@ Results files should have the following format:
33
  }
34
  ```
35
 
36
- Request files are created automatically by this tool.
 
 
 
 
 
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14
 
15
+ Most of the variables to change for a default leaderboard are in src/envs.py (replace the path for your leaderboard) and src/about.py.
16
 
17
  Results files should have the following format:
18
  ```
 
33
  }
34
  ```
35
 
36
+ Request files are created automatically by this tool.
37
+
38
+ If you encounter problems on the space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
39
+
40
+ If you want to run your own backend, you only need to change the logic in src/backend/run_eval_suite, which at the moment launches the Eleuther AI Harness.
app.py CHANGED
@@ -1,9 +1,10 @@
 
1
  import gradio as gr
2
  import pandas as pd
3
  from apscheduler.schedulers.background import BackgroundScheduler
4
  from huggingface_hub import snapshot_download
5
 
6
- from src.display.about import (
7
  CITATION_BUTTON_LABEL,
8
  CITATION_BUTTON_TEXT,
9
  EVALUATION_QUEUE_TEXT,
@@ -30,9 +31,14 @@ from src.populate import get_evaluation_queue_df, get_leaderboard_df
30
  from src.submission.submit import add_new_eval
31
 
32
 
 
 
33
  def restart_space():
34
  API.restart_space(repo_id=REPO_ID, token=TOKEN)
35
 
 
 
 
36
  try:
37
  print(EVAL_REQUESTS_PATH)
38
  snapshot_download(
@@ -342,5 +348,8 @@ with demo:
342
 
343
  scheduler = BackgroundScheduler()
344
  scheduler.add_job(restart_space, "interval", seconds=1800)
 
345
  scheduler.start()
346
  demo.queue(default_concurrency_limit=40).launch()
 
 
 
1
+ import subprocess
2
  import gradio as gr
3
  import pandas as pd
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
  from huggingface_hub import snapshot_download
6
 
7
+ from src.about import (
8
  CITATION_BUTTON_LABEL,
9
  CITATION_BUTTON_TEXT,
10
  EVALUATION_QUEUE_TEXT,
 
31
  from src.submission.submit import add_new_eval
32
 
33
 
34
+ subprocess.run(["python", "scripts/fix_harness_import.py"])
35
+
36
def restart_space():
    """Restart the Space `REPO_ID` through the Hub API, authenticating with `TOKEN`."""
    API.restart_space(repo_id=REPO_ID, token=TOKEN)
38
 
39
def launch_backend():
    """Run one pass of the evaluation backend in a child process.

    `main_backend.py` is started with the interpreter that is running this app
    (`sys.executable`) rather than whatever `python` resolves to on PATH, so
    the child process sees the same installed packages. The call blocks until
    the child exits.
    """
    import sys  # local import keeps this block self-contained

    completed = subprocess.run([sys.executable, "main_backend.py"])
    if completed.returncode != 0:
        # Best effort: report the failure but let the scheduler keep running.
        print(f"main_backend.py exited with code {completed.returncode}")
41
+
42
  try:
43
  print(EVAL_REQUESTS_PATH)
44
  snapshot_download(
 
348
 
349
  scheduler = BackgroundScheduler()
350
  scheduler.add_job(restart_space, "interval", seconds=1800)
351
+ scheduler.add_job(launch_backend, "interval", seconds=100) # will only allow one job to be run at the same time
352
  scheduler.start()
353
  demo.queue(default_concurrency_limit=40).launch()
354
+
355
+ restart_space()
main_backend.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import pprint
3
+
4
+ from huggingface_hub import snapshot_download
5
+
6
+ logging.getLogger("openai").setLevel(logging.WARNING)
7
+
8
+ from src.backend.run_eval_suite import run_evaluation
9
+ from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
10
+ from src.backend.sort_queue import sort_models_by_priority
11
+
12
+ from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, DEVICE, API, LIMIT
13
+ from src.about import Tasks, NUM_FEWSHOT
14
+ TASKS_HARNESS = [task.value.benchmark for task in Tasks]
15
+
16
+ logging.basicConfig(level=logging.ERROR)
17
+ pp = pprint.PrettyPrinter(width=80)
18
+
19
+ PENDING_STATUS = "PENDING"
20
+ RUNNING_STATUS = "RUNNING"
21
+ FINISHED_STATUS = "FINISHED"
22
+ FAILED_STATUS = "FAILED"
23
+
24
+ snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
25
+ snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
26
+
27
def run_auto_eval():
    """Process at most one pending evaluation request.

    Steps, in order:
      1. Let `check_completed_evals` update the status of requests currently
         marked RUNNING (FINISHED when a results file exists, FAILED otherwise).
      2. Fetch the PENDING requests and order them with
         `sort_models_by_priority`.
      3. Mark the first request RUNNING on the hub and evaluate it.

    Returns None; exits early when there is nothing pending.
    """
    # Statuses to pick up; change this list if you want to run other evals.
    current_pending_status = [PENDING_STATUS]

    # Pull the eval datasets from the hub and settle the requests that were running.
    check_completed_evals(
        api=API,
        checked_status=RUNNING_STATUS,
        completed_status=FINISHED_STATUS,
        failed_status=FAILED_STATUS,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
        hf_repo_results=RESULTS_REPO,
        local_dir_results=EVAL_RESULTS_PATH_BACKEND,
    )

    candidates = get_eval_requests(
        job_status=current_pending_status,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
    )
    # First submitted, first run.
    queue = sort_models_by_priority(api=API, models=candidates)

    print(f"Found {len(queue)} {','.join(current_pending_status)} eval requests")

    if not queue:
        return

    eval_request = queue[0]
    pp.pprint(eval_request)

    set_eval_request(
        api=API,
        eval_request=eval_request,
        set_to_status=RUNNING_STATUS,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
    )

    run_evaluation(
        eval_request=eval_request,
        task_names=TASKS_HARNESS,
        num_fewshot=NUM_FEWSHOT,
        local_dir=EVAL_RESULTS_PATH_BACKEND,
        results_repo=RESULTS_REPO,
        batch_size=1,
        device=DEVICE,
        no_cache=True,
        limit=LIMIT,
    )
75
+
76
+
77
+ if __name__ == "__main__":
78
+ run_auto_eval()
requirements.txt CHANGED
@@ -12,4 +12,6 @@ python-dateutil==2.8.2
12
  requests==2.28.2
13
  tqdm==4.65.0
14
  transformers==4.35.2
15
- tokenizers>=0.15.0
 
 
 
12
  requests==2.28.2
13
  tqdm==4.65.0
14
  transformers==4.35.2
15
+ tokenizers>=0.15.0
16
+ git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
17
+ accelerate==0.24.1
scripts/fix_harness_import.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
"""This file should be used after `pip install -r requirements.txt`.

It creates a folder that is not ported during harness package creation (the harness does not use a
Manifest file at the moment, and its packaging ignores `.json` files).
It will need to be updated to actually copy the json files if we want to use the harness' version
of big bench.
"""
import os

import lm_eval

if __name__ == "__main__":
    # Directory of the installed lm_eval package.
    lm_eval_path = lm_eval.__path__[0]
    # Create <lm_eval>/datasets/bigbench_resources; exist_ok makes re-runs harmless.
    os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)
src/{display/about.py → about.py} RENAMED
@@ -11,8 +11,12 @@ class Task:
11
  # Init: to update with your specific keys
12
  class Tasks(Enum):
13
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
14
- task0 = Task("task_name1", "metric_name", "First task")
15
- task1 = Task("task_name2", "metric_name", "Second task")
 
 
 
 
16
 
17
 
18
  # Your leaderboard name
 
11
  # Init: to update with your specific keys
12
  class Tasks(Enum):
13
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
14
+ task0 = Task("anli_r1", "acc", "ANLI")
15
+ task1 = Task("logiqa", "acc_norm", "LogiQA")
16
+
17
+ TASKS_HARNESS = [task.value.benchmark for task in Tasks]
18
+
19
+ NUM_FEWSHOT = 0 # Change with your few shot
20
 
21
 
22
  # Your leaderboard name
src/backend/manage_requests.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import json
3
+ from dataclasses import dataclass
4
+ from typing import Optional
5
+
6
+ from huggingface_hub import HfApi, snapshot_download
7
+ from src.envs import TOKEN
8
+
9
@dataclass
class EvalRequest:
    """One evaluation request, as stored in a request JSON file.

    `model_type` used to be declared twice in this class; the later declaration
    (`Optional[str] = None`) silently replaced the earlier one. It is now
    declared once, at its original position and with the effective default, so
    field order and defaults are unchanged for callers.
    """

    model: str
    private: bool
    status: str
    json_filepath: str
    weight_type: str = "Original"
    model_type: Optional[str] = None  # pretrained, finetuned, with RL
    precision: str = ""  # float16, bfloat16, 8bit, 4bit, GPTQ
    base_model: Optional[str] = None  # for adapter models
    revision: str = "main"  # commit
    submitted_time: Optional[str] = "2022-05-18T11:40:22.519222"  # random date just so that we can still order requests by date
    likes: Optional[int] = 0
    params: Optional[int] = None
    license: Optional[str] = ""

    def get_model_args(self):
        """Build the comma-separated `model_args` string for this request.

        Returns:
            `str`: `pretrained=<model>,revision=<revision>` followed by the
            option matching `precision` (nothing extra for GPTQ).

        Raises:
            ValueError: if `precision` is not one of float16, bfloat16, 8bit,
                4bit or GPTQ (this includes the default empty string).
        """
        model_args = f"pretrained={self.model},revision={self.revision}"

        if self.precision in ["float16", "bfloat16"]:
            model_args += f",dtype={self.precision}"
        elif self.precision == "8bit":
            model_args += ",load_in_8bit=True"
        elif self.precision == "4bit":
            model_args += ",load_in_4bit=True"
        elif self.precision == "GPTQ":
            # A GPTQ model does not need dtype to be specified,
            # it will be inferred from the config
            pass
        else:
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError(f"Unknown precision {self.precision}.")

        return model_args
43
+
44
+
45
def set_eval_request(
    api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str
):
    """Write a new status into a request file and push that file to the hub.

    The local JSON file at `eval_request.json_filepath` is rewritten with its
    "status" key set to `set_to_status`, then uploaded to the `hf_repo` dataset.
    The path inside the repo is the local path with `local_dir` removed.
    """
    request_path = eval_request.json_filepath

    with open(request_path) as source:
        payload = json.load(source)
    payload["status"] = set_to_status

    serialized = json.dumps(payload)
    with open(request_path, "w") as target:
        target.write(serialized)

    repo_path = request_path.replace(local_dir, "")
    api.upload_file(
        path_or_fileobj=request_path,
        path_in_repo=repo_path,
        repo_id=hf_repo,
        repo_type="dataset",
    )
65
+
66
+
67
def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
    """Download the request dataset and return the requests with a wanted status.

    Args:
        job_status: Statuses to keep. A list of strings is expected; a bare
            string is treated as a single status so that it is compared by
            equality instead of by substring.
        local_dir: Local folder the `hf_repo` dataset is downloaded into and
            scanned (recursively) for `*.json` files.
        hf_repo: Id of the hub dataset holding the request files.

    Returns:
        `list[EvalRequest]`: one entry per matching request file, in the order
        the files were found. No sorting is applied here.
    """
    # `"RUN" in "RUNNING"` is True for strings, so normalise a lone status first.
    wanted_statuses = [job_status] if isinstance(job_status, str) else job_status

    snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60)
    json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)

    eval_requests = []
    for json_filepath in json_files:
        with open(json_filepath) as fp:
            data = json.load(fp)
        if data["status"] in wanted_statuses:
            # Remember where the request came from so its status can be updated later.
            data["json_filepath"] = json_filepath
            eval_requests.append(EvalRequest(**data))

    return eval_requests
88
+
89
+
90
def check_completed_evals(
    api: HfApi,
    hf_repo: str,
    local_dir: str,
    checked_status: str,
    completed_status: str,
    failed_status: str,
    hf_repo_results: str,
    local_dir_results: str,
):
    """Update the status of every request currently in `checked_status`.

    The results dataset is downloaded first. For each request whose status is
    `checked_status`, the request is set to `completed_status` when a
    `results*.json` file exists under `<local_dir_results>/<model>/`, and to
    `failed_status` otherwise. Status changes are pushed to `hf_repo`.
    """
    snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results, repo_type="dataset", max_workers=60)

    # get_eval_requests takes a list of statuses; passing the bare string would
    # turn its membership test into a substring match.
    running_evals = get_eval_requests([checked_status], hf_repo=hf_repo, local_dir=local_dir)

    for eval_request in running_evals:
        model = eval_request.model
        print("====================================")
        print(f"Checking {model}")

        output_path = model
        output_file = f"{local_dir_results}/{output_path}/results*.json"
        output_file_exists = len(glob.glob(output_file)) > 0

        if output_file_exists:
            print(
                f"EXISTS output file exists for {model} setting it to {completed_status}"
            )
            set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
        else:
            print(
                f"No result file found for {model} setting it to {failed_status}"
            )
            set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
src/backend/run_eval_suite.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import logging
4
+ from datetime import datetime
5
+
6
+ from lm_eval import tasks, evaluator, utils
7
+
8
+ from src.envs import RESULTS_REPO, API
9
+ from src.backend.manage_requests import EvalRequest
10
+
11
+ logging.getLogger("openai").setLevel(logging.WARNING)
12
+
13
def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, local_dir: str, results_repo: str, no_cache=True, limit=None):
    """Evaluate one request with the harness, save the results and upload them.

    Args:
        eval_request: Request to evaluate; supplies the model arguments and the
            model name, precision and revision recorded in the results config.
        task_names: Task name patterns, matched against `tasks.ALL_TASKS`.
        num_fewshot, batch_size, device, no_cache, limit: Forwarded unchanged to
            `evaluator.simple_evaluate`. A truthy `limit` prints a warning, as
            it is meant for testing only.
        local_dir: Local root folder; the file is written to
            `<local_dir>/<model path parts>/results_<timestamp>.json`.
        results_repo: Hub dataset the results file is uploaded to.

    Returns:
        The results dict from `evaluator.simple_evaluate`, with `model_dtype`,
        `model_name` and `model_sha` added to its "config" entry.
    """
    if limit:
        print(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)

    print(f"Selected Tasks: {task_names}")

    results = evaluator.simple_evaluate(
        model="hf-causal-experimental",  # "hf-causal"
        model_args=eval_request.get_model_args(),
        tasks=task_names,
        num_fewshot=num_fewshot,
        batch_size=batch_size,
        device=device,
        no_cache=no_cache,
        limit=limit,
        write_out=True,
        output_base_path="logs"
    )

    results["config"]["model_dtype"] = eval_request.precision
    results["config"]["model_name"] = eval_request.model
    results["config"]["model_sha"] = eval_request.revision

    dumped = json.dumps(results, indent=2)
    print(dumped)

    # Build the file name once: two separate datetime.now() calls gave the local
    # file and the uploaded file different names.
    results_filename = f"results_{datetime.now()}.json"

    output_path = os.path.join(local_dir, *eval_request.model.split("/"), results_filename)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w") as f:
        f.write(dumped)

    print(evaluator.make_table(results))

    API.upload_file(
        path_or_fileobj=output_path,
        path_in_repo=f"{eval_request.model}/{results_filename}",
        repo_id=results_repo,
        repo_type="dataset",
    )

    return results
src/backend/sort_queue.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from dataclasses import dataclass
3
+
4
+ from huggingface_hub import HfApi
5
+
6
+ from src.backend.manage_requests import EvalRequest
7
+
8
+
9
@dataclass
class ModelMetadata:
    """Small record of model metadata with default values.

    Nothing else in this module references it.
    """

    likes: int = 0
    size: int = 15  # NOTE(review): unit is not stated here; confirm what 15 stands for
13
+
14
+
15
def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
    """Order requests: private models first, then public ones, each group oldest first.

    `api` is accepted for interface compatibility and is not used.
    """
    private_group = []
    public_group = []
    for request in models:
        if request.private:
            private_group.append(request)
        else:
            public_group.append(request)

    return sort_by_submit_date(private_group) + sort_by_submit_date(public_group)
20
+
21
def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    """Return a new list of the requests ordered by ascending `submitted_time`."""
    ordered = list(eval_requests)
    ordered.sort(key=lambda request: request.submitted_time)
    return ordered
23
+
24
def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    """Return a new list of the requests ordered by ascending `params`.

    `EvalRequest.params` defaults to None, and comparing None with an int
    raises TypeError. Requests without a known size are therefore placed after
    all sized ones. Ordering among sized requests is unchanged.
    """
    def size_key(request):
        unknown = request.params is None
        return (unknown, 0 if unknown else request.params)

    return sorted(eval_requests, key=size_key, reverse=False)
26
+
27
def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    """Return a new list of the requests ordered by ascending `likes`."""
    ordered = list(eval_requests)
    ordered.sort(key=lambda request: request.likes)
    return ordered
src/display/formatting.py CHANGED
@@ -1,12 +1,3 @@
1
- import os
2
- from datetime import datetime, timezone
3
-
4
- from huggingface_hub import HfApi
5
- from huggingface_hub.hf_api import ModelInfo
6
-
7
-
8
- API = HfApi()
9
-
10
  def model_hyperlink(link, model_name):
11
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
12
 
 
 
 
 
 
 
 
 
 
 
1
  def model_hyperlink(link, model_name):
2
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
3
 
src/display/utils.py CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
3
 
4
  import pandas as pd
5
 
6
- from src.display.about import Tasks
7
 
8
  def fields(raw_class):
9
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
3
 
4
  import pandas as pd
5
 
6
+ from src.about import Tasks
7
 
8
  def fields(raw_class):
9
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
src/envs.py CHANGED
@@ -2,18 +2,26 @@ import os
2
 
3
  from huggingface_hub import HfApi
4
 
5
- # clone / pull the lmeh eval data
6
- TOKEN = os.environ.get("TOKEN", None)
 
 
 
 
 
 
7
 
8
- OWNER = "demo-leaderboard"
9
  REPO_ID = f"{OWNER}/leaderboard"
10
  QUEUE_REPO = f"{OWNER}/requests"
11
  RESULTS_REPO = f"{OWNER}/results"
12
 
 
13
  CACHE_PATH=os.getenv("HF_HOME", ".")
14
 
15
  # Local caches
16
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
17
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
 
 
18
 
19
  API = HfApi(token=TOKEN)
 
2
 
3
  from huggingface_hub import HfApi
4
 
5
+ # Info to change for your repository
6
+ # ----------------------------------
7
+ TOKEN = os.environ.get("TOKEN", None) # A read/write token for your org
8
+
9
+ OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request file
10
+ DEVICE = "cpu" # cuda:0 if you add compute
11
+ LIMIT = 20 # !!!! Should be None for actual evaluations!!!
12
+ # ----------------------------------
13
 
 
14
  REPO_ID = f"{OWNER}/leaderboard"
15
  QUEUE_REPO = f"{OWNER}/requests"
16
  RESULTS_REPO = f"{OWNER}/results"
17
 
18
+ # If you setup a cache later, just change HF_HOME
19
  CACHE_PATH=os.getenv("HF_HOME", ".")
20
 
21
  # Local caches
22
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
23
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
24
+ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
25
+ EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
26
 
27
  API = HfApi(token=TOKEN)
src/leaderboard/read_evals.py CHANGED
@@ -103,7 +103,7 @@ class EvalResult:
103
  self.num_params = request.get("params", 0)
104
  self.date = request.get("submitted_time", "")
105
  except Exception:
106
- print(f"Could not find request file for {self.org}/{self.model}")
107
 
108
  def to_dict(self):
109
  """Converts the Eval Result to a dict compatible with our dataframe display"""
 
103
  self.num_params = request.get("params", 0)
104
  self.date = request.get("submitted_time", "")
105
  except Exception:
106
+ print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
107
 
108
  def to_dict(self):
109
  """Converts the Eval Result to a dict compatible with our dataframe display"""