Clémentine committed on
Commit
24622c4
·
1 Parent(s): 55cc480

simplified the template

Browse files
README.md CHANGED
@@ -37,4 +37,3 @@ Request files are created automatically by this tool.
37
 
38
  If you encounter a problem on the space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
39
 
40
- If you want to run your own backend, you only need to change the logic in src/backend/run_eval_suite, which at the moment launches the Eleuther AI Harness.
 
37
 
38
  If you encounter a problem on the space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
39
 
 
app.py CHANGED
@@ -26,19 +26,14 @@ from src.display.utils import (
26
  WeightType,
27
  Precision
28
  )
29
- from src.envs import API, DEVICE, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
30
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
31
  from src.submission.submit import add_new_eval
32
 
33
 
34
- subprocess.run(["python", "scripts/fix_harness_import.py"])
35
-
36
  def restart_space():
37
  API.restart_space(repo_id=REPO_ID)
38
 
39
- def launch_backend():
40
- _ = subprocess.run(["python", "main_backend.py"])
41
-
42
  try:
43
  print(EVAL_REQUESTS_PATH)
44
  snapshot_download(
@@ -82,7 +77,7 @@ def update_table(
82
 
83
 
84
  def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
85
- return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
86
 
87
 
88
  def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
@@ -92,7 +87,7 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
92
  ]
93
  # We use COLS to maintain sorting
94
  filtered_df = df[
95
- always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
96
  ]
97
  return filtered_df
98
 
@@ -157,7 +152,7 @@ with demo:
157
  choices=[
158
  c.name
159
  for c in fields(AutoEvalColumn)
160
- if not c.hidden and not c.never_hidden and not c.dummy
161
  ],
162
  value=[
163
  c.name
@@ -200,7 +195,6 @@ with demo:
200
  value=leaderboard_df[
201
  [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
202
  + shown_columns.value
203
- + [AutoEvalColumn.dummy.name]
204
  ],
205
  headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
206
  datatype=TYPES,
@@ -309,7 +303,7 @@ with demo:
309
  choices=[i.value.name for i in Precision if i != Precision.Unknown],
310
  label="Precision",
311
  multiselect=False,
312
- value="float16" if DEVICE != "cpu" else "float32",
313
  interactive=True,
314
  )
315
  weight_type = gr.Dropdown(
@@ -348,6 +342,5 @@ with demo:
348
 
349
  scheduler = BackgroundScheduler()
350
  scheduler.add_job(restart_space, "interval", seconds=1800)
351
- scheduler.add_job(launch_backend, "interval", seconds=100) # will only allow one job to be run at the same time
352
  scheduler.start()
353
  demo.queue(default_concurrency_limit=40).launch()
 
26
  WeightType,
27
  Precision
28
  )
29
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
30
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
31
  from src.submission.submit import add_new_eval
32
 
33
 
 
 
34
  def restart_space():
35
  API.restart_space(repo_id=REPO_ID)
36
 
 
 
 
37
  try:
38
  print(EVAL_REQUESTS_PATH)
39
  snapshot_download(
 
77
 
78
 
79
  def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
80
+ return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
81
 
82
 
83
  def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
 
87
  ]
88
  # We use COLS to maintain sorting
89
  filtered_df = df[
90
+ always_here_cols + [c for c in COLS if c in df.columns and c in columns]
91
  ]
92
  return filtered_df
93
 
 
152
  choices=[
153
  c.name
154
  for c in fields(AutoEvalColumn)
155
+ if not c.hidden and not c.never_hidden
156
  ],
157
  value=[
158
  c.name
 
195
  value=leaderboard_df[
196
  [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
197
  + shown_columns.value
 
198
  ],
199
  headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
200
  datatype=TYPES,
 
303
  choices=[i.value.name for i in Precision if i != Precision.Unknown],
304
  label="Precision",
305
  multiselect=False,
306
+ value="float16",
307
  interactive=True,
308
  )
309
  weight_type = gr.Dropdown(
 
342
 
343
  scheduler = BackgroundScheduler()
344
  scheduler.add_job(restart_space, "interval", seconds=1800)
 
345
  scheduler.start()
346
  demo.queue(default_concurrency_limit=40).launch()
main_backend.py DELETED
@@ -1,78 +0,0 @@
1
- import logging
2
- import pprint
3
-
4
- from huggingface_hub import snapshot_download
5
-
6
- logging.getLogger("openai").setLevel(logging.WARNING)
7
-
8
- from src.backend.run_eval_suite import run_evaluation
9
- from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
10
- from src.backend.sort_queue import sort_models_by_priority
11
-
12
- from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, DEVICE, API, LIMIT, TOKEN
13
- from src.about import Tasks, NUM_FEWSHOT
14
- TASKS_HARNESS = [task.value.benchmark for task in Tasks]
15
-
16
- logging.basicConfig(level=logging.ERROR)
17
- pp = pprint.PrettyPrinter(width=80)
18
-
19
- PENDING_STATUS = "PENDING"
20
- RUNNING_STATUS = "RUNNING"
21
- FINISHED_STATUS = "FINISHED"
22
- FAILED_STATUS = "FAILED"
23
-
24
- snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
25
- snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
26
-
27
- def run_auto_eval():
28
- current_pending_status = [PENDING_STATUS]
29
-
30
- # pull the eval dataset from the hub and parse any eval requests
31
- # check completed evals and set them to finished
32
- check_completed_evals(
33
- api=API,
34
- checked_status=RUNNING_STATUS,
35
- completed_status=FINISHED_STATUS,
36
- failed_status=FAILED_STATUS,
37
- hf_repo=QUEUE_REPO,
38
- local_dir=EVAL_REQUESTS_PATH_BACKEND,
39
- hf_repo_results=RESULTS_REPO,
40
- local_dir_results=EVAL_RESULTS_PATH_BACKEND
41
- )
42
-
43
- # Get all eval request that are PENDING, if you want to run other evals, change this parameter
44
- eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
45
- # Sort the evals by priority (first submitted first run)
46
- eval_requests = sort_models_by_priority(api=API, models=eval_requests)
47
-
48
- print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
49
-
50
- if len(eval_requests) == 0:
51
- return
52
-
53
- eval_request = eval_requests[0]
54
- pp.pprint(eval_request)
55
-
56
- set_eval_request(
57
- api=API,
58
- eval_request=eval_request,
59
- set_to_status=RUNNING_STATUS,
60
- hf_repo=QUEUE_REPO,
61
- local_dir=EVAL_REQUESTS_PATH_BACKEND,
62
- )
63
-
64
- run_evaluation(
65
- eval_request=eval_request,
66
- task_names=TASKS_HARNESS,
67
- num_fewshot=NUM_FEWSHOT,
68
- local_dir=EVAL_RESULTS_PATH_BACKEND,
69
- results_repo=RESULTS_REPO,
70
- batch_size=1,
71
- device=DEVICE,
72
- no_cache=True,
73
- limit=LIMIT
74
- )
75
-
76
-
77
- if __name__ == "__main__":
78
- run_auto_eval()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/create_request_file.py DELETED
@@ -1,105 +0,0 @@
1
- import json
2
- import os
3
- import pprint
4
- import re
5
- from datetime import datetime, timezone
6
-
7
- import click
8
- from colorama import Fore
9
- from huggingface_hub import HfApi, snapshot_download
10
- from src.envs import TOKEN, EVAL_REQUESTS_PATH, QUEUE_REPO
11
-
12
- precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ", "float32")
13
- model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
14
- weight_types = ("Original", "Delta", "Adapter")
15
-
16
-
17
- def get_model_size(model_info, precision: str):
18
- size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
19
- try:
20
- model_size = round(model_info.safetensors["total"] / 1e9, 3)
21
- except (AttributeError, TypeError):
22
- try:
23
- size_match = re.search(size_pattern, model_info.modelId.lower())
24
- model_size = size_match.group(0)
25
- model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
26
- except AttributeError:
27
- return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
28
-
29
- size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
30
- model_size = size_factor * model_size
31
- return model_size
32
-
33
-
34
- def main():
35
- api = HfApi()
36
- current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
37
- snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN)
38
-
39
- model_name = click.prompt("Enter model name")
40
- revision = click.prompt("Enter revision", default="main")
41
- precision = click.prompt("Enter precision", default="float16", type=click.Choice(precisions))
42
- model_type = click.prompt("Enter model type", type=click.Choice(model_types))
43
- weight_type = click.prompt("Enter weight type", default="Original", type=click.Choice(weight_types))
44
- base_model = click.prompt("Enter base model", default="")
45
- status = click.prompt("Enter status", default="FINISHED")
46
-
47
- try:
48
- model_info = api.model_info(repo_id=model_name, revision=revision)
49
- except Exception as e:
50
- print(f"{Fore.RED}Could not find model info for {model_name} on the Hub\n{e}{Fore.RESET}")
51
- return 1
52
-
53
- model_size = get_model_size(model_info=model_info, precision=precision)
54
-
55
- try:
56
- license = model_info.cardData["license"]
57
- except Exception:
58
- license = "?"
59
-
60
- eval_entry = {
61
- "model": model_name,
62
- "base_model": base_model,
63
- "revision": revision,
64
- "private": False,
65
- "precision": precision,
66
- "weight_type": weight_type,
67
- "status": status,
68
- "submitted_time": current_time,
69
- "model_type": model_type,
70
- "likes": model_info.likes,
71
- "params": model_size,
72
- "license": license,
73
- }
74
-
75
- user_name = ""
76
- model_path = model_name
77
- if "/" in model_name:
78
- user_name = model_name.split("/")[0]
79
- model_path = model_name.split("/")[1]
80
-
81
- pprint.pprint(eval_entry)
82
-
83
- if click.confirm("Do you want to continue? This request file will be pushed to the hub"):
84
- click.echo("continuing...")
85
-
86
- out_dir = f"{EVAL_REQUESTS_PATH}/{user_name}"
87
- os.makedirs(out_dir, exist_ok=True)
88
- out_path = f"{out_dir}/{model_path}_eval_request_{False}_{precision}_{weight_type}.json"
89
-
90
- with open(out_path, "w") as f:
91
- f.write(json.dumps(eval_entry))
92
-
93
- api.upload_file(
94
- path_or_fileobj=out_path,
95
- path_in_repo=out_path.split(f"{EVAL_REQUESTS_PATH}/")[1],
96
- repo_id=QUEUE_REPO,
97
- repo_type="dataset",
98
- commit_message=f"Add {model_name} to eval queue",
99
- )
100
- else:
101
- click.echo("aborting...")
102
-
103
-
104
- if __name__ == "__main__":
105
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/fix_harness_import.py DELETED
@@ -1,11 +0,0 @@
1
- """This file should be used after pip install -r requirements.
2
- It creates a folder not ported during harness package creation (as they don't use a Manifest file atm and it ignore `.json` files).
3
- It will need to be updated if we want to use the harness' version of big bench to actually copy the json files.
4
- """
5
- import os
6
-
7
- import lm_eval
8
-
9
- if __name__ == "__main__":
10
- lm_eval_path = lm_eval.__path__[0]
11
- os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)
 
 
 
 
 
 
 
 
 
 
 
 
src/backend/manage_requests.py DELETED
@@ -1,122 +0,0 @@
1
- import glob
2
- import json
3
- from dataclasses import dataclass
4
- from typing import Optional
5
-
6
- from huggingface_hub import HfApi, snapshot_download
7
- from src.envs import TOKEN
8
-
9
- @dataclass
10
- class EvalRequest:
11
- model: str
12
- private: bool
13
- status: str
14
- json_filepath: str
15
- weight_type: str = "Original"
16
- model_type: str = "" # pretrained, finetuned, with RL
17
- precision: str = "" # float16, bfloat16
18
- base_model: Optional[str] = None # for adapter models
19
- revision: str = "main" # commit
20
- submitted_time: Optional[str] = "2022-05-18T11:40:22.519222" # random date just so that we can still order requests by date
21
- model_type: Optional[str] = None
22
- likes: Optional[int] = 0
23
- params: Optional[int] = None
24
- license: Optional[str] = ""
25
-
26
- def get_model_args(self):
27
- model_args = f"pretrained={self.model},revision={self.revision}"
28
-
29
- if self.precision in ["float16", "bfloat16", "float32"]:
30
- model_args += f",dtype={self.precision}"
31
- # Quantized models need some added config, the install of bits and bytes, etc
32
- #elif self.precision == "8bit":
33
- # model_args += ",load_in_8bit=True"
34
- #elif self.precision == "4bit":
35
- # model_args += ",load_in_4bit=True"
36
- #elif self.precision == "GPTQ":
37
- # A GPTQ model does not need dtype to be specified,
38
- # it will be inferred from the config
39
- pass
40
- else:
41
- raise Exception(f"Unknown precision {self.precision}.")
42
-
43
- return model_args
44
-
45
-
46
- def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str):
47
- """Updates a given eval request with its new status on the hub (running, completed, failed, ...)"""
48
- json_filepath = eval_request.json_filepath
49
-
50
- with open(json_filepath) as fp:
51
- data = json.load(fp)
52
-
53
- data["status"] = set_to_status
54
-
55
- with open(json_filepath, "w") as f:
56
- f.write(json.dumps(data))
57
-
58
- api.upload_file(
59
- path_or_fileobj=json_filepath,
60
- path_in_repo=json_filepath.replace(local_dir, ""),
61
- repo_id=hf_repo,
62
- repo_type="dataset",
63
- )
64
-
65
-
66
- def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
67
- """Get all pending evaluation requests and return a list in which private
68
- models appearing first, followed by public models sorted by the number of
69
- likes.
70
-
71
- Returns:
72
- `list[EvalRequest]`: a list of model info dicts.
73
- """
74
- snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60, token=TOKEN)
75
- json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
76
-
77
- eval_requests = []
78
- for json_filepath in json_files:
79
- with open(json_filepath) as fp:
80
- data = json.load(fp)
81
- if data["status"] in job_status:
82
- data["json_filepath"] = json_filepath
83
- eval_request = EvalRequest(**data)
84
- eval_requests.append(eval_request)
85
-
86
- return eval_requests
87
-
88
-
89
- def check_completed_evals(
90
- api: HfApi,
91
- hf_repo: str,
92
- local_dir: str,
93
- checked_status: str,
94
- completed_status: str,
95
- failed_status: str,
96
- hf_repo_results: str,
97
- local_dir_results: str,
98
- ):
99
- """Checks if the currently running evals are completed, if yes, update their status on the hub."""
100
- snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results, repo_type="dataset", max_workers=60, token=TOKEN)
101
-
102
- running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
103
-
104
- for eval_request in running_evals:
105
- model = eval_request.model
106
- print("====================================")
107
- print(f"Checking {model}")
108
-
109
- output_path = model
110
- output_file = f"{local_dir_results}/{output_path}/results*.json"
111
- output_file_exists = len(glob.glob(output_file)) > 0
112
-
113
- if output_file_exists:
114
- print(
115
- f"EXISTS output file exists for {model} setting it to {completed_status}"
116
- )
117
- set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
118
- else:
119
- print(
120
- f"No result file found for {model} setting it to {failed_status}"
121
- )
122
- set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/backend/run_eval_suite.py DELETED
@@ -1,57 +0,0 @@
1
- import json
2
- import os
3
- import logging
4
- from datetime import datetime
5
-
6
- from lm_eval import tasks, evaluator, utils
7
-
8
- from src.envs import RESULTS_REPO, API
9
- from src.backend.manage_requests import EvalRequest
10
-
11
- logging.getLogger("openai").setLevel(logging.WARNING)
12
-
13
- def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, local_dir: str, results_repo: str, no_cache=True, limit=None):
14
- if limit:
15
- print(
16
- "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
17
- )
18
-
19
- task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)
20
-
21
- print(f"Selected Tasks: {task_names}")
22
-
23
- results = evaluator.simple_evaluate(
24
- model="hf-causal-experimental", # "hf-causal"
25
- model_args=eval_request.get_model_args(),
26
- tasks=task_names,
27
- num_fewshot=num_fewshot,
28
- batch_size=batch_size,
29
- device=device,
30
- no_cache=no_cache,
31
- limit=limit,
32
- write_out=True,
33
- output_base_path="logs"
34
- )
35
-
36
- results["config"]["model_dtype"] = eval_request.precision
37
- results["config"]["model_name"] = eval_request.model
38
- results["config"]["model_sha"] = eval_request.revision
39
-
40
- dumped = json.dumps(results, indent=2)
41
- print(dumped)
42
-
43
- output_path = os.path.join(local_dir, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
44
- os.makedirs(os.path.dirname(output_path), exist_ok=True)
45
- with open(output_path, "w") as f:
46
- f.write(dumped)
47
-
48
- print(evaluator.make_table(results))
49
-
50
- API.upload_file(
51
- path_or_fileobj=output_path,
52
- path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
53
- repo_id=results_repo,
54
- repo_type="dataset",
55
- )
56
-
57
- return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/backend/sort_queue.py DELETED
@@ -1,28 +0,0 @@
1
- import re
2
- from dataclasses import dataclass
3
-
4
- from huggingface_hub import HfApi
5
-
6
- from src.backend.manage_requests import EvalRequest
7
-
8
-
9
- @dataclass
10
- class ModelMetadata:
11
- likes: int = 0
12
- size: int = 15
13
-
14
-
15
- def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
16
- private_models = [model for model in models if model.private]
17
- public_models = [model for model in models if not model.private]
18
-
19
- return sort_by_submit_date(private_models) + sort_by_submit_date(public_models)
20
-
21
- def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
22
- return sorted(eval_requests, key=lambda x: x.submitted_time, reverse=False)
23
-
24
- def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
25
- return sorted(eval_requests, key=lambda x: x.params, reverse=False)
26
-
27
- def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
28
- return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/display/css_html_js.py CHANGED
@@ -38,12 +38,6 @@ custom_css = """
38
  padding: 0px;
39
  }
40
 
41
- /* Hides the final AutoEvalColumn */
42
- #llm-benchmark-tab-table table td:last-child,
43
- #llm-benchmark-tab-table table th:last-child {
44
- display: none;
45
- }
46
-
47
  /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
48
  table td:first-child,
49
  table th:first-child {
 
38
  padding: 0px;
39
  }
40
 
 
 
 
 
 
 
41
  /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
42
  table td:first-child,
43
  table th:first-child {
src/display/utils.py CHANGED
@@ -19,7 +19,6 @@ class ColumnContent:
19
  displayed_by_default: bool
20
  hidden: bool = False
21
  never_hidden: bool = False
22
- dummy: bool = False
23
 
24
  ## Leaderboard columns
25
  auto_eval_column_dict = []
@@ -40,8 +39,6 @@ auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B
40
  auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
41
  auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
42
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
43
- # Dummy column for the search bar (hidden by the custom CSS)
44
- auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
45
 
46
  # We use make dataclass to dynamically fill the scores from Tasks
47
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
19
  displayed_by_default: bool
20
  hidden: bool = False
21
  never_hidden: bool = False
 
22
 
23
  ## Leaderboard columns
24
  auto_eval_column_dict = []
 
39
  auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
40
  auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
41
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 
42
 
43
  # We use make dataclass to dynamically fill the scores from Tasks
44
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
src/envs.py CHANGED
@@ -6,9 +6,7 @@ from huggingface_hub import HfApi
6
  # ----------------------------------
7
  TOKEN = os.environ.get("TOKEN") # A read/write token for your org
8
 
9
- OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request file
10
- DEVICE = "cpu" # "cuda:0" if you add compute
11
- LIMIT = 20 # !!!! Should be None for actual evaluations!!!
12
  # ----------------------------------
13
 
14
  REPO_ID = f"{OWNER}/leaderboard"
 
6
  # ----------------------------------
7
  TOKEN = os.environ.get("TOKEN") # A read/write token for your org
8
 
9
+ OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 
 
10
  # ----------------------------------
11
 
12
  REPO_ID = f"{OWNER}/leaderboard"
src/leaderboard/read_evals.py CHANGED
@@ -116,7 +116,6 @@ class EvalResult:
116
  AutoEvalColumn.weight_type.name: self.weight_type.value.name,
117
  AutoEvalColumn.architecture.name: self.architecture,
118
  AutoEvalColumn.model.name: make_clickable_model(self.full_model),
119
- AutoEvalColumn.dummy.name: self.full_model,
120
  AutoEvalColumn.revision.name: self.revision,
121
  AutoEvalColumn.average.name: average,
122
  AutoEvalColumn.license.name: self.license,
 
116
  AutoEvalColumn.weight_type.name: self.weight_type.value.name,
117
  AutoEvalColumn.architecture.name: self.architecture,
118
  AutoEvalColumn.model.name: make_clickable_model(self.full_model),
 
119
  AutoEvalColumn.revision.name: self.revision,
120
  AutoEvalColumn.average.name: average,
121
  AutoEvalColumn.license.name: self.license,