Quentin Gallouédec committed
Commit 0811d37
1 Parent(s): 6b9db30

works with cartpole!

app.py CHANGED
@@ -1,5 +1,6 @@
 import logging
 from src.logging import configure_root_logger
+
 logging.getLogger("httpx").setLevel(logging.WARNING)
 logging.getLogger("numexpr").setLevel(logging.WARNING)
 logging.getLogger("absl").setLevel(logging.WARNING)
@@ -8,7 +9,7 @@ configure_root_logger()
 from functools import partial
 
 import gradio as gr
-from main_backend_lighteval import run_auto_eval
+from main_backend_harness import run_auto_eval
 from src.display.log_visualizer import log_file_to_html_string
 from src.display.css_html_js import dark_mode_gradio_js
 from src.envs import REFRESH_RATE, REPO_ID, QUEUE_REPO, RESULTS_REPO
@@ -32,6 +33,7 @@ links_md = f"""
 | Results Repo | [{RESULTS_REPO}](https://huggingface.co/datasets/{RESULTS_REPO}) |
 """
 
+
 def button_auto_eval():
     logger.info("Manually triggering Auto Eval")
     run_auto_eval()
@@ -45,7 +47,7 @@ with gr.Blocks(js=dark_mode_gradio_js) as demo:
     output_html = gr.HTML(partial(log_file_to_html_string, reverse=reverse_order_checkbox), every=1)
     with gr.Row():
         download_button = gr.DownloadButton("Download Log File", value=log_file)
-        with gr.Accordion('Log View Configuration', open=False):
+        with gr.Accordion("Log View Configuration", open=False):
             reverse_order_checkbox.render()
     # Add a button that when pressed, triggers run_auto_eval
     button = gr.Button("Manually Run Evaluation")
@@ -56,5 +58,5 @@ with gr.Blocks(js=dark_mode_gradio_js) as demo:
     button.click(fn=button_auto_eval, inputs=[], outputs=[])
 
 
-if __name__ == '__main__':
-    demo.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0", show_error=True, server_port=7860)
+if __name__ == "__main__":
+    demo.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0", show_error=True, server_port=7860)
custom_tasks.py DELETED
@@ -1,90 +0,0 @@
-# ruff: noqa: F405, F403, F401
-"""
-Custom evaluation tasks for lighteval. Copy this file and complete it with the info for your task.
-
-This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
-
-Author:
-"""
-from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import Doc
-from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
-
-
-## EVAL WITH NO SUBSET ##
-# This is how you create a simple tasks (like hellaswag) which has one single subset
-# attached to it, and one evaluation possible.
-task = LightevalTaskConfig(
-    name="myothertask",
-    prompt_function="prompt_fn",  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
-    suite=["community"],
-    hf_repo="",
-    hf_subset="default",
-    hf_avail_splits=[],
-    evaluation_splits=[],
-    few_shots_split="",
-    few_shots_select="",
-    metric=[""],
-)
-
-## EVALS WITH SUBSET
-# This is how you create a subset task (like MMLU), which has several subset
-# each being its own evaluation task.
-
-# fmt: off
-SAMPLE_SUBSETS = []  # list of all the subsets to use for this eval
-# fmt: on
-
-
-class CustomSubsetTask(LightevalTaskConfig):
-    def __init__(
-        self,
-        name,
-        hf_subset,
-    ):
-        super().__init__(
-            name=name,
-            hf_subset=hf_subset,
-            prompt_function="prompt_fn",  # must be defined in the file
-            hf_repo="",
-            metric=[""],
-            hf_avail_splits=[],
-            evaluation_splits=[],
-            few_shots_split="",
-            few_shots_select="",
-            suite=["community"],
-            generation_size=-1,
-            stop_sequence=None,
-            output_regex=None,
-            frozen=False,
-        )
-
-
-## DEFINE YOUR PROMPT FUNCTIONS
-# Define as many as you need for your different tasks
-def prompt_fn(line, task_name: str = None):
-    """Defines how to go from a dataset line to a doc object.
-    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
-    about what this function should do in the README.
-    """
-    return Doc(
-        task_name=task_name,
-        query="",
-        choices="",
-        gold_index=0,
-        instruction="",
-    )
-
-
-## STORE YOUR EVALS
-SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS]
-_TASKS = SUBSET_TASKS + [task]
-
-## MODULE LOGIC
-# You should not need to touch this
-# Convert to dict for lighteval
-TASKS_TABLE = [task.as_dict() for task in _TASKS]
-
-if __name__ == "__main__":
-    print(t["name"] for t in TASKS_TABLE)
-    print(len(TASKS_TABLE))
main_backend_harness.py CHANGED
@@ -5,13 +5,23 @@ from huggingface_hub import snapshot_download
 
 logging.getLogger("openai").setLevel(logging.WARNING)
 
-from backend.run_eval_suite_harness import run_evaluation
+from src.backend.run_eval_suite_harness import run_evaluation
 from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
 from src.backend.sort_queue import sort_models_by_priority
 
-from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, DEVICE, API, LIMIT, TOKEN
+from src.envs import (
+    QUEUE_REPO,
+    EVAL_REQUESTS_PATH_BACKEND,
+    RESULTS_REPO,
+    EVAL_RESULTS_PATH_BACKEND,
+    DEVICE,
+    API,
+    LIMIT,
+    TOKEN,
+)
 from src.about import Tasks, NUM_FEWSHOT
 from src.logging import setup_logger
+
 TASKS_HARNESS = [task.value.benchmark for task in Tasks]
 
 # logging.basicConfig(level=logging.ERROR)
@@ -23,8 +33,23 @@ RUNNING_STATUS = "RUNNING"
 FINISHED_STATUS = "FINISHED"
 FAILED_STATUS = "FAILED"
 
-snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
+snapshot_download(
+    repo_id=RESULTS_REPO,
+    revision="main",
+    local_dir=EVAL_RESULTS_PATH_BACKEND,
+    repo_type="dataset",
+    max_workers=60,
+    token=TOKEN,
+)
+snapshot_download(
+    repo_id=QUEUE_REPO,
+    revision="main",
+    local_dir=EVAL_REQUESTS_PATH_BACKEND,
+    repo_type="dataset",
+    max_workers=60,
+    token=TOKEN,
+)
+
 
 def run_auto_eval():
     current_pending_status = [PENDING_STATUS]
@@ -39,11 +64,13 @@ def run_auto_eval():
         hf_repo=QUEUE_REPO,
         local_dir=EVAL_REQUESTS_PATH_BACKEND,
         hf_repo_results=RESULTS_REPO,
-        local_dir_results=EVAL_RESULTS_PATH_BACKEND
+        local_dir_results=EVAL_RESULTS_PATH_BACKEND,
     )
 
     # Get all eval request that are PENDING, if you want to run other evals, change this parameter
-    eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
+    eval_requests = get_eval_requests(
+        job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
+    )
     # Sort the evals by priority (first submitted first run)
     eval_requests = sort_models_by_priority(api=API, models=eval_requests)
 
@@ -64,17 +91,12 @@ def run_auto_eval():
     )
 
     run_evaluation(
-        eval_request=eval_request,
-        task_names=TASKS_HARNESS,
-        num_fewshot=NUM_FEWSHOT,
+        eval_request=eval_request,
+        task_names=TASKS_HARNESS,
         local_dir=EVAL_RESULTS_PATH_BACKEND,
         results_repo=RESULTS_REPO,
-        batch_size=1,
-        device=DEVICE,
-        no_cache=True,
-        limit=LIMIT
-    )
+    )
 
 
 if __name__ == "__main__":
-    run_auto_eval()
+    run_auto_eval()
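
Note: run_auto_eval() performs a single pass — sync the queue and results datasets, flip finished RUNNING requests, then evaluate the oldest PENDING request. A minimal sketch, not part of this commit, of driving that pass on a timer with APScheduler (which requirements.txt already pins); REFRESH_RATE comes from src.envs, the rest of the wiring is an assumption about deployment.

# Hypothetical scheduling loop (not in this commit): re-run the backend pass
# every REFRESH_RATE seconds using APScheduler's blocking scheduler.
from apscheduler.schedulers.blocking import BlockingScheduler

from main_backend_harness import run_auto_eval
from src.envs import REFRESH_RATE

scheduler = BlockingScheduler()
scheduler.add_job(run_auto_eval, "interval", seconds=REFRESH_RATE)  # one queue pass per tick
scheduler.start()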
main_backend_lighteval.py DELETED
@@ -1,92 +0,0 @@
-import logging
-import pprint
-
-from huggingface_hub import snapshot_download
-
-logging.getLogger("openai").setLevel(logging.WARNING)
-
-from src.backend.run_eval_suite_lighteval import run_evaluation
-from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
-from src.backend.sort_queue import sort_models_by_priority
-
-from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, API, LIMIT, TOKEN, ACCELERATOR, VENDOR, REGION
-from src.about import TASKS_LIGHTEVAL
-from src.logging import setup_logger
-
-logger = setup_logger(__name__)
-
-# logging.basicConfig(level=logging.ERROR)
-pp = pprint.PrettyPrinter(width=80)
-
-PENDING_STATUS = "PENDING"
-RUNNING_STATUS = "RUNNING"
-FINISHED_STATUS = "FINISHED"
-FAILED_STATUS = "FAILED"
-
-snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-
-def run_auto_eval():
-    current_pending_status = [PENDING_STATUS]
-
-    # pull the eval dataset from the hub and parse any eval requests
-    # check completed evals and set them to finished
-    check_completed_evals(
-        api=API,
-        checked_status=RUNNING_STATUS,
-        completed_status=FINISHED_STATUS,
-        failed_status=FAILED_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-        hf_repo_results=RESULTS_REPO,
-        local_dir_results=EVAL_RESULTS_PATH_BACKEND
-    )
-
-    # Get all eval request that are PENDING, if you want to run other evals, change this parameter
-    eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
-    # Sort the evals by priority (first submitted first run)
-    eval_requests = sort_models_by_priority(api=API, models=eval_requests)
-
-    logger.info(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
-
-    if len(eval_requests) == 0:
-        return
-
-    eval_request = eval_requests[0]
-    logger.info(pp.pformat(eval_request))
-
-
-    set_eval_request(
-        api=API,
-        eval_request=eval_request,
-        set_to_status=RUNNING_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-    )
-
-    # This needs to be done
-    #instance_size, instance_type = get_instance_for_model(eval_request)
-    # For GPU
-    # instance_size, instance_type = "small", "g4dn.xlarge"
-    # For CPU
-    instance_size, instance_type = "medium", "c6i"
-    logger.info(f'Starting Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}')
-
-    run_evaluation(
-        eval_request=eval_request,
-        task_names=TASKS_LIGHTEVAL,
-        local_dir=EVAL_RESULTS_PATH_BACKEND,
-        batch_size=1,
-        accelerator=ACCELERATOR,
-        region=REGION,
-        vendor=VENDOR,
-        instance_size=instance_size,
-        instance_type=instance_type,
-        limit=LIMIT
-    )
-
-    logger.info(f'Completed Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}')
-
-
-if __name__ == "__main__":
-    run_auto_eval()
requirements.txt CHANGED
@@ -2,8 +2,9 @@ APScheduler==3.10.1
 black==23.11.0
 click==8.1.3
 datasets==2.14.5
-gradio==4.4.0 # will have to move to 4.19.2
+gradio==4.25.0
 gradio_client
+gymnasium==0.29.1
 huggingface-hub>=0.18.0
 matplotlib==3.7.1
 numpy==1.24.2
@@ -11,16 +12,6 @@ pandas==2.0.0
 python-dateutil==2.8.2
 requests==2.28.2
 tqdm==4.65.0
-transformers
-tokenizers>=0.15.0
-git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
-git+https://github.com/huggingface/lighteval.git#egg=lighteval
-accelerate==0.24.1
-sentencepiece
-
-# Evaluation suites
-lighteval
-lm_eval
 
 # Log Visualizer
 BeautifulSoup4==4.12.2
scripts/create_request_file.py CHANGED
@@ -1,7 +1,6 @@
 import json
 import os
 import pprint
-import re
 from datetime import datetime, timezone
 
 import click
@@ -9,39 +8,16 @@ from colorama import Fore
 from huggingface_hub import HfApi, snapshot_download
 from src.envs import TOKEN, EVAL_REQUESTS_PATH, QUEUE_REPO
 
-precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ", "float32")
-model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
-weight_types = ("Original", "Delta", "Adapter")
-
-
-def get_model_size(model_info, precision: str):
-    size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
-    try:
-        model_size = round(model_info.safetensors["total"] / 1e9, 3)
-    except (AttributeError, TypeError):
-        try:
-            size_match = re.search(size_pattern, model_info.modelId.lower())
-            model_size = size_match.group(0)
-            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
-        except AttributeError:
-            return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
-
-    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
-    model_size = size_factor * model_size
-    return model_size
-
 
 def main():
     api = HfApi()
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN)
+    snapshot_download(
+        repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN
+    )
 
     model_name = click.prompt("Enter model name")
     revision = click.prompt("Enter revision", default="main")
-    precision = click.prompt("Enter precision", default="float16", type=click.Choice(precisions))
-    model_type = click.prompt("Enter model type", type=click.Choice(model_types))
-    weight_type = click.prompt("Enter weight type", default="Original", type=click.Choice(weight_types))
-    base_model = click.prompt("Enter base model", default="")
     status = click.prompt("Enter status", default="FINISHED")
 
     try:
@@ -50,8 +26,6 @@ def main():
         print(f"{Fore.RED}Could not find model info for {model_name} on the Hub\n{e}{Fore.RESET}")
         return 1
 
-    model_size = get_model_size(model_info=model_info, precision=precision)
-
     try:
         license = model_info.cardData["license"]
     except Exception:
@@ -59,16 +33,10 @@ def main():
 
     eval_entry = {
         "model": model_name,
-        "base_model": base_model,
         "revision": revision,
-        "private": False,
-        "precision": precision,
-        "weight_type": weight_type,
        "status": status,
         "submitted_time": current_time,
-        "model_type": model_type,
        "likes": model_info.likes,
-        "params": model_size,
         "license": license,
     }
 
@@ -85,7 +53,7 @@ def main():
 
     out_dir = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(out_dir, exist_ok=True)
-    out_path = f"{out_dir}/{model_path}_eval_request_{False}_{precision}_{weight_type}.json"
+    out_path = f"{out_dir}/{model_path}_eval_request.json"
 
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
scripts/fix_harness_import.py CHANGED
@@ -8,4 +8,4 @@ import lm_eval
 
 if __name__ == "__main__":
     lm_eval_path = lm_eval.__path__[0]
-    os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)
+    os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)
src/about.py CHANGED
@@ -1,6 +1,7 @@
 from dataclasses import dataclass
 from enum import Enum
 
+
 @dataclass
 class Task:
     benchmark: str
@@ -11,14 +12,16 @@ class Task:
 # Change for your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    # task0 = Task("PongNoFrameskip-v4", "episodic_return", "PongNoFrameskip-v4")
+    task1 = Task("BreakoutNoFrameskip-v4", "episodic_return", "BreakoutNoFrameskip-v4")
+    task2 = Task("CartPole-v1", "episodic_return", "CartPole-v1")
+
 
-NUM_FEWSHOT = 0 # Change with your few shot
+NUM_FEWSHOT = 0  # Change with your few shot
 
 TASKS_HARNESS = [task.value.benchmark for task in Tasks]
 # ---------------------------------------------------
 
-TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
-#custom|myothertask|0|0
+TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
+# custom|myothertask|0|0
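
Note: the enum above feeds the module-level TASKS_HARNESS list, so the backend now iterates over plain gymnasium environment IDs. A small worked example of what that list resolves to once the module is imported (the commented-out Pong entry is excluded; the assertion is only illustrative):

# Worked example: the benchmark list derived from the Tasks enum above.
from src.about import TASKS_HARNESS, Tasks

assert TASKS_HARNESS == [task.value.benchmark for task in Tasks]
print(TASKS_HARNESS)  # ['BreakoutNoFrameskip-v4', 'CartPole-v1']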
src/backend/manage_requests.py CHANGED
@@ -9,41 +9,18 @@ from src.logging import setup_logger
 
 logger = setup_logger(__name__)
 
+
 @dataclass
 class EvalRequest:
     model: str
-    private: bool
     status: str
     json_filepath: str
-    weight_type: str = "Original"
-    model_type: str = ""  # pretrained, finetuned, with RL
-    precision: str = ""  # float16, bfloat16
-    base_model: Optional[str] = None  # for adapter models
-    revision: str = "main"  # commit
-    submitted_time: Optional[str] = "2022-05-18T11:40:22.519222"  # random date just so that we can still order requests by date
-    model_type: Optional[str] = None
+    revision: str = "main"  # commit
+    submitted_time: Optional[
+        str
+    ] = "2022-05-18T11:40:22.519222"  # random date just so that we can still order requests by date
     likes: Optional[int] = 0
-    params: Optional[int] = None
     license: Optional[str] = ""
-
-    def get_model_args(self):
-        model_args = f"pretrained={self.model},revision={self.revision}"
-
-        if self.precision in ["float16", "bfloat16", "float32"]:
-            model_args += f",dtype={self.precision}"
-        # Quantized models need some added config, the install of bits and bytes, etc
-        #elif self.precision == "8bit":
-        #    model_args += ",load_in_8bit=True"
-        #elif self.precision == "4bit":
-        #    model_args += ",load_in_4bit=True"
-        #elif self.precision == "GPTQ":
-        # A GPTQ model does not need dtype to be specified,
-        # it will be inferred from the config
-            pass
-        else:
-            raise Exception(f"Unknown precision {self.precision}.")
-
-        return model_args
 
 
 def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str):
@@ -74,7 +51,9 @@ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[Ev
     Returns:
         `list[EvalRequest]`: a list of model info dicts.
     """
-    snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60, token=TOKEN)
+    snapshot_download(
+        repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60, token=TOKEN
+    )
     json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
 
     eval_requests = []
@@ -100,7 +79,14 @@ def check_completed_evals(
     local_dir_results: str,
 ):
     """Checks if the currently running evals are completed, if yes, update their status on the hub."""
-    snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results, repo_type="dataset", max_workers=60, token=TOKEN)
+    snapshot_download(
+        repo_id=hf_repo_results,
+        revision="main",
+        local_dir=local_dir_results,
+        repo_type="dataset",
+        max_workers=60,
+        token=TOKEN,
+    )
 
     running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
 
@@ -114,12 +100,8 @@ def check_completed_evals(
         output_file_exists = len(glob.glob(output_file)) > 0
 
         if output_file_exists:
-            logger.info(
-                f"EXISTS output file exists for {model} setting it to {completed_status}"
-            )
+            logger.info(f"EXISTS output file exists for {model} setting it to {completed_status}")
             set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
         else:
-            logger.info(
-                f"No result file found for {model} setting it to {failed_status}"
-            )
+            logger.info(f"No result file found for {model} setting it to {failed_status}")
             set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
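
Note: with the precision/weight-type/size fields gone, a queue entry only needs the fields the RL backend actually uses. A hypothetical illustration (all values invented, the file path is made up) of how such a JSON entry maps onto the slimmed-down dataclass:

# Hypothetical queue entry; the remaining keys map one-to-one onto the
# EvalRequest fields defined above.
import json

from src.backend.manage_requests import EvalRequest

raw = json.loads(
    '{"model": "someuser/dqn-CartPole-v1", "revision": "main", "status": "PENDING",'
    ' "submitted_time": "2024-03-20T12:00:00Z", "likes": 0, "license": "mit"}'
)
request = EvalRequest(json_filepath="local/path/to/that/entry.json", **raw)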
src/backend/run_eval_suite_harness.py CHANGED
@@ -3,41 +3,76 @@ import os
 import logging
 from datetime import datetime
 
-from lm_eval import tasks, evaluator, utils
-
 from src.envs import RESULTS_REPO, API
 from src.backend.manage_requests import EvalRequest
 from src.logging import setup_logger
+from src.backend.evaluate import run_evaluation
+import fnmatch
+import torch
+from torch import nn
+from huggingface_hub.utils._errors import EntryNotFoundError
+
+import gymnasium as gym
+
+
+import numpy as np
+from typing import List
+from huggingface_hub import hf_hub_download
+from src.backend.manage_requests import EvalRequest
 
 logging.getLogger("openai").setLevel(logging.WARNING)
 logger = setup_logger(__name__)
 
-def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, local_dir: str, results_repo: str, no_cache=True, limit=None):
-    if limit:
-        logger.info(
-            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
-        )
 
-    task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)
+def pattern_match(patterns, source_list):
+    if isinstance(patterns, str):
+        patterns = [patterns]
+
+    task_names = set()
+    for pattern in patterns:
+        for matching in fnmatch.filter(source_list, pattern):
+            task_names.add(matching)
+    return sorted(list(task_names))
+
+
+def run_evaluation(eval_request: EvalRequest, task_names, local_dir: str, results_repo: str):
+    tags = API.model_info(eval_request.model).tags
+    task_names = pattern_match(tags, task_names)
 
     logger.info(f"Selected Tasks: {task_names}")
 
-    results = evaluator.simple_evaluate(
-        model="hf-causal-experimental", # "hf-causal"
-        model_args=eval_request.get_model_args(),
-        tasks=task_names,
-        num_fewshot=num_fewshot,
-        batch_size=batch_size,
-        device=device,
-        no_cache=no_cache,
-        limit=limit,
-        write_out=True,
-        output_base_path="logs"
-    )
+    results = {
+        "config": {
+            "model_name": eval_request.model,
+            "model_sha": eval_request.revision,
+        },
+        "results": {},
+    }
+    try:
+        agent_path = hf_hub_download(repo_id=eval_request.model, filename="agent.pt")
+    except EntryNotFoundError:
+        logger.error("Agent not found")
+        return
+    agent = torch.jit.load(agent_path)
 
-    results["config"]["model_dtype"] = eval_request.precision
-    results["config"]["model_name"] = eval_request.model
-    results["config"]["model_sha"] = eval_request.revision
+    episodic_rewards = []
+    for task_name in task_names:
+        env = gym.make(task_name)
+        for _ in range(10):
+            episodic_reward = 0.0
+            observation, info = env.reset()
+            done = False
+            while not done:
+                torch_observation = torch.from_numpy(np.array([observation]))
+                action = agent(torch_observation).numpy()[0]
+                observation, reward, terminated, truncated, info = env.step(action)
+                done = terminated or truncated
+                episodic_reward += reward
+
+            episodic_rewards.append(episodic_reward)
+
+        mean_reward = np.mean(episodic_rewards)
+        results[task_name] = {"episodic_return": mean_reward}
 
     dumped = json.dumps(results, indent=2)
     logger.info(dumped)
@@ -47,8 +82,6 @@ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_siz
     with open(output_path, "w") as f:
         f.write(dumped)
 
-    logger.info(evaluator.make_table(results))
-
     API.upload_file(
        path_or_fileobj=output_path,
        path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
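
Note: the evaluator above expects the model repository to contain an agent.pt file — a TorchScript module that takes a (batch, obs_dim) float tensor and returns one action per row, since agent(obs).numpy()[0] is passed straight to env.step. A minimal sketch, not from this repo, of exporting a compatible (untrained, purely illustrative) agent for CartPole-v1:

# Minimal sketch of producing an agent.pt the evaluator can load. The network
# and its weights are illustrative placeholders, not a trained policy.
import torch
from torch import nn


class CartPolePolicy(nn.Module):
    def __init__(self):
        super().__init__()
        # CartPole-v1: 4-dimensional observation, 2 discrete actions
        self.net = nn.Sequential(nn.Linear(4, 64), nn.Tanh(), nn.Linear(64, 2))

    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        # Return the greedy action index for each observation in the batch
        return self.net(observations).argmax(dim=-1)


torch.jit.script(CartPolePolicy()).save("agent.pt")
# Upload agent.pt to the model repository that gets submitted to the queue.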
src/backend/run_eval_suite_lighteval.py DELETED
@@ -1,72 +0,0 @@
-import json
-import argparse
-import logging
-from datetime import datetime
-
-from lighteval.main_accelerate import main, EnvConfig, create_model_config, load_model
-
-from src.envs import RESULTS_REPO, CACHE_PATH, TOKEN
-from src.backend.manage_requests import EvalRequest
-from src.logging import setup_logger
-
-logging.getLogger("openai").setLevel(logging.WARNING)
-logger = setup_logger(__name__)
-
-def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int, local_dir: str, accelerator: str, region: str, vendor: str, instance_size: str, instance_type: str, limit=None):
-    if limit:
-        logger.info("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
-
-    args_dict = {
-        # Endpoint parameters
-        "endpoint_model_name":eval_request.model,
-        "accelerator": accelerator,
-        "vendor": vendor,
-        "region": region,
-        "instance_size": instance_size,
-        "instance_type": instance_type,
-        "reuse_existing": False,
-        "model_dtype": eval_request.precision,
-        "revision": eval_request.revision,
-        # Save parameters
-        "push_results_to_hub": True,
-        "save_details": True,
-        "push_details_to_hub": True,
-        "public_run": False,
-        "cache_dir": CACHE_PATH,
-        "results_org": RESULTS_REPO,
-        "output_dir": local_dir,
-        "job_id": str(datetime.now()),
-        # Experiment parameters
-        "override_batch_size": batch_size,
-        "custom_tasks": "custom_tasks.py",
-        "tasks": task_names,
-        "max_samples": limit,
-        "use_chat_template": False,
-        "system_prompt": None,
-        # Parameters which would be set to things by the kwargs if actually using argparse
-        "inference_server_address": None,
-        "model_args": None,
-        "num_fewshot_seeds": None,
-        "delta_weights": False,
-        "adapter_weights": False
-    }
-    args = argparse.Namespace(**args_dict)
-
-    try:
-        results = main(args)
-
-        results["config"]["model_dtype"] = eval_request.precision
-        results["config"]["model_name"] = eval_request.model
-        results["config"]["model_sha"] = eval_request.revision
-
-        dumped = json.dumps(results, indent=2)
-        logger.info(dumped)
-    except Exception as e:  # if eval failed, we force a cleanup
-        env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)
-
-        model_config = create_model_config(args=args, accelerator=accelerator)
-        model, _ = load_model(config=model_config, env_config=env_config)
-        model.cleanup()
-
-
-    return results
src/backend/sort_queue.py CHANGED
@@ -9,20 +9,15 @@ from src.backend.manage_requests import EvalRequest
 @dataclass
 class ModelMetadata:
     likes: int = 0
-    size: int = 15
 
 
 def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
-    private_models = [model for model in models if model.private]
-    public_models = [model for model in models if not model.private]
+    return sort_by_submit_date(models)
 
-    return sort_by_submit_date(private_models) + sort_by_submit_date(public_models)
 
 def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
     return sorted(eval_requests, key=lambda x: x.submitted_time, reverse=False)
 
-def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
-    return sorted(eval_requests, key=lambda x: x.params, reverse=False)
 
 def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
-    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
+    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
src/display/log_visualizer.py CHANGED
@@ -12,8 +12,8 @@ from src.logging import log_file
 
 def log_file_to_html_string(reverse=True):
     with open(log_file, "rt") as f:
-        lines = f.readlines()
-        lines = lines[-NUM_LINES_VISUALIZE:]
+        lines = f.readlines()
+        lines = lines[-NUM_LINES_VISUALIZE:]
 
     if reverse:
         lines = reversed(lines)
@@ -26,12 +26,12 @@ def log_file_to_html_string(reverse=True):
     html_content = console.export_html(inline_styles=True)
 
     # Parse the HTML content using BeautifulSoup
-    soup = BeautifulSoup(html_content, 'lxml')
+    soup = BeautifulSoup(html_content, "lxml")
 
     # Modify the <pre> tag and add custom styles
     pre_tag = soup.pre
-    pre_tag['class'] = 'scrollable'
-    del pre_tag['style']
+    pre_tag["class"] = "scrollable"
+    del pre_tag["style"]
 
     # Add your custom styles and the .scrollable CSS to the <style> tag
     style_tag = soup.style
src/envs.py CHANGED
@@ -4,13 +4,13 @@ from huggingface_hub import HfApi
 
 # Info to change for your repository
 # ----------------------------------
-TOKEN = os.environ.get("TOKEN") # A read/write token for your org
+TOKEN = os.environ.get("TOKEN")  # A read/write token for your org
 
-OWNER = "open-rl-leaderboard" # Change to your org - don't forget to create a results and request file
+OWNER = "open-rl-leaderboard"  # Change to your org - don't forget to create a results and request file
 
 # For harness evaluations
-DEVICE = "cpu" # "cuda:0" if you add compute, for harness evaluations
-LIMIT = 20 # !!!! Should be None for actual evaluations!!!
+DEVICE = "cpu"  # "cuda:0" if you add compute, for harness evaluations
+LIMIT = 20  # !!!! Should be None for actual evaluations!!!
 
 # For lighteval evaluations
 ACCELERATOR = "cpu"
@@ -23,7 +23,7 @@ QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
 # If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
+CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
@@ -35,4 +35,3 @@ REFRESH_RATE = 1 * 60  # 1 min
 NUM_LINES_VISUALIZE = 300
 
 API = HfApi(token=TOKEN)
-
src/logging.py CHANGED
@@ -3,7 +3,7 @@ from pathlib import Path
 
 proj_dir = Path(__file__).parents[1]
 
-log_file = proj_dir/"output.log"
+log_file = proj_dir / "output.log"
 
 
 import logging
@@ -13,7 +13,7 @@ def setup_logger(name: str):
     logger = logging.getLogger(name)
     logger.setLevel(logging.INFO)
 
-    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 
     # Create a file handler to write logs to a file
     file_handler = logging.FileHandler(log_file)
@@ -29,10 +29,10 @@ def configure_root_logger():
     logging.basicConfig(level=logging.INFO)
     root_logger = logging.getLogger()
 
-    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 
     file_handler = logging.FileHandler(log_file)
     file_handler.setLevel(logging.INFO)
     file_handler.setFormatter(formatter)
 
-    root_logger.addHandler(file_handler)
+    root_logger.addHandler(file_handler)