import json
import os
import logging
from datetime import datetime
from argparse import Namespace

from lighteval.main_accelerate import main, EnvConfig, create_model_config, load_model
from lighteval.models.endpoint_model import InferenceEndpointModel
from src.envs import RESULTS_REPO, CACHE_PATH, TOKEN
from src.backend.manage_requests import EvalRequest

# Make add_special_tokens a no-op on InferenceEndpointModel.
InferenceEndpointModel.add_special_tokens = lambda *args, **kwargs: None

# Reduce openai client log verbosity to warnings and above.
logging.getLogger("openai").setLevel(logging.WARNING)

class DefaultNamespace(Namespace):
    """Namespace that returns None for attributes that were never set, instead of raising."""
    def __getattr__(self, name):
        return self.__dict__.get(name, None)

def run_evaluation(
    eval_request: EvalRequest, task_names: str, batch_size: int, local_dir: str,
    accelerator: str, region: str, vendor: str, instance_size: str, instance_type: str,
    limit=None,
):
    """Run the lighteval suite for one evaluation request against an inference endpoint."""
    if limit:
        print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")

    args = DefaultNamespace(**{
        "endpoint_model_name": f"{eval_request.model}_{eval_request.precision}".lower(),
        "accelerator": accelerator,
        "vendor": vendor,
        "region": region,
        "instance_size": instance_size,
        "instance_type": instance_type,
        "max_samples": limit,
        "job_id": str(datetime.now()),
        "push_results_to_hub": True,
        "save_details": True,
        "push_details_to_hub": True,
        "public_run": False,
        "cache_dir": CACHE_PATH,
        "results_org": RESULTS_REPO,
        "output_dir": local_dir,
        "override_batch_size": batch_size,
        "custom_tasks": "custom_tasks.py",
        "tasks": task_names,
    })

    try:
        results = main(args)

        results["config"]["model_dtype"] = eval_request.precision
        results["config"]["model_name"] = eval_request.model
        results["config"]["model_sha"] = eval_request.revision

        dumped = json.dumps(results, indent=2)
        print(dumped)
    except Exception:  # if eval failed, we force a cleanup of the endpoint
        env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)

        model_config = create_model_config(args=args, accelerator=accelerator)
        model, _ = load_model(config=model_config, env_config=env_config)
        model.cleanup()
        # Re-raise so the caller sees the real error; without this, the
        # `return results` below would fail with an UnboundLocalError.
        raise

    return results
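

# --- Illustrative usage (sketch only, not part of the original module) ---
# A minimal example of how run_evaluation might be invoked. The EvalRequest
# keyword fields, task string, and endpoint settings below are assumptions for
# illustration; only `model`, `precision`, and `revision` are actually read by
# this module, so adjust the constructor call to the real definition in
# src.backend.manage_requests.
if __name__ == "__main__":
    example_request = EvalRequest(  # hypothetical values, assumed field names
        model="org/some-model",
        precision="float16",
        revision="main",
        status="RUNNING",
        json_filepath="",
    )
    run_evaluation(
        eval_request=example_request,
        task_names="custom|mytask|0|0",  # assumed lighteval-style task spec
        batch_size=1,
        local_dir="./eval-results",
        accelerator="gpu",
        region="us-east-1",
        vendor="aws",
        instance_size="medium",
        instance_type="g5.2xlarge",
        limit=10,  # small limit: for testing only
    )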