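# Runs a lighteval evaluation of a queued EvalRequest on an inference endpoint
# and pushes the results to the hub; on failure the endpoint is cleaned up.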
import json
import logging
from argparse import Namespace
from datetime import datetime

from lighteval.logging.evaluation_tracker import EnhancedJSONEncoder
from lighteval.main_accelerate import main, EnvConfig, create_model_config, load_model

from src.backend.manage_requests import EvalRequest
from src.envs import CACHE_PATH, OWNER, TOKEN

logging.getLogger("openai").setLevel(logging.WARNING)

class DefaultNamespace(Namespace):
    """Namespace that returns None for any attribute that was never set,
    instead of raising AttributeError, so every lighteval option we do not
    specify below falls back to None.
    """

    def __getattr__(self, name):
        return self.__dict__.get(name, None)

def run_evaluation(
    eval_request: EvalRequest,
    task_names: str,
    batch_size: int,
    local_dir: str,
    accelerator: str,
    region: str,
    vendor: str,
    instance_size: str,
    instance_type: str,
    limit=None,
):
    """Run a lighteval evaluation for `eval_request` on an inference endpoint.

    `limit` caps the number of samples per task; it should only be used for
    testing, never to compute real metrics.
    """
    if limit:
        print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")

    # These fields mirror the CLI arguments of lighteval's accelerate entry point.
    args = DefaultNamespace(**{
            "endpoint_model_name": eval_request.model,
            "model_dtype": eval_request.precision,
            "revision": eval_request.revision,
            "accelerator": accelerator,
            "vendor": vendor,
            "region": region,
            "instance_size": instance_size,
            "instance_type": instance_type,
            "max_samples": limit,
            "job_id": str(datetime.now()),
            "push_results_to_hub": True,
            "save_details": False,
            "push_details_to_hub": False,
            "public_run": False,
            "cache_dir": CACHE_PATH,
            "results_org": OWNER,
            "output_dir": local_dir,
            "override_batch_size": batch_size,
            "custom_tasks": "custom_tasks.py",
            "tasks": task_names,
            "dataset_loading_processes": 24,
            "num_fewshot_seeds": 0,
            "reuse_existing": False,
    })

    try:
        results = main(args)

        dumped = json.dumps(results, cls=EnhancedJSONEncoder, indent=2)
        print(dumped)
    except Exception:  # if the eval failed, force a cleanup of the endpoint
        import traceback
        traceback.print_exc()
        env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)
        # Reattach to the endpoint that was just created so it can be torn down.
        args.reuse_existing = True
        model_config = create_model_config(args=args, accelerator=accelerator)
        model, _ = load_model(config=model_config, env_config=env_config)
        model.cleanup()
        # Re-raise: without this, the `return results` below would fail with an
        # UnboundLocalError since `results` was never assigned.
        raise

    return results
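
# Example invocation (a sketch only; the EvalRequest construction shown here is
# an assumption, since the real dataclass in src.backend.manage_requests may
# require more fields than the three this module reads):
#
#   request = EvalRequest(model="org/model", precision="float16", revision="main")
#   run_evaluation(
#       eval_request=request,
#       task_names="lighteval|anli:r1|0|0",   # hypothetical task string
#       batch_size=1,
#       local_dir="./eval-results",
#       accelerator="gpu",                    # hypothetical endpoint settings
#       region="us-east-1",
#       vendor="aws",
#       instance_size="medium",
#       instance_type="g5.2xlarge",
#       limit=10,                             # testing only
#   )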