import json
import os
import logging
from datetime import datetime

import src.envs as envs
from src.backend.manage_requests import EvalRequest
from src.backend.evaluate_model import Evaluator

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logging.getLogger("openai").setLevel(logging.WARNING)


def run_evaluation(eval_request: EvalRequest, batch_size, device,
                   local_dir: str, results_repo: str, no_cache=True, limit=None,
                   need_check=True, write_results=False):
    """
    Run the evaluation for a given model and upload the results.

    Args:
        eval_request (EvalRequest): The evaluation request object containing model details.
        batch_size (int): Batch size for processing.
        device (str): The device to run the evaluation on.
        local_dir (str): Local directory path for saving results.
        results_repo (str): Repository ID where results will be uploaded.
        no_cache (bool): Whether to disable caching.
        limit (int, optional): Limit on the number of items to process. Use with caution.
        need_check (bool): If True, keep results local for manual review instead of uploading them.
        write_results (bool): Reserved flag; currently unused in this function.

    Returns:
        dict: A dictionary containing evaluation results.
    """
    if limit:
        logging.warning("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")

    output_folder = os.path.join(local_dir, *eval_request.model.split("/"))
    # if os.path.exists(output_folder):
    #     f_name = os.listdir(output_folder)[-1] 
    #     print(f"Loading results from {os.path.join(output_folder, f_name)}")
    #     results = json.loads(os.path.join(output_folder, f_name))
    #     dumped = json.dumps(results, indent=2)
    #     logging.info(dumped)
    # else:
    try:
        # Build the evaluator and run the benchmark for the requested model/revision.
        evaluator = Evaluator(eval_request.model, eval_request.revision, eval_request.precision,
                              batch_size, device, no_cache, limit, write_out=True,
                              output_base_path='logs')
        results = evaluator.evaluate()
        evaluator.write_results()
    except Exception as e:
        logging.error(f"Error during evaluation: {e}")
        raise

    dumped = json.dumps(results, indent=2)
    logging.info(dumped)

    # Use a single timestamp so the local file name and the path in the repo match.
    timestamp = datetime.now()
    output_path = os.path.join(output_folder, f"results_{timestamp}.json")
    os.makedirs(output_folder, exist_ok=True)
    with open(output_path, "w") as f:
        f.write(dumped)
    print(f"Results have been saved to {output_path}")

    if not need_check:
        path_in_repo = f"{eval_request.model}/results_{timestamp}.json"
        print("Path in the repo:", path_in_repo)
        envs.API.upload_file(
            path_or_fileobj=output_path,
            path_in_repo=path_in_repo,
            repo_id=results_repo,
            repo_type="dataset",
        )

    return results
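

# --- Usage sketch (illustrative only) ---
# A minimal example of how run_evaluation might be invoked. The EvalRequest
# constructor arguments shown here are assumptions based on the attributes used
# above (model, revision, precision); check src.backend.manage_requests for the
# actual fields, and adjust the device and repo names for your setup.
if __name__ == "__main__":
    request = EvalRequest(model="org/model-name", revision="main", precision="float16")
    run_evaluation(
        eval_request=request,
        batch_size=1,
        device="cuda:0",
        local_dir="./eval-results",
        results_repo="org/results-dataset",
        limit=10,          # small limit for a smoke test only; not for real metrics
        need_check=True,   # keep results local; do not upload
    )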