File size: 6,088 Bytes
9b2e755
0c7ef71
 
71ecfbb
0c7ef71
 
71ecfbb
9839977
 
71ecfbb
 
 
 
 
 
 
 
 
 
 
 
 
 
0c7ef71
 
 
 
 
 
 
 
5ad4694
 
ae618a2
ecefacb
71ecfbb
0c7ef71
 
 
 
ae618a2
ecefacb
0c7ef71
 
71ecfbb
0c7ef71
9839977
 
 
 
 
 
0c7ef71
 
 
5408125
 
0c7ef71
9839977
 
0c7ef71
71ecfbb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0c7ef71
 
 
 
 
9b2e755
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d502c8
9b2e755
71ecfbb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9b2e755
0c7ef71
9b2e755
0c7ef71
71ecfbb
0c7ef71
9b2e755
0c7ef71
9b2e755
 
 
 
 
 
 
 
9839977
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from huggingface_hub import ModelFilter, snapshot_download
from huggingface_hub import ModelCard

import os
import json
import time
from collections import defaultdict

from src.submission.check_validity import is_model_on_hub, check_model_card, get_model_tags
from src.leaderboard.read_evals import EvalResult
from src.envs import (
    DYNAMIC_INFO_REPO,
    DYNAMIC_INFO_PATH,
    DYNAMIC_INFO_FILE_PATH,
    API,
    H4_TOKEN,
    ORIGINAL_HF_LEADERBOARD_RESULTS_REPO,
    ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS_PATH,
    GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS
)
from src.display.utils import ORIGINAL_TASKS

def _collect_original_scores(result_files):
    """Average the original-leaderboard scores found in *result_files*, per precision.

    Result files sharing the same ``eval_name`` are merged into one result:
    the first file seen is kept and later files overwrite its entries, but only
    with values that are not ``None``. A merged result that covers fewer tasks
    than ``ORIGINAL_TASKS`` is skipped.

    Args:
        result_files: iterable of paths to original-leaderboard result JSON files.

    Returns:
        dict mapping the precision name to the sum of the non-``None`` scores
        divided by ``len(ORIGINAL_TASKS)``.
    """
    merged = {}
    for filepath in result_files:
        eval_result = EvalResult.init_from_json_file(filepath, is_original=True)
        # Store results of same eval together
        eval_name = eval_result.eval_name
        if eval_name in merged:
            merged[eval_name].results.update(
                {task: score for task, score in eval_result.results.items() if score is not None}
            )
        else:
            merged[eval_name] = eval_result

    scores = {}
    for eval_result in merged.values():
        precision = eval_result.precision.value.name
        if len(eval_result.results) < len(ORIGINAL_TASKS):
            continue  # incomplete evaluation: no average can be reported
        valid_scores = [score for score in eval_result.results.values() if score is not None]
        scores[precision] = sum(valid_scores) / len(ORIGINAL_TASKS)
    return scores


def update_models(file_path, models, original_leaderboard_files=None):
    """
    Refresh the per-model metadata stored in the JSON file at ``file_path``.

    The file holds a dict mapping model ids to info dicts. Every entry is
    updated in place and the whole dict is written back to the same path.

    * Models missing from ``models`` are marked as no longer on the hub and
      their likes, downloads, creation date and original scores are reset.
    * Models present in ``models`` get their likes, downloads, creation date
      and license refreshed, their hub availability re-checked, their tags
      recomputed and, when result files are supplied for them, their
      original-leaderboard average scores recomputed.

    Args:
        file_path: path of the JSON file to read and overwrite.
        models: dict mapping model id to the model info object listed from the hub.
        original_leaderboard_files: optional dict mapping model id to the list
            of original-leaderboard result file paths; ``None`` disables the
            original-score update.
    """
    # Read everything up front and close the file: the loop below performs slow
    # network calls and must not keep the handle open meanwhile.
    with open(file_path, "r", encoding="utf-8") as f:
        model_infos = json.load(f)

    for model_id, data in model_infos.items():
        if model_id not in models:
            data['still_on_hub'] = False
            data['likes'] = 0
            data['downloads'] = 0
            data['created_at'] = ""
            data['original_llm_scores'] = {}
            continue

        model_cfg = models[model_id]
        card_data = model_cfg.card_data
        data['likes'] = model_cfg.likes
        data['downloads'] = model_cfg.downloads
        data['created_at'] = str(model_cfg.created_at)
        data['license'] = card_data.license if card_data is not None else ""
        data['original_llm_scores'] = {}

        # Is the model still on the hub?
        model_name = model_id
        if card_data is not None and card_data.base_model is not None:
            # For adapters, we look at the parent model.
            # NOTE(review): base_model is passed through as-is; confirm it is
            # always a single model id and never a list.
            model_name = card_data.base_model
        still_on_hub, _, _ = is_model_on_hub(
            model_name=model_name,
            revision=data.get("revision"),
            trust_remote_code=True,
            test_tokenizer=False,
            token=H4_TOKEN,
        )
        data['still_on_hub'] = still_on_hub

        # Tags are derived from the model card, which is only fetched for
        # models that are still available.
        tags = []
        if still_on_hub:
            _, _, model_card = check_model_card(model_id)
            tags = get_model_tags(model_card, model_id)

        if original_leaderboard_files is not None and model_id in original_leaderboard_files:
            data['original_llm_scores'].update(
                _collect_original_scores(original_leaderboard_files[model_id])
            )

        data["tags"] = tags

    with open(file_path, 'w', encoding="utf-8") as f:
        json.dump(model_infos, f, indent=2)

def _group_original_result_files(results_root):
    """Index the ``results_*.json`` files found under *results_root* by model id.

    The model id is the directory containing the file, expressed relative to
    *results_root* with ``/`` separators. Files located directly in
    *results_root* belong to no model directory and are ignored.

    Args:
        results_root: local directory holding the original results snapshot.

    Returns:
        ``defaultdict(list)`` mapping model id to the sorted list of full file paths.
    """
    grouped = defaultdict(list)
    for dirpath, _, filenames in os.walk(results_root):
        # Derive the id from the directory itself rather than by searching the
        # path for '/results_': this stays correct when a model directory name
        # starts with "results_", when the root has a trailing separator, and
        # when the OS separator is not '/'.
        relative_dir = os.path.relpath(dirpath, results_root)
        if relative_dir == os.curdir:
            continue
        model_id = relative_dir.replace(os.sep, "/")
        for filename in filenames:
            if filename.startswith('results_') and filename.endswith('.json'):
                grouped[model_id].append(os.path.join(dirpath, filename))

    for paths in grouped.values():
        paths.sort()
    return grouped


def update_dynamic_files():
    """Refresh the dynamic model info file and push it back to the hub.

    This will only update metadata for models already linked in the repo, not add missing ones.

    Steps:
        1. Download a snapshot of the dynamic info dataset repo.
        2. List the text-generation models from the hub and index them by id.
        3. If enabled, download the original leaderboard results and group the
           result files by model id. This step is best-effort: on failure the
           original scores are not updated.
        4. Update the dynamic info file and upload it to the dataset repo.
    """
    snapshot_download(
        repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
    )

    print("UPDATE_DYNAMIC: Loaded snapshot")
    # Get models
    start = time.time()

    models = list(API.list_models(
        filter=ModelFilter(task="text-generation"),
        full=False,
        cardData=True,
        fetch_config=True,
    ))
    id_to_model = {model.id: model for model in models}

    id_to_leaderboard_files = defaultdict(list)
    if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
        try:
            print("UPDATE_DYNAMIC: Downloading Original HF Leaderboard results snapshot")
            snapshot_download(
                repo_id=ORIGINAL_HF_LEADERBOARD_RESULTS_REPO,
                local_dir=ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS_PATH,
                repo_type="dataset",
                tqdm_class=None,
                etag_timeout=30,
            )
            id_to_leaderboard_files = _group_original_result_files(ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS_PATH)
        except Exception as e:
            # Deliberately best-effort: the metadata refresh must still run
            # without the original scores. None disables that part of the update.
            print(f"UPDATE_DYNAMIC: Could not download original results from : {e}")
            id_to_leaderboard_files = None

    print(f"UPDATE_DYNAMIC: Downloaded list of models in {time.time() - start:.2f} seconds")

    start = time.time()

    update_models(DYNAMIC_INFO_FILE_PATH, id_to_model, id_to_leaderboard_files)

    print(f"UPDATE_DYNAMIC: updated in {time.time() - start:.2f} seconds")

    API.upload_file(
        path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
        path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
        repo_id=DYNAMIC_INFO_REPO,
        repo_type="dataset",
        commit_message="Daily request file update.",
    )
    print("UPDATE_DYNAMIC: pushed to hub")