import numpy as np
import pandas as pd
from huggingface_hub import add_collection_item, delete_collection_item, get_collection, update_collection_item
from huggingface_hub.utils import HfHubHTTPError
from pandas import DataFrame

from src.display.utils import AutoEvalColumn, ModelType, NUMERIC_INTERVALS
from src.envs import H4_TOKEN, PATH_TO_COLLECTION

# Size buckets for the collection. The hand-tuned intervals below are kept for
# reference only; the live buckets come from NUMERIC_INTERVALS, minus its
# catch-all "?" bucket.
"""
intervals = {
    "1B": pd.Interval(0, 1.5, closed="right"),
    "3B": pd.Interval(2.5, 3.5, closed="neither"),
    "7B": pd.Interval(6, 8, closed="neither"),
    "13B": pd.Interval(10, 14, closed="neither"),
    "30B": pd.Interval(25, 35, closed="neither"),
    "65B": pd.Interval(60, 70, closed="neither"),
}
"""
intervals = {k: v for k, v in NUMERIC_INTERVALS.items() if "?" not in k}

def update_collections(df: DataFrame):
    """This function updates the Open LLM Leaderboard model collection with the latest best models for
    each size category and type.
    """
    collection = get_collection(collection_slug=PATH_TO_COLLECTION, token=H4_TOKEN)
    params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")

    cur_best_models = []
    # Running best average per category: a model is only added if it beats every
    # model of the same category already chosen for a smaller size bucket.
    scores_per_type = {'pretrained': 0, 'other': 0, 'language': 0}

    types_to_consider = [
        ('pretrained', [ModelType.PT]),
        ('other', [ModelType.LA, ModelType.FT, ModelType.chat]),
    ]

    # Start from a clean slate: remove every existing collection item.
    for item in collection.items:
        try:
            delete_collection_item(
                collection_slug=PATH_TO_COLLECTION, item_object_id=item.item_object_id, token=H4_TOKEN
            )
        except HfHubHTTPError:
            continue

    # Keep only half-precision runs, which excludes quantized variants.
    df = df[df[AutoEvalColumn.precision.name].isin(['bfloat16', 'float16'])]
    
    for size in intervals:
        interval_scores = []
        interval_itens_languages = []
        interval_itens = []

        # Restrict to models whose parameter count falls inside this size bucket.
        numeric_interval = pd.IntervalIndex([intervals[size]])
        mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
        size_df = df.loc[mask]

        for model_type, types in types_to_consider:
            # Gather the emoji symbols of every concrete type in this category.
            type_emojis = []
            for mtype in types:
                if mtype.value.name == "":
                    continue
                type_emojis.extend(t[0] for t in mtype.value.symbol)
            filtered_df = size_df[size_df[AutoEvalColumn.model_type_symbol.name].isin(type_emojis)]
            # Keep only models that beat the best score recorded so far for this category.
            filtered_df = filtered_df[
                filtered_df[AutoEvalColumn.average.name].astype(float) > scores_per_type[model_type]
            ]

            best_models = filtered_df.sort_values(AutoEvalColumn.average.name, ascending=False)
            print(type_emojis, size, list(best_models[AutoEvalColumn.dummy.name])[:10])
            # Try candidates best-first and keep the first one the Hub accepts.
            for i, row in best_models.iterrows():
                model = row[AutoEvalColumn.dummy.name]
                score = row[AutoEvalColumn.average.name]
                language = row[AutoEvalColumn.main_language.name]
                # After the loop above, `mtype` holds the category's last concrete type.
                if language == 'Portuguese':
                    note = f"Best Portuguese {mtype.to_str(' ')} model of around {size} on the leaderboard today! (Score: {score})"
                else:
                    note = f"Best {mtype.to_str(' ')} model of around {size} on the leaderboard today! (Score: {score})"
                try:
                    collection = add_collection_item(
                        PATH_TO_COLLECTION,
                        item_id=model,
                        item_type="model",
                        exists_ok=True,
                        note=note,
                        token=H4_TOKEN,
                    )
                    item_object_id = collection.items[-1].item_object_id
                    cur_best_models.append(model)
                    interval_scores.append(float(score))
                    interval_itens_languages.append(language)
                    interval_itens.append(item_object_id)
                    scores_per_type[model_type] = float(score)
                    break
                except HfHubHTTPError:
                    continue
        # If no Portuguese model made this bucket, add the best Portuguese one outright.
        if 'Portuguese' not in interval_itens_languages:
            model_type = 'language'
            filtered_df = size_df[size_df[AutoEvalColumn.main_language.name] == 'Portuguese']
            filtered_df = filtered_df[
                filtered_df[AutoEvalColumn.average.name].astype(float) > scores_per_type[model_type]
            ]

            best_models = filtered_df.sort_values(AutoEvalColumn.average.name, ascending=False)
            print('Portuguese', size, list(best_models[AutoEvalColumn.dummy.name])[:10])
            # Try candidates best-first and keep the first one the Hub accepts.
            for i, row in best_models.iterrows():
                model = row[AutoEvalColumn.dummy.name]
                score = row[AutoEvalColumn.average.name]
                language = row[AutoEvalColumn.main_language.name]
                # Every candidate here is Portuguese by construction.
                note = f"Best Portuguese model of around {size} on the leaderboard today! (Score: {score})"
                try:
                    collection = add_collection_item(
                        PATH_TO_COLLECTION,
                        item_id=model,
                        item_type="model",
                        exists_ok=True,
                        note=note,
                        token=H4_TOKEN,
                    )
                    item_object_id = collection.items[-1].item_object_id
                    cur_best_models.append(model)
                    interval_scores.append(float(score))
                    interval_itens_languages.append(language)
                    interval_itens.append(item_object_id)
                    scores_per_type[model_type] = float(score)
                    break
                except HfHubHTTPError:
                    continue
        # Re-order this bucket's freshly added items by score. They occupy the tail
        # of the collection, starting right after the items of the smaller buckets.
        starting_idx = len(cur_best_models) - len(interval_itens)
        for k, i in enumerate(np.argsort(interval_scores)):
            if i == k:
                continue
            try:
                update_collection_item(
                    collection_slug=PATH_TO_COLLECTION,
                    item_object_id=interval_itens[i],
                    position=starting_idx + k,
                    token=H4_TOKEN,
                )
            except HfHubHTTPError:
                pass

    # Finally, prune any leftover item that is not among today's selected models.
    collection = get_collection(PATH_TO_COLLECTION, token=H4_TOKEN)
    for item in collection.items:
        if item.item_id not in cur_best_models:
            try:
                delete_collection_item(
                    collection_slug=PATH_TO_COLLECTION, item_object_id=item.item_object_id, token=H4_TOKEN
                )
            except HfHubHTTPError:
                continue
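
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module). It shows the
# minimal DataFrame shape update_collections() reads: the column names come
# from AutoEvalColumn so the example stays in sync with the leaderboard
# schema, and the model id below is made up. Note that the call mutates the
# Hub collection at PATH_TO_COLLECTION, so only run it with a valid H4_TOKEN
# against a scratch collection.
if __name__ == "__main__":
    demo_df = pd.DataFrame(
        {
            AutoEvalColumn.dummy.name: ["some-org/some-model-7b"],  # hypothetical model id
            AutoEvalColumn.params.name: [7.2],
            AutoEvalColumn.precision.name: ["bfloat16"],
            AutoEvalColumn.model_type_symbol.name: [ModelType.PT.value.symbol],
            AutoEvalColumn.average.name: [61.5],
            AutoEvalColumn.main_language.name: ["Portuguese"],
        }
    )
    update_collections(demo_df)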