File size: 4,607 Bytes
9b2e755
0c7ef71
 
 
 
9b2e755
8d502c8
0c7ef71
 
 
 
 
 
 
 
 
 
5ad4694
 
ae618a2
ecefacb
0c7ef71
 
 
 
ae618a2
ecefacb
0c7ef71
 
 
 
 
8d502c8
0c7ef71
 
 
5408125
 
0c7ef71
5408125
 
 
 
 
 
 
 
 
 
0c7ef71
5408125
a6f1b1f
5408125
 
 
 
 
 
 
 
 
 
 
 
 
80f473c
0c7ef71
80f473c
 
0c7ef71
 
 
 
 
 
9b2e755
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d502c8
9b2e755
 
0c7ef71
9b2e755
0c7ef71
8d502c8
0c7ef71
9b2e755
0c7ef71
9b2e755
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from huggingface_hub import ModelFilter, snapshot_download
from huggingface_hub import ModelCard

import json
import time
from src.submission.check_validity import is_model_on_hub, check_model_card
from src.envs import DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH, DYNAMIC_INFO_FILE_PATH, API, H4_TOKEN

def _derive_model_tags(model_id, model_card):
    """Return the leaderboard tags inferred for one model.

    Args:
        model_id: Hub identifier of the model (e.g. ``"org/name"``).
        model_card: Loaded model card exposing ``.data.tags`` and ``.text``,
            or ``None`` when no usable card is available.

    Returns:
        A list containing ``"merge"`` and/or ``"moe"``, each optionally
        followed by the matching ``"flagged:undisclosed_*"`` marker when the
        property was detected but is not declared in the card metadata tags.
    """
    merge_keywords = ("mergekit", "merged model", "merge model", "merging")
    moe_keywords = ("moe", "mixture of experts", "mixtral")

    tags = []
    is_merge_from_metadata = False
    is_moe_from_metadata = False
    is_moe_from_model_card = False

    if model_card is not None:
        declared_tags = model_card.data.tags
        if declared_tags:
            is_merge_from_metadata = "merge" in declared_tags
            is_moe_from_metadata = "moe" in declared_tags

        # Lower-case the card body once instead of once per keyword.
        card_text = model_card.text.lower()

        # A merge mentioned in the card text but absent from the metadata
        # tags is flagged as undisclosed.
        is_merge_from_model_card = any(keyword in card_text for keyword in merge_keywords)
        if is_merge_from_model_card or is_merge_from_metadata:
            tags.append("merge")
            if not is_merge_from_metadata:
                tags.append("flagged:undisclosed_merge")

        is_moe_from_model_card = any(keyword in card_text for keyword in moe_keywords)

    # The name check works even without a card: "moe" must be a whole
    # "-"-separated token of the id once "/" and "_" are normalised to "-".
    name_tokens = model_id.lower().replace("/", "-").replace("_", "-").split("-")
    is_moe_from_name = "moe" in name_tokens

    if is_moe_from_model_card or is_moe_from_name or is_moe_from_metadata:
        tags.append("moe")
        if not is_moe_from_metadata:
            tags.append("flagged:undisclosed_moe")

    return tags


def update_models(file_path, models):
    """Refresh the per-model metadata stored in a single JSON file, in place.

    The file at ``file_path`` is expected to hold a JSON object mapping model
    ids to dicts of metadata. Each record is updated and the whole mapping is
    written back to the same path (indented by 2 spaces).

    For a model id that is missing from ``models``, the record is marked as no
    longer on the hub (``still_on_hub`` False, ``likes``/``downloads`` 0,
    ``created_at`` ""), and a ``tags`` key is guaranteed to exist (previously
    stored tags are kept, otherwise it becomes an empty list).

    For a model id present in ``models``, ``likes``, ``downloads``,
    ``created_at`` and ``license`` are copied from the listing entry,
    ``still_on_hub`` is taken from ``is_model_on_hub``, and ``tags`` is
    recomputed (empty when the model is not on the hub).

    Args:
        file_path: Path of the JSON file to read and overwrite.
        models: Mapping of model id to a listing entry exposing ``likes``,
            ``downloads``, ``created_at`` and ``card_data``.
    """
    # Read everything up front so the file handle is not held open while the
    # (potentially slow) per-model hub checks run.
    with open(file_path, "r", encoding="utf-8") as f:
        model_infos = json.load(f)

    for model_id, data in model_infos.items():
        if model_id not in models:
            data["still_on_hub"] = False
            data["likes"] = 0
            data["downloads"] = 0
            data["created_at"] = ""
            # Keep the record schema uniform: every entry carries "tags".
            data.setdefault("tags", [])
            continue

        model_cfg = models[model_id]
        data["likes"] = model_cfg.likes
        data["downloads"] = model_cfg.downloads
        data["created_at"] = str(model_cfg.created_at)
        # "params" is not refreshed here (the get_model_size call was disabled).
        data["license"] = model_cfg.card_data.license if model_cfg.card_data is not None else ""

        # Is the model still on the hub?
        # NOTE(review): trust_remote_code=True is forwarded to is_model_on_hub;
        # confirm this is acceptable for an unattended job.
        still_on_hub, _, _ = is_model_on_hub(
            model_name=model_id,
            revision=data.get("revision"),
            trust_remote_code=True,
            test_tokenizer=False,
            token=H4_TOKEN,
        )
        data["still_on_hub"] = still_on_hub

        tags = []
        if still_on_hub:
            modelcard_ok, _ = check_model_card(model_id)
            model_card = ModelCard.load(model_id) if modelcard_ok else None
            tags = _derive_model_tags(model_id, model_card)

        data["tags"] = tags

    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(model_infos, f, indent=2)

def update_dynamic_files():
    """Download the dynamic-info dataset, refresh its metadata and push it back.

    Only models already listed in the dynamic-info file are refreshed; models
    missing from it are not added.
    """
    snapshot_download(
        repo_id=DYNAMIC_INFO_REPO,
        local_dir=DYNAMIC_INFO_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
    )
    print("UPDATE_DYNAMIC: Loaded snapshot")

    # Fetch the text-generation model listing and index it by model id.
    listing_started = time.time()
    hub_listing = list(
        API.list_models(
            filter=ModelFilter(task="text-generation"),
            full=False,
            cardData=True,
            fetch_config=True,
        )
    )
    id_to_model = {}
    for hub_model in hub_listing:
        id_to_model[hub_model.id] = hub_model
    listing_elapsed = time.time() - listing_started
    print(f"UPDATE_DYNAMIC: Downloaded list of models in {listing_elapsed:.2f} seconds")

    # Rewrite the local JSON file with the refreshed metadata.
    refresh_started = time.time()
    update_models(DYNAMIC_INFO_FILE_PATH, id_to_model)
    refresh_elapsed = time.time() - refresh_started
    print(f"UPDATE_DYNAMIC: updated in {refresh_elapsed:.2f} seconds")

    # Upload under the file's base name (last "/"-separated path component).
    remote_name = DYNAMIC_INFO_FILE_PATH.rsplit("/", 1)[-1]
    API.upload_file(
        path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
        path_in_repo=remote_name,
        repo_id=DYNAMIC_INFO_REPO,
        repo_type="dataset",
        commit_message="Daily request file update.",
    )
    print("UPDATE_DYNAMIC: pushed to hub")