from huggingface_hub import ModelFilter, snapshot_download
from huggingface_hub import ModelCard
import json
import os
import time
from src.submission.check_validity import is_model_on_hub, check_model_card, get_model_tags
from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH, DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH, DYNAMIC_INFO_FILE_PATH, API, H4_TOKEN
def update_one_model(model_id, data, models_on_the_hub):
    """Refresh hub metadata (likes, downloads, license, tags, ...) for one model.

    ``models_on_the_hub`` maps model ids to hub model-info objects. The
    updated ``data`` dict is returned; when the model is gone from the hub,
    its counters are zeroed and ``still_on_hub`` is set to False.
    """
    model_cfg = models_on_the_hub.get(model_id)
    if model_cfg is None:
        # Model no longer on the hub at all: zero out its stats.
        data["still_on_hub"] = False
        data["likes"] = 0
        data["downloads"] = 0
        data["created_at"] = ""
        data["tags"] = []
        return data

    # Copy the basic hub statistics.
    data["likes"] = model_cfg.likes
    data["downloads"] = model_cfg.downloads
    data["created_at"] = str(model_cfg.created_at)
    card_data = model_cfg.card_data
    data["license"] = card_data.license if card_data is not None else ""

    # Adapters are checked through their parent (base) model when the card
    # declares one as a plain string.
    model_name = model_id
    if card_data is not None and isinstance(card_data.base_model, str):
        model_name = card_data.base_model

    still_on_hub, _, _ = is_model_on_hub(
        model_name=model_name,
        revision=data.get("revision"),
        trust_remote_code=True,
        test_tokenizer=False,
        token=H4_TOKEN,
    )

    # A model without a readable model card (or whose card check errors out)
    # is treated as deleted.
    model_card = None
    if still_on_hub:
        try:
            status, _, model_card = check_model_card(model_id)
            still_on_hub = status is not False
        except Exception:
            model_card = None
            still_on_hub = False

    data["still_on_hub"] = still_on_hub
    data["tags"] = get_model_tags(model_card, model_id) if still_on_hub else []
    return data
def update_models(file_path, models_on_the_hub):
    """
    Refresh the stored hub metadata for every model listed in the JSON file
    at ``file_path``, using ``models_on_the_hub`` (mapping of model id ->
    hub model info), then write the updated dict back to ``file_path``.

    Eval-request files created after the JSON file was last written are
    discovered under EVAL_REQUESTS_PATH and added as new entries.
    """
    with open(file_path, "r") as f:
        model_infos = json.load(f)

    # Refresh every model already present in the file.
    # Use a set: membership is tested once per newly-discovered model below.
    seen_models = set()
    for model_id in model_infos.keys():
        seen_models.add(model_id)
        model_infos[model_id] = update_one_model(
            model_id=model_id,
            data=model_infos[model_id],
            models_on_the_hub=models_on_the_hub,
        )

    # If new request files have been created since we started all this,
    # we grab them. Best effort: a failure here only skips the additions.
    all_models = []
    try:
        for ix, (root, _, files) in enumerate(os.walk(EVAL_REQUESTS_PATH)):
            if ix == 0:
                continue  # skip the top-level directory itself
            for file in files:
                if "eval_request" in file:
                    # Model id is "<org>/<model>", rebuilt from the request
                    # file's parent directory and file-name prefix.
                    org = os.path.basename(root)
                    all_models.append(org + "/" + file.split("_eval_request")[0])
    except Exception as e:
        print(e)

    for model_id in all_models:
        if model_id not in seen_models:
            model_infos[model_id] = update_one_model(
                model_id=model_id,
                data={},
                models_on_the_hub=models_on_the_hub,
            )

    with open(file_path, "w") as f:
        json.dump(model_infos, f, indent=2)
def update_dynamic_files():
    """Refresh the dynamic model-info file and push it back to the hub.

    Downloads the dynamic-info dataset snapshot, lists all hub models
    (with card data and config), updates the stored per-model metadata via
    ``update_models``, then uploads the refreshed file to DYNAMIC_INFO_REPO.
    """
    snapshot_download(
        repo_id=DYNAMIC_INFO_REPO,
        local_dir=DYNAMIC_INFO_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
    )
    print("UPDATE_DYNAMIC: Loaded snapshot")

    # Download the full list of hub models and index it by model id.
    start = time.time()
    models = list(API.list_models(
        full=False,
        cardData=True,
        fetch_config=True,
    ))
    id_to_model = {model.id: model for model in models}
    print(f"UPDATE_DYNAMIC: Downloaded list of models in {time.time() - start:.2f} seconds")

    start = time.time()
    update_models(DYNAMIC_INFO_FILE_PATH, id_to_model)
    print(f"UPDATE_DYNAMIC: updated in {time.time() - start:.2f} seconds")

    API.upload_file(
        path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
        path_in_repo=os.path.basename(DYNAMIC_INFO_FILE_PATH),
        repo_id=DYNAMIC_INFO_REPO,
        repo_type="dataset",
        commit_message="Daily request file update.",
    )
    print("UPDATE_DYNAMIC: pushed to hub")