import datetime
from concurrent.futures import as_completed
from urllib import parse

import pandas as pd
import streamlit as st
import wandb
from requests_futures.sessions import FuturesSession

from dashboard_utils.time_tracker import _log, simple_time_tracker

EXCLUDED_PROFILES = {
    "borzunov",
    "justheuristic",
    "mryab",
    "yhn112",
    "SaulLu",
    "training-transformers-together-machine",
    "Upload",
}

URL_QUICKSEARCH = "https://huggingface.co/api/quicksearch?"
WANDB_REPO = st.secrets["WANDB_REPO_INDIVIDUAL_METRICS"]
CACHE_TTL = 100
MAX_DELTA_ACTIVE_RUN_SEC = 60 * 5  # a run counts as active if it reported within the last 5 minutes


@st.cache(ttl=CACHE_TTL, show_spinner=False)
@simple_time_tracker(_log)
def get_new_bubble_data():
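    """Fetch the full dataset for the contributor bubble chart.

    Returns a tuple of (serialized_data, profiles): the per-user run data
    serialized for the chart, plus Hugging Face profile info (avatar and
    profile URL) for every contributing user.
    """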
    serialized_data_points, latest_timestamp = get_serialized_data_points()
    serialized_data = get_serialized_data(serialized_data_points, latest_timestamp)
    usernames = [item["profileId"] for item in serialized_data["points"][0]]
    profiles = get_profiles(usernames)
    return serialized_data, profiles


@st.cache(ttl=CACHE_TTL, show_spinner=False)
@simple_time_tracker(_log)
def get_profiles(usernames):
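    """Resolve Hugging Face profile info (avatar, profile URL) for each username.

    Queries the quicksearch API concurrently and falls back to the default
    avatar when no exact username match is found.
    """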
    profiles = []
    with FuturesSession(max_workers=32) as session:
        futures = []
        for username in usernames:
            future = session.get(URL_QUICKSEARCH + parse.urlencode({"type": "user", "q": username}))
            future.username = username
            futures.append(future)
        for future in as_completed(futures):
            resp = future.result()
            username = future.username
            response = resp.json()
            avatarUrl = None
            # Take the avatar of the exact username match, not the first search hit.
            for user_candidate in response["users"]:
                if user_candidate["user"] == username:
                    avatarUrl = user_candidate["avatarUrl"]
                    break
            if not avatarUrl:
                avatarUrl = "/avatars/57584cb934354663ac65baa04e6829bf.svg"
            if avatarUrl.startswith("/avatars/"):
                avatarUrl = f"https://huggingface.co{avatarUrl}"
            profiles.append(
                {"id": username, "name": username, "src": avatarUrl, "url": f"https://huggingface.co/{username}"}
            )
    return profiles


@st.cache(ttl=CACHE_TTL, show_spinner=False)
@simple_time_tracker(_log)
def get_serialized_data_points():
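    """Pull per-user runs from the W&B project and group them by run name.

    Runs whose summary lacks `_timestamp` or `_step` are skipped. Returns the
    grouped data and the datetime of the most recent run summary.
    """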
    api = wandb.Api()
    runs = api.runs(WANDB_REPO)
    serialized_data_points = {}
    latest_timestamp = None
    for run in runs:
        run_name = run.name
        if run_name in EXCLUDED_PROFILES:
            continue
        run_summary = run.summary._json_dict
        # Runs that never logged a step have no usable summary: skip them.
        if "_timestamp" not in run_summary or "_step" not in run_summary:
            continue
        timestamp = run_summary["_timestamp"]
        serialized_run = {
            "batches": run_summary["_step"],
            "runtime": run_summary["_runtime"],
            "loss": run_summary["train/loss"],
            "state": run.state,
            "velocity": run_summary["_step"] / run_summary["_runtime"],
            "date": datetime.datetime.utcfromtimestamp(timestamp),
        }
        if run_name in serialized_data_points:
            serialized_data_points[run_name]["Runs"].append(serialized_run)
        else:
            serialized_data_points[run_name] = {"profileId": run_name, "Runs": [serialized_run]}
        if not latest_timestamp or timestamp > latest_timestamp:
            latest_timestamp = timestamp
    latest_timestamp = datetime.datetime.utcfromtimestamp(latest_timestamp)
    return serialized_data_points, latest_timestamp


@st.cache(ttl=CACHE_TTL, show_spinner=False)
@simple_time_tracker(_log)
def get_serialized_data(serialized_data_points, latest_timestamp):
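    """Aggregate each user's runs into a single point for the chart.

    Sums runtime and batches over all of a user's runs and collects the
    currently `running` ones under `activeRuns`.
    """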
    serialized_data_points_v2 = []
    max_velocity = 1  # placeholder; per-run velocities are summed below but not used yet
    for run_name, serialized_data_point in serialized_data_points.items():
        activeRuns = []
        loss = 0
        runtime = 0
        batches = 0
        velocity = 0
        for run in serialized_data_point["Runs"]:
            if run["state"] == "running":
                run["date"] = run["date"].isoformat()
                activeRuns.append(run)
                loss += run["loss"]
                velocity += run["velocity"]
            runtime += run["runtime"]
            batches += run["batches"]
        # Average the loss over active runs once, after the loop.
        loss = loss / len(activeRuns) if activeRuns else 0
        new_item = {
            "date": latest_timestamp.isoformat(),
            "profileId": run_name,
            "batches": runtime,  # "batches": batches quick and dirty fix
            "runtime": runtime,
            "activeRuns": activeRuns,
        }
        serialized_data_points_v2.append(new_item)
    serialized_data = {"points": [serialized_data_points_v2], "maxVelocity": max_velocity}
    return serialized_data


def get_leaderboard(serialized_data):
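    """Build the leaderboard table: users ranked by total contributed runtime."""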
    data_leaderboard = {"user": [], "runtime": []}
    for user_item in serialized_data["points"][0]:
        data_leaderboard["user"].append(user_item["profileId"])
        data_leaderboard["runtime"].append(user_item["runtime"])
    df = pd.DataFrame(data_leaderboard)
    df = df.sort_values("runtime", ascending=False)
    # Render the runtime in seconds as a human-readable "H:MM:SS" string.
    df["runtime"] = df["runtime"].apply(lambda x: str(datetime.timedelta(seconds=x)))
    df.reset_index(drop=True, inplace=True)
    df.rename(columns={"user": "User", "runtime": "Total time contributed"}, inplace=True)
    df["Rank"] = df.index + 1
    df = df.set_index("Rank")
    return df


def get_global_metrics(serialized_data):
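    """Compute headline numbers: total contributors, currently active users
    (a run reported less than MAX_DELTA_ACTIVE_RUN_SEC ago), and total runtime.
    """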
    current_time = datetime.datetime.utcnow()
    num_contributing_users = len(serialized_data["points"][0])
    num_active_users = 0
    total_runtime = 0
    for user_item in serialized_data["points"][0]:
        for run in user_item["activeRuns"]:
            date_run = datetime.datetime.fromisoformat(run["date"])
            delta_time_sec = (current_time - date_run).total_seconds()
            if delta_time_sec < MAX_DELTA_ACTIVE_RUN_SEC:
                # One recently reporting run is enough to count the user as active.
                num_active_users += 1
                break
        total_runtime += user_item["runtime"]
    total_runtime = datetime.timedelta(seconds=total_runtime)
    return {
        "num_contributing_users": num_contributing_users,
        "num_active_users": num_active_users,
        "total_runtime": total_runtime,
    }
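
# Minimal usage sketch (assumes a Streamlit page that imports this module; the
# function names mirror this file, everything else is illustrative):
#
#     serialized_data, profiles = get_new_bubble_data()
#     st.dataframe(get_leaderboard(serialized_data))
#     metrics = get_global_metrics(serialized_data)
#     st.write(f"{metrics['num_active_users']} active / "
#              f"{metrics['num_contributing_users']} contributors")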