list-of-demos / demo_list.py
hysts's picture
hysts HF staff
Update
6e67bdd
raw
history blame
7.39 kB
import datetime
import operator
import pathlib
import pandas as pd
import tqdm.auto
import yaml
from huggingface_hub import HfApi
from constants import (OWNER_CHOICES, SLEEP_TIME_INT_TO_STR,
SLEEP_TIME_STR_TO_INT, WHOAMI)
# Directory that contains this module; `list.yaml` is looked up relative to it.
repo_dir = pathlib.Path(__file__).parents[0]
class DemoList:
COLUMN_INFO = [
['status', 'markdown'],
['hardware', 'markdown'],
['title', 'markdown'],
['owner', 'markdown'],
['arxiv', 'markdown'],
['github', 'markdown'],
['likes', 'number'],
['tags', 'str'],
['last_modified', 'str'],
['created', 'str'],
['sdk', 'markdown'],
['sdk_version', 'str'],
['suggested_hardware', 'markdown'],
['sleep_time', 'markdown'],
['replicas', 'markdown'],
]
def __init__(self):
self.api = HfApi()
self._raw_data = self.load_data()
self.df_raw = pd.DataFrame(self._raw_data)
self.df = self.prettify_df()
@property
def column_names(self):
return list(map(operator.itemgetter(0), self.COLUMN_INFO))
@property
def column_datatype(self):
return list(map(operator.itemgetter(1), self.COLUMN_INFO))
@staticmethod
def get_space_id(url: str) -> str:
return '/'.join(url.split('/')[-2:])
def load_data(self) -> list[dict]:
with open(repo_dir / 'list.yaml') as f:
data = yaml.safe_load(f)
res = []
for url in tqdm.auto.tqdm(list(data)):
space_id = self.get_space_id(url)
space_info = self.api.space_info(repo_id=space_id)
card = space_info.cardData
info: dict = data[url] | {
'url': url,
'title': card['title'] if 'title' in card else space_id,
'owner': space_id.split('/')[0],
'sdk': card['sdk'],
'sdk_version': card.get('sdk_version', ''),
'likes': space_info.likes,
'private': space_info.private,
'last_modified': space_info.lastModified,
'status': space_info.runtime['stage'],
'suggested_hardware': card.get('suggested_hardware', ''),
}
for tag in ['arxiv', 'github', 'tags']:
if tag not in info:
info[tag] = []
# `current` of paused Spaces is `None`, but `requested` is not
info['hardware'] = space_info.runtime['hardware']['current']
if info['hardware'] is None:
info['hardware'] = space_info.runtime['hardware']['requested']
# `gcTimeout` is `None` for `cpu-basic` Spaces and Spaces
# with "Don't sleep" sleep time.
# We use `-1` to represent it.
info['sleep_time'] = space_info.runtime['gcTimeout'] or -1
if info['sleep_time'] not in SLEEP_TIME_INT_TO_STR:
print(space_id)
print(f'Unknown sleep time: {info["sleep_time"]}')
continue
# `resources` of paused Spaces is `None`
resources = space_info.runtime['resources']
info['replicas'] = -1 if resources is None else resources[
'replicas']
res.append(info)
return res
def get_arxiv_link(self, links: list[str]) -> str:
links = [self.create_link(link.split('/')[-1], link) for link in links]
return '\n'.join(links)
def get_github_link(self, links: list[str]) -> str:
links = [self.create_link('github', link) for link in links]
return '\n'.join(links)
def get_tag_list(self, tags: list[str]) -> str:
return ', '.join(tags)
@staticmethod
def create_link(text: str, url: str) -> str:
return f'<a href={url} target="_blank">{text}</a>'
def to_div(self, text: str | None, category_name: str) -> str:
if text is None:
text = ''
class_name = f'{category_name}-{text.lower()}'
return f'<div class="{class_name}">{text}</div>'
@staticmethod
def format_timestamp(timestamp: str) -> str:
s = datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.000Z')
return s.strftime('%Y/%m/%d %H:%M:%S')
@staticmethod
def add_div_tag_to_replicas(replicas: int) -> str:
if replicas == -1:
return ''
if replicas == 1:
return '1'
return f'<div class="multiple-replicas">{replicas}</div>'
@staticmethod
def add_div_tag_to_sleep_time(sleep_time_s: str, hardware: str) -> str:
if hardware == 'cpu-basic':
return f'<div class="sleep-time-cpu-basic">{sleep_time_s}</div>'
s = sleep_time_s.replace(' ', '-')
return f'<div class="sleep-time-{s}">{sleep_time_s}</div>'
def prettify_df(self) -> pd.DataFrame:
new_rows = []
for _, row in self.df_raw.copy().iterrows():
new_row = {
'status':
self.to_div(row.status, 'status'),
'hardware':
self.to_div(row.hardware, 'hardware'),
'suggested_hardware':
self.to_div(row.suggested_hardware, 'hardware'),
'title':
self.create_link(row.title, row.url),
'owner':
self.create_link(row.owner,
f'https://huggingface.co/{row.owner}'),
'arxiv':
self.get_arxiv_link(row.arxiv),
'github':
self.get_github_link(row.github),
'likes':
row.likes,
'tags':
self.get_tag_list(row.tags),
'last_modified':
self.format_timestamp(row.last_modified),
'created':
self.format_timestamp(row.created),
'sdk':
self.to_div(row.sdk, 'sdk'),
'sdk_version':
row.sdk_version,
'sleep_time':
self.add_div_tag_to_sleep_time(
SLEEP_TIME_INT_TO_STR[row.sleep_time], row.hardware),
'replicas':
self.add_div_tag_to_replicas(row.replicas),
}
new_rows.append(new_row)
df = pd.DataFrame(new_rows).loc[:, self.column_names]
return df
def apply_filter(
self,
status: list[str],
hardware: list[str],
sleep_time: list[str],
multiple_replicas: bool,
sdk: list[str],
visibility: list[str],
owner: list[str],
) -> pd.DataFrame:
df_raw = self.df_raw
df = self.df
if multiple_replicas:
df = df[df_raw.replicas > 1]
if visibility == ['public']:
df = df[~df_raw.private]
elif visibility == ['private']:
df = df[df_raw.private]
df = df[(df_raw.status.isin(status)) & (df_raw.hardware.isin(hardware))
& (df_raw.sdk.isin(sdk))]
sleep_time_int = [SLEEP_TIME_STR_TO_INT[s] for s in sleep_time]
df = df[df_raw.sleep_time.isin(sleep_time_int)]
if set(owner) == set(OWNER_CHOICES):
pass
elif WHOAMI in owner:
df = df[df_raw.owner == WHOAMI]
else:
df = df[df_raw.owner != WHOAMI]
return df