"""Build display and filter tables for the Spaces listed in ``list.yaml``."""

import datetime
import operator
import pathlib

import pandas as pd
import tqdm.auto
import yaml
from huggingface_hub import HfApi

from constants import (OWNER_CHOICES, SLEEP_TIME_INT_TO_STR,
                       SLEEP_TIME_STR_TO_INT, WHOAMI)

repo_dir = pathlib.Path(__file__).parent


class DemoList:
    """Table of the Spaces listed in ``list.yaml`` plus their Hub metadata.

    Attributes:
        api: Hub client used to query each Space.
        df_raw: One row per Space holding plain values; used for filtering.
        df: The same rows, in the same order, rendered for display and
            restricted to the columns named in ``COLUMN_INFO``.
    """

    # ``[column name, datatype]`` pairs, in display order. The two halves
    # are exposed separately through ``column_names``/``column_datatype``.
    COLUMN_INFO = [
        ['status', 'markdown'],
        ['hardware', 'markdown'],
        ['title', 'markdown'],
        ['owner', 'markdown'],
        ['arxiv', 'markdown'],
        ['github', 'markdown'],
        ['likes', 'number'],
        ['tags', 'str'],
        ['last_modified', 'str'],
        ['created', 'str'],
        ['sdk', 'markdown'],
        ['sdk_version', 'str'],
        ['suggested_hardware', 'markdown'],
        ['sleep_time', 'markdown'],
        ['replicas', 'markdown'],
    ]

    def __init__(self) -> None:
        """Query the Hub for every listed Space and build both tables."""
        self.api = HfApi()
        self._raw_data = self.load_data()
        self.df_raw = pd.DataFrame(self._raw_data)
        self.df = self.prettify_df()

    @property
    def column_names(self) -> list[str]:
        """Names of the display columns, in display order."""
        return list(map(operator.itemgetter(0), self.COLUMN_INFO))

    @property
    def column_datatype(self) -> list[str]:
        """Datatype of each display column, aligned with ``column_names``."""
        return list(map(operator.itemgetter(1), self.COLUMN_INFO))

    @staticmethod
    def get_space_id(url: str) -> str:
        """Return the last two path segments of ``url`` joined by ``/``.

        A trailing ``/`` is ignored so that ``.../owner/name/`` still
        yields ``owner/name``.
        """
        return '/'.join(url.rstrip('/').split('/')[-2:])

    def load_data(self) -> list[dict]:
        """Read ``list.yaml`` and merge every entry with its Hub metadata.

        ``list.yaml`` is read as a mapping from Space URL to a dict of extra
        fields; those fields are kept, and values fetched from the Hub
        override them on key collisions.

        Returns:
            One dict per Space. Spaces whose sleep time is not a key of
            ``SLEEP_TIME_INT_TO_STR`` are reported on stdout and skipped.
        """
        with open(repo_dir / 'list.yaml', encoding='utf-8') as f:
            # An empty file loads as `None`; treat it as "no Spaces".
            data = yaml.safe_load(f) or {}

        res = []
        for url in tqdm.auto.tqdm(list(data)):
            space_id = self.get_space_id(url)
            space_info = self.api.space_info(repo_id=space_id)
            card = space_info.cardData
            if card is None:
                # No card metadata available: fall back to defaults below
                # instead of failing on `None`.
                card = {}
            info: dict = (data[url] or {}) | {
                'url': url,
                'title': card.get('title', space_id),
                'owner': space_id.split('/')[0],
                'sdk': card.get('sdk'),
                'sdk_version': card.get('sdk_version', ''),
                'likes': space_info.likes,
                'private': space_info.private,
                'last_modified': space_info.lastModified,
                'status': space_info.runtime['stage'],
                'suggested_hardware': card.get('suggested_hardware', ''),
            }
            # Link/tag fields are iterated later, so a missing or null
            # value is normalized to an empty list.
            for tag in ['arxiv', 'github', 'tags']:
                if info.get(tag) is None:
                    info[tag] = []

            # `current` of paused Spaces is `None`, but `requested` is not
            info['hardware'] = space_info.runtime['hardware']['current']
            if info['hardware'] is None:
                info['hardware'] = space_info.runtime['hardware']['requested']

            # `gcTimeout` is `None` for `cpu-basic` Spaces and Spaces
            # with "Don't sleep" sleep time.
            # We use `-1` to represent it.
            info['sleep_time'] = space_info.runtime['gcTimeout'] or -1
            if info['sleep_time'] not in SLEEP_TIME_INT_TO_STR:
                print(space_id)
                print(f'Unknown sleep time: {info["sleep_time"]}')
                continue

            # `resources` of paused Spaces is `None`
            resources = space_info.runtime['resources']
            info['replicas'] = (-1 if resources is None else
                                resources['replicas'])
            res.append(info)
        return res

    def get_arxiv_link(self, links: list[str]) -> str:
        """Render each link as an anchor labelled with its last path segment.

        The anchors are joined with newlines.
        """
        anchors = [
            self.create_link(link.rstrip('/').split('/')[-1], link)
            for link in links
        ]
        return '\n'.join(anchors)

    def get_github_link(self, links: list[str]) -> str:
        """Render each link as an anchor labelled ``github``.

        The anchors are joined with newlines.
        """
        anchors = [self.create_link('github', link) for link in links]
        return '\n'.join(anchors)

    def get_tag_list(self, tags: list[str]) -> str:
        """Join ``tags`` into one comma-separated string."""
        return ', '.join(tags)

    @staticmethod
    def create_link(text: str, url: str) -> str:
        """Return an HTML anchor showing ``text`` and pointing at ``url``."""
        # NOTE(review): `url` was previously ignored, so no link was
        # produced. `target="_blank"` is an assumption - confirm it is
        # wanted where this markup is rendered.
        return f'<a href="{url}" target="_blank">{text}</a>'

    def to_div(self, text: str | None, category_name: str) -> str:
        """Wrap ``text`` in a ``<div>`` with a category-specific class.

        The class is ``<category_name>-<text lower-cased>``; ``None`` is
        rendered as an empty string.
        """
        if text is None:
            text = ''
        class_name = f'{category_name}-{text.lower()}'
        return f'<div class="{class_name}">{text}</div>'

    @staticmethod
    def format_timestamp(timestamp: str | datetime.datetime) -> str:
        """Format a timestamp as ``YYYY/MM/DD HH:MM:SS``.

        Args:
            timestamp: Either a ``datetime`` or a string shaped like
                ``2023-01-02T03:04:05.000Z``; the fractional part may hold
                any digits or be absent.

        Raises:
            ValueError: If a string matches none of the accepted layouts.
        """
        out_format = '%Y/%m/%d %H:%M:%S'
        if isinstance(timestamp, datetime.datetime):
            return timestamp.strftime(out_format)
        # `%f` accepts one to six fractional digits, so this also covers
        # the `.000Z` suffix that used to be hard-coded.
        for in_format in ('%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%SZ'):
            try:
                parsed = datetime.datetime.strptime(timestamp, in_format)
            except ValueError:
                continue
            return parsed.strftime(out_format)
        raise ValueError(f'Unsupported timestamp format: {timestamp!r}')

    @staticmethod
    def add_div_tag_to_replicas(replicas: int) -> str:
        """Render a replica count for display.

        ``-1`` (the marker stored when the count is unknown) becomes an
        empty string, ``1`` is shown bare, and any other value is wrapped
        in a ``<div>``.
        """
        if replicas == -1:
            return ''
        if replicas == 1:
            return '1'
        # NOTE(review): the class name is an assumption - confirm it
        # matches the stylesheet used with this table.
        return f'<div class="multiple-replicas">{replicas}</div>'

    @staticmethod
    def add_div_tag_to_sleep_time(sleep_time_s: str, hardware: str) -> str:
        """Wrap a sleep-time label in a ``<div>`` with a matching class.

        ``cpu-basic`` hardware gets a dedicated class; otherwise the class
        is derived from the label with spaces replaced by ``-``.
        """
        if hardware == 'cpu-basic':
            # NOTE(review): the class name is an assumption - confirm it
            # matches the stylesheet used with this table.
            return f'<div class="sleep-time-cpu-basic">{sleep_time_s}</div>'
        s = sleep_time_s.replace(' ', '-')
        # NOTE(review): the `sleep-time-` prefix is an assumption as well.
        return f'<div class="sleep-time-{s}">{sleep_time_s}</div>'

    def prettify_df(self) -> pd.DataFrame:
        """Build the display table from ``df_raw``.

        Returns:
            A frame with one row per row of ``df_raw``, in the same order,
            containing exactly the columns of ``column_names``. An empty
            ``df_raw`` yields an empty frame with those columns.
        """
        new_rows = []
        for _, row in self.df_raw.iterrows():
            # Key access avoids clashes between column names and
            # `pd.Series` attributes.
            new_rows.append({
                'status':
                self.to_div(row['status'], 'status'),
                'hardware':
                self.to_div(row['hardware'], 'hardware'),
                'suggested_hardware':
                self.to_div(row['suggested_hardware'], 'hardware'),
                'title':
                self.create_link(row['title'], row['url']),
                'owner':
                self.create_link(row['owner'],
                                 f'https://huggingface.co/{row["owner"]}'),
                'arxiv':
                self.get_arxiv_link(row['arxiv']),
                'github':
                self.get_github_link(row['github']),
                'likes':
                row['likes'],
                'tags':
                self.get_tag_list(row['tags']),
                'last_modified':
                self.format_timestamp(row['last_modified']),
                'created':
                self.format_timestamp(row['created']),
                'sdk':
                self.to_div(row['sdk'], 'sdk'),
                'sdk_version':
                row['sdk_version'],
                'sleep_time':
                self.add_div_tag_to_sleep_time(
                    SLEEP_TIME_INT_TO_STR[row['sleep_time']],
                    row['hardware']),
                'replicas':
                self.add_div_tag_to_replicas(row['replicas']),
            })
        # Passing `columns` fixes the column order and, unlike `.loc`,
        # also works when there are no rows at all.
        return pd.DataFrame(new_rows, columns=self.column_names)

    def apply_filter(
        self,
        status: list[str],
        hardware: list[str],
        sleep_time: list[str],
        multiple_replicas: bool,
        sdk: list[str],
        visibility: list[str],
        owner: list[str],
    ) -> pd.DataFrame:
        """Return the rows of ``df`` whose raw values pass every filter.

        Args:
            status: Raw status values to keep.
            hardware: Raw hardware values to keep.
            sleep_time: Sleep-time labels to keep; each must be a key of
                ``SLEEP_TIME_STR_TO_INT``.
            multiple_replicas: If true, keep only rows with more than one
                replica.
            sdk: Raw SDK values to keep.
            visibility: ``['public']`` keeps only non-private rows and
                ``['private']`` only private rows; any other value applies
                no visibility filter.
            owner: If it contains the same values as ``OWNER_CHOICES``, no
                owner filter is applied. Otherwise, if it contains
                ``WHOAMI`` only rows owned by ``WHOAMI`` are kept, else
                only rows not owned by ``WHOAMI``.

        Returns:
            The matching rows of the display table.
        """
        df_raw = self.df_raw
        if df_raw.empty:
            # Nothing to filter (and the raw columns may not even exist).
            return self.df

        # All conditions are combined into one mask over `df_raw` and the
        # display frame is indexed once. Indexing an already-filtered frame
        # with a full-length mask makes pandas warn about reindexing.
        sleep_time_int = [SLEEP_TIME_STR_TO_INT[s] for s in sleep_time]
        mask = (df_raw['status'].isin(status)
                & df_raw['hardware'].isin(hardware)
                & df_raw['sdk'].isin(sdk)
                & df_raw['sleep_time'].isin(sleep_time_int))

        if multiple_replicas:
            mask = mask & (df_raw['replicas'] > 1)

        if visibility == ['public']:
            mask = mask & ~df_raw['private']
        elif visibility == ['private']:
            mask = mask & df_raw['private']

        if set(owner) != set(OWNER_CHOICES):
            if WHOAMI in owner:
                mask = mask & (df_raw['owner'] == WHOAMI)
            else:
                mask = mask & (df_raw['owner'] != WHOAMI)

        return self.df[mask]