"""Collect metadata about Hugging Face Spaces and render it as a table.

``DemoInfo`` holds the metadata of one Space, ``get_df_from_yaml`` builds a
raw ``DataFrame`` from a YAML listing, ``Prettifier`` turns raw values into
HTML/markdown snippets, and ``DemoList`` exposes filtering on top of both.
"""

import dataclasses
import datetime
import operator
import pathlib

import pandas as pd
import tqdm.auto
import yaml
from huggingface_hub import HfApi

from constants import (OWNER_CHOICES, SLEEP_TIME_INT_TO_STR,
                       SLEEP_TIME_STR_TO_INT, WHOAMI)


@dataclasses.dataclass(frozen=True)
class DemoInfo:
    """Immutable record describing a single Space.

    ``last_modified`` and ``created`` are passed through
    :meth:`convert_timestamp` right after construction.
    """

    space_id: str
    url: str
    title: str
    owner: str
    sdk: str
    sdk_version: str
    likes: int
    status: str
    last_modified: str
    sleep_time: int
    replicas: int
    private: bool
    hardware: str
    suggested_hardware: str
    created: str = ''
    arxiv: list[str] = dataclasses.field(default_factory=list)
    github: list[str] = dataclasses.field(default_factory=list)
    tags: list[str] = dataclasses.field(default_factory=list)

    def __post_init__(self):
        # The dataclass is frozen, so plain attribute assignment would raise;
        # object.__setattr__ bypasses the frozen guard.
        for name in ('last_modified', 'created'):
            object.__setattr__(self, name,
                               self.convert_timestamp(getattr(self, name)))

    @staticmethod
    def convert_timestamp(timestamp: str) -> str:
        """Reformat an ISO-like timestamp as ``YYYY/MM/DD HH:MM:SS``.

        Args:
            timestamp: Text in the form ``%Y-%m-%dT%H:%M:%S.%fZ``.

        Returns:
            The reformatted text. Anything that does not parse (another
            format, an empty string, or a non-string value) is returned
            unchanged, which also makes the conversion idempotent.
        """
        try:
            parsed = datetime.datetime.strptime(timestamp,
                                                '%Y-%m-%dT%H:%M:%S.%fZ')
        except (TypeError, ValueError):
            # TypeError covers non-string inputs, e.g. a date that the YAML
            # loader already parsed into a date object.
            return timestamp
        return parsed.strftime('%Y/%m/%d %H:%M:%S')

    @classmethod
    def from_space_id(cls, space_id: str) -> 'DemoInfo':
        """Build a record from the Hub's ``space_info`` for ``space_id``.

        Args:
            space_id: Repo id; the part before the first ``/`` is used as
                the owner.

        Returns:
            A ``DemoInfo`` with ``created``, ``arxiv``, ``github`` and
            ``tags`` left at their defaults.
        """
        api = HfApi()
        space_info = api.space_info(repo_id=space_id)
        card = space_info.cardData
        runtime = space_info.runtime
        resources = runtime['resources']
        hardware = runtime['hardware']
        return cls(
            space_id=space_id,
            url=f'https://huggingface.co/spaces/{space_id}',
            title=card.get('title', ''),
            owner=space_id.split('/')[0],
            sdk=card['sdk'],
            sdk_version=card.get('sdk_version', ''),
            likes=space_info.likes,
            status=runtime['stage'],
            last_modified=space_info.lastModified,
            sleep_time=runtime['gcTimeout'] or 0,
            replicas=resources['replicas'] if resources is not None else 0,
            private=space_info.private,
            # Fall back to the requested hardware when none is current.
            hardware=hardware['current'] or hardware['requested'],
            suggested_hardware=card.get('suggested_hardware', ''),
        )


def get_df_from_yaml(path: pathlib.Path | str) -> pd.DataFrame:
    """Load a YAML mapping of Space ids and return one row per Space.

    Each key is a Space id; its value is a mapping of ``DemoInfo`` field
    overrides that take precedence over the values fetched from the Hub.

    Args:
        path: Location of the YAML file.

    Returns:
        A frame whose columns are the ``DemoInfo`` fields, in declaration
        order. An empty file yields an empty frame with those columns.
    """
    with pathlib.Path(path).open(encoding='utf-8') as f:
        # safe_load returns None for an empty document.
        data = yaml.safe_load(f) or {}

    rows = []
    for space_id, overrides in tqdm.auto.tqdm(data.items()):
        base_info = DemoInfo.from_space_id(space_id)
        # An entry written as a bare key has a None value; treat it as
        # "no overrides" instead of failing on ``dict | None``.
        merged = dataclasses.asdict(base_info) | (overrides or {})
        rows.append(dataclasses.asdict(DemoInfo(**merged)))

    columns = [field.name for field in dataclasses.fields(DemoInfo)]
    return pd.DataFrame(rows, columns=columns)


class Prettifier:
    """Convert raw ``DemoInfo`` columns into HTML/markdown snippets.

    NOTE(review): the reviewed copy of this file had its HTML tags stripped
    (``url``, ``class_name`` and ``s`` were computed but never used). The
    ``<a>``/``<div>`` markup below is reconstructed from those leftovers;
    the ``multiple-replicas`` and ``sleep-time-*`` class names and the
    ``target`` attribute should be confirmed against the app's stylesheet.
    """

    @staticmethod
    def get_arxiv_link(links: list[str]) -> str:
        """Return one link per line, labelled with the URL's last segment."""
        return '\n'.join(
            Prettifier.create_link(link.split('/')[-1], link)
            for link in links)

    @staticmethod
    def get_github_link(links: list[str]) -> str:
        """Return one link per line, each labelled ``github``."""
        return '\n'.join(
            Prettifier.create_link('github', link) for link in links)

    @staticmethod
    def get_tag_list(tags: list[str]) -> str:
        """Join tags with ``', '``."""
        return ', '.join(tags)

    @staticmethod
    def create_link(text: str, url: str) -> str:
        """Return an anchor element showing ``text`` and pointing at ``url``."""
        return f'<a href="{url}" target="_blank">{text}</a>'

    @staticmethod
    def to_div(text: str | None, category_name: str) -> str:
        """Wrap ``text`` in a div classed ``<category_name>-<text lowercased>``.

        ``None`` is treated as an empty string.
        """
        if text is None:
            text = ''
        class_name = f'{category_name}-{text.lower()}'
        return f'<div class="{class_name}">{text}</div>'

    @staticmethod
    def add_div_tag_to_replicas(replicas: int) -> str:
        """Render the replica count; only counts above one get a div."""
        if replicas == 0:
            return ''
        if replicas == 1:
            return '1'
        return f'<div class="multiple-replicas">{replicas}</div>'

    @staticmethod
    def add_div_tag_to_sleep_time(sleep_time_s: str, hardware: str) -> str:
        """Wrap the sleep-time label in a div.

        ``cpu-basic`` hardware gets a fixed class; otherwise the class is
        derived from the label with spaces replaced by dashes.
        """
        if hardware == 'cpu-basic':
            return f'<div class="sleep-time-cpu-basic">{sleep_time_s}</div>'
        s = sleep_time_s.replace(' ', '-')
        return f'<div class="sleep-time-{s}">{sleep_time_s}</div>'

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with the display columns prettified.

        Columns not listed below are carried over unchanged, and the
        column order of ``df`` is preserved.
        """
        new_rows = []
        for _, row in df.iterrows():
            new_row = dict(row) | {
                'status':
                self.to_div(row['status'], 'status'),
                'hardware':
                self.to_div(row['hardware'], 'hardware'),
                'suggested_hardware':
                self.to_div(row['suggested_hardware'], 'hardware'),
                'title':
                self.create_link(row['title'], row['url']),
                'owner':
                self.create_link(row['owner'],
                                 f'https://huggingface.co/{row["owner"]}'),
                'sdk':
                self.to_div(row['sdk'], 'sdk'),
                'sleep_time':
                self.add_div_tag_to_sleep_time(
                    SLEEP_TIME_INT_TO_STR[row['sleep_time']],
                    row['hardware']),
                'replicas':
                self.add_div_tag_to_replicas(row['replicas']),
                'arxiv':
                self.get_arxiv_link(row['arxiv']),
                'github':
                self.get_github_link(row['github']),
                'tags':
                self.get_tag_list(row['tags']),
            }
            new_rows.append(new_row)
        return pd.DataFrame(new_rows, columns=df.columns)


class DemoList:
    """Raw and prettified views of the Space table, plus filtering."""

    # (column name, display datatype) pairs, in display order.
    COLUMN_INFO = [
        ['status', 'markdown'],
        ['hardware', 'markdown'],
        ['title', 'markdown'],
        ['owner', 'markdown'],
        ['arxiv', 'markdown'],
        ['github', 'markdown'],
        ['likes', 'number'],
        ['tags', 'str'],
        ['last_modified', 'str'],
        ['created', 'str'],
        ['sdk', 'markdown'],
        ['sdk_version', 'str'],
        ['suggested_hardware', 'markdown'],
        ['sleep_time', 'markdown'],
        ['replicas', 'markdown'],
        ['private', 'bool'],
    ]

    def __init__(self, df: pd.DataFrame):
        """Keep ``df`` as the raw table and precompute its prettified view."""
        self.df_raw = df
        self._prettifier = Prettifier()
        self.df_prettified = self._prettifier(df).loc[:, self.column_names]

    @property
    def column_names(self):
        """Column names from ``COLUMN_INFO``, in display order."""
        return list(map(operator.itemgetter(0), self.COLUMN_INFO))

    @property
    def column_datatype(self):
        """Datatypes from ``COLUMN_INFO``, aligned with ``column_names``."""
        return list(map(operator.itemgetter(1), self.COLUMN_INFO))

    def filter(
        self,
        status: list[str],
        hardware: list[str],
        sleep_time: list[str],
        multiple_replicas: bool,
        sdk: list[str],
        visibility: list[str],
        owner: list[str],
    ) -> pd.DataFrame:
        """Return the prettified rows that satisfy every given criterion.

        Args:
            status: Accepted ``status`` values.
            hardware: Accepted ``hardware`` values.
            sleep_time: Accepted sleep-time labels (keys of
                ``SLEEP_TIME_STR_TO_INT``).
            multiple_replicas: If true, keep only rows with ``replicas > 1``.
            sdk: Accepted ``sdk`` values.
            visibility: Exactly ``['public']`` or ``['private']`` restricts
                by the ``private`` flag; any other value applies no
                visibility filter.
            owner: If it covers all of ``OWNER_CHOICES`` no owner filter is
                applied; otherwise rows owned by ``WHOAMI`` are kept when
                ``WHOAMI`` is listed and excluded when it is not.

        Returns:
            The matching rows, prettified and restricted to
            ``column_names``.

        Raises:
            KeyError: If a ``sleep_time`` label is not a known key.
        """
        raw = self.df_raw
        sleep_time_int = [SLEEP_TIME_STR_TO_INT[s] for s in sleep_time]

        # Build a single mask on the raw frame and apply it once, so the
        # mask index always matches the frame being indexed.
        mask = (raw.status.isin(status)
                & raw.hardware.isin(hardware)
                & raw.sdk.isin(sdk)
                & raw.sleep_time.isin(sleep_time_int))

        if multiple_replicas:
            mask &= raw.replicas > 1

        if visibility == ['public']:
            # astype(bool) keeps ``~`` a logical NOT for object columns too.
            mask &= ~raw.private.astype(bool)
        elif visibility == ['private']:
            mask &= raw.private.astype(bool)

        if set(owner) != set(OWNER_CHOICES):
            is_mine = raw.owner == WHOAMI
            mask &= is_mine if WHOAMI in owner else ~is_mine

        return self._prettifier(raw[mask]).loc[:, self.column_names]