"""Collect metadata about Gradio theme Spaces and publish it to a dataset repo.

Importing this module clones/pulls the dataset repository into ``data/``,
loads the previously published records, and creates the Hugging Face and S3
clients. Call :func:`process_spaces` to rebuild ``data/subdomains.json`` and
push it to the Hub.
"""
from __future__ import annotations

import datetime
import json
import os
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from threading import Lock
from typing import List, TypedDict

import boto3
import huggingface_hub
import requests
import tqdm
from huggingface_hub.hf_api import SpaceInfo

from screenshot import get_screen_shot


class SpaceData(TypedDict):
    """Record written to ``data/subdomains.json`` for one theme Space."""

    id: str
    likes: int
    sha: str  # revision the record was built from
    subdomain: str  # full "https://<subdomain>.hf.space/" URL
    lastModified: str  # formatted as YYYY-MM-DD
    screenshot_id: str  # Space id with "/" replaced by "_"
    status: str  # runtime stage reported by the Hub


# S3 bucket that receives the light/dark theme screenshots.
_SCREENSHOT_BUCKET = "gradio-theme-screenshots"

# Runtime stages that do not trigger the "not running" warning.
_EXPECTED_STAGES = ("SLEEPING", "RUNNING", "RUNNING_BUILDING", "BUILDING")

# Screenshot capture/upload was hard-disabled (``if False:``) in the original
# script; it stays off here. Flip to True to refresh screenshots of running
# Spaces whose sha changed since the previous run.
_REFRESH_SCREENSHOTS = False

# Seconds to wait for the best-effort request that wakes a sleeping Space.
_WAKE_TIMEOUT_SECONDS = 30

repo = huggingface_hub.Repository(
    local_dir="data",
    repo_type="dataset",
    clone_from="freddyaboulton/gradio-theme-subdomains",
    token=os.getenv("HF_TOKEN"),
)
repo.git_pull()

# Records from the previous run, keyed by Space id. Use a context manager so
# the file handle is closed instead of being left to the garbage collector.
with open("data/val_subdomains.json", encoding="utf-8") as _prev_file:
    prev_data = {s["id"]: s for s in json.load(_prev_file)}

screen_shot_dir = Path("data") / "images"
screen_shot_dir.mkdir(exist_ok=True, parents=True)

s3_client = boto3.client(
    "s3",
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
)

# Serializes calls to get_screen_shot across worker threads.
lock = Lock()

api = huggingface_hub.HfApi(token=os.getenv("HF_TOKEN"))


def get_theme_preview_spaces() -> List[SpaceInfo]:
    """Return every Space listed by the Hub under the ``gradio-theme`` filter."""
    return list(api.list_spaces(filter="gradio-theme"))


def _refresh_screenshots(space_info: SpaceInfo, subdomain: str, img_id: str) -> None:
    """Re-capture and upload light/dark screenshots if the Space's sha changed.

    Nothing happens when the sha stored in ``prev_data`` matches the current
    one. Otherwise the new sha is recorded, both themes are captured (one
    capture at a time, guarded by ``lock``) and the images are uploaded to S3.
    """
    previous_sha = prev_data.get(space_info.id, {}).get("sha")
    if previous_sha and previous_sha == space_info.sha:
        return

    # setdefault: a Space seen for the first time has no entry in prev_data,
    # so plain indexing would raise KeyError.
    prev_data.setdefault(space_info.id, {})["sha"] = space_info.sha

    for theme in ("light", "dark"):
        file_name = f"{img_id}_{theme}.jpg"
        local_path = str(screen_shot_dir / Path(file_name))
        with lock:
            get_screen_shot(
                f"https://{subdomain}.hf.space?__theme={theme}", 3, local_path
            )

    # Uploads happen after both captures, matching the original ordering.
    for theme in ("light", "dark"):
        file_name = f"{img_id}_{theme}.jpg"
        s3_client.upload_file(
            str(screen_shot_dir / Path(file_name)),
            _SCREENSHOT_BUCKET,
            file_name,
            ExtraArgs={"ContentType": "image/jpg"},
        )


def get_info(space_name: SpaceInfo) -> SpaceData | None:
    """Build the published record for a single Space.

    Args:
        space_name: Listing entry for the Space (despite the name, this is a
            ``SpaceInfo`` object, not a string).

    Returns:
        The record for the Space, or ``None`` when the Space has no id, is
        private, has no subdomain, or reports no runtime information. Each
        skip prints a short explanation.
    """
    if not space_name.id:
        print(f"no space_name for {space_name}")
        return None

    # The listing entry is re-fetched to get the full details used below.
    space_info = api.space_info(space_name.id, token=os.getenv("HF_TOKEN"))
    if space_info.private:
        print(f"{space_name} is private")
        return None

    subdomain: str | None = getattr(space_info, "subdomain", None)
    if subdomain is None:
        print(f"no subdomain for {space_info.id}")
        return None

    # Without this guard a missing runtime raises AttributeError, which
    # propagates out of executor.map and aborts the whole run.
    runtime = getattr(space_info, "runtime", None)
    if runtime is None:
        print(f"no runtime for {space_info.id}")
        return None
    status = runtime.stage

    img_id = space_info.id.replace("/", "_")

    if _REFRESH_SCREENSHOTS and status == "RUNNING":
        _refresh_screenshots(space_info, subdomain, img_id)

    if status not in _EXPECTED_STAGES:
        print(f"Space not running, building, or sleeping {space_info.id}")
    elif status == "SLEEPING":
        # Best-effort request to the Space page, presumably to wake it up.
        # A timeout keeps a stalled connection from pinning a worker thread,
        # and a failed ping must not discard the record.
        try:
            requests.get(
                f"https://huggingface.co/spaces/{space_info.id}",
                timeout=_WAKE_TIMEOUT_SECONDS,
            )
        except requests.RequestException as err:
            print(f"could not wake {space_info.id}: {err}")

    return {
        "id": space_info.id,
        "likes": space_info.likes,
        "sha": space_info.sha,
        "lastModified": space_info.lastModified.strftime("%Y-%m-%d"),
        "screenshot_id": img_id,
        "status": status,
        "subdomain": f"https://{subdomain}.hf.space/",
    }  # type: ignore


def get_all_info(spaces: List[SpaceInfo]) -> List[SpaceData]:
    """Fetch records for ``spaces`` on 10 worker threads, dropping skipped ones.

    Progress is reported through a tqdm bar. Spaces for which
    :func:`get_info` returned ``None`` are omitted from the result.
    """
    with ThreadPoolExecutor(max_workers=10) as executor:
        all_info = list(tqdm.tqdm(executor.map(get_info, spaces), total=len(spaces)))
    return [info for info in all_info if info]


def process_spaces():
    """Rebuild ``data/subdomains.json`` and push the dataset repo to the Hub."""
    theme_spaces = get_theme_preview_spaces()
    all_info = get_all_info(theme_spaces)

    # Close (and therefore flush) the file before pushing; an unclosed handle
    # can leave the JSON partially written when the commit is created.
    with open("data/subdomains.json", "w", encoding="utf-8") as out_file:
        json.dump(all_info, out_file)

    repo.push_to_hub(
        blocking=False, commit_message=f"Updating data at {datetime.datetime.now()}"
    )