Spaces:
Sleeping
Sleeping
File size: 3,977 Bytes
6fa035b 69f3076 6fa035b 69f3076 12c49c4 f4c39f1 5f4d02f f4c39f1 6fa035b 69f3076 6fa035b 69f3076 12c49c4 6fa035b f4c39f1 6fa035b f4c39f1 6fa035b efbfa94 ef5efcc f4c39f1 5f4d02f f4c39f1 6fa035b f4c39f1 6fa035b 69f3076 6fa035b f4c39f1 69f3076 6fa035b 69f3076 9c4359c 69f3076 6fa035b f4c39f1 a8db45f 6578ed8 bdea4ee ef5efcc 5f4d02f ef5efcc 5f4d02f 12c49c4 f4c39f1 ef5efcc f4c39f1 4dd0c5b f4c39f1 6fa035b 69f3076 6fa035b 69f3076 6fa035b 69f3076 6fa035b 69f3076 6fa035b 17de97f f4c39f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
from __future__ import annotations
from typing import List, TypedDict
import huggingface_hub
from huggingface_hub.hf_api import SpaceInfo
from concurrent.futures import ThreadPoolExecutor
import os
import json
import datetime
import tqdm
import requests
from pathlib import Path
from screenshot import get_screen_shot
import boto3
from threading import Lock
class SpaceData(TypedDict):
    """Shape of the per-space record returned by ``get_info``.

    The field set mirrors the keys of the dictionary that ``get_info``
    builds, including ``sha`` and ``screenshot_id``.
    """

    # Repository id of the space (contains a "/").
    id: str
    # Like count copied from the space info.
    likes: int
    # Full URL of the running app: "https://<subdomain>.hf.space/".
    subdomain: str
    # Last-modified value copied verbatim from the space info.
    lastModified: str
    # Runtime stage string, e.g. "RUNNING", "SLEEPING", "BUILDING".
    status: str
    # Commit sha copied from the space info.
    sha: str
    # Space id with "/" replaced by "_"; used as the screenshot file stem.
    screenshot_id: str
# Local checkout (in ./data) of the dataset repo that holds the generated
# metadata; pulled at import time so the files read below are up to date.
repo = huggingface_hub.Repository(
    local_dir="data",
    repo_type="dataset",
    clone_from="freddyaboulton/gradio-theme-subdomains",
    token=os.getenv("HF_TOKEN"),
)
repo.git_pull()

# Previously recorded entries, keyed by space id. The file is opened in a
# context manager so the handle is closed as soon as the JSON is parsed.
# NOTE(review): this reads "val_subdomains.json" while process_spaces writes
# "subdomains.json" -- confirm the two names are intentionally different.
with open("data/val_subdomains.json", encoding="utf-8") as prev_file:
    prev_data = {s["id"]: s for s in json.load(prev_file)}

# Directory where screenshots are written before being uploaded.
screen_shot_dir = Path("data") / "images"
screen_shot_dir.mkdir(exist_ok=True, parents=True)

# S3 client used to upload screenshots; credentials come from the environment.
s3_client = boto3.client(
    "s3",
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
)

# Held around each get_screen_shot call so worker threads take screenshots
# one at a time.
lock = Lock()

# Shared Hub API client, authenticated with HF_TOKEN when it is set.
api = huggingface_hub.HfApi(token=os.getenv("HF_TOKEN"))
def get_theme_preview_spaces() -> List[SpaceInfo]:
    """Return every space the Hub lists under the "gradio-theme" filter."""
    themed_spaces = api.list_spaces(filter="gradio-theme")
    return list(themed_spaces)
def _capture_and_upload_screenshots(space_info: SpaceInfo, img_id: str) -> None:
    """Re-shoot and upload light/dark screenshots if the space's sha changed.

    Does nothing when ``prev_data`` already records a non-empty sha equal to
    ``space_info.sha``. Otherwise records the new sha in ``prev_data``, takes
    one screenshot per theme and uploads both files to the
    "gradio-theme-screenshots" bucket.
    """
    previous_sha = prev_data.get(space_info.id, {}).get("sha")
    if previous_sha and previous_sha == space_info.sha:
        return

    # setdefault: a space with no previous entry is exactly the "no previous
    # sha" case, and plain indexing would raise KeyError for it.
    prev_data.setdefault(space_info.id, {})["sha"] = space_info.sha

    base_url = f"https://{space_info.subdomain}.hf.space"
    local_files = {
        theme: str(screen_shot_dir / f"{img_id}_{theme}.jpg")
        for theme in ("light", "dark")
    }

    # Both screenshots are taken before any upload starts. The lock is taken
    # per call so only one thread runs get_screen_shot at a time.
    # NOTE(review): the meaning of the positional 3 is defined in
    # screenshot.get_screen_shot -- confirm there.
    for theme, local_path in local_files.items():
        with lock:
            get_screen_shot(f"{base_url}?__theme={theme}", 3, local_path)

    # NOTE(review): the registered media type is "image/jpeg"; "image/jpg" is
    # kept unchanged here -- confirm whether consumers rely on it.
    for theme, local_path in local_files.items():
        s3_client.upload_file(
            local_path,
            "gradio-theme-screenshots",
            f"{img_id}_{theme}.jpg",
            ExtraArgs={"ContentType": "image/jpg"},
        )


def get_info(space_name: SpaceInfo) -> SpaceData | None:
    """Fetch details for one space and return them as a plain record.

    Args:
        space_name: Listing entry for the space (a ``SpaceInfo``, despite the
            parameter name); only its ``id`` is used to query the Hub.

    Returns:
        A dictionary with the space's id, likes, sha, lastModified,
        screenshot id, runtime stage and app URL, or ``None`` when the entry
        has no id, the space is private, or it exposes no subdomain.
    """
    if not space_name.id:
        print(f"no space_name for {space_name}")
        return None

    space_info = api.space_info(space_name.id, token=os.getenv("HF_TOKEN"))
    if space_info.private:
        print(f"{space_name} is private")
        return None

    subdomain: str | None = getattr(space_info, "subdomain", None)
    if subdomain is None:
        print(f"no subdomain for {space_info.id}")
        return None

    # NOTE(review): assumes runtime is a mapping with a "stage" key -- confirm
    # against the installed huggingface_hub version.
    status = space_info.runtime["stage"]
    img_id = space_info.id.replace("/", "_")

    # Screenshot capture was switched off in the original code by replacing
    # the `status == "RUNNING"` guard with a literal False. The flag keeps it
    # off while leaving the original condition visible.
    screenshots_enabled = False
    if screenshots_enabled and status == "RUNNING":
        _capture_and_upload_screenshots(space_info, img_id)

    if status not in ("SLEEPING", "RUNNING", "RUNNING_BUILDING", "BUILDING"):
        print(f"Space not running, building, or sleeping {space_info.id}")
    elif status == "SLEEPING":
        # The response is discarded; presumably the request is only meant to
        # wake the space. A timeout and a narrow except keep one slow or
        # failed request from hanging or aborting the whole thread-pool run.
        try:
            requests.get(
                f"https://huggingface.co/spaces/{space_info.id}", timeout=30
            )
        except requests.RequestException as err:
            print(f"request to sleeping space {space_info.id} failed: {err}")

    return {
        "id": space_info.id,
        "likes": space_info.likes,
        "sha": space_info.sha,
        "lastModified": space_info.lastModified,
        "screenshot_id": img_id,
        "status": status,
        "subdomain": f"https://{subdomain}.hf.space/",
    }  # type: ignore
def get_all_info(spaces: List[SpaceInfo]) -> List[SpaceData]:
    """Run ``get_info`` over ``spaces`` on 10 worker threads.

    Results keep the input order; falsy results (spaces that ``get_info``
    skipped) are dropped. A tqdm progress bar tracks completion.
    """
    with ThreadPoolExecutor(max_workers=10) as pool:
        progress = tqdm.tqdm(pool.map(get_info, spaces), total=len(spaces))
        results = list(progress)
    return list(filter(None, results))
def process_spaces():
    """Collect info for all theme spaces, write it to disk and push it.

    Writes the collected records as JSON to ``data/subdomains.json`` and then
    pushes the local repository to the Hub without blocking.
    """
    theme_spaces = get_theme_preview_spaces()
    all_info = get_all_info(theme_spaces)

    # Close (and therefore flush) the file before pushing, so the pushed file
    # is guaranteed to be complete.
    with open("data/subdomains.json", "w", encoding="utf-8") as out_file:
        json.dump(all_info, out_file)

    repo.push_to_hub(
        blocking=False, commit_message=f"Updating data at {datetime.datetime.now()}"
    )
|