File size: 3,977 Bytes
6fa035b
 
69f3076
6fa035b
 
 
 
 
 
 
69f3076
12c49c4
f4c39f1
 
 
5f4d02f
f4c39f1
6fa035b
69f3076
 
 
6fa035b
69f3076
12c49c4
6fa035b
f4c39f1
6fa035b
 
 
 
f4c39f1
6fa035b
 
 
efbfa94
ef5efcc
f4c39f1
 
 
 
 
 
 
 
5f4d02f
f4c39f1
6fa035b
 
 
f4c39f1
6fa035b
69f3076
6fa035b
f4c39f1
69f3076
6fa035b
 
 
69f3076
9c4359c
 
 
69f3076
 
 
6fa035b
f4c39f1
 
 
 
 
a8db45f
6578ed8
bdea4ee
ef5efcc
 
 
 
 
 
 
 
 
 
 
 
 
5f4d02f
ef5efcc
 
 
 
 
5f4d02f
12c49c4
 
 
 
f4c39f1
 
 
ef5efcc
f4c39f1
 
 
4dd0c5b
f4c39f1
6fa035b
 
69f3076
6fa035b
69f3076
 
6fa035b
 
 
69f3076
6fa035b
69f3076
6fa035b
17de97f
f4c39f1
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
from __future__ import annotations

from typing import List, TypedDict

import huggingface_hub
from huggingface_hub.hf_api import SpaceInfo
from concurrent.futures import ThreadPoolExecutor
import os
import json
import datetime
import tqdm
import requests
from pathlib import Path
from screenshot import get_screen_shot
import boto3
from threading import Lock


class SpaceData(TypedDict):
    """Record describing one theme Space, as returned by ``get_info``.

    The keys mirror the dict literal built at the end of ``get_info``.
    """

    # Space repo id.
    id: str
    # Like count copied from the Space info.
    likes: int
    # Commit sha copied from the Space info (presumably a str; TODO confirm
    # it can never be None).
    sha: str
    # Full URL of the form "https://<subdomain>.hf.space/".
    subdomain: str
    # Last-modified value copied from the Space info.
    lastModified: str
    # Space id with "/" replaced by "_"; used as the screenshot file prefix.
    screenshot_id: str
    # Value of the "stage" entry of the Space runtime (e.g. "RUNNING").
    status: str


# Local working copy of the dataset repo, kept in ./data and pulled on import
# so the files read below are current.
repo = huggingface_hub.Repository(
    local_dir="data",
    repo_type="dataset",
    clone_from="freddyaboulton/gradio-theme-subdomains",
    token=os.getenv("HF_TOKEN"),
)
repo.git_pull()

# Previously stored records, indexed by Space id. Path.read_text closes the
# file handle, unlike a bare open() passed straight to json.load.
# NOTE(review): this reads val_subdomains.json while process_spaces writes
# subdomains.json -- confirm the different file names are intentional.
prev_data = {
    s["id"]: s for s in json.loads(Path("data/val_subdomains.json").read_text())
}

# Directory that holds the per-Space screenshot files.
screen_shot_dir = Path("data") / "images"
screen_shot_dir.mkdir(exist_ok=True, parents=True)

s3_client = boto3.client(
    "s3",
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
)
# Held around each get_screen_shot call in get_info so that only one
# screenshot is taken at a time across worker threads.
lock = Lock()


api = huggingface_hub.HfApi(token=os.getenv("HF_TOKEN"))


def get_theme_preview_spaces() -> List[SpaceInfo]:
    """Return all Spaces the Hub lists under the ``gradio-theme`` filter."""
    matching_spaces = api.list_spaces(filter="gradio-theme")
    # Materialise the listing so callers get a list they can len() and reuse.
    return list(matching_spaces)


def get_info(space_name: SpaceInfo) -> SpaceData | None:
    """Fetch the details of one Space and build its record.

    Args:
        space_name: An entry from the Space listing. Despite the name this is
            a ``SpaceInfo`` object; only its ``id`` is read, and the full
            details are fetched again through ``api.space_info``.

    Returns:
        A dict with the keys ``id``, ``likes``, ``sha``, ``lastModified``,
        ``screenshot_id``, ``status`` and ``subdomain``, or ``None`` when the
        entry has no id, the Space is private, or it exposes no subdomain.
    """
    if not space_name.id:
        print(f"no space_name for {space_name}")
        return None
    space_info = api.space_info(space_name.id, token=os.getenv("HF_TOKEN"))
    if space_info.private:
        print(f"{space_name} is private")
        return None
    subdomain: str | None = getattr(space_info, "subdomain", None)
    if subdomain is None:
        print(f"no subdomain for {space_info.id}")
        return None

    # Assumes runtime is a mapping that contains a "stage" entry -- TODO confirm
    # it can never be None for a public Space.
    status = space_info.runtime["stage"]
    img_id = space_info.id.replace("/", "_")
    light_file = str(screen_shot_dir / Path(img_id + "_light.jpg"))
    dark_file = str(screen_shot_dir / Path(img_id + "_dark.jpg"))
    # Screenshot capture is switched off: the condition is hard-coded to False.
    # The branch is kept so it can be re-enabled with the commented condition.
    if False:  # status == "RUNNING":
        prev_sha = prev_data.get(space_info.id, {}).get("sha")
        if not prev_sha or prev_sha != space_info.sha:
            # setdefault: a Space missing from prev_data must not raise KeyError.
            prev_data.setdefault(space_info.id, {})["sha"] = space_info.sha
            shots = (("light", light_file), ("dark", dark_file))
            for theme, shot_path in shots:
                # One screenshot at a time across worker threads.
                with lock:
                    get_screen_shot(
                        f"https://{subdomain}.hf.space?__theme={theme}", 3, shot_path
                    )
            for theme, shot_path in shots:
                s3_client.upload_file(
                    shot_path,
                    "gradio-theme-screenshots",
                    f"{img_id}_{theme}.jpg",
                    ExtraArgs={"ContentType": "image/jpg"},
                )
    if status not in ["SLEEPING", "RUNNING", "RUNNING_BUILDING", "BUILDING"]:
        print(f"Space not running, building, or sleeping {space_info.id}")
    elif status == "SLEEPING":
        # Request the Space page and ignore the response (presumably this wakes
        # the Space up). The timeout keeps a stalled connection from blocking a
        # worker thread forever, and a failed ping must not abort the whole run.
        try:
            requests.get(f"https://huggingface.co/spaces/{space_info.id}", timeout=30)
        except requests.RequestException as err:
            print(f"could not ping sleeping space {space_info.id}: {err}")
    return {
        "id": space_info.id,
        "likes": space_info.likes,
        "sha": space_info.sha,
        "lastModified": space_info.lastModified,
        "screenshot_id": img_id,
        "status": status,
        "subdomain": f"https://{subdomain}.hf.space/"
    }  # type: ignore


def get_all_info(spaces: List[SpaceInfo]) -> List[SpaceData]:
    """Run ``get_info`` over every Space on a thread pool and keep the hits.

    Args:
        spaces: The Space listing entries to look up.

    Returns:
        The truthy results of ``get_info`` in input order; ``None`` results
        (skipped Spaces) are dropped.
    """
    with ThreadPoolExecutor(max_workers=10) as pool:
        pending = pool.map(get_info, spaces)
        # tqdm only wraps the iterator to show progress; total is needed
        # because the map result has no length.
        with_progress = tqdm.tqdm(pending, total=len(spaces))
        collected = list(with_progress)
    return list(filter(None, collected))


def process_spaces():
    """Collect info for all theme Spaces, write it to JSON and push the repo.

    Writes the records to ``data/subdomains.json`` and then starts a
    non-blocking push of the local repo with a timestamped commit message.
    """
    theme_spaces = get_theme_preview_spaces()

    all_info = get_all_info(theme_spaces)

    # Close (and thereby flush) the file before pushing, so the commit is
    # guaranteed to see the complete JSON rather than relying on garbage
    # collection of an unreferenced file object.
    with open("data/subdomains.json", "w") as out_file:
        json.dump(all_info, out_file)
    repo.push_to_hub(
        blocking=False, commit_message=f"Updating data at {datetime.datetime.now()}"
    )