File size: 4,175 Bytes
e901392
 
 
 
fe6396c
f8207e5
 
 
fe6396c
e901392
f8207e5
 
 
 
 
 
 
 
 
 
 
 
860ee6e
f8207e5
e901392
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e96101e
e901392
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33c1203
 
 
 
 
e901392
 
 
e6064e2
e901392
e6064e2
 
 
 
e901392
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import os
from datetime import datetime
from typing import Any, Dict, List

import pandas as pd
from dotenv import load_dotenv
from huggingface_hub import HfApi
from huggingface_hub.utils import logging
from tqdm.auto import tqdm

load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")
assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"
USER_AGENT = os.getenv("USER_AGENT")
assert (
    USER_AGENT is not None
), "You need to set USER_AGENT in your environment variables"

logger = logging.get_logger(__name__)

api = HfApi(token=HF_TOKEN)
MAX_DATASETS = None


def has_card_data(dataset):
    return hasattr(dataset, "card_data")


def check_dataset_has_dataset_info(dataset):
    return bool(
        has_card_data(dataset)
        and hasattr(dataset.card_data, "dataset_info")
        and dataset.card_data.dataset_info is not None
    )


def parse_single_config_dataset(data):
    config_name = data.get("config_name", "default")
    features = data.get("features", [])
    column_names = [feature.get("name") for feature in features]
    return {
        "config_name": config_name,
        "column_names": column_names,
        "features": features,
    }


def parse_multiple_config_dataset(data: List[Dict[str, Any]]):
    return [parse_single_config_dataset(d) for d in data]


def parse_dataset(dataset):
    hub_id = dataset.id
    likes = dataset.likes
    downloads = dataset.downloads
    tags = dataset.tags
    created_at = dataset.created_at
    last_modified = dataset.last_modified
    license = dataset.card_data.license
    language = dataset.card_data.language
    return {
        "hub_id": hub_id,
        "likes": likes,
        "downloads": downloads,
        "tags": tags,
        "created_at": created_at,
        "last_modified": last_modified,
        "license": license,
        "language": language,
    }


def parsed_column_info(dataset_info):
    if isinstance(dataset_info, dict):
        return [parse_single_config_dataset(dataset_info)]
    elif isinstance(dataset_info, list):
        return parse_multiple_config_dataset(dataset_info)
    return None


def ensure_list_of_strings(value):
    if value is None:
        return []
    if isinstance(value, list):
        return [str(item) for item in value]
    return [str(value)]


def refresh_data() -> List[Dict[str, Any]]:
    # current date as string
    now = datetime.now()
    # check if a file for the current date exists
    if os.path.exists(f"datasets_{now.strftime('%Y-%m-%d')}.parquet"):
        df = pd.read_parquet(f"datasets_{now.strftime('%Y-%m-%d')}.parquet")
        return df.to_dict(orient="records")

    # List all datasets
    datasets = list(api.list_datasets(limit=MAX_DATASETS, full=True))

    # Filter datasets with dataset info
    datasets = [
        dataset for dataset in tqdm(datasets) if check_dataset_has_dataset_info(dataset)
    ]

    parsed_datasets = []
    for dataset in tqdm(datasets):
        try:
            datasetinfo = parse_dataset(dataset)
            column_info = parsed_column_info(dataset.card_data.dataset_info)
            parsed_datasets.extend({**datasetinfo, **info} for info in column_info)
        except Exception as e:
            print(f"Error processing dataset {dataset.id}: {e}")
            continue

    # Convert to DataFrame
    df = pd.DataFrame(parsed_datasets)

    # Ensure 'license', 'tags', and 'language' are lists of strings
    df["license"] = df["license"].apply(ensure_list_of_strings)
    df["tags"] = df["tags"].apply(ensure_list_of_strings)
    df["language"] = df["language"].apply(ensure_list_of_strings)

    # Ensure 'column_names' is a list
    df["column_names"] = df["column_names"].apply(
        lambda x: x if isinstance(x, list) else []
    )

    df = df.astype({"hub_id": "string", "config_name": "string"})

    # save to parquet file with current date
    # df.to_parquet(f"datasets_{now.strftime('%Y-%m-%d')}.parquet")

    # # save to JSON file with current date
    # df.to_json(
    #     f"datasets_{now.strftime('%Y-%m-%d')}.json", orient="records", lines=True
    # )

    # return a list of dictionaries
    return df.to_dict(orient="records")