"""Helpers for the Hugging Face dataset viewer API.

The functions here query the HTTP API at ``DATASET_VIEWER_API_URL`` for
dataset metadata and Parquet file listings, and read rows from those Parquet
files through DuckDB.
"""

import requests
import duckdb

DATASET_VIEWER_API_URL = "https://datasets-server.huggingface.co/"

# One shared session so consecutive API calls can reuse the connection.
session = requests.Session()


def _endpoint_url(endpoint):
    """Return the API URL for *endpoint*, joined with exactly one slash."""
    # The base URL already ends with "/"; strip it so the result does not
    # contain a doubled slash ("...co//size").
    return f"{DATASET_VIEWER_API_URL.rstrip('/')}/{endpoint}"


def _sql_string_literal(value):
    """Return *value* as a single-quoted SQL string literal."""
    # Doubling embedded single quotes is the standard SQL escape.
    return "'" + str(value).replace("'", "''") + "'"


def _sql_identifier(name):
    """Return *name* as a double-quoted SQL identifier."""
    # Quoting lets names with spaces, dashes or reserved words be selected,
    # and keeps the value from being parsed as arbitrary SQL.
    return '"' + str(name).replace('"', '""') + '"'


def fetch_json(url, params=None, timeout=20):
    """GET *url* and return the decoded JSON body.

    Args:
        url: Full URL to request.
        params: Optional query-string parameters.
        timeout: Request timeout in seconds, passed to ``session.get``.

    Returns:
        The decoded JSON payload.

    Raises:
        requests.HTTPError: If the response has an error status code.
        Exception: If the payload is a JSON object with an ``"error"`` key.
    """
    response = session.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()
    # Only a JSON object can carry an "error" key; guarding on the type
    # avoids a TypeError when the payload is a list or a scalar.
    if isinstance(data, dict) and "error" in data:
        raise Exception(f"Error fetching data: {data['error']}")
    return data


def get_split_rows(dataset, config, split):
    """Return the ``num_rows`` value reported for one split of a dataset.

    Args:
        dataset: Dataset identifier sent as the ``dataset`` parameter.
        config: Configuration name sent as the ``config`` parameter.
        split: Split name to look up in the response.

    Raises:
        Exception: If the response lists no split named *split*.
    """
    params = {"dataset": dataset, "config": config}
    config_size = fetch_json(_endpoint_url("size"), params)
    split_size = next(
        (s for s in config_size["size"]["splits"] if s["split"] == split),
        None,
    )
    if split_size is None:
        raise Exception(f"Error fetching split {split} in config {config}")
    return split_size["num_rows"]


def get_parquet_urls(dataset, config, split):
    """Return the Parquet file URLs of a split as a SQL list body.

    The result is a comma-separated string of single-quoted URLs, ready to
    be passed as ``parquet_urls`` to ``get_docs_from_parquet``.

    Args:
        dataset: Dataset identifier sent as the ``dataset`` parameter.
        config: Configuration name sent as the ``config`` parameter.
        split: Split name sent as the ``split`` parameter.
    """
    params = {"dataset": dataset, "config": config, "split": split}
    parquet_files = fetch_json(_endpoint_url("parquet"), params)
    return ",".join(
        _sql_string_literal(file["url"])
        for file in parquet_files["parquet_files"]
    )


def get_docs_from_parquet(parquet_urls, column, offset, limit):
    """Read one column from a set of Parquet files via DuckDB.

    Args:
        parquet_urls: Comma-separated, single-quoted URLs as produced by
            ``get_parquet_urls``. This is inserted into the query verbatim,
            so it must come from a trusted source.
        column: Name of the column to select.
        offset: Number of rows to skip.
        limit: Maximum number of rows to return.

    Returns:
        A list with the selected column's values.

    Raises:
        TypeError, ValueError: If *offset* or *limit* is not integer-like.
    """
    # int() rejects anything that is not a plain number, so these two values
    # cannot smuggle SQL into the query.
    row_limit = int(limit)
    row_offset = int(offset)
    sql_query = (
        f"SELECT {_sql_identifier(column)} "
        f"FROM read_parquet([{parquet_urls}]) "
        f"LIMIT {row_limit} OFFSET {row_offset};"
    )
    df = duckdb.sql(sql_query).to_df()
    return df[column].tolist()


def get_info(dataset):
    """Return the ``dataset_info`` entry of the API's info response.

    Args:
        dataset: Dataset identifier sent as the ``dataset`` parameter.
    """
    params = {"dataset": dataset}
    info_resp = fetch_json(_endpoint_url("info"), params)
    return info_resp["dataset_info"]