topics-generator / src /viewer_api.py
asoria's picture
asoria HF staff
some refactor
dc70c7b
raw
history blame
1.59 kB
import requests
import duckdb
# Base URL of the Hugging Face dataset viewer API (note: it ends with "/").
DATASET_VIEWER_API_URL = "https://datasets-server.huggingface.co/"
# Module-wide HTTP session used by fetch_json for every request.
session = requests.Session()
def fetch_json(url, params=None, timeout=20):
    """GET *url* through the module-level session and return the decoded JSON.

    Args:
        url: Address to request.
        params: Optional query parameters forwarded to ``session.get``.
        timeout: Timeout forwarded to ``session.get`` (defaults to 20).

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: If the decoded body contains an ``"error"`` entry. Errors
            raised by ``raise_for_status()`` for unsuccessful HTTP statuses
            propagate unchanged.
    """
    reply = session.get(url, params=params, timeout=timeout)
    # Fail on HTTP-level errors before trying to interpret the body.
    reply.raise_for_status()
    payload = reply.json()
    if "error" not in payload:
        return payload
    # The body itself reports a failure.
    raise Exception(f"Error fetching data: {payload['error']}")
def get_split_rows(dataset, config, split):
    """Return the row count of one split, as reported by the ``/size`` endpoint.

    Args:
        dataset: Dataset identifier, sent as the ``dataset`` query parameter.
        config: Configuration name, sent as the ``config`` query parameter.
        split: Name of the split to look up in the response.

    Returns:
        The ``num_rows`` value of the entry whose ``split`` equals *split*.

    Raises:
        Exception: If the response lists no entry for *split*; errors raised
            by ``fetch_json`` propagate unchanged.
    """
    # The base URL may end with "/"; strip it so the path is "/size" and not
    # "//size".
    url = f"{DATASET_VIEWER_API_URL.rstrip('/')}/size"
    params = {"dataset": dataset, "config": config}
    config_size = fetch_json(url, params)
    for split_size in config_size["size"]["splits"]:
        if split_size["split"] == split:
            return split_size["num_rows"]
    raise Exception(f"Error fetching split {split} in config {config}")
def get_parquet_urls(dataset, config, split):
    """Return the parquet file URLs of a split as a SQL-ready string.

    Args:
        dataset: Dataset identifier, sent as the ``dataset`` query parameter.
        config: Configuration name, sent as the ``config`` query parameter.
        split: Split name, sent as the ``split`` query parameter.

    Returns:
        The URLs, each wrapped in single quotes and joined with commas, i.e.
        the element list expected inside ``read_parquet([...])``. The string
        is empty when no file matches.

    Raises:
        Exception: Whatever ``fetch_json`` raises.
    """
    # The base URL may end with "/"; strip it so the path is "/parquet" and
    # not "//parquet".
    url = f"{DATASET_VIEWER_API_URL.rstrip('/')}/parquet"
    params = {"dataset": dataset, "config": config, "split": split}
    parquet_files = fetch_json(url, params)
    # NOTE(review): the endpoint may return the files of every split of the
    # config rather than honouring ``split`` -- confirm against the API docs.
    # Filtering here is a no-op if the server already filtered; entries
    # without a "split" key are kept.
    parquet_urls = [
        file["url"]
        for file in parquet_files["parquet_files"]
        if file.get("split", split) == split
    ]
    # Double any single quote so each URL stays a valid SQL string literal.
    return ",".join("'" + u.replace("'", "''") + "'" for u in parquet_urls)
def get_docs_from_parquet(parquet_urls, column, offset, limit):
    """Read a slice of one column from remote parquet files through DuckDB.

    Args:
        parquet_urls: Comma-separated, single-quoted URLs; inserted verbatim
            between the brackets of ``read_parquet([...])``.
        column: Name of the column to select.
        offset: Number of rows to skip.
        limit: Maximum number of rows to return.

    Returns:
        A list with the selected column's values for the requested rows.

    Raises:
        ValueError: If *limit* or *offset* cannot be converted to ``int``.
    """
    # Quote the column as an identifier so names with spaces, punctuation or
    # reserved words work and cannot inject SQL.
    quoted_column = '"' + str(column).replace('"', '""') + '"'
    # Coerce to int so only plain integers ever reach the query text.
    sql_query = (
        f"SELECT {quoted_column} FROM read_parquet([{parquet_urls}]) "
        f"LIMIT {int(limit)} OFFSET {int(offset)};"
    )
    df = duckdb.sql(sql_query).to_df()
    # Exactly one column is selected; read it by position so the result does
    # not depend on how the engine spells the column label.
    return df.iloc[:, 0].tolist()
def get_info(dataset):
    """Return the ``dataset_info`` section of the ``/info`` endpoint response.

    Args:
        dataset: Dataset identifier, sent as the ``dataset`` query parameter.

    Returns:
        The value stored under ``"dataset_info"`` in the response.

    Raises:
        KeyError: If the response has no ``"dataset_info"`` entry; errors
            raised by ``fetch_json`` propagate unchanged.
    """
    # The base URL may end with "/"; strip it so the path is "/info" and not
    # "//info".
    url = f"{DATASET_VIEWER_API_URL.rstrip('/')}/info"
    params = {"dataset": dataset}
    info_resp = fetch_json(url, params)
    return info_resp["dataset_info"]