import gradio as gr import duckdb from huggingface_hub import HfFileSystem from huggingface_hub.hf_file_system import safe_quote import pandas as pd import requests DATASETS_SERVER_ENDPOINT = "https://datasets-server.huggingface.co" PARQUET_REVISION="refs/convert/parquet" TABLE_WILDCARD="{table}" fs = HfFileSystem() duckdb.register_filesystem(fs) def get_parquet_files(dataset, config, split): response = requests.get(f"{DATASETS_SERVER_ENDPOINT}/parquet?dataset={dataset}&config={config}", timeout=60) if response.status_code != 200: raise Exception(response) response = response.json() parquet_files = response["parquet_files"] file_names = [content["filename"] for content in parquet_files if content["split"] == split] if len(file_names) == 0: raise Exception("No parquet files found for dataset") return file_names def run_command(dataset, config, split, sql): try: if TABLE_WILDCARD not in sql: raise Exception(f"Query must contains {TABLE_WILDCARD} wildcard.") parquet_files = get_parquet_files(dataset, config, split) print(f"File names found: {','.join(parquet_files)}") parquet_first_file = parquet_files[0] # TODO: Send pattern to duck db to read all split parquets print(f"Trying with the first one {parquet_first_file}") location=f"hf://datasets/{dataset}@{safe_quote(PARQUET_REVISION)}/{config}/{parquet_first_file}" print(location) sql = sql.replace(TABLE_WILDCARD, f"'{location}'") result = duckdb.query(sql).to_df() print("Ok") except Exception as error: print(f"Error: {str(error)}") return pd.DataFrame({"Error": [f"❌ {str(error)}"]}) return result with gr.Blocks() as demo: gr.Markdown(" ## SQL Query using DuckDB for datasets server parquet files") dataset = gr.Textbox(label="dataset", placeholder="mstz/iris", value="mstz/iris") config = gr.Textbox(label="config", placeholder="iris", value="iris") split = gr.Textbox(label="split", placeholder="train", value="train") sql = gr.Textbox( label="Query in SQL format - It should have {table} wildcard", placeholder=f"SELECT sepal_length FROM {TABLE_WILDCARD} LIMIT 3", value=f"SELECT sepal_length FROM {TABLE_WILDCARD} LIMIT 3", lines=3, ) run_button = gr.Button("Run") gr.Markdown("### Result") cached_responses_table = gr.DataFrame() run_button.click(run_command, inputs=[dataset, config, split, sql], outputs=cached_responses_table) if __name__ == "__main__": demo.launch()