# NOTE: removed Hugging Face Space page residue ("Spaces: / Running / Running")
# that was scraped into the top of this file; it was never part of the program.
import os
from uuid import uuid4

import duckdb
import gradio as gr
import pandas as pd
import requests
from duckdb import DuckDBPyConnection, DuckDBPyRelation
from duckdb.typing import DuckDBPyType
from huggingface_hub import HfApi

# Short aliases for the DuckDB API types used in annotations below.
Connection = DuckDBPyConnection
Table = DuckDBPyRelation
Dtype = DuckDBPyType

# Loading-code function names (from the datasets-server "compatible-libraries"
# endpoint) that indicate a dataset can be read as Parquet.
READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")

# In-memory connection backing the placeholder table shown before a dataset is picked.
memory_con = duckdb.connect(":memory:")
empty_tbl = memory_con.sql("SELECT null as col_1, null as col_2, null as col_3, null as col_4 FROM range(10)")

PAGE_SIZE = 5
NUM_TRENDING_DATASETS = 10
NUM_USER_DATASETS = 10

# Per-session DuckDB files live under this directory; the name also doubles as
# the query-string parameter carrying the session id (?s=...).
SESSIONS_DIR = "s"
# On Spaces the sessions dir is a symlink into the persistent /data disk.
SESSION_DIR_SPACES_SYMLINK = os.path.join("/data", SESSIONS_DIR)
SPACE_ID = os.getenv("SPACE_ID")
SPACE_HOST = os.getenv("SPACE_HOST")
APP_URL = f"https://huggingface.co/spaces/{SPACE_ID}" if SPACE_ID else "http://127.0.0.1:7870"
HOST_URL = f"https://{SPACE_HOST}" if SPACE_HOST else "http://127.0.0.1:7870"
# Custom CSS: transparent dropdown/accordion backgrounds, near-full-width
# layout, hidden cell menu buttons, and a helper class for centered text.
css = """
.transparent-dropdown, .transparent-dropdown .container .wrap, .transparent-accordion {
    background: var(--body-background-fill);
}
.gradio-container {
    padding: var(--size-4) 0 !important;
    max-width: 98% !important;
}
.cell-menu-button {
    z-index: -1;
}
.centered {
    text-align: center;
}
"""
def to_json_df(con: Connection, tbl: Table) -> pd.DataFrame:
    """Render every column of *tbl* as its JSON text form in a pandas DataFrame.

    Each value is wrapped in a one-element list, cast to JSON, and the single
    element extracted back as VARCHAR; SQL NULLs survive the round trip thanks
    to nullif(..., 'null').
    """
    projection = ", ".join(
        f"nullif(([{col}]::JSON)[0]::VARCHAR, 'null') AS {col}" for col in tbl.columns
    )
    # `tbl` inside the SQL text is resolved via DuckDB's replacement scan on
    # the local Python variable of the same name.
    return con.sql(f"SELECT {projection} FROM tbl").df()
def from_json_df(con: Connection, df: pd.DataFrame, columns: list[str], dtypes: list[Dtype]) -> Table:
    """Inverse of ``to_json_df``: parse JSON-text columns of *df* back into typed columns.

    ``columns`` and ``dtypes`` give the target schema; NULLs are preserved.
    """
    exprs = []
    for col, dtype in zip(columns, dtypes):
        if str(dtype) == "VARCHAR":
            # Strip the double quotes that JSON puts around string values.
            parsed = "trim(" + col + "::JSON::VARCHAR, '\"')"
        else:
            parsed = f"({col}::JSON::{dtype})"
        exprs.append(f"if({col} IS null, null, {parsed}) AS {col}")
    # `df` is resolved by DuckDB's replacement scan on the local variable.
    return con.sql(f"SELECT {', '.join(exprs)} FROM df")
def setup_edits(con: Connection, dataset: str, pattern: str) -> None:
    """Idempotently create the objects that back spreadsheet editing in *con*.

    Creates:
      * view ``dataset``         — the remote Parquet files at hf://datasets/{dataset}/{pattern}
      * table ``edits``          — edited rows keyed by rowid, same schema as the dataset
      * view ``edited_dataset``  — the dataset with edited rows overlaid positionally
    """
    con.sql(f"CREATE VIEW IF NOT EXISTS dataset AS SELECT * FROM 'hf://datasets/{dataset}/{pattern}'")
    # Probe the dataset schema with a zero-row select.
    schema_tbl = con.sql("SELECT * FROM dataset LIMIT 0;")
    cols = schema_tbl.columns
    types = schema_tbl.dtypes
    con.sql(f"CREATE TABLE IF NOT EXISTS edits(rowid INTEGER PRIMARY KEY, {', '.join(col + ' ' + str(dtype) for col, dtype in zip(cols, types))})")
    # Densify edits over rowids 0..max(rowid) so a POSITIONAL JOIN lines up
    # row-for-row with the dataset, then prefer the edited value when present.
    con.sql(
        "CREATE VIEW IF NOT EXISTS edited_dataset AS "
        "WITH edits_per_rowid AS (SELECT * FROM (SELECT unnest(range(max(rowid) + 1)) AS rowid FROM edits) LEFT JOIN edits USING (rowid) ORDER BY rowid) "
        f"SELECT {', '.join('ifnull(edits_per_rowid.' + col + ', dataset.' + col + ') AS ' + col for col in cols)} FROM dataset POSITIONAL JOIN edits_per_rowid"
    )
# UI layout. NOTE(review): the original file's indentation and the event wiring
# (load/.select/.input handlers connecting the components to the functions
# below) appear to have been lost in extraction; the nesting here is
# reconstructed from component order — confirm against the original app.
with gr.Blocks(css=css) as demo:
    session_state = gr.BrowserState()           # per-browser session id
    loading_codes_json = gr.JSON([], visible=False)  # hidden state: datasets-server loading codes
    with gr.Row():
        with gr.Column():
            # NOTE(review): "π€"/"π" look like mis-decoded emoji from an
            # upstream encoding problem; reproduced as-is rather than guessed.
            gr.Markdown("# π€ Hugging Face Dataset Spreadsheets π\n\nEdit any Parquet dataset on Hugging Face (full list <a href='https://huggingface.co/datasets' target='_blank'>here</a>)", elem_classes="centered")
            with gr.Group():
                with gr.Tab("Select Dataset"):
                    with gr.Row():
                        dataset_dropdown = gr.Dropdown(label="Dataset", allow_custom_value=True, scale=10)
                        subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False)
                        split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False)
                with gr.Tab("Share Link"):
                    share_link_textbox = gr.Textbox(label="Copy the link to the Spreadsheet:", show_copy_button=True, interactive=False)
                with gr.Tab("Use Locally"):
                    use_locally_markdown = gr.Markdown()
            dataframe = gr.DataFrame(to_json_df(memory_con, empty_tbl), interactive=True, wrap=True)
            with gr.Row():
                prev_button = gr.Button("< Previous", min_width=140, interactive=False)
                with gr.Column(scale=9, min_width=0):
                    page_html = gr.HTML("Page 1", elem_classes="centered")
                next_button = gr.Button("Next >", min_width=140)
def show_subset_dropdown(dataset: str):
    """Query the datasets-server for *dataset*'s Parquet loading codes and
    build the subset dropdown kwargs.

    Returns a 2-tuple ``(dropdown_kwargs, loading_codes)``.

    Fix: the early-exit path used to ``return []``, which broke every caller's
    ``subsets, loading_codes = show_subset_dropdown(...)`` unpacking; it now
    returns the same (dict, list) shape as the normal path. It also now covers
    a falsy dataset, which previously reached the API as ``dataset=None``.
    """
    if not dataset or "/" not in dataset.strip().strip("/"):
        # Not a valid "namespace/name" dataset id — nothing to show.
        return dict(choices=[], value="", visible=False), []
    resp = requests.get(f"https://datasets-server.huggingface.co/compatible-libraries?dataset={dataset}", timeout=3).json()
    # Keep only libraries that load via Parquet readers; fall back to [].
    loading_codes = ([lib["loading_codes"] for lib in resp.get("libraries", []) if lib["function"] in READ_PARQUET_FUNCTIONS] or [[]])[0] or []
    subsets = [loading_code["config_name"] for loading_code in loading_codes]
    subset = (subsets or [""])[0]
    # `key` forces Gradio to rebuild the component when the choices change.
    return dict(choices=subsets, value=subset, visible=len(subsets) > 1, key=hash(str(loading_codes))), loading_codes
def show_split_dropdown(subset: str, loading_codes: list[dict]):
    """Build the split dropdown kwargs for *subset* from its loading codes."""
    splits: list[str] = []
    for loading_code in loading_codes:
        # First loading code matching the subset wins.
        if loading_code["config_name"] == subset:
            splits = list(loading_code["arguments"]["splits"])
            break
    split = splits[0] if splits else ""
    # `key` forces Gradio to rebuild the component when the choices change.
    return dict(choices=splits, value=split, visible=len(splits) > 1, key=hash(str(loading_codes) + subset))
def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict], session: str, page: int):
    """Return ``gr.DataFrame`` kwargs holding one page of the edited dataset.

    Opens (or creates) the session's DuckDB file, overlays edits via
    ``setup_edits``, and fetches rows ``(page-1)*PAGE_SIZE .. page*PAGE_SIZE``.
    Falls back to the empty placeholder table when any argument is missing.

    Fix: the session connection is now closed — it previously leaked, and a
    lingering writer can block other handlers opening the same .duckdb file.
    """
    pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
    if session and dataset and subset and split and pattern:
        con = duckdb.connect(os.path.join(SESSIONS_DIR, session + ".duckdb"))
        try:
            setup_edits(con, dataset, pattern)
            # Uncomment to have one edit for testing:
            # con.sql("INSERT OR REPLACE INTO edits SELECT 2 AS rowid, * FROM dataset LIMIT 1")
            tbl = con.sql(f"SELECT * FROM edited_dataset LIMIT {PAGE_SIZE} OFFSET {(page - 1) * PAGE_SIZE}")
            # Materialize into pandas before the connection is closed.
            return dict(value=to_json_df(con, tbl))
        finally:
            con.close()
    return dict(value=to_json_df(memory_con, empty_tbl))
def _fetch_datasets(session: str | None, request: gr.Request):
    """Initial-load handler: resolve the dataset/subset/split to show.

    Priority: ``?s=`` session in the URL, then the stored browser session,
    then ``?dataset=``, then the first trending Parquet dataset. Returns a
    dict of component updates for the whole UI.

    Fix: the DuckDB "Use Locally" snippet had a misplaced quote
    (``ATTACH '...duckdb AS db';``) — the path quote now closes before
    ``AS db``, matching the Python snippet below it.
    """
    datasets = ["CohereForAI/Global-MMLU"]
    datasets += [ds.id for ds in HfApi().list_datasets(limit=NUM_TRENDING_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"]) if ds.id not in datasets]
    # URL session parameter (?s=...) overrides the browser-stored session.
    session = request.query_params.get(SESSIONS_DIR) or session
    if session:
        # Session ids look like "namespace--name--subset--split--uuid".
        namespace, dataset_name, subset, split, _ = session.split("--")
        dataset = namespace + "/" + dataset_name
        if "dataset" in request.query_params and request.query_params["dataset"] != dataset:
            # An explicit ?dataset= that disagrees with the session wins; drop the session.
            session = None
            dataset = request.query_params["dataset"]
    else:
        dataset = request.query_params.get("dataset") or datasets[0]
    subsets, loading_codes = show_subset_dropdown(dataset)
    subsets["value"] = subset if session else subsets["value"]
    splits = show_split_dropdown(subsets["value"], loading_codes)
    splits["value"] = split if session else splits["value"]
    # Mint a fresh session id unless we kept a valid one.
    session = session if isinstance(session, str) else f"{dataset.replace('/', '--')}--{subsets['value']}--{splits['value']}--{uuid4()}"
    page = 1
    input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes, session, page)
    return {
        dataset_dropdown: gr.Dropdown(choices=datasets, value=dataset),
        loading_codes_json: loading_codes,
        subset_dropdown: gr.Dropdown(**subsets),
        split_dropdown: gr.Dropdown(**splits),
        session_state: session,
        dataframe: gr.DataFrame(**input_dataframe),
        share_link_textbox: f"{APP_URL}?{SESSIONS_DIR}={session}",
        use_locally_markdown: (
            f"""In DuckDB:\n\n```sql\nATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db;\nUSE db;\nSELECT * FROM edited_dataset LIMIT 5;\n```\n\n"""
            f"""In Python:\n\n```python\nimport duckdb\n\nduckdb.sql("ATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db")\nduckdb.sql("USE db")\ndf = duckdb.sql("SELECT * FROM edited_dataset LIMIT 5").df()\n```"""
        ),
        prev_button: gr.Button(interactive=False),
        next_button: gr.Button(elem_classes="", interactive=True) if len(input_dataframe["value"]) >= PAGE_SIZE else gr.Button(interactive=False),
        page_html: f"Page {page}",
    }
def _show_subset_dropdown(session: str | None, dataset: str):
    """Dataset-changed handler: refresh subset/split dropdowns, mint a new
    session, and load page 1. (The incoming ``session`` state is accepted but
    a fresh id is always generated.)

    Fix: the DuckDB "Use Locally" snippet had a misplaced quote
    (``ATTACH '...duckdb AS db';``) — the path quote now closes before
    ``AS db``, matching the Python snippet below it.
    """
    subsets, loading_codes = show_subset_dropdown(dataset)
    splits = show_split_dropdown(subsets["value"], loading_codes)
    session = f"{dataset.replace('/', '--')}--{subsets['value']}--{splits['value']}--{uuid4()}"
    page = 1
    input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes, session, page)
    return {
        loading_codes_json: loading_codes,
        subset_dropdown: gr.Dropdown(**subsets),
        split_dropdown: gr.Dropdown(**splits),
        session_state: session,
        dataframe: gr.DataFrame(**input_dataframe),
        share_link_textbox: f"{APP_URL}?{SESSIONS_DIR}={session}",
        use_locally_markdown: (
            f"""In DuckDB:\n\n```sql\nATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db;\nUSE db;\nSELECT * FROM edited_dataset LIMIT 5;\n```\n\n"""
            f"""In Python:\n\n```python\nimport duckdb\n\nduckdb.sql("ATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db")\nduckdb.sql("USE db")\ndf = duckdb.sql("SELECT * FROM edited_dataset LIMIT 5").df()\n```"""
        ),
        prev_button: gr.Button(interactive=False),
        next_button: gr.Button(elem_classes="", interactive=True) if len(input_dataframe["value"]) >= PAGE_SIZE else gr.Button(interactive=False),
        page_html: f"Page {page}",
    }
def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
    """Subset-changed handler: refresh the split dropdown, mint a new session,
    and load page 1 of the selected subset.

    Fix: the DuckDB "Use Locally" snippet had a misplaced quote
    (``ATTACH '...duckdb AS db';``) — the path quote now closes before
    ``AS db``, matching the Python snippet below it.
    """
    splits = show_split_dropdown(subset, loading_codes)
    session = f"{dataset.replace('/', '--')}--{subset}--{splits['value']}--{uuid4()}"
    page = 1
    input_dataframe = show_input_dataframe(dataset, subset, splits["value"], loading_codes, session, page)
    return {
        split_dropdown: gr.Dropdown(**splits),
        session_state: session,
        dataframe: gr.DataFrame(**input_dataframe),
        share_link_textbox: f"{APP_URL}?{SESSIONS_DIR}={session}",
        use_locally_markdown: (
            f"""In DuckDB:\n\n```sql\nATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db;\nUSE db;\nSELECT * FROM edited_dataset LIMIT 5;\n```\n\n"""
            f"""In Python:\n\n```python\nimport duckdb\n\nduckdb.sql("ATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db")\nduckdb.sql("USE db")\ndf = duckdb.sql("SELECT * FROM edited_dataset LIMIT 5").df()\n```"""
        ),
        prev_button: gr.Button(interactive=False),
        next_button: gr.Button(elem_classes="", interactive=True) if len(input_dataframe["value"]) >= PAGE_SIZE else gr.Button(interactive=False),
        page_html: f"Page {page}",
    }
def _show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> dict:
    """Split-changed handler: mint a new session and load page 1 of the split.

    Fixes: the return annotation said ``pd.DataFrame`` but the function returns
    a component-update dict; and the DuckDB "Use Locally" snippet had a
    misplaced quote (``ATTACH '...duckdb AS db';``) — the path quote now
    closes before ``AS db``, matching the Python snippet below it.
    """
    session = f"{dataset.replace('/', '--')}--{subset}--{split}--{uuid4()}"
    page = 1
    input_dataframe = show_input_dataframe(dataset, subset, split, loading_codes, session, page)
    return {
        session_state: session,
        dataframe: gr.DataFrame(**input_dataframe),
        share_link_textbox: f"{APP_URL}?{SESSIONS_DIR}={session}",
        use_locally_markdown: (
            f"""In DuckDB:\n\n```sql\nATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db;\nUSE db;\nSELECT * FROM edited_dataset LIMIT 5;\n```\n\n"""
            f"""In Python:\n\n```python\nimport duckdb\n\nduckdb.sql("ATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db")\nduckdb.sql("USE db")\ndf = duckdb.sql("SELECT * FROM edited_dataset LIMIT 5").df()\n```"""
        ),
        prev_button: gr.Button(interactive=False),
        next_button: gr.Button(elem_classes="", interactive=True) if len(input_dataframe["value"]) >= PAGE_SIZE else gr.Button(interactive=False),
        page_html: f"Page {page}",
    }
def _show_next_page(dataset: str, subset: str, split: str, loading_codes: list[dict], session: str, page_str: str) -> dict:
    """Next-button handler: parse the current page out of the "Page N" label,
    load page N+1, and re-enable the Previous button.

    Fix: the return annotation said ``pd.DataFrame`` but the function returns
    a component-update dict.
    """
    page = int(page_str.split(" ")[-1]) + 1
    input_dataframe = show_input_dataframe(dataset, subset, split, loading_codes, session, page)
    return {
        dataframe: gr.DataFrame(**input_dataframe),
        prev_button: gr.Button(elem_classes="", interactive=True),
        page_html: f"Page {page}",
    }
def _show_prev_page(dataset: str, subset: str, split: str, loading_codes: list[dict], session: str, page_str: str) -> dict:
    """Previous-button handler: parse the current page out of the "Page N"
    label, load page N-1, and disable Previous again on page 1.

    Fix: the return annotation said ``pd.DataFrame`` but the function returns
    a component-update dict.
    """
    page = int(page_str.split(" ")[-1]) - 1
    input_dataframe = show_input_dataframe(dataset, subset, split, loading_codes, session, page)
    return {
        dataframe: gr.DataFrame(**input_dataframe),
        prev_button: gr.Button(interactive=False) if page == 1 else gr.Button(elem_classes="", interactive=True),
        page_html: f"Page {page}",
    }
def _dataframe_input(df: pd.DataFrame, session: str | None, dataset: str, subset: str, split: str, loading_codes: list[dict], page_str: str):
    """DataFrame-edited handler: write the visible page's rows into the
    session's ``edits`` table. Silently does nothing when the session or
    dataset selection is incomplete.

    Raises gr.Error when a cell value cannot be converted to the column type.

    Fixes: the session connection is now closed (it previously leaked and
    could hold a lock on the .duckdb file), and the gr.Error keeps its cause
    chained with ``from e``.
    """
    pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
    if session and dataset and subset and split and pattern:
        con = duckdb.connect(os.path.join(SESSIONS_DIR, session + ".duckdb"))
        try:
            setup_edits(con, dataset, pattern)
            # Probe the edits schema (minus the rowid key) to type the incoming cells.
            schema_tbl = con.sql("SELECT * EXCLUDE (rowid) FROM edits LIMIT 0;")
            tbl = from_json_df(con, df, columns=schema_tbl.columns, dtypes=schema_tbl.dtypes)
            page = int(page_str.split(" ")[-1])
            # TODO add edits for page > 1
            # Note: not INSERT OR REPLACE because of "Not implemented Error:
            # List Update is not supported" — delete the page's rows first.
            con.sql(f"DELETE FROM edits WHERE rowid IN range({(page - 1) * PAGE_SIZE}, {page * PAGE_SIZE})")
            try:
                # Re-attach rowids positionally, then insert the page's rows.
                con.sql(f"INSERT INTO edits SELECT * FROM (SELECT unnest(range({(page - 1) * PAGE_SIZE}, {page * PAGE_SIZE})) AS rowid) POSITIONAL JOIN tbl")
            except duckdb.ConversionException as e:
                raise gr.Error(str(e).split('\n')[0], title="duckdb.ConversionException") from e
        finally:
            con.close()
        print(f"Saved {dataset} edits")
if __name__ == "__main__":
    if SPACE_ID:
        # On Spaces, keep session DuckDB files on the persistent /data disk and
        # expose them through a local symlink named like SESSIONS_DIR.
        os.makedirs(SESSION_DIR_SPACES_SYMLINK, exist_ok=True)
        try:
            os.symlink(SESSION_DIR_SPACES_SYMLINK, SESSIONS_DIR, target_is_directory=True)
        except FileExistsError:
            # Symlink already created by a previous run.
            pass
    else:
        # Local run: a plain directory is enough.
        os.makedirs(SESSIONS_DIR, exist_ok=True)
    # Serve the session files statically so the "Use Locally" ATTACH URLs resolve.
    gr.set_static_paths(paths=[SESSIONS_DIR])
    demo.launch()