File size: 16,996 Bytes
33da8e3
 
2c1a98c
 
 
 
 
33da8e3
2c1a98c
 
 
33da8e3
2c1a98c
 
 
33da8e3
 
 
2c1a98c
 
33da8e3
4c1f027
16cc948
9d9f5bf
 
 
2c1a98c
 
 
 
 
 
 
 
33da8e3
 
 
36212af
 
 
2c1a98c
 
33da8e3
 
 
 
2c1a98c
33da8e3
 
bd73664
 
 
 
 
 
33da8e3
 
2c1a98c
33da8e3
 
 
 
 
 
 
 
 
 
 
2c1a98c
 
33da8e3
 
2c1a98c
 
f3ac7ac
2c1a98c
33da8e3
 
 
 
 
 
 
 
 
 
36212af
 
 
 
 
2c1a98c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36212af
2c1a98c
33da8e3
 
 
 
 
 
36212af
33da8e3
2c1a98c
33da8e3
 
2c1a98c
36212af
33da8e3
0d3455a
 
33da8e3
 
 
 
 
 
 
 
2a1cc9d
2c1a98c
33da8e3
2c1a98c
33da8e3
 
36212af
 
2c1a98c
2a1cc9d
2c1a98c
 
 
33da8e3
2c1a98c
9d9f5bf
33da8e3
9d9f5bf
 
36212af
 
 
 
2c1a98c
 
030df6f
33da8e3
2c1a98c
 
33da8e3
36212af
 
2c1a98c
 
 
 
33da8e3
2c1a98c
030df6f
 
 
 
 
36212af
 
 
2c1a98c
 
030df6f
2c1a98c
 
33da8e3
36212af
 
2c1a98c
 
33da8e3
2c1a98c
030df6f
 
 
 
 
36212af
 
 
2c1a98c
 
030df6f
2c1a98c
33da8e3
36212af
 
2c1a98c
33da8e3
2c1a98c
030df6f
 
 
 
 
36212af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2c1a98c
33da8e3
36212af
 
33da8e3
 
 
 
 
 
 
 
 
36212af
33da8e3
bd73664
36212af
bd73664
36212af
bd73664
 
33da8e3
 
2c1a98c
 
16cc948
4c1f027
 
 
 
 
 
 
 
2c1a98c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
import os
from uuid import uuid4

import duckdb
import gradio as gr
import pandas as pd
import requests
from duckdb import DuckDBPyConnection, DuckDBPyRelation
from duckdb.typing import DuckDBPyType
from huggingface_hub import HfApi

# Short aliases for the DuckDB classes used in type hints below.
Connection = DuckDBPyConnection
Table = DuckDBPyRelation
Dtype = DuckDBPyType
# datasets-server "loading code" functions that indicate the dataset is readable as Parquet
READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")
# In-memory connection backing the placeholder table shown before a dataset is loaded
memory_con = duckdb.connect(":memory:")
# 10-row, 4-column all-NULL placeholder relation
empty_tbl = memory_con.sql("SELECT null as col_1, null as col_2, null as col_3, null as col_4 FROM range(10)")
PAGE_SIZE = 5  # rows displayed per spreadsheet page
NUM_TRENDING_DATASETS = 10  # how many trending datasets to offer in the dropdown
NUM_USER_DATASETS = 10
# Directory holding the per-session .duckdb files; also used as the share-link query param name
SESSIONS_DIR = "s"
# On Spaces, session files live on the persistent /data volume (reached via a symlink)
SESSION_DIR_SPACES_SYMLINK = os.path.join("/data", SESSIONS_DIR)
SPACE_ID = os.getenv("SPACE_ID")  # set when running on Hugging Face Spaces
SPACE_HOST = os.getenv("SPACE_HOST")
APP_URL = f"https://huggingface.co/spaces/{SPACE_ID}" if SPACE_ID else "http://127.0.0.1:7870"
HOST_URL = f"https://{SPACE_HOST}" if SPACE_HOST else "http://127.0.0.1:7870"
# Minor styling tweaks for the gradio UI
css = """
.transparent-dropdown, .transparent-dropdown .container .wrap, .transparent-accordion  {
    background: var(--body-background-fill);
}
.gradio-container {
    padding: var(--size-4) 0 !important;
    max-width: 98% !important;
}
.cell-menu-button {
    z-index: -1;
}
.centered {
    text-align: center;
}
"""

def to_json_df(con: Connection, tbl: Table) -> pd.DataFrame:
    """Render every column of *tbl* as JSON text, keeping SQL NULLs as missing values.

    Each value is wrapped in a one-element list, cast to JSON, unwrapped and
    stringified; the literal 'null' (a JSON null) is mapped back to SQL NULL so
    pandas shows a missing cell instead of the text "null".

    Note: the SQL references the local variable ``tbl`` by name (DuckDB
    replacement scan), so the parameter name matters.
    """
    select_list = ", ".join(
        f"nullif(([{name}]::JSON)[0]::VARCHAR, 'null') AS {name}" for name in tbl.columns
    )
    return con.sql(f"SELECT {select_list} FROM tbl").df()

def from_json_df(con: Connection, df: pd.DataFrame, columns: list[str], dtypes: list[Dtype]) -> Table:
    """Inverse of ``to_json_df``: parse a JSON-text dataframe back into typed columns.

    NULL cells stay NULL; VARCHAR columns get the surrounding double quotes that
    JSON adds to strings trimmed off; every other column is cast through JSON to
    its declared DuckDB type.

    Note: the SQL references the local variable ``df`` by name (DuckDB
    replacement scan), so the parameter name matters.
    """
    select_parts = []
    for name, dtype in zip(columns, dtypes):
        if str(dtype) == "VARCHAR":
            # remove the double quotes at the start and end of the JSON string
            parsed = f"""trim({name}::JSON::VARCHAR, '"')"""
        else:
            parsed = f"({name}::JSON::{dtype})"
        select_parts.append(f"if({name} IS null, null, {parsed}) AS {name}")
    return con.sql(f"SELECT {', '.join(select_parts)} FROM df")

def setup_edits(con: Connection, dataset: str, pattern: str) -> None:
    """Idempotently create the editing schema in *con*.

    Creates three objects (all ``IF NOT EXISTS``, so safe to call repeatedly):
    - view ``dataset``: the remote Parquet files of *dataset* matching *pattern*;
    - table ``edits``: one overriding row per edited rowid, same columns as the
      dataset plus an explicit INTEGER rowid primary key;
    - view ``edited_dataset``: the dataset with edited rows substituted in.
    """
    con.sql(f"CREATE VIEW IF NOT EXISTS dataset AS SELECT * FROM 'hf://datasets/{dataset}/{pattern}'")
    # LIMIT 0 probe: fetch the dataset's column names and dtypes without reading rows
    empty_dataset_tbl = con.sql("SELECT * FROM dataset LIMIT 0;")
    columns = empty_dataset_tbl.columns
    dtypes = empty_dataset_tbl.dtypes
    con.sql(f"CREATE TABLE IF NOT EXISTS edits(rowid INTEGER PRIMARY KEY, {', '.join(col + ' ' + str(dtype) for col, dtype in zip(columns, dtypes))})")
    # edits_per_rowid densifies edits over rowids 0..max(rowid) (NULL-filled gaps)
    # so it can be POSITIONAL JOINed row-by-row against the dataset; ifnull()
    # then prefers the edited value wherever one exists.
    con.sql(
        "CREATE VIEW IF NOT EXISTS edited_dataset AS "
        "WITH edits_per_rowid AS (SELECT * FROM (SELECT unnest(range(max(rowid) + 1)) AS rowid FROM edits) LEFT JOIN edits USING (rowid) ORDER BY rowid) "
        f"SELECT {', '.join('ifnull(edits_per_rowid.' + col + ', dataset.' + col + ') AS ' + col for col in columns)} FROM dataset POSITIONAL JOIN edits_per_rowid"
    )

with gr.Blocks(css=css) as demo:
    # Browser-persisted session id of the form "namespace--name--subset--split--uuid"
    session_state = gr.BrowserState()
    # Hidden state holding the datasets-server "loading codes" (subset/split metadata)
    loading_codes_json = gr.JSON([], visible=False)
    with gr.Row():
        with gr.Column():
            gr.Markdown("# πŸ€— Hugging Face Dataset Spreadsheets πŸ“\n\nEdit any Parquet dataset on Hugging Face (full list <a href='https://huggingface.co/datasets' target='_blank'>here</a>)", elem_classes="centered")
    with gr.Group():
        with gr.Tab("Select Dataset"):
            with gr.Row():
                dataset_dropdown = gr.Dropdown(label="Dataset", allow_custom_value=True, scale=10)
                # Subset/split dropdowns start hidden; handlers show them when there is a real choice
                subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False)
                split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False)
        with gr.Tab("Share Link"):
            share_link_textbox = gr.Textbox(label="Copy the link to the Spreadsheet:", show_copy_button=True, interactive=False)
        with gr.Tab("Use Locally"):
            use_locally_markdown = gr.Markdown()
        # The editable spreadsheet itself, filled with the NULL placeholder until a dataset loads
        dataframe = gr.DataFrame(to_json_df(memory_con, empty_tbl), interactive=True, wrap=True)
        with gr.Row():
            prev_button = gr.Button("< Previous", min_width=140, interactive=False)
            with gr.Column(scale=9, min_width=0):
                page_html = gr.HTML("Page 1", elem_classes="centered")
            next_button = gr.Button("Next >", min_width=140)

    def show_subset_dropdown(dataset: str):
        """Return (subset-dropdown kwargs, loading_codes) for *dataset*.

        Queries the datasets-server "compatible-libraries" endpoint and keeps
        the loading codes of the first library that reads Parquet. For a
        missing dataset name or one not of the form "namespace/name", returns
        empty dropdown kwargs and an empty loading-codes list.

        Fix: the invalid-name branch used to ``return []``, which crashed every
        caller that unpacks two values (``subsets, loading_codes = ...``).
        """
        if not dataset or "/" not in dataset.strip().strip("/"):
            return dict(choices=[], value="", visible=False), []
        resp = requests.get(f"https://datasets-server.huggingface.co/compatible-libraries?dataset={dataset}", timeout=3).json()
        loading_codes = ([lib["loading_codes"] for lib in resp.get("libraries", []) if lib["function"] in READ_PARQUET_FUNCTIONS] or [[]])[0] or []
        subsets = [loading_code["config_name"] for loading_code in loading_codes]
        subset = (subsets or [""])[0]
        # key makes gradio re-render the dropdown whenever the loading codes change
        return dict(choices=subsets, value=subset, visible=len(subsets) > 1, key=hash(str(loading_codes))), loading_codes

    def show_split_dropdown(subset: str, loading_codes: list[dict]):
        """Return split-dropdown kwargs for *subset* according to *loading_codes*."""
        matching = [
            list(code["arguments"]["splits"])
            for code in loading_codes
            if code["config_name"] == subset
        ]
        splits = matching[0] if matching else []
        default_split = splits[0] if splits else ""
        # key makes gradio re-render the dropdown when the subset or codes change
        return dict(choices=splits, value=default_split, visible=len(splits) > 1, key=hash(str(loading_codes) + subset))

    def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict], session: str, page: int):
        """Return gr.DataFrame kwargs showing one page of the (possibly edited) dataset.

        Falls back to the NULL placeholder table when any piece of the selection
        (session, dataset, subset, split, or file pattern) is missing.
        """
        matching = [code["arguments"]["splits"][split] for code in loading_codes if code["config_name"] == subset]
        pattern = matching[0] if matching else None
        if not (session and dataset and subset and split and pattern):
            return dict(value=to_json_df(memory_con, empty_tbl))
        con = duckdb.connect(os.path.join(SESSIONS_DIR, session + ".duckdb"))
        setup_edits(con, dataset, pattern)
        # Uncomment to have one edit for testing
        # con.sql("INSERT OR REPLACE INTO edits SELECT 2 AS rowid, * FROM dataset LIMIT 1")
        page_tbl = con.sql(f"SELECT * FROM edited_dataset LIMIT {PAGE_SIZE} OFFSET {(page - 1) * PAGE_SIZE}")
        return dict(value=to_json_df(con, page_tbl))
        

    @demo.load(inputs=session_state, outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown, dataframe, session_state, share_link_textbox, use_locally_markdown, prev_button, next_button, page_html])
    def _fetch_datasets(session: str | None, request: gr.Request):
        """Populate the whole UI on page load.

        Restores a previous session from the "?s=" query parameter (or the
        browser state); otherwise picks the dataset from "?dataset=" or the
        first of a default + trending list, and mints a fresh session id.

        Fix: the "Use Locally" DuckDB snippet emitted ``ATTACH '...duckdb AS db';``
        (quote closed after ``AS db``), which is invalid SQL; the path quote now
        closes before ``AS db``, matching the Python snippet below it.
        """
        datasets = ["CohereForAI/Global-MMLU"]
        datasets += [ds.id for ds in HfApi().list_datasets(limit=NUM_TRENDING_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"]) if ds.id not in datasets]
        session = request.query_params.get(SESSIONS_DIR) or session
        if session:
            # session ids look like "namespace--name--subset--split--uuid"
            namespace, dataset_name, subset, split, _ = session.split("--")
            dataset = namespace + "/" + dataset_name
            if "dataset" in request.query_params and request.query_params["dataset"] != dataset:
                # an explicit ?dataset= overrides a stale session
                session = None
                dataset = request.query_params["dataset"]
        else:
            dataset = request.query_params.get("dataset") or datasets[0]
        subsets, loading_codes = show_subset_dropdown(dataset)
        subsets["value"] = subset if session else subsets["value"]
        splits = show_split_dropdown(subsets["value"], loading_codes)
        splits["value"] = split if session else splits["value"]
        session = session if isinstance(session, str) else f"{dataset.replace('/', '--')}--{subsets['value']}--{splits['value']}--{uuid4()}"
        page = 1
        input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes, session, page)
        return {
            dataset_dropdown: gr.Dropdown(choices=datasets, value=dataset),
            loading_codes_json: loading_codes,
            subset_dropdown: gr.Dropdown(**subsets),
            split_dropdown: gr.Dropdown(**splits),
            session_state: session,
            dataframe: gr.DataFrame(**input_dataframe),
            share_link_textbox: f"{APP_URL}?{SESSIONS_DIR}={session}",
            use_locally_markdown: (
                f"""In DuckDB:\n\n```sql\nATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db;\nUSE db;\nSELECT * FROM edited_dataset LIMIT 5;\n```\n\n"""
                f"""In Python:\n\n```python\nimport duckdb\n\nduckdb.sql("ATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db")\nduckdb.sql("USE db")\ndf = duckdb.sql("SELECT * FROM edited_dataset LIMIT 5").df()\n```"""
            ),
            prev_button: gr.Button(interactive=False),
            # a full first page suggests more data may follow
            next_button: gr.Button(elem_classes="", interactive=True) if len(input_dataframe["value"]) >= PAGE_SIZE else gr.Button(interactive=False),
            page_html: f"Page {page}",
        }
    
    @dataset_dropdown.select(inputs=[session_state, dataset_dropdown], outputs=[session_state, loading_codes_json, subset_dropdown, split_dropdown, dataframe, share_link_textbox, use_locally_markdown, prev_button, next_button, page_html])
    def _show_subset_dropdown(session: str | None, dataset: str):
        """Handle dataset selection: refresh subsets/splits, mint a new session, show page 1.

        Fix: the "Use Locally" DuckDB snippet emitted ``ATTACH '...duckdb AS db';``
        (quote closed after ``AS db``), which is invalid SQL; the path quote now
        closes before ``AS db``.
        """
        subsets, loading_codes = show_subset_dropdown(dataset)
        splits = show_split_dropdown(subsets["value"], loading_codes)
        session = f"{dataset.replace('/', '--')}--{subsets['value']}--{splits['value']}--{uuid4()}"
        page = 1
        input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes, session, page)
        return {
            loading_codes_json: loading_codes,
            subset_dropdown: gr.Dropdown(**subsets),
            split_dropdown: gr.Dropdown(**splits),
            session_state: session,
            dataframe: gr.DataFrame(**input_dataframe),
            share_link_textbox: f"{APP_URL}?{SESSIONS_DIR}={session}",
            use_locally_markdown: (
                f"""In DuckDB:\n\n```sql\nATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db;\nUSE db;\nSELECT * FROM edited_dataset LIMIT 5;\n```\n\n"""
                f"""In Python:\n\n```python\nimport duckdb\n\nduckdb.sql("ATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db")\nduckdb.sql("USE db")\ndf = duckdb.sql("SELECT * FROM edited_dataset LIMIT 5").df()\n```"""
            ),
            prev_button: gr.Button(interactive=False),
            next_button: gr.Button(elem_classes="", interactive=True) if len(input_dataframe["value"]) >= PAGE_SIZE else gr.Button(interactive=False),
            page_html: f"Page {page}",
        }
    
    @subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[session_state, split_dropdown, dataframe, share_link_textbox, use_locally_markdown, prev_button, next_button, page_html])
    def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
        """Handle subset selection: refresh splits, mint a new session, show page 1.

        Fix: the "Use Locally" DuckDB snippet emitted ``ATTACH '...duckdb AS db';``
        (quote closed after ``AS db``), which is invalid SQL; the path quote now
        closes before ``AS db``.
        """
        splits = show_split_dropdown(subset, loading_codes)
        session = f"{dataset.replace('/', '--')}--{subset}--{splits['value']}--{uuid4()}"
        page = 1
        input_dataframe = show_input_dataframe(dataset, subset, splits["value"], loading_codes, session, page)
        return {
            split_dropdown: gr.Dropdown(**splits),
            session_state: session,
            dataframe: gr.DataFrame(**input_dataframe),
            share_link_textbox: f"{APP_URL}?{SESSIONS_DIR}={session}",
            use_locally_markdown: (
                f"""In DuckDB:\n\n```sql\nATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db;\nUSE db;\nSELECT * FROM edited_dataset LIMIT 5;\n```\n\n"""
                f"""In Python:\n\n```python\nimport duckdb\n\nduckdb.sql("ATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db")\nduckdb.sql("USE db")\ndf = duckdb.sql("SELECT * FROM edited_dataset LIMIT 5").df()\n```"""
            ),
            prev_button: gr.Button(interactive=False),
            next_button: gr.Button(elem_classes="", interactive=True) if len(input_dataframe["value"]) >= PAGE_SIZE else gr.Button(interactive=False),
            page_html: f"Page {page}",
        }
    
    @split_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=[session_state, dataframe, share_link_textbox, use_locally_markdown, prev_button, next_button, page_html])
    def _show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> dict:
        """Handle split selection: mint a new session and show page 1.

        Fixes: the "Use Locally" DuckDB snippet emitted ``ATTACH '...duckdb AS db';``
        (quote closed after ``AS db``), which is invalid SQL — the path quote now
        closes before ``AS db``; the return annotation said ``pd.DataFrame`` but the
        handler returns a component-keyed dict.
        """
        session = f"{dataset.replace('/', '--')}--{subset}--{split}--{uuid4()}"
        page = 1
        input_dataframe = show_input_dataframe(dataset, subset, split, loading_codes, session, page)
        return {
            session_state: session,
            dataframe: gr.DataFrame(**input_dataframe),
            share_link_textbox: f"{APP_URL}?{SESSIONS_DIR}={session}",
            use_locally_markdown: (
                f"""In DuckDB:\n\n```sql\nATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db;\nUSE db;\nSELECT * FROM edited_dataset LIMIT 5;\n```\n\n"""
                f"""In Python:\n\n```python\nimport duckdb\n\nduckdb.sql("ATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db")\nduckdb.sql("USE db")\ndf = duckdb.sql("SELECT * FROM edited_dataset LIMIT 5").df()\n```"""
            ),
            prev_button: gr.Button(interactive=False),
            next_button: gr.Button(elem_classes="", interactive=True) if len(input_dataframe["value"]) >= PAGE_SIZE else gr.Button(interactive=False),
            page_html: f"Page {page}",
        }
    
    @next_button.click(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json, session_state, page_html], outputs=[dataframe, prev_button, next_button, page_html])
    def _show_next_page(dataset: str, subset: str, split: str, loading_codes: list[dict], session: str, page_str: str) -> dict:
        """Advance one page (the current page number is parsed from the "Page N" label).

        Fixes: ``next_button`` is declared as an output but was never updated, so
        "Next" stayed enabled past the end of the data — it is now disabled when
        the new page is not full, matching the other handlers; the return
        annotation said ``pd.DataFrame`` but the handler returns a dict.
        """
        page = int(page_str.split(" ")[-1]) + 1
        input_dataframe = show_input_dataframe(dataset, subset, split, loading_codes, session, page)
        return {
            dataframe: gr.DataFrame(**input_dataframe),
            prev_button: gr.Button(elem_classes="", interactive=True),
            next_button: gr.Button(elem_classes="", interactive=True) if len(input_dataframe["value"]) >= PAGE_SIZE else gr.Button(interactive=False),
            page_html: f"Page {page}",
        }

    @prev_button.click(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json, session_state, page_html], outputs=[dataframe, prev_button, next_button, page_html])
    def _show_prev_page(dataset: str, subset: str, split: str, loading_codes: list[dict], session: str, page_str: str) -> dict:
        """Go back one page (the current page number is parsed from the "Page N" label).

        Fixes: ``next_button`` is declared as an output but was never updated —
        since navigating back means a following page exists, "Next" is now
        re-enabled; the return annotation said ``pd.DataFrame`` but the handler
        returns a dict.
        """
        page = int(page_str.split(" ")[-1]) - 1
        input_dataframe = show_input_dataframe(dataset, subset, split, loading_codes, session, page)
        return {
            dataframe: gr.DataFrame(**input_dataframe),
            prev_button: gr.Button(interactive=False) if page == 1 else gr.Button(elem_classes="", interactive=True),
            next_button: gr.Button(elem_classes="", interactive=True),
            page_html: f"Page {page}",
        }
    
    @dataframe.input(inputs=[dataframe, session_state, dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json, page_html])
    def _dataframe_input(df: pd.DataFrame, session: str | None, dataset: str, subset: str, split: str, loading_codes: list[dict], page_str: str):
        """Persist user edits of the currently visible page into the session's "edits" table.

        Silently does nothing when any piece of the selection (session, dataset,
        subset, split, file pattern) is missing.
        """
        # Parquet file pattern for the selected subset/split, or None if not found
        pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
        if session and dataset and subset and split and pattern:
            duckdb_file = session + ".duckdb"
            con = duckdb.connect(os.path.join(SESSIONS_DIR, duckdb_file))
            setup_edits(con, dataset, pattern)
            # Editable columns/dtypes come from the edits table schema (minus rowid)
            empty_dataset_tbl = con.sql("SELECT * EXCLUDE (rowid) FROM edits LIMIT 0;")
            columns = empty_dataset_tbl.columns
            dtypes = empty_dataset_tbl.dtypes
            # Parse the JSON-text cells back into typed values
            tbl = from_json_df(con, df, columns=columns, dtypes=dtypes)
            page = int(page_str.split(" ")[-1])
            # TODO add edits for page > 1
            # Note: Here we don't use INSERT OR REPLACE because of Not implemented Error: List Update is not supported.
            con.sql(f"DELETE FROM edits WHERE rowid IN range({(page - 1) * PAGE_SIZE}, {page * PAGE_SIZE})")
            try:
                # Re-insert the whole visible page, pairing the page's rowids with rows positionally
                con.sql(f"INSERT INTO edits SELECT * FROM (SELECT unnest(range({(page - 1) * PAGE_SIZE}, {page * PAGE_SIZE})) AS rowid) POSITIONAL JOIN tbl")
            except duckdb.ConversionException as e:
                # Surface type-conversion problems (e.g. text in a numeric column) to the UI
                raise gr.Error(str(e).split('\n')[0], title="duckdb.ConversionException")
            print(f"Saved {dataset} edits")
        

if __name__ == "__main__":
    if SPACE_ID:
        # On Spaces, keep session .duckdb files on the persistent /data volume,
        # reached through a local "s" symlink.
        os.makedirs(SESSION_DIR_SPACES_SYMLINK, exist_ok=True)
        try:
            os.symlink(SESSION_DIR_SPACES_SYMLINK, SESSIONS_DIR, target_is_directory=True)
        except FileExistsError:
            pass  # symlink already created by a previous run
    else:
        os.makedirs(SESSIONS_DIR, exist_ok=True)
    # Serve session files over HTTP so the "Use Locally" ATTACH snippets work
    gr.set_static_paths(paths=[SESSIONS_DIR])
    demo.launch()