lhoestq HF staff committed on
Commit
4fbb557
1 Parent(s): 397f2df
Files changed (2) hide show
  1. app.py +119 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from itertools import count, islice
2
+ from typing import Any, Iterable
3
+
4
+ import gradio as gr
5
+ import pandas as pd
6
+ import requests
7
+ from gradio_huggingfacehub_search import HuggingfaceHubSearch
8
+
9
+
10
# Number of rows fetched and displayed in the input preview table.
NUM_ROWS_PREVIEW = 5

# Single HTTP session shared by every datasets-server request in this module.
session = requests.Session()

# Placeholder frame with three empty columns named "1", "2" and "3".
empty_dataframe = pd.DataFrame({column: [] for column in ("1", "2", "3")})
13
+
14
+
15
with gr.Blocks() as demo:
    # NOTE(review): this file was recovered from a flattened diff view, so the
    # nesting of the layout below is reconstructed -- confirm it against the
    # real app.py. The helpers and event handlers are kept inside this context
    # because the `.change` / `.click` decorators attach listeners to it.
    gr.Markdown(
        "# 🤗 Dataset ReWriter ✍️✨\n\n"
        "Adjust, translate or transform completely existing datasets.\n\n"
    )
    with gr.Row():
        with gr.Column(scale=3):
            dataset_search = HuggingfaceHubSearch(
                label="Hub Dataset ID",
                placeholder="Search for dataset id on Huggingface",
                search_type="dataset",
            )
        # Both dropdowns start hidden; they are revealed by the handlers below
        # once a dataset exposes more than one subset / split.
        subset_dropdown = gr.Dropdown(info="Subset", show_label=False, visible=False)
        split_dropdown = gr.Dropdown(info="Split", show_label=False, visible=False)

    input_query = gr.Textbox(label="Enter the adjustment or transformation to apply to the dataset:")
    rewrite_button = gr.Button("ReWrite Dataset", variant="primary")

    gr.Markdown("### Input")
    input_preview = gr.DataFrame(interactive=False, wrap=True)

    gr.Markdown("### Output")
    output_preview = gr.DataFrame(interactive=False, wrap=True)
    save_button = gr.Button("Save ReWritten Dataset", interactive=False)

    ############
    #
    # Utils
    #
    ###########

    def stream_rows(dataset: str, subset: str, split: str, batch_size: int = 100) -> Iterable[dict[str, Any]]:
        """Lazily yield the rows of one dataset split from the datasets-server API.

        Rows are requested page by page (``batch_size`` rows per request) and
        only as the caller consumes the generator.

        Args:
            dataset: Hub dataset id.
            subset: Subset name, sent as the ``config`` query parameter.
            split: Split name.
            batch_size: Number of rows requested per HTTP call.

        Yields:
            The ``"row"`` payload of every item in the ``"rows"`` list of each
            response.

        Raises:
            RuntimeError: If a response body contains an ``"error"`` key.
        """
        for offset in count(step=batch_size):
            # Pass the query through `params` so that ids containing reserved
            # URL characters are percent-encoded instead of corrupting the URL.
            rows_resp = session.get(
                "https://datasets-server.huggingface.co/rows",
                params={
                    "dataset": dataset,
                    "config": subset,
                    "split": split,
                    "offset": offset,
                    "length": batch_size,
                },
                timeout=10,
            ).json()
            if "error" in rows_resp:
                raise RuntimeError(rows_resp["error"])
            if not rows_resp["rows"]:
                # An empty page means the split is exhausted.
                break
            for row_item in rows_resp["rows"]:
                yield row_item["row"]

    ############
    #
    # Events
    #
    ###########

    def _hidden_dropdowns() -> dict:
        """Return component updates that hide both the subset and split dropdowns."""
        return {
            subset_dropdown: gr.Dropdown(visible=False),
            split_dropdown: gr.Dropdown(visible=False),
        }

    def _resolve_dataset_selection(dataset: str, default_subset: str, default_split: str) -> tuple[Any, Any, dict[Any, Any]]:
        """Pick the subset and split to preview and build the dropdown updates.

        Args:
            dataset: Hub dataset id typed or selected by the user.
            default_subset: Subset to keep if the dataset has it.
            default_split: Split to keep if the chosen subset has it.

        Returns:
            A ``(subset, split, updates)`` tuple. ``updates`` maps the two
            dropdown components to their new state. ``subset`` and ``split``
            are both ``None`` when the id has no ``/``, when the info endpoint
            reports an error, or when no subset / split is listed; the
            dropdowns are hidden in that case.
        """
        if "/" not in dataset.strip().strip("/"):
            # Not a "namespace/name" id yet (e.g. the user is still typing).
            return None, None, _hidden_dropdowns()
        info_resp = session.get(
            "https://datasets-server.huggingface.co/info",
            params={"dataset": dataset},
            timeout=3,
        ).json()
        if "error" in info_resp:
            return None, None, _hidden_dropdowns()
        subsets: list[str] = list(info_resp["dataset_info"])
        if not subsets:
            return None, None, _hidden_dropdowns()
        subset = default_subset if default_subset in subsets else subsets[0]
        # NOTE(review): "splits" appears to be a mapping keyed by split name in
        # the /info payload; list() yields the names whether it is a mapping or
        # already a list, so indexing and `choices=` below stay valid.
        splits: list[str] = list(info_resp["dataset_info"][subset]["splits"])
        if not splits:
            return None, None, _hidden_dropdowns()
        split = default_split if default_split in splits else splits[0]
        return subset, split, {
            # A dropdown is only worth showing when there is an actual choice.
            subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
            split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
        }

    def _show_input_preview(dataset: str, default_subset: str, default_split: str) -> dict:
        """Build the component updates for the input preview and both dropdowns.

        Returns only the dropdown updates (leaving the preview untouched) when
        no subset / split could be resolved; otherwise also sets the preview to
        the first ``NUM_ROWS_PREVIEW`` rows, with every cell converted to ``str``.
        """
        subset, split, output = _resolve_dataset_selection(dataset, default_subset=default_subset, default_split=default_split)
        if subset is None or split is None:
            return output
        first_rows = islice(
            stream_rows(dataset, subset, split, batch_size=NUM_ROWS_PREVIEW),
            NUM_ROWS_PREVIEW,
        )
        return {
            input_preview: pd.DataFrame([{k: str(v) for k, v in row.items()} for row in first_rows]),
            **output,
        }

    @dataset_search.change(inputs=[dataset_search], outputs=[input_preview, subset_dropdown, split_dropdown])
    def show_input_from_dataset_search(dataset: str) -> dict:
        """Refresh the preview when the dataset id changes, preferring "default" / "train"."""
        return _show_input_preview(dataset, default_subset="default", default_split="train")

    @subset_dropdown.change(inputs=[dataset_search, subset_dropdown], outputs=[input_preview, subset_dropdown, split_dropdown])
    def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
        """Refresh the preview when the subset changes, preferring the "train" split."""
        return _show_input_preview(dataset, default_subset=subset, default_split="train")

    @split_dropdown.change(inputs=[dataset_search, subset_dropdown, split_dropdown], outputs=[input_preview, subset_dropdown, split_dropdown])
    def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
        """Refresh the preview when the split changes, keeping the current subset."""
        return _show_input_preview(dataset, default_subset=subset, default_split=split)

    @rewrite_button.click(inputs=[dataset_search, subset_dropdown, split_dropdown, input_preview], outputs=[output_preview])
    def rewrite(dataset: str, subset: str, split: str, input_preview_df: pd.DataFrame) -> dict:
        """Placeholder for the rewrite step; currently fills the output preview with a stub frame."""
        # TODO: implement
        return {output_preview: pd.DataFrame([{"TODO": ["implement"]}])}
117
+
118
+
119
if __name__ == "__main__":
    # Start the Gradio server only when this file is executed as a script, so
    # that importing the module (e.g. to reuse `demo`) has no side effects.
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ requests
2
+ pandas
3
+ gradio_huggingfacehub_search