Commit fd7a758 (1 parent: 284cae9): add nested texts
app.py CHANGED
@@ -107,7 +107,9 @@ def run_quality_check(dataset, config, split, column, batch_size, num_examples):
     logging.info("Data fetched.")
 
     data_sample = data.sample(num_examples, seed=16) if data.shape[0] > num_examples else data
-    texts =
+    texts = data_sample[column].to_list()
+    if nested_column:
+        texts = [text[nested_column] for text in texts]
     predictions, texts_processed = [], []
     num_examples = min(len(texts), num_examples)
     for i in range(0, num_examples, batch_size):
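The new `nested_column` branch lets the quality check read struct columns, where each row is a dict and the text sits under one key. A minimal sketch of just that extraction step, on hypothetical rows shaped like the "continuation" column of allenai/real-toxicity-prompts (the score values are made up):

    # Rows fetched from a struct column: each value is a dict,
    # and the text sits under one key.
    rows = [
        {"text": "a first continuation", "severe_toxicity": 0.01},
        {"text": "a second continuation", "severe_toxicity": 0.02},
    ]
    nested_column = "text"
    texts = rows  # in the app: texts = data_sample[column].to_list()
    if nested_column:
        texts = [text[nested_column] for text in texts]
    assert texts == ["a first continuation", "a second continuation"]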
@@ -144,7 +146,7 @@ def plot_toxicity(scores):
 
     return fig
 
-def call_perspective_api(texts_df, column_name, dataset, config, split):#, full_check=False):
+def call_perspective_api(texts_df, column_name, nested_column_name, dataset, config, split):#, full_check=False):
     headers = {
         "content-type": "application/json",
     }
@@ -154,21 +156,23 @@ def call_perspective_api(texts_df, column_name, dataset, config, split):#, full_check=False):
     if texts_df.values.tolist() == [['', '', '']]:
         logging.info(f"Fetching data for {dataset=} {config=} {split=} {column_name=}")
         try:
-
-        except
-
-
-
-
-
-
-
-
+            filename = get_first_parquet_filename(dataset, config, split)
+        except Exception as error:
+            yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
+            return
+
+        try:
+            logging.info(f"Loading hf://datasets/{dataset}@~parquet/{filename}")
+            texts_df = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{filename}", columns=[column_name])
+        except Exception as error:
+            yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
+            return
         logging.info("Data fetched.")
         texts_df = texts_df.to_pandas()
 
-    # texts = texts_df.sample(100, seed=16)[column_name].values if not full_check else texts_df[column_name].values
     texts = texts_df.sample(100, random_state=16)[column_name].values if texts_df.shape[0] > 100 else texts_df[column_name].values
+    if nested_column_name:
+        texts = [text[nested_column_name] for text in texts]
 
     n_samples = len(texts)
     for i, text in tqdm(enumerate(texts), desc="scanning with perspective"):
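Because the toxicity scan runs as a Gradio generator, the new try/except blocks surface failures by yielding an error message into the progress output and returning, instead of raising and leaving the UI hanging. A stripped-down sketch of that pattern, with a stub standing in for the app's get_first_parquet_filename helper:

    import pandas as pd

    def first_parquet_filename(dataset: str) -> str:
        # Stub for get_first_parquet_filename, which resolves the first
        # shard on the dataset's auto-converted parquet branch.
        if "/" not in dataset:
            raise ValueError(f"unknown dataset {dataset!r}")
        return "default/train/0000.parquet"

    def scan(dataset: str):
        try:
            filename = first_parquet_filename(dataset)
        except Exception as error:
            # Each yield must match the handler's outputs list in length and order.
            yield f"❌ {error}", pd.DataFrame()
            return
        yield f"Loading hf://datasets/{dataset}@~parquet/{filename}", pd.DataFrame()

    print(next(scan("bad-name"))[0])  # ❌ unknown dataset 'bad-name'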
@@ -245,45 +249,80 @@ with gr.Blocks() as demo:
     """
     return gr.HTML(value=html_code)
 
-
+    with gr.Row():
+        text_column_dropdown = gr.Dropdown(label="Text column name", info="Text colum name to check. ")
+        nested_text_column_dropdown = gr.Dropdown(label="Nested text key")#, visible=False)
 
-    def _resolve_dataset_selection(dataset: str, default_subset: str, default_split: str):
+    def _resolve_dataset_selection(dataset: str, default_subset: str, default_split: str, text_feature):
         if "/" not in dataset.strip().strip("/"):
             return {
                 subset_dropdown: gr.Dropdown(visible=False),
                 split_dropdown: gr.Dropdown(visible=False),
-                text_column_dropdown: gr.Dropdown(info="Text colum name to check
+                text_column_dropdown: gr.Dropdown(info="Text colum name to check"),
+                nested_text_column_dropdown: gr.Dropdown(visible=False)
             }
         info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
         if "error" in info_resp:
             return {
                 subset_dropdown: gr.Dropdown(visible=False),
                 split_dropdown: gr.Dropdown(visible=False),
-                text_column_dropdown: gr.Dropdown(label="Text column name", info="Text colum name to check
+                text_column_dropdown: gr.Dropdown(label="Text column name", info="Text colum name to check"),
+                nested_text_column_dropdown: gr.Dropdown(visible=False)
             }
         subsets: list[str] = list(info_resp["dataset_info"])
         subset = default_subset if default_subset in subsets else subsets[0]
         splits: list[str] = info_resp["dataset_info"][subset]["splits"]
         split = default_split if default_split in splits else splits[0]
         features = info_resp["dataset_info"][subset]["features"]
-
+
+        def _is_string_feature(feature):
+            return isinstance(feature, dict) and feature.get("dtype") == "string"
+
+        text_features = [feature_name for feature_name, feature in features.items() if _is_string_feature(feature)]
+        nested_features = [feature_name for feature_name, feature in features.items() if isinstance(feature, dict) and isinstance(next(iter(feature.values())), dict)]
+        nested_text_features = [feature_name for feature_name in nested_features if any(_is_string_feature(nested_feature) for nested_feature in features[feature_name].values())]
+        if not text_feature:
+            return {
+                subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
+                split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
+                text_column_dropdown: gr.Dropdown(choices=text_features + nested_text_features, label="Text column name",
+                                                  info="Text colum name to check"),
+                nested_text_column_dropdown: gr.Dropdown(visible=False),
+            }
+        logging.info(nested_text_features)
+        if text_feature in nested_text_features:
+            nested_keys = [feature_name for feature_name, feature in features[text_feature].items() if _is_string_feature(feature)]
+            return {
+                subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
+                split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
+                text_column_dropdown: gr.Dropdown(choices=text_features + nested_text_features,
+                                                  label="Text column name",
+                                                  info="Text colum name to check (only non-nested texts are supported)"),
+                nested_text_column_dropdown: gr.Dropdown(value=nested_keys[0], choices=nested_keys,
+                                                         label="Nested text column name", visible=True)
+            }
         return {
             subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
             split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
-            text_column_dropdown: gr.Dropdown(choices=text_features, label="Text column name", info="Text colum name to check (only non-nested texts are supported)"),
+            text_column_dropdown: gr.Dropdown(choices=text_features + nested_text_features, label="Text column name", info="Text colum name to check (only non-nested texts are supported)"),
+            nested_text_column_dropdown: gr.Dropdown(visible=False),
         }
 
-    @dataset_name.change(inputs=[dataset_name], outputs=[subset_dropdown, split_dropdown, text_column_dropdown])
+    @dataset_name.change(inputs=[dataset_name], outputs=[subset_dropdown, split_dropdown, text_column_dropdown, nested_text_column_dropdown])
     def show_input_from_subset_dropdown(dataset: str) -> dict:
-        return _resolve_dataset_selection(dataset, default_subset="default", default_split="train")
+        return _resolve_dataset_selection(dataset, default_subset="default", default_split="train", text_feature=None)
 
-    @subset_dropdown.change(inputs=[dataset_name, subset_dropdown], outputs=[subset_dropdown, split_dropdown, text_column_dropdown])
+    @subset_dropdown.change(inputs=[dataset_name, subset_dropdown], outputs=[subset_dropdown, split_dropdown, text_column_dropdown, nested_text_column_dropdown])
     def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
-        return _resolve_dataset_selection(dataset, default_subset=subset, default_split="train")
+        return _resolve_dataset_selection(dataset, default_subset=subset, default_split="train", text_feature=None)
 
-    @split_dropdown.change(inputs=[dataset_name, subset_dropdown, split_dropdown], outputs=[subset_dropdown, split_dropdown, text_column_dropdown])
+    @split_dropdown.change(inputs=[dataset_name, subset_dropdown, split_dropdown], outputs=[subset_dropdown, split_dropdown, text_column_dropdown, nested_text_column_dropdown])
     def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
-        return _resolve_dataset_selection(dataset, default_subset=subset, default_split=split)
+        return _resolve_dataset_selection(dataset, default_subset=subset, default_split=split, text_feature=None)
+
+    @text_column_dropdown.change(inputs=[dataset_name, subset_dropdown, split_dropdown, text_column_dropdown], outputs=[subset_dropdown, split_dropdown, text_column_dropdown, nested_text_column_dropdown])
+    def show_input_from_text_column_dropdown(dataset: str, subset: str, split: str, text_column) -> dict:
+        return _resolve_dataset_selection(dataset, default_subset=subset, default_split=split, text_feature=text_column)
 
     gr.Markdown("## Run nvidia quality classifier")
     batch_size = gr.Slider(0, 64, 32, step=4, label="Inference batch size", info="(set this to smaller value if this space crashes.)")
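_resolve_dataset_selection classifies columns from the datasets-server /info response, where a string column serializes as {"dtype": "string", ...} and a struct column as a dict of sub-features. A self-contained sketch of equivalent classification logic, run on an illustrative features payload rather than a live response:

    def classify_features(features: dict) -> tuple[list[str], list[str]]:
        # Split a features mapping into flat text columns and struct
        # columns that contain at least one string sub-feature.
        def is_string(feature) -> bool:
            return isinstance(feature, dict) and feature.get("dtype") == "string"

        text_features = [name for name, feat in features.items() if is_string(feat)]
        nested_text_features = [
            name
            for name, feat in features.items()
            if isinstance(feat, dict)
            and not is_string(feat)
            and any(is_string(sub) for sub in feat.values() if isinstance(sub, dict))
        ]
        return text_features, nested_text_features

    # Shaped like /info output: "prompt" is flat, "continuation" is a struct.
    features = {
        "prompt": {"dtype": "string", "_type": "Value"},
        "continuation": {"text": {"dtype": "string", "_type": "Value"}},
    }
    print(classify_features(features))  # (['prompt'], ['continuation'])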
@@ -305,13 +344,13 @@ with gr.Blocks() as demo:
 
     gr.Examples(
         [
-            ["HuggingFaceFW/fineweb-edu", "default", "train", "text", 16, 500],
-            ["fka/awesome-chatgpt-prompts", "default", "train", "prompt", 64, 200],
-            ["proj-persona/PersonaHub", "instruction", "train", "synthesized text", 32, 1000],
-            ["argilla/FinePersonas-v0.1", "default", "train", "persona", 64, 1000],
-            ["
+            ["HuggingFaceFW/fineweb-edu", "default", "train", "text", None, 16, 500],
+            # ["fka/awesome-chatgpt-prompts", "default", "train", "prompt", 64, 200],
+            # ["proj-persona/PersonaHub", "instruction", "train", "synthesized text", 32, 1000],
+            ["argilla/FinePersonas-v0.1", "default", "train", "persona", None, 64, 1000],
+            ["allenai/real-toxicity-prompts", "default", "train", "continuation", "text", 64, 1000],
         ],
-        [dataset_name, subset_dropdown, split_dropdown, text_column_dropdown, batch_size, num_examples],
+        [dataset_name, subset_dropdown, split_dropdown, text_column_dropdown, nested_text_column_dropdown, batch_size, num_examples],
         [progress_bar, plot, df_low, df_medium, df_high, texts_df],
         fn=run_quality_check,
         run_on_click=False,
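Every example row gains a value for the new nested-key input because gr.Examples matches rows to the inputs list positionally; rows with flat text columns pass None in that slot. A tiny sketch of the contract, with placeholder components in place of the app's dropdowns:

    import gradio as gr

    with gr.Blocks() as demo:
        dataset = gr.Textbox(label="Dataset")
        nested_key = gr.Textbox(label="Nested text key")
        # One value per input component, in the same order as the inputs list.
        gr.Examples(
            [
                ["HuggingFaceFW/fineweb-edu", ""],          # flat text column
                ["allenai/real-toxicity-prompts", "text"],  # struct column, key "text"
            ],
            [dataset, nested_key],
        )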
@@ -320,7 +359,7 @@ with gr.Blocks() as demo:
 
     gr_check_btn.click(
         run_quality_check,
-        inputs=[dataset_name, subset_dropdown, split_dropdown, text_column_dropdown, batch_size, num_examples],
+        inputs=[dataset_name, subset_dropdown, split_dropdown, text_column_dropdown, nested_text_column_dropdown, batch_size, num_examples],
         outputs=[progress_bar, plot, df_low, df_medium, df_high, texts_df]
     )
 
@@ -335,7 +374,7 @@ with gr.Blocks() as demo:
     toxicity_df = gr.DataFrame()
     gr_toxicity_btn.click(
         call_perspective_api,
-        inputs=[texts_df, text_column_dropdown, dataset_name, subset_dropdown, split_dropdown],#, checkbox],
+        inputs=[texts_df, text_column_dropdown, nested_text_column_dropdown, dataset_name, subset_dropdown, split_dropdown],#, checkbox],
         outputs=[toxicity_progress_bar, toxicity_hist, toxicity_df]
     )
 
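The dropdown cascade above works because a Gradio handler may return a dict keyed by output components, letting one resolver update several dropdowns and their visibility at once. A self-contained sketch of the same wiring, with hard-coded choices standing in for the /info lookup:

    import gradio as gr

    with gr.Blocks() as demo:
        dataset = gr.Textbox(label="Dataset")
        subset = gr.Dropdown(label="Subset", visible=False)
        split = gr.Dropdown(label="Split", visible=False)

        @dataset.change(inputs=[dataset], outputs=[subset, split])
        def resolve(name: str) -> dict:
            # Hard-coded stand-ins; the app derives these from the /info response.
            subsets, splits = ["default", "extra"], ["train", "test"]
            return {
                subset: gr.Dropdown(value=subsets[0], choices=subsets, visible=len(subsets) > 1),
                split: gr.Dropdown(value=splits[0], choices=splits, visible=len(splits) > 1),
            }

    demo.launch()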