from itertools import count, islice
from typing import Any, Iterable, Iterator, TypeVar
import gradio as gr
import requests
import pandas as pd
from datasets import Features
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from analyze import get_column_description, get_columns_with_strings, presidio_scan_entities
MAX_ROWS = 100
T = TypeVar("T")
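
# Stream rows of a dataset split from the Hugging Face datasets-server /rows API,
# fetching them in batches and yielding one row dict at a time.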
def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
    batch_size = 100
    for i in count():
        rows_resp = requests.get(
            f"https://datasets-server.huggingface.co/rows"
            f"?dataset={dataset}&config={config}&split={split}"
            f"&offset={i * batch_size}&length={batch_size}",
            timeout=20,
        ).json()
        if "error" in rows_resp:
            raise RuntimeError(rows_resp["error"])
        if not rows_resp["rows"]:
            break
        for row_item in rows_resp["rows"]:
            yield row_item["row"]
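
# Thin wrapper around an iterable that counts how many items have been yielded,
# used to report scan progress.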
class track_iter:
    def __init__(self, it: Iterable[T]):
        self.it = it
        self.next_idx = 0

    def __iter__(self) -> Iterator[T]:
        for item in self.it:
            self.next_idx += 1
            yield item
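
# Generator used by the Gradio app: it yields (status_markdown, dataframe) tuples so the
# outputs update progressively while the scan runs.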
def analyze_dataset(dataset: str) -> Iterable[tuple[str, pd.DataFrame]]:
    info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
    if "error" in info_resp:
        yield "❌ " + info_resp["error"], pd.DataFrame()
        return
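    # Pick a config and split, preferring "default" and "train" when available.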
    config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))
    features = Features.from_dict(info_resp["dataset_info"][config]["features"])
    split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(iter(info_resp["dataset_info"][config]["splits"]))
    num_rows = min(info_resp["dataset_info"][config]["splits"][split]["num_examples"], MAX_ROWS)
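    # Only string columns are scanned; their descriptions come from the analyze module.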
    scanned_columns = get_columns_with_strings(features)
    columns_descriptions = [
        get_column_description(column_name, features[column_name]) for column_name in scanned_columns
    ]
    rows = track_iter(islice(stream_rows(dataset, config, split), MAX_ROWS))
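    # Scan the rows with Presidio and yield partial results as they come in.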
    presidio_entities = []
    for presidio_entity in presidio_scan_entities(
        rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
    ):
        presidio_entities.append(presidio_entity)
        yield f"⚙️ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
    yield f"✅ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
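
# Gradio UI: a dataset search box, a button to launch the scan, and outputs for the
# status message and the table of detected entities.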
with gr.Blocks() as demo:
    gr.Markdown("# Scan datasets using Presidio")
    gr.Markdown(
        "The space takes a HF dataset name as input and returns the list of entities "
        "detected by Presidio in the first samples."
    )
    inputs = [
        HuggingfaceHubSearch(
            label="Hub Dataset ID",
            placeholder="Search for a dataset id on Hugging Face",
            search_type="dataset",
        ),
    ]
    button = gr.Button("Run Presidio Scan")
    outputs = [
        gr.Markdown(),
        gr.DataFrame(),
    ]
    button.click(analyze_dataset, inputs, outputs)
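    # Example datasets that can be scanned with one click.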
    gr.Examples(
        [["microsoft/orca-math-word-problems-200k"], ["tatsu-lab/alpaca"], ["Anthropic/hh-rlhf"], ["OpenAssistant/oasst1"], ["sidhq/email-thread-summary"], ["lhoestq/fake_name_and_ssn"]],
        inputs,
        outputs,
        fn=analyze_dataset,
        run_on_click=True,
    )
demo.launch()