File size: 2,449 Bytes
c143e76 d96e79d 7d373bd c143e76 7d373bd d96e79d c143e76 d96e79d 7d373bd c143e76 7d373bd c143e76 7d373bd d96e79d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
from collections.abc import Iterable, Iterator
from itertools import count
from typing import Any

import gradio as gr
import requests
import pandas as pd
from datasets import Features
from gradio_huggingfacehub_search import HuggingfaceHubSearch

from analyze import get_column_description, get_columns_with_strings, presidio_scan_entities
def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
    """Yield the rows of one dataset split, fetched page by page.

    Rows are requested from the Hugging Face dataset viewer ``/rows`` endpoint
    in pages of ``batch_size`` and yielded one at a time, so a consumer that
    stops iterating early never triggers the remaining requests.

    Args:
        dataset: Hub dataset ID.
        config: Name of the dataset configuration to read.
        split: Name of the split to read.

    Yields:
        The ``"row"`` value of each item in the response's ``"rows"`` list.

    Raises:
        RuntimeError: If a response body contains an ``"error"`` key; the
            error value is used as the exception message.
    """
    batch_size = 100
    for page_index in count():
        rows_resp = requests.get(
            "https://datasets-server.huggingface.co/rows",
            # Passing the values through ``params`` lets requests URL-encode
            # them; dataset/config names are user-provided and may contain
            # characters that would otherwise corrupt the query string.
            params={
                "dataset": dataset,
                "config": config,
                "split": split,
                "offset": page_index * batch_size,
                "length": batch_size,
            },
            timeout=20,
        ).json()
        if "error" in rows_resp:
            raise RuntimeError(rows_resp["error"])
        # An empty page means the previous page was the last one.
        if not rows_resp["rows"]:
            break
        for row_item in rows_resp["rows"]:
            yield row_item["row"]
def analyze_dataset(dataset: str) -> Iterator[tuple[str, pd.DataFrame]]:
    """Scan the string columns of a Hub dataset and stream the findings.

    The function is a generator: every yielded value is a
    ``(markdown_message, dataframe)`` pair matching the two output components
    of the Gradio interface.

    The configuration scanned is ``"default"`` when present, otherwise the
    first one listed in the ``/info`` response; likewise the split is
    ``"train"`` when present, otherwise the first one listed.

    Args:
        dataset: Hub dataset ID.

    Yields:
        ``("❌ ...", empty DataFrame)`` once when the dataset info cannot be
        used, otherwise the results header together with a DataFrame of all
        entities collected so far.

    Raises:
        RuntimeError: Propagated from ``stream_rows`` when fetching rows fails.
    """
    info_resp = requests.get(
        "https://datasets-server.huggingface.co/info",
        # ``params`` makes requests URL-encode the user-provided dataset ID.
        params={"dataset": dataset},
        timeout=3,
    ).json()
    if "error" in info_resp:
        yield "❌ " + info_resp["error"], pd.DataFrame()
        return

    dataset_info = info_resp["dataset_info"]
    # ``next(iter(x))`` on an empty mapping raises StopIteration, which inside
    # a generator is turned into an opaque RuntimeError; use a default instead
    # and report the problem to the user.
    config = "default" if "default" in dataset_info else next(iter(dataset_info), None)
    if config is None:
        yield "❌ No configuration found for this dataset", pd.DataFrame()
        return

    config_info = dataset_info[config]
    features = Features.from_dict(config_info["features"])
    splits = config_info["splits"]
    split = "train" if "train" in splits else next(iter(splits), None)
    if split is None:
        yield "❌ No split found for this dataset", pd.DataFrame()
        return

    scanned_columns = get_columns_with_strings(features)
    columns_descriptions = [
        get_column_description(column_name, features[column_name]) for column_name in scanned_columns
    ]
    rows = stream_rows(dataset, config, split)

    header = f"Presidio scan results for {dataset}:"
    presidio_entities = []
    for presidio_entity in presidio_scan_entities(
        rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
    ):
        presidio_entities.append(presidio_entity)
        # Emit the table after every new entity so the UI updates while the
        # (potentially long) scan is still running.
        yield header, pd.DataFrame(presidio_entities)

    # Nothing was detected: still emit one result so the UI shows an explicit
    # empty table instead of keeping whatever it displayed before.
    if not presidio_entities:
        yield header, pd.DataFrame()
# --- Gradio UI -------------------------------------------------------------
# One search box for picking a Hub dataset, wired to ``analyze_dataset``;
# its (message, table) results are rendered as Markdown plus a DataFrame.

# Input component: dataset search restricted to datasets on the Hub.
_dataset_search = HuggingfaceHubSearch(
    label="Hub Dataset ID",
    placeholder="Search for dataset id on Huggingface",
    search_type="dataset",
)

# Output components, in the same order as the values yielded by the callback.
_result_components = [gr.Markdown(), gr.DataFrame()]

demo = gr.Interface(
    fn=analyze_dataset,
    inputs=[_dataset_search],
    outputs=_result_components,
    title="Scan datasets using Presidio",
    description="The space takes an HF dataset name as an input, and returns the list of entities detected by Presidio in the first samples.",
)

# Start the web server for the interface.
demo.launch()
|