lhoestq HF staff committed on
Commit
db651f9
•
1 Parent(s): c6d23b4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -4
app.py CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
7
  from datasets import Features
8
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
9
 
10
- from analyze import get_column_description, get_columns_with_strings, presidio_scan_entities
11
 
12
  MAX_ROWS = 100
13
  T = TypeVar("T")
@@ -34,7 +34,7 @@ class track_iter:
34
  self.next_idx += 1
35
  yield item
36
 
37
- def analyze_dataset(dataset: str) -> pd.DataFrame:
38
  info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
39
  if "error" in info_resp:
40
  yield "❌ " + info_resp["error"], pd.DataFrame()
@@ -52,8 +52,9 @@ def analyze_dataset(dataset: str) -> pd.DataFrame:
52
  for presidio_entity in presidio_scan_entities(
53
  rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
54
  ):
55
- presidio_entities.append(presidio_entity)
56
- yield f"⚙️ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
 
57
  yield f"✅ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
58
 
59
  with gr.Blocks() as demo:
@@ -65,6 +66,12 @@ with gr.Blocks() as demo:
65
  placeholder="Search for dataset id on Huggingface",
66
  search_type="dataset",
67
  ),
 
 
 
 
 
 
68
  ]
69
  button = gr.Button("Run Presidio Scan")
70
  outputs = [
 
7
  from datasets import Features
8
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
9
 
10
+ from analyze import analyzer, get_column_description, get_columns_with_strings, presidio_scan_entities
11
 
12
  MAX_ROWS = 100
13
  T = TypeVar("T")
 
34
  self.next_idx += 1
35
  yield item
36
 
37
+ def analyze_dataset(dataset: str, enabled_presidio_entities: str) -> pd.DataFrame:
38
  info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
39
  if "error" in info_resp:
40
  yield "❌ " + info_resp["error"], pd.DataFrame()
 
52
  for presidio_entity in presidio_scan_entities(
53
  rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
54
  ):
55
+ if presidio_entity.type in enabled_presidio_entities:
56
+ presidio_entities.append(presidio_entity)
57
+ yield f"⚙️ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
58
  yield f"✅ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
59
 
60
  with gr.Blocks() as demo:
 
66
  placeholder="Search for dataset id on Huggingface",
67
  search_type="dataset",
68
  ),
69
+ gr.CheckBoxGroup(
70
+ label="Presidio entities",
71
+ choices=analyzer.get_supported_entities(),
72
+ value=["PERSON", "CREDIT_CARD", "US_SSN", "PHONE_NUMBER", "EMAIL_ADDRESS", "IP_ADDRESS", "US_BANK_NUMBER", "EMAIL", "IBAN_CODE"],
73
+ interative=True,
74
+ ),
75
  ]
76
  button = gr.Button("Run Presidio Scan")
77
  outputs = [