Update app.py
Browse files
app.py
CHANGED
@@ -11,6 +11,19 @@ from analyze import analyzer, get_column_description, get_columns_with_strings,
|
|
11 |
|
12 |
MAX_ROWS = 100
|
13 |
T = TypeVar("T")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
|
16 |
batch_size = 100
|
@@ -52,7 +65,7 @@ def analyze_dataset(dataset: str, enabled_presidio_entities: str) -> pd.DataFram
|
|
52 |
for presidio_entity in presidio_scan_entities(
|
53 |
rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
|
54 |
):
|
55 |
-
if presidio_entity
|
56 |
presidio_entities.append(presidio_entity)
|
57 |
yield f"⚙️ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
|
58 |
yield f"✅ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
|
@@ -68,8 +81,8 @@ with gr.Blocks() as demo:
|
|
68 |
),
|
69 |
gr.CheckBoxGroup(
|
70 |
label="Presidio entities",
|
71 |
-
choices=analyzer.get_supported_entities(),
|
72 |
-
value=
|
73 |
interative=True,
|
74 |
),
|
75 |
]
|
@@ -80,7 +93,14 @@ with gr.Blocks() as demo:
|
|
80 |
]
|
81 |
button.click(analyze_dataset, inputs, outputs)
|
82 |
gr.Examples(
|
83 |
-
[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
inputs,
|
85 |
outputs,
|
86 |
fn=analyze_dataset,
|
|
|
11 |
|
12 |
MAX_ROWS = 100
|
13 |
T = TypeVar("T")
|
14 |
+
DEFAULT_PRESIDIO_ENTITIES = sorted([
|
15 |
+
'PERSON',
|
16 |
+
'CREDIT_CARD',
|
17 |
+
'US_SSN',
|
18 |
+
'US_DRIVER_LICENSE',
|
19 |
+
'PHONE_NUMBER',
|
20 |
+
'US_PASSPORT',
|
21 |
+
'EMAIL_ADDRESS',
|
22 |
+
'IP_ADDRESS',
|
23 |
+
'US_BANK_NUMBER',
|
24 |
+
'IBAN_CODE',
|
25 |
+
'EMAIL',
|
26 |
+
])
|
27 |
|
28 |
def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
|
29 |
batch_size = 100
|
|
|
65 |
for presidio_entity in presidio_scan_entities(
|
66 |
rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
|
67 |
):
|
68 |
+
if presidio_entity["type"] in enabled_presidio_entities:
|
69 |
presidio_entities.append(presidio_entity)
|
70 |
yield f"⚙️ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
|
71 |
yield f"✅ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
|
|
|
81 |
),
|
82 |
gr.CheckBoxGroup(
|
83 |
label="Presidio entities",
|
84 |
+
choices=sorted(analyzer.get_supported_entities()),
|
85 |
+
value=DEFAULT_PRESIDIO_ENTITIES,
|
86 |
interative=True,
|
87 |
),
|
88 |
]
|
|
|
93 |
]
|
94 |
button.click(analyze_dataset, inputs, outputs)
|
95 |
gr.Examples(
|
96 |
+
[
|
97 |
+
["microsoft/orca-math-word-problems-200k", DEFAULT_PRESIDIO_ENTITIES],
|
98 |
+
["tatsu-lab/alpaca", DEFAULT_PRESIDIO_ENTITIES],
|
99 |
+
["Anthropic/hh-rlhf", DEFAULT_PRESIDIO_ENTITIES],
|
100 |
+
["OpenAssistant/oasst1", DEFAULT_PRESIDIO_ENTITIES],
|
101 |
+
["sidhq/email-thread-summary", DEFAULT_PRESIDIO_ENTITIES],
|
102 |
+
["lhoestq/fake_name_and_ssn", DEFAULT_PRESIDIO_ENTITIES]
|
103 |
+
],
|
104 |
inputs,
|
105 |
outputs,
|
106 |
fn=analyze_dataset,
|