add track_iter
app.py CHANGED
@@ -1,5 +1,5 @@
 from itertools import count, islice
-from typing import Any, Iterable
+from typing import Any, Iterable, Iterator, TypeVar
 
 import gradio as gr
 import requests
@@ -9,8 +9,8 @@ from gradio_huggingfacehub_search import HuggingfaceHubSearch
 
 from analyze import get_column_description, get_columns_with_strings, presidio_scan_entities
 
-MAX_ENTITIES = 100
 MAX_ROWS = 100
+T = TypeVar("T")
 
 def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
     batch_size = 100
@@ -23,6 +23,17 @@ def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
         for row_item in rows_resp["rows"]:
             yield row_item["row"]
 
+class track_iter:
+    """Wraps an iterable and counts how many items have been consumed so far."""
+    def __init__(self, it: Iterable[T]):
+        self.it = it
+        self.next_idx = 0
+
+    def __iter__(self) -> Iterator[T]:
+        for item in self.it:
+            self.next_idx += 1
+            yield item
+
 def analyze_dataset(dataset: str) -> pd.DataFrame:
     info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
     if "error" in info_resp:
@@ -31,17 +42,18 @@ def analyze_dataset(dataset: str) -> pd.DataFrame:
     config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))
     features = Features.from_dict(info_resp["dataset_info"][config]["features"])
     split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(iter(info_resp["dataset_info"][config]["splits"]))
+    num_rows = min(info_resp["dataset_info"][config]["splits"][split]["num_examples"], MAX_ROWS)
     scanned_columns = get_columns_with_strings(features)
     columns_descriptions = [
         get_column_description(column_name, features[column_name]) for column_name in scanned_columns
     ]
-    rows = islice(stream_rows(dataset, config, split), MAX_ROWS)
+    rows = track_iter(islice(stream_rows(dataset, config, split), MAX_ROWS))
     presidio_entities = []
-    for presidio_entity in
+    for presidio_entity in presidio_scan_entities(
         rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
-    )
+    ):
         presidio_entities.append(presidio_entity)
-        yield f"
+        yield f"Scanning {dataset} [{rows.next_idx} / {num_rows}]:", pd.DataFrame(presidio_entities)
 
 demo = gr.Interface(
     fn=analyze_dataset,