lhoestq (HF staff) committed
Commit
da70c80
1 Parent(s): ede461a

add track_iter

Files changed (1)
  1. app.py +18 -6
app.py CHANGED
@@ -1,5 +1,5 @@
 from itertools import count, islice
-from typing import Any, Iterable
+from typing import Any, Iterable, Iterator, TypeVar
 
 import gradio as gr
 import requests
@@ -9,8 +9,8 @@ from gradio_huggingfacehub_search import HuggingfaceHubSearch
 
 from analyze import get_column_description, get_columns_with_strings, presidio_scan_entities
 
-MAX_ENTITIES = 100
 MAX_ROWS = 100
+T = TypeVar("T")
 
 def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
     batch_size = 100
@@ -23,6 +23,17 @@ def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
         for row_item in rows_resp["rows"]:
             yield row_item["row"]
 
+class track_iter:
+
+    def __init__(self, it: Iterable[T]):
+        self.it = it
+        self.next_idx = 0
+
+    def __iter__(self) -> Iterator[T]:
+        for item in self.it:
+            self.next_idx += 1
+            yield item
+
 def analyze_dataset(dataset: str) -> pd.DataFrame:
     info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
     if "error" in info_resp:
@@ -31,17 +42,18 @@ def analyze_dataset(dataset: str) -> pd.DataFrame:
     config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))
     features = Features.from_dict(info_resp["dataset_info"][config]["features"])
     split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(iter(info_resp["dataset_info"][config]["splits"]))
+    num_rows = min(info_resp["dataset_info"][config]["splits"][split]["num_examples"], MAX_ROWS)
     scanned_columns = get_columns_with_strings(features)
     columns_descriptions = [
         get_column_description(column_name, features[column_name]) for column_name in scanned_columns
     ]
-    rows = islice(stream_rows(dataset, config, split), MAX_ROWS)
+    rows = track_iter(islice(stream_rows(dataset, config, split), MAX_ROWS))
     presidio_entities = []
-    for presidio_entity in islice(presidio_scan_entities(
+    for presidio_entity in presidio_scan_entities(
         rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
-    ), MAX_ENTITIES):
+    ):
         presidio_entities.append(presidio_entity)
-        yield f"Presidio scan results for {dataset}:", pd.DataFrame(presidio_entities)
+        yield f"Scanning {dataset} [{rows.next_idx} / {num_rows}]:", pd.DataFrame(presidio_entities)
 
 demo = gr.Interface(
     fn=analyze_dataset,
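
Note: the heart of this change is the `track_iter` wrapper, which counts items as a downstream consumer pulls them, so progress can be reported while `presidio_scan_entities` lazily consumes the row stream. A minimal, self-contained sketch of the pattern follows; the `Generic[T]` base, the toy consumer loop, and the print format are illustrative additions, not part of the committed app.

from itertools import islice
from typing import Generic, Iterable, Iterator, TypeVar

T = TypeVar("T")


class track_iter(Generic[T]):
    """Wrap an iterable and expose how many items have been consumed so far."""

    def __init__(self, it: Iterable[T]):
        self.it = it
        self.next_idx = 0  # incremented each time an item is pulled

    def __iter__(self) -> Iterator[T]:
        for item in self.it:
            self.next_idx += 1
            yield item


# Toy consumer: progress is read off the wrapper while items are pulled lazily.
rows = track_iter(islice(range(1_000), 100))
for value in rows:
    if rows.next_idx % 25 == 0:
        print(f"scanned {rows.next_idx} / 100 rows so far")

Because `next_idx` is incremented just before each item is yielded, it always equals the number of rows handed to the consumer. That is what lets `analyze_dataset` yield a live `Scanning {dataset} [{rows.next_idx} / {num_rows}]` label as entities arrive: when `fn` is a generator, `gr.Interface` refreshes its outputs on each yield.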