polinaeterna HF staff committed on
Commit
b858233
•
1 Parent(s): 4bc0ae7

fetch parquet filename via dataset-viewer api

Browse files
Files changed (1) hide show
  1. app.py +18 -10
app.py CHANGED
@@ -81,20 +81,28 @@ def plot_and_df(texts, preds):
81
  )
82
 
83
 
 
 
 
 
 
 
 
 
84
  @spaces.GPU
85
  def run_quality_check(dataset, config, split, column, batch_size, num_examples):
86
  logging.info(f"Fetching data for {dataset=} {config=} {split=} {column=}")
87
  try:
88
- data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/{split}/0000.parquet", columns=[column])
89
- except pl.exceptions.ComputeError:
90
- try:
91
- data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/partial-{split}/0000.parquet", columns=[column])
92
- except pl.exceptions.ComputeError:
93
- try:
94
- data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/{split}-part0/0000.parquet", columns=[column])
95
- except Exception as error:
96
- yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(),
97
- return
98
  logging.info("Data fetched.")
99
 
100
  data_sample = data.sample(num_examples, seed=16) if data.shape[0] > num_examples else data
 
81
  )
82
 
83
 
84
def get_first_parquet_filename(dataset, config, split):
    """Return the repo-relative path of the first parquet file of a split.

    Queries the dataset-viewer ``/parquet`` endpoint for ``dataset`` and
    ``config`` and picks the first listed file whose ``split`` field equals
    ``split``.

    Args:
        dataset: Dataset repository id (may contain ``/``).
        config: Configuration (subset) name.
        split: Split name to look up.

    Returns:
        The last three ``/``-separated segments of the file URL joined with
        ``/``.  NOTE(review): presumably ``<config>/<split>/<file>.parquet``
        -- confirm against the dataset-viewer URL layout.

    Raises:
        ValueError: If the API response carries an ``"error"`` key, or if no
            parquet file is listed for the requested split.
    """
    from urllib.parse import urlencode

    # Encode the query string: dataset ids contain "/", and config names may
    # contain characters ("&", "#", spaces) that would corrupt a raw URL.
    query = urlencode({"dataset": dataset, "config": config})
    # The status code is deliberately not checked before decoding: the body is
    # inspected for an "error" key so that message can be shown to the user.
    parquet_resp = session.get(
        f"https://datasets-server.huggingface.co/parquet?{query}",
        timeout=3,
    ).json()
    if "error" in parquet_resp:
        raise ValueError(parquet_resp["error"])

    first_parquet_file = next(
        (file for file in parquet_resp.get("parquet_files", []) if file["split"] == split),
        None,
    )
    if first_parquet_file is None:
        # Replaces the bare "list index out of range" IndexError with a
        # message the UI can display meaningfully.
        raise ValueError(
            f"No parquet files found for {dataset=} {config=} {split=}"
        )
    return "/".join(first_parquet_file["url"].split("/")[-3:])
90
+
91
+
92
  @spaces.GPU
93
  def run_quality_check(dataset, config, split, column, batch_size, num_examples):
94
  logging.info(f"Fetching data for {dataset=} {config=} {split=} {column=}")
95
  try:
96
+ filename = get_first_parquet_filename(dataset, config, split)
97
+ except Exception as error:
98
+ yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(),
99
+ return
100
+
101
+ try:
102
+ data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{filename}", columns=[column])
103
+ except Exception as error:
104
+ yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(),
105
+ return
106
  logging.info("Data fetched.")
107
 
108
  data_sample = data.sample(num_examples, seed=16) if data.shape[0] > num_examples else data