davanstrien HF staff committed on
Commit
4a5d03c
1 Parent(s): 9ed5b2c

remove nsfw datasets

Browse files
Files changed (1) hide show
  1. load_data.py +1 -0
load_data.py CHANGED
@@ -141,6 +141,7 @@ def load_cards(
141
  f"Loading cards with min_len={min_len}, min_likes={min_likes}, last_modified={last_modified}"
142
  )
143
  df = pl.read_parquet(DATASET_PARQUET_URL)
 
144
  df = parse_markdown_column(df, "card", "datasetId")
145
  df = df.with_columns(pl.col("parsed_markdown").str.len_chars().alias("card_len"))
146
  df = df.filter(pl.col("card_len") > min_len)
 
141
  f"Loading cards with min_len={min_len}, min_likes={min_likes}, last_modified={last_modified}"
142
  )
143
  df = pl.read_parquet(DATASET_PARQUET_URL)
144
+ df = df.filter(~pl.col("tags").list.contains("not-for-all-audiences"))
145
  df = parse_markdown_column(df, "card", "datasetId")
146
  df = df.with_columns(pl.col("parsed_markdown").str.len_chars().alias("card_len"))
147
  df = df.filter(pl.col("card_len") > min_len)