derek-thomas HF staff committed on
Commit
fb9efd9
1 Parent(s): 6621d73

Filtering nsfw content for nomic

Browse files
Files changed (1) hide show
  1. src/build_nomic.py +4 -0
src/build_nomic.py CHANGED
@@ -87,6 +87,10 @@ def preprocess_markdown(text):
87
  def build_nomic(dataset):
88
  df = dataset['train'].to_pandas()
89
 
 
 
 
 
90
  non_embedding_columns = ['date_utc', 'title', 'flair', 'poster', 'url', 'id', 'word_count',
91
  'score', 'score_percentile', 'html_content', 'subreddit']
92
 
 
87
  def build_nomic(dataset):
88
  df = dataset['train'].to_pandas()
89
 
90
+ # Drop rows that mention 'nsfw' (case-insensitive) in content, title, flair, or permalink so they are not displayed in Nomic
91
+ df = df[~df[['content', 'title', 'flair', 'permalink']].apply(
92
+ lambda x: x.str.contains('nsfw', case=False, na=False)).any(axis=1)]
93
+
94
  non_embedding_columns = ['date_utc', 'title', 'flair', 'poster', 'url', 'id', 'word_count',
95
  'score', 'score_percentile', 'html_content', 'subreddit']
96