derek-thomas committed on
Commit
1abbec9
·
1 Parent(s): 7bde858

Removing nsfw content via the nsfw column as well!

Browse files
Files changed (1) hide show
  1. src/build_nomic.py +2 -2
src/build_nomic.py CHANGED
@@ -87,9 +87,9 @@ def preprocess_markdown(text):
87
  def build_nomic(dataset):
88
  df = dataset['train'].to_pandas()
89
 
90
- # Filter df for nsfw content for displaying in Nomic
91
  df = df[~df[['content', 'title', 'flair', 'permalink']].apply(
92
- lambda x: x.str.contains('nsfw', case=False, na=False)).any(axis=1)]
93
 
94
  non_embedding_columns = ['date_utc', 'title', 'flair', 'poster', 'url', 'id', 'word_count',
95
  'score', 'score_percentile', 'html_content', 'subreddit']
 
87
  def build_nomic(dataset):
88
  df = dataset['train'].to_pandas()
89
 
90
+ # For Nomic: filter out rows that contain 'nsfw' in the specified text columns or where the 'nsfw' column is True
91
  df = df[~df[['content', 'title', 'flair', 'permalink']].apply(
92
+ lambda x: x.str.contains('nsfw', case=False, na=False)).any(axis=1) & ~df['nsfw']]
93
 
94
  non_embedding_columns = ['date_utc', 'title', 'flair', 'poster', 'url', 'id', 'word_count',
95
  'score', 'score_percentile', 'html_content', 'subreddit']