Spaces:
Runtime error
Runtime error
derek-thomas
committed on
Commit
·
1abbec9
1
Parent(s):
7bde858
Removing nsfw content via the nsfw column as well!
Browse files: src/build_nomic.py +2 -2
src/build_nomic.py
CHANGED
@@ -87,9 +87,9 @@ def preprocess_markdown(text):
|
|
87 |
def build_nomic(dataset):
|
88 |
df = dataset['train'].to_pandas()
|
89 |
|
90 |
-
#
|
91 |
df = df[~df[['content', 'title', 'flair', 'permalink']].apply(
|
92 |
-
|
93 |
|
94 |
non_embedding_columns = ['date_utc', 'title', 'flair', 'poster', 'url', 'id', 'word_count',
|
95 |
'score', 'score_percentile', 'html_content', 'subreddit']
|
|
|
87 |
def build_nomic(dataset):
|
88 |
df = dataset['train'].to_pandas()
|
89 |
|
90 |
+
# For nomig: filter out rows that contain 'nsfw' in specified text columns or where 'nsfw' column is True
|
91 |
df = df[~df[['content', 'title', 'flair', 'permalink']].apply(
|
92 |
+
lambda x: x.str.contains('nsfw', case=False, na=False)).any(axis=1) & ~df['nsfw']]
|
93 |
|
94 |
non_embedding_columns = ['date_utc', 'title', 'flair', 'poster', 'url', 'id', 'word_count',
|
95 |
'score', 'score_percentile', 'html_content', 'subreddit']
|