Spaces:
Runtime error
Runtime error
derek-thomas
committed on
Commit
·
fb9efd9
1
Parent(s):
6621d73
Filtering nsfw content for nomic
Browse files - src/build_nomic.py +4 -0
src/build_nomic.py
CHANGED
@@ -87,6 +87,10 @@ def preprocess_markdown(text):
|
|
87 |
def build_nomic(dataset):
|
88 |
df = dataset['train'].to_pandas()
|
89 |
|
|
|
|
|
|
|
|
|
90 |
non_embedding_columns = ['date_utc', 'title', 'flair', 'poster', 'url', 'id', 'word_count',
|
91 |
'score', 'score_percentile', 'html_content', 'subreddit']
|
92 |
|
|
|
87 |
def build_nomic(dataset):
|
88 |
df = dataset['train'].to_pandas()
|
89 |
|
90 |
+
# Drop rows whose content, title, flair, or permalink mentions "nsfw" (case-insensitive) so they are not displayed in Nomic
|
91 |
+
df = df[~df[['content', 'title', 'flair', 'permalink']].apply(
|
92 |
+
lambda x: x.str.contains('nsfw', case=False, na=False)).any(axis=1)]
|
93 |
+
|
94 |
non_embedding_columns = ['date_utc', 'title', 'flair', 'poster', 'url', 'id', 'word_count',
|
95 |
'score', 'score_percentile', 'html_content', 'subreddit']
|
96 |
|