Spaces:
Runtime error
Runtime error
Commit
•
779c2fa
1
Parent(s):
7fa626d
Updating column names
Browse files - src/build_nomic.py +10 -3
src/build_nomic.py
CHANGED
@@ -10,11 +10,16 @@ NOMIC_KEY = os.getenv('NOMIC_KEY')
|
|
10 |
nomic.login(NOMIC_KEY)
|
11 |
|
12 |
|
|
|
|
|
|
|
|
|
|
|
13 |
def build_nomic(dataset):
|
14 |
df = dataset['train'].to_pandas()
|
15 |
|
16 |
-
non_embedding_columns = ['date_utc', 'title', 'flair', 'content', 'poster', 'permalink', 'id', '
|
17 |
-
'score', '
|
18 |
|
19 |
# Calculate the 0th, 10th, 20th, ..., 90th percentiles for the 'score' column
|
20 |
percentiles = df['score'].quantile([0, .1, .2, .3, .4, .5, .6, .7, .8, .9]).tolist()
|
@@ -30,9 +35,11 @@ def build_nomic(dataset):
|
|
30 |
# This assigns each score to its corresponding percentile range
|
31 |
df['score_percentile'] = pd.cut(df['score'], bins=bins, labels=labels, include_lowest=True)
|
32 |
|
|
|
|
|
33 |
# Create Atlas project
|
34 |
project = atlas.map_data(embeddings=np.stack(df['embedding'].values),
|
35 |
data=df[non_embedding_columns].to_dict(orient='records'),
|
36 |
id_field='id',
|
37 |
identifier='BORU Subreddit Neural Search',
|
38 |
-
)
|
|
|
10 |
nomic.login(NOMIC_KEY)
|
11 |
|
12 |
|
13 |
+
def count_words(text):
|
14 |
+
words = text.split()
|
15 |
+
return len(words)
|
16 |
+
|
17 |
+
|
18 |
def build_nomic(dataset):
|
19 |
df = dataset['train'].to_pandas()
|
20 |
|
21 |
+
non_embedding_columns = ['date_utc', 'title', 'flair', 'content', 'poster', 'permalink', 'id', 'word_count',
|
22 |
+
'score', 'score_percentile']
|
23 |
|
24 |
# Calculate the 0th, 10th, 20th, ..., 90th percentiles for the 'score' column
|
25 |
percentiles = df['score'].quantile([0, .1, .2, .3, .4, .5, .6, .7, .8, .9]).tolist()
|
|
|
35 |
# This assigns each score to its corresponding percentile range
|
36 |
df['score_percentile'] = pd.cut(df['score'], bins=bins, labels=labels, include_lowest=True)
|
37 |
|
38 |
+
df['word_count'] = df['content'].apply(count_words)
|
39 |
+
|
40 |
# Create Atlas project
|
41 |
project = atlas.map_data(embeddings=np.stack(df['embedding'].values),
|
42 |
data=df[non_embedding_columns].to_dict(orient='records'),
|
43 |
id_field='id',
|
44 |
identifier='BORU Subreddit Neural Search',
|
45 |
+
)
|