Spaces:
Runtime error
Runtime error
derek-thomas
committed on
Commit
·
7d5b5ca
1
Parent(s):
5d0ccb5
Trying spoiler tag handling
Browse files- src/build_nomic.py +41 -10
src/build_nomic.py
CHANGED
@@ -17,17 +17,42 @@ NOMIC_KEY = os.getenv('NOMIC_KEY')
|
|
17 |
nomic.login(NOMIC_KEY)
|
18 |
logger = setup_logger(__name__)
|
19 |
|
|
|
|
|
|
|
20 |
|
21 |
def count_words(text):
    """Return the number of whitespace-separated words in ``text``.

    Args:
        text: The string to measure. Any run of whitespace separates words.

    Returns:
        The word count as an ``int``. Non-string values (e.g. ``None`` or a
        float NaN standing in for a missing value) count as zero words
        instead of raising ``AttributeError``.
    """
    # Missing values have no ``.split``; treat them as empty content.
    if not isinstance(text, str):
        return 0
    return len(text.split())
|
24 |
|
25 |
|
26 |
-
def
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
return html
|
29 |
|
30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
def delete_old_nomic():
|
32 |
logger.info(f"Trying to delete old version of nomic Atlas...")
|
33 |
try:
|
@@ -41,6 +66,20 @@ def delete_old_nomic():
|
|
41 |
logger.info(f"Failed to delete old version of nomic Atlas.")
|
42 |
|
43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
def build_nomic(dataset):
|
45 |
df = dataset['train'].to_pandas()
|
46 |
|
@@ -65,14 +104,6 @@ def build_nomic(dataset):
|
|
65 |
df['url'] = 'https://www.reddit.com' + df['permalink']
|
66 |
df['html_content'] = df['content'].apply(convert_markdown_to_html)
|
67 |
|
68 |
-
# Regex to extract subreddit
|
69 |
-
subreddit_re = re.compile(r'r/(\w+)')
|
70 |
-
def extract_subreddit(text):
|
71 |
-
match = subreddit_re.search(text)
|
72 |
-
if match:
|
73 |
-
return 'r/' + match.group(1)
|
74 |
-
return ''
|
75 |
-
|
76 |
# Apply the function
|
77 |
df['subreddit'] = df['content'].apply(extract_subreddit)
|
78 |
|
|
|
17 |
nomic.login(NOMIC_KEY)
|
18 |
logger = setup_logger(__name__)
|
19 |
|
20 |
+
# Regex to extract subreddit
|
21 |
+
subreddit_re = re.compile(r'r/(\w+)')
|
22 |
+
|
23 |
|
24 |
def count_words(text):
    """Return the number of whitespace-separated words in ``text``.

    Args:
        text: The string to measure. Any run of whitespace separates words.

    Returns:
        The word count as an ``int``. Non-string values (e.g. ``None`` or a
        float NaN standing in for a missing value) count as zero words
        instead of raising ``AttributeError``.
    """
    # Missing values have no ``.split``; treat them as empty content.
    if not isinstance(text, str):
        return 0
    return len(text.split())
|
27 |
|
28 |
|
29 |
+
def preprocess_markdown(text):
    """Rewrite Reddit spoiler markup (``>!hidden!<``) as styled HTML spans.

    Each spoiler becomes a ``<span class="spoiler">`` whose inline style
    blacks the text out; inline mouse handlers switch the text colour on
    hover and restore it afterwards.

    Args:
        text: Markdown source that may contain ``>!...!<`` spoilers.

    Returns:
        ``text`` with every spoiler replaced by its span; text without
        spoilers is returned unchanged.
    """
    # Inline CSS for spoilers: black text on a black background.
    spoiler_style = 'background-color: black; color: black;'
    # ``style.color`` takes a bare CSS colour value, not a full declaration.
    hover_color = 'inherit'

    span_open = (
        '<span class="spoiler" style="' + spoiler_style + '"'
        ' onmouseover="this.style.color=\'' + hover_color + '\'"'
        ' onmouseout="this.style.color=\'black\'">'
    )

    def _wrap_spoiler(match):
        # A callable replacement inserts the captured text verbatim, so no
        # backslash/backreference escaping rules apply to the template.
        return span_open + match.group(1) + '</span>'

    # Reddit spoilers open with ``>!`` and close with ``!<``; the lazy group
    # keeps several spoilers on one line separate.
    return re.sub(r'>!(.*?)!<', _wrap_spoiler, text)
|
41 |
+
|
42 |
+
|
43 |
+
def convert_markdown_to_html(text):
    """Render markdown ``text`` to HTML after spoiler preprocessing.

    The text is first passed through ``preprocess_markdown`` and the result
    is handed to ``markdown.markdown``; the rendered HTML is returned.
    """
    return markdown.markdown(preprocess_markdown(text))
|
47 |
|
48 |
|
49 |
+
def extract_subreddit(text):
    """Return the first ``r/<name>`` found in ``text``, or ``''`` if none.

    The search uses the module-level ``subreddit_re`` pattern; only the
    first occurrence is reported.
    """
    found = subreddit_re.search(text)
    if found is None:
        return ''
    return 'r/' + found.group(1)
|
54 |
+
|
55 |
+
|
56 |
def delete_old_nomic():
|
57 |
logger.info(f"Trying to delete old version of nomic Atlas...")
|
58 |
try:
|
|
|
66 |
logger.info(f"Failed to delete old version of nomic Atlas.")
|
67 |
|
68 |
|
69 |
+
def preprocess_markdown(text):
    """Rewrite Reddit spoiler markup (``>!hidden!<``) as styled HTML spans.

    Each spoiler becomes a ``<span class="spoiler">`` whose inline style
    blacks the text out; inline mouse handlers switch the text colour on
    hover and restore it afterwards.

    Args:
        text: Markdown source that may contain ``>!...!<`` spoilers.

    Returns:
        ``text`` with every spoiler replaced by its span; text without
        spoilers is returned unchanged.
    """
    # Inline CSS for spoilers: black text on a black background.
    spoiler_style = 'background-color: black; color: black;'
    # ``style.color`` takes a bare CSS colour value, not a full declaration.
    hover_color = 'inherit'

    span_open = (
        '<span class="spoiler" style="' + spoiler_style + '"'
        ' onmouseover="this.style.color=\'' + hover_color + '\'"'
        ' onmouseout="this.style.color=\'black\'">'
    )

    def _wrap_spoiler(match):
        # A callable replacement inserts the captured text verbatim, so no
        # backslash/backreference escaping rules apply to the template.
        return span_open + match.group(1) + '</span>'

    # Reddit spoilers open with ``>!`` and close with ``!<``; the lazy group
    # keeps several spoilers on one line separate.
    return re.sub(r'>!(.*?)!<', _wrap_spoiler, text)
|
81 |
+
|
82 |
+
|
83 |
def build_nomic(dataset):
|
84 |
df = dataset['train'].to_pandas()
|
85 |
|
|
|
104 |
df['url'] = 'https://www.reddit.com' + df['permalink']
|
105 |
df['html_content'] = df['content'].apply(convert_markdown_to_html)
|
106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
# Apply the function
|
108 |
df['subreddit'] = df['content'].apply(extract_subreddit)
|
109 |
|