derek-thomas HF staff committed on
Commit
7d5b5ca
1 Parent(s): 5d0ccb5

Trying spoiler tag handling

Browse files
Files changed (1) hide show
  1. src/build_nomic.py +41 -10
src/build_nomic.py CHANGED
@@ -17,17 +17,42 @@ NOMIC_KEY = os.getenv('NOMIC_KEY')
17
  nomic.login(NOMIC_KEY)
18
  logger = setup_logger(__name__)
19
 
 
 
 
20
 
21
  def count_words(text):
22
  words = text.split()
23
  return len(words)
24
 
25
 
26
- def convert_markdown_to_html(markdown_text):
27
- html = markdown.markdown(markdown_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  return html
29
 
30
 
 
 
 
 
 
 
 
31
  def delete_old_nomic():
32
  logger.info(f"Trying to delete old version of nomic Atlas...")
33
  try:
@@ -41,6 +66,20 @@ def delete_old_nomic():
41
  logger.info(f"Failed to delete old version of nomic Atlas.")
42
 
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  def build_nomic(dataset):
45
  df = dataset['train'].to_pandas()
46
 
@@ -65,14 +104,6 @@ def build_nomic(dataset):
65
  df['url'] = 'https://www.reddit.com' + df['permalink']
66
  df['html_content'] = df['content'].apply(convert_markdown_to_html)
67
 
68
- # Regex to extract subreddit
69
- subreddit_re = re.compile(r'r/(\w+)')
70
- def extract_subreddit(text):
71
- match = subreddit_re.search(text)
72
- if match:
73
- return 'r/' + match.group(1)
74
- return ''
75
-
76
  # Apply the function
77
  df['subreddit'] = df['content'].apply(extract_subreddit)
78
 
 
17
  nomic.login(NOMIC_KEY)
18
  logger = setup_logger(__name__)
19
 
20
+ # Regex to extract subreddit
21
+ subreddit_re = re.compile(r'r/(\w+)')
22
+
23
 
24
  def count_words(text):
25
  words = text.split()
26
  return len(words)
27
 
28
 
29
+ def preprocess_markdown(text):
30
+ # Inline CSS for spoilers
31
+ spoiler_style = 'background-color: black; color: black;'
32
+ hover_style = 'color: inherit;' # Assuming you want the text to be visible on hover
33
+
34
+ # Replace Reddit spoiler tags with an HTML span with inline styles
35
+ text = re.sub(
36
+ r'\>\!(.*?)\<\!',
37
+ r'<span class="spoiler" style="' + spoiler_style + '" onmouseover="this.style.color=\'' + hover_style + '\'" onmouseout="this.style.color=\'black\'">\1</span>',
38
+ text
39
+ )
40
+ return text
41
+
42
+
43
+ def convert_markdown_to_html(text):
44
+ processed_text = preprocess_markdown(text)
45
+ html = markdown.markdown(processed_text)
46
  return html
47
 
48
 
49
+ def extract_subreddit(text):
50
+ match = subreddit_re.search(text)
51
+ if match:
52
+ return 'r/' + match.group(1)
53
+ return ''
54
+
55
+
56
  def delete_old_nomic():
57
  logger.info(f"Trying to delete old version of nomic Atlas...")
58
  try:
 
66
  logger.info(f"Failed to delete old version of nomic Atlas.")
67
 
68
 
69
+ def preprocess_markdown(text):
70
+ # Inline CSS for spoilers
71
+ spoiler_style = 'background-color: black; color: black;'
72
+ hover_style = 'color: inherit;' # Assuming you want the text to be visible on hover
73
+
74
+ # Replace Reddit spoiler tags >!spoiler!< with an HTML span with inline styles
75
+ text = re.sub(
76
+ r'\>\!(.*?)\<\!',
77
+ r'<span class="spoiler" style="' + spoiler_style + '" onmouseover="this.style.color=\'' + hover_style + '\'" onmouseout="this.style.color=\'black\'">\1</span>',
78
+ text
79
+ )
80
+ return text
81
+
82
+
83
  def build_nomic(dataset):
84
  df = dataset['train'].to_pandas()
85
 
 
104
  df['url'] = 'https://www.reddit.com' + df['permalink']
105
  df['html_content'] = df['content'].apply(convert_markdown_to_html)
106
 
 
 
 
 
 
 
 
 
107
  # Apply the function
108
  df['subreddit'] = df['content'].apply(extract_subreddit)
109