Spaces:
Running
Running
⚗️ ⚡️ better stopwords and splitting
Browse files
Signed-off-by: peter szemraj <peterszemraj@gmail.com>
utils.py
CHANGED
@@ -19,11 +19,11 @@ logging.basicConfig(
|
|
19 |
|
20 |
import torch
|
21 |
from natsort import natsorted
|
22 |
-
from nltk.tokenize import word_tokenize, WhitespaceTokenizer
|
23 |
from rapidfuzz import fuzz
|
24 |
|
25 |
STOPWORDS = set(
|
26 |
-
"a about above after again
|
27 |
)
|
28 |
|
29 |
|
@@ -66,30 +66,48 @@ def remove_stopwords(
|
|
66 |
:param bool contraction_tokenize: use custom apostrophe tokenizer, defaults to True
|
67 |
:return str: text with stopwords removed
|
68 |
"""
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
|
80 |
-
|
81 |
-
|
|
|
82 |
|
83 |
-
|
84 |
|
85 |
-
|
86 |
-
filtered_text = re.sub(r"\s+", " ", filtered_text)
|
87 |
-
filtered_text = filtered_text.strip()
|
88 |
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
|
|
|
|
|
|
93 |
|
94 |
return filtered_text
|
95 |
|
|
|
19 |
|
20 |
import torch
|
21 |
from natsort import natsorted
|
22 |
+
from nltk.tokenize import word_tokenize, WhitespaceTokenizer, sent_tokenize
|
23 |
from rapidfuzz import fuzz
|
24 |
|
25 |
STOPWORDS = set(
|
26 |
+
"a about above after again all also am an and any are aren't as at back be because been before being below between both but by can't cannot could couldn't did didn't do does doesn't doing don't down during each few for from further had hadn't has hasn't have haven't having he'd he'll he's hence her here here's hers herself him himself his how how's however i'd i'll i'm i've if in into is isn't it's its itself just let's me more moreover most mustn't my myself new nor now of off on once only or other ought our ours ourselves out over own really same shan't she'd she'll she's should shouldn't so some such than that that's the their theirs them themselves then there there's therefore these they they'd they'll they're they've this those through thus to too under until up use used using very was wasn't we we'd we'll we're we've were weren't what what's when when's where where's which while who who's whom why why's with won't would wouldn't you'd you'll you're you've your yours yourself yourselves".split()
|
27 |
)
|
28 |
|
29 |
|
|
|
66 |
:param bool contraction_tokenize: use custom apostrophe tokenizer, defaults to True
|
67 |
:return str: text with stopwords removed
|
68 |
"""
|
69 |
+
lines = text.split("\n")
|
70 |
+
filtered_lines = []
|
71 |
+
|
72 |
+
def fix_commas(text: str) -> str:
|
73 |
+
"""fixes commas in text to have a space after them"""
|
74 |
+
spaced_text = text.replace(",", ", ")
|
75 |
+
return spaced_text.replace(" ", " ").strip()
|
76 |
+
|
77 |
+
for line in lines:
|
78 |
+
sentences = sent_tokenize(line)
|
79 |
+
filtered_sentences = []
|
80 |
+
|
81 |
+
for sentence in sentences:
|
82 |
+
# Add space around punctuations for the regex to work correctly, only if they are followed by a letter
|
83 |
+
sentence_with_spaces = re.sub(r"([.,!?])(\w)", r"\1 \2", sentence[:-1])
|
84 |
+
|
85 |
+
words = (
|
86 |
+
contraction_aware_tokenize(sentence_with_spaces)
|
87 |
+
if contraction_tokenize
|
88 |
+
else word_tokenize(sentence_with_spaces)
|
89 |
+
)
|
90 |
|
91 |
+
filtered_words = []
|
92 |
+
for word in words:
|
93 |
+
if word.lower() not in stopwords:
|
94 |
+
filtered_words.append(word)
|
95 |
|
96 |
+
filtered_sentence = " ".join(filtered_words)
|
97 |
+
# Restore original spaces around punctuation marks
|
98 |
+
filtered_sentence = re.sub(r"([.,!?])\s*", r"\1", filtered_sentence)
|
99 |
|
100 |
+
filtered_sentences.append(filtered_sentence + sentence[-1])
|
101 |
|
102 |
+
filtered_line = " ".join(filtered_sentences)
|
|
|
|
|
103 |
|
104 |
+
# Replace multiple consecutive whitespaces with a single space
|
105 |
+
filtered_line = re.sub(r"\s+", " ", filtered_line)
|
106 |
+
filtered_line = fix_commas(filtered_line.strip())
|
107 |
+
|
108 |
+
filtered_lines.append(filtered_line)
|
109 |
+
|
110 |
+
filtered_text = "\n".join(filtered_lines)
|
111 |
|
112 |
return filtered_text
|
113 |
|