Update functions.py
Browse files - functions.py +7 -1
functions.py
CHANGED
@@ -106,10 +106,16 @@ def preprocess_plain_text(text,window_size=3):
|
|
106 |
return passages
|
107 |
|
108 |
@st.experimental_memo(suppress_st_warning=True)
|
109 |
-
def
|
110 |
|
111 |
"""Chunk text longer than 500 tokens"""
|
112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
article = nlp(text)
|
114 |
sentences = [i.text for i in list(article.sents)]
|
115 |
|
|
|
106 |
return passages
|
107 |
|
108 |
@st.experimental_memo(suppress_st_warning=True)
|
109 |
+
def chunk_and_process_text(text):
|
110 |
|
111 |
"""Chunk text longer than 500 tokens"""
|
112 |
|
113 |
+
text = text.encode("ascii", "ignore").decode() # drop non-ASCII characters
|
114 |
+
text = re.sub(r"https*\S+", " ", text) # url
|
115 |
+
text = re.sub(r"@\S+", " ", text) # mentions
|
116 |
+
text = re.sub(r"#\S+", " ", text) # hashtags
|
117 |
+
text = re.sub(r"\s{2,}", " ", text) # collapse repeated whitespace
|
118 |
+
|
119 |
article = nlp(text)
|
120 |
sentences = [i.text for i in list(article.sents)]
|
121 |
|