Spaces:
Running
Running
Update helper_functions_api.py
Browse files
- helper_functions_api.py +5 -4
helper_functions_api.py
CHANGED
@@ -149,10 +149,11 @@ class Scraper:
|
|
149 |
|
150 |
def extract_main_content(html):
|
151 |
if html:
|
|
|
152 |
soup = BeautifulSoup(html, 'lxml')
|
153 |
-
|
154 |
-
|
155 |
-
return
|
156 |
return ""
|
157 |
|
158 |
def process_content(url, query):
|
@@ -161,7 +162,7 @@ def process_content(url, query):
|
|
161 |
if html_content:
|
162 |
content = extract_main_content(html_content)
|
163 |
if content:
|
164 |
-
rephrased_content = rephrase_content(remove_stopwords(content)
|
165 |
return rephrased_content, url
|
166 |
return "", url
|
167 |
|
|
|
149 |
|
150 |
def extract_main_content(html):
    """Extract the heading, paragraph and table text from an HTML document.

    The markup is parsed with BeautifulSoup's 'lxml' parser and the text of
    every h1-h6, p and table element is collected in document order, each
    followed by a newline.

    Args:
        html: Raw HTML markup. Any falsy value (e.g. None or "") yields "".

    Returns:
        The concatenated element texts, or an empty string when there is no
        markup to parse.
    """
    if not html:
        return ""

    content_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'table']
    soup = BeautifulSoup(html, 'lxml')

    parts = []
    for element in soup.find_all(content_tags):
        # An element nested inside another matched element (for example a
        # <p> inside a table cell) is already included in its ancestor's
        # text; emitting it again would duplicate content.
        if element.find_parent(content_tags) is not None:
            continue
        parts.append(element.get_text(separator=" ", strip=True) + "\n")

    # Join once instead of growing a string with += inside the loop.
    return "".join(parts)
|
158 |
|
159 |
def process_content(url, query):
|
|
|
162 |
if html_content:
|
163 |
content = extract_main_content(html_content)
|
164 |
if content:
|
165 |
+
rephrased_content = rephrase_content(limit_tokens(remove_stopwords(content)), query)
|
166 |
return rephrased_content, url
|
167 |
return "", url
|
168 |
|