pvanand committed on
Commit
5e64525
1 Parent(s): 9631168

Update helper_functions_api.py

Browse files
Files changed (1) hide show
  1. helper_functions_api.py +5 -4
helper_functions_api.py CHANGED
@@ -149,10 +149,11 @@ class Scraper:
149
 
150
  def extract_main_content(html):
151
  if html:
 
152
  soup = BeautifulSoup(html, 'lxml')
153
- paragraphs = soup.find_all('p')
154
- text = ' '.join(p.get_text() for p in paragraphs)
155
- return text
156
  return ""
157
 
158
  def process_content(url, query):
@@ -161,7 +162,7 @@ def process_content(url, query):
161
  if html_content:
162
  content = extract_main_content(html_content)
163
  if content:
164
- rephrased_content = rephrase_content(remove_stopwords(content)[:4096*4], query)
165
  return rephrased_content, url
166
  return "", url
167
 
 
149
 
150
  def extract_main_content(html):
151
  if html:
152
+ plain_text = ""
153
  soup = BeautifulSoup(html, 'lxml')
154
+ for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'table']):
155
+ plain_text += element.get_text(separator=" ", strip=True) + "\n"
156
+ return plain_text
157
  return ""
158
 
159
  def process_content(url, query):
 
162
  if html_content:
163
  content = extract_main_content(html_content)
164
  if content:
165
+ rephrased_content = rephrase_content(limit_tokens(remove_stopwords(content)), query)
166
  return rephrased_content, url
167
  return "", url
168