pvanand committed on
Commit
8b98f16
·
verified ·
1 Parent(s): a2e6e86

Update helper_functions_api.py

Browse files
Files changed (1) hide show
  1. helper_functions_api.py +16 -8
helper_functions_api.py CHANGED
@@ -67,6 +67,7 @@ from half_json.core import JSONFixer
67
  from openai import OpenAI
68
  from together import Together
69
  from urllib.parse import urlparse
 
70
 
71
  llm_default_small = "meta-llama/Llama-3-8b-chat-hf"
72
  llm_default_medium = "meta-llama/Llama-3-70b-chat-hf"
@@ -197,13 +198,20 @@ class Scraper:
197
  return None
198
 
199
  def extract_main_content(html):
200
- if html:
201
- plain_text = ""
202
- soup = BeautifulSoup(html, 'lxml')
203
- for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'table']):
204
- plain_text += element.get_text(separator=" ", strip=True) + "\n"
205
- return plain_text
206
- return ""
 
 
 
 
 
 
 
207
 
208
  def process_content(data_format, url, query):
209
  scraper = Scraper()
@@ -213,7 +221,7 @@ def process_content(data_format, url, query):
213
  if content:
214
  rephrased_content = rephrase_content(
215
  data_format=data_format,
216
- content=limit_tokens(remove_stopwords(content), token_limit=1000),
217
  query=query,
218
  )
219
  return rephrased_content, url
 
67
  from openai import OpenAI
68
  from together import Together
69
  from urllib.parse import urlparse
70
+ import trafilatura
71
 
72
  llm_default_small = "meta-llama/Llama-3-8b-chat-hf"
73
  llm_default_medium = "meta-llama/Llama-3-70b-chat-hf"
 
198
  return None
199
 
200
  def extract_main_content(html):
201
+ extracted = trafilatura.extract(
202
+ html,
203
+ output_format="markdown",
204
+ target_language="en",
205
+ include_tables=True,
206
+ include_images=False,
207
+ include_links=False,
208
+ deduplicate=True,
209
+ )
210
+
211
+ if extracted:
212
+ return trafilatura.utils.sanitize(extracted)
213
+ else:
214
+ return ""
215
 
216
  def process_content(data_format, url, query):
217
  scraper = Scraper()
 
221
  if content:
222
  rephrased_content = rephrase_content(
223
  data_format=data_format,
224
+ content=limit_tokens(remove_stopwords(content), token_limit=4000),
225
  query=query,
226
  )
227
  return rephrased_content, url