pvanand committed on
Commit
659d7dc
·
verified ·
1 Parent(s): ee4ce2a

Update helper_functions_api.py

Browse files
Files changed (1) hide show
  1. helper_functions_api.py +17 -12
helper_functions_api.py CHANGED
@@ -4,7 +4,7 @@ from mistune.plugins.table import table
4
  from jinja2 import Template
5
  import re
6
  import os
7
- import hrequests
8
 
9
  def md_to_html(md_text):
10
  renderer = mistune.HTMLRenderer()
@@ -183,16 +183,20 @@ def rephrase_content(data_format, content, query):
183
  max_tokens=500,
184
  )
185
 
 
 
 
 
186
 
187
- @retry(tries=3, delay=1)
188
- def fetch_content(url):
189
- try:
190
- response = hrequests.get(url)
191
- if response.status_code == 200:
192
- return response.text
193
- except Exception as e:
194
- print(f"Error fetching page content for {url}: {e}")
195
- return None
196
 
197
  def extract_main_content(html):
198
  extracted = trafilatura.extract(
@@ -211,7 +215,8 @@ def extract_main_content(html):
211
  return ""
212
 
213
  def process_content(data_format, url, query):
214
- html_content = fetch_content(url)
 
215
  if html_content:
216
  content = extract_main_content(html_content)
217
  if content:
@@ -263,4 +268,4 @@ def search_brave(query, num_results=5):
263
  # cleaned_search_query = re.sub(r'[^\w\s]', '', search_query).strip() #re.sub(r'[^a-zA-Z0-9*]+', '', search_query)
264
  # brave = Brave(BRAVE_API_KEY)
265
  # search_results = brave.search(q=cleaned_search_query, count=num_results)
266
- # return [url.__str__() for url in search_results.urls],cleaned_search_query
 
4
  from jinja2 import Template
5
  import re
6
  import os
7
+ import requests
8
 
9
  def md_to_html(md_text):
10
  renderer = mistune.HTMLRenderer()
 
183
  max_tokens=500,
184
  )
185
 
186
class Scraper:
    """Fetch raw page text over HTTP through one shared ``requests.Session``.

    Every request made through the session carries the configured
    ``User-Agent`` header. Instances can be used as context managers
    (``with Scraper() as s: ...``) so the session is closed when scraping
    is finished; calling ``close()`` explicitly works as well.
    """

    def __init__(
        self,
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        timeout=2,
    ):
        """Create the session and install the ``User-Agent`` header.

        Args:
            user_agent: Value sent as the ``User-Agent`` header.
            timeout: Seconds passed as ``timeout`` to each ``Session.get``.
        """
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying session."""
        self.session.close()

    @retry(tries=3, delay=1)
    def _get(self, url):
        """Issue the GET request and return the response object.

        Request errors are deliberately NOT caught here: the decorator can
        only retry when the exception reaches it. Catching inside the
        decorated function (as the previous version did) disables retrying.
        """
        return self.session.get(url, timeout=self.timeout)

    def fetch_content(self, url):
        """Return the body text of ``url``, or ``None`` on failure.

        Args:
            url: Address of the page to download.

        Returns:
            The response text when the status code is 200; ``None`` for any
            other status or when the request keeps failing.
        """
        try:
            # NOTE(review): assumes `retry` re-raises the last exception once
            # its tries are exhausted (as the `retry` package does) -- confirm
            # against the import at the top of the file.
            response = self._get(url)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching page content for {url}: {e}")
            return None
        # Guard against a retry helper that returns None instead of raising.
        if response is not None and response.status_code == 200:
            return response.text
        return None
200
 
201
  def extract_main_content(html):
202
  extracted = trafilatura.extract(
 
215
  return ""
216
 
217
  def process_content(data_format, url, query):
218
+ scraper = Scraper()
219
+ html_content = scraper.fetch_content(url)
220
  if html_content:
221
  content = extract_main_content(html_content)
222
  if content:
 
268
  # cleaned_search_query = re.sub(r'[^\w\s]', '', search_query).strip() #re.sub(r'[^a-zA-Z0-9*]+', '', search_query)
269
  # brave = Brave(BRAVE_API_KEY)
270
  # search_results = brave.search(q=cleaned_search_query, count=num_results)
271
+ # return [url.__str__() for url in search_results.urls],cleaned_search_query