pvanand committed on
Commit
8a07fcd
·
verified ·
1 Parent(s): 98f6f16

Update helper_functions_api.py

Browse files
Files changed (1) hide show
  1. helper_functions_api.py +11 -17
helper_functions_api.py CHANGED
@@ -4,7 +4,7 @@ from mistune.plugins.table import table
4
  from jinja2 import Template
5
  import re
6
  import os
7
- import requests
8
 
9
  def md_to_html(md_text):
10
  renderer = mistune.HTMLRenderer()
@@ -183,20 +183,15 @@ def rephrase_content(data_format, content, query):
183
  max_tokens=500,
184
  )
185
 
186
class Scraper:
    """Small HTTP fetcher that reuses a single ``requests`` session.

    The session sends the configured ``User-Agent`` header with every request.
    """

    def __init__(self, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"):
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})

    @retry(tries=3, delay=1)
    def _get(self, url):
        """Issue the GET request and let request errors propagate.

        The exception must escape this method for ``@retry`` to re-attempt
        the call (assumes ``retry`` re-invokes the wrapped callable when it
        raises -- confirm against the decorator actually imported by the file).
        """
        return self.session.get(url, timeout=2)

    def fetch_content(self, url):
        """Return the body text of *url*, or ``None`` when it cannot be fetched.

        Args:
            url: Address of the page to download.

        Returns:
            The response text when the server answers with HTTP 200;
            ``None`` for any other status code or when the request still
            fails after the retried attempts.
        """
        # Previously the try/except lived inside the retried method, so the
        # decorator never saw an exception and no retry could take place.
        try:
            response = self._get(url)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching page content for {url}: {e}")
            return None
        if response.status_code == 200:
            return response.text
        return None
200
 
201
  def extract_main_content(html):
202
  extracted = trafilatura.extract(
@@ -215,8 +210,7 @@ def extract_main_content(html):
215
  return ""
216
 
217
  def process_content(data_format, url, query):
218
- scraper = Scraper()
219
- html_content = scraper.fetch_content(url)
220
  if html_content:
221
  content = extract_main_content(html_content)
222
  if content:
@@ -268,4 +262,4 @@ def search_brave(query, num_results=5):
268
  # cleaned_search_query = re.sub(r'[^\w\s]', '', search_query).strip() #re.sub(r'[^a-zA-Z0-9*]+', '', search_query)
269
  # brave = Brave(BRAVE_API_KEY)
270
  # search_results = brave.search(q=cleaned_search_query, count=num_results)
271
- # return [url.__str__() for url in search_results.urls],cleaned_search_query
 
4
  from jinja2 import Template
5
  import re
6
  import os
7
+ import hrequests
8
 
9
  def md_to_html(md_text):
10
  renderer = mistune.HTMLRenderer()
 
183
  max_tokens=500,
184
  )
185
 
 
 
 
 
186
 
187
def fetch_content(url):
    """Download *url* with ``hrequests`` and return the response body text.

    Args:
        url: Address of the page to download.

    Returns:
        The response text when the status code is 200. ``None`` for any
        other status code, or when fetching raises; in the latter case the
        error is printed to stdout instead of being propagated.
    """
    try:
        page = hrequests.get(url)
        body = page.text if page.status_code == 200 else None
    except Exception as err:
        # Best-effort fetch: report the failure and fall back to None.
        print(f"Error fetching page content for {url}: {err}")
        body = None
    return body
 
195
 
196
  def extract_main_content(html):
197
  extracted = trafilatura.extract(
 
210
  return ""
211
 
212
  def process_content(data_format, url, query):
213
+ html_content = fetch_content(url)
 
214
  if html_content:
215
  content = extract_main_content(html_content)
216
  if content:
 
262
  # cleaned_search_query = re.sub(r'[^\w\s]', '', search_query).strip() #re.sub(r'[^a-zA-Z0-9*]+', '', search_query)
263
  # brave = Brave(BRAVE_API_KEY)
264
  # search_results = brave.search(q=cleaned_search_query, count=num_results)
265
+ # return [url.__str__() for url in search_results.urls],cleaned_search_query