Spaces:
Sleeping
Sleeping
Update helper_functions_api.py
Browse files- helper_functions_api.py +17 -12
helper_functions_api.py
CHANGED
@@ -4,7 +4,7 @@ from mistune.plugins.table import table
|
|
4 |
from jinja2 import Template
|
5 |
import re
|
6 |
import os
|
7 |
-
import
|
8 |
|
9 |
def md_to_html(md_text):
|
10 |
renderer = mistune.HTMLRenderer()
|
@@ -183,16 +183,20 @@ def rephrase_content(data_format, content, query):
|
|
183 |
max_tokens=500,
|
184 |
)
|
185 |
|
|
|
|
|
|
|
|
|
186 |
|
187 |
-
@retry(tries=3, delay=1)
|
188 |
-
def fetch_content(url):
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
|
197 |
def extract_main_content(html):
|
198 |
extracted = trafilatura.extract(
|
@@ -211,7 +215,8 @@ def extract_main_content(html):
|
|
211 |
return ""
|
212 |
|
213 |
def process_content(data_format, url, query):
|
214 |
-
|
|
|
215 |
if html_content:
|
216 |
content = extract_main_content(html_content)
|
217 |
if content:
|
@@ -263,4 +268,4 @@ def search_brave(query, num_results=5):
|
|
263 |
# cleaned_search_query = re.sub(r'[^\w\s]', '', search_query).strip() #re.sub(r'[^a-zA-Z0-9*]+', '', search_query)
|
264 |
# brave = Brave(BRAVE_API_KEY)
|
265 |
# search_results = brave.search(q=cleaned_search_query, count=num_results)
|
266 |
-
# return [url.__str__() for url in search_results.urls],cleaned_search_query
|
|
|
4 |
from jinja2 import Template
|
5 |
import re
|
6 |
import os
|
7 |
+
import requests
|
8 |
|
9 |
def md_to_html(md_text):
|
10 |
renderer = mistune.HTMLRenderer()
|
|
|
183 |
max_tokens=500,
|
184 |
)
|
185 |
|
186 |
+
class Scraper:
    """Fetch raw page text over a single shared ``requests`` session.

    The session is created once per instance and sends the configured
    User-Agent header with every request made through it.
    """

    def __init__(
        self,
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        timeout=2,
    ):
        """Create the session and set its User-Agent header.

        Args:
            user_agent: Value sent as the ``User-Agent`` request header.
            timeout: Value passed as ``timeout`` to every ``Session.get`` call.
                Defaults to 2, the value that was previously hard-coded.
        """
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})
        self.timeout = timeout

    def fetch_content(self, url):
        """Return the response text for ``url``, or ``None`` on failure.

        ``None`` is returned both when the final response status is not 200
        and when the request still raises ``RequestException`` after the
        retry helper gives up. This method itself never raises
        ``RequestException``.

        Args:
            url: Address to GET.

        Returns:
            The response body as text, or ``None``.
        """
        try:
            return self._get_text(url)
        except requests.exceptions.RequestException as e:
            # Reached only after the decorated helper has stopped retrying.
            print(f"Error fetching page content for {url}: {e}")
            return None

    # The decorator sits on the helper that is allowed to raise. Previously it
    # wrapped a method that caught RequestException itself, so the decorator
    # never observed a failure and no retry could happen.
    # NOTE(review): assumes `retry` re-invokes the function when it raises and
    # re-raises the last error once `tries` is exhausted -- confirm against the
    # `retry` import at the top of the file.
    @retry(tries=3, delay=1)
    def _get_text(self, url):
        """GET ``url`` once; return its text on HTTP 200, otherwise ``None``.

        Request errors are deliberately not caught here so that the retry
        decorator can see them.
        """
        response = self.session.get(url, timeout=self.timeout)
        if response.status_code == 200:
            return response.text
        # A non-200 response is treated as "no content", not as an error.
        return None
|
200 |
|
201 |
def extract_main_content(html):
|
202 |
extracted = trafilatura.extract(
|
|
|
215 |
return ""
|
216 |
|
217 |
def process_content(data_format, url, query):
|
218 |
+
scraper = Scraper()
|
219 |
+
html_content = scraper.fetch_content(url)
|
220 |
if html_content:
|
221 |
content = extract_main_content(html_content)
|
222 |
if content:
|
|
|
268 |
# cleaned_search_query = re.sub(r'[^\w\s]', '', search_query).strip() #re.sub(r'[^a-zA-Z0-9*]+', '', search_query)
|
269 |
# brave = Brave(BRAVE_API_KEY)
|
270 |
# search_results = brave.search(q=cleaned_search_query, count=num_results)
|
271 |
+
# return [url.__str__() for url in search_results.urls],cleaned_search_query
|