Spaces:
Running
Running
Update helper_functions_api.py
Browse files- helper_functions_api.py +11 -17
helper_functions_api.py
CHANGED
@@ -4,7 +4,7 @@ from mistune.plugins.table import table
|
|
4 |
from jinja2 import Template
|
5 |
import re
|
6 |
import os
|
7 |
-
import
|
8 |
|
9 |
def md_to_html(md_text):
|
10 |
renderer = mistune.HTMLRenderer()
|
@@ -183,20 +183,15 @@ def rephrase_content(data_format, content, query):
|
|
183 |
max_tokens=500,
|
184 |
)
|
185 |
|
186 |
-
class Scraper:
|
187 |
-
def __init__(self, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"):
|
188 |
-
self.session = requests.Session()
|
189 |
-
self.session.headers.update({"User-Agent": user_agent})
|
190 |
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
return None
|
200 |
|
201 |
def extract_main_content(html):
|
202 |
extracted = trafilatura.extract(
|
@@ -215,8 +210,7 @@ def extract_main_content(html):
|
|
215 |
return ""
|
216 |
|
217 |
def process_content(data_format, url, query):
|
218 |
-
|
219 |
-
html_content = scraper.fetch_content(url)
|
220 |
if html_content:
|
221 |
content = extract_main_content(html_content)
|
222 |
if content:
|
@@ -268,4 +262,4 @@ def search_brave(query, num_results=5):
|
|
268 |
# cleaned_search_query = re.sub(r'[^\w\s]', '', search_query).strip() #re.sub(r'[^a-zA-Z0-9*]+', '', search_query)
|
269 |
# brave = Brave(BRAVE_API_KEY)
|
270 |
# search_results = brave.search(q=cleaned_search_query, count=num_results)
|
271 |
-
# return [url.__str__() for url in search_results.urls],cleaned_search_query
|
|
|
4 |
from jinja2 import Template
|
5 |
import re
|
6 |
import os
|
7 |
+
import hrequests
|
8 |
|
9 |
def md_to_html(md_text):
|
10 |
renderer = mistune.HTMLRenderer()
|
|
|
183 |
max_tokens=500,
|
184 |
)
|
185 |
|
|
|
|
|
|
|
|
|
186 |
|
187 |
+
def fetch_content(url):
    """Download the page at *url* and return its body as text.

    The request is issued with ``hrequests.get``. The body text is
    returned only when the response status code is 200. Any other status
    yields ``None``. If anything in the request/response handling raises,
    the error is printed and ``None`` is returned, so callers never see
    an exception from this function.
    """
    try:
        page = hrequests.get(url)
        # Only a 200 response is treated as usable content.
        return page.text if page.status_code == 200 else None
    except Exception as e:
        # Best-effort fetch: report the failure and fall back to None.
        print(f"Error fetching page content for {url}: {e}")
        return None
|
|
|
195 |
|
196 |
def extract_main_content(html):
|
197 |
extracted = trafilatura.extract(
|
|
|
210 |
return ""
|
211 |
|
212 |
def process_content(data_format, url, query):
|
213 |
+
html_content = fetch_content(url)
|
|
|
214 |
if html_content:
|
215 |
content = extract_main_content(html_content)
|
216 |
if content:
|
|
|
262 |
# cleaned_search_query = re.sub(r'[^\w\s]', '', search_query).strip() #re.sub(r'[^a-zA-Z0-9*]+', '', search_query)
|
263 |
# brave = Brave(BRAVE_API_KEY)
|
264 |
# search_results = brave.search(q=cleaned_search_query, count=num_results)
|
265 |
+
# return [url.__str__() for url in search_results.urls],cleaned_search_query
|