Spaces:
Sleeping
Sleeping
Update tools/visit_webpage.py
Browse files
- tools/visit_webpage.py (+17 -3)
tools/visit_webpage.py
CHANGED
@@ -3,6 +3,7 @@ from smolagents.tools import Tool
|
|
3 |
import requests
|
4 |
import markdownify
|
5 |
import smolagents
|
|
|
6 |
|
7 |
class VisitWebpageTool(Tool):
|
8 |
name = "visit_webpage"
|
@@ -22,8 +23,21 @@ class VisitWebpageTool(Tool):
|
|
22 |
"You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests`."
|
23 |
) from e
|
24 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
# Send a GET request to the URL with a 20-second timeout
|
26 |
-
response = requests.get(url, timeout=20)
|
27 |
response.raise_for_status() # Raise an exception for bad status codes
|
28 |
|
29 |
# Convert the HTML content to Markdown
|
@@ -32,7 +46,7 @@ class VisitWebpageTool(Tool):
|
|
32 |
# Remove multiple line breaks
|
33 |
markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
|
34 |
|
35 |
-
return truncate_content(markdown_content,
|
36 |
|
37 |
except requests.exceptions.Timeout:
|
38 |
return "The request timed out. Please try again later or check the URL."
|
@@ -42,4 +56,4 @@ class VisitWebpageTool(Tool):
|
|
42 |
return f"An unexpected error occurred: {str(e)}"
|
43 |
|
44 |
def __init__(self, *args, **kwargs):
|
45 |
-
self.is_initialized = False
|
|
|
3 |
import requests
|
4 |
import markdownify
|
5 |
import smolagents
|
6 |
+
import re
|
7 |
|
8 |
class VisitWebpageTool(Tool):
|
9 |
name = "visit_webpage"
|
|
|
23 |
"You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests`."
|
24 |
) from e
|
25 |
try:
|
26 |
+
headers = {
|
27 |
+
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Safari/605.1.15",
|
28 |
+
"referer": "https://duckduckgo.com/",
|
29 |
+
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
30 |
+
'accept-language': 'en-US,en;q=0.9',
|
31 |
+
'accept-Encoding': 'gzip, deflate, br',
|
32 |
+
'priority': 'u=0, i',
|
33 |
+
'referer': 'https://duckduckgo.com/',
|
34 |
+
'sec-fetch-dest': 'document',
|
35 |
+
'sec-fetch-mode': 'navigate',
|
36 |
+
'sec-fetch-site': 'cross-site',
|
37 |
+
}
|
38 |
+
|
39 |
# Send a GET request to the URL with a 20-second timeout
|
40 |
+
response = requests.get(url, timeout=20, allow_redirects=True, headers=headers)
|
41 |
response.raise_for_status() # Raise an exception for bad status codes
|
42 |
|
43 |
# Convert the HTML content to Markdown
|
|
|
46 |
# Remove multiple line breaks
|
47 |
markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
|
48 |
|
49 |
+
return truncate_content(markdown_content, 50000)
|
50 |
|
51 |
except requests.exceptions.Timeout:
|
52 |
return "The request timed out. Please try again later or check the URL."
|
|
|
56 |
return f"An unexpected error occurred: {str(e)}"
|
57 |
|
58 |
def __init__(self, *args, **kwargs):
|
59 |
+
self.is_initialized = False
|