TCares committed on
Commit
001a158
·
verified ·
1 Parent(s): 14153b0

Update tools/visit_webpage.py

Browse files
Files changed (1) hide show
  1. tools/visit_webpage.py +17 -3
tools/visit_webpage.py CHANGED
@@ -3,6 +3,7 @@ from smolagents.tools import Tool
3
  import requests
4
  import markdownify
5
  import smolagents
 
6
 
7
  class VisitWebpageTool(Tool):
8
  name = "visit_webpage"
@@ -22,8 +23,21 @@ class VisitWebpageTool(Tool):
22
  "You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests`."
23
  ) from e
24
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  # Send a GET request to the URL with a 20-second timeout
26
- response = requests.get(url, timeout=20)
27
  response.raise_for_status() # Raise an exception for bad status codes
28
 
29
  # Convert the HTML content to Markdown
@@ -32,7 +46,7 @@ class VisitWebpageTool(Tool):
32
  # Remove multiple line breaks
33
  markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
34
 
35
- return truncate_content(markdown_content, 10000)
36
 
37
  except requests.exceptions.Timeout:
38
  return "The request timed out. Please try again later or check the URL."
@@ -42,4 +56,4 @@ class VisitWebpageTool(Tool):
42
  return f"An unexpected error occurred: {str(e)}"
43
 
44
  def __init__(self, *args, **kwargs):
45
- self.is_initialized = False
 
3
  import requests
4
  import markdownify
5
  import smolagents
6
+ import re
7
 
8
  class VisitWebpageTool(Tool):
9
  name = "visit_webpage"
 
23
  "You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests`."
24
  ) from e
25
  try:
26
+ headers = {
27
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Safari/605.1.15",
28
+ "referer": "https://duckduckgo.com/",
29
+ "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
30
+ 'accept-language': 'en-US,en;q=0.9',
31
+ 'accept-Encoding': 'gzip, deflate, br',
32
+ 'priority': 'u=0, i',
33
+ 'referer': 'https://duckduckgo.com/',
34
+ 'sec-fetch-dest': 'document',
35
+ 'sec-fetch-mode': 'navigate',
36
+ 'sec-fetch-site': 'cross-site',
37
+ }
38
+
39
  # Send a GET request to the URL with a 20-second timeout
40
+ response = requests.get(url, timeout=20, allow_redirects=True, headers=headers)
41
  response.raise_for_status() # Raise an exception for bad status codes
42
 
43
  # Convert the HTML content to Markdown
 
46
  # Remove multiple line breaks
47
  markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
48
 
49
+ return truncate_content(markdown_content, 50000)
50
 
51
  except requests.exceptions.Timeout:
52
  return "The request timed out. Please try again later or check the URL."
 
56
  return f"An unexpected error occurred: {str(e)}"
57
 
58
  def __init__(self, *args, **kwargs):
59
+ self.is_initialized = False