TCares committed on
Commit
eecce0d
·
verified ·
1 Parent(s): b7ab08f

Update tools/visit_webpage.py

Browse files
Files changed (1) hide show
  1. tools/visit_webpage.py +7 -21
tools/visit_webpage.py CHANGED
@@ -1,8 +1,5 @@
1
- from typing import Any, Optional
2
  from smolagents.tools import Tool
3
- import requests
4
- import markdownify
5
- import smolagents
6
  import re
7
 
8
  class VisitWebpageTool(Tool):
@@ -23,25 +20,14 @@ class VisitWebpageTool(Tool):
23
  "You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests`."
24
  ) from e
25
  try:
26
- headers = {
27
- "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Safari/605.1.15",
28
- "referer": "https://duckduckgo.com/",
29
- "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
30
- 'accept-language': 'en-US,en;q=0.9',
31
- 'accept-Encoding': 'gzip, deflate, br',
32
- 'priority': 'u=0, i',
33
- 'referer': 'https://duckduckgo.com/',
34
- 'sec-fetch-dest': 'document',
35
- 'sec-fetch-mode': 'navigate',
36
- 'sec-fetch-site': 'cross-site',
37
- }
38
-
39
- # Send a GET request to the URL with a 20-second timeout
40
- response = requests.get(url, timeout=20, allow_redirects=True, headers=headers)
41
- response.raise_for_status() # Raise an exception for bad status codes
42
 
43
  # Convert the HTML content to Markdown
44
- markdown_content = markdownify(response.text).strip()
45
 
46
  # Remove multiple line breaks
47
  markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
 
 
1
  from smolagents.tools import Tool
2
+ from selenium import webdriver
 
 
3
  import re
4
 
5
  class VisitWebpageTool(Tool):
 
20
  "You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests`."
21
  ) from e
22
  try:
23
+ options = webdriver.ChromeOptions()
24
+ options.add_argument("--headless") # Run in headless mode
25
+
26
+ driver = webdriver.Chrome(options=options)
27
+ driver.get(url)
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  # Convert the HTML content to Markdown
30
+ markdown_content = markdownify(driver.page_source).strip()
31
 
32
  # Remove multiple line breaks
33
  markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)