from urllib.parse import quote_plus

from bs4 import BeautifulSoup
from scrapy import Request
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
def start_requests(self):
    # Ask the user for a search term and build the Google search URL,
    # URL-encoding the query so spaces and special characters are handled.
    query = input("Enter your search query: ")
    google_search_url = f"https://www.google.com/search?q={quote_plus(query)}"

    # Set up a headless Selenium Chrome session to render the results page.
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run in headless mode
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    driver.get(google_search_url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    driver.quit()

    # Collect result URLs from Google's redirect links (/url?q=<target>&...),
    # stopping once max_scrapes URLs have been gathered.
    urls = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith('/url?q='):
            url = href.split('/url?q=')[1].split('&')[0]
            if not url.startswith('http'):
                continue
            urls.append(url)
            if len(urls) == self.max_scrapes:
                break

    if not urls:
        self.logger.error("No URLs extracted from Google search results.")
        return

    self.logger.info(f"Extracted URLs: {urls}")
    for url in urls:
        yield Request(url, callback=self.parse)
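
# ---------------------------------------------------------------------------
# Context sketch (assumption, not part of the original upload): start_requests
# uses self.logger, self.max_scrapes, self.parse and yields Request objects,
# so it presumably lives inside a Scrapy spider class roughly like the
# commented example below. The class name, spider name, and max_scrapes value
# are illustrative only.
#
#   import scrapy
#
#   class SearchSpider(scrapy.Spider):      # hypothetical name
#       name = "search_spider"              # hypothetical spider name
#       max_scrapes = 10                    # assumed attribute used above
#
#       start_requests = start_requests     # the function defined above
#
#       def parse(self, response):
#           yield {"url": response.url,
#                  "title": response.css("title::text").get()}
# ---------------------------------------------------------------------------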