from urllib.parse import quote_plus

from bs4 import BeautifulSoup
from scrapy import Request
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def start_requests(self):
    query = input("Enter your search query: ")
    # URL-encode the query so spaces and special characters survive the request.
    google_search_url = f"https://www.google.com/search?q={quote_plus(query)}"

    # Google renders its results page with JavaScript, so fetch it with a
    # headless Chrome instance instead of a plain HTTP request.
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    driver.get(google_search_url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    driver.quit()

    # Google wraps each result link as /url?q=<target>&...; unwrap the target
    # and keep only absolute http(s) URLs, up to self.max_scrapes of them.
    urls = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith('/url?q='):
            url = href.split('/url?q=')[1].split('&')[0]
            if not url.startswith('http'):
                continue
            urls.append(url)
            if len(urls) == self.max_scrapes:
                break

    if not urls:
        self.logger.error("No URLs extracted from Google search results.")
        return

    self.logger.info(f"Extracted URLs: {urls}")
    for url in urls:
        yield Request(url, callback=self.parse)
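
As written, start_requests only runs inside a Scrapy spider: max_scrapes, logger, and parse all resolve against the spider instance. Below is a minimal sketch of an enclosing class; the class name, the max_scrapes value, and the parse body are illustrative assumptions, not part of the original.

# Hypothetical enclosing spider: only start_requests above comes from the
# original; the names and values below are assumptions for illustration.
import scrapy


class GoogleSearchSpider(scrapy.Spider):
    name = "google_search"  # hypothetical spider name
    max_scrapes = 10        # hypothetical cap on extracted result URLs

    # start_requests as defined above goes here.

    def parse(self, response):
        # Placeholder callback: record each result page's URL and title.
        yield {"url": response.url, "title": response.css("title::text").get()}

Note the division of labor: Selenium is only needed for the JavaScript-rendered search page, while the extracted result URLs are fetched concurrently by Scrapy's own downloader through the yielded Requests.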