from urllib.parse import quote_plus

from bs4 import BeautifulSoup
from scrapy import Request
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


# start_requests is meant to live on a scrapy.Spider subclass, which provides
# self.logger, self.max_scrapes, and the self.parse callback used below.
def start_requests(self):
    query = input("Enter your search query: ")
    # URL-encode the query so spaces and special characters survive.
    google_search_url = f"https://www.google.com/search?q={quote_plus(query)}"

    # Set up Selenium
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run in headless mode
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        driver.get(google_search_url)
        # Grab the rendered HTML, then always release the browser.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        driver.quit()

    # Google wraps organic result links as "/url?q=<target>&...": extract the
    # target and drop the tracking parameters that follow the first "&".
    urls = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith('/url?q='):
            url = href.split('/url?q=')[1].split('&')[0]
            # Skip non-http targets such as internal Google pages.
            if not url.startswith('http'):
                continue
            urls.append(url)
            # Stop once the configured number of result pages is collected.
            if len(urls) == self.max_scrapes:
                break

    if not urls:
        self.logger.error("No URLs extracted from Google search results.")
        return

    self.logger.info(f"Extracted URLs: {urls}")
    for url in urls:
        yield Request(url, callback=self.parse)
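

# self.parse is referenced above but not defined in this file. A minimal,
# illustrative sketch of such a callback (the "url"/"title" item fields are
# assumptions, not part of the original code) could look like this:
def parse(self, response):
    # Yield one item per fetched result page: its final URL and <title> text.
    yield {
        "url": response.url,
        "title": response.css("title::text").get(),
    }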