from urllib.parse import quote_plus

from bs4 import BeautifulSoup
from scrapy import Request
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def start_requests(self):
    query = input("Enter your search query: ")
    # URL-encode the query so spaces and special characters survive the request.
    google_search_url = f"https://www.google.com/search?q={quote_plus(query)}"

    # Google renders its results page with JavaScript, so fetch it with a
    # headless Chrome instance instead of a plain HTTP request.
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    driver.get(google_search_url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    driver.quit()

    # Google wraps each result link as /url?q=<target>&...; unwrap the target
    # and keep only absolute http(s) URLs, up to self.max_scrapes of them.
    urls = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith('/url?q='):
            url = href.split('/url?q=')[1].split('&')[0]
            if not url.startswith('http'):
                continue
            urls.append(url)
            if len(urls) == self.max_scrapes:
                break

    if not urls:
        self.logger.error("No URLs extracted from Google search results.")
        return

    self.logger.info(f"Extracted URLs: {urls}")
    for url in urls:
        yield Request(url, callback=self.parse)
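
As written, start_requests only runs inside a Scrapy spider: max_scrapes, logger, and parse all resolve against the spider instance. Below is a minimal sketch of an enclosing class; the class name, the max_scrapes value, and the parse body are illustrative assumptions, not part of the original.

# Hypothetical enclosing spider: only start_requests above comes from the
# original; the names and values below are assumptions for illustration.
import scrapy


class GoogleSearchSpider(scrapy.Spider):
    name = "google_search"  # hypothetical spider name
    max_scrapes = 10        # hypothetical cap on extracted result URLs

    # start_requests as defined above goes here.

    def parse(self, response):
        # Placeholder callback: record each result page's URL and title.
        yield {"url": response.url, "title": response.css("title::text").get()}

Note the division of labor: Selenium is only needed for the JavaScript-rendered search page, while the extracted result URLs are fetched concurrently by Scrapy's own downloader through the yielded Requests.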