from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager


def setup_driver():
    """Create a headless Chrome driver, letting webdriver_manager install Chromedriver."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    # Let webdriver_manager handle downloading and setting up Chromedriver
    service = ChromeService(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver


def process_selenium_row(index, selenium_rows, driver):
    """Extract dynamic data using Selenium by clicking on the row."""
    max_retries = 3
    for attempt in range(max_retries):
        try:
            row = selenium_rows[index]
            row.click()
            # Wait for the detail links (class="xZCHj") to load after the click
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "xZCHj"))
            )
            links = driver.find_elements(By.CLASS_NAME, "xZCHj")
            dynamic_data = {
                "article": [
                    {"href": link.get_attribute("href"), "title": link.text}
                    for link in links
                ]
            }
            if dynamic_data["article"]:
                return dynamic_data
        except Exception as e:
            print(f"Error processing row {index} (attempt {attempt + 1}): {e}")
            # The clicked elements may have gone stale; re-fetch the live rows before retrying
            selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]')
    print(f"Failed to process row {index} after {max_retries} attempts.")
    return {"article": []}


def scrape_google_trends(driver, url):
    """Scrape Google Trends rows, combining static cell text with per-row dynamic links."""
    all_data = []
    try:
        driver.get(url)
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '[jsname="oKdM2c"]'))
        )
        # Parse the rendered page with BeautifulSoup for the static cell text
        soup = BeautifulSoup(driver.page_source, "html.parser")
        tables = soup.select('[jsname="cC57zf"]')
        for table in tables:
            rows_bs = table.find_all("tr")
            # Matching live elements for clicking, in the same document order as rows_bs
            selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]')
            for index, row_bs in enumerate(rows_bs):
                static_data = [
                    [div.get_text(strip=True) for div in cell.find_all("div")]
                    for cell in row_bs.find_all("td")[1:4]
                ]
                print(static_data)
                dynamic_data = process_selenium_row(index, selenium_rows, driver)
                all_data.append({
                    "static_data": static_data,
                    "dynamic_data": dynamic_data,
                })
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        driver.quit()
    # Return whatever was collected, even if an error cut the run short
    return all_data


def crawl_url(url):
    """Main entry point, intended to be called from another script."""
    driver = setup_driver()
    return scrape_google_trends(driver, url)


if __name__ == "__main__":
    crawl_url(url="https://trends.google.com/trends/trendingsearches/daily?geo=AT&category=2")
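

# A minimal sketch of persisting the scraped rows to JSON, since crawl_url only
# returns the data in memory. Assumptions: this file is importable as a module
# (e.g. saved as trends_scraper.py) and "trends.json" is a hypothetical output path.
#
#   import json
#   from trends_scraper import crawl_url
#
#   data = crawl_url("https://trends.google.com/trends/trendingsearches/daily?geo=AT&category=2")
#   with open("trends.json", "w", encoding="utf-8") as f:
#       json.dump(data, f, ensure_ascii=False, indent=2)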