# Google Trends scraper: Selenium drives the page, BeautifulSoup parses the static HTML.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import os
import time
# Module-level Chrome configuration (headless, GPU disabled) and a
# timestamp recording when this module was loaded.
chrome_options = Options()
for flag in ("--headless", "--disable-gpu"):
    chrome_options.add_argument(flag)
complete_starttime = time.time()
# The Google Trends URL is supplied by the caller of crawl_url() below.
def setup_driver():
    """Create and return a headless Chrome WebDriver.

    Expects ``chromedriver.exe`` to sit in the same directory as this
    script.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    opts = Options()
    opts.add_argument("--headless")  # no visible browser window
    opts.add_argument("--disable-gpu")
    service = Service(os.path.join(here, 'chromedriver.exe'))
    return webdriver.Chrome(service=service, options=opts)
def process_selenium_row(index, selenium_rows, driver):
    """Click the row at *index* and collect the article links that appear.

    Retries up to three times; after a failure the row elements are
    re-fetched from the live DOM (they may have gone stale after a
    click or page update).

    Returns ``{"article": [{"href": ..., "title": ...}, ...]}``; the
    list is empty when every attempt fails.
    """
    MAX_RETRIES = 3
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            selenium_rows[index].click()
            # Wait until the linked articles (class "xZCHj") are present.
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "xZCHj"))
            )
            articles = [
                {"href": link.get_attribute("href"), "title": link.text}
                for link in driver.find_elements(By.CLASS_NAME, "xZCHj")
            ]
            if articles:
                return {"article": articles}
            # No links rendered yet — fall through and retry.
        except Exception as e:
            print(f"Error processing row {index} (Attempt {attempt}): {e}")
            # Re-query the rows: the previous references may be stale.
            selenium_rows = driver.find_elements(
                By.CSS_SELECTOR, '[jsname="oKdM2c"]'
            )
    print(f"Failed to process row {index} after {MAX_RETRIES} attempts.")
    return {"article": []}
def scrape_google_trends(driver, url):
    """Scrape the trending-searches tables at *url*.

    For each table row, combines static text parsed out of the page
    source with BeautifulSoup and the dynamic article links fetched by
    clicking the row via Selenium (``process_selenium_row``).

    Returns a list of ``{"static_data": ..., "dynamic_data": ...}``
    dicts. On error, the rows collected so far are returned instead of
    ``None`` (the previous behavior), so callers can always iterate the
    result. The driver is always quit before returning.
    """
    all_data = []
    try:
        driver.get(url)
        # Wait for at least one trend row to be rendered.
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '[jsname="oKdM2c"]'))
        )
        soup = BeautifulSoup(driver.page_source, "html.parser")
        tables = soup.select('[jsname="cC57zf"]')
        for table in tables:
            rows_bs = table.find_all("tr")
            # Live row elements matching rows_bs by position, used for clicking.
            selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]')
            for index, row_bs in enumerate(rows_bs):
                # Cells 1-3 hold the static trend text (cell 0 is skipped).
                static_data = [
                    [div.get_text(strip=True) for div in cell.find_all("div")]
                    for cell in row_bs.find_all("td")[1:4]
                ]
                print(static_data)
                dynamic_data = process_selenium_row(index, selenium_rows, driver)
                all_data.append({
                    "static_data": static_data,
                    "dynamic_data": dynamic_data,
                })
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        driver.quit()
    return all_data
def crawl_url(url):
    """Entry point for other scripts: scrape *url* with a fresh driver."""
    return scrape_google_trends(setup_driver(), url)
if __name__ == "__main__":
    #crawl_url(url="https://trends.google.com/trends/trendingsearches/daily?geo=AT&category=2")
    driver = setup_driver()