Spaces:

tensora
/

webcrawler

Running

App Files Files Community

Add1E committed on Jan 12

Commit

00b85ba

verified ·

1 Parent(s): 1e07a06

Update trend_crawl.py

Browse files

Files changed (1) hide show

trend_crawl.py +110 -110

trend_crawl.py CHANGED Viewed

@@ -1,110 +1,110 @@
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.chrome.service import Service
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from bs4 import BeautifulSoup
-import os
-import time
-# Configure Chrome options
-chrome_options = Options()
-chrome_options.add_argument("--headless")  # Run in headless mode
-chrome_options.add_argument("--disable-gpu")
-complete_starttime = time.time()
-# URL of the Google Trends page
-def setup_driver():
-    """Set up the Selenium WebDriver."""
-    script_dir = os.path.dirname(os.path.abspath(__file__))
-    driver_path = os.path.join(script_dir, 'chromedriver.exe')
-    chrome_options = Options()
-    chrome_options.add_argument("--headless")  # Run in headless mode
-    chrome_options.add_argument("--disable-gpu")
-    driver = webdriver.Chrome(service=Service(driver_path), options=chrome_options)
-    return driver
-def process_selenium_row(index, selenium_rows, driver):
-    """Extract dynamic data using Selenium by clicking on the row."""
-    max_retries = 3
-    for attempt in range(max_retries):
-        try:
-            row = selenium_rows[index]
-            row.click()
-            # Wait for elements with class="xZCHj" to load
-            WebDriverWait(driver, 10).until(
-                EC.presence_of_all_elements_located((By.CLASS_NAME, "xZCHj"))
-            )
-            links = driver.find_elements(By.CLASS_NAME, "xZCHj")
-            dynamic_data = {
-                "article": [
-                    {
-                        "href": link.get_attribute("href"),
-                        "title": link.text
-                    }
-                    for link in links
-                ]
-            }
-            if dynamic_data["article"]:
-                return dynamic_data
-        except Exception as e:
-            print(f"Error processing row {index} (Attempt {attempt + 1}): {e}")
-            selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]')
-    print(f"Failed to process row {index} after {max_retries} attempts.")
-    return {"article": []}
-def scrape_google_trends(driver, url):
-    """Scrape Google Trends data and save to JSON."""
-    all_data = []
-    try:
-        driver.get(url)
-        WebDriverWait(driver, 20).until(
-            EC.presence_of_element_located((By.CSS_SELECTOR, '[jsname="oKdM2c"]'))
-        )
-        soup = BeautifulSoup(driver.page_source, "html.parser")
-        tables = soup.select('[jsname="cC57zf"]')
-        for table in tables:
-            rows_bs = table.find_all("tr")
-            selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]')
-            for index, row_bs in enumerate(rows_bs):
-                static_data = [
-                    [div.get_text(strip=True) for div in cell.find_all("div")]
-                    for cell in row_bs.find_all("td")[1:4]
-                ]
-                print(static_data)
-                dynamic_data = process_selenium_row(index, selenium_rows, driver)
-                combined_row = {
-                    "static_data": static_data,
-                    "dynamic_data": dynamic_data
-                }
-                all_data.append(combined_row)
-        return all_data
-    except Exception as e:
-        print(f"An error occurred: {e}")
-    finally:
-        driver.quit()
-def crawl_url(url):
-    """Main function to be called from another script."""
-    driver = setup_driver()
-    return scrape_google_trends(driver, url)
-if __name__ == "__main__":
-    #crawl_url(url="https://trends.google.com/trends/trendingsearches/daily?geo=AT&category=2")
-    driver = setup_driver()

+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from bs4 import BeautifulSoup
+import os
+import time
+# Module-level headless Chrome options.
+# NOTE(review): setup_driver() builds its own Options object with the same
+# flags, so the visible code never reads this one - presumably kept for
+# importers; confirm before removing.
+chrome_options = Options()
+chrome_options.add_argument("--headless")  # Run in headless mode
+chrome_options.add_argument("--disable-gpu")
+# Wall-clock timestamp taken at import time; not read anywhere in the visible code.
+complete_starttime = time.time()
+# NOTE(review): no URL is defined at module level; the page URL is passed in
+# by the caller of crawl_url() / scrape_google_trends().
+def setup_driver():
+    """Create a headless Chrome WebDriver.
+
+    A ``chromedriver`` binary located in the same directory as this script is
+    used when it exists. When it does not, the Service is built without an
+    explicit path so Selenium resolves the driver itself (PATH lookup or
+    Selenium Manager, depending on the installed Selenium version) instead of
+    failing on a hard-coded location.
+
+    Returns:
+        A started ``webdriver.Chrome`` instance. This function does not quit
+        it; whoever receives the driver must call ``quit()``.
+    """
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    driver_path = os.path.join(script_dir, 'chromedriver')
+    # Build fresh options per driver rather than sharing the module-level object.
+    chrome_options = Options()
+    chrome_options.add_argument("--headless")  # Run in headless mode
+    chrome_options.add_argument("--disable-gpu")
+    # Pin the executable only when the bundled binary is actually present.
+    if os.path.isfile(driver_path):
+        service = Service(driver_path)
+    else:
+        service = Service()
+    return webdriver.Chrome(service=service, options=chrome_options)
+def process_selenium_row(index, selenium_rows, driver):
+    """Click one result row and collect the article links that appear.
+
+    Args:
+        index: Position of the row to click inside ``selenium_rows``.
+        selenium_rows: Selenium elements for the clickable rows.
+        driver: The WebDriver that owns those elements.
+
+    Returns:
+        ``{"article": [{"href": ..., "title": ...}, ...]}`` built from every
+        element with class ``xZCHj``, or ``{"article": []}`` when all
+        attempts fail or yield no links.
+    """
+    max_retries = 3
+    attempt_no = 1
+    while attempt_no <= max_retries:
+        try:
+            selenium_rows[index].click()
+            # Block (up to 10 s) until at least one link element is in the DOM.
+            link_locator = (By.CLASS_NAME, "xZCHj")
+            WebDriverWait(driver, 10).until(
+                EC.presence_of_all_elements_located(link_locator)
+            )
+            articles = []
+            for anchor in driver.find_elements(By.CLASS_NAME, "xZCHj"):
+                articles.append(
+                    {
+                        "href": anchor.get_attribute("href"),
+                        "title": anchor.text
+                    }
+                )
+            if articles:
+                return {"article": articles}
+        except Exception as e:
+            print(f"Error processing row {index} (Attempt {attempt_no}): {e}")
+            # Re-query the rows before the next attempt; the refreshed list
+            # is local to this call and is not handed back to the caller.
+            selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]')
+        attempt_no += 1
+    print(f"Failed to process row {index} after {max_retries} attempts.")
+    return {"article": []}
+def scrape_google_trends(driver, url):
+    """Load ``url`` and collect static and click-through data for each row.
+
+    Static cell text is parsed from a BeautifulSoup snapshot of the page;
+    article links come from clicking the matching Selenium row. Nothing is
+    written to disk here.
+
+    Args:
+        driver: A WebDriver instance. It is always quit before this function
+            returns, whether or not scraping succeeded.
+        url: Address of the page to scrape.
+
+    Returns:
+        A list of ``{"static_data": ..., "dynamic_data": ...}`` dicts, or
+        ``None`` when an exception was caught (the error is printed).
+    """
+    all_data = []
+    try:
+        driver.get(url)
+        # Wait (up to 20 s) for the first clickable row to exist.
+        first_row_present = EC.presence_of_element_located(
+            (By.CSS_SELECTOR, '[jsname="oKdM2c"]')
+        )
+        WebDriverWait(driver, 20).until(first_row_present)
+        page = BeautifulSoup(driver.page_source, "html.parser")
+        for table in page.select('[jsname="cC57zf"]'):
+            rows_bs = table.find_all("tr")
+            selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]')
+            # NOTE(review): the index counts <tr> elements of this table but is
+            # applied to the page-wide Selenium row list - presumably the two
+            # line up one-to-one; confirm against the live page.
+            for index, row_bs in enumerate(rows_bs):
+                static_data = []
+                # Only the 2nd to 4th cells of the row are kept.
+                for cell in row_bs.find_all("td")[1:4]:
+                    static_data.append(
+                        [div.get_text(strip=True) for div in cell.find_all("div")]
+                    )
+                print(static_data)
+                all_data.append(
+                    {
+                        "static_data": static_data,
+                        "dynamic_data": process_selenium_row(index, selenium_rows, driver)
+                    }
+                )
+        return all_data
+    except Exception as e:
+        print(f"An error occurred: {e}")
+        return None
+    finally:
+        driver.quit()
+def crawl_url(url):
+    """Entry point for other scripts: start a driver and scrape ``url``.
+
+    Returns whatever ``scrape_google_trends`` returns for the new driver.
+    """
+    return scrape_google_trends(setup_driver(), url)
+if __name__ == "__main__":
+    # Example invocation, currently disabled:
+    #crawl_url(url="https://trends.google.com/trends/trendingsearches/daily?geo=AT&category=2")
+    # NOTE(review): within the visible lines this driver is created but never
+    # used or quit; if the block really ends here, a Chrome process is left
+    # running - confirm against the full file.
+    driver = setup_driver()