Add1E committed
Commit 56cfe41 · verified · 1 Parent(s): 1a4568a

Update trend_crawl.py

Files changed (1)
  1. trend_crawl.py +66 -114
trend_crawl.py CHANGED
@@ -1,120 +1,72 @@
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.chrome.service import Service as ChromeService
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from bs4 import BeautifulSoup
-from webdriver_manager.chrome import ChromeDriverManager
-import os
-import time
-
-# Configure Chrome options
-chrome_options = Options()
-chrome_options.add_argument("--headless")  # Run in headless mode
-chrome_options.add_argument("--disable-gpu")
-complete_starttime = time.time()
-
-# URL of the Google Trends page
-
-
-#script_dir = os.path.dirname(os.path.abspath(__file__))
-#driver_path = os.path.join(script_dir, 'chromedriver')
-
-def setup_driver():
-    options = webdriver.ChromeOptions()
-    options.add_argument('--headless')
-    options.add_argument('--no-sandbox')
-    options.add_argument('--disable-dev-shm-usage')
-    wd = webdriver.Chrome(options=options)
-    return wd
-
-def process_selenium_row(index, selenium_rows, driver):
-    """Extract dynamic data using Selenium by clicking on the row."""
-    max_retries = 3
-    for attempt in range(max_retries):
-        try:
-            articles = {}
-            # Refresh the rows before processing
-            #selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]')
-            row = selenium_rows[index]
-            driver.execute_script("arguments[0].click();", row)  # Use JavaScript click for stability
-
-            # Wait for the articles to load dynamically
-            WebDriverWait(driver, 10).until(
-                EC.presence_of_all_elements_located((By.CLASS_NAME, "xZCHj"))
-            )
-
-            # Fetch only the newly loaded articles
-            articles = driver.find_elements(By.CLASS_NAME, "xZCHj")
-            print(articles)
-            # Extract data from the current row only
-            dynamic_data = {
-                "article": [
-                    {
-                        "href": article.get_attribute("href"),
-                        "title": article.text
-                    }
-                    for article in articles
-                ]
-            }
-
-            # Clear previously fetched articles and return current ones
-            return dynamic_data
-
-        except Exception as e:
-            print(f"Error processing row {index} (Attempt {attempt + 1}): {e}")
-            time.sleep(1)  # Add delay before retry
-
-    print(f"Failed to process row {index} after {max_retries} attempts.")
-    return {"article": []}
-
-
-def scrape_google_trends(driver, url):
-    """Scrape Google Trends data and save to JSON."""
-    all_data = []
-
-    try:
-        driver.get(url)
-
-        WebDriverWait(driver, 20).until(
-            EC.presence_of_element_located((By.CSS_SELECTOR, '[jsname="oKdM2c"]'))
-        )
-
-        soup = BeautifulSoup(driver.page_source, "html.parser")
-        tables = soup.select('[jsname="cC57zf"]')
-
-        for table in tables:
-            rows_bs = table.find_all("tr")
-            selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]')
-
-            for index, row_bs in enumerate(rows_bs):
-                static_data = [
-                    [div.get_text(strip=True) for div in cell.find_all("div")]
-                    for cell in row_bs.find_all("td")[1:4]
-                ]
-                dynamic_data = process_selenium_row(index, selenium_rows, driver)
-                combined_row = {
-                    "static_data": static_data,
-                    "dynamic_data": dynamic_data
-                }
-                all_data.append(combined_row)
-
-        return all_data
-
-    except Exception as e:
-        print(f"An error occurred: {e}")

-    finally:
-        driver.quit()
-
-
-
-def crawl_url(url):
-    """Main function to be called from another script."""
-    driver = setup_driver()
-    return scrape_google_trends(driver, url)
-
-if __name__ == "__main__":
-    #crawl_url(url="https://trends.google.com/trends/trendingsearches/daily?geo=AT&category=2")
-    driver = setup_driver()
+from trendspy import Trends
+tr = Trends()
+
+TREND_TOPICS = {
+    1: "Autos and Vehicles",
+    2: "Beauty and Fashion",
+    3: "Business and Finance",
+    20: "Climate",
+    4: "Entertainment",
+    5: "Food and Drink",
+    6: "Games",
+    7: "Health",
+    8: "Hobbies and Leisure",
+    9: "Jobs and Education",
+    10: "Law and Government",
+    11: "Other",
+    13: "Pets and Animals",
+    14: "Politics",
+    15: "Science",
+    16: "Shopping",
+    17: "Sports",
+    18: "Technology",
+    19: "Travel and Transportation"
+}
+trends_json = {}
+def process_trends_for_country(country_code, trends_list):
+    if country_code not in trends_json:
+        trends_json[country_code] = {"All categories": {}}
+    for trend in trends_list:
+
+        category = None
+        for topic_id in trend.topics:
+            if topic_id in TREND_TOPICS:
+                category = TREND_TOPICS[topic_id]
+                break
+        if category is None:
+            category = TREND_TOPICS[11]
+
+        if category not in trends_json[country_code]:
+            trends_json[country_code][category] = {}
+
+        topic_name = trend.keyword
+
+        try:
+            news = tr.trending_now_news_by_ids(trend.news_tokens, max_news=3)
+            articles = [
+                {"title": article.title, "href": article.url}
+                for article in news
+            ]
+        except Exception as e:
+            articles = []
+        trends_json[country_code]["All categories"][topic_name] = {
+            "searchQueries": trend.volume,
+            "articles": articles,
+        }
+        trends_json[country_code][category][topic_name] = {
+            "searchQueries": trend.volume,
+            "articles": articles,
+        }
+
+def get_trends(countries: list):
+    for country in countries:
+        trends = tr.trending_now(geo=country)
+        process_trends_for_country(country, trends)
+        all_categories = trends_json[country]["All categories"]
+        sorted_all_categories = dict(
+            sorted(all_categories.items(), key=lambda x: x[1]["searchQueries"], reverse=True)
+        )
+        trends_json[country]["All categories"] = sorted_all_categories
+    return trends_json
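
For context, a minimal usage sketch of the new trendspy-based entry point, get_trends. This is not part of the commit: the geo codes, output file name, and __main__ wrapper are illustrative assumptions, and it presumes `pip install trendspy` with the module above saved as trend_crawl.py.

import json

from trend_crawl import get_trends

if __name__ == "__main__":
    # "AT" and "DE" are example geo codes; trendspy's trending_now(geo=...)
    # takes the same codes Google Trends uses.
    trends = get_trends(["AT", "DE"])
    # Persist the per-country, per-category structure the module builds.
    with open("trends.json", "w", encoding="utf-8") as f:  # hypothetical output path
        json.dump(trends, f, ensure_ascii=False, indent=2)

Note that trends_json is module-level state, so successive get_trends calls in the same process accumulate into (and return) the same dict rather than starting fresh.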