Add1E committed
Commit 56cfe41 · verified · 1 Parent(s): 1a4568a

Update trend_crawl.py

Files changed (1)
  1. trend_crawl.py +66 -114
trend_crawl.py CHANGED
@@ -1,120 +1,72 @@
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.chrome.service import Service as ChromeService
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from bs4 import BeautifulSoup
-from webdriver_manager.chrome import ChromeDriverManager
-import os
-import time
-
-# Configure Chrome options
-chrome_options = Options()
-chrome_options.add_argument("--headless")  # Run in headless mode
-chrome_options.add_argument("--disable-gpu")
-complete_starttime = time.time()
-
-# URL of the Google Trends page
-
-
-#script_dir = os.path.dirname(os.path.abspath(__file__))
-#driver_path = os.path.join(script_dir, 'chromedriver')
-
-def setup_driver():
-    options = webdriver.ChromeOptions()
-    options.add_argument('--headless')
-    options.add_argument('--no-sandbox')
-    options.add_argument('--disable-dev-shm-usage')
-    wd = webdriver.Chrome(options=options)
-    return wd
-
-def process_selenium_row(index, selenium_rows, driver):
-    """Extract dynamic data using Selenium by clicking on the row."""
-    max_retries = 3
-    for attempt in range(max_retries):
-        try:
-            articles = {}
-            # Refresh the rows before processing
-            #selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]')
-            row = selenium_rows[index]
-            driver.execute_script("arguments[0].click();", row)  # Use JavaScript click for stability
-
-            # Wait for the articles to load dynamically
-            WebDriverWait(driver, 10).until(
-                EC.presence_of_all_elements_located((By.CLASS_NAME, "xZCHj"))
-            )
-
-            # Fetch only the newly loaded articles
-            articles = driver.find_elements(By.CLASS_NAME, "xZCHj")
-            print(articles)
-            # Extract data from the current row only
-            dynamic_data = {
-                "article": [
-                    {
-                        "href": article.get_attribute("href"),
-                        "title": article.text
-                    }
-                    for article in articles
-                ]
-            }
-
-            # Clear previously fetched articles and return current ones
-            return dynamic_data
-
-        except Exception as e:
-            print(f"Error processing row {index} (Attempt {attempt + 1}): {e}")
-            time.sleep(1)  # Add delay before retry
-
-    print(f"Failed to process row {index} after {max_retries} attempts.")
-    return {"article": []}
-
-
-def scrape_google_trends(driver, url):
-    """Scrape Google Trends data and save to JSON."""
-    all_data = []
-
-    try:
-        driver.get(url)
-
-        WebDriverWait(driver, 20).until(
-            EC.presence_of_element_located((By.CSS_SELECTOR, '[jsname="oKdM2c"]'))
-        )
-
-        soup = BeautifulSoup(driver.page_source, "html.parser")
-        tables = soup.select('[jsname="cC57zf"]')
-
-        for table in tables:
-            rows_bs = table.find_all("tr")
-            selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]')
-
-            for index, row_bs in enumerate(rows_bs):
-                static_data = [
-                    [div.get_text(strip=True) for div in cell.find_all("div")]
-                    for cell in row_bs.find_all("td")[1:4]
-                ]
-                dynamic_data = process_selenium_row(index, selenium_rows, driver)
-                combined_row = {
-                    "static_data": static_data,
-                    "dynamic_data": dynamic_data
-                }
-                all_data.append(combined_row)
-
-        return all_data
-
-    except Exception as e:
-        print(f"An error occurred: {e}")

-    finally:
-        driver.quit()
-
-
-
-def crawl_url(url):
-    """Main function to be called from another script."""
-    driver = setup_driver()
-    return scrape_google_trends(driver, url)
-
-if __name__ == "__main__":
-    #crawl_url(url="https://trends.google.com/trends/trendingsearches/daily?geo=AT&category=2")
-    driver = setup_driver()
+from trendspy import Trends
+tr = Trends()
+
+TREND_TOPICS = {
+    1: "Autos and Vehicles",
+    2: "Beauty and Fashion",
+    3: "Business and Finance",
+    20: "Climate",
+    4: "Entertainment",
+    5: "Food and Drink",
+    6: "Games",
+    7: "Health",
+    8: "Hobbies and Leisure",
+    9: "Jobs and Education",
+    10: "Law and Government",
+    11: "Other",
+    13: "Pets and Animals",
+    14: "Politics",
+    15: "Science",
+    16: "Shopping",
+    17: "Sports",
+    18: "Technology",
+    19: "Travel and Transportation"
+}
+trends_json = {}
+def process_trends_for_country(country_code, trends_list):
+    if country_code not in trends_json:
+        trends_json[country_code] = {"All categories": {}}
+    for trend in trends_list:
+
+        category = None
+        for topic_id in trend.topics:
+            if topic_id in TREND_TOPICS:
+                category = TREND_TOPICS[topic_id]
+                break
+        if category is None:
+            category = TREND_TOPICS[11]
+
+        if category not in trends_json[country_code]:
+            trends_json[country_code][category] = {}
+
+        topic_name = trend.keyword
+
+        try:
+            news = tr.trending_now_news_by_ids(trend.news_tokens, max_news=3)
+            articles = [
+                {"title": article.title, "href": article.url}
+                for article in news
+            ]
+        except Exception as e:
+            articles = []
+        trends_json[country_code]["All categories"][topic_name] = {
+            "searchQueries": trend.volume,
+            "articles": articles,
+        }
+        trends_json[country_code][category][topic_name] = {
+            "searchQueries": trend.volume,
+            "articles": articles,
+        }
+
+def get_trends(countries: list):
+    for country in countries:
+        trends = tr.trending_now(geo=country)
+        process_trends_for_country(country, trends)
+        all_categories = trends_json[country]["All categories"]
+        sorted_all_categories = dict(
+            sorted(all_categories.items(), key=lambda x: x[1]["searchQueries"], reverse=True)
+        )
+        trends_json[country]["All categories"] = sorted_all_categories
+    return trends_json
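
For context, a minimal usage sketch of the new trendspy-based entry point, get_trends. This is not part of the commit: the geo codes, output file name, and __main__ wrapper are illustrative assumptions, and it presumes `pip install trendspy` with the module above saved as trend_crawl.py.

import json

from trend_crawl import get_trends

if __name__ == "__main__":
    # "AT" and "DE" are example geo codes; trendspy's trending_now(geo=...)
    # takes the same codes Google Trends uses.
    trends = get_trends(["AT", "DE"])
    # Persist the per-country, per-category structure the module builds.
    with open("trends.json", "w", encoding="utf-8") as f:  # hypothetical output path
        json.dump(trends, f, ensure_ascii=False, indent=2)

Note that trends_json is module-level state, so successive get_trends calls in the same process accumulate into (and return) the same dict rather than starting fresh.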