Add1E committed
Commit 00b85ba · verified · 1 parent: 1e07a06

Update trend_crawl.py

Files changed (1): trend_crawl.py (+110 −110)
trend_crawl.py CHANGED
@@ -1,110 +1,110 @@
  from selenium import webdriver
  from selenium.webdriver.common.by import By
  from selenium.webdriver.chrome.service import Service
  from selenium.webdriver.chrome.options import Options
  from selenium.webdriver.support.ui import WebDriverWait
  from selenium.webdriver.support import expected_conditions as EC
  from bs4 import BeautifulSoup
  import os
  import time

  # Configure Chrome options
  chrome_options = Options()
  chrome_options.add_argument("--headless")  # Run in headless mode
  chrome_options.add_argument("--disable-gpu")
  complete_starttime = time.time()

  # URL of the Google Trends page

  def setup_driver():
      """Set up the Selenium WebDriver."""
      script_dir = os.path.dirname(os.path.abspath(__file__))
-     driver_path = os.path.join(script_dir, 'chromedriver.exe')
+     driver_path = os.path.join(script_dir, 'chromedriver')
      chrome_options = Options()
      chrome_options.add_argument("--headless")  # Run in headless mode
      chrome_options.add_argument("--disable-gpu")
      driver = webdriver.Chrome(service=Service(driver_path), options=chrome_options)
      return driver

  def process_selenium_row(index, selenium_rows, driver):
      """Extract dynamic data using Selenium by clicking on the row."""
      max_retries = 3
      for attempt in range(max_retries):
          try:
              row = selenium_rows[index]
              row.click()

              # Wait for elements with class="xZCHj" to load
              WebDriverWait(driver, 10).until(
                  EC.presence_of_all_elements_located((By.CLASS_NAME, "xZCHj"))
              )

              links = driver.find_elements(By.CLASS_NAME, "xZCHj")
              dynamic_data = {
                  "article": [
                      {
                          "href": link.get_attribute("href"),
                          "title": link.text
                      }
                      for link in links
                  ]
              }

              if dynamic_data["article"]:
                  return dynamic_data
          except Exception as e:
              print(f"Error processing row {index} (Attempt {attempt + 1}): {e}")
              selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]')

      print(f"Failed to process row {index} after {max_retries} attempts.")
      return {"article": []}

  def scrape_google_trends(driver, url):
      """Scrape Google Trends data and save to JSON."""
      all_data = []

      try:
          driver.get(url)

          WebDriverWait(driver, 20).until(
              EC.presence_of_element_located((By.CSS_SELECTOR, '[jsname="oKdM2c"]'))
          )

          soup = BeautifulSoup(driver.page_source, "html.parser")
          tables = soup.select('[jsname="cC57zf"]')

          for table in tables:
              rows_bs = table.find_all("tr")
              selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]')

              for index, row_bs in enumerate(rows_bs):
                  static_data = [
                      [div.get_text(strip=True) for div in cell.find_all("div")]
                      for cell in row_bs.find_all("td")[1:4]
                  ]
                  print(static_data)
                  dynamic_data = process_selenium_row(index, selenium_rows, driver)
                  combined_row = {
                      "static_data": static_data,
                      "dynamic_data": dynamic_data
                  }
                  all_data.append(combined_row)

          return all_data

      except Exception as e:
          print(f"An error occurred: {e}")

      finally:
          driver.quit()



  def crawl_url(url):
      """Main function to be called from another script."""
      driver = setup_driver()
      return scrape_google_trends(driver, url)

  if __name__ == "__main__":
      #crawl_url(url="https://trends.google.com/trends/trendingsearches/daily?geo=AT&category=2")
      driver = setup_driver()
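
The one functional change in this hunk is the ChromeDriver filename passed to Service: 'chromedriver.exe', the Windows binary, becomes 'chromedriver', the Linux/macOS one. If the script ever has to run on both platforms again, a small platform check is one option. The sketch below is not part of the commit; resolve_driver_path is a hypothetical helper that mirrors the path logic in setup_driver:

import os
import sys

def resolve_driver_path(script_dir):
    # Hypothetical helper (not in the commit): pick the ChromeDriver
    # binary name that matches the current operating system.
    name = "chromedriver.exe" if sys.platform.startswith("win") else "chromedriver"
    return os.path.join(script_dir, name)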
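
For reference, crawl_url is the entry point the docstring advertises ("Main function to be called from another script"). A minimal caller could look like this sketch; the URL is the one left commented out under __main__, and the output filename trends.json is illustrative:

import json
from trend_crawl import crawl_url

# Hypothetical driver script: scrape one trends page and dump the rows to JSON.
rows = crawl_url("https://trends.google.com/trends/trendingsearches/daily?geo=AT&category=2")
with open("trends.json", "w", encoding="utf-8") as f:
    json.dump(rows, f, ensure_ascii=False, indent=2)

Note that scrape_google_trends quits the driver in its finally block, so the caller does not need to clean up the browser itself.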