Add1E committed on
Commit
af9105a
·
verified ·
1 Parent(s): ced5606

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +340 -334
  2. trend_crawl2.py +202 -0
app.py CHANGED
@@ -1,335 +1,341 @@
1
- from pytrends.request import TrendReq
2
- import streamlit as st
3
- import pandas as pd
4
- import xml.etree.ElementTree as ET
5
- import requests
6
- from datetime import datetime
7
- import pytz
8
- import hmac
9
- import os
10
- import time
11
- from PIL import Image
12
- from trend_crawl import crawl_url
13
-
14
- def parse_url(url):
15
- response = requests.get(url)
16
-
17
- root = ET.fromstring(response.content)
18
- return root
19
-
20
- def convert_into_dict(req_json):
21
-
22
- result = {}
23
-
24
- # Iterate over each entry in the JSON data
25
- for entry in req_json:
26
- # Extract 'entityName' and 'searchQueries' from 'static_data'
27
- static_data = entry.get("static_data", [])
28
- if static_data and len(static_data[0]) >= 4:
29
- entity_name = static_data[0][0] # First element
30
- search_queries = static_data[0][3] # Fourth element
31
- else:
32
- entity_name = None
33
- search_queries = None
34
-
35
- # Initialize the entity in the result dictionary if not already present
36
- if entity_name and entity_name not in result:
37
- result[entity_name] = {
38
- "searchQueries": search_queries,
39
- "articles": []
40
- }
41
-
42
- # Extract articles from 'dynamic_data'
43
- articles = entry.get("dynamic_data", {}).get("article", [])
44
- for article in articles:
45
- href = article.get("href")
46
- article_title = article.get("title")
47
-
48
- # Append the article information to the corresponding entity's article list
49
- if entity_name:
50
- result[entity_name]["articles"].append({
51
- "href": href,
52
- "title": article_title
53
- })
54
-
55
- return result
56
-
57
-
58
- def find_details(req_json, gewünschter_titel):
59
- gewünschte_details = []
60
- for trend_info in req_json:
61
- if trend_info['title'] == gewünschter_titel:
62
-
63
- for article in trend_info['articles']:
64
- article_details = {
65
- 'url': article['url'],
66
- 'snippet': article['snippet'],
67
- 'articleTitle': article['articleTitle'],
68
- 'time': article['time'],
69
- 'source' : article['source']
70
- }
71
-
72
- gewünschte_details.append(article_details)
73
- return gewünschte_details
74
-
75
- def find_details2(req_json):
76
- gewünschte_details = []
77
-
78
- for article in req_json:
79
- article_details = {
80
- 'url': article['url'],
81
- 'snippet': article['snippet'],
82
- 'articleTitle': article['title'],
83
- 'source' : article['source']
84
-
85
- }
86
-
87
- gewünschte_details.append(article_details)
88
- return gewünschte_details
89
-
90
- if 'reset' not in st.session_state:
91
- st.session_state.reset = False
92
-
93
- def display_trends_from_yesterday():
94
- checkbox_statuses = {}
95
- urls = []
96
-
97
- timezone = 'Europe/Vienna'
98
- today = datetime.now(pytz.timezone(timezone)).date()
99
- feed = parse_url(feed_url1)
100
- entries = []
101
- ns = {'ht': 'https://trends.google.de/trends/trendingsearches/daily'} # Define namespace
102
- for item in feed.findall('.//item'):
103
- pubDate = datetime.strptime(item.find('pubDate').text, '%a, %d %b %Y %H:%M:%S %z').date()
104
- # Filter: Überspringe, wenn pubDate heute ist
105
- if pubDate == today:
106
- continue
107
- entry = {
108
- 'title': item.find('title').text,
109
- 'pubDate': item.find('pubDate').text,
110
- 'approx_traffic': item.find('ht:approx_traffic', ns).text if item.find('ht:approx_traffic', ns) is not None else None,
111
- 'news_items': []
112
- }
113
- for news_item in item.findall('ht:news_item', ns):
114
- news_details = {
115
- 'title': news_item.find('ht:news_item_title', ns).text,
116
- 'snippet': news_item.find('ht:news_item_snippet', ns).text,
117
- 'url': news_item.find('ht:news_item_url', ns).text,
118
- 'source': news_item.find('ht:news_item_source', ns).text
119
- }
120
- entry['news_items'].append(news_details)
121
- entries.append(entry)
122
-
123
- count = 1
124
- for entry in entries:
125
- with st.expander(f"{count}• {entry['title']} | Generated Traffic: {entry['approx_traffic']}"):
126
- st.write(f"Veröffentlichungsdatum : {entry['pubDate']}")
127
- for count2, link in enumerate(entry['news_items'], start=1):
128
- checkbox_label = f"yesterday_{count}_{count2}"
129
- if st.session_state.reset:
130
- st.session_state[checkbox_label] = False
131
- checkbox_statuses[checkbox_label] = st.session_state.get(checkbox_label, False)
132
- checkbox_statuses[checkbox_label] = st.checkbox(
133
- f"{count2}• {link['title']} | {link['source']} | [Go To →]({link['url']})",
134
- value=checkbox_statuses[checkbox_label],
135
- key=checkbox_label
136
- )
137
- if checkbox_statuses[checkbox_label]:
138
- urls.append(link['url'])
139
-
140
- # Button am Ende des Expanders
141
- base_url = os.getenv("url")
142
- query_params = "&".join([f"article-links[]={url}" for url in urls])
143
- full_url = f"{base_url}{query_params}"
144
- st.link_button("Open All Links" , url= full_url)
145
- count += 1
146
-
147
-
148
-
149
- # Function to display articles for a specific category
150
- def display_articles_for_category(pn_option):
151
- checkbox_statuses = {}
152
- urls = []
153
-
154
-
155
- trending_data = st.session_state["real_trending_searches"][pn_option]
156
-
157
- if st.session_state.get("reset", False):
158
- for idx, (topic, data) in enumerate(trending_data.items()):
159
- for article_index, _ in enumerate(data["articles"]):
160
- checkbox_label = f"{pn_option}_{idx}_{article_index + 1}"
161
- st.session_state[checkbox_label] = False
162
-
163
- for idx, (topic, data) in enumerate(trending_data.items()):
164
-
165
-
166
- with st.expander(f"{idx + 1}• {topic} | Generated Traffic: {data['searchQueries']}"):
167
-
168
- for article_index, article in enumerate(data["articles"][:3], start=1):
169
- checkbox_label = f"{pn_option}_{idx}_{article_index}"
170
-
171
-
172
- current_value = st.session_state.get(checkbox_label, False)
173
- checkbox_statuses[checkbox_label] = current_value
174
-
175
-
176
- disabled = (not current_value) and (sum(checkbox_statuses.values()) >= MAX_CHECKED)
177
-
178
- checkbox_statuses[checkbox_label] = st.checkbox(
179
- f"{article_index}• {article['title']} | [Go To →]({article['href']})",
180
- value=current_value,
181
- key=checkbox_label,
182
- disabled=disabled
183
- )
184
-
185
- if checkbox_statuses[checkbox_label]:
186
- urls.append(article["href"])
187
-
188
- base_url = os.getenv("url", "https://example.com/?")
189
- query_params = "&".join([f"article-links[]={u}" for u in urls])
190
- full_url = f"{base_url}{query_params}"
191
- st.link_button("Open All Links", url=full_url)
192
-
193
- # Funktion zum Rendern von Artikeln für heute
194
- def display_articles_for_today(count, index):
195
- checkbox_statuses = {}
196
- urls = []
197
- # Dictionary zur Verwaltung des Status jeder Checkbox
198
- for count2, url in enumerate(index['articles'], start=1):
199
- checkbox_label = f"today_{count}_{count2}"
200
- if st.session_state.reset:
201
- st.session_state[checkbox_label] = False
202
- checkbox_statuses[checkbox_label] = st.session_state.get(checkbox_label, False)
203
-
204
-
205
- with st.expander(f"{count+1}• {index['title']['query']} | Generated Traffic: {index['formattedTraffic']}"):
206
- articles = find_details2(index['articles'])
207
- for count2, url in enumerate(articles, start=1):
208
- checkbox_label = f"today_{count}_{count2}"
209
- disabled = not checkbox_statuses[checkbox_label] and sum(checkbox_statuses.values()) >= MAX_CHECKED
210
- checkbox_statuses[checkbox_label] = st.checkbox(
211
- f"{count2}• {url['articleTitle']} | {url['source']} | [Go To →]({url['url']})",
212
- value=checkbox_statuses[checkbox_label],
213
- key=checkbox_label,
214
- disabled=disabled
215
- )
216
- if checkbox_statuses[checkbox_label]:
217
- urls.append(url['url'])
218
-
219
- # Button am Ende des Expanders
220
- base_url = os.getenv("url")
221
- query_params = "&".join([f"article-links[]={url}" for url in urls])
222
- full_url = f"{base_url}{query_params}"
223
- st.link_button("Open All Links" , url= full_url)
224
-
225
-
226
- categories = {
227
- "Alle": "all"
228
- # "Gesundheit": "m",
229
- # "Business": "b",
230
- # "Headlines": "h",
231
- # "Sport": "s",
232
- # "Entertainment": "e",
233
- # "Technik": "t",
234
- }
235
-
236
- country_list = {
237
- "Germamy" : "DE",
238
- "Austria" : "AT"
239
- }
240
-
241
- pytrend = TrendReq(hl='de-AT', tz=360, timeout=(10,50))
242
-
243
- if 'base_load_finished' not in st.session_state:
244
- st.session_state["real_trending_searches"] = {}
245
- st.session_state["base_data"] = {}
246
- st.session_state["pn"] = "AT"
247
- print(st.session_state.reset)
248
- if 'base_load_finished' not in st.session_state or st.session_state.reset:
249
- with st.spinner("Loading Trends"):
250
- st.session_state["today"] = {}
251
- st.session_state["base"] = {}
252
- for country_name, pn_option in country_list.items():
253
- st.session_state["base_data"][pn_option] = {}
254
- st.session_state["real_trending_searches"][pn_option] = {}
255
- st.session_state["today"][pn_option] = pytrend.today_searches(pn=pn_option)
256
-
257
- for category_name, category_code in categories.items():
258
- st.session_state["base"][pn_option] = crawl_url(url=f"https://trends.google.com/trends/trendingsearches/daily?geo={pn_option}&category=2")
259
- st.session_state["real_trending_searches"][pn_option] = convert_into_dict(st.session_state["base"][pn_option])
260
- st.session_state["base_load_finished"]= True
261
-
262
-
263
- MAX_CHECKED = 3
264
-
265
- def check_password():
266
- """Returns `True` if the user had the correct password."""
267
-
268
- def password_entered():
269
- """Checks whether a password entered by the user is correct."""
270
- if hmac.compare_digest(st.session_state["password"], os.environ.get("PASSWORD")):
271
- st.session_state["password_correct"] = True
272
- del st.session_state["password"] # Don't store the password.
273
- else:
274
- st.session_state["password_correct"] = False
275
-
276
- # Return True if the password is validated.
277
- if st.session_state.get("password_correct", False):
278
- return True
279
-
280
- # Show input for password.
281
- st.text_input(
282
- "Password", type="password", on_change=password_entered, key="password"
283
- )
284
- if "password_correct" in st.session_state:
285
- st.error("😕 Password incorrect")
286
- return False
287
-
288
-
289
- if not check_password():
290
- st.stop() # Do not continue if check_password is not True.
291
-
292
-
293
-
294
-
295
-
296
- if 'selected_option' not in st.session_state:
297
- st.session_state['selected_option'] = "default_value" # You can set a default value as needed
298
-
299
- img = Image.open(r"heute_tensora.png")
300
- st.sidebar.image(img)
301
-
302
- # Now, you can safely use st.session_state['selected_option']
303
- # Selectbox to choose a country
304
- selected_country = st.sidebar.selectbox("Choose a Country", ["AT", "DE"])
305
- feed_url1 = f'https://trends.google.de/trends/trendingsearches/daily/rss?geo={selected_country}'
306
-
307
- # Button to trigger actions
308
- if st.sidebar.button("Change Country"):
309
- if selected_country == "AT":
310
- st.session_state["pn"] = selected_country
311
- elif selected_country == "DE":
312
- st.session_state["pn"] = selected_country
313
-
314
- selected_option = st.sidebar.radio("Choose an option", ["Realzeit Anfragen", "Tagesaktuelle Anfragen", "Trends von Gestern"])
315
- st.warning("Die aufgelisteten Keywörter für erhöhte Reichweite in den Überschriften verwenden")
316
- if selected_option == "Tagesaktuelle Anfragen":
317
-
318
- for count, index in enumerate(st.session_state["today"][selected_country], start=0):
319
- try:
320
- display_articles_for_today(count, index)
321
- except Exception as e:
322
- st.code(e)
323
- continue
324
- elif selected_option == "Realzeit Anfragen":
325
- #choices_list = list(st.session_state["real_trending_searches"][selected_country].keys())
326
- #if len(categories) == len(choices_list):
327
- # st.session_state["base_load_finished"] = True
328
- #auswahl = st.selectbox("Select Ressort", choices_list)
329
-
330
- display_articles_for_category(selected_country)
331
- elif selected_option == "Trends von Gestern":
332
- display_trends_from_yesterday()
333
-
334
- if st.session_state.reset:
 
 
 
 
 
 
335
  st.session_state["reset"] = False
 
1
+ from pytrends.request import TrendReq
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import xml.etree.ElementTree as ET
5
+ import requests
6
+ from datetime import datetime
7
+ import pytz
8
+ import hmac
9
+ import os
10
+ import time
11
+ from PIL import Image
12
+ from trend_crawl2 import crawl_url
13
+ import re
14
+
15
# SECURITY: hardcoded fallback password checked in with the source — rotate and
# move to real deployment configuration. setdefault (instead of plain assignment)
# at least lets an externally provided PASSWORD env var win over the fallback.
os.environ.setdefault("PASSWORD", "heute_123")
16
+
17
+
18
def parse_url(url):
    """Fetch *url* over HTTP and parse the response body as XML.

    Returns the root ``xml.etree.ElementTree.Element`` of the document.
    Raises ``requests`` / ``ET.ParseError`` exceptions on network or parse
    failure (callers currently let these propagate).
    """
    # FIX: a timeout is required — requests.get without one can block the
    # Streamlit script forever if the RSS feed stalls.
    response = requests.get(url, timeout=10)

    root = ET.fromstring(response.content)
    return root
23
+
24
def convert_into_dict(req_json):
    """Normalise crawler output into ``{category: {entity: info}}`` form.

    *req_json* maps a category label to a list of row dicts, each with
    ``static_data`` (list of cell lists; first cell holds the entity name,
    fourth the traffic/search-query figure) and ``dynamic_data`` (with an
    ``article`` list of ``{"href", "title"}`` dicts) — the shape produced by
    ``trend_crawl2.crawl_url``.

    Returns ``{category: {entity_name: {"searchQueries": str,
    "articles": [{"href", "title"}, ...]}}}``.
    """
    result = {}

    for category, entries in req_json.items():
        # Ensure the category exists even if every entry below is skipped.
        if category not in result:
            result[category] = {}

        for entry in entries:
            # Pull entity name and search-query figure out of 'static_data'.
            static_data = entry.get("static_data", [])
            if static_data and len(static_data[0]) >= 4:
                entity_name = static_data[0][0]    # first element
                search_queries = static_data[0][3]  # fourth element
            else:
                entity_name = None
                search_queries = None

            # FIX: rows without a usable entity name are skipped entirely —
            # previously their articles could be appended under a missing key.
            if not entity_name:
                continue

            if entity_name not in result[category]:
                result[category][entity_name] = {
                    "searchQueries": search_queries,
                    "articles": []
                }

            # Attach every article of this row to the entity.
            for article in entry.get("dynamic_data", {}).get("article", []):
                result[category][entity_name]["articles"].append({
                    "href": article.get("href"),
                    "title": article.get("title")
                })

    return result
64
+
65
+
66
def find_details(req_json, gewünschter_titel):
    """Collect article details for every trend entry matching the given title."""
    details = []
    for trend_info in req_json:
        if trend_info['title'] != gewünschter_titel:
            continue
        for article in trend_info['articles']:
            details.append({
                'url': article['url'],
                'snippet': article['snippet'],
                'articleTitle': article['articleTitle'],
                'time': article['time'],
                'source': article['source'],
            })
    return details
82
+
83
def find_details2(req_json):
    """Normalise raw article dicts, exposing 'title' under the 'articleTitle' key."""
    return [
        {
            'url': article['url'],
            'snippet': article['snippet'],
            'articleTitle': article['title'],
            'source': article['source'],
        }
        for article in req_json
    ]
97
+
98
# One-shot "reset" flag kept in Streamlit session state: when True, the display
# functions clear their per-checkbox session keys on the next render, and the
# flag is set back to False at the bottom of the script.
if 'reset' not in st.session_state:
    st.session_state.reset = False
100
+
101
def display_trends_from_yesterday():
    """Render yesterday's Google-Trends RSS entries as expanders with
    per-article checkboxes and an "open all selected" link button.

    Reads the module-level ``feed_url1`` (RSS URL for the selected country)
    and Streamlit session state; renders UI as a side effect, returns None.
    """
    checkbox_statuses = {}
    urls = []

    timezone = 'Europe/Vienna'
    today = datetime.now(pytz.timezone(timezone)).date()
    feed = parse_url(feed_url1)
    entries = []
    ns = {'ht': 'https://trends.google.de/trends/trendingsearches/daily'}  # RSS namespace
    for item in feed.findall('.//item'):
        pubDate = datetime.strptime(item.find('pubDate').text, '%a, %d %b %Y %H:%M:%S %z').date()
        # Filter: skip the item if its pubDate is today (we only want yesterday's).
        if pubDate == today:
            continue
        entry = {
            'title': item.find('title').text,
            'pubDate': item.find('pubDate').text,
            'approx_traffic': item.find('ht:approx_traffic', ns).text if item.find('ht:approx_traffic', ns) is not None else None,
            'news_items': []
        }
        for news_item in item.findall('ht:news_item', ns):
            news_details = {
                'title': news_item.find('ht:news_item_title', ns).text,
                'snippet': news_item.find('ht:news_item_snippet', ns).text,
                'url': news_item.find('ht:news_item_url', ns).text,
                'source': news_item.find('ht:news_item_source', ns).text
            }
            entry['news_items'].append(news_details)
        entries.append(entry)

    count = 1
    for entry in entries:
        with st.expander(f"{count}• {entry['title']} | Generated Traffic: {entry['approx_traffic']}"):
            st.write(f"Veröffentlichungsdatum : {entry['pubDate']}")
            for count2, link in enumerate(entry['news_items'], start=1):
                checkbox_label = f"yesterday_{count}_{count2}"
                # A pending reset clears this checkbox before it is drawn.
                if st.session_state.reset:
                    st.session_state[checkbox_label] = False
                checkbox_statuses[checkbox_label] = st.session_state.get(checkbox_label, False)
                checkbox_statuses[checkbox_label] = st.checkbox(
                    f"{count2}• {link['title']} | {link['source']} | [Go To →]({link['url']})",
                    value=checkbox_statuses[checkbox_label],
                    key=checkbox_label
                )
                if checkbox_statuses[checkbox_label]:
                    urls.append(link['url'])

            # Button at the end of the expander.
            # NOTE(review): `urls` accumulates across ALL expanders, so each
            # button links every URL checked so far, not only this entry's —
            # confirm this is intended.
            base_url = os.getenv("url")
            query_params = "&".join([f"article-links[]={url}" for url in urls])
            full_url = f"{base_url}{query_params}"
            st.link_button("Open All Links" , url= full_url)
        count += 1
154
+
155
+
156
+
157
+ # Function to display articles for a specific category
158
# Function to display articles for a specific category
def display_articles_for_category(category):
    """Render the trending topics of *category* (for the globally selected
    country) as expanders with per-article checkboxes, capped at MAX_CHECKED
    simultaneously checked boxes, plus a link button opening all selections.

    Reads module-level ``selected_country`` and Streamlit session state;
    renders UI as a side effect, returns None.
    """
    checkbox_statuses = {}
    urls = []


    trending_data = st.session_state["real_trending_searches"][selected_country][category]

    # A pending reset clears every checkbox key belonging to this category.
    if st.session_state.get("reset", False):
        for idx, (topic, data) in enumerate(trending_data.items()):
            for article_index, _ in enumerate(data["articles"]):
                # +1 matches the 1-based labels produced by the display loop below.
                checkbox_label = f"{category}_{idx}_{article_index + 1}"
                st.session_state[checkbox_label] = False

    for idx, (topic, data) in enumerate(trending_data.items()):


        with st.expander(f"{idx + 1}• {topic} | Generated Traffic: {data['searchQueries']}"):

            for article_index, article in enumerate(data["articles"], start=1):
                checkbox_label = f"{category}_{idx}_{article_index}"


                current_value = st.session_state.get(checkbox_label, False)
                checkbox_statuses[checkbox_label] = current_value


                # Disable unchecked boxes once MAX_CHECKED boxes are already checked.
                disabled = (not current_value) and (sum(checkbox_statuses.values()) >= MAX_CHECKED)

                checkbox_statuses[checkbox_label] = st.checkbox(
                    f"{article_index}• {article['title']} | [Go To →]({article['href']})",
                    value=current_value,
                    key=checkbox_label,
                    disabled=disabled
                )

                if checkbox_statuses[checkbox_label]:
                    urls.append(article["href"])

    # Single button below all expanders linking every checked article.
    base_url = os.getenv("url", "https://example.com/?")
    query_params = "&".join([f"article-links[]={u}" for u in urls])
    full_url = f"{base_url}{query_params}"
    st.link_button("Open All Links", url=full_url)
200
+
201
+ # Funktion zum Rendern von Artikeln für heute
202
# Render the articles of one of today's trend entries.
def display_articles_for_today(count, index):
    """Render one entry of today's pytrends results (*index*) as an expander
    with per-article checkboxes (capped at MAX_CHECKED) and a link button.

    *count* is the 0-based position of the entry; *index* is presumably a
    pytrends ``today_searches`` row with 'articles', 'title'['query'] and
    'formattedTraffic' keys — confirm against the pytrends version in use.
    Renders UI as a side effect, returns None.
    """
    checkbox_statuses = {}
    urls = []
    # Pre-populate/clear the status of every checkbox of this entry.
    for count2, url in enumerate(index['articles'], start=1):
        checkbox_label = f"today_{count}_{count2}"
        if st.session_state.reset:
            st.session_state[checkbox_label] = False
        checkbox_statuses[checkbox_label] = st.session_state.get(checkbox_label, False)


    with st.expander(f"{count+1}• {index['title']['query']} | Generated Traffic: {index['formattedTraffic']}"):
        articles = find_details2(index['articles'])
        for count2, url in enumerate(articles, start=1):
            checkbox_label = f"today_{count}_{count2}"
            # Disable unchecked boxes once MAX_CHECKED boxes are already checked.
            disabled = not checkbox_statuses[checkbox_label] and sum(checkbox_statuses.values()) >= MAX_CHECKED
            checkbox_statuses[checkbox_label] = st.checkbox(
                f"{count2}• {url['articleTitle']} | {url['source']} | [Go To →]({url['url']})",
                value=checkbox_statuses[checkbox_label],
                key=checkbox_label,
                disabled=disabled
            )
            if checkbox_statuses[checkbox_label]:
                urls.append(url['url'])

        # Button at the end of the expander opening all checked articles.
        base_url = os.getenv("url")
        query_params = "&".join([f"article-links[]={url}" for url in urls])
        full_url = f"{base_url}{query_params}"
        st.link_button("Open All Links" , url= full_url)
232
+
233
+
234
# Countries offered in the UI, mapped to Google Trends "pn" country codes.
country_list = {
    "Germany" : "DE",  # FIX: was misspelled "Germamy"
    "Austria" : "AT"
}

pytrend = TrendReq(hl='de-AT', tz=360, timeout=(10,50))

# First run of the session: create the session-state containers.
if 'base_load_finished' not in st.session_state:
    st.session_state["real_trending_searches"] = {}
    st.session_state["base_data"] = {}
    st.session_state["pn"] = "AT"
print(st.session_state.reset)
# (Re)load all trend data on the first run or after an explicit reset.
if 'base_load_finished' not in st.session_state or st.session_state.reset:
    with st.spinner("Loading Trends"):
        st.session_state["today"] = {}
        st.session_state["base"] = {}
        for country_name, pn_option in country_list.items():
            st.session_state["base_data"][pn_option] = {}
            st.session_state["real_trending_searches"][pn_option] = {}
            st.session_state["today"][pn_option] = pytrend.today_searches(pn=pn_option)

            # Crawl the "daily trends" page for this country and normalise it.
            st.session_state["base"][pn_option] = crawl_url(url=f"https://trends.google.com/trends/trendingsearches/daily?geo={pn_option}&category=2")
            st.session_state["real_trending_searches"][pn_option] = convert_into_dict(st.session_state["base"][pn_option])
        # FIX: removed `st.code(st.session_state["real_trending_searches"])`
        # debug leftover that dumped the entire scraped dataset onto the page.
        st.session_state["base_load_finished"] = True

# Maximum number of simultaneously checked article checkboxes per view.
MAX_CHECKED = 3
261
+
262
def check_password():
    """Return ``True`` once the user has entered the correct password.

    Renders a password input until validation succeeds; stores only the
    boolean outcome (never the password itself) in session state.
    """

    def password_entered():
        """Check whether the password entered by the user is correct."""
        # Constant-time comparison; default to "" so a missing PASSWORD env
        # var cannot crash compare_digest with a TypeError.
        if hmac.compare_digest(st.session_state["password"], os.environ.get("PASSWORD", "")):
            st.session_state["password_correct"] = True
            del st.session_state["password"]  # Don't store the password.
        else:
            st.session_state["password_correct"] = False
        # FIX: removed st.code("asdf") / st.code("sad") debug leftovers that
        # leaked validation feedback on every password entry.

    # Return True if the password was already validated this session.
    if st.session_state.get("password_correct", False):
        return True

    # Show input for password.
    st.text_input(
        "Password", type="password", on_change=password_entered, key="password"
    )
    if "password_correct" in st.session_state:
        st.error("😕 Password incorrect")
    return False
286
+
287
+
288
+ if not check_password():
289
+ st.stop() # Do not continue if check_password is not True.
290
+
291
+
292
+
293
+
294
+
295
# --- Sidebar: country selection and navigation -------------------------------
if 'selected_option' not in st.session_state:
    st.session_state['selected_option'] = "default_value"  # You can set a default value as needed

# Branding image in the sidebar (file must exist in the working directory).
img = Image.open(r"heute_tensora.png")
st.sidebar.image(img)

# Selectbox to choose a country; the RSS feed URL follows the selection.
selected_country = st.sidebar.selectbox("Choose a Country", ["AT", "DE"])
feed_url1 = f'https://trends.google.de/trends/trendingsearches/daily/rss?geo={selected_country}'

# Button to persist the chosen country code into session state.
# NOTE(review): both branches do the same assignment; the if/elif looks redundant.
if st.sidebar.button("Change Country"):
    if selected_country == "AT":
        st.session_state["pn"] = selected_country
    elif selected_country == "DE":
        st.session_state["pn"] = selected_country

selected_option = st.sidebar.radio("Choose an option", ["Realzeit Anfragen", "Tagesaktuelle Anfragen", "Trends von Gestern"])
st.warning("Die aufgelisteten Keywörter für erhöhte Reichweite in den Überschriften verwenden")
315
+
316
# --- Main view dispatch on the sidebar radio selection -----------------------
if selected_option == "Tagesaktuelle Anfragen":

    # One expander per entry of today's pytrends results; failures of a single
    # entry are shown inline and do not abort the rest.
    for count, index in enumerate(st.session_state["today"][selected_country], start=0):
        try:
            display_articles_for_today(count, index)
        except Exception as e:
            st.code(e)
            continue

elif selected_option == "Realzeit Anfragen":

    raw_choices_list = list(st.session_state["real_trending_searches"][selected_country].keys())

    # Category labels carry a trailing count like "Sport (12)"; strip it for
    # display while keeping a mapping back to the raw key.
    cleaned_to_raw_mapping = {re.sub(r"\s\(\d+\)$", "", choice): choice for choice in raw_choices_list}
    choices_list = list(cleaned_to_raw_mapping.keys())

    auswahl = st.selectbox("Select Ressort", choices_list, index=0)


    display_articles_for_category(cleaned_to_raw_mapping[auswahl])

elif selected_option == "Trends von Gestern":
    display_trends_from_yesterday()

# Consume the one-shot reset flag after all views had a chance to clear state.
if st.session_state.reset:
    st.session_state["reset"] = False
trend_crawl2.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from selenium import webdriver
2
+ from selenium.webdriver.common.by import By
3
+ from selenium.webdriver.chrome.service import Service as ChromeService
4
+ from selenium.common.exceptions import ElementClickInterceptedException
5
+ from selenium.webdriver.chrome.options import Options
6
+ from selenium.webdriver.support.ui import WebDriverWait
7
+ from selenium.webdriver.support import expected_conditions as EC
8
+ from bs4 import BeautifulSoup
9
+ from webdriver_manager.chrome import ChromeDriverManager
10
+ import time
11
+ import json
12
+
13
# Configure Chrome options
# NOTE(review): this module-level options object appears unused — setup_driver()
# below builds its own ChromeOptions. Presumably a leftover; confirm before removal.
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--disable-gpu")
+
18
+
19
def setup_driver():
    """Create a headless Chrome WebDriver configured for containerised runs."""
    opts = webdriver.ChromeOptions()
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        opts.add_argument(flag)
    return webdriver.Chrome(options=opts)
26
+
27
def click_and_scrape(driver, url):
    """Click each category <li> on the Trends page and scrape its table.

    Opens *url*, expands the category dropdown, iterates over the category
    list items, and scrapes the visible trend table for each. Always quits
    *driver* before returning.

    Returns a dict mapping category label (or ``iteration_<n>`` fallback)
    to the rows produced by ``scrape_google_trends``.
    """
    result_dict = {}
    try:
        driver.get(url)

        # Open the category dropdown; retry because an overlay can intercept
        # the first click(s) while the page is still settling.
        for attempt in range(4):
            try:
                button = WebDriverWait(driver, 20).until(
                    EC.element_to_be_clickable((
                        By.XPATH,
                        "//button[@aria-label='Alle Kategorien, Kategorie auswählen']"
                    ))
                )
                print("Button located.")
                # Scroll into view to ensure visibility.
                driver.execute_script("arguments[0].scrollIntoView();", button)
                print(button.get_attribute("outerHTML"))
                button.click()
                print("Button clicked successfully.")
                break
            except ElementClickInterceptedException:
                print(f"Attempt {attempt + 1}: Click intercepted. Retrying...")

        # FIX: li_elements was unbound (NameError) when the wait below failed;
        # initialise it so failure degrades to "no categories scraped".
        li_elements = []
        try:
            # Wait for the <ul> with the category list to load.
            ul_element = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    "//ul[@aria-label='Kategorie']"
                ))
            )
            li_elements = ul_element.find_elements(By.TAG_NAME, "li")
        except Exception as e:
            print(f"Error locating ul_element: {e}")

        # Item 2 plus everything from item 4 on; the skipped indices are
        # presumably separators/headers — confirm against the live page.
        # FIX: guard against short lists (li_elements[2] raised IndexError).
        selected_elements = ([li_elements[2]] + li_elements[4:]) if len(li_elements) > 2 else []
        for index, li in enumerate(selected_elements):
            try:
                # Scroll each <li> into view and click it via JavaScript for stability.
                driver.execute_script("arguments[0].scrollIntoView();", li)
                driver.execute_script("arguments[0].click();", li)
                print(f"Clicked LI {index} using JavaScript.")
                time.sleep(2)
                try:
                    span = li.find_element(By.CLASS_NAME, "W7g1Rb-rymPhb-fpDzbe-fmcmS")
                    span_content = span.get_attribute("innerText")
                    print(f"Extracted span content for LI {index}: {span_content}")
                    data = scrape_google_trends(driver)
                    result_dict[f"{span_content}"] = data
                except Exception as e:
                    print(f"Could not find or extract span content in LI {index}: {e}")
                    span_content = f"iteration_{index}"
                    result_dict[f"{span_content}"] = []

            except Exception as e:
                print(f"Error interacting with LI {index}: {e}")

    except Exception as e:
        print(f"Error during click and scrape: {e}")

    finally:
        driver.quit()

    return result_dict
119
+
120
def process_selenium_row(index, rows, driver):
    """Extract dynamic data for one table row by clicking it via Selenium.

    Clicks ``rows[index]``, waits for the article links (class ``xZCHj``)
    to load, and returns ``{"article": [{"href", "title"}, ...]}``. Retries
    up to three times; returns an empty article list after the last failure.
    """
    max_retries = 3
    last_error = None
    for attempt in range(max_retries):
        try:
            # JavaScript click is more stable than the native one here.
            driver.execute_script("arguments[0].click();", rows[index])

            # Wait for the articles to load dynamically.
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "xZCHj"))
            )

            # Fetch the currently visible articles for this row only.
            articles = driver.find_elements(By.CLASS_NAME, "xZCHj")
            return {
                "article": [
                    {
                        "href": article.get_attribute("href"),
                        "title": article.text
                    }
                    for article in articles
                ]
            }

        except Exception as e:
            last_error = e

    # FIX: the captured exception was previously stored but never reported.
    print(f"Failed to process row {index} after {max_retries} attempts: {last_error}")
    return {"article": []}
155
+
156
def scrape_google_trends(driver):
    """Scrape the trend table currently shown in *driver*.

    Parses the static cell text with BeautifulSoup while clicking each row
    via Selenium (process_selenium_row) for the dynamically loaded articles.
    Returns a list of {"static_data": [...], "dynamic_data": {...}} rows;
    on any failure dumps the page source to page_source_debug.html and
    returns [].
    """
    all_data = []
    try:
        selenium_rows = None
        # Short wait: the rows are expected to be present already at this point.
        WebDriverWait(driver, 2).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '[jsname="oKdM2c"]'))
        )
        # Static text comes from a BeautifulSoup snapshot; clicks still go
        # through the live Selenium row elements below.
        soup = BeautifulSoup(driver.page_source, "html.parser")
        selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]')
        tables = soup.select('[jsname="cC57zf"]')

        for table in tables:
            rows_bs = table.find_all("tr")
            for index, row_bs in enumerate(rows_bs):
                # Columns 1..3 hold the displayed trend cells (column 0 skipped).
                static_data = [
                    [div.get_text(strip=True) for div in cell.find_all("div")]
                    for cell in row_bs.find_all("td")[1:4]
                ]
                # NOTE(review): assumes BeautifulSoup row order matches the
                # selenium_rows order — confirm if tables ever reorder.
                dynamic_data = process_selenium_row(index, selenium_rows, driver)
                combined_row = {
                    "static_data": static_data,
                    "dynamic_data": dynamic_data
                }
                all_data.append(combined_row)

        return all_data

    except Exception as e:
        # Persist the page source to aid debugging failed scrapes.
        with open(f"page_source_debug.html", "w", encoding="utf-8") as f:
            f.write(driver.page_source)
        print(f"An error occurred during scraping: {e}")
        return []
189
+
190
def crawl_url(url="https://trends.google.com/trends/trendingsearches/daily?geo=AT"):
    """Crawl the Google Trends daily page at *url* and return the scraped data."""
    browser = setup_driver()
    return click_and_scrape(browser, url)
195
+
196
# Script entry point: run a full crawl and persist the result as JSON.
if __name__ == "__main__":
    results = crawl_url()
    try:
        with open("results.json", "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=4)
    except Exception as e:
        print(f"Error writing results to JSON: {e}")