poemsforaphrodite committed on
Commit
7eda627
·
verified ·
1 Parent(s): b0ff441

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -342
app.py CHANGED
@@ -1,342 +1,3 @@
1
- # Standard library imports
2
- import datetime
3
- import base64
4
- import os
5
-
6
- # Related third-party imports
7
- import streamlit as st
8
- from streamlit_elements import elements
9
- from google_auth_oauthlib.flow import Flow
10
- from googleapiclient.discovery import build
11
- from dotenv import load_dotenv
12
- import pandas as pd
13
- import searchconsole
14
- import cohere
15
- from sklearn.metrics.pairwise import cosine_similarity
16
- import requests
17
- from bs4 import BeautifulSoup
18
-
19
- load_dotenv()
20
- #test
21
-
22
# Initialize Cohere client — the API key must be present in the environment
# (loaded above via load_dotenv); a missing key fails fast with KeyError.
COHERE_API_KEY = os.environ["COHERE_API_KEY"]
co = cohere.Client(COHERE_API_KEY)

# Configuration: set True when running locally, False when deployed
# (e.g. Streamlit Cloud / Hugging Face Spaces).
IS_LOCAL = False

# Constants
SEARCH_TYPES = ["web", "image", "video", "news", "discover", "googleNews"]
DATE_RANGE_OPTIONS = [
    "Last 7 Days",
    "Last 30 Days",
    "Last 3 Months",
    "Last 6 Months",
    "Last 12 Months",
    "Last 16 Months",
    "Custom Range",
]
DEVICE_OPTIONS = ["All Devices", "desktop", "mobile", "tablet"]
BASE_DIMENSIONS = ["page", "query", "country", "date"]
MAX_ROWS = 250_000        # row cap passed to the GSC query
DF_PREVIEW_ROWS = 100     # rows shown in the preview expander
44
-
45
- # -------------
46
- # Streamlit App Configuration
47
- # -------------
48
-
49
def setup_streamlit():
    """Configure the Streamlit page (title, wide layout) and draw the app header."""
    st.set_page_config(page_title="Simple Google Search Console Data", layout="wide")
    # Fixed user-visible typo: "Relenvacy" -> "Relevancy"
    st.title("GSC Relevancy Score Calculator")
    # st.markdown(f"### Lightweight GSC Data Extractor. (Max {MAX_ROWS:,} Rows)")
    st.divider()
54
-
55
def init_session_state():
    """Seed ``st.session_state`` with defaults for every key the app reads.

    Existing values are never overwritten, so this is safe to call on each
    rerun of the script.
    """
    today = datetime.date.today()
    week_ago = today - datetime.timedelta(days=7)
    defaults = {
        'selected_property': None,
        'selected_search_type': 'web',
        'selected_date_range': 'Last 7 Days',
        'start_date': week_ago,
        'end_date': today,
        'selected_dimensions': ['page', 'query'],
        'selected_device': 'All Devices',
        'custom_start_date': week_ago,
        'custom_end_date': today,
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value
74
-
75
- # -------------
76
- # Data Processing Functions
77
- # -------------
78
-
79
def fetch_content(url, timeout=10):
    """Download *url* and return its visible text.

    Args:
        url: Page to fetch.
        timeout: Seconds before the HTTP request is aborted. New parameter
            (default 10) — the previous version called ``requests.get``
            without a timeout and could hang indefinitely on a stalled host.

    Returns:
        The page text on success, or the error message string on failure.
        Errors are returned (not raised) because callers embed the result
        directly as "content" for scoring.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text(separator=' ', strip=True)
    except requests.RequestException as e:
        # Best-effort: the error text becomes the content that gets embedded.
        return str(e)
88
-
89
def generate_embeddings(text_list, model_type):
    """Embed *text_list* with Cohere and return the embedding vectors.

    ``model_type == 'english'`` selects the English v3 model; any other
    value selects the multilingual v3 model. An empty input short-circuits
    to ``[]`` without calling the API.
    """
    if not text_list:
        return []
    chosen_model = (
        'embed-english-v3.0' if model_type == 'english'
        else 'embed-multilingual-v3.0'
    )
    result = co.embed(
        model=chosen_model,
        texts=text_list,
        input_type='search_document',
    )
    return result.embeddings
97
-
98
def calculate_relevancy_scores(df, model_type):
    """Return *df* with a ``relevancy_score`` column attached.

    Each row's score is the cosine similarity between the embedding of its
    query and the embedding of its page's fetched text. On any failure the
    whole column is set to 0 and a warning is shown instead of raising.
    """
    try:
        contents = [fetch_content(url) for url in df['page']]
        content_vecs = generate_embeddings(contents, model_type)
        query_vecs = generate_embeddings(df['query'].tolist(), model_type)
        # diagonal() pairs query i with page i from the full similarity matrix
        scores = cosine_similarity(query_vecs, content_vecs).diagonal()
        df = df.assign(relevancy_score=scores)
    except Exception as e:
        st.warning(f"Error calculating relevancy scores: {e}")
        df = df.assign(relevancy_score=0)
    return df
109
-
110
def process_gsc_data(df):
    """Reduce raw GSC rows to one row per page: its highest-impression query.

    Only rows ranking worse than position 10 (off page one) are kept. The
    result has the columns page/query/clicks/impressions/ctr/position/
    relevancy_score; a missing relevancy_score column defaults to 0.
    """
    # Keep only queries ranking worse than position 10
    df_filtered = df[df['position'] > 10].copy()

    # Highest-impression rows first
    df_sorted = df_filtered.sort_values(['impressions'], ascending=[False])

    # One row per page: its top-impression query
    df_unique = df_sorted.drop_duplicates(subset='page', keep='first')

    if 'relevancy_score' not in df_unique.columns:
        df_unique['relevancy_score'] = 0
    # BUG FIX: the previous version reassigned the column from
    # df_sorted.groupby('page')['relevancy_score'].first().values, whose rows
    # are ordered alphabetically by page while df_unique is ordered by
    # impressions — misaligning scores across pages. df_unique already holds
    # each page's top-impression score, so no reassignment is needed.

    return df_unique[['page', 'query', 'clicks', 'impressions', 'ctr',
                      'position', 'relevancy_score']]
127
-
128
- # -------------
129
- # Google Authentication Functions
130
- # -------------
131
-
132
def load_config():
    """Build the Google OAuth client configuration dict.

    CLIENT_ID and CLIENT_SECRET are taken from the environment; a missing
    variable raises KeyError immediately rather than failing later during
    the OAuth handshake.
    """
    web_section = {
        "client_id": os.environ["CLIENT_ID"],
        "client_secret": os.environ["CLIENT_SECRET"],
        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
        "token_uri": "https://oauth2.googleapis.com/token",
        "redirect_uris": ["https://poemsforaphrodite-gscpro.hf.space/"],
    }
    return {"web": web_section}
143
-
144
def init_oauth_flow(client_config):
    """Create an OAuth flow limited to read-only Search Console access."""
    redirect_uri = client_config["web"]["redirect_uris"][0]
    return Flow.from_client_config(
        client_config,
        scopes=["https://www.googleapis.com/auth/webmasters.readonly"],
        redirect_uri=redirect_uri,
    )
152
-
153
def google_auth(client_config):
    """Return (flow, auth_url): the OAuth flow plus the consent-screen URL."""
    flow = init_oauth_flow(client_config)
    authorization_url, _state = flow.authorization_url(prompt="consent")
    return flow, authorization_url
157
-
158
def auth_search_console(client_config, credentials):
    """Authenticate the searchconsole client from existing Google credentials.

    The credentials object is flattened into the token dict that
    ``searchconsole.authenticate`` expects.
    """
    token = {
        field: getattr(credentials, field)
        for field in ("token", "refresh_token", "token_uri",
                      "client_id", "client_secret", "scopes")
    }
    # id_token may be absent on some credential objects, hence the default
    token["id_token"] = getattr(credentials, "id_token", None)
    return searchconsole.authenticate(client_config=client_config, credentials=token)
169
-
170
- # -------------
171
- # Data Fetching Functions
172
- # -------------
173
-
174
def list_gsc_properties(credentials):
    """Return the site URLs of all GSC properties the user can access.

    Falls back to a one-item placeholder list when no properties exist, so
    the selectbox downstream always has something to show.
    """
    service = build('webmasters', 'v3', credentials=credentials)
    entries = service.sites().list().execute().get('siteEntry', [])
    urls = [entry['siteUrl'] for entry in entries]
    return urls or ["No properties found"]
178
-
179
def fetch_gsc_data(webproperty, search_type, start_date, end_date, dimensions, device_type=None):
    """Query GSC for the property/date range and return processed rows.

    Returns an empty DataFrame (after surfacing the error to the user) if
    the query fails.
    """
    query = (
        webproperty.query
        .range(start_date, end_date)
        .search_type(search_type)
        .dimension(*dimensions)
    )
    # Device filtering only makes sense when 'device' is a requested dimension
    if 'device' in dimensions and device_type and device_type != 'All Devices':
        query = query.filter('device', 'equals', device_type.lower())
    try:
        raw = query.limit(MAX_ROWS).get().to_dataframe()
        return process_gsc_data(raw)
    except Exception as e:
        show_error(e)
        return pd.DataFrame()
189
-
190
def fetch_data_loading(webproperty, search_type, start_date, end_date, dimensions, device_type=None, model_type='english'):
    """Fetch GSC data, score relevancy, and return the processed DataFrame.

    A spinner is shown for the whole pipeline. Relevancy scoring (network +
    embedding calls) is skipped when no rows came back.
    """
    with st.spinner('Fetching data and calculating relevancy scores...'):
        df = fetch_gsc_data(webproperty, search_type, start_date, end_date, dimensions, device_type)
        if not df.empty:
            df = calculate_relevancy_scores(df, model_type)
        # Re-process so the relevancy column is aligned with the deduplicated rows
        return process_gsc_data(df)
197
-
198
- # -------------
199
- # Utility Functions
200
- # -------------
201
-
202
def update_dimensions(selected_search_type):
    """Return the selectable dimensions; known search types also allow 'device'."""
    if selected_search_type in SEARCH_TYPES:
        return BASE_DIMENSIONS + ['device']
    return BASE_DIMENSIONS
204
-
205
def calc_date_range(selection, custom_start=None, custom_end=None):
    """Translate a date-range label into a (start_date, end_date) pair.

    'Custom Range' uses the provided dates when both are given and falls
    back to the last 7 days otherwise. An unrecognized label yields
    (today, today).
    """
    days_back = {
        'Last 7 Days': 7,
        'Last 30 Days': 30,
        'Last 3 Months': 90,
        'Last 6 Months': 180,
        'Last 12 Months': 365,
        'Last 16 Months': 480,
    }
    today = datetime.date.today()
    if selection == 'Custom Range':
        if custom_start and custom_end:
            return custom_start, custom_end
        return today - datetime.timedelta(days=7), today
    return today - datetime.timedelta(days=days_back.get(selection, 0)), today
221
-
222
def show_error(e):
    """Surface an exception to the user via a Streamlit error box."""
    st.error(f"An error occurred: {e}")
224
-
225
def property_change():
    """Selectbox on_change callback: persist the chosen property in session state."""
    st.session_state.selected_property = st.session_state['selected_property_selector']
227
-
228
- # -------------
229
- # File & Download Operations
230
- # -------------
231
-
232
def show_dataframe(report):
    """Preview the first DF_PREVIEW_ROWS rows of *report* in a collapsible expander."""
    with st.expander("Preview the First 100 Rows (Unique Pages with Top Query)"):
        st.dataframe(report.head(DF_PREVIEW_ROWS))
235
-
236
def download_csv_link(report):
    """Render an HTML link that downloads *report* as a UTF-8 (with BOM) CSV."""
    csv_text = report.to_csv(index=False, encoding='utf-8-sig')
    encoded = base64.b64encode(csv_text.encode()).decode()
    link = f'<a href="data:file/csv;base64,{encoded}" download="search_console_data.csv">Download CSV File</a>'
    st.markdown(link, unsafe_allow_html=True)
243
-
244
- # -------------
245
- # Streamlit UI Components
246
- # -------------
247
-
248
def show_google_sign_in(auth_url):
    """Sidebar sign-in button; clicking it reveals the Google OAuth link."""
    with st.sidebar:
        clicked = st.button("Sign in with Google")
        if clicked:
            st.write('Please click the link below to sign in:')
            st.markdown(f'[Google Sign-In]({auth_url})', unsafe_allow_html=True)
253
-
254
def show_property_selector(properties, account):
    """Selectbox over GSC properties; return the webproperty object chosen.

    Defaults to the previously selected property when it is still in the
    list, otherwise to the first entry.
    """
    if st.session_state.selected_property in properties:
        default_index = properties.index(st.session_state.selected_property)
    else:
        default_index = 0
    selected_property = st.selectbox(
        "Select a Search Console Property:",
        properties,
        index=default_index,
        key='selected_property_selector',
        on_change=property_change,
    )
    return account[selected_property]
264
-
265
def show_search_type_selector():
    """Selectbox for the GSC search type, defaulting to the remembered choice."""
    remembered = SEARCH_TYPES.index(st.session_state.selected_search_type)
    return st.selectbox(
        "Select Search Type:",
        SEARCH_TYPES,
        index=remembered,
        key='search_type_selector',
    )
272
-
273
def show_model_type_selector():
    """Selectbox choosing between the English and multilingual embedding models."""
    model_choices = ["english", "multilingual"]
    return st.selectbox(
        "Select the embedding model:",
        model_choices,
        key='model_type_selector',
    )
279
-
280
def show_date_range_selector():
    """Selectbox for the reporting date range, defaulting to the remembered choice."""
    remembered = DATE_RANGE_OPTIONS.index(st.session_state.selected_date_range)
    return st.selectbox(
        "Select Date Range:",
        DATE_RANGE_OPTIONS,
        index=remembered,
        key='date_range_selector',
    )
287
-
288
def show_custom_date_inputs():
    """Date pickers for a custom range; both values persist in session state."""
    start = st.date_input("Start Date", st.session_state.custom_start_date)
    end = st.date_input("End Date", st.session_state.custom_end_date)
    st.session_state.custom_start_date = start
    st.session_state.custom_end_date = end
291
-
292
def show_dimensions_selector(search_type):
    """Multiselect over the dimensions valid for *search_type*."""
    return st.multiselect(
        "Select Dimensions:",
        update_dimensions(search_type),
        default=st.session_state.selected_dimensions,
        key='dimensions_selector',
    )
300
-
301
def show_paginated_dataframe(report, rows_per_page=20):
    """Render *report* as a paginated HTML table with clickable page URLs.

    Fixes vs. the previous version:
      * works on a copy so the caller's DataFrame (kept in session state)
        is not mutated on every Streamlit rerun;
      * total_pages is clamped to >= 1, so an empty report shows
        "Page 1 of 1" instead of "Page 1 of 0" with a live Next button;
      * current_page is clamped into the valid range in case the report
        shrank between reruns.
    """
    report = report.copy()
    report['position'] = report['position'].astype(int)

    # Wrap each page URL in an anchor tag so it is clickable in the HTML table
    def make_clickable(url):
        return f'<a href="{url}" target="_blank">{url}</a>'

    report['clickable_url'] = report['page'].apply(make_clickable)

    # clickable_url first; rows sorted by impressions, highest first
    columns = ['clickable_url', 'query', 'impressions', 'clicks', 'ctr', 'position', 'relevancy_score']
    report = report[columns].sort_values('impressions', ascending=False)

    total_rows = len(report)
    total_pages = max(1, (total_rows - 1) // rows_per_page + 1)

    if 'current_page' not in st.session_state:
        st.session_state.current_page = 1
    # Clamp in case the data set shrank since the page number was stored
    st.session_state.current_page = min(max(st.session_state.current_page, 1), total_pages)

    col1, col2, col3 = st.columns([1, 3, 1])
    with col1:
        if st.button("Previous", disabled=st.session_state.current_page == 1):
            st.session_state.current_page -= 1
    with col2:
        st.write(f"Page {st.session_state.current_page} of {total_pages}")
    with col3:
        if st.button("Next", disabled=st.session_state.current_page == total_pages):
            st.session_state.current_page += 1

    start_idx = (st.session_state.current_page - 1) * rows_per_page
    end_idx = start_idx + rows_per_page

    # to_html(escape=False) keeps the anchor tags clickable
    st.markdown(report.iloc[start_idx:end_idx].to_html(escape=False, index=False), unsafe_allow_html=True)
336
- # -------------
337
- # Main Streamlit App Function
338
- # -------------
339
-
340
  def main():
341
  setup_streamlit()
342
  client_config = load_config()
@@ -345,7 +6,7 @@ def main():
345
  st.session_state.auth_flow, st.session_state.auth_url = google_auth(client_config)
346
 
347
  # Retrieve query parameters
348
- query_params = st.experimental_get_query_params()
349
  print("Query Parameters:", query_params)
350
 
351
  # Display the query parameters in the Streamlit app for debugging
@@ -377,7 +38,7 @@ def main():
377
  webproperty = show_property_selector(properties, account)
378
  search_type = show_search_type_selector()
379
  date_range_selection = show_date_range_selector()
380
- model_type = show_model_type_selector() # Add this line
381
  if date_range_selection == 'Custom Range':
382
  show_custom_date_inputs()
383
  start_date, end_date = st.session_state.custom_start_date, st.session_state.custom_end_date
@@ -391,7 +52,7 @@ def main():
391
 
392
  if st.button("Fetch Data"):
393
  with st.spinner('Fetching data...'):
394
- st.session_state.report_data = fetch_data_loading(webproperty, search_type, start_date, end_date, selected_dimensions, model_type=model_type) # Update this line
395
 
396
  if st.session_state.report_data is not None and not st.session_state.report_data.empty:
397
  show_paginated_dataframe(st.session_state.report_data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  def main():
2
  setup_streamlit()
3
  client_config = load_config()
 
6
  st.session_state.auth_flow, st.session_state.auth_url = google_auth(client_config)
7
 
8
  # Retrieve query parameters
9
+ query_params = st.query_params
10
  print("Query Parameters:", query_params)
11
 
12
  # Display the query parameters in the Streamlit app for debugging
 
38
  webproperty = show_property_selector(properties, account)
39
  search_type = show_search_type_selector()
40
  date_range_selection = show_date_range_selector()
41
+ model_type = show_model_type_selector()
42
  if date_range_selection == 'Custom Range':
43
  show_custom_date_inputs()
44
  start_date, end_date = st.session_state.custom_start_date, st.session_state.custom_end_date
 
52
 
53
  if st.button("Fetch Data"):
54
  with st.spinner('Fetching data...'):
55
+ st.session_state.report_data = fetch_data_loading(webproperty, search_type, start_date, end_date, selected_dimensions, model_type=model_type)
56
 
57
  if st.session_state.report_data is not None and not st.session_state.report_data.empty:
58
  show_paginated_dataframe(st.session_state.report_data)