poemsforaphrodite committed
Commit • 0d6414e
1 Parent(s): ea21800
Update app.py
app.py
CHANGED
@@ -1,6 +1,7 @@
 # Standard library imports
 import datetime
 import base64
+import os
 
 # Related third-party imports
 import streamlit as st
@@ -10,20 +11,19 @@ from googleapiclient.discovery import build
 from dotenv import load_dotenv
 import pandas as pd
 import searchconsole
-import os
 import cohere
 from sklearn.metrics.pairwise import cosine_similarity
 import requests
 from bs4 import BeautifulSoup
 
 load_dotenv()
+
 # Initialize Cohere client
 COHERE_API_KEY = os.environ["COHERE_API_KEY"]
 co = cohere.Client(COHERE_API_KEY)
 
 # Configuration: Set to True if running locally, False if running on Streamlit Cloud
 IS_LOCAL = False
-#==TREs
 
 # Constants
 SEARCH_TYPES = ["web", "image", "video", "news", "discover", "googleNews"]
@@ -46,14 +46,9 @@ DF_PREVIEW_ROWS = 100
 # -------------
 
 def setup_streamlit():
-    """
-    Configures Streamlit's page settings and displays the app title and markdown information.
-    Sets the page layout, title, and markdown content with links and app description.
-    """
     st.set_page_config(page_title="✨ Simple Google Search Console Data | LeeFoot.co.uk", layout="wide")
     st.title("✨ Simple Google Search Console Data | June 2024")
     st.markdown(f"### Lightweight GSC Data Extractor. (Max {MAX_ROWS:,} Rows)")
-
     st.markdown(
         """
         <p>
@@ -65,10 +60,6 @@ def setup_streamlit():
     st.divider()
 
 def init_session_state():
-    """
-    Initialises or updates the Streamlit session state variables for property selection,
-    search type, date range, dimensions, and device type.
-    """
     if 'selected_property' not in st.session_state:
         st.session_state.selected_property = None
     if 'selected_search_type' not in st.session_state:
@@ -88,11 +79,11 @@ def init_session_state():
     if 'custom_end_date' not in st.session_state:
         st.session_state.custom_end_date = datetime.date.today()
 
+# -------------
+# Data Processing Functions
+# -------------
 
 def fetch_content(url):
-    """
-    Fetches the content of a webpage.
-    """
     try:
         response = requests.get(url)
         response.raise_for_status()
@@ -101,66 +92,39 @@ def fetch_content(url):
         return content
     except requests.RequestException as e:
         return str(e)
-
+
 def generate_embeddings(text_list):
-    """
-    Generates embeddings for a list of texts using Cohere's API.
-    """
     if not text_list:
         return []
-
     model = 'embed-english-v3.0'
     input_type = 'search_document'
     response = co.embed(model=model, texts=text_list, input_type=input_type)
     embeddings = response.embeddings
     return embeddings
 
-
 def calculate_relevancy_scores(df):
-    """
-    Calculates relevancy scores for each row in the dataframe.
-    """
     try:
-        st.write("Calculating relevancy scores...")
-        st.write(f"Input DataFrame shape: {df.shape}")
-        st.write(f"Input DataFrame columns: {df.columns}")
-
         page_contents = [fetch_content(url) for url in df['page']]
-        st.write(f"Fetched {len(page_contents)} page contents")
-
         page_embeddings = generate_embeddings(page_contents)
-        st.write(f"Generated {len(page_embeddings)} page embeddings")
-
         query_embeddings = generate_embeddings(df['query'].tolist())
-        st.write(f"Generated {len(query_embeddings)} query embeddings")
-
         relevancy_scores = cosine_similarity(query_embeddings, page_embeddings).diagonal()
-        st.write(f"Calculated {len(relevancy_scores)} relevancy scores")
-        st.write(f"Sample relevancy scores: {relevancy_scores[:5]}")
-
         df = df.assign(relevancy_score=relevancy_scores)
-        st.write(f"Assigned relevancy scores to DataFrame")
-        st.write(f"DataFrame shape after assigning scores: {df.shape}")
-        st.write(f"DataFrame columns after assigning scores: {df.columns}")
-        st.write(f"Sample relevancy scores from DataFrame: {df['relevancy_score'].head()}")
-
     except Exception as e:
         st.warning(f"Error calculating relevancy scores: {e}")
-    df = df.assign(relevancy_score=0)
-
+        df = df.assign(relevancy_score=0)
     return df
-def fetch_data_loading(webproperty, search_type, start_date, end_date, dimensions, device_type=None):
-    """
-    Fetches Google Search Console data with a loading indicator and calculates relevancy scores.
-    """
-    with st.spinner('Fetching data and calculating relevancy scores...'):
-        df = fetch_gsc_data(webproperty, search_type, start_date, end_date, dimensions, device_type)
-        if not df.empty:
-            df = calculate_relevancy_scores(df)
-        st.write(f"Data fetched. Shape: {df.shape}")
-        return df
-# -------------
 
+def process_gsc_data(df):
+    df_sorted = df.sort_values(['page', 'clicks'], ascending=[True, False])
+    df_unique = df_sorted.drop_duplicates(subset='page', keep='first').copy()
+    if 'relevancy_score' not in df_unique.columns:
+        df_unique['relevancy_score'] = 0
+    else:
+        df_unique['relevancy_score'] = df_sorted.groupby('page')['relevancy_score'].first().values
+    result = df_unique[['page', 'query', 'clicks', 'impressions', 'ctr', 'position', 'relevancy_score']]
+    return result
+
+# -------------
 # Google Authentication Functions
 # -------------
 
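Note: calculate_relevancy_scores pairs query i with page i, so only the diagonal of the similarity matrix is needed. A minimal standalone sketch of that technique, using stand-in vectors rather than real Cohere embeddings:

    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    # Stand-in embeddings; the app gets these from co.embed(...).
    query_embeddings = np.array([[0.1, 0.9], [0.8, 0.2]])
    page_embeddings = np.array([[0.2, 0.8], [0.9, 0.1]])

    # The full matrix compares every query with every page; the diagonal
    # keeps only the (query i, page i) pairs the app cares about.
    scores = cosine_similarity(query_embeddings, page_embeddings).diagonal()
    print(scores)  # one relevancy score per row of the report
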
@@ -177,30 +141,20 @@ def load_config():
     return client_config
 
 def init_oauth_flow(client_config):
-    """
-    Initializes the OAuth flow for Google API authentication using the client configuration.
-    Sets the necessary scopes and returns the configured Flow object.
-    """
     scopes = ["https://www.googleapis.com/auth/webmasters.readonly"]
     flow = Flow.from_client_config(
         client_config,
         scopes=scopes,
-        redirect_uri=client_config["web"]["redirect_uris"][0]
+        redirect_uri=client_config["web"]["redirect_uris"][0]
     )
     return flow
+
 def google_auth(client_config):
-    """
-    Starts the Google authentication process using OAuth.
-    Generates and returns the OAuth flow and the authentication URL.
-    """
     flow = init_oauth_flow(client_config)
     auth_url, _ = flow.authorization_url(prompt="consent")
     return flow, auth_url
+
 def auth_search_console(client_config, credentials):
-    """
-    Authenticates the user with the Google Search Console API using provided credentials.
-    Returns an authenticated searchconsole client.
-    """
     token = {
         "token": credentials.token,
         "refresh_token": credentials.refresh_token,
@@ -217,24 +171,14 @@ def auth_search_console(client_config, credentials):
 # -------------
 
 def list_gsc_properties(credentials):
-    """
-    Lists all Google Search Console properties accessible with the given credentials.
-    Returns a list of property URLs or a message if no properties are found.
-    """
     service = build('webmasters', 'v3', credentials=credentials)
     site_list = service.sites().list().execute()
     return [site['siteUrl'] for site in site_list.get('siteEntry', [])] or ["No properties found"]
 
 def fetch_gsc_data(webproperty, search_type, start_date, end_date, dimensions, device_type=None):
-    """
-    Fetches Google Search Console data for a specified property, date range, dimensions, and device type.
-    Handles errors and returns the data as a DataFrame.
-    """
     query = webproperty.query.range(start_date, end_date).search_type(search_type).dimension(*dimensions)
-
     if 'device' in dimensions and device_type and device_type != 'All Devices':
         query = query.filter('device', 'equals', device_type.lower())
-
     try:
         df = query.limit(MAX_ROWS).get().to_dataframe()
         return process_gsc_data(df)
@@ -242,88 +186,22 @@ def fetch_gsc_data(webproperty, search_type, start_date, end_date, dimensions, d
         show_error(e)
         return pd.DataFrame()
 
-def process_gsc_data(df):
-    """
-    Processes the GSC data to return only unique pages with their first query and relevancy score.
-    """
-    st.write("Processing GSC data...")
-    st.write(f"Input DataFrame shape: {df.shape}")
-    st.write(f"Input DataFrame columns: {df.columns}")
-
-    # Sort the dataframe by page and clicks (descending) to get the most relevant query first
-    df_sorted = df.sort_values(['page', 'clicks'], ascending=[True, False])
-
-    # Get the first occurrence of each page (which will be the one with the highest clicks)
-    df_unique = df_sorted.drop_duplicates(subset='page', keep='first').copy()
-
-    st.write(f"Unique pages DataFrame shape: {df_unique.shape}")
-    st.write(f"Unique pages DataFrame columns: {df_unique.columns}")
-
-    # Ensure 'relevancy_score' column exists and is preserved
-    if 'relevancy_score' not in df_unique.columns:
-        st.write("Relevancy score column not found, adding default values")
-        df_unique['relevancy_score'] = 0  # Default value if column doesn't exist
-    else:
-        st.write("Preserving relevancy scores")
-        # Make sure to keep the original relevancy scores
-        df_unique['relevancy_score'] = df_sorted.groupby('page')['relevancy_score'].first().values
-
-    # Select only the relevant columns, including the relevancy_score
-    result = df_unique[['page', 'query', 'clicks', 'impressions', 'ctr', 'position', 'relevancy_score']]
-
-    st.write(f"Processed data. Shape: {result.shape}")
-    st.write(f"Columns: {result.columns}")
-    st.write(f"Sample relevancy scores: {result['relevancy_score'].head()}")
-
-    return result
-
-
 def fetch_data_loading(webproperty, search_type, start_date, end_date, dimensions, device_type=None):
-    """
-    Fetches Google Search Console data with a loading indicator and calculates relevancy scores.
-    """
     with st.spinner('Fetching data and calculating relevancy scores...'):
         df = fetch_gsc_data(webproperty, search_type, start_date, end_date, dimensions, device_type)
-        st.write(f"Data fetched. Shape: {df.shape}")
-        st.write(f"Columns: {df.columns}")
-
         if not df.empty:
             df = calculate_relevancy_scores(df)
-            st.write("Relevancy scores calculated.")
-            st.write(f"DataFrame shape after calculating scores: {df.shape}")
-            st.write(f"DataFrame columns after calculating scores: {df.columns}")
-            st.write(f"Sample relevancy scores after calculation: {df['relevancy_score'].head()}")
-
         processed_df = process_gsc_data(df)
-        st.write("Data processed.")
-        st.write(f"Final DataFrame shape: {processed_df.shape}")
-        st.write(f"Final DataFrame columns: {processed_df.columns}")
-        st.write(f"Final sample relevancy scores: {processed_df['relevancy_score'].head()}")
-
         return processed_df
-    """
-    Fetches Google Search Console data with a loading indicator. Utilises 'fetch_gsc_data' for data retrieval.
-    Returns the fetched data as a DataFrame.
-    """
-    with st.spinner('Fetching data...'):
-        return fetch_gsc_data(webproperty, search_type, start_date, end_date, dimensions, device_type)
 
 # -------------
 # Utility Functions
 # -------------
 
 def update_dimensions(selected_search_type):
-    """
-    Updates and returns the list of dimensions based on the selected search type.
-    Adds 'device' to dimensions if the search type requires it.
-    """
     return BASE_DIMENSIONS + ['device'] if selected_search_type in SEARCH_TYPES else BASE_DIMENSIONS
 
 def calc_date_range(selection, custom_start=None, custom_end=None):
-    """
-    Calculates the date range based on the selected range option.
-    Returns the start and end dates for the specified range.
-    """
     range_map = {
         'Last 7 Days': 7,
         'Last 30 Days': 30,
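Note: process_gsc_data (moved earlier in the file by this commit) reduces the report to one row per page, keeping the top-clicks query. A small sketch of the sort-then-dedupe idiom with made-up data:

    import pandas as pd

    df = pd.DataFrame({
        'page': ['/a', '/a', '/b'],
        'query': ['q1', 'q2', 'q3'],
        'clicks': [5, 9, 3],
    })

    # Sort clicks descending within each page, then keep the first row per
    # page: /a keeps q2 (9 clicks), /b keeps q3.
    df_sorted = df.sort_values(['page', 'clicks'], ascending=[True, False])
    df_unique = df_sorted.drop_duplicates(subset='page', keep='first')
    print(df_unique)
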
@@ -341,17 +219,9 @@ def calc_date_range(selection, custom_start=None, custom_end=None):
     return today - datetime.timedelta(days=range_map.get(selection, 0)), today
 
 def show_error(e):
-    """
-    Displays an error message in the Streamlit app.
-    Formats and shows the provided error 'e'.
-    """
     st.error(f"An error occurred: {e}")
 
 def property_change():
-    """
-    Updates the 'selected_property' in the Streamlit session state.
-    Triggered on change of the property selection.
-    """
     st.session_state.selected_property = st.session_state['selected_property_selector']
 
 # -------------
@@ -359,19 +229,12 @@ def property_change():
 # -------------
 
 def show_dataframe(report):
-    """
-    Shows a preview of the first 100 rows of the processed report DataFrame in an expandable section.
-    """
     with st.expander("Preview the First 100 Rows (Unique Pages with Top Query)"):
         st.dataframe(report.head(DF_PREVIEW_ROWS))
 
 def download_csv_link(report):
-    """
-    Generates and displays a download link for the report DataFrame in CSV format.
-    """
     def to_csv(df):
         return df.to_csv(index=False, encoding='utf-8-sig')
-
     csv = to_csv(report)
     b64_csv = base64.b64encode(csv.encode()).decode()
     href = f'<a href="data:file/csv;base64,{b64_csv}" download="search_console_data.csv">Download CSV File</a>'
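Note: download_csv_link serves the CSV without a server round-trip by embedding it, base64-encoded, in a data: URI. A self-contained sketch with a hypothetical two-row frame:

    import base64
    import pandas as pd

    csv = pd.DataFrame({'x': [1, 2]}).to_csv(index=False, encoding='utf-8-sig')
    b64_csv = base64.b64encode(csv.encode()).decode()

    # The app renders this anchor with st.markdown(..., unsafe_allow_html=True).
    href = f'<a href="data:file/csv;base64,{b64_csv}" download="data.csv">Download CSV File</a>'
    print(href[:60])
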
@@ -382,20 +245,12 @@ def download_csv_link(report):
 # -------------
 
 def show_google_sign_in(auth_url):
-    """
-    Displays the Google sign-in button and authentication URL in the Streamlit sidebar.
-    """
     with st.sidebar:
         if st.button("Sign in with Google"):
-            # Open the authentication URL
             st.write('Please click the link below to sign in:')
             st.markdown(f'[Google Sign-In]({auth_url})', unsafe_allow_html=True)
 
 def show_property_selector(properties, account):
-    """
-    Displays a dropdown selector for Google Search Console properties.
-    Returns the selected property's webproperty object.
-    """
     selected_property = st.selectbox(
         "Select a Search Console Property:",
         properties,
@@ -407,10 +262,6 @@ def show_property_selector(properties, account):
     return account[selected_property]
 
 def show_search_type_selector():
-    """
-    Displays a dropdown selector for choosing the search type.
-    Returns the selected search type.
-    """
     return st.selectbox(
         "Select Search Type:",
         SEARCH_TYPES,
@@ -419,10 +270,6 @@ def show_search_type_selector():
     )
 
 def show_date_range_selector():
-    """
-    Displays a dropdown selector for choosing the date range.
-    Returns the selected date range option.
-    """
     return st.selectbox(
         "Select Date Range:",
         DATE_RANGE_OPTIONS,
@@ -431,18 +278,10 @@ def show_date_range_selector():
     )
 
 def show_custom_date_inputs():
-    """
-    Displays date input fields for custom date range selection.
-    Updates session state with the selected dates.
-    """
     st.session_state.custom_start_date = st.date_input("Start Date", st.session_state.custom_start_date)
     st.session_state.custom_end_date = st.date_input("End Date", st.session_state.custom_end_date)
 
 def show_dimensions_selector(search_type):
-    """
-    Displays a multi-select box for choosing dimensions based on the selected search type.
-    Returns the selected dimensions.
-    """
     available_dimensions = update_dimensions(search_type)
     return st.multiselect(
         "Select Dimensions:",
@@ -451,32 +290,24 @@ def show_dimensions_selector(search_type):
         key='dimensions_selector'
     )
 
-def show_fetch_data_button(webproperty, search_type, start_date, end_date, selected_dimensions):
-    """
-    Displays a button to fetch data based on selected parameters.
-    Shows the report DataFrame and download link upon successful data fetching.
-    """
-    if st.button("Fetch Data"):
-        report = fetch_data_loading(webproperty, search_type, start_date, end_date, selected_dimensions)
-
-        if report is not None and not report.empty:
-            show_dataframe(report)
-            download_csv_link(report)
-        else:
-            st.warning("No data found for the selected criteria.")
-
-
-
 def show_paginated_dataframe(report, rows_per_page=20):
-    """
-    Displays the DataFrame with custom pagination.
-    """
     total_rows = len(report)
     total_pages = (total_rows - 1) // rows_per_page + 1
 
-
+    if 'current_page' not in st.session_state:
+        st.session_state.current_page = 1
 
-
+    col1, col2, col3 = st.columns([1,3,1])
+    with col1:
+        if st.button("Previous", disabled=st.session_state.current_page == 1):
+            st.session_state.current_page -= 1
+    with col2:
+        st.write(f"Page {st.session_state.current_page} of {total_pages}")
+    with col3:
+        if st.button("Next", disabled=st.session_state.current_page == total_pages):
+            st.session_state.current_page += 1
+
+    start_idx = (st.session_state.current_page - 1) * rows_per_page
     end_idx = start_idx + rows_per_page
     st.dataframe(report.iloc[start_idx:end_idx])
 
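Note: the new pagination keeps a 1-based page number in session state and maps it to a half-open row slice. The arithmetic, with stand-in numbers:

    rows_per_page = 20
    total_rows = 45

    # Ceiling division: 45 rows at 20 per page -> 3 pages.
    total_pages = (total_rows - 1) // rows_per_page + 1

    current_page = 3  # stand-in for st.session_state.current_page
    start_idx = (current_page - 1) * rows_per_page  # 40
    end_idx = start_idx + rows_per_page             # 60; .iloc clips to the end
    print(total_pages, start_idx, end_idx)
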
@@ -487,16 +318,18 @@ def show_paginated_dataframe(report, rows_per_page=20):
 def main():
     setup_streamlit()
     client_config = load_config()
-
+
+    if 'auth_flow' not in st.session_state or 'auth_url' not in st.session_state:
+        st.session_state.auth_flow, st.session_state.auth_url = google_auth(client_config)
 
     query_params = st.experimental_get_query_params()
     auth_code = query_params.get("code", [None])[0]
 
-    if auth_code and not st.session_state.credentials:
+    if auth_code and 'credentials' not in st.session_state:
         st.session_state.auth_flow.fetch_token(code=auth_code)
         st.session_state.credentials = st.session_state.auth_flow.credentials
 
-    if not st.session_state.credentials:
+    if 'credentials' not in st.session_state:
         show_google_sign_in(st.session_state.auth_url)
     else:
         init_session_state()
@@ -516,13 +349,19 @@ def main():
 
         selected_dimensions = show_dimensions_selector(search_type)
 
-
-
+        if 'report_data' not in st.session_state:
+            st.session_state.report_data = None
+
+        if st.button("Fetch Data"):
+            with st.spinner('Fetching data...'):
+                st.session_state.report_data = fetch_data_loading(webproperty, search_type, start_date, end_date, selected_dimensions)
 
-
-
-
-
+        if st.session_state.report_data is not None and not st.session_state.report_data.empty:
+            show_paginated_dataframe(st.session_state.report_data)
+            download_csv_link(st.session_state.report_data)
+        elif st.session_state.report_data is not None:
+            st.warning("No data found for the selected criteria.")
 
+
 if __name__ == "__main__":
     main()
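Note: the main() changes follow Streamlit's guard-in-session-state pattern: the script reruns on every widget interaction, so the OAuth flow and the fetched report are created once and reused across reruns. A generic sketch of the pattern, with a hypothetical key name:

    import streamlit as st

    if 'oauth_flow' not in st.session_state:
        # Built once per browser session; survives script reruns.
        st.session_state.oauth_flow = object()  # stand-in for google_auth(...)

    st.write(st.session_state.oauth_flow)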