Spaces:
Sleeping
Sleeping
poemsforaphrodite
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -2,8 +2,6 @@
|
|
2 |
import datetime
|
3 |
import base64
|
4 |
import os
|
5 |
-
import sys
|
6 |
-
import json
|
7 |
|
8 |
# Related third-party imports
|
9 |
import streamlit as st
|
@@ -16,20 +14,11 @@ import searchconsole
|
|
16 |
import cohere
|
17 |
from sklearn.metrics.pairwise import cosine_similarity
|
18 |
import requests
|
19 |
-
import logging
|
20 |
from bs4 import BeautifulSoup
|
21 |
|
22 |
load_dotenv()
|
23 |
-
#
|
24 |
-
|
25 |
-
level=logging.INFO,
|
26 |
-
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
27 |
-
stream=sys.stdout # This will ensure the logs are captured by Hugging Face
|
28 |
-
)
|
29 |
-
logger = logging.getLogger(__name__)
|
30 |
-
|
31 |
-
# Explicitly set Streamlit's logg
|
32 |
-
st.set_option('deprecation.showfileUploaderEncoding', False)
|
33 |
# Initialize Cohere client
|
34 |
COHERE_API_KEY = os.environ["COHERE_API_KEY"]
|
35 |
co = cohere.Client(COHERE_API_KEY)
|
@@ -105,20 +94,33 @@ def generate_embeddings(text_list, model_type):
|
|
105 |
embeddings = response.embeddings
|
106 |
return embeddings
|
107 |
|
108 |
-
def
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
|
114 |
def process_gsc_data(df):
|
|
|
115 |
df_sorted = df.sort_values(['impressions'], ascending=[False])
|
|
|
|
|
116 |
df_unique = df_sorted.drop_duplicates(subset='page', keep='first')
|
117 |
-
|
118 |
-
|
|
|
|
|
|
|
|
|
|
|
119 |
return result
|
120 |
|
121 |
-
|
122 |
# -------------
|
123 |
# Google Authentication Functions
|
124 |
# -------------
|
@@ -298,40 +300,59 @@ def show_dimensions_selector(search_type):
|
|
298 |
key='dimensions_selector'
|
299 |
)
|
300 |
|
301 |
-
|
302 |
-
|
303 |
-
def show_paginated_dataframe(report, rows_per_page=20, model_type='english'):
|
304 |
-
logger.info("Displaying paginated dataframe")
|
305 |
-
|
306 |
-
# Check if required columns are present
|
307 |
-
required_columns = ['page', 'query', 'clicks', 'impressions', 'ctr', 'position']
|
308 |
-
missing_columns = [col for col in required_columns if col not in report.columns]
|
309 |
-
|
310 |
-
if missing_columns:
|
311 |
-
st.error(f"Error: The following required columns are missing from the data: {', '.join(missing_columns)}")
|
312 |
-
return report
|
313 |
-
|
314 |
report['position'] = report['position'].astype(int)
|
315 |
report['impressions'] = pd.to_numeric(report['impressions'], errors='coerce')
|
316 |
|
|
|
317 |
def format_ctr(x):
|
318 |
try:
|
319 |
return f"{float(x):.2%}"
|
320 |
except ValueError:
|
321 |
-
return x
|
|
|
|
|
|
|
|
|
|
|
|
|
322 |
|
323 |
report['ctr'] = report['ctr'].apply(format_ctr)
|
324 |
-
|
325 |
-
|
|
|
|
|
|
|
326 |
|
327 |
-
|
|
|
|
|
|
|
328 |
report = report[columns]
|
329 |
|
330 |
-
|
|
|
331 |
sort_order = st.radio("Sort order:", ("Descending", "Ascending"))
|
332 |
|
333 |
ascending = sort_order == "Ascending"
|
334 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
335 |
|
336 |
total_rows = len(report)
|
337 |
total_pages = (total_rows - 1) // rows_per_page + 1
|
@@ -352,72 +373,17 @@ def show_paginated_dataframe(report, rows_per_page=20, model_type='english'):
|
|
352 |
start_idx = (st.session_state.current_page - 1) * rows_per_page
|
353 |
end_idx = start_idx + rows_per_page
|
354 |
|
355 |
-
|
356 |
-
|
357 |
-
# Display column headers
|
358 |
-
col1, col2, col3, col4, col5, col6, col7, col8 = st.columns([3, 2, 1, 1, 1, 1, 1, 1])
|
359 |
-
col1.write("**Page**")
|
360 |
-
col2.write("**Query**")
|
361 |
-
col3.write("**Impressions**")
|
362 |
-
col4.write("**Clicks**")
|
363 |
-
col5.write("**CTR**")
|
364 |
-
col6.write("**Position**")
|
365 |
-
col7.write("**Relevancy Score**")
|
366 |
-
col8.write("**Action**")
|
367 |
-
|
368 |
-
# Display data rows
|
369 |
-
for idx, row in page_data.iterrows():
|
370 |
-
col1, col2, col3, col4, col5, col6, col7, col8 = st.columns([3, 2, 1, 1, 1, 1, 1, 1])
|
371 |
-
with col1:
|
372 |
-
st.write(f"[{row['page']}]({row['page']})")
|
373 |
-
with col2:
|
374 |
-
st.write(row['query'])
|
375 |
-
with col3:
|
376 |
-
st.write(row['impressions'])
|
377 |
-
with col4:
|
378 |
-
st.write(row['clicks'])
|
379 |
-
with col5:
|
380 |
-
st.write(row['ctr'])
|
381 |
-
with col6:
|
382 |
-
st.write(row['position'])
|
383 |
-
with col7:
|
384 |
-
st.write(row['relevancy_score'] if row['relevancy_score'] is not None else "N/A")
|
385 |
-
with col8:
|
386 |
-
if st.button("Calculate", key=f"calc_{idx}"):
|
387 |
-
logger.info(f"Calculating relevancy for row index: {start_idx + idx}")
|
388 |
-
try:
|
389 |
-
page_content = fetch_content(row['page'])
|
390 |
-
logger.info(f"Fetched content for {row['page']}: {page_content[:100]}...") # Log the first 100 characters
|
391 |
-
query = row['query']
|
392 |
-
relevancy_score = calculate_single_relevancy_score(page_content, query, model_type)
|
393 |
-
logger.info(f"Relevancy score calculated: {relevancy_score}")
|
394 |
-
report.at[start_idx + idx, 'relevancy_score'] = f"{relevancy_score:.2f}"
|
395 |
-
st.success(f"Relevancy score calculated for row {start_idx + idx + 1}")
|
396 |
-
st.experimental_rerun()
|
397 |
-
except Exception as e:
|
398 |
-
logger.error(f"Error calculating relevancy score: {str(e)}")
|
399 |
-
logger.error(f"Error details: {type(e).__name__}, {str(e)}")
|
400 |
-
st.error(f"Error calculating relevancy score: {str(e)}")
|
401 |
-
if isinstance(e, requests.exceptions.RequestException):
|
402 |
-
st.error(f"Error fetching content from {row['page']}. Please check if the URL is accessible.")
|
403 |
-
elif isinstance(e, json.JSONDecodeError):
|
404 |
-
st.error("Error parsing JSON response. The content might not be in the expected format.")
|
405 |
-
|
406 |
-
return report
|
407 |
-
|
408 |
-
# Make sure to import json at the top of your file
|
409 |
# -------------
|
410 |
# Main Streamlit App Function
|
411 |
# -------------
|
412 |
|
413 |
-
|
414 |
def main():
|
415 |
-
logger.info("Starting the Streamlit app")
|
416 |
setup_streamlit()
|
417 |
client_config = load_config()
|
418 |
|
419 |
if 'auth_flow' not in st.session_state or 'auth_url' not in st.session_state:
|
420 |
-
logger.info("Initializing Google auth flow")
|
421 |
st.session_state.auth_flow, st.session_state.auth_url = google_auth(client_config)
|
422 |
|
423 |
# Directly access query parameters using st.query_params
|
@@ -426,27 +392,23 @@ def main():
|
|
426 |
# Retrieve the 'code' parameter
|
427 |
auth_code = query_params.get("code", None)
|
428 |
|
|
|
429 |
if auth_code and 'credentials' not in st.session_state:
|
430 |
-
logger.info("Fetching token with auth code")
|
431 |
st.session_state.auth_flow.fetch_token(code=auth_code)
|
432 |
st.session_state.credentials = st.session_state.auth_flow.credentials
|
433 |
-
logger.info("Credentials stored in session state")
|
434 |
|
435 |
if 'credentials' not in st.session_state:
|
436 |
-
logger.info("No credentials found, showing Google sign-in")
|
437 |
show_google_sign_in(st.session_state.auth_url)
|
438 |
else:
|
439 |
-
logger.info("Credentials found, initializing session state")
|
440 |
init_session_state()
|
441 |
account = auth_search_console(client_config, st.session_state.credentials)
|
442 |
properties = list_gsc_properties(st.session_state.credentials)
|
443 |
|
444 |
if properties:
|
445 |
-
logger.info(f"Found {len(properties)} properties")
|
446 |
webproperty = show_property_selector(properties, account)
|
447 |
search_type = show_search_type_selector()
|
448 |
date_range_selection = show_date_range_selector()
|
449 |
-
model_type = show_model_type_selector()
|
450 |
if date_range_selection == 'Custom Range':
|
451 |
show_custom_date_inputs()
|
452 |
start_date, end_date = st.session_state.custom_start_date, st.session_state.custom_end_date
|
@@ -460,22 +422,18 @@ def main():
|
|
460 |
|
461 |
if st.button("Fetch Data"):
|
462 |
with st.spinner('Fetching data...'):
|
463 |
-
logger.info(f"Fetching GSC data for {webproperty} from {start_date} to {end_date}")
|
464 |
st.session_state.report_data = fetch_gsc_data(webproperty, search_type, start_date, end_date, selected_dimensions)
|
465 |
-
logger.info(f"Data fetched: {len(st.session_state.report_data)} rows")
|
466 |
|
467 |
if st.session_state.report_data is not None and not st.session_state.report_data.empty:
|
468 |
-
|
469 |
-
|
470 |
-
st.
|
|
|
|
|
471 |
download_csv_link(st.session_state.report_data)
|
472 |
elif st.session_state.report_data is not None:
|
473 |
-
logger.warning("No data found for the selected criteria")
|
474 |
st.warning("No data found for the selected criteria.")
|
475 |
-
else:
|
476 |
-
logger.warning("No properties found for the account")
|
477 |
-
st.warning("No properties found for your Google Search Console account.")
|
478 |
|
|
|
479 |
if __name__ == "__main__":
|
480 |
-
logger.info("Application started")
|
481 |
main()
|
|
|
2 |
import datetime
|
3 |
import base64
|
4 |
import os
|
|
|
|
|
5 |
|
6 |
# Related third-party imports
|
7 |
import streamlit as st
|
|
|
14 |
import cohere
|
15 |
from sklearn.metrics.pairwise import cosine_similarity
|
16 |
import requests
|
|
|
17 |
from bs4 import BeautifulSoup
|
18 |
|
19 |
load_dotenv()
|
20 |
+
#test
|
21 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
# Initialize Cohere client
|
23 |
COHERE_API_KEY = os.environ["COHERE_API_KEY"]
|
24 |
co = cohere.Client(COHERE_API_KEY)
|
|
|
94 |
embeddings = response.embeddings
|
95 |
return embeddings
|
96 |
|
97 |
+
def calculate_relevancy_scores(df, model_type):
    """Return a copy of *df* with a ``relevancy_score`` column added.

    The score of a row is the cosine similarity between the embedding of its
    ``query`` and the embedding of the content fetched from its ``page`` URL.

    Args:
        df: DataFrame with ``page`` (URL) and ``query`` columns.
        model_type: Passed through unchanged to ``generate_embeddings``.

    Returns:
        A new DataFrame with ``relevancy_score`` set per row. If anything
        fails while fetching or embedding, a Streamlit warning is shown and
        every row gets a score of 0 (best-effort behaviour kept from the
        original implementation).
    """
    # Local import keeps this block self-contained; numpy is already a
    # dependency of pandas and scikit-learn.
    import numpy as np

    # Nothing to score: skip the network and embedding calls entirely.
    if df.empty:
        return df.assign(relevancy_score=0)

    try:
        # The same URL usually appears on many rows (one per query), so
        # fetch and embed each distinct page only once.
        unique_pages = list(dict.fromkeys(df['page']))
        page_contents = [fetch_content(url) for url in unique_pages]
        page_embeddings = np.asarray(
            generate_embeddings(page_contents, model_type), dtype=float
        )
        query_embeddings = np.asarray(
            generate_embeddings(df['query'].tolist(), model_type), dtype=float
        )

        # Expand the unique-page embeddings back to one row per input row.
        row_of_page = {url: i for i, url in enumerate(unique_pages)}
        paired_pages = page_embeddings[[row_of_page[url] for url in df['page']]]

        # Row-wise cosine similarity: only the (query_i, page_i) pairs are
        # needed, so avoid building the full n x n similarity matrix.
        dots = np.einsum('ij,ij->i', query_embeddings, paired_pages)
        norms = (
            np.linalg.norm(query_embeddings, axis=1)
            * np.linalg.norm(paired_pages, axis=1)
        )
        norms[norms == 0] = 1.0  # zero vectors score 0 instead of dividing by 0
        df = df.assign(relevancy_score=dots / norms)
    except Exception as e:
        # UI boundary: report the problem and fall back to neutral scores
        # rather than crashing the app.
        st.warning(f"Error calculating relevancy scores: {e}")
        df = df.assign(relevancy_score=0)
    return df
|
108 |
|
109 |
def process_gsc_data(df):
    """Reduce Search Console rows to the top-impression query of each page.

    Rows are sorted by ``impressions`` in descending order and only the first
    row per ``page`` is kept. An existing ``relevancy_score`` column is
    carried through with its row, so each score stays attached to the
    page/query pair it was computed for; when the column is absent it is
    added with the value 0.

    Args:
        df: DataFrame containing at least ``page``, ``query``, ``clicks``,
            ``impressions``, ``ctr`` and ``position``.

    Returns:
        A new DataFrame with the columns ``page``, ``query``, ``clicks``,
        ``impressions``, ``ctr``, ``position`` and ``relevancy_score``.
        The input frame is not modified.
    """
    df_sorted = df.sort_values(['impressions'], ascending=[False])

    # Keep only the highest-impression query for each page. The explicit
    # copy makes the column assignment below write to an independent frame.
    df_unique = df_sorted.drop_duplicates(subset='page', keep='first').copy()

    # drop_duplicates already keeps each surviving row's own score; only a
    # missing column needs a default. (Re-deriving it via groupby would order
    # the values by page key and misalign them with the impressions order.)
    if 'relevancy_score' not in df_unique.columns:
        df_unique['relevancy_score'] = 0

    result = df_unique[['page', 'query', 'clicks', 'impressions', 'ctr', 'position', 'relevancy_score']]
    return result
|
123 |
|
|
|
124 |
# -------------
|
125 |
# Google Authentication Functions
|
126 |
# -------------
|
|
|
300 |
key='dimensions_selector'
|
301 |
)
|
302 |
|
303 |
+
def show_paginated_dataframe(report, rows_per_page=20):
|
304 |
+
# Convert 'position' column to integer and 'impressions' to numeric
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
305 |
report['position'] = report['position'].astype(int)
|
306 |
report['impressions'] = pd.to_numeric(report['impressions'], errors='coerce')
|
307 |
|
308 |
+
# Format CTR as percentage and relevancy_score with two decimal places
|
309 |
def format_ctr(x):
    """Format a click-through rate as a percentage with two decimals.

    Values that cannot be converted to ``float`` are returned unchanged so a
    single bad cell does not abort formatting of the whole column.
    """
    try:
        return f"{float(x):.2%}"
    except (TypeError, ValueError):
        # float("n/a") raises ValueError; float(None) raises TypeError.
        return x
|
314 |
+
|
315 |
+
def format_relevancy_score(x):
    """Format a relevancy score with two decimal places.

    Values that cannot be converted to ``float`` are returned unchanged so a
    single bad cell does not abort formatting of the whole column.
    """
    try:
        return f"{float(x):.2f}"
    except (TypeError, ValueError):
        # float("n/a") raises ValueError; float(None) raises TypeError.
        return x
|
320 |
|
321 |
report['ctr'] = report['ctr'].apply(format_ctr)
|
322 |
+
report['relevancy_score'] = report['relevancy_score'].apply(format_relevancy_score)
|
323 |
+
|
324 |
+
# Create a clickable URL column
|
325 |
+
def make_clickable(url):
    """Return an HTML anchor for *url* that opens in a new tab.

    The anchor is later emitted as raw HTML, so the URL is escaped first:
    a quote, ``<`` or ``&`` in the URL must not be able to terminate the
    ``href`` attribute or inject markup into the page.
    """
    from html import escape  # stdlib; imported locally to keep the block self-contained

    safe_url = escape(str(url), quote=True)
    return f'<a href="{safe_url}" target="_blank">{safe_url}</a>'
|
327 |
|
328 |
+
report['clickable_url'] = report['page'].apply(make_clickable)
|
329 |
+
|
330 |
+
# Reorder columns to put clickable_url first
|
331 |
+
columns = ['clickable_url', 'query', 'impressions', 'clicks', 'ctr', 'position', 'relevancy_score']
|
332 |
report = report[columns]
|
333 |
|
334 |
+
# Add sorting functionality
|
335 |
+
sort_column = st.selectbox("Sort by:", columns[1:], index=columns[1:].index('impressions')) # Set 'impressions' as default
|
336 |
sort_order = st.radio("Sort order:", ("Descending", "Ascending"))
|
337 |
|
338 |
ascending = sort_order == "Ascending"
|
339 |
+
|
340 |
+
# Convert back to numeric for sorting
|
341 |
+
def safe_float_convert(x):
    """Convert a display value back to a number for sorting.

    Strings ending in ``%`` are read as percentages and scaled to a
    fraction (``"12.50%"`` -> ``0.125``); anything else goes through
    ``float``. Values that cannot be converted yield 0 so they still sort.
    """
    try:
        if isinstance(x, str) and x.endswith('%'):
            return float(x.rstrip('%')) / 100
        return float(x)
    except (TypeError, ValueError):
        # ValueError: non-numeric text; TypeError: None or other non-numbers.
        return 0
|
346 |
+
|
347 |
+
report['ctr_numeric'] = report['ctr'].apply(safe_float_convert)
|
348 |
+
report['relevancy_score_numeric'] = report['relevancy_score'].apply(safe_float_convert)
|
349 |
+
|
350 |
+
# Sort using the numeric columns
|
351 |
+
sort_column_numeric = sort_column + '_numeric' if sort_column in ['ctr', 'relevancy_score'] else sort_column
|
352 |
+
report = report.sort_values(by=sort_column_numeric, ascending=ascending)
|
353 |
+
|
354 |
+
# Remove the temporary numeric columns
|
355 |
+
report = report.drop(columns=['ctr_numeric', 'relevancy_score_numeric'])
|
356 |
|
357 |
total_rows = len(report)
|
358 |
total_pages = (total_rows - 1) // rows_per_page + 1
|
|
|
373 |
start_idx = (st.session_state.current_page - 1) * rows_per_page
|
374 |
end_idx = start_idx + rows_per_page
|
375 |
|
376 |
+
# Use st.markdown to display the dataframe with clickable links
|
377 |
+
st.markdown(report.iloc[start_idx:end_idx].to_html(escape=False, index=False), unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
378 |
# -------------
|
379 |
# Main Streamlit App Function
|
380 |
# -------------
|
381 |
|
|
|
382 |
def main():
|
|
|
383 |
setup_streamlit()
|
384 |
client_config = load_config()
|
385 |
|
386 |
if 'auth_flow' not in st.session_state or 'auth_url' not in st.session_state:
|
|
|
387 |
st.session_state.auth_flow, st.session_state.auth_url = google_auth(client_config)
|
388 |
|
389 |
# Directly access query parameters using st.query_params
|
|
|
392 |
# Retrieve the 'code' parameter
|
393 |
auth_code = query_params.get("code", None)
|
394 |
|
395 |
+
|
396 |
if auth_code and 'credentials' not in st.session_state:
|
|
|
397 |
st.session_state.auth_flow.fetch_token(code=auth_code)
|
398 |
st.session_state.credentials = st.session_state.auth_flow.credentials
|
|
|
399 |
|
400 |
if 'credentials' not in st.session_state:
|
|
|
401 |
show_google_sign_in(st.session_state.auth_url)
|
402 |
else:
|
|
|
403 |
init_session_state()
|
404 |
account = auth_search_console(client_config, st.session_state.credentials)
|
405 |
properties = list_gsc_properties(st.session_state.credentials)
|
406 |
|
407 |
if properties:
|
|
|
408 |
webproperty = show_property_selector(properties, account)
|
409 |
search_type = show_search_type_selector()
|
410 |
date_range_selection = show_date_range_selector()
|
411 |
+
model_type = show_model_type_selector() # Add this line
|
412 |
if date_range_selection == 'Custom Range':
|
413 |
show_custom_date_inputs()
|
414 |
start_date, end_date = st.session_state.custom_start_date, st.session_state.custom_end_date
|
|
|
422 |
|
423 |
if st.button("Fetch Data"):
|
424 |
with st.spinner('Fetching data...'):
|
|
|
425 |
st.session_state.report_data = fetch_gsc_data(webproperty, search_type, start_date, end_date, selected_dimensions)
|
|
|
426 |
|
427 |
if st.session_state.report_data is not None and not st.session_state.report_data.empty:
|
428 |
+
st.write("Data fetched successfully. Click the button below to calculate relevancy scores.")
|
429 |
+
|
430 |
+
if st.button("Calculate Relevancy Scores"):
|
431 |
+
st.session_state.report_data = calculate_relevancy_scores(st.session_state.report_data, model_type)
|
432 |
+
show_paginated_dataframe(st.session_state.report_data)
|
433 |
download_csv_link(st.session_state.report_data)
|
434 |
elif st.session_state.report_data is not None:
|
|
|
435 |
st.warning("No data found for the selected criteria.")
|
|
|
|
|
|
|
436 |
|
437 |
+
|
438 |
if __name__ == "__main__":
|
|
|
439 |
main()
|