poemsforaphrodite committed on
Commit
3a8e960
·
verified ·
1 Parent(s): 3dc03f5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -113
app.py CHANGED
@@ -2,8 +2,6 @@
2
  import datetime
3
  import base64
4
  import os
5
- import sys
6
- import json
7
 
8
  # Related third-party imports
9
  import streamlit as st
@@ -16,20 +14,11 @@ import searchconsole
16
  import cohere
17
  from sklearn.metrics.pairwise import cosine_similarity
18
  import requests
19
- import logging
20
  from bs4 import BeautifulSoup
21
 
22
  load_dotenv()
23
- # Set up logging
24
- logging.basicConfig(
25
- level=logging.INFO,
26
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
27
- stream=sys.stdout # This will ensure the logs are captured by Hugging Face
28
- )
29
- logger = logging.getLogger(__name__)
30
-
31
- # Explicitly set Streamlit's logg
32
- st.set_option('deprecation.showfileUploaderEncoding', False)
33
  # Initialize Cohere client
34
  COHERE_API_KEY = os.environ["COHERE_API_KEY"]
35
  co = cohere.Client(COHERE_API_KEY)
@@ -105,20 +94,33 @@ def generate_embeddings(text_list, model_type):
105
  embeddings = response.embeddings
106
  return embeddings
107
 
108
def calculate_single_relevancy_score(page_content, query, model_type):
    """Return the cosine similarity between one page's text and one query.

    Each text is embedded on its own through ``generate_embeddings`` (the
    page first, then the query), and the two resulting vectors are compared
    with ``cosine_similarity``.

    Args:
        page_content: Text of the page to score.
        query: Search query to compare the page against.
        model_type: Forwarded unchanged to ``generate_embeddings``.

    Returns:
        The single similarity value taken from the 1x1 result matrix.
    """
    # Generator unpacking keeps the original call order: page, then query.
    page_vector, query_vector = (
        generate_embeddings([text], model_type)[0]
        for text in (page_content, query)
    )
    similarity_matrix = cosine_similarity([query_vector], [page_vector])
    return similarity_matrix[0][0]
 
 
 
 
 
 
113
 
114
def process_gsc_data(df):
    """Reduce Search Console rows to the top-impression query for each page.

    Args:
        df: DataFrame containing at least the columns ``page``, ``query``,
            ``clicks``, ``impressions``, ``ctr`` and ``position``.

    Returns:
        A new DataFrame with one row per distinct ``page`` (the row with the
        highest ``impressions``), restricted to the columns listed above plus
        a ``relevancy_score`` column initialised to ``None``. The input frame
        is not modified.
    """
    df_sorted = df.sort_values(['impressions'], ascending=[False])
    # After the descending sort, keep='first' retains the highest-impression
    # row of every page.
    df_unique = df_sorted.drop_duplicates(subset='page', keep='first')
    report_columns = ['page', 'query', 'clicks', 'impressions', 'ctr', 'position']
    # .assign builds a fresh frame, so the new column is never written into a
    # slice of df_unique (avoids pandas' chained-assignment warning).
    return df_unique[report_columns].assign(relevancy_score=None)
120
 
121
-
122
  # -------------
123
  # Google Authentication Functions
124
  # -------------
@@ -298,40 +300,59 @@ def show_dimensions_selector(search_type):
298
  key='dimensions_selector'
299
  )
300
 
301
-
302
-
303
- def show_paginated_dataframe(report, rows_per_page=20, model_type='english'):
304
- logger.info("Displaying paginated dataframe")
305
-
306
- # Check if required columns are present
307
- required_columns = ['page', 'query', 'clicks', 'impressions', 'ctr', 'position']
308
- missing_columns = [col for col in required_columns if col not in report.columns]
309
-
310
- if missing_columns:
311
- st.error(f"Error: The following required columns are missing from the data: {', '.join(missing_columns)}")
312
- return report
313
-
314
  report['position'] = report['position'].astype(int)
315
  report['impressions'] = pd.to_numeric(report['impressions'], errors='coerce')
316
 
 
317
def format_ctr(x):
    """Format a click-through rate as a percentage string with two decimals.

    Args:
        x: A number, or a string parseable by ``float`` (e.g. ``0.1234``).

    Returns:
        A string such as ``"12.34%"``, or ``x`` unchanged when it cannot be
        converted to a float.
    """
    try:
        ratio = float(x)
    except (TypeError, ValueError):
        # float(None) raises TypeError, not ValueError; both mean "not a
        # number", so the value is passed through untouched.
        return x
    return f"{ratio:.2%}"
 
 
 
 
 
 
322
 
323
  report['ctr'] = report['ctr'].apply(format_ctr)
324
- if 'relevancy_score' not in report.columns:
325
- report['relevancy_score'] = None
 
 
 
326
 
327
- columns = ['page', 'query', 'impressions', 'clicks', 'ctr', 'position', 'relevancy_score']
 
 
 
328
  report = report[columns]
329
 
330
- sort_column = st.selectbox("Sort by:", columns, index=columns.index('impressions'))
 
331
  sort_order = st.radio("Sort order:", ("Descending", "Ascending"))
332
 
333
  ascending = sort_order == "Ascending"
334
- report = report.sort_values(by=sort_column, ascending=ascending)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
 
336
  total_rows = len(report)
337
  total_pages = (total_rows - 1) // rows_per_page + 1
@@ -352,72 +373,17 @@ def show_paginated_dataframe(report, rows_per_page=20, model_type='english'):
352
  start_idx = (st.session_state.current_page - 1) * rows_per_page
353
  end_idx = start_idx + rows_per_page
354
 
355
- page_data = report.iloc[start_idx:end_idx].reset_index(drop=True)
356
-
357
- # Display column headers
358
- col1, col2, col3, col4, col5, col6, col7, col8 = st.columns([3, 2, 1, 1, 1, 1, 1, 1])
359
- col1.write("**Page**")
360
- col2.write("**Query**")
361
- col3.write("**Impressions**")
362
- col4.write("**Clicks**")
363
- col5.write("**CTR**")
364
- col6.write("**Position**")
365
- col7.write("**Relevancy Score**")
366
- col8.write("**Action**")
367
-
368
- # Display data rows
369
- for idx, row in page_data.iterrows():
370
- col1, col2, col3, col4, col5, col6, col7, col8 = st.columns([3, 2, 1, 1, 1, 1, 1, 1])
371
- with col1:
372
- st.write(f"[{row['page']}]({row['page']})")
373
- with col2:
374
- st.write(row['query'])
375
- with col3:
376
- st.write(row['impressions'])
377
- with col4:
378
- st.write(row['clicks'])
379
- with col5:
380
- st.write(row['ctr'])
381
- with col6:
382
- st.write(row['position'])
383
- with col7:
384
- st.write(row['relevancy_score'] if row['relevancy_score'] is not None else "N/A")
385
- with col8:
386
- if st.button("Calculate", key=f"calc_{idx}"):
387
- logger.info(f"Calculating relevancy for row index: {start_idx + idx}")
388
- try:
389
- page_content = fetch_content(row['page'])
390
- logger.info(f"Fetched content for {row['page']}: {page_content[:100]}...") # Log the first 100 characters
391
- query = row['query']
392
- relevancy_score = calculate_single_relevancy_score(page_content, query, model_type)
393
- logger.info(f"Relevancy score calculated: {relevancy_score}")
394
- report.at[start_idx + idx, 'relevancy_score'] = f"{relevancy_score:.2f}"
395
- st.success(f"Relevancy score calculated for row {start_idx + idx + 1}")
396
- st.experimental_rerun()
397
- except Exception as e:
398
- logger.error(f"Error calculating relevancy score: {str(e)}")
399
- logger.error(f"Error details: {type(e).__name__}, {str(e)}")
400
- st.error(f"Error calculating relevancy score: {str(e)}")
401
- if isinstance(e, requests.exceptions.RequestException):
402
- st.error(f"Error fetching content from {row['page']}. Please check if the URL is accessible.")
403
- elif isinstance(e, json.JSONDecodeError):
404
- st.error("Error parsing JSON response. The content might not be in the expected format.")
405
-
406
- return report
407
-
408
- # Make sure to import json at the top of your file
409
  # -------------
410
  # Main Streamlit App Function
411
  # -------------
412
 
413
-
414
  def main():
415
- logger.info("Starting the Streamlit app")
416
  setup_streamlit()
417
  client_config = load_config()
418
 
419
  if 'auth_flow' not in st.session_state or 'auth_url' not in st.session_state:
420
- logger.info("Initializing Google auth flow")
421
  st.session_state.auth_flow, st.session_state.auth_url = google_auth(client_config)
422
 
423
  # Directly access query parameters using st.query_params
@@ -426,27 +392,23 @@ def main():
426
  # Retrieve the 'code' parameter
427
  auth_code = query_params.get("code", None)
428
 
 
429
  if auth_code and 'credentials' not in st.session_state:
430
- logger.info("Fetching token with auth code")
431
  st.session_state.auth_flow.fetch_token(code=auth_code)
432
  st.session_state.credentials = st.session_state.auth_flow.credentials
433
- logger.info("Credentials stored in session state")
434
 
435
  if 'credentials' not in st.session_state:
436
- logger.info("No credentials found, showing Google sign-in")
437
  show_google_sign_in(st.session_state.auth_url)
438
  else:
439
- logger.info("Credentials found, initializing session state")
440
  init_session_state()
441
  account = auth_search_console(client_config, st.session_state.credentials)
442
  properties = list_gsc_properties(st.session_state.credentials)
443
 
444
  if properties:
445
- logger.info(f"Found {len(properties)} properties")
446
  webproperty = show_property_selector(properties, account)
447
  search_type = show_search_type_selector()
448
  date_range_selection = show_date_range_selector()
449
- model_type = show_model_type_selector()
450
  if date_range_selection == 'Custom Range':
451
  show_custom_date_inputs()
452
  start_date, end_date = st.session_state.custom_start_date, st.session_state.custom_end_date
@@ -460,22 +422,18 @@ def main():
460
 
461
  if st.button("Fetch Data"):
462
  with st.spinner('Fetching data...'):
463
- logger.info(f"Fetching GSC data for {webproperty} from {start_date} to {end_date}")
464
  st.session_state.report_data = fetch_gsc_data(webproperty, search_type, start_date, end_date, selected_dimensions)
465
- logger.info(f"Data fetched: {len(st.session_state.report_data)} rows")
466
 
467
  if st.session_state.report_data is not None and not st.session_state.report_data.empty:
468
- logger.info("Displaying fetched data")
469
- st.write("Data fetched successfully. Click the 'Calculate' button in the Relevancy Score column to calculate the score for each row.")
470
- st.session_state.report_data = show_paginated_dataframe(st.session_state.report_data, model_type=model_type)
 
 
471
  download_csv_link(st.session_state.report_data)
472
  elif st.session_state.report_data is not None:
473
- logger.warning("No data found for the selected criteria")
474
  st.warning("No data found for the selected criteria.")
475
- else:
476
- logger.warning("No properties found for the account")
477
- st.warning("No properties found for your Google Search Console account.")
478
 
 
479
  if __name__ == "__main__":
480
- logger.info("Application started")
481
  main()
 
2
  import datetime
3
  import base64
4
  import os
 
 
5
 
6
  # Related third-party imports
7
  import streamlit as st
 
14
  import cohere
15
  from sklearn.metrics.pairwise import cosine_similarity
16
  import requests
 
17
  from bs4 import BeautifulSoup
18
 
19
  load_dotenv()
20
+ #test
21
+
 
 
 
 
 
 
 
 
22
  # Initialize Cohere client
23
  COHERE_API_KEY = os.environ["COHERE_API_KEY"]
24
  co = cohere.Client(COHERE_API_KEY)
 
94
  embeddings = response.embeddings
95
  return embeddings
96
 
97
def calculate_relevancy_scores(df, model_type):
    """Add a ``relevancy_score`` column with per-row query/page similarity.

    For each row, the value of ``page`` is passed to ``fetch_content`` and the
    returned content is embedded; the ``query`` text is embedded as well. The
    score of a row is the cosine similarity between its query embedding and
    its page embedding.

    Args:
        df: DataFrame with ``page`` and ``query`` columns.
        model_type: Forwarded unchanged to ``generate_embeddings``.

    Returns:
        A new DataFrame with a ``relevancy_score`` column. If anything fails
        while fetching or embedding, a warning is shown in the Streamlit UI
        and every score is set to 0 (best-effort behaviour).
    """
    # Local import keeps this block self-contained.
    import numpy as np

    # Nothing to score: skip the fetch/embedding calls entirely.
    if df.empty:
        return df.assign(relevancy_score=0)

    try:
        page_contents = [fetch_content(url) for url in df['page']]
        page_embeddings = np.asarray(
            generate_embeddings(page_contents, model_type), dtype=float
        )
        query_embeddings = np.asarray(
            generate_embeddings(df['query'].tolist(), model_type), dtype=float
        )

        # Only the pairing (query_i, page_i) is needed, so compute the
        # similarities row by row instead of building the full N x N matrix
        # and discarding everything but its diagonal.
        dot_products = np.einsum('ij,ij->i', query_embeddings, page_embeddings)
        norms = (
            np.linalg.norm(query_embeddings, axis=1)
            * np.linalg.norm(page_embeddings, axis=1)
        )
        # A zero vector yields a zero dot product; dividing by 1 keeps the
        # score at 0 instead of producing NaN.
        norms[norms == 0] = 1.0
        df = df.assign(relevancy_score=dot_products / norms)
    except Exception as e:  # deliberate catch-all at the UI boundary
        st.warning(f"Error calculating relevancy scores: {e}")
        df = df.assign(relevancy_score=0)
    return df
108
 
109
def process_gsc_data(df):
    """Reduce Search Console rows to the top-impression query for each page.

    Args:
        df: DataFrame containing at least the columns ``page``, ``query``,
            ``clicks``, ``impressions``, ``ctr`` and ``position``; a
            ``relevancy_score`` column is optional.

    Returns:
        A new DataFrame with one row per distinct ``page`` (the row with the
        highest ``impressions``) and the columns listed above plus
        ``relevancy_score``. When the input has no ``relevancy_score`` column
        it is filled with 0; otherwise each page receives the first non-null
        score found among its rows in descending-impression order. The input
        frame is not modified.
    """
    df_sorted = df.sort_values(['impressions'], ascending=[False])

    # Keep only the highest impression query for each page. The explicit copy
    # makes the column assignment below a write to an independent frame
    # rather than to a slice of df_sorted.
    df_unique = df_sorted.drop_duplicates(subset='page', keep='first').copy()

    if 'relevancy_score' not in df_unique.columns:
        df_unique['relevancy_score'] = 0
    else:
        # groupby() returns its groups ordered by page name, while df_unique
        # is ordered by impressions. Look the score up by page so that every
        # row gets its own page's value instead of relying on position.
        first_scores = df_sorted.groupby('page')['relevancy_score'].first()
        df_unique['relevancy_score'] = df_unique['page'].map(first_scores)

    result = df_unique[['page', 'query', 'clicks', 'impressions', 'ctr', 'position', 'relevancy_score']]
    return result
123
 
 
124
  # -------------
125
  # Google Authentication Functions
126
  # -------------
 
300
  key='dimensions_selector'
301
  )
302
 
303
+ def show_paginated_dataframe(report, rows_per_page=20):
304
+ # Convert 'position' column to integer and 'impressions' to numeric
 
 
 
 
 
 
 
 
 
 
 
305
  report['position'] = report['position'].astype(int)
306
  report['impressions'] = pd.to_numeric(report['impressions'], errors='coerce')
307
 
308
+ # Format CTR as percentage and relevancy_score with two decimal places
309
def format_ctr(x):
    """Format a click-through rate as a percentage string with two decimals.

    Args:
        x: A number, or a string parseable by ``float`` (e.g. ``0.1234``).

    Returns:
        A string such as ``"12.34%"``, or ``x`` unchanged when it cannot be
        converted to a float.
    """
    try:
        ratio = float(x)
    except (TypeError, ValueError):
        # float(None) raises TypeError, not ValueError; both mean "not a
        # number", so the value is passed through untouched.
        return x
    return f"{ratio:.2%}"
314
+
315
def format_relevancy_score(x):
    """Format a relevancy score as a string with two decimal places.

    Args:
        x: A number, or a string parseable by ``float``.

    Returns:
        A string such as ``"0.50"``, or ``x`` unchanged when it cannot be
        converted to a float.
    """
    try:
        score = float(x)
    except (TypeError, ValueError):
        # float(None) raises TypeError, not ValueError; both mean "not a
        # number", so the value is passed through untouched.
        return x
    return f"{score:.2f}"
320
 
321
  report['ctr'] = report['ctr'].apply(format_ctr)
322
+ report['relevancy_score'] = report['relevancy_score'].apply(format_relevancy_score)
323
+
324
+ # Create a clickable URL column
325
def make_clickable(url):
    """Return an HTML anchor that opens ``url`` in a new browser tab.

    The URL is HTML-escaped (quotes included) before being placed in the
    ``href`` attribute and the link text, so characters such as ``&``, ``"``
    or ``<`` cannot break out of the tag when the result is rendered as raw
    HTML.

    Args:
        url: The address to link to; it is converted with ``str`` first.

    Returns:
        A string of the form ``<a href="..." target="_blank">...</a>``.
    """
    # Local import keeps this helper self-contained.
    from html import escape

    safe_url = escape(str(url), quote=True)
    return f'<a href="{safe_url}" target="_blank">{safe_url}</a>'
327
 
328
+ report['clickable_url'] = report['page'].apply(make_clickable)
329
+
330
+ # Reorder columns to put clickable_url first
331
+ columns = ['clickable_url', 'query', 'impressions', 'clicks', 'ctr', 'position', 'relevancy_score']
332
  report = report[columns]
333
 
334
+ # Add sorting functionality
335
+ sort_column = st.selectbox("Sort by:", columns[1:], index=columns[1:].index('impressions')) # Set 'impressions' as default
336
  sort_order = st.radio("Sort order:", ("Descending", "Ascending"))
337
 
338
  ascending = sort_order == "Ascending"
339
+
340
+ # Convert back to numeric for sorting
341
def safe_float_convert(x):
    """Convert a display value back to a number for sorting.

    Args:
        x: A number, a numeric string, or a percentage string ending in
            ``%`` (e.g. ``"50.00%"``).

    Returns:
        For strings ending in ``%``, the numeric part divided by 100;
        otherwise ``float(x)``. Returns 0 when the value cannot be converted.
    """
    try:
        if isinstance(x, str) and x.endswith('%'):
            return float(x.rstrip('%')) / 100
        return float(x)
    except (TypeError, ValueError):
        # TypeError covers non-numeric objects such as None, which float()
        # rejects with a different exception than malformed strings.
        return 0
346
+
347
+ report['ctr_numeric'] = report['ctr'].apply(safe_float_convert)
348
+ report['relevancy_score_numeric'] = report['relevancy_score'].apply(safe_float_convert)
349
+
350
+ # Sort using the numeric columns
351
+ sort_column_numeric = sort_column + '_numeric' if sort_column in ['ctr', 'relevancy_score'] else sort_column
352
+ report = report.sort_values(by=sort_column_numeric, ascending=ascending)
353
+
354
+ # Remove the temporary numeric columns
355
+ report = report.drop(columns=['ctr_numeric', 'relevancy_score_numeric'])
356
 
357
  total_rows = len(report)
358
  total_pages = (total_rows - 1) // rows_per_page + 1
 
373
  start_idx = (st.session_state.current_page - 1) * rows_per_page
374
  end_idx = start_idx + rows_per_page
375
 
376
+ # Use st.markdown to display the dataframe with clickable links
377
+ st.markdown(report.iloc[start_idx:end_idx].to_html(escape=False, index=False), unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
  # -------------
379
  # Main Streamlit App Function
380
  # -------------
381
 
 
382
  def main():
 
383
  setup_streamlit()
384
  client_config = load_config()
385
 
386
  if 'auth_flow' not in st.session_state or 'auth_url' not in st.session_state:
 
387
  st.session_state.auth_flow, st.session_state.auth_url = google_auth(client_config)
388
 
389
  # Directly access query parameters using st.query_params
 
392
  # Retrieve the 'code' parameter
393
  auth_code = query_params.get("code", None)
394
 
395
+
396
  if auth_code and 'credentials' not in st.session_state:
 
397
  st.session_state.auth_flow.fetch_token(code=auth_code)
398
  st.session_state.credentials = st.session_state.auth_flow.credentials
 
399
 
400
  if 'credentials' not in st.session_state:
 
401
  show_google_sign_in(st.session_state.auth_url)
402
  else:
 
403
  init_session_state()
404
  account = auth_search_console(client_config, st.session_state.credentials)
405
  properties = list_gsc_properties(st.session_state.credentials)
406
 
407
  if properties:
 
408
  webproperty = show_property_selector(properties, account)
409
  search_type = show_search_type_selector()
410
  date_range_selection = show_date_range_selector()
411
+ model_type = show_model_type_selector() # Add this line
412
  if date_range_selection == 'Custom Range':
413
  show_custom_date_inputs()
414
  start_date, end_date = st.session_state.custom_start_date, st.session_state.custom_end_date
 
422
 
423
  if st.button("Fetch Data"):
424
  with st.spinner('Fetching data...'):
 
425
  st.session_state.report_data = fetch_gsc_data(webproperty, search_type, start_date, end_date, selected_dimensions)
 
426
 
427
  if st.session_state.report_data is not None and not st.session_state.report_data.empty:
428
+ st.write("Data fetched successfully. Click the button below to calculate relevancy scores.")
429
+
430
+ if st.button("Calculate Relevancy Scores"):
431
+ st.session_state.report_data = calculate_relevancy_scores(st.session_state.report_data, model_type)
432
+ show_paginated_dataframe(st.session_state.report_data)
433
  download_csv_link(st.session_state.report_data)
434
  elif st.session_state.report_data is not None:
 
435
  st.warning("No data found for the selected criteria.")
 
 
 
436
 
437
+
438
  if __name__ == "__main__":
 
439
  main()