poemsforaphrodite committed on
Commit
91e9e9a
·
verified ·
1 Parent(s): fbb8761

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -44
app.py CHANGED
@@ -94,31 +94,17 @@ def generate_embeddings(text_list, model_type):
94
  embeddings = response.embeddings
95
  return embeddings
96
 
97
def calculate_relevancy_scores(df, model_type):
    """Add a ``relevancy_score`` column scoring each query against its page.

    For every row, the page at ``df['page']`` is fetched with
    ``fetch_content`` and embedded, ``df['query']`` is embedded, and the
    score is the cosine similarity between that row's query embedding and
    that row's page embedding.

    Args:
        df: DataFrame with ``page`` and ``query`` columns.
        model_type: Passed through unchanged to ``generate_embeddings``.

    Returns:
        A new DataFrame with ``relevancy_score`` added. If any step raises,
        a Streamlit warning is shown and every score is set to 0.
    """
    # Local import so this block does not depend on a module-level numpy
    # import that is outside the visible part of the file.
    import numpy as np

    try:
        page_contents = [fetch_content(url) for url in df['page']]
        page_embeddings = generate_embeddings(page_contents, model_type)
        query_embeddings = generate_embeddings(df['query'].tolist(), model_type)

        page_matrix = np.asarray(page_embeddings, dtype=float)
        query_matrix = np.asarray(query_embeddings, dtype=float)

        # Only the row-i-vs-row-i similarities are needed. Computing them
        # directly avoids materialising an n x n matrix just to read its
        # diagonal, which is quadratic in both time and memory.
        dot_products = np.einsum('ij,ij->i', query_matrix, page_matrix)
        norms = (
            np.linalg.norm(query_matrix, axis=1)
            * np.linalg.norm(page_matrix, axis=1)
        )
        # A zero vector has a zero dot product with anything; dividing by 1
        # instead of 0 yields a similarity of 0 rather than NaN.
        norms[norms == 0] = 1.0
        relevancy_scores = dot_products / norms

        df = df.assign(relevancy_score=relevancy_scores)
    except Exception as e:
        # Best-effort at the UI boundary: report the problem and fall back
        # to a neutral score instead of aborting the whole report.
        st.warning(f"Error calculating relevancy scores: {e}")
        df = df.assign(relevancy_score=0)
    return df
108
 
109
  def process_gsc_data(df):
110
- # Remove the filter for queries below position 10
111
  df_sorted = df.sort_values(['impressions'], ascending=[False])
112
-
113
- # Keep only the highest impression query for each page
114
  df_unique = df_sorted.drop_duplicates(subset='page', keep='first')
115
-
116
- if 'relevancy_score' not in df_unique.columns:
117
- df_unique['relevancy_score'] = 0
118
- else:
119
- df_unique['relevancy_score'] = df_sorted.groupby('page')['relevancy_score'].first().values
120
-
121
- result = df_unique[['page', 'query', 'clicks', 'impressions', 'ctr', 'position', 'relevancy_score']]
122
  return result
123
 
124
  # -------------
@@ -300,58 +286,52 @@ def show_dimensions_selector(search_type):
300
  key='dimensions_selector'
301
  )
302
 
303
- def show_paginated_dataframe(report, rows_per_page=20):
304
- # Convert 'position' column to integer and 'impressions' to numeric
305
  report['position'] = report['position'].astype(int)
306
  report['impressions'] = pd.to_numeric(report['impressions'], errors='coerce')
307
 
308
- # Format CTR as percentage and relevancy_score with two decimal places
309
  def format_ctr(x):
310
  try:
311
  return f"{float(x):.2%}"
312
  except ValueError:
313
- return x # Return the original value if it can't be converted to float
314
 
315
  def format_relevancy_score(x):
 
 
316
  try:
317
  return f"{float(x):.2f}"
318
  except ValueError:
319
- return x # Return the original value if it can't be converted to float
320
 
321
  report['ctr'] = report['ctr'].apply(format_ctr)
322
  report['relevancy_score'] = report['relevancy_score'].apply(format_relevancy_score)
323
 
324
- # Create a clickable URL column
325
  def make_clickable(url):
326
  return f'<a href="{url}" target="_blank">{url}</a>'
327
 
328
  report['clickable_url'] = report['page'].apply(make_clickable)
329
 
330
- # Reorder columns to put clickable_url first
331
- columns = ['clickable_url', 'query', 'impressions', 'clicks', 'ctr', 'position', 'relevancy_score']
332
- report = report[columns]
333
 
334
- # Add sorting functionality
335
- sort_column = st.selectbox("Sort by:", columns[1:], index=columns[1:].index('impressions')) # Set 'impressions' as default
336
  sort_order = st.radio("Sort order:", ("Descending", "Ascending"))
337
 
338
  ascending = sort_order == "Ascending"
339
 
340
- # Convert back to numeric for sorting
341
  def safe_float_convert(x):
342
  try:
343
  return float(x.rstrip('%')) / 100 if isinstance(x, str) and x.endswith('%') else float(x)
344
  except ValueError:
345
- return 0 # Return 0 or another default value if conversion fails
346
 
347
  report['ctr_numeric'] = report['ctr'].apply(safe_float_convert)
348
- report['relevancy_score_numeric'] = report['relevancy_score'].apply(safe_float_convert)
349
 
350
- # Sort using the numeric columns
351
  sort_column_numeric = sort_column + '_numeric' if sort_column in ['ctr', 'relevancy_score'] else sort_column
352
  report = report.sort_values(by=sort_column_numeric, ascending=ascending)
353
 
354
- # Remove the temporary numeric columns
355
  report = report.drop(columns=['ctr_numeric', 'relevancy_score_numeric'])
356
 
357
  total_rows = len(report)
@@ -373,8 +353,42 @@ def show_paginated_dataframe(report, rows_per_page=20):
373
  start_idx = (st.session_state.current_page - 1) * rows_per_page
374
  end_idx = start_idx + rows_per_page
375
 
376
- # Use st.markdown to display the dataframe with clickable links
377
- st.markdown(report.iloc[start_idx:end_idx].to_html(escape=False, index=False), unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
  # -------------
379
  # Main Streamlit App Function
380
  # -------------
@@ -425,11 +439,8 @@ def main():
425
  st.session_state.report_data = fetch_gsc_data(webproperty, search_type, start_date, end_date, selected_dimensions)
426
 
427
  if st.session_state.report_data is not None and not st.session_state.report_data.empty:
428
- st.write("Data fetched successfully. Click the button below to calculate relevancy scores.")
429
-
430
- if st.button("Calculate Relevancy Scores"):
431
- st.session_state.report_data = calculate_relevancy_scores(st.session_state.report_data, model_type)
432
- show_paginated_dataframe(st.session_state.report_data)
433
  download_csv_link(st.session_state.report_data)
434
  elif st.session_state.report_data is not None:
435
  st.warning("No data found for the selected criteria.")
 
94
  embeddings = response.embeddings
95
  return embeddings
96
 
97
def calculate_single_relevancy_score(page_content, query, model_type):
    """Return the cosine similarity between one query and one page.

    ``page_content`` and ``query`` are each embedded with
    ``generate_embeddings`` (``model_type`` is passed through unchanged),
    and the similarity of the two resulting vectors is returned.
    """
    page_vector = generate_embeddings([page_content], model_type)[0]
    query_vector = generate_embeddings([query], model_type)[0]
    # cosine_similarity works on 2-D inputs, so wrap each vector in a list
    # and read the single cell of the resulting 1 x 1 matrix.
    similarity_matrix = cosine_similarity([query_vector], [page_vector])
    return similarity_matrix[0][0]
 
 
 
 
 
 
102
 
103
  def process_gsc_data(df):
 
104
  df_sorted = df.sort_values(['impressions'], ascending=[False])
 
 
105
  df_unique = df_sorted.drop_duplicates(subset='page', keep='first')
106
+ result = df_unique[['page', 'query', 'clicks', 'impressions', 'ctr', 'position']]
107
+ result['relevancy_score'] = None # Initialize relevancy_score as None
 
 
 
 
 
108
  return result
109
 
110
  # -------------
 
286
  key='dimensions_selector'
287
  )
288
 
289
+ def show_paginated_dataframe(report, rows_per_page=20, model_type='english'):
 
290
  report['position'] = report['position'].astype(int)
291
  report['impressions'] = pd.to_numeric(report['impressions'], errors='coerce')
292
 
 
293
  def format_ctr(x):
294
  try:
295
  return f"{float(x):.2%}"
296
  except ValueError:
297
+ return x
298
 
299
  def format_relevancy_score(x):
300
+ if x is None:
301
+ return "Not calculated"
302
  try:
303
  return f"{float(x):.2f}"
304
  except ValueError:
305
+ return x
306
 
307
  report['ctr'] = report['ctr'].apply(format_ctr)
308
  report['relevancy_score'] = report['relevancy_score'].apply(format_relevancy_score)
309
 
 
310
  def make_clickable(url):
311
  return f'<a href="{url}" target="_blank">{url}</a>'
312
 
313
  report['clickable_url'] = report['page'].apply(make_clickable)
314
 
315
+ columns = ['clickable_url', 'query', 'impressions', 'clicks', 'ctr', 'position', 'relevancy_score', 'action']
316
+ report = report[columns[:-1]] # Exclude 'action' from the DataFrame
 
317
 
318
+ sort_column = st.selectbox("Sort by:", columns[1:-1], index=columns[1:-1].index('impressions'))
 
319
  sort_order = st.radio("Sort order:", ("Descending", "Ascending"))
320
 
321
  ascending = sort_order == "Ascending"
322
 
 
323
  def safe_float_convert(x):
324
  try:
325
  return float(x.rstrip('%')) / 100 if isinstance(x, str) and x.endswith('%') else float(x)
326
  except ValueError:
327
+ return 0
328
 
329
  report['ctr_numeric'] = report['ctr'].apply(safe_float_convert)
330
+ report['relevancy_score_numeric'] = report['relevancy_score'].apply(lambda x: safe_float_convert(x) if x != "Not calculated" else -1)
331
 
 
332
  sort_column_numeric = sort_column + '_numeric' if sort_column in ['ctr', 'relevancy_score'] else sort_column
333
  report = report.sort_values(by=sort_column_numeric, ascending=ascending)
334
 
 
335
  report = report.drop(columns=['ctr_numeric', 'relevancy_score_numeric'])
336
 
337
  total_rows = len(report)
 
353
  start_idx = (st.session_state.current_page - 1) * rows_per_page
354
  end_idx = start_idx + rows_per_page
355
 
356
+ # Create a placeholder for the dataframe
357
+ dataframe_placeholder = st.empty()
358
+
359
+ # Function to update the dataframe
360
+ def update_dataframe():
361
+ df_html = report.iloc[start_idx:end_idx].to_html(escape=False, index=False)
362
+ df_html = df_html.replace('</table>', ''.join([
363
+ '<th>Action</th></tr>',
364
+ *[f'<tr><td colspan="7"></td><td><button onclick="calculate_relevancy({i})">Calculate Relevancy</button></td></tr>'
365
+ for i in range(start_idx, min(end_idx, len(report)))]
366
+ ]) + '</table>')
367
+ dataframe_placeholder.markdown(df_html, unsafe_allow_html=True)
368
+
369
+ # Initial dataframe display
370
+ update_dataframe()
371
+
372
+ # JavaScript to handle button clicks
373
+ st.markdown("""
374
+ <script>
375
+ function calculate_relevancy(row_index) {
376
+ Streamlit.setComponentValue('calculate_relevancy', row_index);
377
+ }
378
+ </script>
379
+ """, unsafe_allow_html=True)
380
+
381
+ # Handle relevancy calculation
382
+ if st.session_state.get('calculate_relevancy'):
383
+ row_index = st.session_state.calculate_relevancy
384
+ page_content = fetch_content(report.iloc[row_index]['page'])
385
+ query = report.iloc[row_index]['query']
386
+ relevancy_score = calculate_single_relevancy_score(page_content, query, model_type)
387
+ report.at[row_index, 'relevancy_score'] = relevancy_score
388
+ st.session_state.calculate_relevancy = None # Reset the state
389
+ update_dataframe() # Update the dataframe display
390
+
391
+ return report
392
  # -------------
393
  # Main Streamlit App Function
394
  # -------------
 
439
  st.session_state.report_data = fetch_gsc_data(webproperty, search_type, start_date, end_date, selected_dimensions)
440
 
441
  if st.session_state.report_data is not None and not st.session_state.report_data.empty:
442
+ st.write("Data fetched successfully. Click the 'Calculate Relevancy' button in each row to calculate its relevancy score.")
443
+ st.session_state.report_data = show_paginated_dataframe(st.session_state.report_data, model_type=model_type)
 
 
 
444
  download_csv_link(st.session_state.report_data)
445
  elif st.session_state.report_data is not None:
446
  st.warning("No data found for the selected criteria.")