Spaces:
Running
Running
poemsforaphrodite
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -94,31 +94,17 @@ def generate_embeddings(text_list, model_type):
|
|
94 |
embeddings = response.embeddings
|
95 |
return embeddings
|
96 |
|
97 |
-
def
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
relevancy_scores = cosine_similarity(query_embeddings, page_embeddings).diagonal()
|
103 |
-
df = df.assign(relevancy_score=relevancy_scores)
|
104 |
-
except Exception as e:
|
105 |
-
st.warning(f"Error calculating relevancy scores: {e}")
|
106 |
-
df = df.assign(relevancy_score=0)
|
107 |
-
return df
|
108 |
|
109 |
def process_gsc_data(df):
|
110 |
-
# Remove the filter for queries below position 10
|
111 |
df_sorted = df.sort_values(['impressions'], ascending=[False])
|
112 |
-
|
113 |
-
# Keep only the highest impression query for each page
|
114 |
df_unique = df_sorted.drop_duplicates(subset='page', keep='first')
|
115 |
-
|
116 |
-
|
117 |
-
df_unique['relevancy_score'] = 0
|
118 |
-
else:
|
119 |
-
df_unique['relevancy_score'] = df_sorted.groupby('page')['relevancy_score'].first().values
|
120 |
-
|
121 |
-
result = df_unique[['page', 'query', 'clicks', 'impressions', 'ctr', 'position', 'relevancy_score']]
|
122 |
return result
|
123 |
|
124 |
# -------------
|
@@ -300,58 +286,52 @@ def show_dimensions_selector(search_type):
|
|
300 |
key='dimensions_selector'
|
301 |
)
|
302 |
|
303 |
-
def show_paginated_dataframe(report, rows_per_page=20):
|
304 |
-
# Convert 'position' column to integer and 'impressions' to numeric
|
305 |
report['position'] = report['position'].astype(int)
|
306 |
report['impressions'] = pd.to_numeric(report['impressions'], errors='coerce')
|
307 |
|
308 |
-
# Format CTR as percentage and relevancy_score with two decimal places
|
309 |
def format_ctr(x):
|
310 |
try:
|
311 |
return f"{float(x):.2%}"
|
312 |
except ValueError:
|
313 |
-
return x
|
314 |
|
315 |
def format_relevancy_score(x):
|
|
|
|
|
316 |
try:
|
317 |
return f"{float(x):.2f}"
|
318 |
except ValueError:
|
319 |
-
return x
|
320 |
|
321 |
report['ctr'] = report['ctr'].apply(format_ctr)
|
322 |
report['relevancy_score'] = report['relevancy_score'].apply(format_relevancy_score)
|
323 |
|
324 |
-
# Create a clickable URL column
|
325 |
def make_clickable(url):
|
326 |
return f'<a href="{url}" target="_blank">{url}</a>'
|
327 |
|
328 |
report['clickable_url'] = report['page'].apply(make_clickable)
|
329 |
|
330 |
-
|
331 |
-
|
332 |
-
report = report[columns]
|
333 |
|
334 |
-
|
335 |
-
sort_column = st.selectbox("Sort by:", columns[1:], index=columns[1:].index('impressions')) # Set 'impressions' as default
|
336 |
sort_order = st.radio("Sort order:", ("Descending", "Ascending"))
|
337 |
|
338 |
ascending = sort_order == "Ascending"
|
339 |
|
340 |
-
# Convert back to numeric for sorting
|
341 |
def safe_float_convert(x):
|
342 |
try:
|
343 |
return float(x.rstrip('%')) / 100 if isinstance(x, str) and x.endswith('%') else float(x)
|
344 |
except ValueError:
|
345 |
-
return 0
|
346 |
|
347 |
report['ctr_numeric'] = report['ctr'].apply(safe_float_convert)
|
348 |
-
report['relevancy_score_numeric'] = report['relevancy_score'].apply(safe_float_convert)
|
349 |
|
350 |
-
# Sort using the numeric columns
|
351 |
sort_column_numeric = sort_column + '_numeric' if sort_column in ['ctr', 'relevancy_score'] else sort_column
|
352 |
report = report.sort_values(by=sort_column_numeric, ascending=ascending)
|
353 |
|
354 |
-
# Remove the temporary numeric columns
|
355 |
report = report.drop(columns=['ctr_numeric', 'relevancy_score_numeric'])
|
356 |
|
357 |
total_rows = len(report)
|
@@ -373,8 +353,42 @@ def show_paginated_dataframe(report, rows_per_page=20):
|
|
373 |
start_idx = (st.session_state.current_page - 1) * rows_per_page
|
374 |
end_idx = start_idx + rows_per_page
|
375 |
|
376 |
-
#
|
377 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
378 |
# -------------
|
379 |
# Main Streamlit App Function
|
380 |
# -------------
|
@@ -425,11 +439,8 @@ def main():
|
|
425 |
st.session_state.report_data = fetch_gsc_data(webproperty, search_type, start_date, end_date, selected_dimensions)
|
426 |
|
427 |
if st.session_state.report_data is not None and not st.session_state.report_data.empty:
|
428 |
-
st.write("Data fetched successfully. Click the button
|
429 |
-
|
430 |
-
if st.button("Calculate Relevancy Scores"):
|
431 |
-
st.session_state.report_data = calculate_relevancy_scores(st.session_state.report_data, model_type)
|
432 |
-
show_paginated_dataframe(st.session_state.report_data)
|
433 |
download_csv_link(st.session_state.report_data)
|
434 |
elif st.session_state.report_data is not None:
|
435 |
st.warning("No data found for the selected criteria.")
|
|
|
94 |
embeddings = response.embeddings
|
95 |
return embeddings
|
96 |
|
97 |
+
def calculate_single_relevancy_score(page_content, query, model_type):
    """Score how closely one page's content matches one search query.

    The page content and the query are embedded separately through
    ``generate_embeddings`` (one single-item call each), and the score is
    the ``[0][0]`` entry of ``cosine_similarity`` between the two vectors.

    Args:
        page_content: Text of the page to score.
        query: Search query to compare against the page.
        model_type: Passed through unchanged to ``generate_embeddings``.

    Returns:
        The similarity value for the (query, page) pair.
    """
    # Each call embeds a one-element list, so take the only vector back out.
    page_vec = generate_embeddings([page_content], model_type)[0]
    query_vec = generate_embeddings([query], model_type)[0]
    # NOTE(review): cosine_similarity is presumably scikit-learn's pairwise
    # function, which expects 2-D inputs -- hence the one-row wrappers.
    similarity_matrix = cosine_similarity([query_vec], [page_vec])
    return similarity_matrix[0][0]
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
|
103 |
def process_gsc_data(df):
    """Keep the highest-impression row per page and add an empty score column.

    Args:
        df: DataFrame containing at least the columns ``page``, ``query``,
            ``clicks``, ``impressions``, ``ctr`` and ``position``.

    Returns:
        A new DataFrame with one row per distinct ``page`` (the row with the
        most impressions), limited to the columns listed above plus a
        ``relevancy_score`` column set to ``None`` on every row. ``df``
        itself is not modified.
    """
    columns = ['page', 'query', 'clicks', 'impressions', 'ctr', 'position']
    # A stable sort keeps the input order among rows with equal impressions,
    # so the row kept per page is deterministic.
    df_sorted = df.sort_values('impressions', ascending=False, kind='stable')
    df_unique = df_sorted.drop_duplicates(subset='page', keep='first')
    # assign() returns a fresh frame; writing the column into the sliced
    # frame directly is chained assignment and triggers
    # SettingWithCopyWarning.
    return df_unique[columns].assign(relevancy_score=None)
|
109 |
|
110 |
# -------------
|
|
|
286 |
key='dimensions_selector'
|
287 |
)
|
288 |
|
289 |
+
def show_paginated_dataframe(report, rows_per_page=20, model_type='english'):
|
|
|
290 |
report['position'] = report['position'].astype(int)
|
291 |
report['impressions'] = pd.to_numeric(report['impressions'], errors='coerce')
|
292 |
|
|
|
293 |
def format_ctr(x):
    """Format a click-through rate as a percentage string with two decimals.

    Args:
        x: A value convertible by ``float()`` (e.g. ``0.1234`` or ``"0.1234"``).

    Returns:
        A string such as ``"12.34%"``. If ``x`` cannot be converted (for
        example ``None``, or a string that is already formatted like
        ``"12.34%"``), ``x`` is returned unchanged.
    """
    try:
        return f"{float(x):.2%}"
    except (TypeError, ValueError):
        # float(None) raises TypeError, not ValueError; treat it the same
        # way as an unparsable string and pass the value through.
        return x
|
298 |
|
299 |
def format_relevancy_score(x):
    """Format a relevancy score for display.

    Args:
        x: A score convertible by ``float()``, or ``None``/NaN when no score
            has been computed.

    Returns:
        ``"Not calculated"`` when ``x`` is ``None`` or a float NaN; otherwise
        the score with two decimals (e.g. ``"0.46"``). Values that cannot be
        converted to float (such as an already formatted
        ``"Not calculated"``) are returned unchanged.
    """
    # NaN is the only float that is not equal to itself. A missing score can
    # arrive as NaN instead of None, and would otherwise render as "nan".
    if x is None or (isinstance(x, float) and x != x):
        return "Not calculated"
    try:
        return f"{float(x):.2f}"
    except (TypeError, ValueError):
        # Non-numeric objects raise TypeError; pass them through like
        # unparsable strings.
        return x
|
306 |
|
307 |
report['ctr'] = report['ctr'].apply(format_ctr)
|
308 |
report['relevancy_score'] = report['relevancy_score'].apply(format_relevancy_score)
|
309 |
|
|
|
310 |
def make_clickable(url):
    """Wrap a URL in an HTML anchor that opens in a new tab.

    Args:
        url: The address to link to; it is converted with ``str()`` first.

    Returns:
        An ``<a>`` element string whose ``href`` and link text are both the
        HTML-escaped URL.
    """
    from html import escape

    # Escape &, <, > and quotes so a URL cannot break out of the href
    # attribute or inject markup into the generated HTML.
    safe_url = escape(str(url), quote=True)
    return f'<a href="{safe_url}" target="_blank">{safe_url}</a>'
|
312 |
|
313 |
report['clickable_url'] = report['page'].apply(make_clickable)
|
314 |
|
315 |
+
columns = ['clickable_url', 'query', 'impressions', 'clicks', 'ctr', 'position', 'relevancy_score', 'action']
|
316 |
+
report = report[columns[:-1]] # Exclude 'action' from the DataFrame
|
|
|
317 |
|
318 |
+
sort_column = st.selectbox("Sort by:", columns[1:-1], index=columns[1:-1].index('impressions'))
|
|
|
319 |
sort_order = st.radio("Sort order:", ("Descending", "Ascending"))
|
320 |
|
321 |
ascending = sort_order == "Ascending"
|
322 |
|
|
|
323 |
def safe_float_convert(x):
    """Convert a display value back to a number usable as a sort key.

    Args:
        x: A number, a numeric string, or a percentage string ending in
            ``%`` (e.g. ``"12.34%"``).

    Returns:
        For strings ending in ``%``, the numeric part divided by 100;
        otherwise ``float(x)``. Returns ``0`` when the value cannot be
        converted.
    """
    try:
        if isinstance(x, str) and x.endswith('%'):
            return float(x.rstrip('%')) / 100
        return float(x)
    except (TypeError, ValueError):
        # float(None) and other non-numeric objects raise TypeError; fall
        # back to 0 just like an unparsable string.
        return 0
|
328 |
|
329 |
report['ctr_numeric'] = report['ctr'].apply(safe_float_convert)
|
330 |
+
report['relevancy_score_numeric'] = report['relevancy_score'].apply(lambda x: safe_float_convert(x) if x != "Not calculated" else -1)
|
331 |
|
|
|
332 |
sort_column_numeric = sort_column + '_numeric' if sort_column in ['ctr', 'relevancy_score'] else sort_column
|
333 |
report = report.sort_values(by=sort_column_numeric, ascending=ascending)
|
334 |
|
|
|
335 |
report = report.drop(columns=['ctr_numeric', 'relevancy_score_numeric'])
|
336 |
|
337 |
total_rows = len(report)
|
|
|
353 |
start_idx = (st.session_state.current_page - 1) * rows_per_page
|
354 |
end_idx = start_idx + rows_per_page
|
355 |
|
356 |
+
# Create a placeholder for the dataframe
|
357 |
+
dataframe_placeholder = st.empty()
|
358 |
+
|
359 |
+
# Function to update the dataframe
|
360 |
+
def update_dataframe():
|
361 |
+
df_html = report.iloc[start_idx:end_idx].to_html(escape=False, index=False)
|
362 |
+
df_html = df_html.replace('</table>', ''.join([
|
363 |
+
'<th>Action</th></tr>',
|
364 |
+
*[f'<tr><td colspan="7"></td><td><button onclick="calculate_relevancy({i})">Calculate Relevancy</button></td></tr>'
|
365 |
+
for i in range(start_idx, min(end_idx, len(report)))]
|
366 |
+
]) + '</table>')
|
367 |
+
dataframe_placeholder.markdown(df_html, unsafe_allow_html=True)
|
368 |
+
|
369 |
+
# Initial dataframe display
|
370 |
+
update_dataframe()
|
371 |
+
|
372 |
+
# JavaScript to handle button clicks
|
373 |
+
st.markdown("""
|
374 |
+
<script>
|
375 |
+
function calculate_relevancy(row_index) {
|
376 |
+
Streamlit.setComponentValue('calculate_relevancy', row_index);
|
377 |
+
}
|
378 |
+
</script>
|
379 |
+
""", unsafe_allow_html=True)
|
380 |
+
|
381 |
+
# Handle relevancy calculation
|
382 |
+
if st.session_state.get('calculate_relevancy'):
|
383 |
+
row_index = st.session_state.calculate_relevancy
|
384 |
+
page_content = fetch_content(report.iloc[row_index]['page'])
|
385 |
+
query = report.iloc[row_index]['query']
|
386 |
+
relevancy_score = calculate_single_relevancy_score(page_content, query, model_type)
|
387 |
+
report.at[row_index, 'relevancy_score'] = relevancy_score
|
388 |
+
st.session_state.calculate_relevancy = None # Reset the state
|
389 |
+
update_dataframe() # Update the dataframe display
|
390 |
+
|
391 |
+
return report
|
392 |
# -------------
|
393 |
# Main Streamlit App Function
|
394 |
# -------------
|
|
|
439 |
st.session_state.report_data = fetch_gsc_data(webproperty, search_type, start_date, end_date, selected_dimensions)
|
440 |
|
441 |
if st.session_state.report_data is not None and not st.session_state.report_data.empty:
|
442 |
+
st.write("Data fetched successfully. Click the 'Calculate Relevancy' button in each row to calculate its relevancy score.")
|
443 |
+
st.session_state.report_data = show_paginated_dataframe(st.session_state.report_data, model_type=model_type)
|
|
|
|
|
|
|
444 |
download_csv_link(st.session_state.report_data)
|
445 |
elif st.session_state.report_data is not None:
|
446 |
st.warning("No data found for the selected criteria.")
|