poemsforaphrodite committed
Commit • 09e6287 • 1 Parent(s): 4068829
Update app.py
app.py CHANGED
@@ -1,5 +1,3 @@
-import logging
-
 # Standard library imports
 import datetime
 import base64
@@ -20,26 +18,20 @@ from bs4 import BeautifulSoup
 from apify_client import ApifyClient
 import urllib.parse
 
-# Configure logging
-logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
 
 load_dotenv()
-
-logger = logging.getLogger(__name__)
+
 
 # Initialize Cohere client
 APIFY_API_TOKEN = os.environ.get('APIFY_API_TOKEN')
 COHERE_API_KEY = os.environ["COHERE_API_KEY"]
 co = cohere.Client(COHERE_API_KEY)
-logging.info("Cohere client initialized")
 if not APIFY_API_TOKEN:
-    logger.error("APIFY_API_TOKEN is not set in the environment variables.")
     st.error("APIFY_API_TOKEN is not set in the environment variables. Please set it and restart the application.")
 
 # Initialize the ApifyClient with the API token
 client = ApifyClient(APIFY_API_TOKEN)
 # Initialize the ApifyClient with the API token
-logger.info("ApifyClient initialized")
 
 # Configuration: Set to True if running locally, False if running on Streamlit Cloud
 IS_LOCAL = False
@@ -89,11 +81,23 @@ def init_session_state():
         st.session_state.custom_start_date = datetime.date.today() - datetime.timedelta(days=7)
     if 'custom_end_date' not in st.session_state:
         st.session_state.custom_end_date = datetime.date.today()
-    logging.info("Session state initialized")
+    #logging.info("Session state initialized")
 
 # -------------
 # Data Processing Functions
 # -------------
+def generate_embeddings(text_list, model_type):
+    #logging.debug(f"Generating embeddings for model type: {model_type}")
+    if not text_list:
+        logging.warning("Text list is empty, returning empty embeddings")
+        return []
+    model = 'embed-english-v3.0' if model_type == 'english' else 'embed-multilingual-v3.0'
+    input_type = 'search_document'
+    response = co.embed(model=model, texts=text_list, input_type=input_type)
+    embeddings = response.embeddings
+    # logging.debug(f"Embeddings generated successfully for model type: {model_type}")
+    return embeddings
+
 
 def get_serp_results(query):
     if not APIFY_API_TOKEN:
@@ -116,24 +120,24 @@ def get_serp_results(query):
         #logger.debug(f"Calling Apify Actor with input: {run_input}")
         # Run the Actor and wait for it to finish
         run = client.actor("nFJndFXA5zjCTuudP").call(run_input=run_input)
-
+        # logger.info(f"Apify Actor run completed. Run ID: {run.get('id')}")
 
         # Fetch results from the run's dataset
 
         #logger.debug(f"Fetching results from dataset ID: {run.get('defaultDatasetId')}")
         results = list(client.dataset(run["defaultDatasetId"]).iterate_items())
-
+        # logger.info(f"Fetched {len(results)} results from Apify dataset")
 
         if results and 'organicResults' in results[0]:
             urls = [item['url'] for item in results[0]['organicResults']]
-
+            # logger.info(f"Extracted {len(urls)} URLs from organic results")
             return urls
         else:
-
+            # logger.warning("No organic results found in the SERP data.")
            st.warning("No organic results found in the SERP data.")
             return []
     except Exception as e:
-
+        # logger.exception(f"Error fetching SERP results: {str(e)}")
        st.error(f"Error fetching SERP results: {str(e)}")
         return []
 
@@ -141,7 +145,7 @@ def get_serp_results(query):
 
 
 def fetch_content(url):
-
+    # logger.info(f"Fetching content from URL: {url}")
     try:
         # Decode URL-encoded characters
         decoded_url = urllib.parse.unquote(url)
@@ -152,29 +156,29 @@ def fetch_content(url):
         #logger.debug(f"Fetched {len(content)} characters from {url}")
         return content
     except requests.RequestException as e:
-
+        # logger.error(f"Error fetching content from {url}: {e}")
        st.warning(f"Error fetching content from {url}: {e}")
         return ""
 
 def calculate_relevance_score(page_content, query, co):
-
+    # logger.info(f"Calculating relevance score for query: {query}")
     try:
         if not page_content:
-
+            # logger.warning("Empty page content. Returning score 0.")
             return 0
 
         page_embedding = co.embed(texts=[page_content], model='embed-english-v3.0', input_type='search_document').embeddings[0]
         query_embedding = co.embed(texts=[query], model='embed-english-v3.0', input_type='search_query').embeddings[0]
         score = cosine_similarity([query_embedding], [page_embedding])[0][0]
-
+        # logger.debug(f"Relevance score calculated: {score}")
         return score
     except Exception as e:
-
+        # logger.exception(f"Error calculating relevance score: {str(e)}")
        st.error(f"Error calculating relevance score: {str(e)}")
         return 0
 
 def analyze_competitors(row, co):
-
+    # logger.info(f"Analyzing competitors for query: {row['query']}")
     query = row['query']
     our_url = row['page']
 
@@ -184,14 +188,14 @@ def analyze_competitors(row, co):
 
     # Calculate score for our page first
     our_content = fetch_content(our_url)
-    print(
+    print(our_url)
     print(our_content)
     if our_content:
         our_score = calculate_relevance_score(our_content, query, co)
         results.append({'url': our_url, 'relevancy_score': our_score})
-        logger.info(f"Our URL: {our_url}, Score: {our_score}")
+        #logger.info(f"Our URL: {our_url}, Score: {our_score}")
     else:
-        logger.warning(f"No content fetched for our URL: {our_url}")
+        #logger.warning(f"No content fetched for our URL: {our_url}")
 
     # Calculate scores for competitor pages
     for url in competitor_urls:
@@ -199,25 +203,25 @@ def analyze_competitors(row, co):
             # logger.debug(f"Processing competitor URL: {url}")
             content = fetch_content(url)
             if not content:
-
+                # logger.warning(f"No content fetched for competitor URL: {url}")
                 continue
 
             score = calculate_relevance_score(content, query, co)
 
-
+            # logger.info(f"Competitor URL: {url}, Score: {score}")
             results.append({'url': url, 'relevancy_score': score})
         except Exception as e:
-
+            # logger.error(f"Error processing URL {url}: {str(e)}")
            st.error(f"Error processing URL {url}: {str(e)}")
 
     results_df = pd.DataFrame(results).sort_values('relevancy_score', ascending=False)
 
-
+    # logger.info(f"Competitor analysis completed. {len(results)} results obtained.")
     return results_df
 
 def show_competitor_analysis(row, co):
     if st.button("Check Competitors", key=f"comp_{row['page']}"):
-
+        # logger.info(f"Competitor analysis requested for page: {row['page']}")
         with st.spinner('Analyzing competitors...'):
             results_df = analyze_competitors(row, co)
             st.write("Relevancy Score Comparison:")
@@ -226,7 +230,7 @@ def show_competitor_analysis(row, co):
             our_data = results_df[results_df['url'] == row['page']]
             if our_data.empty:
                 st.error(f"Our page '{row['page']}' is not in the results. This indicates an error in fetching or processing the page.")
-
+                # logger.error(f"Our page '{row['page']}' is missing from the results.")
 
                 # Additional debugging information
                 # st.write("Debugging Information:")
@@ -241,7 +245,7 @@ def show_competitor_analysis(row, co):
                 total_results = len(results_df)
                 our_score = our_data['relevancy_score'].values[0]
 
-
+                # logger.info(f"Our page ranks {our_rank} out of {total_results} in terms of relevancy score.")
                 st.write(f"Our page ('{row['page']}') ranks {our_rank} out of {total_results} in terms of relevancy score.")
                 st.write(f"Our relevancy score: {our_score:.4f}")
 
@@ -280,7 +284,7 @@ def analyze_competitors(row, co):
 
     return results_df
 def process_gsc_data(df):
-    logging.info("Processing GSC data")
+    #logging.info("Processing GSC data")
     df_sorted = df.sort_values(['impressions'], ascending=[False])
     df_unique = df_sorted.drop_duplicates(subset='page', keep='first')
 
@@ -290,7 +294,7 @@ def process_gsc_data(df):
     df_unique['relevancy_score'] = df_sorted.groupby('page')['relevancy_score'].first().values
 
     result = df_unique[['page', 'query', 'clicks', 'impressions', 'ctr', 'position', 'relevancy_score']]
-    logging.info("GSC data processed successfully")
+    #logging.info("GSC data processed successfully")
     return result
 
 # -------------
@@ -298,7 +302,7 @@ def process_gsc_data(df):
 # -------------
 
 def load_config():
-    logging.info("Loading Google client configuration")
+    #logging.info("Loading Google client configuration")
     client_config = {
         "web": {
             "client_id": os.environ["CLIENT_ID"],
@@ -308,25 +312,25 @@ def load_config():
             "redirect_uris": ["https://poemsforaphrodite-gscpro.hf.space/"],
         }
     }
-    logging.info("Google client configuration loaded")
+    #logging.info("Google client configuration loaded")
     return client_config
 
 def init_oauth_flow(client_config):
-    logging.info("Initializing OAuth flow")
+    #logging.info("Initializing OAuth flow")
     scopes = ["https://www.googleapis.com/auth/webmasters.readonly"]
     flow = Flow.from_client_config(
         client_config,
         scopes=scopes,
         redirect_uri=client_config["web"]["redirect_uris"][0]
     )
-    logging.info("OAuth flow initialized")
+    #logging.info("OAuth flow initialized")
     return flow
 
 def google_auth(client_config):
-
+    # logging.info("Starting Google authentication")
     flow = init_oauth_flow(client_config)
     auth_url, _ = flow.authorization_url(prompt="consent")
-    logging.info("Google authentication URL generated")
+    #logging.info("Google authentication URL generated")
     return flow, auth_url
 
 def auth_search_console(client_config, credentials):
@@ -348,7 +352,7 @@ def auth_search_console(client_config, credentials):
 # -------------
 
 def list_gsc_properties(credentials):
-
+    # logging.info("Listing GSC properties")
     service = build('webmasters', 'v3', credentials=credentials)
     site_list = service.sites().list().execute()
     properties = [site['siteUrl'] for site in site_list.get('siteEntry', [])] or ["No properties found"]
@@ -362,16 +366,16 @@ def fetch_gsc_data(webproperty, search_type, start_date, end_date, dimensions, d
         query = query.filter('device', 'equals', device_type.lower())
     try:
         df = query.limit(MAX_ROWS).get().to_dataframe()
-        logging.info("GSC data fetched successfully")
+        #logging.info("GSC data fetched successfully")
         return process_gsc_data(df)
     except Exception as e:
-        logging.error(f"Error fetching GSC data: {e}")
+        #logging.error(f"Error fetching GSC data: {e}")
         show_error(e)
         return pd.DataFrame()
 
 
 def calculate_relevancy_scores(df, model_type):
-    logging.info("Calculating relevancy scores")
+    #logging.info("Calculating relevancy scores")
     with st.spinner('Calculating relevancy scores...'):
         try:
             page_contents = [fetch_content(url) for url in df['page']]
@@ -379,9 +383,9 @@ def calculate_relevancy_scores(df, model_type):
             query_embeddings = generate_embeddings(df['query'].tolist(), model_type)
             relevancy_scores = cosine_similarity(query_embeddings, page_embeddings).diagonal()
             df = df.assign(relevancy_score=relevancy_scores)
-            logging.info("Relevancy scores calculated successfully")
+            #logging.info("Relevancy scores calculated successfully")
         except Exception as e:
-            logging.error(f"Error calculating relevancy scores: {e}")
+            #logging.error(f"Error calculating relevancy scores: {e}")
             st.warning(f"Error calculating relevancy scores: {e}")
             df = df.assign(relevancy_score=0)
     return df
@@ -417,7 +421,7 @@ def calc_date_range(selection, custom_start=None, custom_end=None):
     return date_range
 
 def show_error(e):
-    logging.error(f"An error occurred: {e}")
+    #logging.error(f"An error occurred: {e}")
     st.error(f"An error occurred: {e}")
 
 def property_change():
@@ -429,33 +433,33 @@ def property_change():
 # -------------
 
 def show_dataframe(report):
-    logging.info("Showing dataframe preview")
+    #logging.info("Showing dataframe preview")
     with st.expander("Preview the First 100 Rows (Unique Pages with Top Query)"):
         st.dataframe(report.head(DF_PREVIEW_ROWS))
 
 def download_csv_link(report):
-    logging.info("Generating CSV download link")
+    #logging.info("Generating CSV download link")
     def to_csv(df):
         return df.to_csv(index=False, encoding='utf-8-sig')
     csv = to_csv(report)
     b64_csv = base64.b64encode(csv.encode()).decode()
     href = f'<a href="data:file/csv;base64,{b64_csv}" download="search_console_data.csv">Download CSV File</a>'
     st.markdown(href, unsafe_allow_html=True)
-    logging.info("CSV download link generated")
+    #logging.info("CSV download link generated")
 
 # -------------
 # Streamlit UI Components
 # -------------
 
 def show_google_sign_in(auth_url):
-
+    # logging.info("Showing Google sign-in button")
     with st.sidebar:
         if st.button("Sign in with Google"):
             st.write('Please click the link below to sign in:')
             st.markdown(f'[Google Sign-In]({auth_url})', unsafe_allow_html=True)
 
 def show_property_selector(properties, account):
-
+    # logging.info("Showing property selector")
     selected_property = st.selectbox(
         "Select a Search Console Property:",
         properties,
@@ -467,7 +471,7 @@ def show_property_selector(properties, account):
     return account[selected_property]
 
 def show_search_type_selector():
-
+    # logging.info("Showing search type selector")
     return st.selectbox(
         "Select Search Type:",
         SEARCH_TYPES,
@@ -476,7 +480,7 @@ def show_search_type_selector():
     )
 
 def show_model_type_selector():
-
+    # logging.info("Showing model type selector")
     return st.selectbox(
         "Select the embedding model:",
         ["english", "multilingual"],
@@ -512,7 +516,7 @@ def show_tabular_data(df, co):
 
 
 def show_date_range_selector():
-
+    # logging.info("Showing date range selector")
     return st.selectbox(
         "Select Date Range:",
         DATE_RANGE_OPTIONS,
@@ -521,12 +525,12 @@ def show_date_range_selector():
     )
 
 def show_custom_date_inputs():
-
+    # logging.info("Showing custom date inputs")
     st.session_state.custom_start_date = st.date_input("Start Date", st.session_state.custom_start_date)
     st.session_state.custom_end_date = st.date_input("End Date", st.session_state.custom_end_date)
 
 def show_dimensions_selector(search_type):
-
+    # logging.info("Showing dimensions selector")
     available_dimensions = update_dimensions(search_type)
     return st.multiselect(
         "Select Dimensions:",
@@ -536,7 +540,7 @@ def show_dimensions_selector(search_type):
     )
 
 def show_paginated_dataframe(report, rows_per_page=20):
-
+    # logging.info("Showing paginated dataframe")
     report['position'] = report['position'].astype(int)
     report['impressions'] = pd.to_numeric(report['impressions'], errors='coerce')
 
@@ -608,7 +612,7 @@ def show_paginated_dataframe(report, rows_per_page=20):
 # -------------
 
 def main():
-
+    # logging.info("Starting main function")
     setup_streamlit()
     client_config = load_config()
 
@@ -653,17 +657,17 @@ def main():
             st.write("Data fetched successfully. Click the button below to calculate relevancy scores.")
 
             if st.button("Calculate Relevancy Scores"):
-
+                # logger.info("Calculating relevancy scores for all rows")
                 st.session_state.report_data = calculate_relevancy_scores(st.session_state.report_data, model_type)
 
             show_tabular_data(st.session_state.report_data, co)
 
            download_csv_link(st.session_state.report_data)
         elif st.session_state.report_data is not None:
-
+            # logger.warning("No data found for the selected criteria.")
             st.warning("No data found for the selected criteria.")
 
 if __name__ == "__main__":
-
+    # logging.info("Running main function")
     main()
-    logger.info("Script completed")
+    #logger.info("Script completed")