Spaces:
Sleeping
Sleeping
poemsforaphrodite
commited on
Update app.py
Browse files
app.py
CHANGED
@@ -1,342 +1,3 @@
|
|
1 |
-
# Standard library imports
|
2 |
-
import datetime
|
3 |
-
import base64
|
4 |
-
import os
|
5 |
-
|
6 |
-
# Related third-party imports
|
7 |
-
import streamlit as st
|
8 |
-
from streamlit_elements import elements
|
9 |
-
from google_auth_oauthlib.flow import Flow
|
10 |
-
from googleapiclient.discovery import build
|
11 |
-
from dotenv import load_dotenv
|
12 |
-
import pandas as pd
|
13 |
-
import searchconsole
|
14 |
-
import cohere
|
15 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
16 |
-
import requests
|
17 |
-
from bs4 import BeautifulSoup
|
18 |
-
|
19 |
-
# Populate os.environ via python-dotenv before any setting is read below.
load_dotenv()

# Shared Cohere client used by the embedding helpers.
# Raises KeyError at import time when COHERE_API_KEY is not set.
COHERE_API_KEY = os.environ["COHERE_API_KEY"]
co = cohere.Client(COHERE_API_KEY)

# Deployment switch: True when running locally, False when running on
# Streamlit Cloud.
IS_LOCAL = False
|
28 |
-
|
29 |
-
# Constants

# Search types offered in the "Select Search Type" dropdown.
SEARCH_TYPES = [
    "web",
    "image",
    "video",
    "news",
    "discover",
    "googleNews",
]

# Labels offered in the "Select Date Range" dropdown.
DATE_RANGE_OPTIONS = [
    "Last 7 Days", "Last 30 Days",
    "Last 3 Months", "Last 6 Months",
    "Last 12 Months", "Last 16 Months",
    "Custom Range",
]

# Device filter choices; "All Devices" means no device filter is applied.
DEVICE_OPTIONS = [
    "All Devices",
    "desktop",
    "mobile",
    "tablet",
]

# Dimensions that are always selectable.
BASE_DIMENSIONS = [
    "page",
    "query",
    "country",
    "date",
]

# Row limit passed to the Search Console query.
MAX_ROWS = 250_000

# Number of rows shown by the dataframe preview.
DF_PREVIEW_ROWS = 100
|
44 |
-
|
45 |
-
# -------------
|
46 |
-
# Streamlit App Configuration
|
47 |
-
# -------------
|
48 |
-
|
49 |
-
def setup_streamlit():
    """Configure the Streamlit page and render the app header.

    Sets the page title and the wide layout, then draws the main title
    followed by a divider.
    """
    st.set_page_config(page_title="Simple Google Search Console Data", layout="wide")
    # The title previously read "Relenvacy"; corrected spelling.
    st.title("GSC Relevancy Score Calculator")
    st.divider()
|
54 |
-
|
55 |
-
def init_session_state():
    """Seed ``st.session_state`` with a default for every selector key.

    Keys that are already present are left untouched, so values chosen by
    the user survive Streamlit reruns.
    """
    today = datetime.date.today()
    one_week_ago = today - datetime.timedelta(days=7)

    defaults = {
        'selected_property': None,
        'selected_search_type': 'web',
        'selected_date_range': 'Last 7 Days',
        'start_date': one_week_ago,
        'end_date': today,
        'selected_dimensions': ['page', 'query'],
        'selected_device': 'All Devices',
        'custom_start_date': one_week_ago,
        'custom_end_date': today,
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value
|
74 |
-
|
75 |
-
# -------------
|
76 |
-
# Data Processing Functions
|
77 |
-
# -------------
|
78 |
-
|
79 |
-
def fetch_content(url, timeout=10):
    """Download *url* and return the page's text content.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on an
            unresponsive host, stalling the whole run.

    Returns:
        The text extracted from the HTML, with fragments joined by single
        spaces. If the request fails (connection error, timeout, non-2xx
        status) the error message is returned as the text instead, so the
        caller always receives a string.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # Best-effort: one unreachable page must not abort the batch.
        return str(e)

    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.get_text(separator=' ', strip=True)
|
88 |
-
|
89 |
-
def generate_embeddings(text_list, model_type, input_type='search_document'):
    """Embed a list of texts with the module-level Cohere client.

    Args:
        text_list: Texts to embed. An empty list short-circuits and no API
            call is made.
        model_type: ``'english'`` selects ``embed-english-v3.0``; any other
            value selects ``embed-multilingual-v3.0``.
        input_type: Value forwarded as the ``input_type`` argument of
            ``co.embed``. Defaults to ``'search_document'``, the value that
            was previously hard-coded, so existing callers are unaffected.

    Returns:
        The ``embeddings`` attribute of the Cohere response, or ``[]`` when
        *text_list* is empty.
    """
    if not text_list:
        return []

    if model_type == 'english':
        model = 'embed-english-v3.0'
    else:
        model = 'embed-multilingual-v3.0'

    response = co.embed(model=model, texts=text_list, input_type=input_type)
    return response.embeddings
|
97 |
-
|
98 |
-
def calculate_relevancy_scores(df, model_type):
    """Add a ``relevancy_score`` column comparing each query with its page.

    For every row the page text is fetched and embedded, the query is
    embedded, and the score is the cosine similarity between the two (the
    diagonal of the query-by-page similarity matrix).

    Args:
        df: DataFrame with ``page`` and ``query`` columns.
        model_type: Embedding model selector passed to
            ``generate_embeddings``.

    Returns:
        A new DataFrame with ``relevancy_score`` added. If any step fails, a
        warning is shown in the UI and every score is set to 0.
    """
    try:
        page_texts = list(map(fetch_content, df['page']))
        doc_vectors = generate_embeddings(page_texts, model_type)
        query_vectors = generate_embeddings(df['query'].tolist(), model_type)
        # Row i of the matrix is query i against every page; only the
        # matching pair (i, i) is wanted.
        similarity = cosine_similarity(query_vectors, doc_vectors)
        scored = df.assign(relevancy_score=similarity.diagonal())
    except Exception as e:
        st.warning(f"Error calculating relevancy scores: {e}")
        scored = df.assign(relevancy_score=0)
    return scored
|
109 |
-
|
110 |
-
def process_gsc_data(df):
    """Reduce Search Console rows to the top-impression query per page.

    Steps:
      1. Keep rows whose ``position`` is greater than 10.
      2. Sort by ``impressions``, highest first.
      3. Keep the first (highest-impression) row for each ``page``.

    Args:
        df: DataFrame with at least ``page``, ``query``, ``clicks``,
            ``impressions``, ``ctr`` and ``position`` columns. A
            ``relevancy_score`` column is optional.

    Returns:
        A DataFrame with columns ``page``, ``query``, ``clicks``,
        ``impressions``, ``ctr``, ``position`` and ``relevancy_score``
        (filled with 0 when the input has no such column).
    """
    below_top_ten = df[df['position'] > 10]
    ranked = below_top_ten.sort_values(['impressions'], ascending=[False])

    # Copy so the column assignment below writes to an independent frame.
    top_per_page = ranked.drop_duplicates(subset='page', keep='first').copy()

    if 'relevancy_score' not in top_per_page.columns:
        top_per_page['relevancy_score'] = 0
    else:
        # groupby() orders its result by page name, whereas top_per_page is
        # ordered by impressions. Assigning the grouped values positionally
        # attached scores to the wrong pages; look each one up by page.
        first_scores = ranked.groupby('page')['relevancy_score'].first()
        top_per_page['relevancy_score'] = top_per_page['page'].map(first_scores)

    return top_per_page[
        ['page', 'query', 'clicks', 'impressions', 'ctr', 'position', 'relevancy_score']
    ]
|
127 |
-
|
128 |
-
# -------------
|
129 |
-
# Google Authentication Functions
|
130 |
-
# -------------
|
131 |
-
|
132 |
-
def load_config():
    """Build the Google OAuth client configuration.

    The client id and secret are read from the ``CLIENT_ID`` and
    ``CLIENT_SECRET`` environment variables; the endpoints and the redirect
    URI are fixed.

    Returns:
        A dict with a single ``"web"`` entry holding the client settings.

    Raises:
        KeyError: If either environment variable is missing.
    """
    web_settings = {
        "client_id": os.environ["CLIENT_ID"],
        "client_secret": os.environ["CLIENT_SECRET"],
        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
        "token_uri": "https://oauth2.googleapis.com/token",
        "redirect_uris": ["https://poemsforaphrodite-gscpro.hf.space/"],
    }
    return {"web": web_settings}
|
143 |
-
|
144 |
-
def init_oauth_flow(client_config):
    """Create an OAuth ``Flow`` for the read-only webmasters scope.

    Args:
        client_config: Client configuration dict; its first
            ``["web"]["redirect_uris"]`` entry is used as the redirect URI.

    Returns:
        The ``Flow`` built by ``Flow.from_client_config``.
    """
    redirect_uri = client_config["web"]["redirect_uris"][0]
    return Flow.from_client_config(
        client_config,
        scopes=["https://www.googleapis.com/auth/webmasters.readonly"],
        redirect_uri=redirect_uri,
    )
|
152 |
-
|
153 |
-
def google_auth(client_config):
    """Start an OAuth flow and return it with its authorization URL.

    Args:
        client_config: Client configuration dict passed to
            ``init_oauth_flow``.

    Returns:
        A ``(flow, auth_url)`` tuple. The second value returned by
        ``flow.authorization_url`` is discarded.
    """
    oauth_flow = init_oauth_flow(client_config)
    url, _unused = oauth_flow.authorization_url(prompt="consent")
    return oauth_flow, url
|
157 |
-
|
158 |
-
def auth_search_console(client_config, credentials):
    """Authenticate with the ``searchconsole`` library using OAuth credentials.

    The relevant attributes of *credentials* are copied into a plain dict,
    which is passed to ``searchconsole.authenticate`` together with
    *client_config*.

    Args:
        client_config: Client configuration dict.
        credentials: Object exposing ``token``, ``refresh_token``,
            ``token_uri``, ``client_id``, ``client_secret`` and ``scopes``;
            ``id_token`` is optional and defaults to ``None``.

    Returns:
        Whatever ``searchconsole.authenticate`` returns.
    """
    required_fields = (
        "token",
        "refresh_token",
        "token_uri",
        "client_id",
        "client_secret",
        "scopes",
    )
    token = {name: getattr(credentials, name) for name in required_fields}
    # id_token may be absent on the credentials object.
    token["id_token"] = getattr(credentials, "id_token", None)
    return searchconsole.authenticate(client_config=client_config, credentials=token)
|
169 |
-
|
170 |
-
# -------------
|
171 |
-
# Data Fetching Functions
|
172 |
-
# -------------
|
173 |
-
|
174 |
-
def list_gsc_properties(credentials):
    """List the site URLs returned by the ``webmasters`` v3 sites endpoint.

    Args:
        credentials: Credentials passed to ``build``.

    Returns:
        A list of ``siteUrl`` strings, or ``["No properties found"]`` when
        the response has no ``siteEntry`` items.
    """
    service = build('webmasters', 'v3', credentials=credentials)
    response = service.sites().list().execute()
    site_urls = [entry['siteUrl'] for entry in response.get('siteEntry', [])]
    if not site_urls:
        return ["No properties found"]
    return site_urls
|
178 |
-
|
179 |
-
def fetch_gsc_data(webproperty, search_type, start_date, end_date, dimensions, device_type=None):
    """Query Search Console and return the processed result.

    Args:
        webproperty: Object whose ``query`` attribute starts the query chain.
        search_type: Search type passed to the query.
        start_date: First day of the range.
        end_date: Last day of the range.
        dimensions: Dimensions to request.
        device_type: Optional device filter. It is applied only when
            ``'device'`` is among *dimensions* and the value is neither
            empty nor ``'All Devices'``.

    Returns:
        The output of ``process_gsc_data``; an empty ``pd.DataFrame()`` if
        anything raises, in which case the error is also shown in the UI.
    """
    query = webproperty.query.range(start_date, end_date)
    query = query.search_type(search_type)
    query = query.dimension(*dimensions)

    use_device_filter = 'device' in dimensions and device_type and device_type != 'All Devices'
    if use_device_filter:
        query = query.filter('device', 'equals', device_type.lower())

    try:
        raw = query.limit(MAX_ROWS).get().to_dataframe()
        return process_gsc_data(raw)
    except Exception as e:
        show_error(e)
        return pd.DataFrame()
|
189 |
-
|
190 |
-
def fetch_data_loading(webproperty, search_type, start_date, end_date, dimensions, device_type=None, model_type='english'):
    """Fetch Search Console data and score it, showing a spinner meanwhile.

    Args:
        webproperty: Search Console property to query.
        search_type: Search type passed to the query.
        start_date: First day of the range.
        end_date: Last day of the range.
        dimensions: Dimensions to request.
        device_type: Optional device filter.
        model_type: Embedding model selector for the relevancy scores.

    Returns:
        The processed DataFrame with relevancy scores, or the empty
        DataFrame from the fetch step when there is nothing to score.
    """
    with st.spinner('Fetching data and calculating relevancy scores...'):
        df = fetch_gsc_data(webproperty, search_type, start_date, end_date, dimensions, device_type)
        if df.empty:
            # fetch_gsc_data returns a column-less pd.DataFrame() on error;
            # passing that to process_gsc_data raised KeyError on 'position'.
            return df
        df = calculate_relevancy_scores(df, model_type)
        return process_gsc_data(df)
|
197 |
-
|
198 |
-
# -------------
|
199 |
-
# Utility Functions
|
200 |
-
# -------------
|
201 |
-
|
202 |
-
def update_dimensions(selected_search_type):
    """Return the dimensions selectable for *selected_search_type*.

    Known search types (those in ``SEARCH_TYPES``) get the base dimensions
    plus ``'device'``; anything else gets ``BASE_DIMENSIONS`` itself.
    """
    if selected_search_type in SEARCH_TYPES:
        return BASE_DIMENSIONS + ['device']
    return BASE_DIMENSIONS
|
204 |
-
|
205 |
-
def calc_date_range(selection, custom_start=None, custom_end=None):
    """Translate a date-range label into a ``(start, end)`` pair.

    Args:
        selection: One of the preset labels or ``'Custom Range'``.
        custom_start: Start used for ``'Custom Range'``.
        custom_end: End used for ``'Custom Range'``.

    Returns:
        For ``'Custom Range'``: ``(custom_start, custom_end)`` when both are
        truthy, otherwise the last 7 days. For a preset label: today minus
        the label's day count, and today. An unknown label maps to 0 days,
        i.e. ``(today, today)``.
    """
    today = datetime.date.today()

    if selection == 'Custom Range':
        if not (custom_start and custom_end):
            return today - datetime.timedelta(days=7), today
        return custom_start, custom_end

    days_by_label = {
        'Last 7 Days': 7,
        'Last 30 Days': 30,
        'Last 3 Months': 90,
        'Last 6 Months': 180,
        'Last 12 Months': 365,
        'Last 16 Months': 480,
    }
    days_back = days_by_label.get(selection, 0)
    return today - datetime.timedelta(days=days_back), today
|
221 |
-
|
222 |
-
def show_error(e):
    """Display *e* in the Streamlit UI as an error message."""
    message = f"An error occurred: {e}"
    st.error(message)
|
224 |
-
|
225 |
-
def property_change():
    """Copy the property selectbox value into ``selected_property``.

    Used as the ``on_change`` callback of the property selector.
    """
    chosen = st.session_state['selected_property_selector']
    st.session_state.selected_property = chosen
|
227 |
-
|
228 |
-
# -------------
|
229 |
-
# File & Download Operations
|
230 |
-
# -------------
|
231 |
-
|
232 |
-
def show_dataframe(report):
    """Show the first ``DF_PREVIEW_ROWS`` rows of *report* in an expander."""
    preview_panel = st.expander("Preview the First 100 Rows (Unique Pages with Top Query)")
    with preview_panel:
        st.dataframe(report.head(DF_PREVIEW_ROWS))
|
235 |
-
|
236 |
-
def download_csv_link(report):
    """Render a link that downloads *report* as ``search_console_data.csv``.

    The CSV is embedded in the link as a base64 ``data:`` URI.

    Args:
        report: DataFrame to export (written without the index).
    """
    csv_text = report.to_csv(index=False)
    # to_csv returns a str when no path is given, so its ``encoding``
    # argument had no effect and the intended UTF-8 BOM was never written.
    # Encoding the text as utf-8-sig here actually adds the BOM.
    b64_csv = base64.b64encode(csv_text.encode('utf-8-sig')).decode()
    href = f'<a href="data:file/csv;base64,{b64_csv}" download="search_console_data.csv">Download CSV File</a>'
    st.markdown(href, unsafe_allow_html=True)
|
243 |
-
|
244 |
-
# -------------
|
245 |
-
# Streamlit UI Components
|
246 |
-
# -------------
|
247 |
-
|
248 |
-
def show_google_sign_in(auth_url):
    """Render the sidebar sign-in button and, once clicked, the sign-in link.

    Args:
        auth_url: Authorization URL that the link points to.
    """
    with st.sidebar:
        clicked = st.button("Sign in with Google")
        if not clicked:
            return
        st.write('Please click the link below to sign in:')
        st.markdown(f'[Google Sign-In]({auth_url})', unsafe_allow_html=True)
|
253 |
-
|
254 |
-
def show_property_selector(properties, account):
    """Render the property dropdown and return the chosen web property.

    The dropdown is preselected with ``st.session_state.selected_property``
    when that value is in *properties*, otherwise with the first entry.

    Args:
        properties: List of property names to offer.
        account: Mapping-like object indexed by property name.

    Returns:
        ``account[selected_property]`` for the chosen entry.
    """
    remembered = st.session_state.selected_property
    if remembered in properties:
        default_index = properties.index(remembered)
    else:
        default_index = 0

    selected_property = st.selectbox(
        "Select a Search Console Property:",
        properties,
        index=default_index,
        key='selected_property_selector',
        on_change=property_change,
    )
    return account[selected_property]
|
264 |
-
|
265 |
-
def show_search_type_selector():
    """Render the search-type dropdown and return the selected value.

    The initial selection is ``st.session_state.selected_search_type``.
    """
    default_index = SEARCH_TYPES.index(st.session_state.selected_search_type)
    choice = st.selectbox(
        "Select Search Type:",
        SEARCH_TYPES,
        index=default_index,
        key='search_type_selector',
    )
    return choice
|
272 |
-
|
273 |
-
def show_model_type_selector():
    """Render the embedding-model dropdown and return the selected value."""
    model_choices = ["english", "multilingual"]
    choice = st.selectbox(
        "Select the embedding model:",
        model_choices,
        key='model_type_selector',
    )
    return choice
|
279 |
-
|
280 |
-
def show_date_range_selector():
    """Render the date-range dropdown and return the selected label.

    The initial selection is ``st.session_state.selected_date_range``.
    """
    default_index = DATE_RANGE_OPTIONS.index(st.session_state.selected_date_range)
    choice = st.selectbox(
        "Select Date Range:",
        DATE_RANGE_OPTIONS,
        index=default_index,
        key='date_range_selector',
    )
    return choice
|
287 |
-
|
288 |
-
def show_custom_date_inputs():
    """Render start/end date pickers and store the picks in session state.

    Each picker starts from the value currently held in
    ``custom_start_date`` / ``custom_end_date``.
    """
    state = st.session_state
    state.custom_start_date = st.date_input("Start Date", state.custom_start_date)
    state.custom_end_date = st.date_input("End Date", state.custom_end_date)
|
291 |
-
|
292 |
-
def show_dimensions_selector(search_type):
    """Render the dimensions multiselect and return the selected list.

    Args:
        search_type: Search type used to determine the available dimensions
            via ``update_dimensions``.
    """
    options = update_dimensions(search_type)
    preselected = st.session_state.selected_dimensions
    chosen = st.multiselect(
        "Select Dimensions:",
        options,
        default=preselected,
        key='dimensions_selector',
    )
    return chosen
|
300 |
-
|
301 |
-
def show_paginated_dataframe(report, rows_per_page=20):
    """Render *report* as a paginated HTML table with clickable page URLs.

    The table shows the URL (as a link), query, impressions, clicks, CTR,
    integer position and relevancy score, sorted by impressions descending.
    The current page number is kept in ``st.session_state.current_page``
    and changed with Previous/Next buttons.

    Args:
        report: DataFrame with ``page``, ``query``, ``impressions``,
            ``clicks``, ``ctr``, ``position`` and ``relevancy_score``
            columns. It is not modified.
        rows_per_page: Number of rows per page.
    """
    import html  # local import: not part of the file's import block

    # Work on a copy: the caller keeps this frame in session state, and the
    # extra HTML column and dtype change should not leak back into it.
    report = report.copy()

    # Convert 'position' column to integer
    report['position'] = report['position'].astype(int)

    # The table is rendered as raw HTML, so every piece of text that did not
    # originate in this code must be escaped before it is embedded.
    def make_clickable(url):
        safe_url = html.escape(str(url), quote=True)
        return f'<a href="{safe_url}" target="_blank">{safe_url}</a>'

    report['clickable_url'] = report['page'].apply(make_clickable)
    report['query'] = report['query'].map(lambda q: html.escape(str(q)))

    # Put clickable_url first and sort by impressions
    columns = ['clickable_url', 'query', 'impressions', 'clicks', 'ctr', 'position', 'relevancy_score']
    report = report[columns].sort_values('impressions', ascending=False)

    total_rows = len(report)
    # At least one page, so an empty report shows "Page 1 of 1".
    total_pages = max(1, (total_rows - 1) // rows_per_page + 1)

    if 'current_page' not in st.session_state:
        st.session_state.current_page = 1
    # A page number left over from a larger result set may be out of range.
    st.session_state.current_page = min(max(st.session_state.current_page, 1), total_pages)

    col1, col2, col3 = st.columns([1, 3, 1])
    with col1:
        if st.button("Previous", disabled=st.session_state.current_page == 1):
            st.session_state.current_page -= 1
    with col2:
        st.write(f"Page {st.session_state.current_page} of {total_pages}")
    with col3:
        if st.button("Next", disabled=st.session_state.current_page == total_pages):
            st.session_state.current_page += 1

    start_idx = (st.session_state.current_page - 1) * rows_per_page
    end_idx = start_idx + rows_per_page

    # escape=False keeps the anchor tags built above; cell text was escaped
    # manually for the same reason.
    table_html = report.iloc[start_idx:end_idx].to_html(escape=False, index=False)
    st.markdown(table_html, unsafe_allow_html=True)
|
336 |
-
# -------------
|
337 |
-
# Main Streamlit App Function
|
338 |
-
# -------------
|
339 |
-
|
340 |
def main():
|
341 |
setup_streamlit()
|
342 |
client_config = load_config()
|
@@ -345,7 +6,7 @@ def main():
|
|
345 |
st.session_state.auth_flow, st.session_state.auth_url = google_auth(client_config)
|
346 |
|
347 |
# Retrieve query parameters
|
348 |
-
query_params = st.
|
349 |
print("Query Parameters:", query_params)
|
350 |
|
351 |
# Display the query parameters in the Streamlit app for debugging
|
@@ -377,7 +38,7 @@ def main():
|
|
377 |
webproperty = show_property_selector(properties, account)
|
378 |
search_type = show_search_type_selector()
|
379 |
date_range_selection = show_date_range_selector()
|
380 |
-
model_type = show_model_type_selector()
|
381 |
if date_range_selection == 'Custom Range':
|
382 |
show_custom_date_inputs()
|
383 |
start_date, end_date = st.session_state.custom_start_date, st.session_state.custom_end_date
|
@@ -391,7 +52,7 @@ def main():
|
|
391 |
|
392 |
if st.button("Fetch Data"):
|
393 |
with st.spinner('Fetching data...'):
|
394 |
-
st.session_state.report_data = fetch_data_loading(webproperty, search_type, start_date, end_date, selected_dimensions, model_type=model_type)
|
395 |
|
396 |
if st.session_state.report_data is not None and not st.session_state.report_data.empty:
|
397 |
show_paginated_dataframe(st.session_state.report_data)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
def main():
|
2 |
setup_streamlit()
|
3 |
client_config = load_config()
|
|
|
6 |
st.session_state.auth_flow, st.session_state.auth_url = google_auth(client_config)
|
7 |
|
8 |
# Retrieve query parameters
|
9 |
+
query_params = st.query_params
|
10 |
print("Query Parameters:", query_params)
|
11 |
|
12 |
# Display the query parameters in the Streamlit app for debugging
|
|
|
38 |
webproperty = show_property_selector(properties, account)
|
39 |
search_type = show_search_type_selector()
|
40 |
date_range_selection = show_date_range_selector()
|
41 |
+
model_type = show_model_type_selector()
|
42 |
if date_range_selection == 'Custom Range':
|
43 |
show_custom_date_inputs()
|
44 |
start_date, end_date = st.session_state.custom_start_date, st.session_state.custom_end_date
|
|
|
52 |
|
53 |
if st.button("Fetch Data"):
|
54 |
with st.spinner('Fetching data...'):
|
55 |
+
st.session_state.report_data = fetch_data_loading(webproperty, search_type, start_date, end_date, selected_dimensions, model_type=model_type)
|
56 |
|
57 |
if st.session_state.report_data is not None and not st.session_state.report_data.empty:
|
58 |
show_paginated_dataframe(st.session_state.report_data)
|