Spaces:
Running
Running
poemsforaphrodite
committed on
Commit
•
3a8e960
1
Parent(s):
3dc03f5
Update app.py
Browse files
app.py
CHANGED
@@ -2,8 +2,6 @@
|
|
2 |
import datetime
|
3 |
import base64
|
4 |
import os
|
5 |
-
import sys
|
6 |
-
import json
|
7 |
|
8 |
# Related third-party imports
|
9 |
import streamlit as st
|
@@ -16,20 +14,11 @@ import searchconsole
|
|
16 |
import cohere
|
17 |
from sklearn.metrics.pairwise import cosine_similarity
|
18 |
import requests
|
19 |
-
import logging
|
20 |
from bs4 import BeautifulSoup
|
21 |
|
22 |
load_dotenv()
|
23 |
-
#
|
24 |
-
|
25 |
-
level=logging.INFO,
|
26 |
-
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
27 |
-
stream=sys.stdout # This will ensure the logs are captured by Hugging Face
|
28 |
-
)
|
29 |
-
logger = logging.getLogger(__name__)
|
30 |
-
|
31 |
-
# Explicitly set Streamlit's logg
|
32 |
-
st.set_option('deprecation.showfileUploaderEncoding', False)
|
33 |
# Initialize Cohere client
|
34 |
COHERE_API_KEY = os.environ["COHERE_API_KEY"]
|
35 |
co = cohere.Client(COHERE_API_KEY)
|
@@ -105,20 +94,33 @@ def generate_embeddings(text_list, model_type):
|
|
105 |
embeddings = response.embeddings
|
106 |
return embeddings
|
107 |
|
108 |
-
def
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
|
114 |
def process_gsc_data(df):
|
|
|
115 |
df_sorted = df.sort_values(['impressions'], ascending=[False])
|
|
|
|
|
116 |
df_unique = df_sorted.drop_duplicates(subset='page', keep='first')
|
117 |
-
|
118 |
-
|
|
|
|
|
|
|
|
|
|
|
119 |
return result
|
120 |
|
121 |
-
|
122 |
# -------------
|
123 |
# Google Authentication Functions
|
124 |
# -------------
|
@@ -298,40 +300,59 @@ def show_dimensions_selector(search_type):
|
|
298 |
key='dimensions_selector'
|
299 |
)
|
300 |
|
301 |
-
|
302 |
-
|
303 |
-
def show_paginated_dataframe(report, rows_per_page=20, model_type='english'):
|
304 |
-
logger.info("Displaying paginated dataframe")
|
305 |
-
|
306 |
-
# Check if required columns are present
|
307 |
-
required_columns = ['page', 'query', 'clicks', 'impressions', 'ctr', 'position']
|
308 |
-
missing_columns = [col for col in required_columns if col not in report.columns]
|
309 |
-
|
310 |
-
if missing_columns:
|
311 |
-
st.error(f"Error: The following required columns are missing from the data: {', '.join(missing_columns)}")
|
312 |
-
return report
|
313 |
-
|
314 |
report['position'] = report['position'].astype(int)
|
315 |
report['impressions'] = pd.to_numeric(report['impressions'], errors='coerce')
|
316 |
|
|
|
317 |
def format_ctr(x):
|
318 |
try:
|
319 |
return f"{float(x):.2%}"
|
320 |
except ValueError:
|
321 |
-
return x
|
|
|
|
|
|
|
|
|
|
|
|
|
322 |
|
323 |
report['ctr'] = report['ctr'].apply(format_ctr)
|
324 |
-
|
325 |
-
|
|
|
|
|
|
|
326 |
|
327 |
-
|
|
|
|
|
|
|
328 |
report = report[columns]
|
329 |
|
330 |
-
|
|
|
331 |
sort_order = st.radio("Sort order:", ("Descending", "Ascending"))
|
332 |
|
333 |
ascending = sort_order == "Ascending"
|
334 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
335 |
|
336 |
total_rows = len(report)
|
337 |
total_pages = (total_rows - 1) // rows_per_page + 1
|
@@ -352,72 +373,17 @@ def show_paginated_dataframe(report, rows_per_page=20, model_type='english'):
|
|
352 |
start_idx = (st.session_state.current_page - 1) * rows_per_page
|
353 |
end_idx = start_idx + rows_per_page
|
354 |
|
355 |
-
|
356 |
-
|
357 |
-
# Display column headers
|
358 |
-
col1, col2, col3, col4, col5, col6, col7, col8 = st.columns([3, 2, 1, 1, 1, 1, 1, 1])
|
359 |
-
col1.write("**Page**")
|
360 |
-
col2.write("**Query**")
|
361 |
-
col3.write("**Impressions**")
|
362 |
-
col4.write("**Clicks**")
|
363 |
-
col5.write("**CTR**")
|
364 |
-
col6.write("**Position**")
|
365 |
-
col7.write("**Relevancy Score**")
|
366 |
-
col8.write("**Action**")
|
367 |
-
|
368 |
-
# Display data rows
|
369 |
-
for idx, row in page_data.iterrows():
|
370 |
-
col1, col2, col3, col4, col5, col6, col7, col8 = st.columns([3, 2, 1, 1, 1, 1, 1, 1])
|
371 |
-
with col1:
|
372 |
-
st.write(f"[{row['page']}]({row['page']})")
|
373 |
-
with col2:
|
374 |
-
st.write(row['query'])
|
375 |
-
with col3:
|
376 |
-
st.write(row['impressions'])
|
377 |
-
with col4:
|
378 |
-
st.write(row['clicks'])
|
379 |
-
with col5:
|
380 |
-
st.write(row['ctr'])
|
381 |
-
with col6:
|
382 |
-
st.write(row['position'])
|
383 |
-
with col7:
|
384 |
-
st.write(row['relevancy_score'] if row['relevancy_score'] is not None else "N/A")
|
385 |
-
with col8:
|
386 |
-
if st.button("Calculate", key=f"calc_{idx}"):
|
387 |
-
logger.info(f"Calculating relevancy for row index: {start_idx + idx}")
|
388 |
-
try:
|
389 |
-
page_content = fetch_content(row['page'])
|
390 |
-
logger.info(f"Fetched content for {row['page']}: {page_content[:100]}...") # Log the first 100 characters
|
391 |
-
query = row['query']
|
392 |
-
relevancy_score = calculate_single_relevancy_score(page_content, query, model_type)
|
393 |
-
logger.info(f"Relevancy score calculated: {relevancy_score}")
|
394 |
-
report.at[start_idx + idx, 'relevancy_score'] = f"{relevancy_score:.2f}"
|
395 |
-
st.success(f"Relevancy score calculated for row {start_idx + idx + 1}")
|
396 |
-
st.experimental_rerun()
|
397 |
-
except Exception as e:
|
398 |
-
logger.error(f"Error calculating relevancy score: {str(e)}")
|
399 |
-
logger.error(f"Error details: {type(e).__name__}, {str(e)}")
|
400 |
-
st.error(f"Error calculating relevancy score: {str(e)}")
|
401 |
-
if isinstance(e, requests.exceptions.RequestException):
|
402 |
-
st.error(f"Error fetching content from {row['page']}. Please check if the URL is accessible.")
|
403 |
-
elif isinstance(e, json.JSONDecodeError):
|
404 |
-
st.error("Error parsing JSON response. The content might not be in the expected format.")
|
405 |
-
|
406 |
-
return report
|
407 |
-
|
408 |
-
# Make sure to import json at the top of your file
|
409 |
# -------------
|
410 |
# Main Streamlit App Function
|
411 |
# -------------
|
412 |
|
413 |
-
|
414 |
def main():
|
415 |
-
logger.info("Starting the Streamlit app")
|
416 |
setup_streamlit()
|
417 |
client_config = load_config()
|
418 |
|
419 |
if 'auth_flow' not in st.session_state or 'auth_url' not in st.session_state:
|
420 |
-
logger.info("Initializing Google auth flow")
|
421 |
st.session_state.auth_flow, st.session_state.auth_url = google_auth(client_config)
|
422 |
|
423 |
# Directly access query parameters using st.query_params
|
@@ -426,27 +392,23 @@ def main():
|
|
426 |
# Retrieve the 'code' parameter
|
427 |
auth_code = query_params.get("code", None)
|
428 |
|
|
|
429 |
if auth_code and 'credentials' not in st.session_state:
|
430 |
-
logger.info("Fetching token with auth code")
|
431 |
st.session_state.auth_flow.fetch_token(code=auth_code)
|
432 |
st.session_state.credentials = st.session_state.auth_flow.credentials
|
433 |
-
logger.info("Credentials stored in session state")
|
434 |
|
435 |
if 'credentials' not in st.session_state:
|
436 |
-
logger.info("No credentials found, showing Google sign-in")
|
437 |
show_google_sign_in(st.session_state.auth_url)
|
438 |
else:
|
439 |
-
logger.info("Credentials found, initializing session state")
|
440 |
init_session_state()
|
441 |
account = auth_search_console(client_config, st.session_state.credentials)
|
442 |
properties = list_gsc_properties(st.session_state.credentials)
|
443 |
|
444 |
if properties:
|
445 |
-
logger.info(f"Found {len(properties)} properties")
|
446 |
webproperty = show_property_selector(properties, account)
|
447 |
search_type = show_search_type_selector()
|
448 |
date_range_selection = show_date_range_selector()
|
449 |
-
model_type = show_model_type_selector()
|
450 |
if date_range_selection == 'Custom Range':
|
451 |
show_custom_date_inputs()
|
452 |
start_date, end_date = st.session_state.custom_start_date, st.session_state.custom_end_date
|
@@ -460,22 +422,18 @@ def main():
|
|
460 |
|
461 |
if st.button("Fetch Data"):
|
462 |
with st.spinner('Fetching data...'):
|
463 |
-
logger.info(f"Fetching GSC data for {webproperty} from {start_date} to {end_date}")
|
464 |
st.session_state.report_data = fetch_gsc_data(webproperty, search_type, start_date, end_date, selected_dimensions)
|
465 |
-
logger.info(f"Data fetched: {len(st.session_state.report_data)} rows")
|
466 |
|
467 |
if st.session_state.report_data is not None and not st.session_state.report_data.empty:
|
468 |
-
|
469 |
-
|
470 |
-
st.
|
|
|
|
|
471 |
download_csv_link(st.session_state.report_data)
|
472 |
elif st.session_state.report_data is not None:
|
473 |
-
logger.warning("No data found for the selected criteria")
|
474 |
st.warning("No data found for the selected criteria.")
|
475 |
-
else:
|
476 |
-
logger.warning("No properties found for the account")
|
477 |
-
st.warning("No properties found for your Google Search Console account.")
|
478 |
|
|
|
479 |
if __name__ == "__main__":
|
480 |
-
logger.info("Application started")
|
481 |
main()
|
|
|
2 |
import datetime
|
3 |
import base64
|
4 |
import os
|
|
|
|
|
5 |
|
6 |
# Related third-party imports
|
7 |
import streamlit as st
|
|
|
14 |
import cohere
|
15 |
from sklearn.metrics.pairwise import cosine_similarity
|
16 |
import requests
|
|
|
17 |
from bs4 import BeautifulSoup
|
18 |
|
19 |
load_dotenv()
|
20 |
+
#test
|
21 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
# Initialize Cohere client
|
23 |
COHERE_API_KEY = os.environ["COHERE_API_KEY"]
|
24 |
co = cohere.Client(COHERE_API_KEY)
|
|
|
94 |
embeddings = response.embeddings
|
95 |
return embeddings
|
96 |
|
97 |
+
def calculate_relevancy_scores(df, model_type):
    """Attach a ``relevancy_score`` column to *df*.

    Fetches each page's content, embeds both the page contents and the
    queries with the selected model, and scores each (query, page) pair
    by the diagonal of their cosine-similarity matrix. On any failure a
    Streamlit warning is shown and every score falls back to 0.

    Returns the DataFrame with the new column (original *df* is not
    mutated; ``assign`` produces a copy).
    """
    try:
        contents = []
        for url in df['page']:
            contents.append(fetch_content(url))
        content_vecs = generate_embeddings(contents, model_type)
        query_vecs = generate_embeddings(list(df['query']), model_type)
        # Row i of the similarity matrix pairs query i with page i,
        # so the diagonal holds the per-row scores.
        scores = cosine_similarity(query_vecs, content_vecs).diagonal()
        df = df.assign(relevancy_score=scores)
    except Exception as e:
        # Best-effort: scoring is optional, so report and default to 0
        # rather than aborting the whole report.
        st.warning(f"Error calculating relevancy scores: {e}")
        df = df.assign(relevancy_score=0)
    return df
|
108 |
|
109 |
def process_gsc_data(df):
    """Reduce raw GSC rows to one top row per page.

    Sorts by impressions (descending) and keeps only the highest-impression
    query for each page. Ensures a ``relevancy_score`` column exists
    (defaulting to 0 when absent) and returns the fixed column set:
    page, query, clicks, impressions, ctr, position, relevancy_score.
    """
    # No position filter here — all queries are considered.
    df_sorted = df.sort_values(['impressions'], ascending=[False])

    # Keep only the highest-impression query for each page.
    df_unique = df_sorted.drop_duplicates(subset='page', keep='first').copy()

    if 'relevancy_score' not in df_unique.columns:
        df_unique['relevancy_score'] = 0
    else:
        # Take the first (highest-impression) score per page, aligned by
        # page key. NOTE: the previous positional assignment of
        # ``groupby(...).first().values`` silently misaligned scores
        # whenever impression order differed from alphabetical page
        # order, because groupby sorts by key while df_unique keeps
        # impression order. ``map`` aligns by page instead.
        first_score_by_page = df_sorted.groupby('page')['relevancy_score'].first()
        df_unique['relevancy_score'] = df_unique['page'].map(first_score_by_page)

    result = df_unique[['page', 'query', 'clicks', 'impressions', 'ctr', 'position', 'relevancy_score']]
    return result
|
123 |
|
|
|
124 |
# -------------
|
125 |
# Google Authentication Functions
|
126 |
# -------------
|
|
|
300 |
key='dimensions_selector'
|
301 |
)
|
302 |
|
303 |
+
def show_paginated_dataframe(report, rows_per_page=20):
|
304 |
+
# Convert 'position' column to integer and 'impressions' to numeric
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
305 |
report['position'] = report['position'].astype(int)
|
306 |
report['impressions'] = pd.to_numeric(report['impressions'], errors='coerce')
|
307 |
|
308 |
+
# Format CTR as percentage and relevancy_score with two decimal places
|
309 |
def format_ctr(x):
|
310 |
try:
|
311 |
return f"{float(x):.2%}"
|
312 |
except ValueError:
|
313 |
+
return x # Return the original value if it can't be converted to float
|
314 |
+
|
315 |
+
def format_relevancy_score(x):
|
316 |
+
try:
|
317 |
+
return f"{float(x):.2f}"
|
318 |
+
except ValueError:
|
319 |
+
return x # Return the original value if it can't be converted to float
|
320 |
|
321 |
report['ctr'] = report['ctr'].apply(format_ctr)
|
322 |
+
report['relevancy_score'] = report['relevancy_score'].apply(format_relevancy_score)
|
323 |
+
|
324 |
+
# Create a clickable URL column
|
325 |
+
def make_clickable(url):
|
326 |
+
return f'<a href="{url}" target="_blank">{url}</a>'
|
327 |
|
328 |
+
report['clickable_url'] = report['page'].apply(make_clickable)
|
329 |
+
|
330 |
+
# Reorder columns to put clickable_url first
|
331 |
+
columns = ['clickable_url', 'query', 'impressions', 'clicks', 'ctr', 'position', 'relevancy_score']
|
332 |
report = report[columns]
|
333 |
|
334 |
+
# Add sorting functionality
|
335 |
+
sort_column = st.selectbox("Sort by:", columns[1:], index=columns[1:].index('impressions')) # Set 'impressions' as default
|
336 |
sort_order = st.radio("Sort order:", ("Descending", "Ascending"))
|
337 |
|
338 |
ascending = sort_order == "Ascending"
|
339 |
+
|
340 |
+
# Convert back to numeric for sorting
|
341 |
+
def safe_float_convert(x):
|
342 |
+
try:
|
343 |
+
return float(x.rstrip('%')) / 100 if isinstance(x, str) and x.endswith('%') else float(x)
|
344 |
+
except ValueError:
|
345 |
+
return 0 # Return 0 or another default value if conversion fails
|
346 |
+
|
347 |
+
report['ctr_numeric'] = report['ctr'].apply(safe_float_convert)
|
348 |
+
report['relevancy_score_numeric'] = report['relevancy_score'].apply(safe_float_convert)
|
349 |
+
|
350 |
+
# Sort using the numeric columns
|
351 |
+
sort_column_numeric = sort_column + '_numeric' if sort_column in ['ctr', 'relevancy_score'] else sort_column
|
352 |
+
report = report.sort_values(by=sort_column_numeric, ascending=ascending)
|
353 |
+
|
354 |
+
# Remove the temporary numeric columns
|
355 |
+
report = report.drop(columns=['ctr_numeric', 'relevancy_score_numeric'])
|
356 |
|
357 |
total_rows = len(report)
|
358 |
total_pages = (total_rows - 1) // rows_per_page + 1
|
|
|
373 |
start_idx = (st.session_state.current_page - 1) * rows_per_page
|
374 |
end_idx = start_idx + rows_per_page
|
375 |
|
376 |
+
# Use st.markdown to display the dataframe with clickable links
|
377 |
+
st.markdown(report.iloc[start_idx:end_idx].to_html(escape=False, index=False), unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
378 |
# -------------
|
379 |
# Main Streamlit App Function
|
380 |
# -------------
|
381 |
|
|
|
382 |
def main():
|
|
|
383 |
setup_streamlit()
|
384 |
client_config = load_config()
|
385 |
|
386 |
if 'auth_flow' not in st.session_state or 'auth_url' not in st.session_state:
|
|
|
387 |
st.session_state.auth_flow, st.session_state.auth_url = google_auth(client_config)
|
388 |
|
389 |
# Directly access query parameters using st.query_params
|
|
|
392 |
# Retrieve the 'code' parameter
|
393 |
auth_code = query_params.get("code", None)
|
394 |
|
395 |
+
|
396 |
if auth_code and 'credentials' not in st.session_state:
|
|
|
397 |
st.session_state.auth_flow.fetch_token(code=auth_code)
|
398 |
st.session_state.credentials = st.session_state.auth_flow.credentials
|
|
|
399 |
|
400 |
if 'credentials' not in st.session_state:
|
|
|
401 |
show_google_sign_in(st.session_state.auth_url)
|
402 |
else:
|
|
|
403 |
init_session_state()
|
404 |
account = auth_search_console(client_config, st.session_state.credentials)
|
405 |
properties = list_gsc_properties(st.session_state.credentials)
|
406 |
|
407 |
if properties:
|
|
|
408 |
webproperty = show_property_selector(properties, account)
|
409 |
search_type = show_search_type_selector()
|
410 |
date_range_selection = show_date_range_selector()
|
411 |
+
model_type = show_model_type_selector() # Add this line
|
412 |
if date_range_selection == 'Custom Range':
|
413 |
show_custom_date_inputs()
|
414 |
start_date, end_date = st.session_state.custom_start_date, st.session_state.custom_end_date
|
|
|
422 |
|
423 |
if st.button("Fetch Data"):
|
424 |
with st.spinner('Fetching data...'):
|
|
|
425 |
st.session_state.report_data = fetch_gsc_data(webproperty, search_type, start_date, end_date, selected_dimensions)
|
|
|
426 |
|
427 |
if st.session_state.report_data is not None and not st.session_state.report_data.empty:
|
428 |
+
st.write("Data fetched successfully. Click the button below to calculate relevancy scores.")
|
429 |
+
|
430 |
+
if st.button("Calculate Relevancy Scores"):
|
431 |
+
st.session_state.report_data = calculate_relevancy_scores(st.session_state.report_data, model_type)
|
432 |
+
show_paginated_dataframe(st.session_state.report_data)
|
433 |
download_csv_link(st.session_state.report_data)
|
434 |
elif st.session_state.report_data is not None:
|
|
|
435 |
st.warning("No data found for the selected criteria.")
|
|
|
|
|
|
|
436 |
|
437 |
+
|
438 |
if __name__ == "__main__":
|
|
|
439 |
main()
|