Spaces:

blazingbunny
/

free-seo-headers-audit-tool-advertools

Sleeping

File size: 2,599 Bytes

7f5f166
 
 
aaa5c24
0129fc7
aaa5c24
85bf60f
7f5f166
aaa5c24
7f5f166
 
6a96128
c2d57f1
 
35ea367
 
 
 
 
 
 
c2d57f1
 
 
 
59cacb3
 
 
c2d57f1
35ea367
59cacb3
 
 
 
 
 
 
 
 
 
 
c2d57f1
59cacb3
6a96128
7f5f166
59cacb3
7f5f166
 
 
 
 
 
 
 
c2d57f1
59cacb3
c2d57f1

import os
import tempfile
from urllib.parse import urlparse

import advertools as adv
import pandas as pd
import streamlit as st
# Sidebar content, rendered top to bottom: tool title, usage notes with
# credits, and the maintainer link.
for _sidebar_markdown in (
    "### Web Page Header Extractor",
    """
Enter your webpage URL into the tool to analyze header tags. Shout out to Elias Dabbas for [Advertools](https://github.com/eliasdabbas/advertools) which i used in the backend and as always, thanks to Koray Tuğberk Gübür for all the knowledge I have learned from Topical Authority SEO course. [topicalauthority.digital](https://www.topicalauthority.digital/)""",
    "## Tool uploaded and maintained by: [Blazing SEO](http://blazing-seo.com/)",
):
    st.sidebar.markdown(_sidebar_markdown)
def _require_http_hostname(url):
    """Return the hostname of *url*; raise ValueError unless it is an absolute http(s) URL."""
    parts = urlparse(url)
    if parts.scheme not in ("http", "https") or not parts.hostname:
        raise ValueError(
            f"Invalid URL {url!r}: expected an address starting with http:// or https://"
        )
    # ``hostname`` excludes any port, credentials, path or query string.
    return parts.hostname


def extract_headers(url):
    """Crawl a single page and return its heading tags, one heading per row.

    Args:
        url: Absolute http(s) URL of the page to analyze. Surrounding
            whitespace is ignored.

    Returns:
        On success, a DataFrame with two columns: ``Header`` (the crawl
        column the heading came from, e.g. ``h1``) and ``Content`` (the text
        of one heading). It is empty when the crawl output has no heading
        columns. On any failure, the error message as a ``str``; exceptions
        are not propagated to the caller.
    """
    try:
        url = url.strip()
        # Validate before crawling so a malformed URL yields a readable
        # message instead of an IndexError from manual string splitting.
        domain = _require_http_hostname(url)

        # Write the crawl output to a fresh per-call location: a fixed path
        # would be shared by every call and session and never cleaned up, so
        # rows from earlier crawls could leak into this result (advertools is
        # understood to append to an existing output file -- see its docs).
        with tempfile.TemporaryDirectory() as workdir:
            output_file = os.path.join(workdir, "crawl_output.jl")

            # Perform the crawl with restricted settings
            adv.crawl(
                url,
                output_file=output_file,
                follow_links=False,  # Do not follow links
                allowed_domains=[domain],  # Restrict to the base domain
            )

            # Load the crawl data before the temporary directory is removed
            crawl_df = pd.read_json(output_file, lines=True)

        # Display the column names for debugging
        print("Columns in the crawl data:", crawl_df.columns)

        # Keep the heading columns: an 'h' followed only by digits (h1..h6)
        headers_columns = [
            col for col in crawl_df.columns
            if col.startswith('h') and col[1:].isdigit()
        ]
        print("Header columns found:", headers_columns)

        # Long format: one row per (heading level, cell), skipping empty cells
        headers_melted = (
            crawl_df[headers_columns]
            .melt(var_name='Header', value_name='Content')
            .dropna()
        )

        # A cell holds several headings joined by '@@'; split them apart.
        # Non-string cells become empty lists and are dropped after explode.
        headers_melted['Content'] = headers_melted['Content'].apply(
            lambda x: x.split('@@') if isinstance(x, str) else []
        )

        # Explode the headers to separate rows
        return headers_melted.explode('Content').dropna().reset_index(drop=True)

    except Exception as e:
        # Top-level boundary for the UI: report the failure as text so the
        # caller can display it rather than crash.
        print("Error occurred:", e)
        return str(e)

def main():
    """Render the page: URL input, an extract button, and the result or an error.

    ``extract_headers`` returns a DataFrame on success and an error-message
    string on failure; the two outcomes are reported separately so a failed
    crawl is not mistaken for a page without headings.
    """
    st.title("Web Page Header Extractor")

    url = st.text_input("Enter the URL of the web page:").strip()
    if not st.button("Extract Headers"):
        return

    # Whitespace-only input counts as missing.
    if not url:
        st.error("Please enter a valid URL.")
        return

    headers = extract_headers(url)
    if isinstance(headers, pd.DataFrame):
        if headers.empty:
            st.error("No headers found.")
        else:
            st.write("Extracted Headers:")
            st.write(headers)
    else:
        # Failure path: show the reason instead of discarding it.
        st.error(f"An error occurred: {headers}")

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()