File size: 2,599 Bytes
7f5f166 aaa5c24 0129fc7 aaa5c24 85bf60f 7f5f166 aaa5c24 7f5f166 6a96128 c2d57f1 35ea367 c2d57f1 59cacb3 c2d57f1 35ea367 59cacb3 c2d57f1 59cacb3 6a96128 7f5f166 59cacb3 7f5f166 c2d57f1 59cacb3 c2d57f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
import os
import tempfile
from urllib.parse import urlparse

import advertools as adv
import pandas as pd
import streamlit as st
# Sidebar content, rendered top to bottom as separate markdown elements:
# the tool title, usage notes with credits, and the maintainer link.
_SIDEBAR_SECTIONS = (
    "### Web Page Header Extractor",
    """
Enter your webpage URL into the tool to analyze header tags. Shout out to Elias Dabbas for [Advertools](https://github.com/eliasdabbas/advertools) which i used in the backend and as always, thanks to Koray Tuğberk Gübür for all the knowledge I have learned from Topical Authority SEO course. [topicalauthority.digital](https://www.topicalauthority.digital/)""",
    "## Tool uploaded and maintained by: [Blazing SEO](http://blazing-seo.com/)",
)
for _section in _SIDEBAR_SECTIONS:
    st.sidebar.markdown(_section)
def _explode_headers(crawl_df):
    """Reshape crawl output into one row per heading.

    Args:
        crawl_df: DataFrame loaded from the crawl output. Heading text is
            read from whichever of the columns ``h1`` .. ``h6`` are present.

    Returns:
        DataFrame with two columns: ``Header`` (the source column name)
        and ``Content`` (one heading text per row). Empty when the crawl
        produced no heading columns.
    """
    header_tags = ("h1", "h2", "h3", "h4", "h5", "h6")
    header_columns = [col for col in crawl_df.columns if col in header_tags]
    print("Header columns found:", header_columns)
    if not header_columns:
        return pd.DataFrame(columns=["Header", "Content"])

    melted = (
        crawl_df[header_columns]
        .melt(var_name="Header", value_name="Content")
        .dropna()
    )
    # Several headings of the same level share one cell, joined by "@@";
    # non-string cells become an empty list and are dropped after explode.
    melted["Content"] = melted["Content"].apply(
        lambda text: text.split("@@") if isinstance(text, str) else []
    )
    return melted.explode("Content").dropna().reset_index(drop=True)


def extract_headers(url):
    """Crawl a single page and return its h1-h6 headings.

    Args:
        url: Absolute ``http://`` or ``https://`` URL of the page to crawl.
            Surrounding whitespace is ignored.

    Returns:
        On success, a DataFrame with columns ``Header`` and ``Content``
        (one row per heading). On any failure, including an invalid URL,
        the error message as a ``str``; callers distinguish the two cases
        by type.
    """
    try:
        url = url.strip()
        parsed = urlparse(url)
        if parsed.scheme not in ("http", "https") or not parsed.hostname:
            raise ValueError(
                f"Invalid URL {url!r}: expected an absolute http(s):// address."
            )

        # Crawl into a fresh temporary file: output written to an existing
        # .jl file is appended rather than overwritten, so a fixed path
        # would mix in rows from earlier crawls (and be shared between
        # concurrent sessions).
        with tempfile.TemporaryDirectory() as tmp_dir:
            output_file = os.path.join(tmp_dir, "crawl_output.jl")
            adv.crawl(
                url,
                output_file=output_file,
                follow_links=False,  # Only the requested page is fetched.
                # Host name only: no port or credentials.
                allowed_domains=[parsed.hostname],
            )
            crawl_df = pd.read_json(output_file, lines=True)

        print("Columns in the crawl data:", crawl_df.columns)
        return _explode_headers(crawl_df)
    except Exception as e:
        # Boundary handler: the caller expects a message string on failure.
        print("Error occurred:", e)
        return str(e)
def main():
    """Render the page: URL input, extract button, and the result.

    When the button is pressed, the entered URL is passed to
    ``extract_headers``. A DataFrame result is displayed; an empty result
    and an error (returned by ``extract_headers`` as a string) are reported
    separately so the user can see what went wrong.
    """
    st.title("Web Page Header Extractor")
    url = st.text_input("Enter the URL of the web page:").strip()

    if not st.button("Extract Headers"):
        return

    if not url:
        st.error("Please enter a valid URL.")
        return

    headers = extract_headers(url)
    if not isinstance(headers, pd.DataFrame):
        # extract_headers returns the error message as a string on failure.
        st.error(f"An error occurred while extracting headers: {headers}")
    elif headers.empty:
        st.warning("No headers found.")
    else:
        st.write("Extracted Headers:")
        st.write(headers)


# Invoke main() only when this file is executed as the main module,
# not when it is imported.
if __name__ == "__main__":
    main()
|