blazingbunny's picture
Update app.py
35ea367 verified
raw
history blame
1.45 kB
from urllib.parse import urlparse

import advertools as adv
import pandas as pd
import streamlit as st
def extract_headers(url):
    """Crawl a single page and return its h1-h6 heading texts.

    Parameters
    ----------
    url : str
        URL of the page to crawl (scheme included, e.g. ``https://...``).

    Returns
    -------
    pd.Series | str
        One heading string per row on success (possibly empty), or the
        exception message as a plain ``str`` on failure.  ``main()``
        distinguishes the two with ``isinstance(..., pd.Series)``.
    """
    try:
        # Where advertools writes its crawl output (JSON lines).
        output_file = "crawl_output.jl"
        # Crawl only the requested page: no link following, and restrict
        # the crawl to the page's own domain.  urlparse handles URLs
        # robustly, unlike the previous split('//') hack which raised
        # IndexError on scheme-less input.
        adv.crawl(
            url,
            output_file=output_file,
            follow_links=False,
            allowed_domains=[urlparse(url).netloc],
        )
        # Load the crawl results back into a DataFrame.
        crawl_df = pd.read_json(output_file, lines=True)
        # advertools stores headings in columns named h1..h6, joining
        # multiple headings of the same level with '@@'.
        header_cols = [
            col for col in crawl_df.columns
            if col.startswith('h') and col[1:].isdigit()
        ]
        if not header_cols:
            # Page had no heading columns at all -> empty Series, which
            # main() reports as "No headers found".
            return pd.Series(dtype=object)
        # Split the '@@'-joined values, then flatten all heading columns
        # into a single Series.  The original code returned a DataFrame
        # here, which made main()'s isinstance(pd.Series) check always
        # fail; stack() + explode() yields the Series the caller expects.
        headers = (
            crawl_df[header_cols]
            .apply(lambda col: col.str.split('@@'))
            .stack()
            .explode()
            .dropna()
            .reset_index(drop=True)
        )
        return headers
    except Exception as e:
        # Surface the failure as text; main() treats a non-Series as an error.
        return str(e)
def main():
    """Render the Streamlit UI: prompt for a URL and show its headings."""
    st.title("Web Page Header Extractor")
    url = st.text_input("Enter the URL of the web page:")
    # Do nothing until the user clicks the button.
    if not st.button("Extract Headers"):
        return
    if not url:
        st.error("Please enter a valid URL.")
        return
    result = extract_headers(url)
    # extract_headers returns a Series on success, an error string otherwise.
    if isinstance(result, pd.Series) and not result.empty:
        st.write("Extracted Headers:")
        st.write(result)
    else:
        st.error("No headers found or an error occurred.")
if __name__ == "__main__":
main()