# Hugging Face file-viewer residue (not program code):
# blazingbunny - "Update app.py" - c2d57f1 (verified) - raw / history / blame - 1.24 kB
import os
import tempfile

import advertools as adv
import pandas as pd
import streamlit as st
def _collect_headers(crawl_df):
    """Return the h1-h6 texts found in *crawl_df*, one column per level.

    Each cell of an ``h1``..``h6`` column may hold several header texts
    joined by ``@@``; they are split into one text per row. Columns are
    processed independently, so levels with fewer headers are padded with
    NaN instead of causing rows of other levels to be dropped.
    """
    columns = {}
    for level in range(1, 7):
        name = f"h{level}"
        if name not in crawl_df.columns:
            continue
        texts = (
            crawl_df[name]
            .dropna()
            .astype(str)
            .str.split("@@")
            .explode()
            .reset_index(drop=True)
        )
        if not texts.empty:
            columns[name] = texts
    # Series of different lengths are aligned on their 0..n-1 index; an
    # empty dict yields an empty DataFrame.
    return pd.DataFrame(columns)


def extract_headers(url):
    """Crawl *url* with advertools and return the header texts it finds.

    Args:
        url: Address to start the crawl from (links are followed).

    Returns:
        On success, a ``pandas.DataFrame`` with one column per header level
        present in the crawl (``h1``..``h6``) and one header text per row;
        it is empty when no headers were found. On any failure, the
        exception message as a ``str`` (callers distinguish the two cases
        with ``isinstance``).
    """
    try:
        # Use a fresh temporary directory per call: a fixed file name would
        # be shared by every run and session, so rows from earlier crawls
        # could leak into this result (the crawler is understood to append
        # to an existing output file - confirm against the advertools docs).
        with tempfile.TemporaryDirectory() as tmp_dir:
            # Kept as a plain str with the ".jl" suffix the original used.
            output_file = os.path.join(tmp_dir, "crawl_output.jl")
            # NOTE(review): follow_links=True crawls beyond the single page
            # the UI asks for - confirm this is intended.
            adv.crawl(url, output_file=output_file, follow_links=True)
            crawl_df = pd.read_json(output_file, lines=True)
        return _collect_headers(crawl_df)
    except Exception as e:
        # Best-effort contract kept for callers: report the failure as text.
        return str(e)
def main():
    """Render the Streamlit page and show the headers of the entered URL.

    Displays a title, a URL text input and an "Extract Headers" button.
    When the button is pressed the URL is validated, passed to
    ``extract_headers`` and the outcome is shown: the header table on
    success, a warning when nothing was found, or the error text that
    ``extract_headers`` returned as a string.
    """
    st.title("Web Page Header Extractor")
    url = st.text_input("Enter the URL of the web page:")

    if not st.button("Extract Headers"):
        return

    # Surrounding whitespace from copy/paste would otherwise reach the crawler.
    url = url.strip()
    if not url.lower().startswith(("http://", "https://")):
        st.error("Please enter a valid URL (starting with http:// or https://).")
        return

    # The crawl blocks until it finishes, so show progress feedback meanwhile.
    with st.spinner("Crawling..."):
        headers = extract_headers(url)

    if isinstance(headers, str):
        # extract_headers reports failures as the exception text; surface it
        # instead of hiding it behind a generic message.
        st.error(f"An error occurred while extracting headers: {headers}")
    elif isinstance(headers, pd.DataFrame) and not headers.empty:
        st.write("Extracted Headers:")
        st.write(headers)
    else:
        st.warning("No headers found.")
# Build the page only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()