File size: 1,236 Bytes
7f5f166 6a96128 c2d57f1 6a96128 7f5f166 c2d57f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
import streamlit as st
import advertools as adv
import pandas as pd
def extract_headers(url):
    """Crawl *url* and collect the h1-h6 heading texts found on the crawled pages.

    Args:
        url: Start URL handed to ``adv.crawl``. Links are followed, so more
            than one page may contribute headings.

    Returns:
        On success, a ``pandas.DataFrame`` with one column per heading level
        present in the crawl output (``h1`` .. ``h6``, in that order). Each
        column lists that level's heading texts; shorter columns are padded
        with NaN. The frame is empty when no headings were found.
        On failure, a ``str`` holding the exception message (callers
        distinguish the two cases with ``isinstance``).
    """
    import os
    import tempfile

    try:
        # Use a fresh temporary file for every call: a fixed path would be
        # reused across runs, and because the crawler appends to an existing
        # output file, results from earlier crawls would leak into this one.
        with tempfile.TemporaryDirectory() as tmp_dir:
            # The crawler expects a ".jl" (jsonlines) output path.
            output_file = os.path.join(tmp_dir, "crawl_output.jl")
            adv.crawl(url, output_file=output_file, follow_links=True)
            crawl_df = pd.read_json(output_file, lines=True)

        # Select exactly h1..h6; a bare "starts with h" test would also pick
        # up unrelated columns whose names merely begin with that letter.
        header_columns = [
            f"h{level}" for level in range(1, 7) if f"h{level}" in crawl_df.columns
        ]

        exploded = {}
        for column in header_columns:
            # Drop missing cells first and coerce to str so the .str accessor
            # works even if the column was parsed as numeric or is all-NaN.
            values = (
                crawl_df[column]
                .dropna()
                .astype(str)
                .str.split("@@")  # multiple headings on a page are "@@"-joined
                .explode()
                .str.strip()
            )
            values = values[values != ""]
            if not values.empty:
                # Reset to a unique 0..n-1 index so columns of different
                # lengths can be combined without duplicate-label errors.
                exploded[column] = values.reset_index(drop=True)

        # Columns are aligned by position and NaN-padded; rows are NOT dropped
        # just because one heading level has fewer entries than another.
        return pd.DataFrame(exploded)
    except Exception as e:
        # Kept for backward compatibility: callers expect an error message
        # string rather than a raised exception.
        return str(e)
def main():
    """Render the Streamlit page: read a URL, run the extraction, show the result.

    The outcome of ``extract_headers`` is reported in one of three ways:
    the error message when it returned a string, the table when it returned a
    non-empty DataFrame, and a "nothing found" notice otherwise.
    """
    st.title("Web Page Header Extractor")
    url = st.text_input("Enter the URL of the web page:")

    # Nothing to do until the user presses the button.
    if not st.button("Extract Headers"):
        return

    # Treat whitespace-only input the same as an empty field.
    url = url.strip()
    if not url:
        st.error("Please enter a valid URL.")
        return

    # The crawl follows links and can take a while; show progress feedback.
    with st.spinner("Crawling and extracting headers..."):
        headers = extract_headers(url)

    if isinstance(headers, str):
        # extract_headers reports failures as the exception message; surface
        # it instead of hiding the cause behind a generic notice.
        st.error(f"An error occurred while extracting headers: {headers}")
    elif isinstance(headers, pd.DataFrame) and not headers.empty:
        st.write("Extracted Headers:")
        st.write(headers)
    else:
        st.warning("No headers found.")
# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|