import os

import advertools as adv
import pandas as pd
import streamlit as st


def extract_headers(url):
    """Crawl a single URL with advertools and return its unique heading texts (h1-h6)."""
    output_file = "crawl_data.jl"
    try:
        # Remove any previous crawl output so results from old runs are not appended
        if os.path.exists(output_file):
            os.remove(output_file)

        # Crawl only the given page, without following links
        adv.crawl([url], output_file=output_file, follow_links=False)

        # Load the crawl data from the output file (one JSON object per line)
        crawl_data = pd.read_json(output_file, lines=True)

        # advertools stores heading text in the h1-h6 columns of the crawl output,
        # with multiple headings of the same level joined by "@@"
        headers = []
        for header_tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            if header_tag in crawl_data.columns:
                for value in crawl_data[header_tag].dropna():
                    headers.extend(
                        h.strip() for h in str(value).split("@@") if h.strip()
                    )

        # Remove duplicates while preserving the order in which headings appear
        return list(dict.fromkeys(headers))
    except Exception as e:
        return str(e)


def main():
    st.title("Web Page Header Extractor")
    url = st.text_input("Enter the URL of the web page:")
    if st.button("Extract Headers"):
        if url:
            headers = extract_headers(url)
            if isinstance(headers, list):
                if headers:
                    st.subheader("Extracted Headers")
                    for header in headers:
                        st.write(header)
                else:
                    st.info("No headers were found on the page.")
            else:
                st.error(f"An error occurred: {headers}")
        else:
            st.warning("Please enter a URL.")


if __name__ == "__main__":
    main()
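
# Usage note (assuming this file is saved as app.py; adjust the name to your setup):
#   streamlit run app.py
# The app asks for a URL, crawls that single page with advertools,
# and lists the unique h1-h6 headings found in the crawl output.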