import os

import advertools as adv
import pandas as pd
import streamlit as st


def extract_headers(url):
    """Crawl a single URL with advertools and return its unique heading texts (h1-h6)."""
    output_file = "crawl_data.jl"
    try:
        # Remove any previous crawl output so results from old runs are not appended
        if os.path.exists(output_file):
            os.remove(output_file)

        # Crawl only the given page, without following links
        adv.crawl([url], output_file=output_file, follow_links=False)

        # Load the crawl data from the output file (one JSON object per line)
        crawl_data = pd.read_json(output_file, lines=True)

        # advertools stores heading text in the h1-h6 columns of the crawl output,
        # with multiple headings of the same level joined by "@@"
        headers = []
        for header_tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            if header_tag in crawl_data.columns:
                for value in crawl_data[header_tag].dropna():
                    headers.extend(
                        h.strip() for h in str(value).split("@@") if h.strip()
                    )

        # Remove duplicates while preserving the order in which headings appear
        return list(dict.fromkeys(headers))
    except Exception as e:
        return str(e)


def main():
    st.title("Web Page Header Extractor")
    url = st.text_input("Enter the URL of the web page:")
    if st.button("Extract Headers"):
        if url:
            headers = extract_headers(url)
            if isinstance(headers, list):
                if headers:
                    st.subheader("Extracted Headers")
                    for header in headers:
                        st.write(header)
                else:
                    st.info("No headers were found on the page.")
            else:
                st.error(f"An error occurred: {headers}")
        else:
            st.warning("Please enter a URL.")


if __name__ == "__main__":
    main()
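
# Usage note (assuming this file is saved as app.py; adjust the name to your setup):
#   streamlit run app.py
# The app asks for a URL, crawls that single page with advertools,
# and lists the unique h1-h6 headings found in the crawl output.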