import streamlit as st
import advertools as adv
import pandas as pd


def extract_headers(url):
    try:
        # Define the output file path (advertools expects a .jl / JSON lines file)
        output_file = "crawl_output.jl"
        # Perform the crawl; follow_links=True also crawls pages linked from the URL
        adv.crawl(url, output_file=output_file, follow_links=True)
        # Load the crawl data
        crawl_df = pd.read_json(output_file, lines=True)
        # Keep only the heading columns h1 to h6 (other columns may also start with "h")
        header_cols = [f"h{i}" for i in range(1, 7) if f"h{i}" in crawl_df.columns]
        # advertools joins multiple headings of the same level with "@@";
        # reshape to one row per heading, keeping the tag name next to the text
        headers = (
            crawl_df[header_cols]
            .melt(var_name="tag", value_name="text")
            .dropna(subset=["text"])
            .assign(text=lambda df: df["text"].str.split("@@"))
            .explode("text")
            .reset_index(drop=True)
        )
        return headers
    except Exception as e:
        return str(e)


def main():
    st.title("Web Page Header Extractor")
    url = st.text_input("Enter the URL of the web page:")
    if st.button("Extract Headers"):
        if url:
            headers = extract_headers(url)
            if isinstance(headers, pd.DataFrame) and not headers.empty:
                st.write("Extracted Headers:")
                st.write(headers)
            else:
                st.error("No headers found or an error occurred.")
        else:
            st.error("Please enter a valid URL.")


if __name__ == "__main__":
    main()
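
# Usage note (a sketch, assuming the script is saved under a hypothetical name app.py):
# Streamlit apps are started from the command line rather than with plain `python`, e.g.
#
#   streamlit run app.py
#
# The crawl writes crawl_output.jl to the current working directory.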