|
import streamlit as st |
|
import advertools as adv |
|
import pandas as pd |
|
|
|
def extract_headers(url): |
|
try: |
|
|
|
output_file = "crawl_output.jl" |
|
|
|
|
|
adv.crawl(url, output_file=output_file, follow_links=True) |
|
|
|
|
|
crawl_df = pd.read_json(output_file, lines=True) |
|
|
|
|
|
headers_columns = [col for col in crawl_df.columns if col.startswith('h')] |
|
headers = crawl_df[headers_columns].apply(lambda x: x.str.split('@@').explode()).dropna().reset_index(drop=True) |
|
|
|
return headers |
|
|
|
except Exception as e: |
|
return str(e) |
|
|
|
def main(): |
|
st.title("Web Page Header Extractor") |
|
|
|
url = st.text_input("Enter the URL of the web page:") |
|
if st.button("Extract Headers"): |
|
if url: |
|
headers = extract_headers(url) |
|
if isinstance(headers, pd.DataFrame) and not headers.empty: |
|
st.write("Extracted Headers:") |
|
st.write(headers) |
|
else: |
|
st.error("No headers found or an error occurred.") |
|
else: |
|
st.error("Please enter a valid URL.") |
|
|
|
if __name__ == "__main__": |
|
main() |
|
|