blazingbunny's picture
Update app.py
35ea367 verified
raw
history blame
1.45 kB
from urllib.parse import urlparse

import advertools as adv
import pandas as pd
import streamlit as st
def extract_headers(url):
    """Crawl a single page and return its h1-h6 heading texts.

    Parameters
    ----------
    url : str
        URL of the page to crawl (scheme included, e.g. ``https://...``).

    Returns
    -------
    pd.Series | str
        One heading string per row on success (possibly empty), or the
        exception message as a plain ``str`` on failure.  ``main()``
        distinguishes the two with ``isinstance(..., pd.Series)``.
    """
    try:
        # Where advertools writes its crawl output (JSON lines).
        output_file = "crawl_output.jl"
        # Crawl only the requested page: no link following, and restrict
        # the crawl to the page's own domain.  urlparse handles URLs
        # robustly, unlike the previous split('//') hack which raised
        # IndexError on scheme-less input.
        adv.crawl(
            url,
            output_file=output_file,
            follow_links=False,
            allowed_domains=[urlparse(url).netloc],
        )
        # Load the crawl results back into a DataFrame.
        crawl_df = pd.read_json(output_file, lines=True)
        # advertools stores headings in columns named h1..h6, joining
        # multiple headings of the same level with '@@'.
        header_cols = [
            col for col in crawl_df.columns
            if col.startswith('h') and col[1:].isdigit()
        ]
        if not header_cols:
            # Page had no heading columns at all -> empty Series, which
            # main() reports as "No headers found".
            return pd.Series(dtype=object)
        # Split the '@@'-joined values, then flatten all heading columns
        # into a single Series.  The original code returned a DataFrame
        # here, which made main()'s isinstance(pd.Series) check always
        # fail; stack() + explode() yields the Series the caller expects.
        headers = (
            crawl_df[header_cols]
            .apply(lambda col: col.str.split('@@'))
            .stack()
            .explode()
            .dropna()
            .reset_index(drop=True)
        )
        return headers
    except Exception as e:
        # Surface the failure as text; main() treats a non-Series as an error.
        return str(e)
def main():
    """Render the Streamlit UI: prompt for a URL and show its headings."""
    st.title("Web Page Header Extractor")
    url = st.text_input("Enter the URL of the web page:")
    # Do nothing until the user clicks the button.
    if not st.button("Extract Headers"):
        return
    if not url:
        st.error("Please enter a valid URL.")
        return
    result = extract_headers(url)
    # extract_headers returns a Series on success, an error string otherwise.
    if isinstance(result, pd.Series) and not result.empty:
        st.write("Extracted Headers:")
        st.write(result)
    else:
        st.error("No headers found or an error occurred.")
if __name__ == "__main__":
main()