blazingbunny committed on
Commit
7f5f166
1 Parent(s): ec84052

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -0
app.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import advertools as adv
3
+ import pandas as pd
4
+
5
def extract_headers(url):
    """Crawl a single web page and return its heading tags (h1-h6).

    Args:
        url: Address of the page to crawl. Links found on the page are not
            followed.

    Returns:
        A pandas DataFrame holding only the ``h1``..``h6`` columns of the
        crawl output, or, if anything goes wrong, the error message as a
        string.
    """
    # Function-scope imports: nothing else in this file needs these modules.
    import os
    import tempfile

    try:
        # adv.crawl() does not return the crawl result; it writes it to a
        # JSON-lines file (the name must end in ".jl"), which is read back.
        with tempfile.TemporaryDirectory() as tmp_dir:
            output_file = os.path.join(tmp_dir, "crawl.jl")
            adv.crawl(url, output_file, follow_links=False)
            crawl_data = pd.read_json(output_file, lines=True)

        # Select heading columns by exact name so unrelated columns that
        # merely contain "h1".."h6" as a substring are not picked up.
        column_names = crawl_data.columns.astype(str)
        is_heading = column_names.str.match(r"^h[1-6]$", case=False)
        return crawl_data.loc[:, is_heading]
    except Exception as e:
        # Boundary catch kept on purpose: callers expect the error text as
        # the return value rather than a raised exception.
        return str(e)
20
+
21
def main():
    """Render the Streamlit page that extracts headings from a URL.

    Shows a text box and a button. When the button is pressed the entered
    URL is passed to ``extract_headers`` and the result is displayed; empty
    input and extraction failures are reported with ``st.error``.
    """
    st.title("Web Page Header Extractor")

    url = st.text_input("Enter the URL of the web page:")
    if not st.button("Extract Headers"):
        return

    # Treat whitespace-only input the same as no input.
    url = url.strip()
    if not url:
        st.error("Please enter a valid URL.")
        return

    headers = extract_headers(url)

    # extract_headers() signals failure by returning the error text as a
    # string; show it as an error instead of presenting it as a result.
    if isinstance(headers, str):
        st.error(f"Could not extract headers: {headers}")
        return

    st.write("Extracted Headers:")
    st.write(headers)


# Launch the app only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()