blazingbunny committed
Commit: 6a96128
Parent(s): 249e9a3
Update app.py

app.py CHANGED
@@ -1,20 +1,31 @@
 import streamlit as st
 import advertools as adv
 import pandas as pd
+import os
 
 def extract_headers(url):
     try:
+        # Define the output file path
+        output_file = "crawl_data.jl"
+
         # Crawl the webpage
-
+        adv.crawl(url, output_file=output_file, follow_links=False)
 
-        #
-
+        # Load the crawl data from the output file
+        crawl_data = pd.read_json(output_file, lines=True)
 
-        #
-        headers =
-
+        # Extract headers from the HTML content
+        headers = []
+        for _, row in crawl_data.iterrows():
+            html_content = row['body']
+            # Using Pandas to parse headers
+            for header_tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+                headers.extend(pd.read_html(f'<{header_tag}>{html_content}</{header_tag}>', header=0)[0].values.flatten())
 
+        # Remove duplicate headers and empty values
+        headers = [header for header in headers if header and isinstance(header, str)]
+        return list(set(headers))
+
     except Exception as e:
         return str(e)
 
@@ -24,11 +35,4 @@ def main():
     url = st.text_input("Enter the URL of the web page:")
     if st.button("Extract Headers"):
         if url:
-            headers =
-            st.write("Extracted Headers:")
-            st.write(headers)
-        else:
-            st.error("Please enter a valid URL.")
-
-if __name__ == "__main__":
-    main()
+            headers = extract_headers(url)
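Note on the new extraction loop: pandas' read_html parses <table> markup only, so wrapping a page body in a heading tag and calling pd.read_html will typically raise "No tables found", and the except branch then returns that error string instead of a header list. A minimal alternative sketch, assuming advertools' standard crawl output, where headings are already extracted into h1-h6 columns with multiple matches per page joined by "@@"; extract_headers_from_crawl is a hypothetical helper name, not part of this commit:

import advertools as adv
import pandas as pd

def extract_headers_from_crawl(url, output_file="crawl_data.jl"):
    # Crawl a single page; follow_links=False keeps the crawl to the given URL
    adv.crawl(url, output_file=output_file, follow_links=False)
    crawl_data = pd.read_json(output_file, lines=True)

    headers = []
    for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
        if tag in crawl_data.columns:
            for cell in crawl_data[tag].dropna():
                # advertools joins multiple headings in one cell with "@@"
                headers.extend(h.strip() for h in str(cell).split('@@') if h.strip())

    # De-duplicate while preserving document order
    return list(dict.fromkeys(headers))

One caveat either way: adv.crawl appends to an existing output file, so repeated runs against the same crawl_data.jl accumulate rows from earlier URLs; removing the file before each crawl (plausibly the reason for the new import os) would keep results scoped to the current request.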