blazingbunny committed on
Commit
6a96128
1 Parent(s): 249e9a3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -15
app.py CHANGED
@@ -1,20 +1,31 @@
1
  import streamlit as st
2
  import advertools as adv
3
  import pandas as pd
 
4
 
5
  def extract_headers(url):
6
  try:
 
 
 
7
  # Crawl the webpage
8
- crawl_data = adv.crawl(url, follow_links=False)
9
 
10
- # Extract HTML content
11
- html_content = crawl_data['body'][0]
12
 
13
- # Use pandas to parse the headers
14
- headers = pd.read_html(html_content, header=0)[0]
15
- headers = headers.loc[:, headers.columns.str.contains('h1|h2|h3|h4|h5|h6', case=False)]
 
 
 
 
16
 
17
- return headers
 
 
 
18
  except Exception as e:
19
  return str(e)
20
 
@@ -24,11 +35,4 @@ def main():
24
  url = st.text_input("Enter the URL of the web page:")
25
  if st.button("Extract Headers"):
26
  if url:
27
- headers = extract_headers(url)
28
- st.write("Extracted Headers:")
29
- st.write(headers)
30
- else:
31
- st.error("Please enter a valid URL.")
32
-
33
- if __name__ == "__main__":
34
- main()
 
1
  import streamlit as st
2
  import advertools as adv
3
  import pandas as pd
4
+ import os
5
 
6
  def extract_headers(url):
7
  try:
8
+ # Define the output file path
9
+ output_file = "crawl_data.jl"
10
+
11
  # Crawl the webpage
12
+ adv.crawl(url, output_file=output_file, follow_links=False)
13
 
14
+ # Load the crawl data from the output file
15
+ crawl_data = pd.read_json(output_file, lines=True)
16
 
17
+ # Extract headers from the HTML content
18
+ headers = []
19
+ for _, row in crawl_data.iterrows():
20
+ html_content = row['body']
21
+ # Using Pandas to parse headers
22
+ for header_tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
23
+ headers.extend(pd.read_html(f'<{header_tag}>{html_content}</{header_tag}>', header=0)[0].values.flatten())
24
 
25
+ # Remove duplicate headers and empty values
26
+ headers = [header for header in headers if header and isinstance(header, str)]
27
+ return list(set(headers))
28
+
29
  except Exception as e:
30
  return str(e)
31
 
 
35
  url = st.text_input("Enter the URL of the web page:")
36
  if st.button("Extract Headers"):
37
  if url:
38
+ headers = extract_he