blazingbunny committed
Commit: 6a96128
Parent(s): 249e9a3
Update app.py

app.py CHANGED
@@ -1,20 +1,31 @@
 import streamlit as st
 import advertools as adv
 import pandas as pd
+import os
 
 def extract_headers(url):
     try:
+        # Define the output file path
+        output_file = "crawl_data.jl"
+
         # Crawl the webpage
-
+        adv.crawl(url, output_file=output_file, follow_links=False)
 
-        #
-
+        # Load the crawl data from the output file
+        crawl_data = pd.read_json(output_file, lines=True)
 
-        #
-        headers =
-
+        # Extract headers from the HTML content
+        headers = []
+        for _, row in crawl_data.iterrows():
+            html_content = row['body']
+            # Using Pandas to parse headers
+            for header_tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+                headers.extend(pd.read_html(f'<{header_tag}>{html_content}</{header_tag}>', header=0)[0].values.flatten())
 
+        # Remove duplicate headers and empty values
+        headers = [header for header in headers if header and isinstance(header, str)]
+        return list(set(headers))
+
     except Exception as e:
         return str(e)
 
@@ -24,11 +35,4 @@ def main():
     url = st.text_input("Enter the URL of the web page:")
     if st.button("Extract Headers"):
         if url:
-            headers =
-            st.write("Extracted Headers:")
-            st.write(headers)
-        else:
-            st.error("Please enter a valid URL.")
-
-if __name__ == "__main__":
-    main()
+            headers = extract_headers(url)
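Note on the new extraction loop: pandas' read_html parses <table> markup only, so wrapping a page body in a heading tag and calling pd.read_html will typically raise "No tables found", and the except branch then returns that error string instead of a header list. A minimal alternative sketch, assuming advertools' standard crawl output, where headings are already extracted into h1-h6 columns with multiple matches per page joined by "@@"; extract_headers_from_crawl is a hypothetical helper name, not part of this commit:

import advertools as adv
import pandas as pd

def extract_headers_from_crawl(url, output_file="crawl_data.jl"):
    # Crawl a single page; follow_links=False keeps the crawl to the given URL
    adv.crawl(url, output_file=output_file, follow_links=False)
    crawl_data = pd.read_json(output_file, lines=True)

    headers = []
    for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
        if tag in crawl_data.columns:
            for cell in crawl_data[tag].dropna():
                # advertools joins multiple headings in one cell with "@@"
                headers.extend(h.strip() for h in str(cell).split('@@') if h.strip())

    # De-duplicate while preserving document order
    return list(dict.fromkeys(headers))

One caveat either way: adv.crawl appends to an existing output file, so repeated runs against the same crawl_data.jl accumulate rows from earlier URLs; removing the file before each crawl (plausibly the reason for the new import os) would keep results scoped to the current request.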