Update app.py
app.py
@@ -7,14 +7,19 @@ def extract_headers(url):
     # Define the output file path
     output_file = "crawl_output.jl"

-    # Perform the crawl
-    adv.crawl(
+    # Perform the crawl with restricted settings
+    adv.crawl(
+        url,
+        output_file=output_file,
+        follow_links=False,  # Do not follow links
+        allowed_domains=[url.split('//')[1].split('/')[0]]  # Restrict to the base domain
+    )

     # Load the crawl data
     crawl_df = pd.read_json(output_file, lines=True)

     # Extract headers from h1 to h6
-    headers_columns = [col for col in crawl_df.columns if col.startswith('h')]
+    headers_columns = [col for col in crawl_df.columns if col.startswith('h') and col[1:].isdigit()]
     headers = crawl_df[headers_columns].apply(lambda x: x.str.split('@@').explode()).dropna().reset_index(drop=True)

     return headers
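A note on the new crawl settings: in advertools, follow_links=False makes the crawler fetch only the URL(s) it is given instead of spidering outward, and allowed_domains fences it to one site. The inline domain extraction assumes the URL always includes a scheme; a bare example.com would raise an IndexError on split('//')[1]. A more defensive sketch using the standard library (the get_base_domain helper is illustrative, not part of this commit):

from urllib.parse import urlparse

def get_base_domain(url: str) -> str:
    """Return the host part of a URL, tolerating a missing scheme."""
    parsed = urlparse(url)
    # Without a scheme, urlparse leaves netloc empty and puts the host in .path.
    return parsed.netloc or parsed.path.split('/')[0]

print(get_base_domain("https://example.com/page"))  # example.com
print(get_base_domain("example.com/page"))          # example.com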
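On the header extraction itself: advertools joins multiple headings of the same level with '@@' (e.g. 'Intro@@Details'), which is what the split/explode undoes, and the added col[1:].isdigit() guard keeps any non-heading column that happens to start with 'h' out of the h1..h6 selection. One caveat worth flagging: DataFrame.apply with a per-column explode can misalign or raise when columns explode to different lengths; a stack-based variation, sketched here on dummy data, flattens everything into a single Series:

import pandas as pd

# Dummy stand-in for the advertools crawl output.
crawl_df = pd.DataFrame({
    "h1": ["Welcome"],
    "h2": ["Intro@@Details"],
    "hops": ["1"],  # starts with 'h' but is excluded by the isdigit() guard
})

headers_columns = [c for c in crawl_df.columns if c.startswith('h') and c[1:].isdigit()]

headers = (
    crawl_df[headers_columns]
    .stack()              # one row per (page, heading level)
    .str.split('@@')      # undo advertools' '@@' joining
    .explode()            # one row per individual heading
    .dropna()
    .reset_index(drop=True)
)

print(headers.tolist())  # ['Welcome', 'Intro', 'Details']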
@@ -29,7 +34,7 @@ def main():
     if st.button("Extract Headers"):
         if url:
             headers = extract_headers(url)
-            if isinstance(headers, pd.
+            if isinstance(headers, pd.Series) and not headers.empty:
                 st.write("Extracted Headers:")
                 st.write(headers)
             else:
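The completed condition assumes extract_headers returns a pd.Series on success and something else (presumably an error value from a code path not shown in this diff) on failure; checking the type before headers.empty avoids an AttributeError. A minimal sketch of the same branch outside Streamlit (the fallback message is hypothetical):

import pandas as pd

def show_headers(headers):
    # Mirror the Streamlit branch: render only when we actually got data.
    if isinstance(headers, pd.Series) and not headers.empty:
        print("Extracted Headers:")
        print(headers.to_string())
    else:
        print("No headers found.")  # hypothetical fallback, not in the diff

show_headers(pd.Series(["Welcome", "Intro"]))   # prints both headings
show_headers(pd.Series(dtype=object))           # empty -> fallback message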