blazingbunny committed (verified)
Commit 35ea367 · Parent: ffbed98

Update app.py

Files changed (1): app.py (+9 -4)
app.py CHANGED
@@ -7,14 +7,19 @@ def extract_headers(url):
     # Define the output file path
     output_file = "crawl_output.jl"
 
-    # Perform the crawl
-    adv.crawl(url, output_file=output_file, follow_links=True)
+    # Perform the crawl with restricted settings
+    adv.crawl(
+        url,
+        output_file=output_file,
+        follow_links=False,  # Do not follow links
+        allowed_domains=[url.split('//')[1].split('/')[0]]  # Restrict to the base domain
+    )
 
     # Load the crawl data
     crawl_df = pd.read_json(output_file, lines=True)
 
     # Extract headers from h1 to h6
-    headers_columns = [col for col in crawl_df.columns if col.startswith('h')]
+    headers_columns = [col for col in crawl_df.columns if col.startswith('h') and col[1:].isdigit()]
     headers = crawl_df[headers_columns].apply(lambda x: x.str.split('@@').explode()).dropna().reset_index(drop=True)
 
     return headers
@@ -29,7 +34,7 @@ def main():
     if st.button("Extract Headers"):
         if url:
             headers = extract_headers(url)
-            if isinstance(headers, pd.DataFrame) and not headers.empty:
+            if isinstance(headers, pd.Series) and not headers.empty:
                 st.write("Extracted Headers:")
                 st.write(headers)
             else:
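
A note on the new allowed_domains argument: the domain is derived by naive string splitting of the input URL. The sketch below uses an illustrative URL that is not part of the commit; it shows what the expression produces, alongside a urllib.parse-based alternative that fails more gracefully when the URL lacks a '//' scheme separator.

    from urllib.parse import urlparse

    url = "https://example.com/blog/post"  # illustrative only, not from the commit

    # Expression used in the commit: the text between '//' and the next '/'
    domain_from_split = url.split('//')[1].split('/')[0]
    print(domain_from_split)     # example.com

    # Possible alternative: urlparse returns an empty netloc instead of
    # raising an IndexError when the URL has no '//' scheme separator
    print(urlparse(url).netloc)  # example.com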
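
The tightened list comprehension keeps only columns whose name is 'h' followed by digits (h1 through h6) instead of anything starting with 'h'. A minimal sketch with made-up column names and values, not real crawl output, shows the difference and how the '@@'-joined headings used in the code explode into one heading per row.

    import pandas as pd

    # Made-up column names standing in for crawl_df.columns
    columns = ['url', 'title', 'h1', 'h2', 'h3', 'hreflang', 'hostname']

    loose = [c for c in columns if c.startswith('h')]
    strict = [c for c in columns if c.startswith('h') and c[1:].isdigit()]
    print(loose)   # ['h1', 'h2', 'h3', 'hreflang', 'hostname']
    print(strict)  # ['h1', 'h2', 'h3']

    # Multiple headings in one cell are joined with '@@' in the crawl output;
    # splitting and exploding yields one heading per row
    h1 = pd.Series(['Welcome@@About us', 'Contact'])
    print(h1.str.split('@@').explode().tolist())  # ['Welcome', 'About us', 'Contact']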
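
The success check in main() now expects a pandas Series rather than a DataFrame. A small, self-contained sketch of the guard pattern follows, with fabricated data in place of the real extract_headers(url) result; both Series and DataFrame expose .empty, so the isinstance check is what pins down the expected shape.

    import pandas as pd

    headers = pd.Series(['Welcome', 'About us', 'Contact'])  # stand-in for extract_headers(url)

    if isinstance(headers, pd.Series) and not headers.empty:
        print("Extracted Headers:")
        print(headers.to_string(index=False))
    else:
        print("No headers found")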