Update app.py
app.py
@@ -7,14 +7,19 @@ def extract_headers(url):
     # Define the output file path
     output_file = "crawl_output.jl"

-    # Perform the crawl
-    adv.crawl(
+    # Perform the crawl with restricted settings
+    adv.crawl(
+        url,
+        output_file=output_file,
+        follow_links=False,  # Do not follow links
+        allowed_domains=[url.split('//')[1].split('/')[0]]  # Restrict to the base domain
+    )

     # Load the crawl data
     crawl_df = pd.read_json(output_file, lines=True)

     # Extract headers from h1 to h6
-    headers_columns = [col for col in crawl_df.columns if col.startswith('h')]
+    headers_columns = [col for col in crawl_df.columns if col.startswith('h') and col[1:].isdigit()]
     headers = crawl_df[headers_columns].apply(lambda x: x.str.split('@@').explode()).dropna().reset_index(drop=True)

     return headers
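A note on the new crawl settings: in advertools, follow_links=False makes the crawler fetch only the URL(s) it is given instead of spidering outward, and allowed_domains fences it to one site. The inline domain extraction assumes the URL always includes a scheme; a bare example.com would raise an IndexError on split('//')[1]. A more defensive sketch using the standard library (the get_base_domain helper is illustrative, not part of this commit):

from urllib.parse import urlparse

def get_base_domain(url: str) -> str:
    """Return the host part of a URL, tolerating a missing scheme."""
    parsed = urlparse(url)
    # Without a scheme, urlparse leaves netloc empty and puts the host in .path.
    return parsed.netloc or parsed.path.split('/')[0]

print(get_base_domain("https://example.com/page"))  # example.com
print(get_base_domain("example.com/page"))          # example.com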
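On the header extraction itself: advertools joins multiple headings of the same level with '@@' (e.g. 'Intro@@Details'), which is what the split/explode undoes, and the added col[1:].isdigit() guard keeps any non-heading column that happens to start with 'h' out of the h1..h6 selection. One caveat worth flagging: DataFrame.apply with a per-column explode can misalign or raise when columns explode to different lengths; a stack-based variation, sketched here on dummy data, flattens everything into a single Series:

import pandas as pd

# Dummy stand-in for the advertools crawl output.
crawl_df = pd.DataFrame({
    "h1": ["Welcome"],
    "h2": ["Intro@@Details"],
    "hops": ["1"],  # starts with 'h' but is excluded by the isdigit() guard
})

headers_columns = [c for c in crawl_df.columns if c.startswith('h') and c[1:].isdigit()]

headers = (
    crawl_df[headers_columns]
    .stack()              # one row per (page, heading level)
    .str.split('@@')      # undo advertools' '@@' joining
    .explode()            # one row per individual heading
    .dropna()
    .reset_index(drop=True)
)

print(headers.tolist())  # ['Welcome', 'Intro', 'Details']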
@@ -29,7 +34,7 @@ def main():
     if st.button("Extract Headers"):
         if url:
             headers = extract_headers(url)
-            if isinstance(headers, pd.
+            if isinstance(headers, pd.Series) and not headers.empty:
                 st.write("Extracted Headers:")
                 st.write(headers)
             else:
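The completed condition assumes extract_headers returns a pd.Series on success and something else (presumably an error value from a code path not shown in this diff) on failure; checking the type before headers.empty avoids an AttributeError. A minimal sketch of the same branch outside Streamlit (the fallback message is hypothetical):

import pandas as pd

def show_headers(headers):
    # Mirror the Streamlit branch: render only when we actually got data.
    if isinstance(headers, pd.Series) and not headers.empty:
        print("Extracted Headers:")
        print(headers.to_string())
    else:
        print("No headers found.")  # hypothetical fallback, not in the diff

show_headers(pd.Series(["Welcome", "Intro"]))   # prints both headings
show_headers(pd.Series(dtype=object))           # empty -> fallback message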