Update app.py
app.py CHANGED
@@ -9,24 +9,23 @@ import os
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-def filter_links_by_type(...):
-    """..."""
-    ...
-    return filtered_links.drop_duplicates()
+def explode_link_df(crawl_df, col_group):
+    """Process links from a specific column group in the crawl dataframe"""
+    try:
+        link = crawl_df[f'{col_group}_links_url'].str.split('@@').explode()
+        text = crawl_df[f'{col_group}_links_text'].str.split('@@').explode()
+        all_links = []
+
+        for link, text in zip(link.dropna(), text.dropna()):
+            if text and text.strip():
+                text = re.sub(r"\n+", " ", text.strip(), re.DOTALL)
+                text = re.sub(r"\s{3,}", " ", text)
+                all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))
+
+        return "\n\n".join(all_links)
+    except Exception as e:
+        logger.error(f"Error processing {col_group} links: {str(e)}")
+        return ""
 
 def process_url(url, link_types):
     """Process URL and generate llms.txt content"""
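The new explode_link_df helper relies on advertools crawl output storing multi-valued columns (for example nav_links_url / nav_links_text) as single strings joined with '@@'. A minimal sketch of that split-and-explode step on a made-up one-row dataframe (the URLs, link texts, and the "nav" column group are hypothetical):

```python
import pandas as pd

# Hypothetical one-row crawl result; advertools joins multi-valued columns
# such as nav_links_url / nav_links_text into one string with '@@'.
crawl_df = pd.DataFrame({
    "nav_links_url": ["https://example.com/about@@https://example.com/blog"],
    "nav_links_text": ["About@@Blog"],
})

links = crawl_df["nav_links_url"].str.split("@@").explode()
texts = crawl_df["nav_links_text"].str.split("@@").explode()

for link, text in zip(links.dropna(), texts.dropna()):
    # Same shape the helper builds: a '## ' heading plus a markdown link.
    print(f"## {text}\n[{text}]({link})\n")
```

Each (text, link) pair becomes a "## heading" plus a markdown link, which the helper then joins into one section string.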
@@ -52,30 +51,39 @@ def process_url(url, link_types)
             title = crawl_df['title'].values[0]
             meta_desc = crawl_df['meta_desc'].values[0]
 
-            # Process links using advertools
-            link_df = adv.crawlytics.links(crawl_df)
-
-            # Filter links based on selected types
-            if link_types:
-                link_df = filter_links_by_type(link_df, link_types)
-
-            # Generate all links content
             all_links = []
-            ...
+
+            # Process links based on selected types
+            if link_types and "All links" not in link_types:
+                for link_type in link_types:
+                    type_match = re.findall(r"header|footer|nav", link_type)
+                    if type_match:
+                        link_content = explode_link_df(crawl_df, type_match[0])
+                        if link_content:
+                            all_links.append(link_content)
+                            all_links.append('\n\n')
+            else:
+                # Process all links using advertools
+                link_df = adv.crawlytics.links(crawl_df)
+                for link, text in link_df[['link', 'text']].values:
+                    if text and text.strip():
+                        text = re.sub(r"\n+", " ", text.strip(), re.DOTALL)
+                        text = re.sub(r"\s{3,}", " ", text)
+                        all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))
+
+            # Generate final content with proper spacing
+            final_content = f"""# {title}
+
+> {meta_desc}
+
+{"\n\n".join(all_links)}"""
 
         finally:
             # Cleanup temporary file
             if os.path.exists(jsonl_path):
                 os.remove(jsonl_path)
 
-        return final_content, f"Successfully crawled website. Found {len(all_links)}
+        return final_content, f"Successfully crawled website. Found {len(all_links)} sections."
 
     except Exception as e:
         logger.error(f"Error processing URL {url}: {str(e)}")
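In the updated process_url, the dropdown labels are mapped onto column groups with a bare re.findall, and the page title, meta description, and collected sections are stitched into the llms.txt text. A small sketch of both steps with made-up values (the selections mirror the gr.Dropdown choices defined later in this diff; the title, description, and single section are placeholders):

```python
import re

# Hypothetical selections; the real choices come from the gr.Dropdown below.
selected = ["<nav> links", "<footer> links"]

for link_type in selected:
    # "<nav> links" -> "nav", "<footer> links" -> "footer"
    type_match = re.findall(r"header|footer|nav", link_type)
    print(link_type, "->", type_match[0] if type_match else None)

# Shape of the generated llms.txt text; title, description, and the single
# section are made-up placeholders.
title = "Example Site"
meta_desc = "An example website."
sections = "\n\n".join(["## About\n[About](https://example.com/about)"])
print(f"""# {title}

> {meta_desc}

{sections}""")
```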
@@ -108,7 +116,7 @@ theme = gr.themes.Soft(
         c300="#a5b2ff",
         c400="#8798ff",
         c500="#6a7eff",
-        c600="#3452db", #
+        c600="#3452db", # Main color
         c700="#2a41af",
         c800="#1f3183",
         c900="#152156",
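For orientation, the c300–c900 values in this hunk are stops on a Gradio color scale. A hedged sketch of how such a palette could be assembled with gr.themes.Color and passed to gr.themes.Soft; the c50/c100/c200/c950 stops and the surrounding call are assumptions, only the hex values above come from the diff:

```python
import gradio as gr

# Palette sketch built around the hex stops shown above; the c50/c100/c200/c950
# values are assumed fillers so that gr.themes.Color gets a full scale.
brand = gr.themes.Color(
    c50="#f0f3ff", c100="#dde3ff", c200="#c3ceff",  # assumed light stops
    c300="#a5b2ff", c400="#8798ff", c500="#6a7eff",
    c600="#3452db",  # Main color
    c700="#2a41af", c800="#1f3183", c900="#152156",
    c950="#0d1638",  # assumed darkest stop
)

theme = gr.themes.Soft(primary_hue=brand)
```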
@@ -118,8 +126,8 @@ theme = gr.themes.Soft(
 
 with gr.Blocks(theme=theme, css=css) as iface:
     with gr.Row():
-        gr.Markdown("
-
+        gr.Markdown("# Generate an `llms.txt` file")
+
     with gr.Row():
         url_input = gr.Textbox(
            label="Enter the home page of a website:",
@@ -129,7 +137,7 @@ with gr.Blocks(theme=theme, css=css) as iface:
 
     with gr.Row():
         link_types = gr.Dropdown(
-            label="Select types of links to extract",
+            label="Select types of links to extract (leave empty to get all links)",
             choices=["<header> links", "<nav> links", "<footer> links", "All links"],
             multiselect=True,
             value=["All links"]
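The remaining hunks only touch labels, and the event wiring is not part of this diff. As a rough, hypothetical sketch, components like url_input and link_types are typically connected to process_url through a button click that fills two outputs, matching the (final_content, status_message) tuple the function returns; the button, the output components, and the stub function below are assumptions:

```python
import gradio as gr

# Stand-in for the real process_url updated in this diff; it only mirrors the
# (final_content, status_message) return shape.
def process_url(url, link_types):
    return f"# {url}", "Successfully crawled website. Found 0 sections."

with gr.Blocks() as demo:
    url_input = gr.Textbox(label="Enter the home page of a website:")
    link_types = gr.Dropdown(
        label="Select types of links to extract (leave empty to get all links)",
        choices=["<header> links", "<nav> links", "<footer> links", "All links"],
        multiselect=True,
        value=["All links"],
    )
    generate_btn = gr.Button("Generate llms.txt")  # hypothetical button label
    output_text = gr.Textbox(label="Generated llms.txt", lines=10)
    status = gr.Markdown()

    generate_btn.click(
        fn=process_url,
        inputs=[url_input, link_types],
        outputs=[output_text, status],
    )

demo.launch()
```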