cyberandy committed
Commit 5292597 · verified · 1 Parent(s): b537966

Update app.py

Files changed (1)
  1. app.py +47 -39
app.py CHANGED
@@ -9,24 +9,23 @@ import os
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-def filter_links_by_type(link_df, link_types):
-    """Filter links based on selected types"""
-    if not link_types or "All links" in link_types:
-        return link_df
-
-    filtered_links = pd.DataFrame(columns=link_df.columns)
-    for link_type in link_types:
-        if "<header>" in link_type:
-            header_links = link_df[link_df['source'].str.contains('<header', case=False, na=False)]
-            filtered_links = pd.concat([filtered_links, header_links])
-        elif "<nav>" in link_type:
-            nav_links = link_df[link_df['source'].str.contains('<nav', case=False, na=False)]
-            filtered_links = pd.concat([filtered_links, nav_links])
-        elif "<footer>" in link_type:
-            footer_links = link_df[link_df['source'].str.contains('<footer', case=False, na=False)]
-            filtered_links = pd.concat([filtered_links, footer_links])
-
-    return filtered_links.drop_duplicates()
+def explode_link_df(crawl_df, col_group):
+    """Process links from a specific column group in the crawl dataframe"""
+    try:
+        link = crawl_df[f'{col_group}_links_url'].str.split('@@').explode()
+        text = crawl_df[f'{col_group}_links_text'].str.split('@@').explode()
+        all_links = []
+
+        for link, text in zip(link.dropna(), text.dropna()):
+            if text and text.strip():
+                text = re.sub(r"\n+", " ", text.strip(), re.DOTALL)
+                text = re.sub(r"\s{3,}", " ", text)
+                all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))
+
+        return "\n\n".join(all_links)
+    except Exception as e:
+        logger.error(f"Error processing {col_group} links: {str(e)}")
+        return ""
 
 def process_url(url, link_types):
     """Process URL and generate llms.txt content"""
@@ -52,30 +51,39 @@ def process_url(url, link_types):
             title = crawl_df['title'].values[0]
             meta_desc = crawl_df['meta_desc'].values[0]
 
-            # Process links using advertools
-            link_df = adv.crawlytics.links(crawl_df)
-
-            # Filter links based on selected types
-            if link_types:
-                link_df = filter_links_by_type(link_df, link_types)
-
-            # Generate all links content
             all_links = []
-            for link, text in link_df[['link', 'text']].values:
-                if text and text.strip():
-                    text = re.sub(r"\n+", " ", text.strip(), re.DOTALL)
-                    text = re.sub(r"\s{3,}", " ", text)
-                    all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))
-
-            # Generate final content
-            final_content = f"# {title}\n> {meta_desc}\n{chr(10).join(all_links)}"
+
+            # Process links based on selected types
+            if link_types and "All links" not in link_types:
+                for link_type in link_types:
+                    type_match = re.findall(r"header|footer|nav", link_type)
+                    if type_match:
+                        link_content = explode_link_df(crawl_df, type_match[0])
+                        if link_content:
+                            all_links.append(link_content)
+                            all_links.append('\n\n')
+            else:
+                # Process all links using advertools
+                link_df = adv.crawlytics.links(crawl_df)
+                for link, text in link_df[['link', 'text']].values:
+                    if text and text.strip():
+                        text = re.sub(r"\n+", " ", text.strip(), re.DOTALL)
+                        text = re.sub(r"\s{3,}", " ", text)
+                        all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))
+
+            # Generate final content with proper spacing
+            final_content = f"""# {title}
+
+> {meta_desc}
+
+{"\n\n".join(all_links)}"""
 
         finally:
             # Cleanup temporary file
             if os.path.exists(jsonl_path):
                 os.remove(jsonl_path)
 
-        return final_content, f"Successfully crawled website. Found {len(all_links)} links."
+        return final_content, f"Successfully crawled website. Found {len(all_links)} sections."
 
     except Exception as e:
         logger.error(f"Error processing URL {url}: {str(e)}")
@@ -108,7 +116,7 @@ theme = gr.themes.Soft(
         c300="#a5b2ff",
         c400="#8798ff",
         c500="#6a7eff",
-        c600="#3452db", # Our main color
+        c600="#3452db", # Main color
         c700="#2a41af",
         c800="#1f3183",
         c900="#152156",
@@ -118,8 +126,8 @@ theme = gr.themes.Soft(
 
 with gr.Blocks(theme=theme, css=css) as iface:
     with gr.Row():
-        gr.Markdown("## Generate an `llms.txt` file")
-
+        gr.Markdown("# Generate an `llms.txt` file")
+
     with gr.Row():
         url_input = gr.Textbox(
             label="Enter the home page of a website:",
@@ -129,7 +137,7 @@ with gr.Blocks(theme=theme, css=css) as iface:
 
     with gr.Row():
        link_types = gr.Dropdown(
-            label="Select types of links to extract",
+            label="Select types of links to extract (leave empty to get all links)",
            choices=["<header> links", "<nav> links", "<footer> links", "All links"],
            multiselect=True,
            value=["All links"]