cyberandy committed on
Commit
51ccc58
·
verified ·
1 Parent(s): 3bc0d96

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -76
app.py CHANGED
@@ -20,25 +20,23 @@ def safe_crawl(url, output_file):
20
  logger.error(f"Crawl error: {str(e)}")
21
  return False
22
 
23
def process_links(df, link_type=None):
    """Render the rows of a links dataframe as llms.txt markdown entries.

    Args:
        df: DataFrame with ``text`` and ``link`` columns, plus a ``source``
            column that is only read when ``link_type`` is given.
        link_type: Optional tag name (e.g. ``"nav"``). When set, only rows
            whose ``source`` contains ``<link_type`` (case-insensitive) are
            kept.

    Returns:
        A list of strings of the form ``"## text\\n[text](link)"``, one per
        row that has non-blank text. On any error the problem is logged and
        an empty list is returned.
    """
    try:
        if link_type:
            # Literal substring match: link_type must not be interpreted as
            # a regular expression.
            mask = df['source'].str.contains(
                f'<{link_type}', case=False, na=False, regex=False
            )
            df = df[mask]

        all_links = []
        for text, link in zip(df['text'], df['link']):
            # NaN is truthy and str(NaN) == "nan"; skip missing cells
            # explicitly so they never become "## nan" entries.
            if pd.isna(text) or pd.isna(link):
                continue
            text = re.sub(r'\s+', ' ', str(text).strip())
            if not text:
                continue
            link = str(link).strip()
            all_links.append(f"## {text}\n[{text}]({link})")

        return all_links
    except Exception as e:
        # Best-effort by design: a malformed crawl yields no links rather
        # than crashing the UI callback.
        logger.error(f"Link processing error: {str(e)}")
        return []
42
 
43
  def process_url(url, link_types):
44
  """Process URL and generate llms.txt content"""
@@ -46,87 +44,132 @@ def process_url(url, link_types):
46
  return "", "Please enter a URL"
47
 
48
  try:
49
- # Ensure URL has protocol
50
- if not url.startswith(('http://', 'https://')):
51
- url = 'https://' + url
52
-
53
- # Create temporary file
54
- output_file = f"{token_hex(4)}.jsonl"
55
-
56
  try:
57
- # Perform crawl
58
- if not safe_crawl(url, output_file):
59
  return "", "Crawl failed or timed out"
60
 
61
- # Read results
62
- df = pd.read_json(output_file, lines=True)
63
 
64
- # Get basic info
65
- title = df['title'].iloc[0] if not pd.isna(df['title'].iloc[0]) else "Untitled"
66
- meta_desc = df['meta_desc'].iloc[0] if not pd.isna(df['meta_desc'].iloc[0]) else ""
67
 
68
- # Process links
69
- link_df = adv.crawlytics.links(df)
70
  all_links = []
71
 
 
72
  if link_types and "All links" not in link_types:
73
  for link_type in link_types:
74
- type_name = re.search(r'<(\w+)>', link_type)
75
- if type_name:
76
- links = process_links(link_df, type_name.group(1))
77
- all_links.extend(links)
 
 
78
  else:
79
- all_links = process_links(link_df)
80
-
81
- # Create content
82
- content_parts = [
83
- f"# {title}",
84
- f"> {meta_desc}",
85
- "\n\n".join(all_links)
86
- ]
87
- final_content = "\n\n".join(content_parts)
88
-
89
- return final_content, f"Found {len(all_links)} links"
90
-
91
  finally:
92
- # Cleanup
93
- if os.path.exists(output_file):
94
- os.remove(output_file)
95
-
 
 
96
  except Exception as e:
97
- logger.error(f"Error processing {url}: {str(e)}")
98
  return "", f"Error: {str(e)}"
99
 
100
# Input widgets: the site URL and the kinds of links to extract.
_url_box = gr.Textbox(label="Enter website URL", placeholder="example: example.com")
_link_type_picker = gr.Dropdown(
    choices=["<header> links", "<nav> links", "<footer> links", "All links"],
    label="Select link types",
    multiselect=True,
    value=["All links"],
)

# Output widgets: the generated document and a one-line status message.
_result_box = gr.Textbox(label="Generated llms.txt", lines=20, show_copy_button=True)
_status_box = gr.Textbox(label="Status")

# Wire process_url to the widgets above.
iface = gr.Interface(
    fn=process_url,
    inputs=[_url_box, _link_type_picker],
    outputs=[_result_box, _status_box],
    title="LLMs.txt Generator",
    description="Generate an llms.txt file from a website",
    theme=gr.themes.Soft(),
    allow_flagging="never",
)

if __name__ == "__main__":
    iface.launch()
 
20
  logger.error(f"Crawl error: {str(e)}")
21
  return False
22
 
23
def explode_link_df(crawl_df, col_group):
    """Build llms.txt markdown for one link group of the crawl dataframe.

    Reads the ``{col_group}_links_url`` and ``{col_group}_links_text``
    columns, whose cells hold ``@@``-separated values, and pairs each URL
    with its anchor text row by row.

    Args:
        crawl_df: Crawl results containing the two columns named above.
        col_group: Column prefix, e.g. ``"nav"``, ``"header"`` or
            ``"footer"``.

    Returns:
        The entries ``"## text\\n[text](url)"`` joined by blank lines, or an
        empty string when there are no usable links. On any error the
        problem is logged and an empty string is returned.
    """
    try:
        url_cells = crawl_df[f'{col_group}_links_url']
        text_cells = crawl_df[f'{col_group}_links_text']

        all_links = []
        # Pair URLs with texts within each row. Exploding and dropping NaN
        # on each column independently lets a single missing cell shift
        # every later URL onto the wrong text.
        for urls, texts in zip(url_cells, text_cells):
            if not isinstance(urls, str) or not isinstance(texts, str):
                continue
            for link, text in zip(urls.split('@@'), texts.split('@@')):
                text = text.strip()
                if not text:
                    continue
                # No flags argument here: the 4th positional parameter of
                # re.sub is `count`, so passing re.DOTALL would cap the
                # number of replacements at 16.
                text = re.sub(r"\n+", " ", text)
                text = re.sub(r"\s{3,}", " ", text)
                all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))

        return "\n\n".join(all_links)
    except Exception as e:
        # Best-effort by design: a missing column group yields no content
        # rather than failing the whole request.
        logger.error(f"Error processing {col_group} links: {str(e)}")
        return ""
40
 
41
  def process_url(url, link_types):
42
  """Process URL and generate llms.txt content"""
 
44
  return "", "Please enter a URL"
45
 
46
  try:
47
+ if not url.startswith(("http://", "https://")):
48
+ url = "https://" + url
49
+
50
+ # Generate unique filename for this crawl
51
+ output_file = token_hex(6)
52
+ jsonl_path = f"{output_file}.jsonl"
53
+
54
  try:
55
+ # Perform the crawl using advertools
56
+ if not safe_crawl(url, jsonl_path):
57
  return "", "Crawl failed or timed out"
58
 
59
+ # Read the crawl results
60
+ crawl_df = pd.read_json(jsonl_path, lines=True)
61
 
62
+ # Extract title and meta description
63
+ title = crawl_df['title'].iloc[0] if not pd.isna(crawl_df['title'].iloc[0]) else "Untitled"
64
+ meta_desc = crawl_df['meta_desc'].iloc[0] if not pd.isna(crawl_df['meta_desc'].iloc[0]) else ""
65
 
 
 
66
  all_links = []
67
 
68
+ # Process links based on selected types
69
  if link_types and "All links" not in link_types:
70
  for link_type in link_types:
71
+ type_match = re.findall(r"header|footer|nav", link_type)
72
+ if type_match:
73
+ link_content = explode_link_df(crawl_df, type_match[0])
74
+ if link_content:
75
+ all_links.append(link_content)
76
+ all_links.append('\n\n')
77
  else:
78
+ # Process all links using advertools
79
+ link_df = adv.crawlytics.links(crawl_df)
80
+ for link, text in link_df[['link', 'text']].values:
81
+ if text and text.strip():
82
+ text = re.sub(r"\n+", " ", text.strip(), re.DOTALL)
83
+ text = re.sub(r"\s{3,}", " ", text)
84
+ all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))
85
+
86
+ # Generate final content
87
+ links_text = "\n\n".join(all_links)
88
+ final_content = f"# {title}\n\n> {meta_desc}\n\n{links_text}"
89
+
90
  finally:
91
+ # Cleanup temporary file
92
+ if os.path.exists(jsonl_path):
93
+ os.remove(jsonl_path)
94
+
95
+ return final_content, f"Successfully crawled website. Found {len(all_links)} sections."
96
+
97
  except Exception as e:
98
+ logger.error(f"Error processing URL {url}: {str(e)}")
99
  return "", f"Error: {str(e)}"
100
 
101
+ # Custom CSS for Open Sans font and color theme
102
+ css = """
103
+ @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@300;400;600;700&display=swap');
104
+
105
+ body {
106
+ font-family: 'Open Sans', sans-serif !important;
107
+ }
108
+
109
+ .primary-btn {
110
+ background-color: #3452db !important;
111
+ }
112
+
113
+ .primary-btn:hover {
114
+ background-color: #2a41af !important;
115
+ }
116
+ """
117
+
118
+ # Create custom theme with specific color
119
+ theme = gr.themes.Soft(
120
+ primary_hue=gr.themes.colors.Color(
121
+ name="blue",
122
+ c50="#eef1ff",
123
+ c100="#e0e5ff",
124
+ c200="#c3cbff",
125
+ c300="#a5b2ff",
126
+ c400="#8798ff",
127
+ c500="#6a7eff",
128
+ c600="#3452db", # Main color
129
+ c700="#2a41af",
130
+ c800="#1f3183",
131
+ c900="#152156",
132
+ c950="#0a102b",
133
+ )
134
+ )
135
+
136
+ with gr.Blocks(theme=theme, css=css) as iface:
137
+ with gr.Row():
138
+ gr.Markdown("# Generate an `llms.txt` file")
139
+
140
+ with gr.Row():
141
+ url_input = gr.Textbox(
142
+ label="Enter the home page of a website:",
143
+ placeholder="example: https://example.com",
144
+ lines=1,
145
+ )
146
+
147
+ with gr.Row():
148
+ link_types = gr.Dropdown(
149
+ label="Select types of links to extract (leave empty to get all links)",
150
  choices=["<header> links", "<nav> links", "<footer> links", "All links"],
 
151
  multiselect=True,
152
  value=["All links"]
153
  )
154
+
155
+ with gr.Row():
156
+ generate_btn = gr.Button("Submit", variant="primary", elem_classes=["primary-btn"])
157
+
158
+ with gr.Row():
159
+ output = gr.Textbox(
160
+ label="Generated llms.txt Content",
161
  lines=20,
162
+ show_copy_button=True,
163
+ container=True,
 
 
164
  )
165
+ status = gr.Textbox(label="Status", interactive=False)
166
+
167
+ # Set up the click event
168
+ generate_btn.click(
169
+ fn=process_url,
170
+ inputs=[url_input, link_types],
171
+ outputs=[output, status],
172
+ )
173
 
174
  if __name__ == "__main__":
175
  iface.launch()