cyberandy committed on
Commit
b537966
·
verified ·
1 Parent(s): 09ac8aa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -34
app.py CHANGED
@@ -9,53 +9,110 @@ import os
9
  logging.basicConfig(level=logging.INFO)
10
  logger = logging.getLogger(__name__)
11
 
12
- def process_url(url, *args):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  if not url:
14
  return "", "Please enter a URL"
15
 
16
  try:
17
- if not url.startswith(('http://', 'https://')):
18
- url = 'https://' + url.strip('/')
19
-
 
20
  output_file = token_hex(6)
21
- adv.crawl(url, f"{output_file}.jsonl", follow_links=True)
22
- crawl_df = pd.read_json(f"{output_file}.jsonl", lines=True)
23
-
24
- all_links = []
25
- title = crawl_df['title'].values[0]
26
- meta_desc = crawl_df['meta_desc'].values[0]
27
- link_df = adv.crawlytics.links(crawl_df)
28
-
29
- for link, text in link_df[['link', 'text']].values:
30
- if text.strip():
31
- text = re.sub(r"\n+", " ", text.strip(), re.DOTALL)
32
- text = re.sub(r"\s{3,}", " ", text)
33
- all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))
34
-
35
- separator = "\n\n"
36
- final_content = f"# {title}\n> {meta_desc}\n{separator.join(all_links)}"
37
-
38
- if os.path.exists(f"{output_file}.jsonl"):
39
- os.remove(f"{output_file}.jsonl")
40
-
41
- return final_content, "Successfully crawled website."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  except Exception as e:
44
  logger.error(f"Error processing URL {url}: {str(e)}")
45
  return "", f"Error: {str(e)}"
46
 
 
47
  css = """
48
  @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@300;400;600;700&display=swap');
49
- body { font-family: 'Open Sans', sans-serif !important; }
 
 
 
 
 
 
 
 
 
 
 
50
  """
51
 
 
52
  theme = gr.themes.Soft(
53
  primary_hue=gr.themes.colors.Color(
54
  name="blue",
55
- c50="#eef1ff", c100="#e0e5ff", c200="#c3cbff",
56
- c300="#a5b2ff", c400="#8798ff", c500="#6a7eff",
57
- c600="#3452db", c700="#2a41af", c800="#1f3183",
58
- c900="#152156", c950="#0a102b",
 
 
 
 
 
 
 
59
  )
60
  )
61
 
@@ -67,7 +124,7 @@ with gr.Blocks(theme=theme, css=css) as iface:
67
  url_input = gr.Textbox(
68
  label="Enter the home page of a website:",
69
  placeholder="example: https://example.com",
70
- lines=1
71
  )
72
 
73
  with gr.Row():
@@ -79,20 +136,22 @@ with gr.Blocks(theme=theme, css=css) as iface:
79
  )
80
 
81
  with gr.Row():
82
- generate_btn = gr.Button("Submit", variant="primary")
83
 
84
  with gr.Row():
85
  output = gr.Textbox(
86
  label="Generated llms.txt Content",
87
  lines=20,
88
- show_copy_button=True
 
89
  )
90
  status = gr.Textbox(label="Status", interactive=False)
91
 
 
92
  generate_btn.click(
93
  fn=process_url,
94
  inputs=[url_input, link_types],
95
- outputs=[output, status]
96
  )
97
 
98
  if __name__ == "__main__":
 
9
  logging.basicConfig(level=logging.INFO)
10
  logger = logging.getLogger(__name__)
11
 
12
def filter_links_by_type(link_df, link_types):
    """Filter crawled links by where they appear in the page.

    Args:
        link_df: DataFrame of links; its ``source`` column holds the
            surrounding HTML fragment for each link.
        link_types: List of labels such as ``"<header> links"``; the tag
            name embedded in each label selects matching rows.

    Returns:
        A de-duplicated DataFrame containing only the selected rows, or
        ``link_df`` unchanged when no filtering was requested.
    """
    # No selection (or the catch-all option) means "keep everything".
    if not link_types or "All links" in link_types:
        return link_df

    # Collect matching frames and concatenate once at the end; seeding
    # pd.concat with an empty DataFrame is deprecated in modern pandas
    # and discards the original dtypes.
    frames = []
    for link_type in link_types:
        if "<header>" in link_type:
            frames.append(link_df[link_df['source'].str.contains('<header', case=False, na=False)])
        elif "<nav>" in link_type:
            frames.append(link_df[link_df['source'].str.contains('<nav', case=False, na=False)])
        elif "<footer>" in link_type:
            frames.append(link_df[link_df['source'].str.contains('<footer', case=False, na=False)])

    if not frames:
        # Nothing matched: return an empty frame that keeps schema and dtypes.
        return link_df.iloc[0:0]
    return pd.concat(frames).drop_duplicates()
30
+
31
def process_url(url, link_types):
    """Crawl *url* and build llms.txt-style markdown content.

    Args:
        url: Home-page URL; ``https://`` is prepended when no scheme is given.
        link_types: Optional list of link-type labels used to filter the
            crawled links (see ``filter_links_by_type``).

    Returns:
        Tuple ``(content, status)`` — the generated markdown (empty string
        on failure) and a human-readable status message.
    """
    if not url:
        return "", "Please enter a URL"

    try:
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        # Unique temporary file name so concurrent requests don't clash.
        jsonl_path = f"{token_hex(6)}.jsonl"

        try:
            # Perform the crawl and load the results.
            adv.crawl(url, jsonl_path)
            crawl_df = pd.read_json(jsonl_path, lines=True)

            # Title / meta description come from the first crawled page.
            title = crawl_df['title'].values[0]
            meta_desc = crawl_df['meta_desc'].values[0]

            link_df = adv.crawlytics.links(crawl_df)
            if link_types:
                link_df = filter_links_by_type(link_df, link_types)

            all_links = []
            for link, text in link_df[['link', 'text']].values:
                # Guard against NaN/None anchor text coming out of pandas:
                # calling .strip() on a float NaN would raise and abort the
                # whole request.
                if isinstance(text, str) and text.strip():
                    # BUGFIX: re.DOTALL (==16) was previously passed as the
                    # positional ``count`` argument of re.sub, silently
                    # capping replacements at 16. The flag is unnecessary
                    # for these patterns, so it is simply dropped.
                    text = re.sub(r"\n+", " ", text.strip())
                    text = re.sub(r"\s{3,}", " ", text)
                    all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))

            # chr(10) is just "\n"; spell it plainly.
            final_content = f"# {title}\n> {meta_desc}\n" + "\n".join(all_links)
        finally:
            # Always remove the temporary crawl output, even on failure.
            if os.path.exists(jsonl_path):
                os.remove(jsonl_path)

        return final_content, f"Successfully crawled website. Found {len(all_links)} links."

    except Exception as e:
        logger.error(f"Error processing URL {url}: {str(e)}")
        return "", f"Error: {str(e)}"
83
 
84
# Page-level CSS: pull in the Open Sans webfont and recolor the primary button.
css = """
@import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@300;400;600;700&display=swap');

body {
    font-family: 'Open Sans', sans-serif !important;
}

.primary-btn {
    background-color: #3452db !important;
}

.primary-btn:hover {
    background-color: #2a41af !important;
}
"""
100
 
101
# Soft Gradio theme built on a custom blue scale centered on #3452db.
_BLUE_SCALE = {
    "c50": "#eef1ff",
    "c100": "#e0e5ff",
    "c200": "#c3cbff",
    "c300": "#a5b2ff",
    "c400": "#8798ff",
    "c500": "#6a7eff",
    "c600": "#3452db",  # primary brand color
    "c700": "#2a41af",
    "c800": "#1f3183",
    "c900": "#152156",
    "c950": "#0a102b",
}

theme = gr.themes.Soft(
    primary_hue=gr.themes.colors.Color(name="blue", **_BLUE_SCALE)
)
118
 
 
124
  url_input = gr.Textbox(
125
  label="Enter the home page of a website:",
126
  placeholder="example: https://example.com",
127
+ lines=1,
128
  )
129
 
130
  with gr.Row():
 
136
  )
137
 
138
  with gr.Row():
139
+ generate_btn = gr.Button("Submit", variant="primary", elem_classes=["primary-btn"])
140
 
141
  with gr.Row():
142
  output = gr.Textbox(
143
  label="Generated llms.txt Content",
144
  lines=20,
145
+ show_copy_button=True,
146
+ container=True,
147
  )
148
  status = gr.Textbox(label="Status", interactive=False)
149
 
150
+ # Set up the click event
151
  generate_btn.click(
152
  fn=process_url,
153
  inputs=[url_input, link_types],
154
+ outputs=[output, status],
155
  )
156
 
157
  if __name__ == "__main__":