cyberandy committed on
Commit
e81ffaf
1 Parent(s): 1c5e607

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +91 -22
app.py CHANGED
@@ -49,16 +49,39 @@ class WebsiteCrawler:
49
 
50
  soup = BeautifulSoup(response.text, 'html.parser')
51
 
52
- # Extract metadata
53
- title = (
54
- soup.find('meta', property='og:title') or
55
- soup.find('title') or
56
- soup.find('h1')
57
- )
58
- title = self.clean_text(title.text if title else url.split('/')[-1], is_title=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
- desc = soup.find('meta', {'name': 'description'}) or soup.find('meta', property='og:description')
61
- desc = self.clean_text(desc['content'] if desc else '')
62
 
63
  # Determine category and importance
64
  url_lower = url.lower()
@@ -71,22 +94,35 @@ class WebsiteCrawler:
71
  elif 'api' in url_lower:
72
  category = 'API'
73
  importance = 4
 
 
 
 
 
 
 
 
 
74
 
75
  # Store metadata
76
  clean_url = re.sub(r'#.*', '', url).rstrip('/')
77
- self.url_metadata[clean_url] = {
78
- 'title': title,
79
- 'description': desc,
80
- 'category': category,
81
- 'importance': importance
82
- }
 
83
 
84
  # Find links
85
- return [
86
- urljoin(url, a['href'])
87
- for a in soup.find_all('a', href=True)
88
- if not any(x in a['href'].lower() for x in ['javascript:', 'mailto:', '.pdf', '.jpg', '.png', '.gif'])
89
- ]
 
 
 
90
 
91
  except Exception as e:
92
  logger.error(f"Error crawling {url}: {str(e)}")
@@ -176,8 +212,41 @@ async def process_url(url, max_depth, max_pages):
176
  theme = gr.themes.Soft(primary_hue="blue", font="Open Sans")
177
 
178
  with gr.Blocks(theme=theme, css="""
179
- .primary-btn {background-color: #2436d4 !important;}
180
- .primary-btn:hover {background-color: #1c2aa8 !important;}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  """) as iface:
182
  gr.Markdown("# llms.txt Generator")
183
  gr.Markdown("Generate an llms.txt file from a website following the specification.")
 
49
 
50
  soup = BeautifulSoup(response.text, 'html.parser')
51
 
52
+ # Extract title with fallbacks
53
+ title = None
54
+ meta_title = soup.find('meta', property='og:title')
55
+ if meta_title and meta_title.get('content'):
56
+ title = meta_title['content']
57
+ if not title:
58
+ title_tag = soup.find('title')
59
+ if title_tag:
60
+ title = title_tag.text
61
+ if not title:
62
+ h1_tag = soup.find('h1')
63
+ if h1_tag:
64
+ title = h1_tag.text
65
+ if not title:
66
+ title = url.split('/')[-1]
67
+
68
+ title = self.clean_text(title, is_title=True)
69
+
70
+ # Extract description with fallbacks
71
+ desc = None
72
+ meta_desc = soup.find('meta', {'name': 'description'})
73
+ if meta_desc and meta_desc.get('content'):
74
+ desc = meta_desc['content']
75
+ if not desc:
76
+ og_desc = soup.find('meta', property='og:description')
77
+ if og_desc and og_desc.get('content'):
78
+ desc = og_desc['content']
79
+ if not desc:
80
+ first_p = soup.find('p')
81
+ if first_p:
82
+ desc = first_p.text
83
 
84
+ desc = self.clean_text(desc) if desc else ""
 
85
 
86
  # Determine category and importance
87
  url_lower = url.lower()
 
94
  elif 'api' in url_lower:
95
  category = 'API'
96
  importance = 4
97
+ elif 'guide' in url_lower or 'tutorial' in url_lower:
98
+ category = 'Guides'
99
+ importance = 3
100
+ elif 'example' in url_lower:
101
+ category = 'Examples'
102
+ importance = 2
103
+ elif 'blog' in url_lower:
104
+ category = 'Blog'
105
+ importance = 1
106
 
107
  # Store metadata
108
  clean_url = re.sub(r'#.*', '', url).rstrip('/')
109
+ if title and len(title.strip()) > 0: # Only store if we have a valid title
110
+ self.url_metadata[clean_url] = {
111
+ 'title': title,
112
+ 'description': desc,
113
+ 'category': category,
114
+ 'importance': importance
115
+ }
116
 
117
  # Find links
118
+ links = []
119
+ for a in soup.find_all('a', href=True):
120
+ href = a['href']
121
+ if not any(x in href.lower() for x in ['javascript:', 'mailto:', '.pdf', '.jpg', '.png', '.gif']):
122
+ next_url = urljoin(url, href)
123
+ if urlparse(next_url).netloc == base_domain:
124
+ links.append(next_url)
125
+ return links
126
 
127
  except Exception as e:
128
  logger.error(f"Error crawling {url}: {str(e)}")
 
212
  theme = gr.themes.Soft(primary_hue="blue", font="Open Sans")
213
 
214
  with gr.Blocks(theme=theme, css="""
215
+ @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
216
+
217
+ .gradio-container {
218
+ font-family: 'Open Sans', sans-serif !important;
219
+ }
220
+
221
+ .gr-button {
222
+ font-family: 'Open Sans', sans-serif !important;
223
+ font-weight: 600 !important;
224
+ }
225
+
226
+ .primary-btn {
227
+ background-color: #2436d4 !important;
228
+ color: white !important;
229
+ }
230
+
231
+ .primary-btn:hover {
232
+ background-color: #1c2aa8 !important;
233
+ }
234
+
235
+ [data-testid="textbox"] {
236
+ font-family: 'Open Sans', sans-serif !important;
237
+ }
238
+
239
+ .gr-padded {
240
+ font-family: 'Open Sans', sans-serif !important;
241
+ }
242
+
243
+ .gr-input {
244
+ font-family: 'Open Sans', sans-serif !important;
245
+ }
246
+
247
+ .gr-label {
248
+ font-family: 'Open Sans', sans-serif !important;
249
+ }
250
  """) as iface:
251
  gr.Markdown("# llms.txt Generator")
252
  gr.Markdown("Generate an llms.txt file from a website following the specification.")