Spaces:

WordLift
/

create-llms-txt

Running

App Files Community

cyberandy committed 22 days ago

Commit

e81ffaf

•

1 Parent(s): 1c5e607

Update app.py

Browse files

Files changed (1) hide show

app.py +91 -22

app.py CHANGED Viewed

@@ -49,16 +49,39 @@ class WebsiteCrawler:
             soup = BeautifulSoup(response.text, 'html.parser')
-            # Extract metadata
-            title = (
-                soup.find('meta', property='og:title') or
-                soup.find('title') or
-                soup.find('h1')
-            )
-            title = self.clean_text(title.text if title else url.split('/')[-1], is_title=True)
-            desc = soup.find('meta', {'name': 'description'}) or soup.find('meta', property='og:description')
-            desc = self.clean_text(desc['content'] if desc else '')
             # Determine category and importance
             url_lower = url.lower()
@@ -71,22 +94,35 @@ class WebsiteCrawler:
             elif 'api' in url_lower:
                 category = 'API'
                 importance = 4
             # Store metadata
             clean_url = re.sub(r'#.*', '', url).rstrip('/')
-            self.url_metadata[clean_url] = {
-                'title': title,
-                'description': desc,
-                'category': category,
-                'importance': importance
-            }
             # Find links
-            return [
-                urljoin(url, a['href'])
-                for a in soup.find_all('a', href=True)
-                if not any(x in a['href'].lower() for x in ['javascript:', 'mailto:', '.pdf', '.jpg', '.png', '.gif'])
-            ]
         except Exception as e:
             logger.error(f"Error crawling {url}: {str(e)}")
@@ -176,8 +212,41 @@ async def process_url(url, max_depth, max_pages):
 theme = gr.themes.Soft(primary_hue="blue", font="Open Sans")
 with gr.Blocks(theme=theme, css="""
-    .primary-btn {background-color: #2436d4 !important;}
-    .primary-btn:hover {background-color: #1c2aa8 !important;}
 """) as iface:
     gr.Markdown("# llms.txt Generator")
     gr.Markdown("Generate an llms.txt file from a website following the specification.")

             soup = BeautifulSoup(response.text, 'html.parser')
+            # Extract title with fallbacks
+            title = None
+            meta_title = soup.find('meta', property='og:title')
+            if meta_title and meta_title.get('content'):
+                title = meta_title['content']
+            if not title:
+                title_tag = soup.find('title')
+                if title_tag:
+                    title = title_tag.text
+            if not title:
+                h1_tag = soup.find('h1')
+                if h1_tag:
+                    title = h1_tag.text
+            if not title:
+                title = url.split('/')[-1]
+            title = self.clean_text(title, is_title=True)
+            # Extract description with fallbacks
+            desc = None
+            meta_desc = soup.find('meta', {'name': 'description'})
+            if meta_desc and meta_desc.get('content'):
+                desc = meta_desc['content']
+            if not desc:
+                og_desc = soup.find('meta', property='og:description')
+                if og_desc and og_desc.get('content'):
+                    desc = og_desc['content']
+            if not desc:
+                first_p = soup.find('p')
+                if first_p:
+                    desc = first_p.text
+            desc = self.clean_text(desc) if desc else ""
             # Determine category and importance
             url_lower = url.lower()
             elif 'api' in url_lower:
                 category = 'API'
                 importance = 4
+            elif 'guide' in url_lower or 'tutorial' in url_lower:
+                category = 'Guides'
+                importance = 3
+            elif 'example' in url_lower:
+                category = 'Examples'
+                importance = 2
+            elif 'blog' in url_lower:
+                category = 'Blog'
+                importance = 1
             # Store metadata
             clean_url = re.sub(r'#.*', '', url).rstrip('/')
+            if title and len(title.strip()) > 0:  # Only store if we have a valid title
+                self.url_metadata[clean_url] = {
+                    'title': title,
+                    'description': desc,
+                    'category': category,
+                    'importance': importance
+                }
             # Find links
+            links = []
+            for a in soup.find_all('a', href=True):
+                href = a['href']
+                if not any(x in href.lower() for x in ['javascript:', 'mailto:', '.pdf', '.jpg', '.png', '.gif']):
+                    next_url = urljoin(url, href)
+                    if urlparse(next_url).netloc == base_domain:
+                        links.append(next_url)
+            return links
         except Exception as e:
             logger.error(f"Error crawling {url}: {str(e)}")
 theme = gr.themes.Soft(primary_hue="blue", font="Open Sans")
 with gr.Blocks(theme=theme, css="""
+    @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
+    .gradio-container {
+        font-family: 'Open Sans', sans-serif !important;
+    }
+    .gr-button {
+        font-family: 'Open Sans', sans-serif !important;
+        font-weight: 600 !important;
+    }
+    .primary-btn {
+        background-color: #2436d4 !important;
+        color: white !important;
+    }
+    .primary-btn:hover {
+        background-color: #1c2aa8 !important;
+    }
+    [data-testid="textbox"] {
+        font-family: 'Open Sans', sans-serif !important;
+    }
+    .gr-padded {
+        font-family: 'Open Sans', sans-serif !important;
+    }
+    .gr-input {
+        font-family: 'Open Sans', sans-serif !important;
+    }
+    .gr-label {
+        font-family: 'Open Sans', sans-serif !important;
+    }
 """) as iface:
     gr.Markdown("# llms.txt Generator")
     gr.Markdown("Generate an llms.txt file from a website following the specification.")