update
app.py CHANGED
@@ -11,6 +11,7 @@ import logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

+
 class WebsiteCrawler:
     def __init__(self, max_depth=3, max_pages=50):
         self.max_depth = max_depth
@@ -18,7 +19,7 @@ class WebsiteCrawler:
         self.visited_urls = set()
         self.url_metadata = defaultdict(dict)
         self.headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
         }

     def clean_text(self, text, is_title=False):
@@ -26,99 +27,106 @@ class WebsiteCrawler:
         if not text:
             return ""
         # Normalize unicode characters
+        text = unicodedata.normalize("NFKD", text)
+        text = re.sub(r"[^\x00-\x7F]+", "", text)
+
         if is_title:
             # Remove common suffixes and fragments for titles
+            text = re.sub(r"\s*[\|\-#:•].*", "", text)
+            text = re.sub(r"^\s*Welcome to\s+", "", text)
+            text = text.replace("docusaurus_skipToContent_fallback", "")
+
+        return " ".join(text.split()).strip()

     async def crawl_page(self, url, depth, base_domain):
         """Crawl a single page and extract information"""
+        if (
+            depth > self.max_depth
+            or url in self.visited_urls
+            or len(self.visited_urls) >= self.max_pages
+        ):
             return []

         try:
             response = requests.get(url, headers=self.headers, timeout=10)
+            response.encoding = "utf-8"
             self.visited_urls.add(url)

+            soup = BeautifulSoup(response.text, "html.parser")
+
             # Extract title with fallbacks
             title = None
+            meta_title = soup.find("meta", property="og:title")
+            if meta_title and meta_title.get("content"):
+                title = meta_title["content"]
             if not title:
+                title_tag = soup.find("title")
                 if title_tag:
                     title = title_tag.text
             if not title:
+                h1_tag = soup.find("h1")
                 if h1_tag:
                     title = h1_tag.text
             if not title:
+                title = url.split("/")[-1]

             title = self.clean_text(title, is_title=True)
+
             # Extract description with fallbacks
             desc = None
+            meta_desc = soup.find("meta", {"name": "description"})
+            if meta_desc and meta_desc.get("content"):
+                desc = meta_desc["content"]
             if not desc:
+                og_desc = soup.find("meta", property="og:description")
+                if og_desc and og_desc.get("content"):
+                    desc = og_desc["content"]
             if not desc:
+                first_p = soup.find("p")
                 if first_p:
                     desc = first_p.text
+
             desc = self.clean_text(desc) if desc else ""

             # Determine category and importance
             url_lower = url.lower()
+            category = "Optional"
             importance = 0
+
+            if "docs" in url_lower or "documentation" in url_lower:
+                category = "Docs"
                 importance = 5
+            elif "api" in url_lower:
+                category = "API"
                 importance = 4
+            elif "guide" in url_lower or "tutorial" in url_lower:
+                category = "Guides"
                 importance = 3
+            elif "example" in url_lower:
+                category = "Examples"
                 importance = 2
+            elif "blog" in url_lower:
+                category = "Blog"
                 importance = 1
+
             # Store metadata
+            clean_url = re.sub(r"#.*", "", url).rstrip("/")
             if title and len(title.strip()) > 0:  # Only store if we have a valid title
                 self.url_metadata[clean_url] = {
+                    "title": title,
+                    "description": desc,
+                    "category": category,
+                    "importance": importance,
                 }

             # Find links
             links = []
+            for a in soup.find_all("a", href=True):
+                href = a["href"]
+                if not any(
+                    x in href.lower()
+                    for x in ["javascript:", "mailto:", ".pdf", ".jpg", ".png", ".gif"]
+                ):
                     next_url = urljoin(url, href)
                     if urlparse(next_url).netloc == base_domain:
                         links.append(next_url)
@@ -150,7 +158,7 @@ class WebsiteCrawler:
         if not desc:
             return ""
         # Remove leading dashes, hyphens, or colons
+        desc = re.sub(r"^[-:\s]+", "", desc)
         # Remove any strings that are just "Editors", "APIs", etc.
         if len(desc.split()) <= 1:
             return ""
@@ -164,33 +172,33 @@ class WebsiteCrawler:
         # Sort URLs by importance and remove duplicates
         sorted_urls = []
         seen_titles = set()
+
         for url, metadata in sorted(
             self.url_metadata.items(),
+            key=lambda x: (x[1]["importance"], x[0]),
+            reverse=True,
         ):
+            if metadata["title"] not in seen_titles:
                 sorted_urls.append((url, metadata))
+                seen_titles.add(metadata["title"])

         if not sorted_urls:
             return "No valid content was found"

         # Generate content
         content = []
+
         # Find the best title for the main header (prefer "Welcome" or "Overview")
         main_title = "Welcome"  # Default to Welcome
+
         # Find a good description for the blockquote
         best_description = None
         for _, metadata in sorted_urls:
+            desc = self.clean_description(metadata["description"])
             if desc and len(desc) > 20 and "null" not in desc.lower():
                 best_description = desc
                 break
+
         content.append(f"# {main_title}")
         if best_description:
             content.append(f"\n> {best_description}")
@@ -198,34 +206,35 @@ class WebsiteCrawler:
         # Group by category
         categories = defaultdict(list)
         for url, metadata in sorted_urls:
+            if metadata["title"] and url:
+                categories[metadata["category"]].append((url, metadata))

         # Add sections
+        for category in ["Docs", "API", "Guides", "Examples", "Blog", "Optional"]:
             if category in categories:
                 content.append(f"\n## {category}")
+
                 # Add links without extra newlines
                 links = []
                 for url, metadata in categories[category]:
+                    title = metadata["title"].strip()
+                    desc = self.clean_description(metadata["description"])
                     if desc:
                         links.append(f"- [{title}]({url}): {desc}")
                     else:
                         links.append(f"- [{title}]({url})")

+                content.append("\n".join(links))
+
+        return "\n".join(content)
+

 async def process_url(url, max_depth, max_pages):
     """Process URL and generate llms.txt"""
     try:
         # Add https:// if not present
+        if not url.startswith(("http://", "https://")):
+            url = "https://" + url

         # Validate URL
         result = urlparse(url)
@@ -236,22 +245,25 @@ async def process_url(url, max_depth, max_pages):
         crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
         await crawler.crawl_website(url)
         content = crawler.generate_llms_txt()
+
         return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
+
     except Exception as e:
         return "", f"Error: {str(e)}"

+
 # Create Gradio interface
 theme = gr.themes.Soft(primary_hue="blue", font="Open Sans")

+with gr.Blocks(
+    theme=theme,
+    css="""
     @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
+
     .gradio-container {
         font-family: 'Open Sans', sans-serif !important;
     }
+
     .gr-button {
         font-family: 'Open Sans', sans-serif !important;
         font-weight: 600 !important;
@@ -265,55 +277,60 @@ with gr.Blocks(theme=theme, css="""
     .primary-btn:hover {
         background-color: #1c2aa8 !important;
     }
+
     [data-testid="textbox"] {
         font-family: 'Open Sans', sans-serif !important;
     }
+
     .gr-padded {
         font-family: 'Open Sans', sans-serif !important;
     }
+
     .gr-input {
         font-family: 'Open Sans', sans-serif !important;
     }
+
     .gr-label {
         font-family: 'Open Sans', sans-serif !important;
     }
+    """,
+) as iface:
     gr.Markdown("# llms.txt Generator")
     gr.Markdown("Generate an llms.txt file from a website following the specification.")
+
     with gr.Row():
         url_input = gr.Textbox(
+            label="Website URL",
             placeholder="Enter the website URL (e.g., example.com)",
+            info="The URL will be automatically prefixed with https:// if not provided",
         )
+
     with gr.Row():
         with gr.Column():
+            depth_input = gr.Slider(
+                minimum=1, maximum=5, value=3, step=1, label="Maximum Crawl Depth"
+            )
        with gr.Column():
+            pages_input = gr.Slider(
+                minimum=10, maximum=100, value=50, step=10, label="Maximum Pages"
+            )
+
     generate_btn = gr.Button("Generate llms.txt", variant="primary")
+
     output = gr.Textbox(
         label="Generated llms.txt Content",
         lines=20,
         show_copy_button=True,
+        container=True,
     )
+
     status = gr.Textbox(label="Status")
+
     generate_btn.click(
         fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
         inputs=[url_input, depth_input, pages_input],
+        outputs=[output, status],
     )

 if __name__ == "__main__":
+    iface.launch()
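
For reference, the process_url coroutine touched by this commit can also be driven directly, without the Gradio UI. The snippet below is a minimal sketch and not part of the commit; it assumes this file is importable as app and that the unchanged crawl_website method performs the actual crawl.

    import asyncio

    from app import process_url  # assumption: the Space's app.py is on the import path

    # process_url(url, max_depth, max_pages) returns (content, status),
    # exactly as the Gradio click handler in the diff expects.
    content, status = asyncio.run(process_url("example.com", max_depth=2, max_pages=10))
    print(status)   # "Successfully crawled N pages." or "Error: ..."
    print(content)  # the generated llms.txt text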