cyberandy committed
Commit
4f7928b
1 Parent(s): ab2a9d9

Update app.py

Files changed (1)
  1. app.py +70 -31
app.py CHANGED
@@ -9,6 +9,7 @@ import asyncio
 from collections import defaultdict
 import time
 import logging
+ import unicodedata

 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -26,6 +27,20 @@ class WebsiteCrawler:
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
         }

+     def normalize_text(self, text):
+         """Normalize text to handle encoding issues"""
+         if not text:
+             return ""
+         # Normalize unicode characters
+         text = unicodedata.normalize('NFKD', text)
+         # Replace special quotes and dashes with standard characters
+         text = text.replace('“', '"').replace('”', '"').replace('’', "'").replace('—', '-')
+         # Remove any remaining non-ASCII characters
+         text = text.encode('ascii', 'ignore').decode('ascii')
+         # Clean up extra whitespace
+         text = ' '.join(text.split())
+         return text
+
     def is_valid_url(self, url, base_domain):
         """Check if URL is valid and belongs to the same domain"""
         try:
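
The new normalize_text helper is easy to sanity-check on its own. A minimal sketch (illustrative, not part of the commit; the constructor arguments are arbitrary):

```python
# Hypothetical standalone check of normalize_text.
crawler = WebsiteCrawler(max_depth=1, max_pages=1)
print(crawler.normalize_text('“Smart” quotes — and\u00a0odd   spacing'))
# -> "Smart" quotes - and odd spacing
```

Worth noting: the final ASCII round-trip drops any non-Latin characters outright, so pages in non-Latin scripts will lose their text entirely.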
@@ -46,8 +61,8 @@ class WebsiteCrawler:
         # Get main content
         main_content = soup.find('main') or soup.find('article') or soup.find('div', {'class': re.compile(r'content|main', re.I)})
         if main_content:
-             return main_content.get_text(strip=True)
-         return soup.get_text(strip=True)
+             return self.normalize_text(main_content.get_text(strip=True))
+         return self.normalize_text(soup.get_text(strip=True))

     def get_page_metadata(self, soup, url):
         """Extract metadata from the page"""
@@ -58,20 +73,22 @@ class WebsiteCrawler:
             'category': 'Optional'
         }

-         # Title extraction
-         metadata['title'] = (
+         # Title extraction with normalization
+         title = (
             soup.find('meta', property='og:title')['content'] if soup.find('meta', property='og:title') else
             soup.find('title').text if soup.find('title') else
             soup.find('h1').text if soup.find('h1') else
             url.split('/')[-1]
         )
+         metadata['title'] = self.normalize_text(title)

-         # Description extraction
-         metadata['description'] = (
+         # Description extraction with normalization
+         description = (
             soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else
             soup.find('meta', property='og:description')['content'] if soup.find('meta', property='og:description') else
             ""
         )
+         metadata['description'] = self.normalize_text(description)

         # Calculate importance based on various factors
         importance = 0
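
Title and description now resolve a raw value first and normalize it second. A quick trace of the title path, using hypothetical markup and only the `<title>` branch of the chain:

```python
from bs4 import BeautifulSoup

# Hypothetical page; only the <title> branch of the extraction chain fires here.
soup = BeautifulSoup('<html><title>“Docs” — Home</title></html>', 'html.parser')
title = soup.find('title').text
print(WebsiteCrawler(max_depth=1, max_pages=1).normalize_text(title))
# -> "Docs" - Home
```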
@@ -101,6 +118,7 @@ class WebsiteCrawler:

         try:
             response = requests.get(url, headers=self.headers, timeout=self.timeout)
+             response.encoding = 'utf-8'  # Explicitly set encoding
             response.raise_for_status()
             self.visited_urls.add(url)

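Rationale for the encoding override: when a text/* response omits a charset parameter, requests falls back to ISO-8859-1, so UTF-8 punctuation arrives as mojibake before normalize_text ever sees it. A standalone illustration (not from the commit):

```python
import requests

resp = requests.get('https://example.com', timeout=30)
print(resp.encoding)     # 'ISO-8859-1' when the server sends no charset
resp.encoding = 'utf-8'  # mirrors the fix above; set before reading resp.text
```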
@@ -182,27 +200,27 @@ class WebsiteCrawler:

         return "\n".join(content)

- def save_llms_txt(content, save_path="llms.txt"):
-     """Save the generated content to a file"""
-     try:
-         with open(save_path, 'w', encoding='utf-8') as f:
-             f.write(content)
-         return f"Successfully saved to {save_path}"
-     except Exception as e:
-         return f"Error saving file: {str(e)}"
-
- async def process_url(url, max_depth, max_pages, save_to_file=False):
+ async def process_url(url, max_depth, max_pages):
     """Process URL and generate llms.txt"""
     try:
+         # Add https:// if not present
+         if not url.startswith(('http://', 'https://')):
+             url = 'https://' + url
+
+         # Validate URL format
+         try:
+             result = urlparse(url)
+             if not all([result.scheme, result.netloc]):
+                 return "", "Invalid URL format. Please enter a valid URL."
+         except:
+             return "", "Invalid URL format. Please enter a valid URL."
+
+         # Create crawler and process
         crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
         await crawler.crawl_website(url)
         content = crawler.generate_llms_txt()

-         if save_to_file:
-             save_message = save_llms_txt(content)
-             return content, f"Crawled {len(crawler.visited_urls)} pages. {save_message}"
-
-         return content, f"Crawled {len(crawler.visited_urls)} pages. File not saved (checkbox not selected)"
+         return content, f"Successfully crawled {len(crawler.visited_urls)} pages. You can now copy the generated content."

     except Exception as e:
         return "", f"Error: {str(e)}"
@@ -224,27 +242,48 @@ body, .gradio-container {
     font-family: 'Open Sans', sans-serif !important;
     font-weight: 600 !important;
 }
+
+ .gr-input {
+     font-family: 'Open Sans', sans-serif !important;
+ }
 """

 # Create the Gradio interface
 iface = gr.Interface(
-     fn=lambda url, max_depth, max_pages, save: asyncio.run(process_url(url, max_depth, max_pages, save)),
+     fn=lambda url, max_depth, max_pages: asyncio.run(process_url(url, max_depth, max_pages)),
     inputs=[
-         gr.Textbox(label="Website URL", placeholder="Enter the website URL..."),
-         gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Maximum Crawl Depth"),
-         gr.Slider(minimum=10, maximum=100, value=50, step=10, label="Maximum Pages to Crawl"),
-         gr.Checkbox(label="Save to file", value=False)
+         gr.Textbox(
+             label="Website URL",
+             placeholder="Enter the website URL (e.g., example.com or https://example.com)",
+             info="The URL will be automatically prefixed with https:// if no protocol is specified."
+         ),
+         gr.Slider(
+             minimum=1,
+             maximum=5,
+             value=3,
+             step=1,
+             label="Maximum Crawl Depth",
+             info="Higher values will result in more thorough but slower crawling"
+         ),
+         gr.Slider(
+             minimum=10,
+             maximum=100,
+             value=50,
+             step=10,
+             label="Maximum Pages to Crawl",
+             info="Higher values will result in more comprehensive but slower results"
+         )
     ],
     outputs=[
-         gr.Textbox(label="Generated llms.txt Content", lines=20),
+         gr.Textbox(
+             label="Generated llms.txt Content",
+             lines=20,
+             info="Copy this content to create your llms.txt file"
+         ),
         gr.Textbox(label="Status")
     ],
     title="llms.txt Generator",
     description="Generate an llms.txt file from a website following the specification. The tool crawls the website and creates a structured markdown file suitable for LLMs.",
-     examples=[
-         ["https://example.com", 3, 50, False],
-         ["https://docs.python.org", 3, 50, True]
-     ],
     theme=gr.themes.Soft(),
     css=css
 )
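
With the save-to-file path gone, the callback arity drops from four arguments to three, keeping the lambda, the inputs list, and process_url in sync. Dropping the examples also avoids a mismatch: both old examples carried a fourth save flag the new signature no longer accepts. A hypothetical direct call, the way the interface will invoke it (this would perform a real crawl if run):

```python
# Hypothetical invocation matching the new 3-argument signature.
content, status = asyncio.run(process_url('example.com', 3, 50))
print(status)  # e.g. 'Successfully crawled N pages. ...'
```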
 