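"""Generate llms.txt content for a website.

A WebsiteCrawler walks a site (bounded by a maximum depth and page count),
collects page titles, descriptions, and categories, and formats them into
llms.txt content through a small Gradio interface.
"""
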
import gradio as gr
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
import asyncio
import aiohttp
from collections import defaultdict
import unicodedata
import logging
import ssl

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class WebsiteCrawler:
    def __init__(self, max_depth=3, max_pages=50):
        self.max_depth = max_depth
        self.max_pages = max_pages
        self.visited_urls = set()
        self.url_metadata = defaultdict(dict)
        self.homepage_metadata = None
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate, br",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }

    def determine_category_importance(self, url, title, desc):
        """Assign a category and an importance score to a URL."""
        url_lower = url.lower()
        path = urlparse(url).path.lower()

        # Homepage
        if path == "/" or path == "":
            return "Main", 10

        # Documentation and Help
        if any(
            x in url_lower
            for x in ["/docs", "/documentation", "/faq", "/help", "frequently-asked-questions"]
        ):
            return "Documentation", 8

        # API and Developer
        elif any(x in url_lower for x in ["/api", "/developer", "developers"]):
            return "API", 8

        # About/Company pages
        elif any(
            x in url_lower
            for x in [
                "/about", "/company", "/references", "/work-with-us",
                "careers", "/team", "/contact", "/about-us",
            ]
        ):
            return "About", 7

        # News and Events
        elif any(
            x in url_lower
            for x in ["/news", "/blog", "/events", "/press", "research", "power-of", "latest"]
        ):
            return "News", 5

        # Tools and Services
        elif any(
            x in url_lower
            for x in ["/tools", "/quote", "/pricing", "/services", "/translate", "/order", "/buy"]
        ):
            return "Tools", 6

        # Demote URL paths that contain non-ASCII or percent-encoded characters
        if bool(re.search(r"[^\x00-\x7F]", path)) or bool(re.search(r"%[0-9A-F]{2}", path)):
            return "Optional", 0

        return "Optional", 1

    def is_duplicate_content(self, desc, title, url):
        """Detect likely duplicate or translated pages."""
        if not desc or not title:
            return False

        # Skip non-Latin URLs or URLs with percent-encoded non-ASCII characters
        if bool(re.search(r"[^\x00-\x7F]", url)) or bool(re.search(r"%[0-9A-F]{2}", url)):
            return True

        # Skip common translation paths
        translation_indicators = [
            "/welcome",
            "/bienvenue",
            "/willkommen",
            "/benvenuto",
            "/tervetuloa",
            "/bienvenido",
            "/velkommen",
            "/welkom",
            "translate.com/",
            "/translate/",
            "/translation/",
        ]
        if any(indicator in url.lower() for indicator in translation_indicators):
            url_path = urlparse(url).path.lower()
            if url_path != "/":  # Don't skip the homepage
                return True

        # Check for similar content length and patterns
        for existing_metadata in self.url_metadata.values():
            existing_desc = existing_metadata.get("description", "")
            existing_title = existing_metadata.get("title", "")
            if not existing_desc or not existing_title:
                continue

            # Descriptions of very similar length are likely translations of each other
            if (
                abs(len(desc) - len(existing_desc)) < 20
                and len(desc) > 50
                and desc != existing_desc  # allow exact duplicates for the main page
            ):
                return True

        return False

    def clean_text(self, text, is_title=False):
        """Normalize and clean extracted text."""
        if not text or len(text.strip()) < 2:
            return ""

        # Normalize unicode characters and drop non-ASCII
        text = unicodedata.normalize("NFKD", text)
        text = re.sub(r"[^\x00-\x7F]+", "", text)

        # Remove template variables/placeholders
        text = re.sub(r"\{\{.*?\}\}", "", text)
        text = re.sub(r"\{\%.*?\%\}", "", text)
        text = re.sub(r"\${.*?\}", "", text)

        if is_title:
            # Remove common suffixes and fragments from titles
            text = re.sub(r"^\s*Welcome to\s+", "", text)
            text = re.sub(r"\s*[\|\-#:•].*", "", text)
            text = re.sub(r"\s+Homepage$", "", text, flags=re.IGNORECASE)

            # Drop overly generic titles
            if text.lower() in ["features", "home", "homepage", "welcome"]:
                return ""

        # Only return meaningful text
        cleaned = " ".join(text.split()).strip()
        if len(cleaned.split()) < 2 and not is_title:  # allow single-word titles
            return ""

        return cleaned

    def clean_description(self, desc):
        """Clean description text."""
        if not desc:
            return ""

        # Remove leading dashes, hyphens, or colons
        desc = re.sub(r"^[-:\s]+", "", desc)

        # Remove any strings that are just "Editors", "APIs", etc.
        if len(desc.split()) <= 1:
            return ""

        return desc.strip()

    def extract_homepage_description(self, soup):
        """Extract a description from the homepage, with multiple fallbacks."""
        # Try the meta description first
        meta_desc = soup.find("meta", {"name": "description"})
        if meta_desc and meta_desc.get("content"):
            desc = meta_desc["content"]
            if desc and len(desc.strip()) > 20:
                return self.clean_text(desc)

        # Try the OpenGraph description
        og_desc = soup.find("meta", property="og:description")
        if og_desc and og_desc.get("content"):
            desc = og_desc["content"]
            if desc and len(desc.strip()) > 20:
                return self.clean_text(desc)

        # Try the first significant paragraph
        for p in soup.find_all("p"):
            text = p.get_text().strip()
            if len(text) > 50 and not any(
                x in text.lower() for x in ["cookie", "accept", "privacy"]
            ):
                return self.clean_text(text)

        # Try the main content area, if present
        main = soup.find("main")
        if main:
            first_p = main.find("p")
            if first_p:
                text = first_p.get_text().strip()
                if len(text) > 50:
                    return self.clean_text(text)

        return None

    async def crawl_page(self, url, depth, base_domain):
        """Crawl a single page and extract information."""
        if (
            depth > self.max_depth
            or url in self.visited_urls
            or len(self.visited_urls) >= self.max_pages
        ):
            return []

        try:
            await asyncio.sleep(1)  # Be polite to servers
            async with aiohttp.ClientSession() as session:
                async with session.get(
                    url, headers=self.headers, allow_redirects=True
                ) as response:
                    if response.status == 403:
                        # Try with alternative headers
                        alt_headers = {
                            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
                            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                        }
                        async with session.get(
                            url, headers=alt_headers, allow_redirects=True
                        ) as retry_response:
                            if retry_response.status != 200:
                                return []
                            text = await retry_response.text()
                    elif response.status != 200:
                        return []
                    else:
                        text = await response.text()

                    self.visited_urls.add(url)
                    soup = BeautifulSoup(text, "html.parser")

                    # Extract title with fallbacks
                    title = None
                    meta_title = soup.find("meta", property="og:title")
                    if meta_title and meta_title.get("content"):
                        title = meta_title["content"]
                    if not title:
                        title_tag = soup.find("title")
                        if title_tag:
                            title = title_tag.text
                    if not title:
                        h1_tag = soup.find("h1")
                        if h1_tag:
                            title = h1_tag.text
                    if not title:
                        title = url.split("/")[-1]

                    title = self.clean_text(title, is_title=True)

                    # Extract description with fallbacks
                    desc = None
                    meta_desc = soup.find("meta", {"name": "description"})
                    if meta_desc and meta_desc.get("content"):
                        desc = meta_desc["content"]
                    if not desc:
                        og_desc = soup.find("meta", property="og:description")
                        if og_desc and og_desc.get("content"):
                            desc = og_desc["content"]
                    if not desc:
                        first_p = soup.find("p")
                        if first_p:
                            desc = first_p.text

                    desc = self.clean_text(desc) if desc else ""

                    # Skip if it's duplicate content
                    if self.is_duplicate_content(desc, title, url):
                        return []

                    # Determine category and importance
                    category, importance = self.determine_category_importance(
                        url, title, desc
                    )

                    # Store metadata, keyed by URL without fragment or trailing slash
                    clean_url = re.sub(r"#.*", "", url).rstrip("/")
                    if title and len(title.strip()) > 0:  # only store valid titles
                        logger.info(f"Storing metadata for {clean_url}: {title[:30]}...")
                        self.url_metadata[clean_url] = {
                            "title": title,
                            "description": desc,
                            "category": category,
                            "importance": importance,
                        }

                    # Collect same-domain links, skipping non-HTML targets
                    links = []
                    for a in soup.find_all("a", href=True):
                        href = a["href"]
                        if not any(
                            x in href.lower()
                            for x in ["javascript:", "mailto:", ".pdf", ".jpg", ".png", ".gif"]
                        ):
                            next_url = urljoin(url, href)
                            if urlparse(next_url).netloc == base_domain:
                                links.append(next_url)

                    return links

        except Exception as e:
            logger.error(f"Error crawling {url}: {str(e)}")
            return []

    async def process_homepage(self, url):
        """Process the homepage to extract site-level metadata."""
        try:
            # Configure SSL context (certificate verification is disabled)
            ssl_context = ssl.create_default_context()
            ssl_context.check_hostname = False
            ssl_context.verify_mode = ssl.CERT_NONE

            connector = aiohttp.TCPConnector(ssl=ssl_context)
            timeout = aiohttp.ClientTimeout(total=30)

            async with aiohttp.ClientSession(
                connector=connector, timeout=timeout
            ) as session:
                async with session.get(
                    url, headers=self.headers, allow_redirects=True
                ) as response:
                    if response.status != 200:
                        raise Exception(
                            f"Failed to fetch homepage: status {response.status}"
                        )

                    try:
                        text = await response.text()
                    except UnicodeDecodeError:
                        text = await response.read()
                        text = text.decode("utf-8", errors="ignore")

                    soup = BeautifulSoup(text, "html.parser")

                    # Extract site name with multiple fallbacks
                    site_name = None

                    # Try meta tags first
                    site_meta = soup.find("meta", property="og:site_name")
                    if site_meta and site_meta.get("content"):
                        site_name = site_meta["content"]

                    # Try structured data
                    if not site_name:
                        schema = soup.find("script", type="application/ld+json")
                        if schema:
                            try:
                                import json

                                data = json.loads(schema.string)
                                if isinstance(data, dict):
                                    site_name = data.get("name") or data.get(
                                        "organizationName"
                                    )
                            except Exception:
                                pass

                    # Try the title tag
                    if not site_name:
                        title_tag = soup.find("title")
                        if title_tag:
                            site_name = title_tag.text.split("|")[0].strip()

                    # Last resort: use the domain name
                    if not site_name:
                        site_name = urlparse(url).netloc.split(".")[0].capitalize()

                    # Get the homepage description
                    description = self.extract_homepage_description(soup)

                    self.homepage_metadata = {
                        "site_name": self.clean_text(site_name, is_title=True),
                        "description": description,
                    }

        except Exception as e:
            logger.error(f"Error processing homepage {url}: {str(e)}")
            self.homepage_metadata = {
                "site_name": urlparse(url).netloc.split(".")[0].capitalize(),
                "description": None,
            }

    async def crawl_website(self, start_url):
        """Crawl a website starting from the given URL."""
        try:
            # First process the homepage
            logger.info(f"Processing homepage: {start_url}")
            await self.process_homepage(start_url)

            base_domain = urlparse(start_url).netloc
            queue = [(start_url, 0)]
            seen = {start_url}

            while queue and len(self.visited_urls) < self.max_pages:
                current_url, depth = queue.pop(0)
                if depth > self.max_depth:
                    continue

                logger.info(f"Crawling page: {current_url} (depth: {depth})")
                links = await self.crawl_page(current_url, depth, base_domain)
                logger.info(f"Found {len(links)} links on {current_url}")

                for link in links:
                    if link not in seen and urlparse(link).netloc == base_domain:
                        seen.add(link)
                        queue.append((link, depth + 1))

            logger.info(f"Crawl completed. Visited {len(self.visited_urls)} pages")

        except Exception as e:
            logger.error(f"Error during crawl: {str(e)}")
            raise

    def generate_llms_txt(self):
        """Generate llms.txt content from the collected metadata."""
        logger.info(f"Starting generate_llms_txt with {len(self.url_metadata)} URLs")

        if not self.url_metadata:
            logger.error("No URL metadata found")
            return "No content was found to generate llms.txt"

        # Sort URLs by importance and remove duplicates
        sorted_urls = []
        seen_titles = set()

        for url, metadata in sorted(
            self.url_metadata.items(),
            key=lambda x: (x[1]["importance"], x[0]),
            reverse=True,
        ):
            if metadata["title"] not in seen_titles:
                sorted_urls.append((url, metadata))
                seen_titles.add(metadata["title"])

        logger.info(f"Found {len(sorted_urls)} unique URLs after deduplication")

        if not sorted_urls:
            logger.error("No valid URLs found after sorting")
            return "No valid content was found"

        # Generate content
        content = []

        # Use homepage metadata for the main title and description
        main_title = self.homepage_metadata.get("site_name", "Welcome")
        homepage_description = self.homepage_metadata.get("description")

        logger.info(f"Homepage title: {main_title}")
        logger.info(f"Homepage description: {homepage_description}")

        content.append(f"# {main_title}")
        if homepage_description:
            content.append(f"\n> {homepage_description}")
        elif len(sorted_urls) > 0:
            # Fall back to the first good description from the crawled content
            for _, metadata in sorted_urls:
                desc = self.clean_description(metadata["description"])
                if desc and len(desc) > 20 and "null" not in desc.lower():
                    content.append(f"\n> {desc}")
                    break

        # Group by category
        categories = defaultdict(list)
        for url, metadata in sorted_urls:
            if metadata["title"] and url:
                categories[metadata["category"]].append((url, metadata))

        logger.info(f"Categories found: {list(categories.keys())}")

        # Add sections in a logical order
        category_order = [
            "Main",
            "Documentation",
            "API",
            "Tools",
            "About",
            "News",
            "Optional",
        ]

        # Only show the Main section if it differs from the homepage description
        if "Main" in categories:
            main_content = categories["Main"]
            if (
                len(main_content) == 1
                and main_content[0][1]["description"] == homepage_description
            ):
                logger.info("Removing duplicate Main content")
                del categories["Main"]

        for category in category_order:
            if category in categories and categories[category]:
                logger.info(
                    f"Processing category {category} with {len(categories[category])} items"
                )
                content.append(f"\n## {category}")

                # Sort links within a category by description length, then title
                category_links = sorted(
                    categories[category],
                    key=lambda x: (-len(x[1]["description"] or ""), x[1]["title"]),
                )

                links = []
                seen_desc = set()  # avoid duplicate descriptions within a category
                for url, metadata in category_links:
                    title = metadata["title"].strip()
                    desc = self.clean_description(metadata["description"])

                    # Skip if the description is a duplicate within this category
                    if desc in seen_desc:
                        continue
                    seen_desc.add(desc)

                    if desc:
                        links.append(f"- [{title}]({url}): {desc}")
                    else:
                        links.append(f"- [{title}]({url})")

                content.append("\n".join(links))

        final_content = "\n".join(content)
        logger.info(f"Generated content length: {len(final_content)}")
        return final_content
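
# Sketch of the shape of generate_llms_txt() output, given how the content list
# is assembled above (illustrative only; the site name, titles, and URLs are
# made-up placeholders, not taken from a real crawl):
#
#   # Example Site
#
#   > A short description pulled from the homepage.
#
#   ## Documentation
#   - [Getting Started](https://example.com/docs/getting-started): How to set up the product.
#
#   ## About
#   - [About Us](https://example.com/about)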

async def process_url(url, max_depth, max_pages):
    """Process a URL and generate llms.txt content."""
    try:
        # Add https:// if no scheme is present
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        # Validate the URL
        result = urlparse(url)
        if not all([result.scheme, result.netloc]):
            return "", "Invalid URL format. Please enter a valid URL."

        logger.info(f"Starting crawl of {url}")

        # Process the website
        crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
        await crawler.crawl_website(url)

        logger.info("Generating llms.txt content")
        content = crawler.generate_llms_txt()

        if not content or content.strip() == "":
            return "", "No content was generated. Check the logs for details."

        return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
    except Exception as e:
        logger.error(f"Error processing URL {url}: {str(e)}")
        return "", f"Error: {str(e)}"

# Create the Gradio interface
theme = gr.themes.Soft(primary_hue="blue", font="Open Sans")

with gr.Blocks(
    theme=theme,
    css="""
    @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');

    .gradio-container {
        font-family: 'Open Sans', sans-serif !important;
    }
    .gr-button {
        font-family: 'Open Sans', sans-serif !important;
        font-weight: 600 !important;
    }
    .primary-btn {
        background-color: #2436d4 !important;
        color: white !important;
    }
    .primary-btn:hover {
        background-color: #1c2aa8 !important;
    }
    [data-testid="textbox"] {
        font-family: 'Open Sans', sans-serif !important;
    }
    .gr-padded {
        font-family: 'Open Sans', sans-serif !important;
    }
    .gr-input {
        font-family: 'Open Sans', sans-serif !important;
    }
    .gr-label {
        font-family: 'Open Sans', sans-serif !important;
    }
    """,
) as iface:
    with gr.Row():
        url_input = gr.Textbox(
            label="Website URL",
            placeholder="Enter the website URL (e.g., example.com)",
            info="The URL will be automatically prefixed with https:// if not provided",
        )

    with gr.Row():
        with gr.Column():
            depth_input = gr.Slider(
                minimum=1, maximum=5, value=3, step=1, label="Maximum Crawl Depth"
            )
        with gr.Column():
            pages_input = gr.Slider(
                minimum=10, maximum=100, value=50, step=10, label="Maximum Pages"
            )

    generate_btn = gr.Button("Generate llms.txt", variant="primary")

    output = gr.Textbox(
        label="Generated llms.txt Content",
        lines=20,
        show_copy_button=True,
        container=True,
    )
    status = gr.Textbox(label="Status")

    generate_btn.click(
        fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
        inputs=[url_input, depth_input, pages_input],
        outputs=[output, status],
    )


if __name__ == "__main__":
    iface.launch()
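
# Minimal sketch of driving the crawler without the Gradio UI, kept as comments
# so it does not affect the app. It uses only the names defined in this file;
# the target URL and limits are placeholders, not values from the original:
#
#   crawler = WebsiteCrawler(max_depth=2, max_pages=20)
#   asyncio.run(crawler.crawl_website("https://example.com"))
#   print(crawler.generate_llms_txt())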