cyberandy committed
Commit
5e3183d
1 Parent(s): 1ce3f39

Create app.py

Files changed (1)
  1. app.py +160 -0
app.py ADDED
@@ -0,0 +1,160 @@
+import gradio as gr
+import requests
+from bs4 import BeautifulSoup
+import re
+from urllib.parse import urljoin, urlparse
+import markdown
+
+def get_website_title(soup):
+    """Extract website title from meta tags or title tag"""
+    # Try meta title first
+    meta_title = soup.find('meta', property='og:title')
+    if meta_title:
+        return meta_title['content']
+
+    # Try regular title tag
+    title_tag = soup.find('title')
+    if title_tag:
+        return title_tag.text.strip()
+
+    # Fallback to h1
+    h1_tag = soup.find('h1')
+    if h1_tag:
+        return h1_tag.text.strip()
+
+    return "Website Title"
+
+def get_website_description(soup):
+    """Extract website description from meta tags"""
+    # Try meta description
+    meta_desc = soup.find('meta', {'name': 'description'}) or soup.find('meta', property='og:description')
+    if meta_desc:
+        return meta_desc.get('content', '')
+
+    # Fallback to first paragraph
+    first_p = soup.find('p')
+    if first_p:
+        return first_p.text.strip()
+
+    return "Website description"
+
+def get_important_links(soup, base_url):
+    """Extract important links from the website"""
+    links = []
+    seen_urls = set()
+
+    # Look for navigation links
+    nav_elements = soup.find_all(['nav', 'header'])
+    for nav in nav_elements:
+        for a in nav.find_all('a', href=True):
+            url = urljoin(base_url, a['href'])
+            if url not in seen_urls and not url.startswith(('javascript:', 'mailto:', 'tel:')):
+                text = a.text.strip()
+                if text and len(text) > 1:  # Avoid empty or single-character links
+                    links.append({
+                        'title': text,
+                        'url': url,
+                        'section': 'Docs'
+                    })
+                    seen_urls.add(url)
+
+    # Look for footer links
+    footer = soup.find('footer')
+    if footer:
+        for a in footer.find_all('a', href=True):
+            url = urljoin(base_url, a['href'])
+            if url not in seen_urls and not url.startswith(('javascript:', 'mailto:', 'tel:')):
+                text = a.text.strip()
+                if text and len(text) > 1:
+                    links.append({
+                        'title': text,
+                        'url': url,
+                        'section': 'Optional'
+                    })
+                    seen_urls.add(url)
+
+    return links
+
+def generate_llms_txt(url):
+    try:
+        # Fetch the webpage
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+
+        # Parse the HTML
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Get base components
+        title = get_website_title(soup)
+        description = get_website_description(soup)
+        links = get_important_links(soup, url)
+
+        # Generate llms.txt content
+        content = [
+            f"# {title}\n",
+            f"> {description}\n",
+            "## Docs\n"
+        ]
+
+        # Add documentation links
+        doc_links = [link for link in links if link['section'] == 'Docs']
+        for link in doc_links:
+            content.append(f"- [{link['title']}]({link['url']}): Documentation page\n")
+
+        # Add optional links if present
+        optional_links = [link for link in links if link['section'] == 'Optional']
+        if optional_links:
+            content.append("\n## Optional\n")
+            for link in optional_links:
+                content.append(f"- [{link['title']}]({link['url']})\n")
+
+        # Join all content
+        llms_txt_content = "\n".join(content)
+
+        return llms_txt_content
+    except Exception as e:
+        return f"Error generating llms.txt: {str(e)}"
+
+def save_llms_txt(content, save_path="llms.txt"):
+    """Save the generated content to a file"""
+    try:
+        with open(save_path, 'w', encoding='utf-8') as f:
+            f.write(content)
+        return f"Successfully saved to {save_path}"
+    except Exception as e:
+        return f"Error saving file: {str(e)}"
+
+# Gradio callback: generate the content and optionally save it to disk
+def process_url(url, save_to_file=False):
+    content = generate_llms_txt(url)
+    if save_to_file:
+        save_message = save_llms_txt(content)
+        return content, save_message
+    return content, "File not saved (checkbox not selected)"
+
+# Create the Gradio interface
+iface = gr.Interface(
+    fn=process_url,
+    inputs=[
+        gr.Textbox(label="Website URL", placeholder="Enter the website URL..."),
+        gr.Checkbox(label="Save to file", value=False)
+    ],
+    outputs=[
+        gr.Textbox(label="Generated llms.txt Content", lines=10),
+        gr.Textbox(label="Status")
+    ],
+    title="llms.txt Generator",
+    description="Generate an llms.txt file from a website following the specification. The tool extracts relevant information and creates a structured markdown file suitable for LLMs.",
+    examples=[
+        ["https://example.com", False],
+        ["https://docs.python.org", True]
+    ],
+    theme=gr.themes.Soft()
+)
+
+# Launch the app
+if __name__ == "__main__":
+    iface.launch()
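
A quick way to exercise the generator without launching the Gradio UI is to import the function directly. A minimal sketch, assuming the file above is saved as app.py in the working directory and that gradio, requests, beautifulsoup4, and markdown are installed:

    # Smoke-test the generator directly, bypassing the Gradio interface.
    # Assumes the committed file above is saved as app.py (hypothetical setup).
    from app import generate_llms_txt

    content = generate_llms_txt("https://example.com")
    print(content)  # generated llms.txt markdown, or an error string on failure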