Update app.py
app.py CHANGED
@@ -9,24 +9,23 @@ import os
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-def filter_links_by_type(...):
-    """..."""
-    ...
-    return filtered_links.drop_duplicates()
+def explode_link_df(crawl_df, col_group):
+    """Process links from a specific column group in the crawl dataframe"""
+    try:
+        link = crawl_df[f'{col_group}_links_url'].str.split('@@').explode()
+        text = crawl_df[f'{col_group}_links_text'].str.split('@@').explode()
+        all_links = []
+
+        for link, text in zip(link.dropna(), text.dropna()):
+            if text and text.strip():
+                text = re.sub(r"\n+", " ", text.strip(), re.DOTALL)
+                text = re.sub(r"\s{3,}", " ", text)
+                all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))
+
+        return "\n\n".join(all_links)
+    except Exception as e:
+        logger.error(f"Error processing {col_group} links: {str(e)}")
+        return ""
 
 def process_url(url, link_types):
     """Process URL and generate llms.txt content"""
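The new explode_link_df helper relies on advertools crawl output storing multi-valued columns (for example nav_links_url / nav_links_text) as single strings joined with '@@'. A minimal sketch of that split-and-explode step on a made-up one-row dataframe (the URLs, link texts, and the "nav" column group are hypothetical):

```python
import pandas as pd

# Hypothetical one-row crawl result; advertools joins multi-valued columns
# such as nav_links_url / nav_links_text into one string with '@@'.
crawl_df = pd.DataFrame({
    "nav_links_url": ["https://example.com/about@@https://example.com/blog"],
    "nav_links_text": ["About@@Blog"],
})

links = crawl_df["nav_links_url"].str.split("@@").explode()
texts = crawl_df["nav_links_text"].str.split("@@").explode()

for link, text in zip(links.dropna(), texts.dropna()):
    # Same shape the helper builds: a '## ' heading plus a markdown link.
    print(f"## {text}\n[{text}]({link})\n")
```

Each (text, link) pair becomes a "## heading" plus a markdown link, which the helper then joins into one section string.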
@@ -52,30 +51,39 @@ def process_url(url, link_types)
             title = crawl_df['title'].values[0]
             meta_desc = crawl_df['meta_desc'].values[0]
 
-            # Process links using advertools
-            link_df = adv.crawlytics.links(crawl_df)
-
-            # Filter links based on selected types
-            if link_types:
-                link_df = filter_links_by_type(link_df, link_types)
-
-            # Generate all links content
             all_links = []
-            ...
+
+            # Process links based on selected types
+            if link_types and "All links" not in link_types:
+                for link_type in link_types:
+                    type_match = re.findall(r"header|footer|nav", link_type)
+                    if type_match:
+                        link_content = explode_link_df(crawl_df, type_match[0])
+                        if link_content:
+                            all_links.append(link_content)
+                            all_links.append('\n\n')
+            else:
+                # Process all links using advertools
+                link_df = adv.crawlytics.links(crawl_df)
+                for link, text in link_df[['link', 'text']].values:
+                    if text and text.strip():
+                        text = re.sub(r"\n+", " ", text.strip(), re.DOTALL)
+                        text = re.sub(r"\s{3,}", " ", text)
+                        all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))
+
+            # Generate final content with proper spacing
+            final_content = f"""# {title}
+
+> {meta_desc}
+
+{"\n\n".join(all_links)}"""
 
         finally:
             # Cleanup temporary file
             if os.path.exists(jsonl_path):
                 os.remove(jsonl_path)
 
-        return final_content, f"Successfully crawled website. Found {len(all_links)}
+        return final_content, f"Successfully crawled website. Found {len(all_links)} sections."
 
     except Exception as e:
         logger.error(f"Error processing URL {url}: {str(e)}")
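In the updated process_url, the dropdown labels are mapped onto column groups with a bare re.findall, and the page title, meta description, and collected sections are stitched into the llms.txt text. A small sketch of both steps with made-up values (the selections mirror the gr.Dropdown choices defined later in this diff; the title, description, and single section are placeholders):

```python
import re

# Hypothetical selections; the real choices come from the gr.Dropdown below.
selected = ["<nav> links", "<footer> links"]

for link_type in selected:
    # "<nav> links" -> "nav", "<footer> links" -> "footer"
    type_match = re.findall(r"header|footer|nav", link_type)
    print(link_type, "->", type_match[0] if type_match else None)

# Shape of the generated llms.txt text; title, description, and the single
# section are made-up placeholders.
title = "Example Site"
meta_desc = "An example website."
sections = "\n\n".join(["## About\n[About](https://example.com/about)"])
print(f"""# {title}

> {meta_desc}

{sections}""")
```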
@@ -108,7 +116,7 @@ theme = gr.themes.Soft(
         c300="#a5b2ff",
         c400="#8798ff",
         c500="#6a7eff",
-        c600="#3452db", #
+        c600="#3452db", # Main color
         c700="#2a41af",
         c800="#1f3183",
         c900="#152156",
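For orientation, the c300–c900 values in this hunk are stops on a Gradio color scale. A hedged sketch of how such a palette could be assembled with gr.themes.Color and passed to gr.themes.Soft; the c50/c100/c200/c950 stops and the surrounding call are assumptions, only the hex values above come from the diff:

```python
import gradio as gr

# Palette sketch built around the hex stops shown above; the c50/c100/c200/c950
# values are assumed fillers so that gr.themes.Color gets a full scale.
brand = gr.themes.Color(
    c50="#f0f3ff", c100="#dde3ff", c200="#c3ceff",  # assumed light stops
    c300="#a5b2ff", c400="#8798ff", c500="#6a7eff",
    c600="#3452db",  # Main color
    c700="#2a41af", c800="#1f3183", c900="#152156",
    c950="#0d1638",  # assumed darkest stop
)

theme = gr.themes.Soft(primary_hue=brand)
```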
@@ -118,8 +126,8 @@ theme = gr.themes.Soft(
 
 with gr.Blocks(theme=theme, css=css) as iface:
     with gr.Row():
-        gr.Markdown("
-
+        gr.Markdown("# Generate an `llms.txt` file")
+
     with gr.Row():
         url_input = gr.Textbox(
            label="Enter the home page of a website:",
@@ -129,7 +137,7 @@ with gr.Blocks(theme=theme, css=css) as iface:
 
     with gr.Row():
         link_types = gr.Dropdown(
-            label="Select types of links to extract",
+            label="Select types of links to extract (leave empty to get all links)",
             choices=["<header> links", "<nav> links", "<footer> links", "All links"],
             multiselect=True,
             value=["All links"]
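The remaining hunks only touch labels, and the event wiring is not part of this diff. As a rough, hypothetical sketch, components like url_input and link_types are typically connected to process_url through a button click that fills two outputs, matching the (final_content, status_message) tuple the function returns; the button, the output components, and the stub function below are assumptions:

```python
import gradio as gr

# Stand-in for the real process_url updated in this diff; it only mirrors the
# (final_content, status_message) return shape.
def process_url(url, link_types):
    return f"# {url}", "Successfully crawled website. Found 0 sections."

with gr.Blocks() as demo:
    url_input = gr.Textbox(label="Enter the home page of a website:")
    link_types = gr.Dropdown(
        label="Select types of links to extract (leave empty to get all links)",
        choices=["<header> links", "<nav> links", "<footer> links", "All links"],
        multiselect=True,
        value=["All links"],
    )
    generate_btn = gr.Button("Generate llms.txt")  # hypothetical button label
    output_text = gr.Textbox(label="Generated llms.txt", lines=10)
    status = gr.Markdown()

    generate_btn.click(
        fn=process_url,
        inputs=[url_input, link_types],
        outputs=[output_text, status],
    )

demo.launch()
```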