Commit 852b3e2 (parent: e406469)
oceansweep committed: Upload 2 files
App_Function_Libraries/Article_Extractor_Lib.py
CHANGED
@@ -2,9 +2,6 @@
 #########################################
 # Article Extraction Library
 # This library is used to handle scraping and extraction of articles from web pages.
-# Currently, uses a combination of beatifulsoup4 and trafilatura to extract article text.
-# Firecrawl would be a better option for this, but it is not yet implemented.
-####
 #
 ####################
 # Function List
@@ -19,10 +16,19 @@
 import logging
 # 3rd-Party Imports
 import asyncio
+import os
+import tempfile
+from datetime import datetime
+from typing import List, Dict
+from urllib.parse import urljoin, urlparse
+from xml.dom import minidom
 from playwright.async_api import async_playwright
 from bs4 import BeautifulSoup
 import requests
 import trafilatura
+import xml.etree.ElementTree as ET
+
+
 # Import Local
 #
 #######################################################################################################################
@@ -41,11 +47,6 @@ def get_page_title(url: str) -> str:
         return "Untitled"
 
 
-def get_artice_title(article_url_arg: str) -> str:
-    # Use beautifulsoup to get the page title - Really should be using ytdlp for this....
-    article_title = get_page_title(article_url_arg)
-
-
 def scrape_article(url):
     async def fetch_html(url: str) -> str:
         async with async_playwright() as p:
@@ -59,49 +60,318 @@ def scrape_article(url):
             await browser.close()
             return content
 
-
+    # FIXME - Add option for extracting comments/tables/images
+    def extract_article_data(html: str, url: str) -> dict:
        downloaded = trafilatura.extract(html, include_comments=False, include_tables=False, include_images=False)
+        metadata = trafilatura.extract_metadata(html)
+
+        result = {
+            'title': 'N/A',
+            'author': 'N/A',
+            'content': '',
+            'date': 'N/A',
+            'url': url,
+            'extraction_successful': False
+        }
+
         if downloaded:
-
-
-
-
-
-
-
-
-
-            print("Metadata extraction failed.")
-            return None
+            result['content'] = downloaded
+            result['extraction_successful'] = True
+
+        if metadata:
+            result.update({
+                'title': metadata.title if metadata.title else 'N/A',
+                'author': metadata.author if metadata.author else 'N/A',
+                'date': metadata.date if metadata.date else 'N/A'
+            })
         else:
-
-
+            logging.warning("Metadata extraction failed.")
+
+        if not downloaded:
+            logging.warning("Content extraction failed.")
+
+        return result
 
     def convert_html_to_markdown(html: str) -> str:
         soup = BeautifulSoup(html, 'html.parser')
-        # Convert each paragraph to markdown
         for para in soup.find_all('p'):
-
-
+            # Add a newline at the end of each paragraph for markdown separation
+            para.append('\n')
         # Use .get_text() with separator to keep paragraph separation
-
-
-        return text
+        return soup.get_text(separator='\n\n')
 
     async def fetch_and_extract_article(url: str):
         html = await fetch_html(url)
-
-        article_data
-        if article_data:
+        article_data = extract_article_data(html, url)
+        if article_data['extraction_successful']:
             article_data['content'] = convert_html_to_markdown(article_data['content'])
-
-
-
+        return article_data
+
+    return asyncio.run(fetch_and_extract_article(url))
 
-
-
-
+
+
+def collect_internal_links(base_url: str) -> set:
+    visited = set()
+    to_visit = {base_url}
+
+    while to_visit:
+        current_url = to_visit.pop()
+        if current_url in visited:
+            continue
+
+        try:
+            response = requests.get(current_url)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Collect internal links
+            for link in soup.find_all('a', href=True):
+                full_url = urljoin(base_url, link['href'])
+                # Only process links within the same domain
+                if urlparse(full_url).netloc == urlparse(base_url).netloc:
+                    if full_url not in visited:
+                        to_visit.add(full_url)
+
+            visited.add(current_url)
+        except requests.RequestException as e:
+            logging.error(f"Error visiting {current_url}: {e}")
+            continue
+
+    return visited
+
+
+def generate_temp_sitemap_from_links(links: set) -> str:
+    """
+    Generate a temporary sitemap file from collected links and return its path.
+
+    :param links: A set of URLs to include in the sitemap
+    :return: Path to the temporary sitemap file
+    """
+    # Create the root element
+    urlset = ET.Element("urlset")
+    urlset.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
+
+    # Add each link to the sitemap
+    for link in links:
+        url = ET.SubElement(urlset, "url")
+        loc = ET.SubElement(url, "loc")
+        loc.text = link
+        lastmod = ET.SubElement(url, "lastmod")
+        lastmod.text = datetime.now().strftime("%Y-%m-%d")
+        changefreq = ET.SubElement(url, "changefreq")
+        changefreq.text = "daily"
+        priority = ET.SubElement(url, "priority")
+        priority.text = "0.5"
+
+    # Create the tree and get it as a string
+    xml_string = ET.tostring(urlset, 'utf-8')
+
+    # Pretty print the XML
+    pretty_xml = minidom.parseString(xml_string).toprettyxml(indent="  ")
+
+    # Create a temporary file
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".xml", delete=False) as temp_file:
+        temp_file.write(pretty_xml)
+        temp_file_path = temp_file.name
+
+    logging.info(f"Temporary sitemap created at: {temp_file_path}")
+    return temp_file_path
+
+
+def generate_sitemap_for_url(url: str) -> List[Dict[str, str]]:
+    """
+    Generate a sitemap for the given URL using the create_filtered_sitemap function.
+
+    Args:
+        url (str): The base URL to generate the sitemap for
+
+    Returns:
+        List[Dict[str, str]]: A list of dictionaries, each containing 'url' and 'title' keys
+    """
+    with tempfile.NamedTemporaryFile(mode="w+", suffix=".xml", delete=False) as temp_file:
+        create_filtered_sitemap(url, temp_file.name, is_content_page)
+        temp_file.seek(0)
+        tree = ET.parse(temp_file.name)
+        root = tree.getroot()
+
+        sitemap = []
+        for url_elem in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}url"):
+            loc = url_elem.find("{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text
+            sitemap.append({"url": loc, "title": loc.split("/")[-1] or url})  # Use the last part of the URL as a title
+
+    return sitemap
+
+def scrape_entire_site(base_url: str) -> List[Dict]:
+    """
+    Scrape the entire site by generating a temporary sitemap and extracting content from each page.
+
+    :param base_url: The base URL of the site to scrape
+    :return: A list of dictionaries containing scraped article data
+    """
+    # Step 1: Collect internal links from the site
+    links = collect_internal_links(base_url)
+    logging.info(f"Collected {len(links)} internal links.")
+
+    # Step 2: Generate the temporary sitemap
+    temp_sitemap_path = generate_temp_sitemap_from_links(links)
+
+    # Step 3: Scrape each URL in the sitemap
+    scraped_articles = []
+    try:
+        for link in links:
+            logging.info(f"Scraping {link} ...")
+            article_data = scrape_article(link)
+
+            if article_data:
+                logging.info(f"Title: {article_data['title']}")
+                logging.info(f"Author: {article_data['author']}")
+                logging.info(f"Date: {article_data['date']}")
+                logging.info(f"Content: {article_data['content'][:500]}...")
+
+                scraped_articles.append(article_data)
+    finally:
+        # Clean up the temporary sitemap file
+        os.unlink(temp_sitemap_path)
+        logging.info("Temporary sitemap file deleted")
+
+    return scraped_articles
+
+
+def scrape_by_url_level(base_url: str, level: int) -> list:
+    """Scrape articles from URLs up to a certain level under the base URL."""
+
+    def get_url_level(url: str) -> int:
+        return len(urlparse(url).path.strip('/').split('/'))
+
+    links = collect_internal_links(base_url)
+    filtered_links = [link for link in links if get_url_level(link) <= level]
+
+    return [article for link in filtered_links if (article := scrape_article(link))]
+
+
+def scrape_from_sitemap(sitemap_url: str) -> list:
+    """Scrape articles from a sitemap URL."""
+    try:
+        response = requests.get(sitemap_url)
+        response.raise_for_status()
+        root = ET.fromstring(response.content)
+
+        return [article for url in root.findall('.//{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
+                if (article := scrape_article(url.text))]
+    except requests.RequestException as e:
+        logging.error(f"Error fetching sitemap: {e}")
+        return []
+
+
+def convert_to_markdown(articles: list) -> str:
+    """Convert a list of article data into a single markdown document."""
+    markdown = ""
+    for article in articles:
+        markdown += f"# {article['title']}\n\n"
+        markdown += f"Author: {article['author']}\n"
+        markdown += f"Date: {article['date']}\n\n"
+        markdown += f"{article['content']}\n\n"
+        markdown += "---\n\n"  # Separator between articles
+    return markdown
+
+
+def is_content_page(url: str) -> bool:
+    """
+    Determine if a URL is likely to be a content page.
+    This is a basic implementation and may need to be adjusted based on the specific website structure.
+
+    :param url: The URL to check
+    :return: True if the URL is likely a content page, False otherwise
+    """
+    # Add more specific checks here based on the website's structure
+    # Exclude common non-content pages
+    exclude_patterns = [
+        '/tag/', '/category/', '/author/', '/search/', '/page/',
+        'wp-content', 'wp-includes', 'wp-json', 'wp-admin',
+        'login', 'register', 'cart', 'checkout', 'account',
+        '.jpg', '.png', '.gif', '.pdf', '.zip'
+    ]
+    return not any(pattern in url.lower() for pattern in exclude_patterns)
+
+
+def create_filtered_sitemap(base_url: str, output_file: str, filter_function):
+    """
+    Create a sitemap from internal links and filter them based on a custom function.
+
+    :param base_url: The base URL of the website
+    :param output_file: The file to save the sitemap to
+    :param filter_function: A function that takes a URL and returns True if it should be included
+    """
+    links = collect_internal_links(base_url)
+    filtered_links = set(filter(filter_function, links))
+
+    root = ET.Element("urlset")
+    root.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
+
+    for link in filtered_links:
+        url = ET.SubElement(root, "url")
+        loc = ET.SubElement(url, "loc")
+        loc.text = link
+
+    tree = ET.ElementTree(root)
+    tree.write(output_file, encoding='utf-8', xml_declaration=True)
+    print(f"Filtered sitemap saved to {output_file}")
+
+
+def scrape_from_filtered_sitemap(sitemap_file: str, filter_function) -> list:
+    """
+    Scrape articles from a sitemap file, applying an additional filter function.
+
+    :param sitemap_file: Path to the sitemap file
+    :param filter_function: A function that takes a URL and returns True if it should be scraped
+    :return: List of scraped articles
+    """
+    try:
+        tree = ET.parse(sitemap_file)
+        root = tree.getroot()
+
+        articles = []
+        for url in root.findall('.//{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):
+            if filter_function(url.text):
+                article_data = scrape_article(url.text)
+                if article_data:
+                    articles.append(article_data)
+
+        return articles
+    except ET.ParseError as e:
+        logging.error(f"Error parsing sitemap: {e}")
+        return []
+
+
+def scrape_and_convert_with_filter(source: str, output_file: str, filter_function=is_content_page, level: int = None):
+    """
+    Scrape articles from a sitemap or by URL level, apply filtering, and convert to a single markdown file.
+
+    :param source: URL of the sitemap, base URL for level-based scraping, or path to a local sitemap file
+    :param output_file: Path to save the output markdown file
+    :param filter_function: Function to filter URLs (default is is_content_page)
+    :param level: URL level for scraping (None if using sitemap)
+    """
+    if level is not None:
+        # Scraping by URL level
+        articles = scrape_by_url_level(source, level)
+        articles = [article for article in articles if filter_function(article['url'])]
+    elif source.startswith('http'):
+        # Scraping from online sitemap
+        articles = scrape_from_sitemap(source)
+        articles = [article for article in articles if filter_function(article['url'])]
+    else:
+        # Scraping from local sitemap file
+        articles = scrape_from_filtered_sitemap(source, filter_function)
+
+    articles = [article for article in articles if filter_function(article['url'])]
+    markdown_content = convert_to_markdown(articles)
+
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.write(markdown_content)
+
+    logging.info(f"Scraped and filtered content saved to {output_file}")
 
 #
 #
-#######################################################################################################################
+#######################################################################################################################
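
Note: after this change, scrape_article drives the Playwright fetch with asyncio.run internally and always returns the dict built by extract_article_data, even on failure, so callers should branch on extraction_successful rather than checking for None. A minimal usage sketch, not part of the commit; the URL is hypothetical:

    from Article_Extractor_Lib import scrape_article

    article = scrape_article("https://example.com/some-post")  # hypothetical URL
    if article['extraction_successful']:
        print(article['title'], article['author'], article['date'])
        print(article['content'][:200])  # content is converted to markdown on success
    else:
        print(f"Extraction failed for {article['url']}")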
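
Note: collect_internal_links crawls every same-domain link reachable from the base URL before returning, so expect one HTTP GET per page on large sites. A sketch of pairing it with the sitemap writer, not part of the commit; the site URL is hypothetical and cleanup of the temp file is the caller's job:

    import os
    from Article_Extractor_Lib import collect_internal_links, generate_temp_sitemap_from_links

    links = collect_internal_links("https://example.com")   # hypothetical site
    sitemap_path = generate_temp_sitemap_from_links(links)  # pretty-printed .xml on disk
    print(f"Wrote {len(links)} URLs to {sitemap_path}")
    os.unlink(sitemap_path)  # NamedTemporaryFile(delete=False) leaves the file behind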
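
Note: scrape_entire_site writes the temporary sitemap but then iterates the in-memory links set directly; the file exists only so the finally block can delete it. A hedged end-to-end sketch, not part of the commit; the site URL is hypothetical:

    from Article_Extractor_Lib import scrape_entire_site

    articles = scrape_entire_site("https://example.com")  # hypothetical site
    for article in articles:
        print(article['title'], article['url'])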
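
Note: scrape_and_convert_with_filter is the new one-call entry point: it picks level-based crawling, a remote sitemap, or a local sitemap file based on its arguments, then filters the articles and writes one markdown file. The filter runs on article['url'] both inside each branch and once more before conversion, which looks redundant but is harmless. A usage sketch, not part of the commit; all URLs and paths are hypothetical:

    from Article_Extractor_Lib import scrape_and_convert_with_filter

    # Crawl URLs at most two path segments deep under the base URL
    scrape_and_convert_with_filter("https://example.com", "site.md", level=2)

    # Scrape every content page listed in a remote sitemap
    scrape_and_convert_with_filter("https://example.com/sitemap.xml", "site.md")

    # Scrape from a local sitemap file with the default is_content_page filter
    scrape_and_convert_with_filter("local_sitemap.xml", "site.md")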
App_Function_Libraries/Article_Summarization_Lib.py
CHANGED
@@ -24,15 +24,15 @@ import requests
 # 3rd-Party Imports
 from tqdm import tqdm
 
-from App_Function_Libraries.Utils import sanitize_filename
+from App_Function_Libraries.Utils.Utils import sanitize_filename
 # Local Imports
 from Article_Extractor_Lib import scrape_article
-from Local_Summarization_Lib import summarize_with_llama, summarize_with_oobabooga, summarize_with_tabbyapi, \
+from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_llama, summarize_with_oobabooga, summarize_with_tabbyapi, \
     summarize_with_vllm, summarize_with_kobold, save_summary_to_file, summarize_with_local_llm
-from Summarization_General_Lib import summarize_with_openai, summarize_with_anthropic, summarize_with_cohere, \
+from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize_with_openai, summarize_with_anthropic, summarize_with_cohere, \
     summarize_with_groq, summarize_with_openrouter, summarize_with_deepseek, summarize_with_huggingface, \
     summarize_with_mistral
-from App_Function_Libraries.DB_Manager import ingest_article_to_db
+from App_Function_Libraries.DB.DB_Manager import ingest_article_to_db
 #
 #######################################################################################################################
 # Function Definitions
@@ -51,22 +51,22 @@ def scrape_and_summarize_multiple(urls, custom_prompt_arg, api_name, api_key, ke
     for i, url in tqdm(enumerate(urls), total=len(urls), desc="Processing URLs"):
         custom_title = custom_titles[i] if i < len(custom_titles) else None
         try:
-
-
+            article = scrape_article(url)
+            if article and article['extraction_successful']:
+                if custom_title:
+                    article['title'] = custom_title
+                results.append(article)
         except Exception as e:
             error_message = f"Error processing URL {i + 1} ({url}): {str(e)}"
             errors.append(error_message)
-            results.append(f"Failed to process URL {i + 1}: {url}")
 
         # Update progress
         progress((i + 1) / len(urls), desc=f"Processed {i + 1}/{len(urls)} URLs")
 
-    # Combine results and errors
-    combined_output = "\n".join(results)
     if errors:
-
+        logging.error("\n".join(errors))
 
-    return
+    return results
 
 
 def scrape_and_summarize(url, custom_prompt_arg, api_name, api_key, keywords, custom_article_title, system_message=None):
@@ -190,6 +190,31 @@ def scrape_and_summarize(url, custom_prompt_arg, api_name, api_key, keywords, cu
         return f"Failed to process URL {url}: {str(e)}"
 
 
+def scrape_and_no_summarize_then_ingest(url, keywords, custom_article_title):
+    try:
+        # Step 1: Scrape the article
+        article_data = scrape_article(url)
+        print(f"Scraped Article Data: {article_data}")  # Debugging statement
+        if not article_data:
+            return "Failed to scrape the article."
+
+        # Use the custom title if provided, otherwise use the scraped title
+        title = custom_article_title.strip() if custom_article_title else article_data.get('title', 'Untitled')
+        author = article_data.get('author', 'Unknown')
+        content = article_data.get('content', '')
+        ingestion_date = datetime.now().strftime('%Y-%m-%d')
+
+        print(f"Title: {title}, Author: {author}, Content Length: {len(content)}")  # Debugging statement
+
+        # Step 2: Ingest the article into the database
+        ingestion_result = ingest_article_to_db(url, title, author, content, keywords, ingestion_date, None, None)
+
+        return f"Title: {title}\nAuthor: {author}\nIngestion Result: {ingestion_result}\n\nArticle Contents: {content}"
+    except Exception as e:
+        logging.error(f"Error processing URL {url}: {str(e)}")
+        return f"Failed to process URL {url}: {str(e)}"
+
+
 def ingest_unstructured_text(text, custom_prompt, api_name, api_key, keywords, custom_article_title, system_message=None):
     title = custom_article_title.strip() if custom_article_title else "Unstructured Text"
     author = "Unknown"
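
Note: scrape_and_summarize_multiple now appends the scraped article dicts themselves (custom titles applied), logs errors instead of mixing failure strings into the output, and returns the results list. A sketch of consuming that return value with stand-in dicts, since a real call needs the full scraping stack; the values below are hypothetical:

    # Stand-ins shaped like scrape_article's output; the real list holds only successful extractions
    results = [
        {'title': 'First post', 'url': 'https://example.com/a', 'extraction_successful': True},
        {'title': 'Second post', 'url': 'https://example.com/b', 'extraction_successful': True},
    ]
    for article in results:
        print(f"{article['title']} -> {article['url']}")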
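
Note: scrape_and_no_summarize_then_ingest scrapes and ingests in one step, skipping the summarizer entirely; it assumes datetime and ingest_article_to_db are already imported at module top, as the surrounding code suggests. A usage sketch, not part of the commit; the URL and keywords are hypothetical:

    status = scrape_and_no_summarize_then_ingest(
        "https://example.com/post",   # hypothetical URL
        keywords="demo,article",
        custom_article_title=""       # empty string falls back to the scraped title
    )
    print(status)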