# Article_Extractor_Lib.py
#########################################
# Article Extraction Library
# This library is used to handle scraping and extraction of articles from web pages.
# Currently, uses a combination of beautifulsoup4 and trafilatura to extract article text.
# Firecrawl would be a better option for this, but it is not yet implemented.
####
#
####################
# Function List
#
# 1. get_page_title(url)
# 2. get_article_text(url)
# 3. get_article_title(article_url_arg)
#
####################
#
# Import necessary libraries
import logging
# 3rd-Party Imports
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import requests
import trafilatura
# Import Local
#
#######################################################################################################################
# Function Definitions
#


def get_page_title(url: str) -> str:
    """Fetch *url* and return the text of its <title> tag.

    Returns "Untitled" when the request fails, the page has no <title>,
    or the tag is empty.
    """
    try:
        # Timeout so a dead host cannot hang the caller indefinitely.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.find('title')
        # Guard title_tag.string too: an empty <title> yields .string == None,
        # and calling .strip() on it would raise AttributeError.
        return title_tag.string.strip() if title_tag and title_tag.string else "Untitled"
    except requests.RequestException as e:
        logging.error(f"Error fetching page title: {e}")
        return "Untitled"


def get_article_title(article_url_arg: str) -> str:
    """Return the page title for *article_url_arg*.

    Thin wrapper over get_page_title; falls back to "Untitled" on failure.
    """
    # Use beautifulsoup to get the page title - Really should be using ytdlp for this....
    # BUG FIX: the original computed the title but never returned it.
    return get_page_title(article_url_arg)


# Backward-compatible alias preserving the original (misspelled) public name.
get_artice_title = get_article_title


def scrape_article(url):
    """Render *url* in headless Chromium and extract the article.

    Returns a dict with keys 'title', 'author', 'content' (plain text with
    markdown-style paragraph separation) and 'date', or None when either
    content or metadata extraction fails.
    """
    async def fetch_html(url: str) -> str:
        # Use a real browser so JavaScript-generated content is present in the HTML.
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
            page = await context.new_page()
            try:
                await page.goto(url)
                await page.wait_for_load_state("networkidle")  # Wait for the network to be idle
                content = await page.content()
            finally:
                # BUG FIX: always close the browser, even when navigation raises,
                # so headless Chromium processes are not leaked.
                await browser.close()
            return content

    def extract_article_data(html: str) -> dict:
        # Extract the main text first; bail out early if trafilatura finds nothing.
        downloaded = trafilatura.extract(html, include_comments=False, include_tables=False,
                                         include_images=False)
        if not downloaded:
            logging.error("Content extraction failed.")
            return None
        metadata = trafilatura.extract_metadata(html)
        if not metadata:
            logging.error("Metadata extraction failed.")
            return None
        return {
            'title': metadata.title if metadata.title else 'N/A',
            'author': metadata.author if metadata.author else 'N/A',
            'content': downloaded,
            'date': metadata.date if metadata.date else 'N/A',
        }

    def convert_html_to_markdown(html: str) -> str:
        soup = BeautifulSoup(html, 'html.parser')
        # Convert each paragraph to markdown
        for para in soup.find_all('p'):
            para.append('\n')  # Add a newline at the end of each paragraph for markdown separation
        # Use .get_text() with separator to keep paragraph separation
        return soup.get_text(separator='\n\n')

    async def fetch_and_extract_article(url: str):
        html = await fetch_html(url)
        # Debug-level logging instead of print: first 500 chars for inspection.
        logging.debug("HTML Content: %s", html[:500])
        article_data = extract_article_data(html)
        if article_data:
            article_data['content'] = convert_html_to_markdown(article_data['content'])
            return article_data
        return None

    # Using asyncio.run to handle event loop creation and execution
    return asyncio.run(fetch_and_extract_article(url))
#
#
#######################################################################################################################