# Article_Extractor_Lib.py
#########################################
# Article Extraction Library
# This library handles the scraping and extraction of articles from web pages.
# It currently uses a combination of playwright, beautifulsoup4, and trafilatura to extract article text.
# Firecrawl would be a better option for this, but it is not yet implemented.
####
#
####################
# Function List
#
# 1. get_page_title(url)
# 2. get_article_title(article_url_arg)
# 3. scrape_article(url)
#
####################
#
# Import necessary libraries
import logging
from typing import Optional
# 3rd-Party Imports
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import requests
import trafilatura
# Import Local
#
#######################################################################################################################
# Function Definitions
#
def get_page_title(url: str) -> str:
    try:
        response = requests.get(url, timeout=30)  # Time out rather than hang on unresponsive servers
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.find('title')
        # title_tag.string is None for an empty <title> tag, so guard both cases
        return title_tag.string.strip() if title_tag and title_tag.string else "Untitled"
    except requests.RequestException as e:
        logging.error(f"Error fetching page title: {e}")
        return "Untitled"


def get_article_title(article_url_arg: str) -> str:
    # Use beautifulsoup to get the page title - Really should be using yt-dlp for this....
    article_title = get_page_title(article_url_arg)
    return article_title


def scrape_article(url: str) -> Optional[dict]:
    async def fetch_html(url: str) -> str:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
            page = await context.new_page()
            await page.goto(url)
            await page.wait_for_load_state("networkidle")  # Wait for the network to be idle
            content = await page.content()
            await browser.close()
            return content

    def extract_article_data(html: str) -> Optional[dict]:
        downloaded = trafilatura.extract(html, include_comments=False, include_tables=False, include_images=False)
        if downloaded:
            metadata = trafilatura.extract_metadata(html)
            if metadata:
                return {
                    'title': metadata.title if metadata.title else 'N/A',
                    'author': metadata.author if metadata.author else 'N/A',
                    'content': downloaded,
                    'date': metadata.date if metadata.date else 'N/A',
                }
            else:
                logging.error("Metadata extraction failed.")
                return None
        else:
            logging.error("Content extraction failed.")
            return None

    def convert_html_to_markdown(html: str) -> str:
        # Note: trafilatura.extract() already returns plain text, so this pass is
        # mostly a safeguard for content that still contains raw HTML.
        soup = BeautifulSoup(html, 'html.parser')
        # Convert each paragraph to markdown
        for para in soup.find_all('p'):
            para.append('\n')  # Add a newline at the end of each paragraph for markdown separation
        # Use .get_text() with separator to keep paragraph separation
        text = soup.get_text(separator='\n\n')
        return text

    async def fetch_and_extract_article(url: str) -> Optional[dict]:
        html = await fetch_html(url)
        logging.debug("HTML Content: %s", html[:500])  # Log the first 500 characters of the HTML for inspection
        article_data = extract_article_data(html)
        if article_data:
            article_data['content'] = convert_html_to_markdown(article_data['content'])
            return article_data
        else:
            return None

    # Use asyncio.run to handle event loop creation and execution
    article_data = asyncio.run(fetch_and_extract_article(url))
    return article_data
#
#
#######################################################################################################################
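

# A minimal usage sketch, assuming Playwright's Chromium browser has been installed
# (`playwright install chromium`); the URL below is a placeholder, not a tested example.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    article = scrape_article("https://example.com/some-article")
    if article:
        print(f"Title: {article['title']}")
        print(f"Author: {article['author']}")
        print(f"Date: {article['date']}")
        print(article['content'][:500])
    else:
        print("Article extraction failed.")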