# Article_Extractor_Lib.py
#########################################
# Article Extraction Library
# This library handles the scraping and extraction of articles from web pages.
# It currently uses a combination of playwright, beautifulsoup4, and trafilatura to extract article text.
# Firecrawl would be a better option for this, but it is not yet implemented.
####
#
####################
# Function List
#
# 1. get_page_title(url)
# 2. get_article_title(article_url_arg)
# 3. scrape_article(url)
#
####################
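#
# Example usage (a sketch; the module name and URL below are placeholders):
#
#   from Article_Extractor_Lib import get_page_title, scrape_article
#   title = get_page_title("https://example.com/some-article")
#   article = scrape_article("https://example.com/some-article")  # dict with title/author/date/content, or None
#
####################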
#
# Import necessary libraries
import asyncio
import logging
from typing import Optional
# 3rd-Party Imports
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import requests
import trafilatura
# Import Local
#
#######################################################################################################################
# Function Definitions
#

def get_page_title(url: str) -> str:
    # Fetch the page and return the contents of its <title> tag, or "Untitled" on failure.
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.find('title')
        return title_tag.string.strip() if title_tag and title_tag.string else "Untitled"
    except requests.RequestException as e:
        logging.error(f"Error fetching page title: {e}")
        return "Untitled"


def get_article_title(article_url_arg: str) -> str:
    # Use BeautifulSoup to get the page title - really should be using yt-dlp for this....
    return get_page_title(article_url_arg)


def scrape_article(url: str) -> Optional[dict]:
    async def fetch_html(url: str) -> str:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
            page = await context.new_page()
            await page.goto(url)
            await page.wait_for_load_state("networkidle")  # Wait for the network to be idle
            content = await page.content()
            await browser.close()
            return content

    def extract_article_data(html: str) -> Optional[dict]:
        downloaded = trafilatura.extract(html, include_comments=False, include_tables=False, include_images=False)
        if downloaded:
            metadata = trafilatura.extract_metadata(html)
            if metadata:
                return {
                    'title': metadata.title if metadata.title else 'N/A',
                    'author': metadata.author if metadata.author else 'N/A',
                    'content': downloaded,
                    'date': metadata.date if metadata.date else 'N/A',
                }
            else:
                logging.error("Metadata extraction failed.")
                return None
        else:
            logging.error("Content extraction failed.")
            return None

    def convert_html_to_markdown(html: str) -> str:
        # Note: despite the name, this is a lightweight approximation rather than a full
        # HTML-to-markdown conversion; it only normalizes paragraph separation.
        soup = BeautifulSoup(html, 'html.parser')
        # Add a newline to the end of each paragraph for markdown-style separation
        for para in soup.find_all('p'):
            para.append('\n')

        # Use .get_text() with a separator to keep paragraph separation
        text = soup.get_text(separator='\n\n')

        return text

    async def fetch_and_extract_article(url: str):
        html = await fetch_html(url)
        logging.debug("HTML Content: %s", html[:500])  # Log the first 500 characters of the HTML for inspection
        article_data = extract_article_data(html)
        if article_data:
            article_data['content'] = convert_html_to_markdown(article_data['content'])
            return article_data
        else:
            return None

    # asyncio.run handles event loop creation and execution; note that it raises a
    # RuntimeError if scrape_article is called from an already-running event loop.
    article_data = asyncio.run(fetch_and_extract_article(url))
    return article_data

#
#
#######################################################################################################################
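
# A minimal usage sketch, assuming this file is run as a script and playwright's
# chromium browser has been installed (e.g. via `playwright install chromium`).
# The URL below is a hypothetical placeholder.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    result = scrape_article("https://example.com/some-article")  # hypothetical URL
    if result:
        print(f"Title:  {result['title']}")
        print(f"Author: {result['author']}")
        print(f"Date:   {result['date']}")
        print(result['content'][:500])  # first 500 characters of the extracted text
    else:
        print("Extraction failed.")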