oceansweep committed on
Commit 852b3e2
1 Parent(s): e406469

Upload 2 files

App_Function_Libraries/Article_Extractor_Lib.py CHANGED
@@ -2,9 +2,6 @@
 #########################################
 # Article Extraction Library
 # This library is used to handle scraping and extraction of articles from web pages.
-# Currently, uses a combination of beatifulsoup4 and trafilatura to extract article text.
-# Firecrawl would be a better option for this, but it is not yet implemented.
-####
 #
 ####################
 # Function List
@@ -19,10 +16,19 @@
 import logging
 # 3rd-Party Imports
 import asyncio
+import os
+import tempfile
+from datetime import datetime
+from typing import List, Dict
+from urllib.parse import urljoin, urlparse
+from xml.dom import minidom
 from playwright.async_api import async_playwright
 from bs4 import BeautifulSoup
 import requests
 import trafilatura
+import xml.etree.ElementTree as ET
+
+
 # Import Local
 #
 #######################################################################################################################
@@ -41,11 +47,6 @@ def get_page_title(url: str) -> str:
         return "Untitled"
 
 
-def get_artice_title(article_url_arg: str) -> str:
-    # Use beautifulsoup to get the page title - Really should be using ytdlp for this....
-    article_title = get_page_title(article_url_arg)
-
-
 def scrape_article(url):
     async def fetch_html(url: str) -> str:
         async with async_playwright() as p:
@@ -59,49 +60,318 @@ def scrape_article(url):
             await browser.close()
             return content
 
-    def extract_article_data(html: str) -> dict:
+    # FIXME - Add option for extracting comments/tables/images
+    def extract_article_data(html: str, url: str) -> dict:
         downloaded = trafilatura.extract(html, include_comments=False, include_tables=False, include_images=False)
+        metadata = trafilatura.extract_metadata(html)
+
+        result = {
+            'title': 'N/A',
+            'author': 'N/A',
+            'content': '',
+            'date': 'N/A',
+            'url': url,
+            'extraction_successful': False
+        }
+
         if downloaded:
-            metadata = trafilatura.extract_metadata(html)
-            if metadata:
-                return {
-                    'title': metadata.title if metadata.title else 'N/A',
-                    'author': metadata.author if metadata.author else 'N/A',
-                    'content': downloaded,
-                    'date': metadata.date if metadata.date else 'N/A',
-                }
-            else:
-                print("Metadata extraction failed.")
-                return None
+            result['content'] = downloaded
+            result['extraction_successful'] = True
+
+        if metadata:
+            result.update({
+                'title': metadata.title if metadata.title else 'N/A',
+                'author': metadata.author if metadata.author else 'N/A',
+                'date': metadata.date if metadata.date else 'N/A'
+            })
         else:
-            print("Content extraction failed.")
-            return None
+            logging.warning("Metadata extraction failed.")
+
+        if not downloaded:
+            logging.warning("Content extraction failed.")
+
+        return result
 
     def convert_html_to_markdown(html: str) -> str:
         soup = BeautifulSoup(html, 'html.parser')
-        # Convert each paragraph to markdown
         for para in soup.find_all('p'):
-            para.append('\n')  # Add a newline at the end of each paragraph for markdown separation
-
+            # Add a newline at the end of each paragraph for markdown separation
+            para.append('\n')
         # Use .get_text() with separator to keep paragraph separation
-        text = soup.get_text(separator='\n\n')
-
-        return text
+        return soup.get_text(separator='\n\n')
 
     async def fetch_and_extract_article(url: str):
         html = await fetch_html(url)
-        print("HTML Content:", html[:500])  # Print first 500 characters of the HTML for inspection
-        article_data = extract_article_data(html)
-        if article_data:
+        article_data = extract_article_data(html, url)
+        if article_data['extraction_successful']:
             article_data['content'] = convert_html_to_markdown(article_data['content'])
-            return article_data
-        else:
-            return None
+        return article_data
+
+    return asyncio.run(fetch_and_extract_article(url))
+
+
+def collect_internal_links(base_url: str) -> set:
+    visited = set()
+    to_visit = {base_url}
+
+    while to_visit:
+        current_url = to_visit.pop()
+        if current_url in visited:
+            continue
+
+        try:
+            response = requests.get(current_url)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Collect internal links
+            for link in soup.find_all('a', href=True):
+                full_url = urljoin(base_url, link['href'])
+                # Only process links within the same domain
+                if urlparse(full_url).netloc == urlparse(base_url).netloc:
+                    if full_url not in visited:
+                        to_visit.add(full_url)
+
+            visited.add(current_url)
+        except requests.RequestException as e:
+            logging.error(f"Error visiting {current_url}: {e}")
+            continue
+
+    return visited
+
+
+def generate_temp_sitemap_from_links(links: set) -> str:
+    """
+    Generate a temporary sitemap file from collected links and return its path.
+
+    :param links: A set of URLs to include in the sitemap
+    :return: Path to the temporary sitemap file
+    """
+    # Create the root element
+    urlset = ET.Element("urlset")
+    urlset.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
+
+    # Add each link to the sitemap
+    for link in links:
+        url = ET.SubElement(urlset, "url")
+        loc = ET.SubElement(url, "loc")
+        loc.text = link
+        lastmod = ET.SubElement(url, "lastmod")
+        lastmod.text = datetime.now().strftime("%Y-%m-%d")
+        changefreq = ET.SubElement(url, "changefreq")
+        changefreq.text = "daily"
+        priority = ET.SubElement(url, "priority")
+        priority.text = "0.5"
+
+    # Create the tree and get it as a string
+    xml_string = ET.tostring(urlset, 'utf-8')
+
+    # Pretty print the XML
+    pretty_xml = minidom.parseString(xml_string).toprettyxml(indent=" ")
+
+    # Create a temporary file
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".xml", delete=False) as temp_file:
+        temp_file.write(pretty_xml)
+        temp_file_path = temp_file.name
+
+    logging.info(f"Temporary sitemap created at: {temp_file_path}")
+    return temp_file_path
+
+
+def generate_sitemap_for_url(url: str) -> List[Dict[str, str]]:
+    """
+    Generate a sitemap for the given URL using the create_filtered_sitemap function.
+
+    Args:
+        url (str): The base URL to generate the sitemap for
+
+    Returns:
+        List[Dict[str, str]]: A list of dictionaries, each containing 'url' and 'title' keys
+    """
+    with tempfile.NamedTemporaryFile(mode="w+", suffix=".xml", delete=False) as temp_file:
+        create_filtered_sitemap(url, temp_file.name, is_content_page)
+        temp_file.seek(0)
+        tree = ET.parse(temp_file.name)
+        root = tree.getroot()
+
+        sitemap = []
+        for url_elem in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}url"):
+            loc = url_elem.find("{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text
+            sitemap.append({"url": loc, "title": loc.split("/")[-1] or url})  # Use the last part of the URL as a title
+
+        return sitemap
+
+def scrape_entire_site(base_url: str) -> List[Dict]:
+    """
+    Scrape the entire site by generating a temporary sitemap and extracting content from each page.
+
+    :param base_url: The base URL of the site to scrape
+    :return: A list of dictionaries containing scraped article data
+    """
+    # Step 1: Collect internal links from the site
+    links = collect_internal_links(base_url)
+    logging.info(f"Collected {len(links)} internal links.")
+
+    # Step 2: Generate the temporary sitemap
+    temp_sitemap_path = generate_temp_sitemap_from_links(links)
+
+    # Step 3: Scrape each URL in the sitemap
+    scraped_articles = []
+    try:
+        for link in links:
+            logging.info(f"Scraping {link} ...")
+            article_data = scrape_article(link)
+
+            if article_data:
+                logging.info(f"Title: {article_data['title']}")
+                logging.info(f"Author: {article_data['author']}")
+                logging.info(f"Date: {article_data['date']}")
+                logging.info(f"Content: {article_data['content'][:500]}...")
+
+                scraped_articles.append(article_data)
+    finally:
+        # Clean up the temporary sitemap file
+        os.unlink(temp_sitemap_path)
+        logging.info("Temporary sitemap file deleted")
+
+    return scraped_articles
+
+
+def scrape_by_url_level(base_url: str, level: int) -> list:
+    """Scrape articles from URLs up to a certain level under the base URL."""
+
+    def get_url_level(url: str) -> int:
+        return len(urlparse(url).path.strip('/').split('/'))
+
+    links = collect_internal_links(base_url)
+    filtered_links = [link for link in links if get_url_level(link) <= level]
+
+    return [article for link in filtered_links if (article := scrape_article(link))]
+
+
+def scrape_from_sitemap(sitemap_url: str) -> list:
+    """Scrape articles from a sitemap URL."""
+    try:
+        response = requests.get(sitemap_url)
+        response.raise_for_status()
+        root = ET.fromstring(response.content)
+
+        return [article for url in root.findall('.//{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
+                if (article := scrape_article(url.text))]
+    except requests.RequestException as e:
+        logging.error(f"Error fetching sitemap: {e}")
+        return []
+
+
+def convert_to_markdown(articles: list) -> str:
+    """Convert a list of article data into a single markdown document."""
+    markdown = ""
+    for article in articles:
+        markdown += f"# {article['title']}\n\n"
+        markdown += f"Author: {article['author']}\n"
+        markdown += f"Date: {article['date']}\n\n"
+        markdown += f"{article['content']}\n\n"
+        markdown += "---\n\n"  # Separator between articles
+    return markdown
+
+
+def is_content_page(url: str) -> bool:
+    """
+    Determine if a URL is likely to be a content page.
+    This is a basic implementation and may need to be adjusted based on the specific website structure.
 
-    # Using asyncio.run to handle event loop creation and execution
-    article_data = asyncio.run(fetch_and_extract_article(url))
-    return article_data
+    :param url: The URL to check
+    :return: True if the URL is likely a content page, False otherwise
+    """
+    #Add more specific checks here based on the website's structure
+    # Exclude common non-content pages
+    exclude_patterns = [
+        '/tag/', '/category/', '/author/', '/search/', '/page/',
+        'wp-content', 'wp-includes', 'wp-json', 'wp-admin',
+        'login', 'register', 'cart', 'checkout', 'account',
+        '.jpg', '.png', '.gif', '.pdf', '.zip'
+    ]
+    return not any(pattern in url.lower() for pattern in exclude_patterns)
+
+
+def create_filtered_sitemap(base_url: str, output_file: str, filter_function):
+    """
+    Create a sitemap from internal links and filter them based on a custom function.
+
+    :param base_url: The base URL of the website
+    :param output_file: The file to save the sitemap to
+    :param filter_function: A function that takes a URL and returns True if it should be included
+    """
+    links = collect_internal_links(base_url)
+    filtered_links = set(filter(filter_function, links))
+
+    root = ET.Element("urlset")
+    root.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
+
+    for link in filtered_links:
+        url = ET.SubElement(root, "url")
+        loc = ET.SubElement(url, "loc")
+        loc.text = link
+
+    tree = ET.ElementTree(root)
+    tree.write(output_file, encoding='utf-8', xml_declaration=True)
+    print(f"Filtered sitemap saved to {output_file}")
+
+
+def scrape_from_filtered_sitemap(sitemap_file: str, filter_function) -> list:
+    """
+    Scrape articles from a sitemap file, applying an additional filter function.
+
+    :param sitemap_file: Path to the sitemap file
+    :param filter_function: A function that takes a URL and returns True if it should be scraped
+    :return: List of scraped articles
+    """
+    try:
+        tree = ET.parse(sitemap_file)
+        root = tree.getroot()
+
+        articles = []
+        for url in root.findall('.//{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):
+            if filter_function(url.text):
+                article_data = scrape_article(url.text)
+                if article_data:
+                    articles.append(article_data)
+
+        return articles
+    except ET.ParseError as e:
+        logging.error(f"Error parsing sitemap: {e}")
+        return []
+
+
+def scrape_and_convert_with_filter(source: str, output_file: str, filter_function=is_content_page, level: int = None):
+    """
+    Scrape articles from a sitemap or by URL level, apply filtering, and convert to a single markdown file.
+
+    :param source: URL of the sitemap, base URL for level-based scraping, or path to a local sitemap file
+    :param output_file: Path to save the output markdown file
+    :param filter_function: Function to filter URLs (default is is_content_page)
+    :param level: URL level for scraping (None if using sitemap)
+    """
+    if level is not None:
+        # Scraping by URL level
+        articles = scrape_by_url_level(source, level)
+        articles = [article for article in articles if filter_function(article['url'])]
+    elif source.startswith('http'):
+        # Scraping from online sitemap
+        articles = scrape_from_sitemap(source)
+        articles = [article for article in articles if filter_function(article['url'])]
+    else:
+        # Scraping from local sitemap file
+        articles = scrape_from_filtered_sitemap(source, filter_function)
+
+    articles = [article for article in articles if filter_function(article['url'])]
+    markdown_content = convert_to_markdown(articles)
+
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.write(markdown_content)
+
+    logging.info(f"Scraped and filtered content saved to {output_file}")
 
 #
 #
-#######################################################################################################################
+#######################################################################################################################
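Usage note (not part of the commit): a minimal sketch of how the new helpers above might be driven. It assumes the module is importable as App_Function_Libraries.Article_Extractor_Lib (the path shown in the file header), that Playwright's browser binaries are installed, and that the URLs and output filename below are placeholders.

# Hypothetical driver script; all URLs and paths below are placeholders.
from App_Function_Libraries.Article_Extractor_Lib import (
    scrape_article,
    scrape_entire_site,
    scrape_and_convert_with_filter,
)

# Single page: returns a dict with 'title', 'author', 'content', 'date', 'url'
# and 'extraction_successful'.
article = scrape_article("https://example.com/some-post")
if article['extraction_successful']:
    print(article['title'])

# Whole site: crawls internal links, builds a temporary sitemap, scrapes each page.
articles = scrape_entire_site("https://example.com")

# Sitemap-driven scrape, filtered by is_content_page, written out as one markdown file.
scrape_and_convert_with_filter("https://example.com/sitemap.xml", "site_dump.md")

With this change scrape_article always returns the result dict (extraction_successful marks failures), so callers no longer need to handle a None return for failed extractions.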
App_Function_Libraries/Article_Summarization_Lib.py CHANGED
@@ -24,15 +24,15 @@ import requests
 # 3rd-Party Imports
 from tqdm import tqdm
 
-from App_Function_Libraries.Utils import sanitize_filename
+from App_Function_Libraries.Utils.Utils import sanitize_filename
 # Local Imports
 from Article_Extractor_Lib import scrape_article
-from Local_Summarization_Lib import summarize_with_llama, summarize_with_oobabooga, summarize_with_tabbyapi, \
+from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_llama, summarize_with_oobabooga, summarize_with_tabbyapi, \
     summarize_with_vllm, summarize_with_kobold, save_summary_to_file, summarize_with_local_llm
-from Summarization_General_Lib import summarize_with_openai, summarize_with_anthropic, summarize_with_cohere, \
+from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize_with_openai, summarize_with_anthropic, summarize_with_cohere, \
     summarize_with_groq, summarize_with_openrouter, summarize_with_deepseek, summarize_with_huggingface, \
     summarize_with_mistral
-from App_Function_Libraries.DB_Manager import ingest_article_to_db
+from App_Function_Libraries.DB.DB_Manager import ingest_article_to_db
 #
 #######################################################################################################################
 # Function Definitions
@@ -51,22 +51,22 @@ def scrape_and_summarize_multiple(urls, custom_prompt_arg, api_name, api_key, ke
     for i, url in tqdm(enumerate(urls), total=len(urls), desc="Processing URLs"):
         custom_title = custom_titles[i] if i < len(custom_titles) else None
         try:
-            result = scrape_and_summarize(url, custom_prompt_arg, api_name, api_key, keywords, custom_title, system_message)
-            results.append(f"Results for URL {i + 1}:\n{result}")
+            article = scrape_article(url)
+            if article and article['extraction_successful']:
+                if custom_title:
+                    article['title'] = custom_title
+                results.append(article)
         except Exception as e:
             error_message = f"Error processing URL {i + 1} ({url}): {str(e)}"
             errors.append(error_message)
-            results.append(f"Failed to process URL {i + 1}: {url}")
 
         # Update progress
         progress((i + 1) / len(urls), desc=f"Processed {i + 1}/{len(urls)} URLs")
 
-    # Combine results and errors
-    combined_output = "\n".join(results)
     if errors:
-        combined_output += "\n\nErrors encountered:\n" + "\n".join(errors)
+        logging.error("\n".join(errors))
 
-    return combined_output
+    return results
 
 
 def scrape_and_summarize(url, custom_prompt_arg, api_name, api_key, keywords, custom_article_title, system_message=None):
@@ -190,6 +190,31 @@ def scrape_and_summarize(url, custom_prompt_arg, api_name, api_key, keywords, cu
         return f"Failed to process URL {url}: {str(e)}"
 
 
+def scrape_and_no_summarize_then_ingest(url, keywords, custom_article_title):
+    try:
+        # Step 1: Scrape the article
+        article_data = scrape_article(url)
+        print(f"Scraped Article Data: {article_data}")  # Debugging statement
+        if not article_data:
+            return "Failed to scrape the article."
+
+        # Use the custom title if provided, otherwise use the scraped title
+        title = custom_article_title.strip() if custom_article_title else article_data.get('title', 'Untitled')
+        author = article_data.get('author', 'Unknown')
+        content = article_data.get('content', '')
+        ingestion_date = datetime.now().strftime('%Y-%m-%d')
+
+        print(f"Title: {title}, Author: {author}, Content Length: {len(content)}")  # Debugging statement
+
+        # Step 2: Ingest the article into the database
+        ingestion_result = ingest_article_to_db(url, title, author, content, keywords, ingestion_date, None, None)
+
+        return f"Title: {title}\nAuthor: {author}\nIngestion Result: {ingestion_result}\n\nArticle Contents: {content}"
+    except Exception as e:
+        logging.error(f"Error processing URL {url}: {str(e)}")
+        return f"Failed to process URL {url}: {str(e)}"
+
+
 def ingest_unstructured_text(text, custom_prompt, api_name, api_key, keywords, custom_article_title, system_message=None):
     title = custom_article_title.strip() if custom_article_title else "Unstructured Text"
     author = "Unknown"
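Usage note (not part of the commit): with these changes, scrape_and_summarize_multiple collects scraped article dicts and logs errors instead of building a combined output string. Below is a minimal sketch of the new no-summarize ingestion path; the URL, keywords, and title are placeholders, and the import path assumes the package layout shown in the file header.

# Hypothetical call; argument values below are placeholders.
from App_Function_Libraries.Article_Summarization_Lib import scrape_and_no_summarize_then_ingest

status = scrape_and_no_summarize_then_ingest(
    url="https://example.com/some-post",
    keywords="example,news",
    custom_article_title="My Custom Title",  # falls back to the scraped title if empty
)
print(status)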