# Book_Ingestion_Lib.py
#########################################
# Library to hold functions for ingesting book files.
#
####################
# Function List
#
# 1. read_epub(file_path)
# 2. extract_epub_metadata(content)
# 3. ingest_text_file(file_path, title=None, author=None, keywords=None)
# 4. ingest_folder(folder_path, keywords=None)
# 5. epub_to_markdown(epub_path)
# 6. format_toc_item(item, level)
# 7. slugify(text)
#
####################
#
# Import necessary libraries
import logging
import os
import re
from datetime import datetime

import ebooklib
from bs4 import BeautifulSoup
from ebooklib import epub
#
# Import Local
from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords
#
#######################################################################################################################
# Function Definitions
#


def read_epub(file_path):
    """Read an EPUB file and return the plain text of its document items.

    Args:
        file_path: Path of the EPUB file to open.

    Returns:
        The text of every ``ITEM_DOCUMENT`` item, in the order the book
        yields them, each followed by a blank line.
    """
    book = epub.read_epub(file_path)
    return "".join(
        BeautifulSoup(item.get_content(), 'html.parser').get_text() + "\n\n"
        for item in book.get_items()
        if item.get_type() == ebooklib.ITEM_DOCUMENT
    )


def _find_metadata_value(label, content):
    """Return the stripped text following ``<label>:`` on its own line, or None."""
    # `[ \t]*` (not `\s*`) keeps the match on the label's line, so an empty
    # field cannot swallow the following line. `.` stops at the newline, so the
    # field is also found on a final line that has no trailing newline.
    match = re.search(rf'{re.escape(label)}:[ \t]*(.*)', content)
    if match is None:
        return None
    return match.group(1).strip() or None


def extract_epub_metadata(content):
    """Extract the ``Title:`` and ``Author:`` fields from converted EPUB text.

    Args:
        content: Full text of the converted book.

    Returns:
        A ``(title, author)`` tuple. Each element is the value found after the
        first occurrence of the corresponding label, or ``None`` when the
        label is absent or its value is blank.
    """
    return (
        _find_metadata_value('Title', content),
        _find_metadata_value('Author', content),
    )


def ingest_text_file(file_path, title=None, author=None, keywords=None):
    """Ingest a text file into the database with Title/Author/Keywords.

    Args:
        file_path: Path of the UTF-8 text file to ingest; also passed as the
            ``url`` of the stored media entry.
        title: Optional title. When missing, it is taken from the file's
            ``Title:`` field (only if ``keywords`` contains
            ``'epub_converted'``), then from the filename without extension.
        author: Optional author. When missing, it is taken from the file's
            ``Author:`` field under the same condition, else ``'Unknown'``.
        keywords: Optional comma-separated keyword string. The keywords
            ``text_file`` and ``epub_converted`` are always prepended.

    Returns:
        A human-readable status string. Errors are logged and reported in the
        returned string rather than raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Check if it's a converted epub and extract metadata if so
        if 'epub_converted' in (keywords or ''):
            extracted_title, extracted_author = extract_epub_metadata(content)
            title = title or extracted_title
            author = author or extracted_author

        # If title is still not provided, use the filename without extension
        if not title:
            title = os.path.splitext(os.path.basename(file_path))[0]

        # If author is still not provided, set it to 'Unknown'
        if not author:
            author = 'Unknown'

        # If keywords are not provided, use a default keyword
        if not keywords:
            keywords = 'text_file,epub_converted'
        else:
            keywords = f'text_file,epub_converted,{keywords}'

        # Add the text file to the database
        add_media_with_keywords(
            url=file_path,
            title=title,
            media_type='document',
            content=content,
            keywords=keywords,
            prompt='No prompt for text files',
            summary='No summary for text files',
            transcription_model='None',
            author=author,
            ingestion_date=datetime.now().strftime('%Y-%m-%d'),
        )
        return f"Text file '{title}' by {author} ingested successfully."
    except Exception as e:
        # Boundary handler: callers receive a status string, never an exception.
        logging.error(f"Error ingesting text file: {str(e)}")
        return f"Error ingesting text file: {str(e)}"


def ingest_folder(folder_path, keywords=None):
    """Ingest every ``.txt`` file directly inside a folder.

    Args:
        folder_path: Directory to scan (sub-directories are not descended).
        keywords: Optional keyword string forwarded to ``ingest_text_file``.

    Returns:
        A list with one status string per ``.txt`` file, in sorted filename
        order.
    """
    # Sorted so the order of ingestion (and of the returned statuses) does not
    # depend on the platform's directory listing order.
    return [
        ingest_text_file(os.path.join(folder_path, filename), keywords=keywords)
        for filename in sorted(os.listdir(folder_path))
        if filename.lower().endswith('.txt')
    ]


def _render_toc(entries, level=1):
    """Return the Markdown lines for TOC entries, descending into sections."""
    lines = []
    for entry in entries:
        if isinstance(entry, tuple):
            # A (section, children) pair; children may themselves hold pairs.
            section, children = entry
            lines.append(format_toc_item(section, level))
            lines.extend(_render_toc(children, level + 1))
        else:
            lines.append(format_toc_item(entry, level))
    return lines


def epub_to_markdown(epub_path):
    """Convert an EPUB file to a Markdown document.

    The output starts with a table of contents built from ``book.toc``,
    followed by one section per document item. Each section holds the item's
    headings, paragraphs and list items, and ends with a ``---`` rule.

    Args:
        epub_path: Path of the EPUB file to convert.

    Returns:
        The Markdown text as a single string.
    """
    book = epub.read_epub(epub_path)

    parts = ["# Table of Contents\n\n"]
    # Extract and format the table of contents
    parts.extend(_render_toc(book.toc))
    parts.append("\n---\n\n")

    # Process each chapter
    for item in book.get_items():
        if item.get_type() != ebooklib.ITEM_DOCUMENT:
            continue

        chapter_content = item.get_content().decode('utf-8')
        soup = BeautifulSoup(chapter_content, 'html.parser')

        # Extract chapter title: the first h1-h3 becomes a top-level heading.
        title = soup.find(['h1', 'h2', 'h3'])
        if title is not None:
            parts.append(f"# {title.get_text()}\n\n")

        # Process chapter content
        for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol']):
            if elem is title:
                # Already written as the chapter title; do not repeat it.
                continue
            if elem.name in ('ul', 'ol'):
                parts.extend(f"- {li.get_text()}\n" for li in elem.find_all('li'))
                parts.append("\n")
            elif elem.name == 'p':
                parts.append(f"{elem.get_text()}\n\n")
            else:
                # Remaining tags are h1..h6; the digit is the heading depth.
                level = int(elem.name[1])
                parts.append(f"{'#' * level} {elem.get_text()}\n\n")

        parts.append("---\n\n")

    return "".join(parts)


def format_toc_item(item, level):
    """Format one table-of-contents entry as an indented Markdown bullet.

    Args:
        item: TOC entry exposing a ``title`` attribute.
        level: 1-based nesting depth of the entry.

    Returns:
        A line ``- [title](#slug)`` ending in a newline, indented two spaces
        per level below the first.
    """
    # Two spaces per level: a sub-list must be indented at least to the
    # content column of its parent "- " bullet to render as nested.
    indent = '  ' * (level - 1)
    return f"{indent}- [{item.title}](#{slugify(item.title)})\n"


def slugify(text):
    """Return a lowercase anchor slug for ``text``.

    Every run of non-alphanumeric characters (underscores included) becomes a
    single hyphen; leading and trailing hyphens are removed.
    """
    return re.sub(r'[\W_]+', '-', text.lower()).strip('-')

#
# End of Function Definitions
#######################################################################################################################