# Book_Ingestion_Lib.py
#########################################
# Library to hold functions for ingesting book files.
#
####################
# Function List
#
# 1. read_epub(file_path)
# 2. extract_epub_metadata(content)
# 3. ingest_text_file(file_path, title=None, author=None, keywords=None)
# 4. ingest_folder(folder_path, keywords=None)
# 5. epub_to_markdown(epub_path)
# 6. format_toc_item(item, level)
# 7. slugify(text)
#
####################
#
# Import necessary libraries
import os
import re
from datetime import datetime
import logging
import ebooklib
from bs4 import BeautifulSoup
from ebooklib import epub
#
# Import Local
from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords
#
#######################################################################################################################
# Function Definitions
#
def read_epub(file_path):
    """Read and extract text from an EPUB file."""
    book = epub.read_epub(file_path)
    chapters = []
    for item in book.get_items():
        if item.get_type() == ebooklib.ITEM_DOCUMENT:
            chapters.append(item.get_content())

    text = ""
    for html_content in chapters:
        # get_content() returns raw HTML bytes; BeautifulSoup handles the decoding
        soup = BeautifulSoup(html_content, 'html.parser')
        text += soup.get_text() + "\n\n"
    return text


def extract_epub_metadata(content):
    """Pull Title/Author lines out of text produced by an EPUB-to-text conversion."""
    title_match = re.search(r'Title:\s*(.*?)\n', content)
    author_match = re.search(r'Author:\s*(.*?)\n', content)
    title = title_match.group(1) if title_match else None
    author = author_match.group(1) if author_match else None
    return title, author
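
# For reference, extract_epub_metadata() expects the converted text to contain
# header lines of the form below (illustrative content only, not from any real file):
#
#     Title: Some Book Title
#     Author: Some Author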


# Ingest a text file into the database with Title/Author/Keywords
def ingest_text_file(file_path, title=None, author=None, keywords=None):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Check if it's a converted epub and extract metadata if so
        if 'epub_converted' in (keywords or ''):
            extracted_title, extracted_author = extract_epub_metadata(content)
            title = title or extracted_title
            author = author or extracted_author

        # If title is still not provided, use the filename without extension
        if not title:
            title = os.path.splitext(os.path.basename(file_path))[0]

        # If author is still not provided, set it to 'Unknown'
        if not author:
            author = 'Unknown'

        # If keywords are not provided, use a default keyword
        if not keywords:
            keywords = 'text_file,epub_converted'
        else:
            keywords = f'text_file,epub_converted,{keywords}'

        # Add the text file to the database
        add_media_with_keywords(
            url=file_path,
            title=title,
            media_type='document',
            content=content,
            keywords=keywords,
            prompt='No prompt for text files',
            summary='No summary for text files',
            transcription_model='None',
            author=author,
            ingestion_date=datetime.now().strftime('%Y-%m-%d')
        )
        return f"Text file '{title}' by {author} ingested successfully."
    except Exception as e:
        logging.error(f"Error ingesting text file: {str(e)}")
        return f"Error ingesting text file: {str(e)}"


def ingest_folder(folder_path, keywords=None):
    """Ingest every .txt file in a folder and return the per-file result messages."""
    results = []
    for filename in os.listdir(folder_path):
        if filename.lower().endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            result = ingest_text_file(file_path, keywords=keywords)
            results.append(result)
    return results


def epub_to_markdown(epub_path):
    """Convert an EPUB file into a single Markdown string with a table of contents."""
    book = epub.read_epub(epub_path)
    markdown_content = "# Table of Contents\n\n"
    chapters = []

    # Extract and format the table of contents
    toc = book.toc
    for item in toc:
        if isinstance(item, tuple):
            section, children = item
            level = 1
            markdown_content += format_toc_item(section, level)
            for child in children:
                markdown_content += format_toc_item(child, level + 1)
        else:
            markdown_content += format_toc_item(item, 1)

    markdown_content += "\n---\n\n"

    # Process each chapter
    for item in book.get_items():
        if item.get_type() == ebooklib.ITEM_DOCUMENT:
            chapter_content = item.get_content().decode('utf-8')
            soup = BeautifulSoup(chapter_content, 'html.parser')

            # Extract chapter title
            title = soup.find(['h1', 'h2', 'h3'])
            if title:
                chapter_title = title.get_text()
                markdown_content += f"# {chapter_title}\n\n"

            # Process chapter content
            for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol']):
                if elem.name.startswith('h'):
                    level = int(elem.name[1])
                    markdown_content += f"{'#' * level} {elem.get_text()}\n\n"
                elif elem.name == 'p':
                    markdown_content += f"{elem.get_text()}\n\n"
                elif elem.name in ['ul', 'ol']:
                    for li in elem.find_all('li'):
                        markdown_content += f"- {li.get_text()}\n"
                    markdown_content += "\n"

            markdown_content += "---\n\n"

    return markdown_content


def format_toc_item(item, level):
    return f"{' ' * (level - 1)}- [{item.title}](#{slugify(item.title)})\n"


def slugify(text):
    return re.sub(r'[\W_]+', '-', text.lower())
#
# End of Function Definitions
#######################################################################################################################
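
# The block below is only an illustrative usage sketch, not part of the library
# proper; the file and folder paths are hypothetical placeholders.
if __name__ == "__main__":
    # Convert an EPUB to Markdown next to the source file (placeholder path)
    sample_epub = "/path/to/book.epub"
    if os.path.exists(sample_epub):
        with open(os.path.splitext(sample_epub)[0] + ".md", "w", encoding="utf-8") as f:
            f.write(epub_to_markdown(sample_epub))

    # Ingest every .txt file in a folder, tagging them as converted EPUBs
    sample_folder = "/path/to/converted_books"
    if os.path.isdir(sample_folder):
        for message in ingest_folder(sample_folder, keywords="epub_converted"):
            print(message)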