File size: 5,703 Bytes
45e1f81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# Book_Ingestion_Lib.py
#########################################
# Library to hold functions for ingesting book files.
#
####################
# Function List
#
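# 1. read_epub(file_path)
# 2. extract_epub_metadata(content)
# 3. ingest_text_file(file_path, title=None, author=None, keywords=None)
# 4. ingest_folder(folder_path, keywords=None)
# 5. epub_to_markdown(epub_path)
# 6. format_toc_item(item, level)
# 7. slugify(text)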
#
#
####################
#
# Import necessary libraries
import os
import re
from datetime import datetime
import logging

import ebooklib
from bs4 import BeautifulSoup
from ebooklib import epub
#
# Import Local
from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords
#
#######################################################################################################################
# Function Definitions
#



def read_epub(file_path):
    """Return the plain text of every document item in an EPUB file.

    Each ``ITEM_DOCUMENT`` item of the book is parsed as HTML, reduced to its
    text with ``get_text()``, and followed by a blank line ("\\n\\n"). The
    pieces are concatenated in the order the book yields its items.
    """
    book = epub.read_epub(file_path)

    # Gather the raw HTML payload of document items only (skips images, CSS, ...).
    documents = [
        entry.get_content()
        for entry in book.get_items()
        if entry.get_type() == ebooklib.ITEM_DOCUMENT
    ]

    # Strip the markup from each document and terminate it with a blank line.
    return "".join(
        BeautifulSoup(markup, 'html.parser').get_text() + "\n\n"
        for markup in documents
    )


# Extract Title/Author metadata from the text of a converted EPUB
def extract_epub_metadata(content):
    """Extract the title and author from ``Title:`` / ``Author:`` lines.

    The first occurrence of each label in *content* is used; the value is the
    remainder of that line with surrounding whitespace removed.

    Args:
        content: Full text of the document to scan.

    Returns:
        A ``(title, author)`` tuple. Either element is ``None`` when the
        label is absent or its value is empty.
    """
    def _field(label):
        # Only spaces/tabs may follow the colon: using ``\s*`` here would also
        # swallow the newline and capture the *next* line when the value is
        # empty. ``[^\n]*`` takes the rest of the line, so a field on the last
        # line is found even when the text has no trailing newline.
        match = re.search(rf'{label}:[ \t]*([^\n]*)', content)
        value = match.group(1).strip() if match else ''
        return value or None

    return _field('Title'), _field('Author')


def ingest_text_file(file_path, title=None, author=None, keywords=None):
    """Read a UTF-8 text file and store it via ``add_media_with_keywords``.

    Args:
        file_path: Path of the text file; also stored as the media ``url``.
        title: Optional title. Falls back to metadata extracted from the
            content (only when ``'epub_converted'`` appears in *keywords*),
            then to the file name without its extension.
        author: Optional author. Falls back to extracted metadata under the
            same condition, then to ``'Unknown'``.
        keywords: Optional keywords; ``'text_file,epub_converted'`` is always
            prepended (or used alone when none are given).

    Returns:
        A human-readable status string. Any ``Exception`` raised along the
        way is logged and reported in the returned string instead of
        propagating.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            content = handle.read()

        # Converted EPUBs may carry their own metadata lines; explicit
        # arguments still take precedence over what is found in the text.
        if 'epub_converted' in (keywords or ''):
            found_title, found_author = extract_epub_metadata(content)
            title = title or found_title
            author = author or found_author

        # Last-resort defaults: file stem for the title, 'Unknown' author.
        title = title or os.path.splitext(os.path.basename(file_path))[0]
        author = author or 'Unknown'

        # Always tag the entry with the default keywords, keeping any extras.
        keywords = (
            f'text_file,epub_converted,{keywords}'
            if keywords
            else 'text_file,epub_converted'
        )

        add_media_with_keywords(
            url=file_path,
            title=title,
            media_type='document',
            content=content,
            keywords=keywords,
            prompt='No prompt for text files',
            summary='No summary for text files',
            transcription_model='None',
            author=author,
            ingestion_date=datetime.now().strftime('%Y-%m-%d')
        )

        return f"Text file '{title}' by {author} ingested successfully."
    except Exception as e:
        # Report the failure to the caller as text rather than raising.
        failure = f"Error ingesting text file: {str(e)}"
        logging.error(failure)
        return failure


def ingest_folder(folder_path, keywords=None):
    """Ingest every ``.txt`` file found directly inside *folder_path*.

    Args:
        folder_path: Directory to scan (not recursive).
        keywords: Optional keywords forwarded to ``ingest_text_file`` for
            every file.

    Returns:
        A list with the status string returned by ``ingest_text_file`` for
        each ingested file, in sorted file-name order. The list is empty when
        the folder contains no ``.txt`` files.
    """
    results = []
    # Sort so the processing (and result) order is deterministic;
    # os.listdir() returns entries in arbitrary order.
    for filename in sorted(os.listdir(folder_path)):
        if filename.lower().endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            results.append(ingest_text_file(file_path, keywords=keywords))
    # Previously the collected results were built but never returned.
    return results


def epub_to_markdown(epub_path):
    """Convert an EPUB file into a single Markdown string.

    The output starts with a "Table of Contents" list built from ``book.toc``
    (each entry rendered by ``format_toc_item``), followed by one section per
    ``ITEM_DOCUMENT`` item of the book. Sections are separated by ``---``.

    Within a section, the first ``h1``/``h2``/``h3`` found is emitted as a
    top-level ``#`` heading; remaining headings keep their level, paragraphs
    become plain text blocks and ``ul``/``ol`` items become ``-`` bullets.

    Args:
        epub_path: Path of the EPUB file to read.

    Returns:
        The Markdown text.
    """
    book = epub.read_epub(epub_path)

    # Collect fragments and join once at the end instead of repeated ``+=``.
    parts = ["# Table of Contents\n\n"]
    parts.extend(_toc_entries(book.toc, 1))
    parts.append("\n---\n\n")

    # Process each chapter
    for item in book.get_items():
        if item.get_type() != ebooklib.ITEM_DOCUMENT:
            continue

        chapter_content = item.get_content().decode('utf-8')
        soup = BeautifulSoup(chapter_content, 'html.parser')

        # Extract chapter title
        title = soup.find(['h1', 'h2', 'h3'])
        if title:
            parts.append(f"# {title.get_text()}\n\n")

        # Process chapter content
        for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol']):
            if elem is title:
                # Already emitted as the chapter title above; emitting it
                # again here would duplicate the heading.
                continue
            if elem.name.startswith('h'):
                level = int(elem.name[1])
                parts.append(f"{'#' * level} {elem.get_text()}\n\n")
            elif elem.name == 'p':
                parts.append(f"{elem.get_text()}\n\n")
            elif elem.name in ['ul', 'ol']:
                for li in elem.find_all('li'):
                    parts.append(f"- {li.get_text()}\n")
                parts.append("\n")

        parts.append("---\n\n")

    return "".join(parts)


def _toc_entries(entries, level):
    """Yield a formatted TOC line for each entry, recursing into sections.

    An entry is either a single item with a ``title`` or a
    ``(section, children)`` tuple; children may themselves be such tuples,
    so nesting of any depth is handled.
    """
    for entry in entries:
        if isinstance(entry, tuple):
            section, children = entry
            yield format_toc_item(section, level)
            yield from _toc_entries(children, level + 1)
        else:
            yield format_toc_item(entry, level)


def format_toc_item(item, level):
    """Render one table-of-contents entry as a Markdown list link.

    The line is indented two spaces per nesting level beyond the first and
    links to an anchor derived from ``item.title`` via ``slugify``.
    """
    indent = '  ' * (level - 1)
    label = item.title
    anchor = slugify(item.title)
    return f"{indent}- [{label}](#{anchor})\n"


def slugify(text):
    """Lower-case *text* and replace each run of non-word characters or
    underscores with a single hyphen."""
    lowered = text.lower()
    # Splitting on the separator runs and re-joining with '-' is equivalent
    # to substituting each run with '-'.
    pieces = re.split(r'[\W_]+', lowered)
    return '-'.join(pieces)

#
# End of Function Definitions
#######################################################################################################################