Spaces:
Sleeping
Sleeping
File size: 3,005 Bytes
ed28876 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
# Book_Ingestion_Lib.py
#########################################
# Library to hold functions for ingesting book files.#
#
####################
# Function List
#
# 1. ingest_text_file(file_path, title=None, author=None, keywords=None):
# 2.
#
#
####################
# Import necessary libraries
import os
import re
from datetime import datetime
import logging
# Import Local
from SQLite_DB import add_media_with_keywords
#######################################################################################################################
# Function Definitions
#
# Ingest a text file into the database with Title/Author/Keywords
def extract_epub_metadata(content):
title_match = re.search(r'Title:\s*(.*?)\n', content)
author_match = re.search(r'Author:\s*(.*?)\n', content)
title = title_match.group(1) if title_match else None
author = author_match.group(1) if author_match else None
return title, author
def ingest_text_file(file_path, title=None, author=None, keywords=None):
try:
with open(file_path, 'r', encoding='utf-8') as file:
content = file.read()
# Check if it's a converted epub and extract metadata if so
if 'epub_converted' in (keywords or ''):
extracted_title, extracted_author = extract_epub_metadata(content)
title = title or extracted_title
author = author or extracted_author
# If title is still not provided, use the filename without extension
if not title:
title = os.path.splitext(os.path.basename(file_path))[0]
# If author is still not provided, set it to 'Unknown'
if not author:
author = 'Unknown'
# If keywords are not provided, use a default keyword
if not keywords:
keywords = 'text_file,epub_converted'
else:
keywords = f'text_file,epub_converted,{keywords}'
# Add the text file to the database
add_media_with_keywords(
url=file_path,
title=title,
media_type='document',
content=content,
keywords=keywords,
prompt='No prompt for text files',
summary='No summary for text files',
transcription_model='None',
author=author,
ingestion_date=datetime.now().strftime('%Y-%m-%d')
)
return f"Text file '{title}' by {author} ingested successfully."
except Exception as e:
logging.error(f"Error ingesting text file: {str(e)}")
return f"Error ingesting text file: {str(e)}"
def ingest_folder(folder_path, keywords=None):
results = []
for filename in os.listdir(folder_path):
if filename.lower().endswith('.txt'):
file_path = os.path.join(folder_path, filename)
result = ingest_text_file(file_path, keywords=keywords)
results.append(result)
|