Spaces:

VTechAI
/

Chat

Runtime error

Chat / parser /file /epub_parser.py

init

8a41f4d 9 months ago

1.21 kB

	"""Epub parser.

	Contains parsers for epub files.
	"""

	from pathlib import Path
	from typing import Dict

	from application.parser.file.base_parser import BaseParser


	class EpubParser(BaseParser):
	"""Epub Parser."""

	def _init_parser(self) -> Dict:
	"""Init parser."""
	return {}

	def parse_file(self, file: Path, errors: str = "ignore") -> str:
	"""Parse file."""
	try:
	import ebooklib
	from ebooklib import epub
	except ImportError:
	raise ValueError("`EbookLib` is required to read Epub files.")
	try:
	import html2text
	except ImportError:
	raise ValueError("`html2text` is required to parse Epub files.")

	text_list = []
	book = epub.read_epub(file, options={"ignore_ncx": True})

	# Iterate through all chapters.
	for item in book.get_items():
	# Chapters are typically located in epub documents items.
	if item.get_type() == ebooklib.ITEM_DOCUMENT:
	text_list.append(
	html2text.html2text(item.get_content().decode("utf-8"))
	)

	text = "\n".join(text_list)
	return text