|
"""Epub parser. |
|
|
|
Contains parsers for epub files. |
|
""" |
|
|
|
from pathlib import Path |
|
from typing import Dict |
|
|
|
from application.parser.file.base_parser import BaseParser |
|
|
|
|
|
class EpubParser(BaseParser):
    """Parser that extracts the text content of Epub files."""

    def _init_parser(self) -> Dict:
        """Return the parser configuration; this parser needs none."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> str:
        """Read an Epub file and return its document content as text.

        Every item of type ``ebooklib.ITEM_DOCUMENT`` is decoded as UTF-8,
        passed through ``html2text.html2text`` and the results are joined
        with newlines, in the order the book yields its items.

        Args:
            file: Path of the Epub file to read.
            errors: Error handler given to ``bytes.decode`` when decoding
                item content (for example ``"ignore"``, ``"replace"`` or
                ``"strict"``).

        Returns:
            The converted text of all document items, newline-separated.
            An empty string when the book has no document items.

        Raises:
            ValueError: If ``EbookLib`` or ``html2text`` is not installed.
        """
        # Both libraries are imported lazily so they are only required when
        # an Epub file is actually parsed.
        try:
            import ebooklib
            from ebooklib import epub
        except ImportError as err:
            raise ValueError(
                "`EbookLib` is required to read Epub files."
            ) from err
        try:
            import html2text
        except ImportError as err:
            raise ValueError(
                "`html2text` is required to parse Epub files."
            ) from err

        book = epub.read_epub(file, options={"ignore_ncx": True})

        # Honour the caller's `errors` policy: a strict decode would abort
        # the whole book on a single malformed byte even though the default
        # handler is "ignore".
        return "\n".join(
            html2text.html2text(
                item.get_content().decode("utf-8", errors=errors)
            )
            for item in book.get_items()
            if item.get_type() == ebooklib.ITEM_DOCUMENT
        )
|
|