|
"""Docs parser. |
|
|
|
Contains parsers for docx, pdf files. |
|
|
|
""" |
|
from pathlib import Path |
|
from typing import Dict |
|
|
|
from application.parser.file.base_parser import BaseParser |
|
|
|
|
|
class PDFParser(BaseParser):
    """Parser that extracts plain text from PDF files via PyPDF2."""

    def _init_parser(self) -> Dict:
        """Return the parser configuration.

        This parser needs no configuration, so an empty dict is returned.
        """
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> str:
        """Extract the text of every page of a PDF file.

        Args:
            file: Path of the PDF file to read.
            errors: Not used by this implementation.
                NOTE(review): presumably kept so the signature matches
                the base class -- confirm against BaseParser.

        Returns:
            The extracted text of all pages, in page order, joined with
            newline characters.

        Raises:
            ValueError: If PyPDF2 is not installed. The original
                ImportError is attached as the cause.
        """
        # Imported inside the method so PyPDF2 is only required when a PDF
        # is actually parsed.
        try:
            import PyPDF2
        except ImportError as err:
            raise ValueError("PyPDF2 is required to read PDF files.") from err

        with open(file, "rb") as fp:
            pdf = PyPDF2.PdfReader(fp)
            # Text is extracted inside the `with` block, while `fp` is open.
            page_texts = [page.extract_text() for page in pdf.pages]

        return "\n".join(page_texts)
|
|
|
|
|
class DocxParser(BaseParser):
    """Parser that extracts plain text from Microsoft Word files via docx2txt."""

    def _init_parser(self) -> Dict:
        """Return the parser configuration.

        This parser needs no configuration, so an empty dict is returned.
        """
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> str:
        """Extract the text of a Microsoft Word (docx) file.

        Args:
            file: Path of the Word file to read.
            errors: Not used by this implementation.
                NOTE(review): presumably kept so the signature matches
                the base class -- confirm against BaseParser.

        Returns:
            The text returned by ``docx2txt.process`` for the file.

        Raises:
            ValueError: If docx2txt is not installed. The original
                ImportError is attached as the cause.
        """
        # Imported inside the method so docx2txt is only required when a
        # Word file is actually parsed.
        try:
            import docx2txt
        except ImportError as err:
            raise ValueError(
                "docx2txt is required to read Microsoft Word files."
            ) from err

        return docx2txt.process(file)
|
|