Spaces:

VTechAI
/

Chat

Runtime error

Chat / parser / file / docs_parser.py

init

8a41f4d 9 months ago

1.5 kB

	"""Docs parser.

	Contains parsers for docx and pdf files.

	"""
	from pathlib import Path
	from typing import Dict

	from application.parser.file.base_parser import BaseParser


	class PDFParser(BaseParser):
	    """Parser that extracts plain text from PDF files using PyPDF2."""

	    def _init_parser(self) -> Dict:
	        """Return the parser configuration, which is an empty dict here."""
	        return {}

	    def parse_file(self, file: Path, errors: str = "ignore") -> str:
	        """Extract the text of every page of a PDF file.

	        Args:
	            file: Path of the PDF file to read.
	            errors: Not used by this parser.

	        Returns:
	            The text of all pages, in page order, joined with newline
	            characters.

	        Raises:
	            ValueError: If PyPDF2 is not installed.
	        """
	        try:
	            import PyPDF2
	        except ImportError as err:
	            # Keep ValueError for existing callers, but chain the original
	            # ImportError so the root cause stays visible in tracebacks.
	            raise ValueError("PyPDF2 is required to read PDF files.") from err

	        with open(file, "rb") as fp:
	            pdf = PyPDF2.PdfReader(fp)
	            # Pull the text out page by page before `fp` is closed.
	            text_list = [page.extract_text() for page in pdf.pages]

	        return "\n".join(text_list)


	class DocxParser(BaseParser):
	    """Parser that extracts plain text from Word files using docx2txt."""

	    def _init_parser(self) -> Dict:
	        """Return the parser configuration, which is an empty dict here."""
	        return {}

	    def parse_file(self, file: Path, errors: str = "ignore") -> str:
	        """Extract the text of a Microsoft Word (docx) file.

	        Args:
	            file: Path of the document to read.
	            errors: Not used by this parser.

	        Returns:
	            Whatever ``docx2txt.process`` returns for ``file``.

	        Raises:
	            ValueError: If docx2txt is not installed.
	        """
	        try:
	            import docx2txt
	        except ImportError as err:
	            # Keep ValueError for existing callers, but chain the original
	            # ImportError so the root cause stays visible in tracebacks.
	            raise ValueError(
	                "docx2txt is required to read Microsoft Word files."
	            ) from err

	        return docx2txt.process(file)