Spaces:

schellrw
/

il-legal

Running

il-legal / utils /process.py

Create utils/process.py

f5c9f39 verified 7 months ago

895 Bytes

	import pymupdf ## fitz # PyMuPDF
	from langchain.text_splitter import RecursiveCharacterTextSplitter

	def extract_text_from_pdf(pdf_file) -> str:
	    """Return the text of every page of a PDF, concatenated in page order.

	    Args:
	        pdf_file: Binary file-like object exposing ``read()``. Its entire
	            content is read into memory and parsed as a PDF.
	            (Presumably an upload handle from the app UI -- confirm
	            against the caller.)

	    Returns:
	        The text of all pages joined together with no separator added
	        between pages. An empty string if the document has no pages.
	    """
	    # Open from an in-memory byte stream rather than a filesystem path,
	    # since only a readable object (not a path) is handed to this function.
	    with pymupdf.open(stream=pdf_file.read(), filetype="pdf") as doc:
	        # Join once instead of repeated `+=`, which re-copies the growing
	        # string on every page.
	        return "".join(page.get_text() for page in doc)

	# Split points for Markdown-flavoured text, ordered from the coarsest
	# structural boundary to the finest fallback. The entries are regular
	# expression patterns, so they only behave as intended when the consumer
	# interprets separators as regexes.
	MARKDOWN_SEPARATORS = [
	    "\n#{1,6} ",  # ATX headings, levels 1-6
	    "```\n",  # end of a fenced code block
	    "\n\\*\\*\\*+\n",  # horizontal rule made of three or more asterisks
	    "\n---+\n",  # horizontal rule made of three or more hyphens
	    "\n___+\n",  # horizontal rule made of three or more underscores
	    "\n\n",  # paragraph break
	    "\n",  # line break
	    " ",  # word boundary
	    "",  # last resort: split between any two characters
	]

	def chunk_text(text, chunk_size=1000, chunk_overlap=100):
	    """Split ``text`` into overlapping chunks, preferring Markdown boundaries.

	    The splitter tries each entry of ``MARKDOWN_SEPARATORS`` in order and
	    falls back to the next, finer one when a piece is still too large.

	    Args:
	        text: The string to split.
	        chunk_size: Upper bound on the size of each chunk, passed straight
	            to the splitter (measured by its default length function).
	        chunk_overlap: Amount of content shared between neighbouring
	            chunks, passed straight to the splitter.

	    Returns:
	        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for
	        ``text`` -- the chunks in document order.
	    """
	    splitter = RecursiveCharacterTextSplitter(
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	        add_start_index=True,
	        strip_whitespace=True,
	        separators=MARKDOWN_SEPARATORS,
	        # The separators are regex patterns (e.g. "#{1,6}", "---+"). Without
	        # this flag the splitter escapes them and looks for those characters
	        # literally, so the heading and rule separators would never match.
	        is_separator_regex=True,
	    )
	    return splitter.split_text(text)