# il-legal/utils/process.py
# Author: schellrw -- commit f5c9f39 ("Create utils/process.py"), 895 bytes.
import pymupdf ## fitz # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: A binary file-like object exposing ``read()`` whose
            contents are a PDF document. The whole stream is read into
            memory and handed to PyMuPDF as ``stream=`` bytes.

    Returns:
        A single string made of each page's ``get_text()`` output, in page
        order, with no extra separator inserted between pages.
    """
    # Open from in-memory bytes (not a path); the context manager closes the
    # document once the text has been collected.
    with pymupdf.open(stream=pdf_file.read(), filetype="pdf") as doc:
        # join() assembles the result in one pass instead of repeated `+=`.
        return "".join(page.get_text() for page in doc)
# Separator candidates handed to RecursiveCharacterTextSplitter, ordered from
# the coarsest boundary (headings) down to the finest (single characters).
# NOTE: several entries are written as regular expressions (`#{1,6}`, `\*\*\*+`,
# `---+`, `___+`); the splitter only interprets them that way when it is
# constructed with is_separator_regex=True, otherwise they are matched literally.
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",  # newline, one to six '#', then a space (heading start)
    "```\n",  # code-fence line ending
    "\n\\*\\*\\*+\n",  # line of three or more '*' (horizontal rule)
    "\n---+\n",  # line of three or more '-' (horizontal rule)
    "\n___+\n",  # line of three or more '_' (horizontal rule)
    "\n\n",  # blank line (paragraph break)
    "\n",  # single line break
    " ",  # single space (word boundary)
    "",  # empty string: last-resort split between individual characters
]
def chunk_text(text, chunk_size=1000, chunk_overlap=100):
    """Split ``text`` into overlapping chunks along Markdown-style boundaries.

    Args:
        text: The string to split.
        chunk_size: Maximum size of each chunk, forwarded to the splitter
            (measured by the splitter's length function; characters by
            default).
        chunk_overlap: Amount of overlap between consecutive chunks,
            forwarded to the splitter.

    Returns:
        The list of chunk strings produced by
        ``RecursiveCharacterTextSplitter.split_text``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Kept from the original configuration; split_text() returns plain
        # strings, so the start index is not part of this function's result.
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
        # MARKDOWN_SEPARATORS contains regex patterns (e.g. "\n#{1,6} ",
        # "\n---+\n"). Without this flag the splitter escapes them and looks
        # for the literal text, so heading and horizontal-rule boundaries
        # would never match.
        is_separator_regex=True,
    )
    return text_splitter.split_text(text)