|
import pymupdf |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
|
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_file: Object exposing ``read()``; whatever ``read()`` returns is
            handed to PyMuPDF as the in-memory PDF stream (presumably a
            binary file-like object such as an upload -- confirm with the
            callers).

    Returns:
        One string made of each page's ``get_text()`` output, in iteration
        order, with nothing inserted between pages.
    """
    # Open the document from memory rather than from a path; the context
    # manager closes it once the text has been collected.
    with pymupdf.open(stream=pdf_file.read(), filetype="pdf") as doc:
        # join() builds the result in one pass instead of growing a string
        # with "+=" on every page.
        return "".join(page.get_text() for page in doc)
|
|
|
# Separators handed to RecursiveCharacterTextSplitter by chunk_text, listed
# from the coarsest boundary to the finest.
# NOTE: several entries are written in regular-expression syntax ("{1,6}",
# "+", escaped "*"); they only behave as patterns when the splitter is told
# to interpret separators as regexes, otherwise they are matched literally.
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",  # newline, one to six '#', then a space (heading line)
    "```\n",  # closing fence of a code block
    "\n\\*\\*\\*+\n",  # line of three or more '*'
    "\n---+\n",  # line of three or more '-'
    "\n___+\n",  # line of three or more '_'
    "\n\n",  # blank line
    "\n",  # single newline
    " ",  # single space
    "",  # empty string: last entry in the list
]
|
|
|
def chunk_text(text, chunk_size=1000, chunk_overlap=100):
    """Split ``text`` into overlapping chunks using MARKDOWN_SEPARATORS.

    Args:
        text: The string to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``
            (default 1000).
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``
            (default 100).

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for
        ``text`` (a list of chunk strings per the LangChain documentation).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # NOTE(review): kept from the original; this appears to affect only
        # Document metadata, not the plain strings from split_text -- confirm.
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
        # The heading / horizontal-rule separators are regex patterns
        # ("\n#{1,6} ", "\n---+\n", ...). The splitter escapes separators by
        # default, which would make those entries match only as literal text.
        is_separator_regex=True,
    )
    return text_splitter.split_text(text)
|
|