paper-slides-summary / pdf_helper.py
zolicsaki's picture
Upload 7 files
57d4532 verified
import pdf4llm
import re
def py4llm_pdf_reader(pdf_path: str):
    """Convert the PDF at ``pdf_path`` to markdown.

    Thin wrapper that forwards ``pdf_path`` to ``pdf4llm.to_markdown`` with
    that function's default options and returns its result unchanged.
    """
    return pdf4llm.to_markdown(pdf_path)
def split_markdown_sections(text):
    """Split markdown text into sections delimited by ``#`` headers.

    A header is a line that starts with one to six ``#`` characters followed
    by optional horizontal whitespace and a non-blank title.

    Args:
        text: Markdown source to split.

    Returns:
        A list of dicts in document order, one per header, with the keys
        ``'level'`` (number of ``#``), ``'title'`` (header text) and
        ``'content'`` (the stripped text between this header and the next
        header, or the end of ``text``). Text that precedes the first header
        is not part of any section. An input without headers yields ``[]``.
    """
    # Why this pattern instead of a plain `\s*(.+)`:
    #   * `[^\S\r\n]*` is "whitespace except line breaks", so a bare "#" line
    #     can no longer swallow the following line as its title.
    #   * `(?!#)` stops the engine from backtracking "#######" into a
    #     level-6 header whose title is "#".
    #   * `\S.*` requires the title to start with a visible character.
    header_pattern = r'^(#{1,6})(?!#)[^\S\r\n]*(\S.*)$'
    matches = list(re.finditer(header_pattern, text, re.MULTILINE))

    sections = []
    for i, match in enumerate(matches):
        # A section body runs up to the next header, or to the end of text.
        end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        sections.append({
            'level': len(match.group(1)),
            'title': match.group(2),
            'content': text[match.end():end_pos].strip(),
        })
    return sections
class PDFPaper4LLMParser(object):
    """Turn a paper PDF into markdown and split it into title, author
    metadata, abstract and main-text sections."""

    def __init__(self, write_images=False, page_chunks=False) -> None:
        """Store the options that ``pdf2text`` forwards to ``pdf4llm.to_markdown``.

        Args:
            write_images: Passed through as ``write_images``.
            page_chunks: Passed through as ``page_chunks``. When truthy,
                ``pdf2text`` expects an iterable of per-chunk mappings back.
        """
        self.write_images = write_images
        self.page_chunks = page_chunks

    def pdf2text(self, pdf_path: str):
        """Return the markdown text of the PDF at ``pdf_path``.

        With ``page_chunks`` enabled, the ``'text'`` entry of every returned
        chunk is joined with newlines; otherwise the converter's result is
        returned as is.
        """
        md_text = pdf4llm.to_markdown(pdf_path, write_images=self.write_images, page_chunks=self.page_chunks)
        if self.page_chunks:
            return '\n'.join(chunk['text'] for chunk in md_text)
        return md_text

    def structured_paper_content(self, markdown_sections: list):
        """Group flat markdown sections into title / abstract / author / main text.

        Args:
            markdown_sections: Non-empty list of dicts, each with the keys
                ``'level'``, ``'title'`` and ``'content'``.

        Returns:
            A dict with:

            * ``'title'``: title of the first section.
            * ``'abstract'``: content of the first later section whose title
              or content mentions "abstract" (case-insensitive). The key is
              absent when no such section exists.
            * ``'author'``: list of metadata strings. It holds the first
              section's content (if any) plus, when an abstract was found,
              the sections between the title and the abstract.
            * ``'main_text'``: the sections after the abstract, or all
              sections when no abstract was found.

        Raises:
            AssertionError: If ``markdown_sections`` is empty.
        """
        # Explicit raise so the check survives `python -O`; the exception
        # type is kept as AssertionError for existing callers.
        if not markdown_sections:
            raise AssertionError('markdown_sections must contain at least one section')

        struct_sections = {}
        first_section = markdown_sections[0]
        struct_sections['title'] = first_section['title']
        title_meta = [first_section['content']] if first_section['content'] else []

        meta_data = list(title_meta)
        main_text_idx = -1
        for sec_idx, section in enumerate(markdown_sections[1:], start=1):
            title_i = section['title']
            content_i = section['content']
            if 'abstract' in title_i.lower() or 'abstract' in content_i.lower():
                struct_sections['abstract'] = content_i
                main_text_idx = sec_idx + 1
                break
            # Keep header and body on separate lines so their words are not
            # glued together.
            meta_data.append('\n'.join(part for part in (title_i, content_i) if part))

        if main_text_idx == -1:
            # No abstract: every section stays in the main text, so only the
            # title section's own content counts as author metadata.
            meta_data = title_meta
            main_text_idx = 0

        struct_sections['author'] = meta_data
        struct_sections['main_text'] = markdown_sections[main_text_idx:]
        return struct_sections

    def run(self, pdf_path: str, verbose=True):
        """Parse the PDF at ``pdf_path`` and return its structured sections.

        Args:
            pdf_path: Path of the PDF to parse.
            verbose: When truthy, also print a plain-text rendering of the
                structured result.

        Returns:
            The dict produced by ``structured_paper_content``.
        """
        markdown_text = self.pdf2text(pdf_path=pdf_path)
        sections = split_markdown_sections(text=markdown_text)
        struct_sections = self.structured_paper_content(markdown_sections=sections)
        if verbose:
            parts = []
            for key, value in struct_sections.items():
                if key == 'title':
                    parts.append('\nTitle: ' + value + '\n\n')
                elif key == 'abstract':
                    parts.append('\nAbstract: \n' + value + '\n\n')
                elif key == 'author':
                    parts.append('\nAuthor: \n' + '\n'.join(value) + '\n\n')
                elif key == 'main_text':
                    parts.extend(
                        '\n' + section['title'] + '\n\n' + section['content'] + '\n\n'
                        for section in value
                    )
            print(''.join(parts))
        return struct_sections
def dict_to_markdown_list(d: dict, indent=0):
    """Render a (possibly nested) dict as a markdown bullet list.

    Every key becomes a ``- **key**: `` bullet indented by ``indent``
    spaces. A dict value is rendered recursively one level deeper on the
    following lines; any other value is appended to the bullet via ``str``.

    Returns:
        The bullet lines joined with newlines (``""`` for an empty dict).
    """
    pad = ' ' * indent
    rendered = []
    for key, value in d.items():
        bullet = pad + f"- **{key}**: "
        if not isinstance(value, dict):
            rendered.append(bullet + str(value))
            continue
        # Nested mapping: the bullet stands alone, children follow below it.
        rendered.extend((bullet, dict_to_markdown_list(value, indent + 1)))
    return "\n".join(rendered)
def split_markdown_slides(markdown: str, sep: str = "<slide_sep>"):
    """Split ``markdown`` on ``sep`` into a list of slide strings.

    Each piece is stripped of surrounding whitespace, and pieces that are
    empty after stripping are dropped.
    """
    stripped_chunks = (chunk.strip() for chunk in markdown.strip().split(sep))
    return [chunk for chunk in stripped_chunks if chunk]
def parse_slide_to_dict(slide: str):
    """Parse one markdown slide into ``{heading: [items]}``.

    Lines are stripped and then classified:

    * ``##`` / ``###`` headings start a new key;
    * numbered (``1. x``) and bulleted (``* x`` / ``- x``) lines contribute
      their text as an item; nested bullets are covered too, because
      indentation is stripped before matching;
    * any other line under a heading is kept verbatim as an item.

    Headings that collect no items are omitted, a repeated heading
    overwrites the earlier entry, and lines before the first heading are
    discarded.

    Args:
        slide: Markdown text of a single slide.

    Returns:
        A dict mapping heading text to the list of item strings beneath it.
    """
    heading_re = re.compile(r"^#{2,3}\s+(.*)")
    # Numbered and bulleted items are handled identically, so one pattern
    # covers both.
    item_re = re.compile(r"^(?:\d+\.|[\*\-])\s+(.*)")

    result = {}
    current_key = None
    sub_items = []
    for raw_line in slide.splitlines():
        line = raw_line.strip()

        heading_match = heading_re.match(line)
        if heading_match:
            if current_key and sub_items:
                result[current_key] = sub_items
            # Always start the new heading with a fresh list so items seen
            # before the first heading are never attributed to it.
            sub_items = []
            current_key = heading_match.group(1).strip()
            continue

        item_match = item_re.match(line)
        if item_match:
            sub_items.append(item_match.group(1).strip())
            continue

        # Fallback: keep free-form text under the current heading.
        # NOTE(review): blank lines land here as '' items, so whether an
        # otherwise empty heading is stored depends on blank lines; confirm
        # what callers expect before changing this.
        if current_key:
            sub_items.append(line)

    # Save the last block.
    if current_key and sub_items:
        result[current_key] = sub_items
    return result
def markdown_to_slide_dicts(full_markdown: str):
    """Parse a multi-slide markdown document into a list of slide dicts.

    The text is split with ``split_markdown_slides`` (default separator) and
    each resulting slide is converted with ``parse_slide_to_dict``.
    """
    return list(map(parse_slide_to_dict, split_markdown_slides(full_markdown)))