Spaces:

sambanovasystems
/

paper-slides-summary

Running

App Files Files Community

paper-slides-summary / pdf_helper.py

zolicsaki

Upload 7 files

57d4532 verified 10 days ago

raw

history blame contribute delete

6.41 kB

	import pdf4llm
	import re

	def py4llm_pdf_reader(pdf_path: str):
	    """Convert the PDF at ``pdf_path`` to markdown using ``pdf4llm``.

	    Args:
	        pdf_path: Path of the PDF file handed to ``pdf4llm.to_markdown``.

	    Returns:
	        Whatever ``pdf4llm.to_markdown`` returns when called with its
	        default options (presumably a single markdown string -- confirm
	        against the pdf4llm documentation).
	    """
	    return pdf4llm.to_markdown(pdf_path)

	def split_markdown_sections(text):
	    """Split markdown text into sections, one per header line.

	    A header is any line that starts with one to six ``#`` characters
	    followed by a non-empty title on the SAME line. Text that precedes the
	    first header is not returned. Header-looking lines are matched wherever
	    they occur (the regex does not know about fenced code blocks).

	    Args:
	        text: Markdown document as a single string.

	    Returns:
	        A list of dicts in document order, each with keys ``'level'``
	        (number of ``#``), ``'title'`` (header text, surrounding whitespace
	        removed) and ``'content'`` (stripped text between this header and
	        the next header or the end of the document).
	    """
	    # Only horizontal whitespace may separate the hashes from the title, and
	    # the title must start with a visible character. With a plain ``\s*`` a
	    # bare "#" line would consume the newline and adopt the NEXT line as its
	    # title.
	    header_pattern = r'^(#{1,6})[ \t]*(\S.*)$'
	    matches = list(re.finditer(header_pattern, text, re.MULTILINE))

	    # Each section ends where the following header starts; the last one runs
	    # to the end of the document.
	    section_ends = [m.start() for m in matches[1:]] + [len(text)]

	    sections = []
	    for match, end_pos in zip(matches, section_ends):
	        sections.append({
	            'level': len(match.group(1)),
	            # strip() drops trailing blanks and a stray "\r" from CRLF input.
	            'title': match.group(2).strip(),
	            'content': text[match.end():end_pos].strip(),
	        })

	    return sections


	class PDFPaper4LLMParser(object):
	    """Turn a PDF paper into markdown and split it into title, author
	    metadata, abstract and main-text sections.

	    Attributes:
	        write_images: Forwarded to ``pdf4llm.to_markdown`` as ``write_images``.
	        page_chunks: Forwarded to ``pdf4llm.to_markdown`` as ``page_chunks``;
	            when true the per-page results are joined back into one string.
	    """

	    def __init__(self, write_images=False, page_chunks=False) -> None:
	        self.write_images = write_images
	        self.page_chunks = page_chunks

	    def pdf2text(self, pdf_path: str):
	        """Return the markdown text of the PDF at ``pdf_path``.

	        With ``page_chunks`` enabled the converter result is iterated and the
	        ``'text'`` entry of every element is joined with newlines; otherwise
	        the converter result is returned unchanged.
	        """
	        md_text = pdf4llm.to_markdown(
	            pdf_path,
	            write_images=self.write_images,
	            page_chunks=self.page_chunks,
	        )
	        if self.page_chunks:
	            return '\n'.join(chunk['text'] for chunk in md_text)
	        return md_text

	    def structured_paper_content(self, markdown_sections: list):
	        """Group header-delimited sections into paper components.

	        Args:
	            markdown_sections: Non-empty list of dicts, each with the keys
	                ``'level'``, ``'title'`` and ``'content'``.

	        Returns:
	            A dict with, in this order:

	            * ``'title'``: title of the first section.
	            * ``'abstract'``: content of the first later section whose title
	              or content contains "abstract" (case-insensitive). The key is
	              absent when no such section exists.
	            * ``'author'``: list of front-matter strings -- the first
	              section's content (if non-empty) followed by one
	              "title\\ncontent" entry per section located between the title
	              and the abstract. When no abstract is found only the first
	              section's content is kept.
	            * ``'main_text'``: the sections after the abstract, or all
	              sections when no abstract is found.

	        Raises:
	            AssertionError: If ``markdown_sections`` is empty.
	        """
	        if not markdown_sections:
	            # Raised explicitly (same exception type as the former ``assert``)
	            # so the check is not stripped when Python runs with -O.
	            raise AssertionError('markdown_sections must contain at least one section')

	        first_section = markdown_sections[0]
	        struct_sections = {'title': first_section['title']}

	        title_meta = [first_section['content']] if first_section['content'] else []
	        front_matter = list(title_meta)

	        main_text_idx = 0
	        for sec_idx, section in enumerate(markdown_sections[1:], start=1):
	            title_i = section['title']
	            content_i = section['content']
	            if 'abstract' in title_i.lower() or 'abstract' in content_i.lower():
	                struct_sections['abstract'] = content_i
	                main_text_idx = sec_idx + 1
	                break
	            # Keep header and body on separate lines instead of gluing them
	            # together ("AffiliationsUni X").
	            front_matter.append('\n'.join(part for part in (title_i, content_i) if part))
	        else:
	            # No abstract anywhere: every section stays in main_text, so the
	            # later sections must not also be reported as author metadata.
	            front_matter = title_meta

	        struct_sections['author'] = front_matter
	        struct_sections['main_text'] = markdown_sections[main_text_idx:]
	        return struct_sections

	    def run(self, pdf_path: str, verbose=True):
	        """Convert ``pdf_path`` and return its structured content.

	        Args:
	            pdf_path: Path of the PDF to parse.
	            verbose: When true, print a plain-text rendering of the result.

	        Returns:
	            The dict produced by ``structured_paper_content``.

	        Raises:
	            AssertionError: If the converted markdown contains no header.
	        """
	        markdown_text = self.pdf2text(pdf_path=pdf_path)
	        sections = split_markdown_sections(text=markdown_text)
	        struct_sections = self.structured_paper_content(markdown_sections=sections)
	        if verbose:
	            # Collect the pieces and join once instead of repeated ``+=``.
	            pieces = []
	            for key, value in struct_sections.items():
	                if key == 'title':
	                    pieces.append('\nTitle: ' + value + '\n\n')
	                elif key == 'abstract':
	                    pieces.append('\nAbstract: \n' + value + '\n\n')
	                elif key == 'author':
	                    pieces.append('\nAuthor: \n' + '\n'.join(value) + '\n\n')
	                elif key == 'main_text':
	                    pieces.extend(
	                        '\n' + section['title'] + '\n\n' + section['content'] + '\n\n'
	                        for section in value
	                    )
	            print(''.join(pieces))
	        return struct_sections


	def dict_to_markdown_list(d: dict, indent=0):
	    """Render a (possibly nested) dict as a markdown-style bullet list.

	    Every key becomes a ``- key: value`` line. A dict value is rendered
	    recursively on the following lines, indented by one extra space per
	    nesting level; the parent line is then just ``- key: ``.

	    Args:
	        d: Mapping to render; non-dict values are formatted with ``str``.
	        indent: Number of leading spaces for this nesting level.

	    Returns:
	        The lines joined with newlines; an empty string for an empty dict.
	    """
	    pad = ' ' * indent
	    lines = []
	    for key, value in d.items():
	        prefix = f"{pad}- {key}: "
	        if isinstance(value, dict):
	            lines.append(prefix)
	            nested = dict_to_markdown_list(value, indent + 1)
	            # An empty nested dict renders as "", which would otherwise
	            # leave a stray blank line in the middle of the list.
	            if nested:
	                lines.append(nested)
	        else:
	            lines.append(prefix + str(value))
	    return "\n".join(lines)


	def split_markdown_slides(markdown: str, sep: str = "<slide_sep>"):
	    """Cut a markdown deck into individual slide strings.

	    The input is stripped, split on every occurrence of ``sep``, and each
	    piece is stripped again; pieces that are empty after stripping are
	    discarded.

	    Args:
	        markdown: Full markdown text containing all slides.
	        sep: Literal separator placed between slides.

	    Returns:
	        List of non-empty slide strings in their original order.
	    """
	    slides = []
	    for piece in markdown.strip().split(sep):
	        cleaned = piece.strip()
	        if cleaned:
	            slides.append(cleaned)
	    return slides


	def parse_slide_to_dict(slide: str):
	    """Parse one markdown slide into ``{heading: [items]}``.

	    Lines are stripped before matching, so nested (indented) bullets are
	    handled exactly like top-level ones. Per line:

	    * ``##`` / ``###`` headings start a new key;
	    * numbered items (``1. text``) and bullets (``- text`` / ``* text``)
	      contribute their text without the marker;
	    * any other non-blank line under a heading is kept verbatim;
	    * blank lines are ignored.

	    A heading that collects no items is omitted from the result, and a
	    repeated heading overwrites the earlier entry.

	    NOTE(review): list items that appear before the first heading are
	    carried over into the first heading's list, while free text before the
	    first heading is dropped. Kept as-is; confirm whether this is intended.

	    Args:
	        slide: Markdown text of a single slide.

	    Returns:
	        Dict mapping heading text to the list of item strings below it.
	    """
	    result = {}
	    current_key = None
	    sub_items = []

	    for raw_line in slide.splitlines():
	        line = raw_line.strip()
	        if not line:
	            # Blank separator lines carry no content; do not store them as
	            # empty items.
	            continue

	        # Headings (## or ###) close the previous block and open a new one.
	        heading_match = re.match(r"^#{2,3}\s+(.*)", line)
	        if heading_match:
	            if current_key and sub_items:
	                result[current_key] = sub_items
	                sub_items = []
	            current_key = heading_match.group(1).strip()
	            continue

	        # Numbered or bulleted list item: capture the WHOLE remainder of the
	        # line. The marker must be followed by whitespace, so "**bold**"
	        # text is not mistaken for a "*" bullet.
	        item_match = re.match(r"^(?:\d+\.|[-*])\s+(.*)", line)
	        if item_match:
	            sub_items.append(item_match.group(1).strip())
	            continue

	        # Fallback: free-form text belongs to the current heading, if any.
	        if current_key:
	            sub_items.append(line)

	    # Save the last block.
	    if current_key and sub_items:
	        result[current_key] = sub_items

	    return result


	def markdown_to_slide_dicts(full_markdown: str):
	    """Parse a whole markdown deck into one dict per slide.

	    The deck is split with ``split_markdown_slides`` (default separator)
	    and every resulting slide is passed through ``parse_slide_to_dict``.

	    Args:
	        full_markdown: Markdown text containing all slides.

	    Returns:
	        List of per-slide dicts, in slide order.
	    """
	    slide_dicts = []
	    for slide_text in split_markdown_slides(full_markdown):
	        slide_dicts.append(parse_slide_to_dict(slide_text))
	    return slide_dicts