Spaces:

sambanovasystems
/

paper-slides-summary

Sleeping

File size: 6,412 Bytes

57d4532

import pdf4llm
import re

def py4llm_pdf_reader(pdf_path: str):
    """Convert the PDF at ``pdf_path`` with ``pdf4llm.to_markdown``.

    The call uses the library's default options and its result is returned
    unchanged.
    """
    return pdf4llm.to_markdown(pdf_path)

def split_markdown_sections(text):
    """Split markdown text into one section per ATX header (``#`` .. ``######``).

    Args:
        text: Markdown source as a single string.

    Returns:
        A list of dicts in document order, each with:
          * ``'level'``   - number of leading ``#`` characters (1-6),
          * ``'title'``   - header text with trailing whitespace removed,
          * ``'content'`` - stripped text between this header and the next
            header (or the end of the document).
        Text that precedes the first header is not part of any section.
        An empty list is returned when no header is found.
    """
    # Why not `\s*` between the hashes and the title: `\s` also matches
    # newlines, so a bare "#" line would capture the *next* line as its title.
    # `[ \t]*` keeps the match on one line, `(?!#)` rejects runs of 7+ hashes
    # (otherwise read as a level-6 header titled "#..."), and `\S` requires a
    # non-blank title.
    header_pattern = r'^(#{1,6})(?!#)[ \t]*(\S.*)$'

    matches = list(re.finditer(header_pattern, text, re.MULTILINE))

    sections = []
    for i, match in enumerate(matches):
        # A section body runs from the end of its header line to the start of
        # the next header, or to the end of the text for the last header.
        end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        sections.append({
            'level': len(match.group(1)),
            # `.` matches "\r", so drop trailing whitespace/CR from the title.
            'title': match.group(2).rstrip(),
            'content': text[match.end():end_pos].strip(),
        })

    return sections


class PDFPaper4LLMParser(object):
    """Turn a paper PDF into markdown and group it into title/author/abstract/body.

    ``write_images`` and ``page_chunks`` are stored and forwarded unchanged to
    ``pdf4llm.to_markdown`` on every conversion.
    """

    def __init__(self, write_images=False, page_chunks=False) -> None:
        self.write_images = write_images
        self.page_chunks = page_chunks

    def pdf2text(self, pdf_path: str):
        """Return the markdown produced for ``pdf_path`` as a single string."""
        md_text = pdf4llm.to_markdown(
            pdf_path,
            write_images=self.write_images,
            page_chunks=self.page_chunks,
        )
        if self.page_chunks:
            # In page-chunk mode the result is iterated as per-page mappings;
            # only each mapping's 'text' entry is used, joined by newlines.
            return '\n'.join(chunk['text'] for chunk in md_text)
        return md_text

    def structured_paper_content(self, markdown_sections: list):
        """Group parsed markdown sections into paper parts.

        Args:
            markdown_sections: Non-empty list of dicts, each holding
                ``'level'``, ``'title'`` and ``'content'``.

        Returns:
            A dict with keys inserted in this order:
              * ``'title'``     - title of the first section,
              * ``'abstract'``  - content of the first later section whose
                title or content contains "abstract" (case-insensitive);
                the key is absent when no such section exists,
              * ``'author'``    - list of front-matter strings: the first
                section's content (if any) followed by "title\\ncontent" of
                every section that precedes the abstract,
              * ``'main_text'`` - the sections after the abstract, or all
                sections when no abstract was found.

        Raises:
            AssertionError: if ``markdown_sections`` is empty.
        """
        assert len(markdown_sections) > 0, 'markdown_sections must not be empty'

        first_section = markdown_sections[0]
        struct_sections = {'title': first_section['title']}

        # Free text directly under the title heading is always front matter.
        meta_data = []
        if first_section['content']:
            meta_data.append(first_section['content'])

        # Sections seen before the abstract. They are promoted to front matter
        # only once an abstract is actually found; otherwise they stay in
        # main_text only, so the whole paper is not duplicated under 'author'.
        pending_front_matter = []
        main_text_idx = 0  # fallback: no abstract, everything is main text
        for sec_idx, section in enumerate(markdown_sections[1:], start=1):
            title_i = section['title']
            content_i = section['content']
            if 'abstract' in title_i.lower() or 'abstract' in content_i.lower():
                struct_sections['abstract'] = content_i
                meta_data.extend(pending_front_matter)
                main_text_idx = sec_idx + 1
                break
            # Keep heading and body on separate lines instead of gluing them.
            pending_front_matter.append(
                '\n'.join(part for part in (title_i, content_i) if part)
            )

        struct_sections['author'] = meta_data
        struct_sections['main_text'] = markdown_sections[main_text_idx:]
        return struct_sections

    def run(self, pdf_path: str, verbose=True):
        """Convert ``pdf_path`` and return its structured content.

        The PDF is converted to markdown, split into header sections and
        grouped by ``structured_paper_content``. When ``verbose`` is true the
        grouped text is also printed to stdout.
        """
        markdown_text = self.pdf2text(pdf_path=pdf_path)
        sections = split_markdown_sections(text=markdown_text)
        struct_sections = self.structured_paper_content(markdown_sections=sections)
        if verbose:
            # Collect the pieces and join once rather than growing a string.
            parts = []
            for key, value in struct_sections.items():
                if key == 'title':
                    parts.append('\nTitle: ' + value + '\n\n')
                elif key == 'abstract':
                    parts.append('\nAbstract: \n' + value + '\n\n')
                elif key == 'author':
                    parts.append('\nAuthor: \n' + '\n'.join(value) + '\n\n')
                elif key == 'main_text':
                    parts.extend(
                        '\n' + section['title'] + '\n\n' + section['content'] + '\n\n'
                        for section in value
                    )
            print(''.join(parts))
        return struct_sections


def dict_to_markdown_list(d: dict, indent=0):
    """Render ``d`` as a nested markdown bullet list.

    Each key becomes a ``- **key**: `` bullet indented by two spaces per
    ``indent`` level. A dict value is rendered recursively one level deeper
    on the following lines; any other value is appended via ``str()``.
    Returns the lines joined with newlines.
    """
    pad = '  ' * indent
    rendered = []
    for name, item in d.items():
        bullet = pad + f"- **{name}**: "
        if not isinstance(item, dict):
            rendered.append(bullet + str(item))
            continue
        # Nested mapping: bare bullet first, then the indented child list.
        rendered.extend((bullet, dict_to_markdown_list(item, indent + 1)))
    return "\n".join(rendered)


def split_markdown_slides(markdown: str, sep: str = "<slide_sep>"):
    """Split ``markdown`` on ``sep`` and return the non-empty, stripped slides."""
    slides = []
    for chunk in markdown.strip().split(sep):
        trimmed = chunk.strip()
        if trimmed:
            slides.append(trimmed)
    return slides


def parse_slide_to_dict(slide: str):
    """Parse one markdown slide into ``{heading: [items]}``.

    Every ``##`` / ``###`` heading opens a new entry. The lines that follow
    become its items: numbered (``1. x``) and bulleted (``* x`` / ``- x``)
    entries are stored without their marker, any other line is stored as-is.
    Lines are stripped before matching, so nested bullets are flattened into
    the same list as top-level ones.

    Content that appears before the first heading is discarded, and a heading
    that collects no items is omitted from the result. If a heading text
    repeats, the later entry replaces the earlier one.

    Args:
        slide: Markdown text of a single slide.

    Returns:
        Dict mapping heading text to its list of item strings.
    """
    result = {}
    current_key = None
    sub_items = []

    for raw_line in slide.splitlines():
        line = raw_line.strip()

        heading_match = re.match(r"^#{2,3}\s+(.*)", line)
        if heading_match:
            if current_key and sub_items:
                result[current_key] = sub_items
            # Always start from a fresh list: list items seen before the first
            # heading must not leak into that heading's entry.
            sub_items = []
            current_key = heading_match.group(1).strip()
            continue

        # Numbered or bulleted entry. Indented (nested) bullets land here too,
        # because the line was stripped above.
        item_match = re.match(r"^(?:\d+\.|[\*\-])\s+(.*)", line)
        if item_match:
            sub_items.append(item_match.group(1).strip())
            continue

        # Fallback: keep the line as freeform text under the current heading.
        # NOTE(review): blank lines also reach this point and are stored as
        # empty strings. Left as-is because filtering them would make a
        # heading followed only by blank lines disappear from the result.
        if current_key:
            sub_items.append(line)

    # Save the last block.
    if current_key and sub_items:
        result[current_key] = sub_items

    return result


def markdown_to_slide_dicts(full_markdown: str):
    """Split ``full_markdown`` into slides and parse each into a dict.

    Slides are separated with ``split_markdown_slides`` (default separator)
    and each one is converted by ``parse_slide_to_dict``; the resulting dicts
    are returned in slide order.
    """
    return list(map(parse_slide_to_dict, split_markdown_slides(full_markdown)))