File size: 6,412 Bytes
57d4532
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import pdf4llm
import re

def py4llm_pdf_reader(pdf_path: str):
    """Convert the PDF at *pdf_path* with ``pdf4llm.to_markdown`` and return the result.

    The converter is called with its default options only; whatever it
    returns is handed back unchanged.
    """
    return pdf4llm.to_markdown(pdf_path)

def split_markdown_sections(text):
    """Split markdown text into sections delimited by ``#``-style headers.

    A header is a line starting with one to six ``#`` characters followed by
    a non-empty title on the SAME line. Each section runs from the end of its
    header line to the start of the next header (or the end of the text).

    Args:
        text: Markdown source as a single string.

    Returns:
        A list of dicts in document order, each with the keys ``'level'``
        (number of leading ``#``), ``'title'`` (header text, surrounding
        whitespace removed) and ``'content'`` (section body, stripped).
        Text that precedes the first header is not part of any section and
        is discarded. Returns an empty list when no header is found.
    """
    # `[^\S\n]*` is "whitespace except newline": the previous `\s*` could
    # consume the line break, so a bare "#" line swallowed the following line
    # as its title. `\S` additionally requires a real first title character.
    header_pattern = r'^(#{1,6})[^\S\n]*(\S.*)$'

    # Materialise the matches so each one can look ahead to its successor.
    matches = list(re.finditer(header_pattern, text, re.MULTILINE))

    sections = []
    for i, match in enumerate(matches):
        # The section ends where the next header begins, or at end of text.
        if i + 1 < len(matches):
            end_pos = matches[i + 1].start()
        else:
            end_pos = len(text)

        sections.append({
            'level': len(match.group(1)),
            # Strip so trailing spaces or a '\r' from CRLF input do not leak
            # into the title (the content is stripped the same way).
            'title': match.group(2).strip(),
            'content': text[match.end():end_pos].strip(),
        })

    return sections


class PDFPaper4LLMParser:
    """Turn a paper PDF into a title / author / abstract / main-text dict.

    The PDF is converted to markdown with ``pdf4llm.to_markdown``, the
    markdown is cut into header-delimited sections, and the sections are
    grouped into the parts of a paper.
    """

    def __init__(self, write_images=False, page_chunks=False) -> None:
        # Both flags are forwarded unchanged to ``pdf4llm.to_markdown``.
        self.write_images = write_images
        self.page_chunks = page_chunks

    def pdf2text(self, pdf_path: str):
        """Return the markdown produced for *pdf_path*.

        When ``page_chunks`` is set, the converter result is iterated and the
        ``'text'`` entry of every item is joined with newlines; otherwise the
        converter result is returned as-is.
        """
        converted = pdf4llm.to_markdown(
            pdf_path,
            write_images=self.write_images,
            page_chunks=self.page_chunks,
        )
        if not self.page_chunks:
            return converted
        return '\n'.join([chunk['text'] for chunk in converted])

    def structured_paper_content(self, markdown_sections: list):
        """Group header sections into the parts of a paper.

        ``markdown_sections`` is a non-empty list of dicts, each carrying
        ``level``, ``title`` and ``content``.

        The returned dict contains:
          * ``title``     - title of the very first section;
          * ``abstract``  - content of the first later section whose title or
                            content mentions "abstract" (key absent if none);
          * ``author``    - list of the front-matter strings seen before the
                            abstract (first section's content, then
                            title + content of each following section);
          * ``main_text`` - the sections after the abstract, or every section
                            when no abstract was found.
        """
        assert len(markdown_sections) > 0
        first_level = markdown_sections[0]['level']

        paper = {}
        front_matter = []
        body_start = 0  # stays 0 (whole document) unless an abstract is found

        for position, entry in enumerate(markdown_sections):
            depth = entry['level']
            heading = entry['title']
            body = entry['content']

            if position == 0 and depth == first_level:
                paper['title'] = heading
                if len(body) > 0:
                    front_matter.append(body)
                continue

            if 'abstract' in heading.lower() or 'abstract' in body.lower():
                paper['abstract'] = body
                body_start = position + 1
                break

            front_matter.append(heading + body)

        paper['author'] = front_matter
        paper['main_text'] = markdown_sections[body_start:]
        return paper

    def run(self, pdf_path: str, verbose=True):
        """Parse *pdf_path* end to end and return the structured dict.

        With ``verbose`` set, a plain-text rendering of the result is printed
        to stdout before returning.
        """
        markdown_text = self.pdf2text(pdf_path=pdf_path)
        sections = split_markdown_sections(text=markdown_text)
        struct_sections = self.structured_paper_content(markdown_sections=sections)

        if not verbose:
            return struct_sections

        # Render the parts in the order they were stored in the dict.
        pieces = []
        for part, value in struct_sections.items():
            if part == 'title':
                pieces.append('\nTitle: ' + value + '\n\n')
            elif part == 'abstract':
                pieces.append('\nAbstract: \n' + value + '\n\n')
            elif part == 'author':
                pieces.append('\nAuthor: \n' + '\n'.join(value) + '\n\n')
            elif part == 'main_text':
                for sec in value:
                    pieces.append('\n' + sec['title'] + '\n\n' + sec['content'] + '\n\n')
        print(''.join(pieces))
        return struct_sections


def dict_to_markdown_list(d: dict, indent=0):
    """Render *d* as a nested markdown bullet list and return it as one string.

    Every key becomes a ``- **key**: `` bullet indented by two spaces per
    *indent* level. A dict value is rendered recursively one level deeper on
    the lines below its bullet; any other value is appended to the bullet via
    ``str()``.
    """
    pad = '  ' * indent
    rendered = []
    for name, item in d.items():
        bullet = pad + f"- **{name}**: "
        if not isinstance(item, dict):
            rendered.append(bullet + str(item))
            continue
        # Nested mapping: bare bullet first, then the child list beneath it.
        rendered.extend((bullet, dict_to_markdown_list(item, indent + 1)))
    return "\n".join(rendered)


def split_markdown_slides(markdown: str, sep: str = "<slide_sep>"):
    """Cut *markdown* at every *sep* and return the non-empty, stripped slides."""
    slides = []
    for chunk in markdown.strip().split(sep):
        trimmed = chunk.strip()
        if trimmed:
            slides.append(trimmed)
    return slides


def parse_slide_to_dict(slide: str):
    """Parse one markdown slide into ``{heading: [items]}``.

    Lines are examined one at a time after stripping surrounding whitespace:

    * a ``##`` or ``###`` heading starts a new key;
    * numbered (``1. x``) and bulleted (``* x`` / ``- x``) lines contribute
      their text with the list marker removed; indented (nested) bullets are
      flattened into the same list because indentation is stripped first;
    * any other non-blank line is kept verbatim as freeform text.

    Args:
        slide: Markdown text of a single slide.

    Returns:
        A dict mapping each heading title to the list of item strings found
        under it, in order. Headings with no items are omitted, a repeated
        heading title overwrites the earlier entry, and blank lines and any
        lines that precede the first heading are ignored.
    """
    heading_re = re.compile(r"^#{2,3}\s+(.*)")
    # Numbered and bulleted items are handled identically, so one pattern
    # covers both markers.
    list_item_re = re.compile(r"^(?:\d+\.|[\*\-])\s+(.*)")

    result = {}
    current_key = None
    sub_items = []

    for raw_line in slide.splitlines():
        line = raw_line.strip()

        # Blank lines carry no content; previously they were stored as ''.
        if not line:
            continue

        heading_match = heading_re.match(line)
        if heading_match:
            if current_key and sub_items:
                result[current_key] = sub_items
            # Always start the new heading with a fresh list so nothing
            # collected earlier can be attributed to it.
            sub_items = []
            current_key = heading_match.group(1).strip()
            continue

        # Content before the first heading has no key to belong to.
        if not current_key:
            continue

        item_match = list_item_re.match(line)
        if item_match:
            sub_items.append(item_match.group(1).strip())
        else:
            # Fallback: keep as freeform text.
            sub_items.append(line)

    # Save the last block.
    if current_key and sub_items:
        result[current_key] = sub_items

    return result


def markdown_to_slide_dicts(full_markdown: str):
    """Split *full_markdown* into slides and parse each one into a dict.

    Slides are separated with ``split_markdown_slides`` (default separator)
    and each slide is converted with ``parse_slide_to_dict``; the resulting
    dicts are returned in slide order.
    """
    slide_dicts = []
    for slide_text in split_markdown_slides(full_markdown):
        slide_dicts.append(parse_slide_to_dict(slide_text))
    return slide_dicts