File size: 3,259 Bytes
c8a32e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import math
from typing import List, Optional

from pydantic import field_validator
import ftfy

from marker.schema.bbox import BboxElement
from marker.settings import settings


class BlockType(BboxElement):
    """A bounding-box element tagged with a block type label."""

    # Label for the region; the set of valid values is not defined in this file.
    block_type: str


class Span(BboxElement):
    """A run of text with its font attributes and a bounding box.

    The ``text`` field is passed through ``ftfy.fix_text`` on validation.
    """

    text: str
    # Identifier used by ``Block.filter_spans`` to select spans for removal.
    span_id: str
    font: str
    font_weight: float
    font_size: float
    # Optional style flags; ``None`` when not set.
    bold: Optional[bool] = None
    italic: Optional[bool] = None
    # NOTE(review): presumably marks spans that stand in for an image - confirm.
    image: Optional[bool] = None


    @field_validator('text')
    @classmethod
    def fix_unicode(cls, text: str) -> str:
        """Return *text* after repair by ``ftfy.fix_text``."""
        return ftfy.fix_text(text)


class Line(BboxElement):
    """A line of text: an ordered list of spans plus a bounding box."""

    spans: List[Span]

    @property
    def prelim_text(self):
        """Return the text of all spans concatenated with no separator."""
        pieces = (span.text for span in self.spans)
        return "".join(pieces)

    @property
    def start(self):
        """Return ``bbox[0]`` of the first span.

        Raises:
            IndexError: If the line has no spans.
        """
        first_span = self.spans[0]
        return first_span.bbox[0]


class Block(BboxElement):
    """A group of lines on a page, with an optional block type label."""

    lines: List[Line]
    # NOTE(review): presumably the page number the block belongs to - confirm.
    pnum: int
    # Label compared against ``settings.BAD_SPAN_TYPES``; ``None`` until set.
    block_type: Optional[str] = None

    @property
    def prelim_text(self):
        """Return the text of every line, joined with newlines."""
        return "\n".join(line.prelim_text for line in self.lines)

    def filter_spans(self, bad_span_ids):
        """Remove, in place, every span whose ``span_id`` is in *bad_span_ids*.

        Each line's ``spans`` list is replaced with the surviving spans, and
        lines left without any span are dropped from the block.

        Args:
            bad_span_ids: Container of span ids to remove (anything that
                supports ``in``).
        """
        kept_lines = []
        for line in self.lines:
            kept_spans = [
                span for span in line.spans
                if span.span_id not in bad_span_ids
            ]
            line.spans = kept_spans
            if kept_spans:
                kept_lines.append(line)
        self.lines = kept_lines

    def filter_bad_span_types(self):
        """Empty the block, in place, if its type is in ``settings.BAD_SPAN_TYPES``.

        The test is on the block's own ``block_type``, so it is all-or-nothing:
        for a bad type every line's spans are cleared and the block ends up
        with no lines; otherwise spans are kept as they are and only lines
        that already had no spans are dropped.
        """
        # The condition depends only on the block, so evaluate it once rather
        # than once per span.
        keep_spans = self.block_type not in settings.BAD_SPAN_TYPES
        kept_lines = []
        for line in self.lines:
            kept_spans = list(line.spans) if keep_spans else []
            line.spans = kept_spans
            if kept_spans:
                kept_lines.append(line)
        self.lines = kept_lines

    def get_min_line_start(self):
        """Return the smallest ``start`` among the lines, or ``None`` if there are none."""
        return min((line.start for line in self.lines), default=None)


def bbox_from_lines(lines: List[Line]):
    """Return the smallest bbox enclosing every line in *lines*.

    Args:
        lines: Objects exposing a ``bbox`` indexable at positions 0-3.

    Returns:
        ``[min_x, min_y, max_x, max_y]`` taken over all line bboxes.

    Raises:
        ValueError: If *lines* is empty.
    """
    boxes = [line.bbox for line in lines]
    return [
        min([box[0] for box in boxes]),
        min([box[1] for box in boxes]),
        max([box[2] for box in boxes]),
        max([box[3] for box in boxes]),
    ]


def split_block_lines(block: Block, split_line_idx: int):
    """Split *block* into two blocks at line index *split_line_idx*.

    Args:
        block: The block to split.
        split_line_idx: Index of the first line of the second block.

    Returns:
        ``[block]`` unchanged when the index is 0 or not less than the number
        of lines; otherwise two new blocks holding the lines before and from
        the index, each with a bbox computed from its own lines and the
        original ``pnum``. The new blocks are not given a ``block_type``.
    """
    # Nothing to split: the index is past the end or at the very start.
    if split_line_idx >= len(block.lines) or split_line_idx == 0:
        return [block]

    head = block.lines[:split_line_idx]
    tail = block.lines[split_line_idx:]
    first_half = Block(lines=head, bbox=bbox_from_lines(head), pnum=block.pnum)
    second_half = Block(lines=tail, bbox=bbox_from_lines(tail), pnum=block.pnum)
    return [first_half, second_half]


def find_insert_block(blocks: List[Block], bbox):
    """Return the index of the block whose bbox origin is closest to *bbox*'s.

    Distance is the Euclidean distance between the ``(bbox[0], bbox[1])``
    corners of each block and of *bbox*. On a tie the earliest block wins.

    Args:
        blocks: Candidate blocks, each exposing an indexable ``bbox``.
        bbox: Reference bounding box; only its first two entries are used.

    Returns:
        The index of the nearest block, or 0 when *blocks* is empty or no
        block yields a usable distance.
    """
    nearest_match = None
    match_dist = None
    for idx, block in enumerate(blocks):
        try:
            dist = math.hypot(block.bbox[0] - bbox[0], block.bbox[1] - bbox[1])
        except (AttributeError, LookupError, TypeError, ArithmeticError):
            # Best effort: a block whose bbox is missing, too short,
            # non-numeric or out of float range cannot be measured, so skip it.
            continue

        # A NaN distance compares False against everything; if it became the
        # running minimum no later block could ever replace it.
        if math.isnan(dist):
            continue

        if nearest_match is None or dist < match_dist:
            nearest_match = idx
            match_dist = dist
    if nearest_match is None:
        return 0
    return nearest_match