Spaces:
Build error
Build error
Update DiT_Extractor/sentence_extractor.py
Browse files
DiT_Extractor/sentence_extractor.py
CHANGED
@@ -95,6 +95,11 @@ def sentence_extract(document):
|
|
95 |
for sentence in sentences:
|
96 |
t += len(sentence)
|
97 |
if t <= max_tokens:
|
|
|
|
|
|
|
|
|
|
|
98 |
word_section += sentence
|
99 |
else:
|
100 |
word_sections.append(word_section)
|
|
|
95 |
for sentence in sentences:
|
96 |
t += len(sentence)
|
97 |
if t <= max_tokens:
|
98 |
+
# update character indices from concatenating sentences
|
99 |
+
if len(word_section) > 0:
|
100 |
+
last_word_obj = word_section[-1]
|
101 |
+
_, (_, char_idx_offset), _ = last_word_obj
|
102 |
+
sentence = [(w, (sc+char_idx_offset+1, ec+char_idx_offset+1), bbox) for w, (sc, ec), bbox in sentence]
|
103 |
word_section += sentence
|
104 |
else:
|
105 |
word_sections.append(word_section)
|