Spaces:
Paused
Paused
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from docx import Document | |
import re | |
import pandas as pd | |
from collections import Counter | |
from rag.nlp import rag_tokenizer | |
from io import BytesIO | |
class RAGFlowDocxParser:
    """Extract paragraph text and flattened table text from a .docx file.

    Calling an instance parses the document and returns ``(secs, tbls)``;
    see ``__call__`` for the exact shapes.
    """

    # Ordered (compiled pattern, label) pairs used to classify a table cell;
    # the first pattern that matches decides the label. Compiled once at
    # class-creation time instead of on every cell.
    # NOTE(review): the "DT" label (upper-case T) differs from the "Dt" used
    # by the other date-like patterns. It is kept as-is because merging the
    # two would change the majority vote in __compose_table_content --
    # confirm whether it is a typo.
    _BLOCK_PATTERNS = tuple(
        (re.compile(pattern), label)
        for pattern, label in (
            ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
            (r"^(20|19)[0-9]{2}年$", "Dt"),
            (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
            ("^[0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
            (r"^第*[一二三四1-4]季度$", "Dt"),
            (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
            (r"^(20|19)[0-9]{2}[ABCDE]$", "DT"),
            ("^[0-9.,+%/ -]+$", "Nu"),
            (r"^[0-9A-Z/\._~-]+$", "Ca"),
            (r"^[A-Z]*[a-z' -]+$", "En"),
            (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
            (r"^.{1}$", "Sg"),
        )
    )

    def _block_type(self, text):
        """Return the type label for one cell's text.

        The regex table is tried first. If nothing matches, the text is
        tokenized with ``rag_tokenizer`` and classified by the number of
        tokens longer than one character: "Tx" for 4-11 tokens, "Lx" for 12
        or more, "Nr" for a single token tagged "nr", otherwise "Ot".
        """
        for pattern, label in self._BLOCK_PATTERNS:
            if pattern.search(text):
                return label
        tokens = [t for t in rag_tokenizer.tokenize(text).split(" ")
                  if len(t) > 1]
        if len(tokens) > 3:
            return "Tx" if len(tokens) < 12 else "Lx"
        if len(tokens) == 1 and rag_tokenizer.tag(tokens[0]) == "nr":
            return "Nr"
        return "Ot"

    def _dominant_type(self, values):
        """Return the most frequent cell type among ``values``.

        ``values`` must be non-empty. Ties go to the type that was seen
        first, because ``max`` keeps the first maximal item and ``Counter``
        preserves insertion order.
        """
        counts = Counter(self._block_type(str(v)) for v in values)
        return max(counts.items(), key=lambda kv: kv[1])[0]

    def __extract_table_content(self, tb):
        """Read the cell texts of table ``tb`` and flatten them to text lines.

        ``tb`` only needs a ``rows`` iterable whose items expose ``cells``,
        each cell having a ``text`` attribute.
        """
        rows = [[cell.text for cell in row.cells] for row in tb.rows]
        # Pad ragged rows with empty strings. Otherwise pandas fills the gaps
        # with None, which str() later turns into the literal text "None".
        width = max((len(r) for r in rows), default=0)
        rows = [r + [""] * (width - len(r)) for r in rows]
        return self.__compose_table_content(pd.DataFrame(rows))

    def __compose_table_content(self, df):
        """Turn a table (DataFrame of cell values) into "header: value" lines.

        Row 0 is always treated as a header row. When the dominant cell type
        of the remaining rows is "Nu", any later row whose own dominant type
        differs is treated as a header row too. Every non-header row becomes
        one line of ``header: value`` pairs joined by ";", using the nearest
        contiguous run of header rows above it; empty cells are skipped.

        Returns:
            ``[]`` when the table has fewer than two rows or no columns;
            the list of lines when the table has more than three columns;
            otherwise a one-element list holding the lines joined by "\\n".
        """
        n_rows, n_cols = df.shape
        # A table without columns has no cells to classify; max() over the
        # empty counter would raise ValueError.
        if n_rows < 2 or n_cols == 0:
            return []

        body_type = self._dominant_type(
            df.iloc[i, j] for i in range(1, n_rows) for j in range(n_cols))

        header_rows = [0]  # a header does not necessarily appear only in the first row
        if body_type == "Nu":
            for r in range(1, n_rows):
                row_type = self._dominant_type(
                    df.iloc[r, j] for j in range(n_cols))
                if row_type != body_type:
                    header_rows.append(r)

        lines = []
        for i in range(1, n_rows):
            if i in header_rows:
                continue
            # Offsets (negative, ascending) of the header rows above row i.
            offsets = [r - i for r in header_rows if r < i]
            # Keep only the closest contiguous run of header rows: scan from
            # the nearest header upward and cut at the first gap.
            for k in range(len(offsets) - 1, 0, -1):
                if offsets[k] - offsets[k - 1] > 1:
                    offsets = offsets[k:]
                    break

            headers = []
            for j in range(n_cols):
                parts = []
                for off in offsets:
                    label = str(df.iloc[i + off, j]).strip()
                    if label not in parts:  # drop repeated header texts
                        parts.append(label)
                joined = ",".join(parts)
                headers.append(joined + ": " if joined else joined)

            cells = []
            for j in range(n_cols):
                value = str(df.iloc[i, j])
                if value:
                    cells.append(headers[j] + value)
            lines.append(";".join(cells))

        if n_cols > 3:
            return lines
        return ["\n".join(lines)]

    def __call__(self, fnm, from_page=0, to_page=100000):
        """Parse a .docx document into paragraph sections and table texts.

        Args:
            fnm: a ``str`` is passed to ``Document`` directly; anything else
                is wrapped in ``BytesIO`` first.
            from_page: first page (inclusive) whose run text is collected.
            to_page: page at which collection stops (exclusive).

        Returns:
            ``(secs, tbls)``. ``secs`` has one ``(text, style_name)`` tuple
            per visited paragraph; paragraphs outside the page range or with
            only whitespace contribute an empty ``text``. ``style_name`` is
            ``""`` when the paragraph has no style object. ``tbls`` has one
            flattened result per table in the document, regardless of the
            page range.

        The page counter is advanced each time a run's XML contains
        ``lastRenderedPageBreak``, so page numbers are only as accurate as
        those markers in the file. The parsed document is stored on
        ``self.doc``.
        """
        if isinstance(fnm, str):
            self.doc = Document(fnm)
        else:
            self.doc = Document(BytesIO(fnm))

        page = 0  # number of page breaks seen so far
        secs = []  # parsed contents
        for para in self.doc.paragraphs:
            if page > to_page:
                break

            # Evaluate once per paragraph; para.text re-joins every run.
            has_text = bool(para.text.strip())
            kept_runs = []  # run texts that fall inside the page range
            for run in para.runs:
                if page > to_page:
                    break
                if has_text and from_page <= page < to_page:
                    kept_runs.append(run.text)
                if 'lastRenderedPageBreak' in run._element.xml:
                    page += 1

            # Guard against a missing style object instead of raising
            # AttributeError on ``None.name``.
            style_name = getattr(para.style, "name", "")
            secs.append(("".join(kept_runs), style_name))

        tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
        return secs, tbls