KevinHuSh
Remove unused code, separate layout detection out as a new API. Add new RAG method 'table' (#55)
407b252
import copy | |
import random | |
import re | |
from io import BytesIO | |
from xpinyin import Pinyin | |
import numpy as np | |
import pandas as pd | |
from nltk import word_tokenize | |
from openpyxl import load_workbook | |
from dateutil.parser import parse as datetime_parse | |
from rag.parser import is_english, tokenize | |
from rag.nlp import huqie, stemmer | |
class Excel(object):
    """Load the worksheets of an Excel workbook into pandas DataFrames."""

    @staticmethod
    def _status(done, fails, sheetname=None):
        """Build a progress message: record count plus a short failure summary."""
        msg = "Extract records: {}. ".format(done)
        if fails:
            where = "({})".format(sheetname) if sheetname is not None else ""
            # Plain str.format: a sheet name containing "%" must not be
            # interpreted as a printf-style directive.
            msg += "{} failure{}, line: {}...".format(
                len(fails), where, ",".join(fails[:3]))
        return msg

    def __call__(self, fnm, binary=None, callback=None):
        """Parse a workbook and return one DataFrame per non-empty sheet.

        The first row of each sheet supplies the column names; columns whose
        header cell is empty are dropped from the header and from every row.

        Args:
            fnm: Path of the workbook; only used when ``binary`` is falsy.
            binary: Raw workbook bytes; takes precedence over ``fnm``.
            callback: Optional ``callback(progress, message)`` progress hook.

        Returns:
            A list of DataFrames, one per sheet that has at least a header
            row. Completely empty sheets are skipped.
        """
        report = callback if callback is not None else (lambda prog, msg: None)
        wb = load_workbook(BytesIO(binary)) if binary else load_workbook(fnm)

        # Total row count across sheets, used to scale the progress value.
        total = 0
        for sheetname in wb.sheetnames:
            total += sum(1 for _ in wb[sheetname].rows)

        res, fails, done = [], [], 0
        for sheetname in wb.sheetnames:
            rows = list(wb[sheetname].rows)
            if not rows:
                # An empty sheet has no header row to read.
                continue
            missed = {i for i, cell in enumerate(rows[0]) if cell.value is None}
            headers = [cell.value for i, cell in enumerate(rows[0])
                       if i not in missed]
            data = []
            for i, r in enumerate(rows[1:]):
                row = [cell.value for ii, cell in enumerate(r)
                       if ii not in missed]
                if len(row) != len(headers):
                    fails.append(str(i))
                    continue
                data.append(row)
                done += 1
                if done % 999 == 0:
                    report(done * 0.6 / total,
                           self._status(done, fails, sheetname))
            if data:
                res.append(pd.DataFrame(np.array(data), columns=headers))
            else:
                # np.array([]) is 1-D and cannot be matched against several
                # column labels, so build the header-only frame directly.
                res.append(pd.DataFrame(columns=headers))
        report(0.6, self._status(done, fails))
        return res
def trans_datatime(s):
    """Normalise a date/time string to ``YYYY-MM-DDTHH:MM:SS``.

    Returns None when the stripped input cannot be parsed or formatted.
    """
    try:
        parsed = datetime_parse(s.strip())
        return parsed.strftime("%Y-%m-%dT%H:%M:%S")
    except Exception:
        # Best effort: anything unparsable simply is not a datetime.
        return None
def trans_bool(s):
    """Map a boolean-like token to its bilingual keyword pair.

    The input is stringified and stripped, then matched case-insensitively.
    Returns ``["yes", "是"]`` for true/yes/是, ``["no", "否"]`` for
    false/no/否, and None for anything else.
    """
    token = str(s).strip()
    candidates = (
        (r"(true|yes|是)$", ["yes", "是"]),
        (r"(false|no|否)$", ["no", "否"]),
    )
    for pattern, labels in candidates:
        if re.match(pattern, token, flags=re.IGNORECASE):
            return labels
    return None
def _convert_cell(text, ty):
    """Convert the string form of one cell to the column type ``ty``."""
    if ty == "int":
        # The classifier accepts "3.0" as an integer, but int("3.0") raises;
        # drop the all-zero fractional part first so the value is kept.
        return int(re.sub(r"\.0+$", "", text))
    if ty == "float":
        return float(text)
    if ty == "datetime":
        return trans_datatime(text)
    if ty == "bool":
        return trans_bool(text)
    return text


def column_data_type(arr):
    """Infer the dominant type of a column and convert its values in place.

    Every non-None value votes for one of ``int``, ``float``, ``bool``,
    ``datetime`` or ``text``. The type with the most votes wins; ties resolve
    in the order int, float, text, datetime, bool. Each non-None value is then
    converted to the winning type, and values that fail to convert are
    replaced with None. A ``text`` column with more than 128 entries of which
    fewer than 10% are distinct is reported as ``keyword``.

    Args:
        arr: Mutable sequence of cell values supporting ``len`` and integer
            item access/assignment (a list; presumably also a pandas Series
            with a default index - verify against the caller).

    Returns:
        A tuple ``(arr, ty)``: the same, mutated sequence and the type name.
    """
    uni = len({a for a in arr if a is not None})
    counts = {"int": 0, "float": 0, "text": 0, "datetime": 0, "bool": 0}
    for a in arr:
        if a is None:
            continue
        text = str(a)
        # NOTE(review): "%%" is a literal double percent sign here; a single
        # "%" may have been intended - confirm before changing.
        numeric = text.replace("%%", "")
        if re.match(r"[+-]?[0-9]+(\.0+)?$", numeric):
            counts["int"] += 1
        elif re.match(r"[+-]?[0-9.]+$", numeric):
            counts["float"] += 1
        elif re.match(r"(true|false|yes|no|是|否)$", text, flags=re.IGNORECASE):
            counts["bool"] += 1
        elif trans_datatime(text):
            counts["datetime"] += 1
        else:
            counts["text"] += 1

    # max() returns the first maximal key in insertion order, which gives the
    # tie-breaking order documented above.
    ty = max(counts, key=counts.get)

    for i in range(len(arr)):
        if arr[i] is None:
            continue
        try:
            arr[i] = _convert_cell(str(arr[i]), ty)
        except Exception:
            # Best effort: a value that does not fit the majority type (or
            # cannot be stored back into the container) becomes missing.
            arr[i] = None

    if ty == "text" and len(arr) > 128 and uni / len(arr) < 0.1:
        ty = "keyword"
    return arr, ty
def _noop_callback(prog, msg):
    """Progress sink used when the caller supplies no callback."""


def _read_delimited_text(filename, binary, callback, delimiter):
    """Parse delimiter-separated text into a list holding one DataFrame.

    The first line supplies the column names. Lines whose field count differs
    from the header are recorded as failures and skipped. Returns an empty
    list when there is no header line at all.
    """
    if binary:
        txt = binary.decode("utf-8")
    else:
        with open(filename, "r") as f:
            txt = f.read()

    # Accept both LF and CRLF line endings (bytes input is not newline
    # translated), and ignore the empty strings a trailing newline leaves.
    lines = re.split(r"\r?\n", txt)
    while lines and not lines[-1]:
        lines.pop()
    if not lines:
        callback(0.6, "Extract records: 0. ")
        return []

    def status():
        msg = "Extract records: {}. ".format(len(rows))
        if fails:
            msg += "{} failure, line: {}...".format(
                len(fails), ",".join(fails[:3]))
        return msg

    headers = lines[0].split(delimiter)
    rows, fails = [], []
    for i, line in enumerate(lines[1:]):
        row = line.split(delimiter)
        if len(row) != len(headers):
            fails.append(str(i))
            continue
        rows.append(row)
        if len(rows) % 999 == 0:
            callback(len(rows) * 0.6 / len(lines), status())
    callback(0.6, status())
    # Built from the list directly so a header-only file yields an empty
    # frame instead of a shape error.
    return [pd.DataFrame(rows, columns=headers)]


def chunk(filename, binary=None, callback=None, **kwargs):
    """Turn every row of a table file into one indexable record.

    Files ending in ``.xls``/``.xlsx`` are read with ``Excel``; files ending
    in ``.txt``/``.csv`` are read as delimiter-separated text. Columns named
    ``id``, ``_id``, ``index`` or ``idx`` are dropped. Each remaining column
    is typed with ``column_data_type`` and stored under a field named from
    the pinyin of the column name plus a type suffix. Text values are passed
    through ``huqie.qie``; every record is finished with ``tokenize``.

    Args:
        filename: File name; its extension selects the parser, and it is the
            path that is read when ``binary`` is falsy.
        binary: Optional raw file content (text files are decoded as UTF-8).
        callback: Optional ``callback(progress, message)`` progress hook.
        **kwargs: ``delimiter`` - field separator for text files
            (default: tab).

    Returns:
        A list of dicts, one per row that has at least one non-None value.

    Raises:
        NotImplementedError: If the file extension is not supported.
    """
    if callback is None:
        callback = _noop_callback

    if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        dfs = Excel()(filename, binary, callback)
    elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        dfs = _read_delimited_text(
            filename, binary, callback, kwargs.get("delimiter", "\t"))
    else:
        raise NotImplementedError(
            "file type not supported yet(excel, text, csv supported)")

    res = []
    PY = Pinyin()
    fields_map = {"text": "_tks", "int": "_int", "keyword": "_kwd",
                  "float": "_flt", "datetime": "_dt", "bool": "_kwd"}
    for df in dfs:
        for n in ["id", "_id", "index", "idx"]:
            if n in df.columns:
                del df[n]
        clmns = df.columns.values
        # Column names plus all text cells feed the language detection below.
        txts = list(copy.deepcopy(clmns))
        py_clmns = [PY.get_pinyins(n)[0].replace("-", "_") for n in clmns]
        clmn_tys = []
        for name in clmns:
            cln, ty = column_data_type(df[name])
            clmn_tys.append(ty)
            df[name] = cln
            if ty == "text":
                txts.extend([str(c) for c in cln if c])
        # One (field name, original column name) pair per column. Each entry
        # must use its own index, otherwise every column collapses onto the
        # last column's field name.
        clmns_map = [(py_clmns[j] + fields_map[clmn_tys[j]], clmns[j])
                     for j in range(len(clmns))]
        # TODO: set this column map to KB parser configuration

        eng = is_english(txts)
        for _, row in df.iterrows():
            d = {}
            row_txt = []
            for name, ty, (fld, _orig) in zip(clmns, clmn_tys, clmns_map):
                value = row[name]
                if value is None:
                    continue
                d[fld] = value if ty != "text" else huqie.qie(value)
                row_txt.append("{}:{}".format(name, value))
            if not row_txt:
                continue
            tokenize(d, "; ".join(row_txt), eng)
            res.append(d)
    callback(0.6, "")
    return res
if __name__ == "__main__":
    # Manual run: parse the file named by the first command-line argument
    # and discard all progress updates.
    import sys

    def _ignore_progress(prog, msg):
        """Accept a progress update and do nothing with it."""
        return None

    chunk(sys.argv[1], callback=_ignore_progress)