File size: 3,885 Bytes
e6acaf6 407b252 e6acaf6 407b252 e6acaf6 407b252 e6acaf6 51482f3 e6acaf6 51482f3 e6acaf6 51482f3 e6acaf6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
import random
import re
from io import BytesIO
from nltk import word_tokenize
from openpyxl import load_workbook
from rag.parser import is_english, random_choices
from rag.nlp import huqie, stemmer
class Excel(object):
def __call__(self, fnm, binary=None, callback=None):
if not binary:
wb = load_workbook(fnm)
else:
wb = load_workbook(BytesIO(binary))
total = 0
for sheetname in wb.sheetnames:
total += len(list(wb[sheetname].rows))
res, fails = [], []
for sheetname in wb.sheetnames:
ws = wb[sheetname]
rows = list(ws.rows)
for i, r in enumerate(rows):
q, a = "", ""
for cell in r:
if not cell.value: continue
if not q: q = str(cell.value)
elif not a: a = str(cell.value)
else: break
if q and a: res.append((q, a))
else: fails.append(str(i+1))
if len(res) % 999 == 0:
callback(len(res)*0.6/total, ("Extract Q&A: {}".format(len(res)) + (f"{len(fails)} failure, line: %s..."%(",".join(fails[:3])) if fails else "")))
callback(0.6, ("Extract Q&A: {}. ".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
self.is_english = is_english([rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q)>1])
return res
def rmPrefix(txt):
return re.sub(r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:: ]+", "", txt.strip(), flags=re.IGNORECASE)
def beAdoc(d, q, a, eng):
qprefix = "Question: " if eng else "问题:"
aprefix = "Answer: " if eng else "回答:"
d["content_with_weight"] = "\t".join([qprefix+rmPrefix(q), aprefix+rmPrefix(a)])
if eng:
d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(q)])
else:
d["content_ltks"] = huqie.qie(q)
d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
return d
def chunk(filename, binary=None, callback=None, **kwargs):
res = []
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = Excel()
for q,a in excel_parser(filename, binary, callback):
res.append(beAdoc({}, q, a, excel_parser.is_english))
return res
elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = ""
if binary:
txt = binary.decode("utf-8")
else:
with open(filename, "r") as f:
while True:
l = f.readline()
if not l: break
txt += l
lines = txt.split("\n")
eng = is_english([rmPrefix(l) for l in lines[:100]])
fails = []
for i, line in enumerate(lines):
arr = [l for l in line.split("\t") if len(l) > 1]
if len(arr) != 2:
fails.append(str(i))
continue
res.append(beAdoc({}, arr[0], arr[1], eng))
if len(res) % 999 == 0:
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
return res
raise NotImplementedError("file type not supported yet(pptx, pdf supported)")
if __name__== "__main__":
import sys
def dummy(a, b):
pass
chunk(sys.argv[1], callback=dummy)
|