|
|
|
|
|
|
|
import itertools |
|
import re |
|
|
|
# Chinese numerals indexed by digit value: position 0 is 零, position 9 is 九.
c_basic = "零一二三四五六七八九"

# Character-level lookup for digit-by-digit readings: each ASCII digit maps to
# its numeral, and the decimal point maps to 點.
d2c = dict(zip("0123456789", c_basic))
d2c["."] = "點"
|
|
|
|
|
def num4year(matched):
    """
    Regex callback that reads captured years digit by digit.

    For every capture group of ``matched`` (assumed to consist only of
    digits), each occurrence of that group's text inside the full match is
    replaced by the corresponding characters of ``c_basic``. Text outside the
    groups is returned unchanged.
    """
    spoken = matched.group(0)
    for digits in matched.groups():
        reading = "".join(c_basic[int(d)] for d in digits)
        spoken = spoken.replace(digits, reading)
    return spoken
|
|
|
|
|
def num2chines_simple(matched):
    """
    Read a numeric string one character at a time.

    Despite the parameter name, ``matched`` is iterated as a plain string:
    every character is looked up in ``d2c`` and the results are concatenated.
    A character missing from ``d2c`` raises ``KeyError``.
    """
    readings = (d2c[ch] for ch in matched)
    return "".join(readings)
|
|
|
|
|
def num4percent(matched):
    """
    Regex callback that reads a percentage as 百分之 followed by the number.

    Group 1 of ``matched`` holds the number together with one trailing
    character (the percent sign in the pattern used by ``parse_num``); that
    last character is dropped before the value is passed to ``num2chinese``.
    """
    percent_text = matched.group(1)
    value = percent_text[:-1]
    return "百分之" + num2chinese(value)
|
|
|
|
|
def num4cellphone(matched):
    """
    Regex callback that reads a phone number digit by digit.

    Group 1 of ``matched`` holds the phone number, possibly with separators
    between the digit groups. Only the ASCII digits are read; every other
    character is skipped.

    The separators are skipped generically rather than by removing just
    ``" "`` and ``"-"``: the pattern in ``parse_num`` allows ``\\s`` around the
    hyphen, and ``\\s`` also matches tabs, newlines and the ideographic space
    (U+3000), any of which would make ``int()`` raise ``ValueError``.
    """
    raw = matched.group(1)
    return "".join(c_basic[int(ch)] for ch in raw if ch in "0123456789")
|
|
|
|
|
def num4er(matched):
    """
    Regex callback that spells every "2" in group 1 as 二.

    All other characters of the captured text are returned unchanged.
    """
    return matched.group(1).replace("2", "二")
|
|
|
|
|
def num4liang(matched):
    """
    Regex callback that spells every "2" in group 1 as 兩.

    All other characters of the captured text are returned unchanged.
    """
    return matched.group(1).replace("2", "兩")
|
|
|
|
|
def num4general(matched):
    """
    Regex callback for any remaining number, with an optional prefix character.

    Group 1 of ``matched`` is either a number on its own or one non-digit
    character followed by a number:

    * starts with a digit: the whole text is read as a number;
    * starts with an ASCII letter, "-" or "─" and is followed by three or
      more characters: the prefix is kept and the rest is read character by
      character (serial-number style);
    * any other prefix, or a letter/dash prefix followed by fewer than three
      characters: the prefix is kept and the rest is read as a number.

    ``num2chinese`` rejects values of 1e48 and above with ``ValueError``. Such
    a digit run is read character by character instead, so one very long
    number does not abort the substitution of the whole text.
    """
    num = matched.group(1)
    head, rest = num[0], num[1:]

    def read_value(digits):
        try:
            return num2chinese(digits)
        except ValueError:
            # Only fall back for text the per-character table can read;
            # anything else is malformed input and keeps raising ValueError.
            if not all(ch in d2c for ch in digits):
                raise
            return num2chines_simple(digits)

    if re.match("[0-9]", head):
        return read_value(num)
    if re.match("[A-Za-z-─]", head) and len(rest) >= 3:
        return "{}{}".format(head, num2chines_simple(rest))
    return "{}{}".format(head, read_value(rest))
|
|
|
|
|
def parse_num(text: str) -> str:
    """
    Replace the Arabic numerals in ``text`` with Chinese readings.

    The rules run in a fixed order, each on the output of the previous one,
    so the specific formats (years, percentages, phone numbers, the 二/兩
    choice for a lone "2") are consumed before the general number rule sees
    them.

    Decimal handling:

    * fractions may have any number of digits ("12.34%", "3.123");
    * the general rule only takes a "." that is followed by a digit, so a
      sentence-ending period after a number is left in the text;
    * the lone-"2" rules skip a "2" that belongs to a decimal number
      ("3.2", "2.5"), which the general rule then reads as a whole.

    Returns the converted text.
    """
    rules = (
        # Years are read digit by digit: first ranges such as "2020到2021年",
        # then single years such as "2020年".
        ("([0-9]{4})[到至]([0-9]{4})年", num4year),
        ("([0-9]{4})年", num4year),
        # Percentages: "12.5%" -> 百分之...
        (r"([0-9]+\.?[0-9]*%)", num4percent),
        # Phone numbers of the form dddd-dddddd, read digit by digit.
        (r"([0-9]{4}\s?-\s?[0-9]{6})", num4cellphone),
        # A lone "2" that must be read 二: before 診/樓/月/號, or after 初.
        # "." is excluded on the left so the fraction of "1.2" is not taken.
        (r"([^\d.]2[診樓月號])", num4er),
        (r"([初]2[^\d])", num4er),
        # Any other lone "2" is read 兩. The "." exclusion on the left and the
        # (?!\.\d) lookahead keep the rule away from decimals such as "3.2"
        # and "2.5".
        (r"([^\d.]2(?!\.\d)[^\d])", num4liang),
        # Everything else: an optional one-character prefix plus a number
        # with an optional fractional part.
        (r"([^0-9]?[0-9]+(?:\.[0-9]+)?)", num4general),
    )
    for pattern, handler in rules:
        text = re.sub(pattern, handler, text)
    return text
|
|
|
|
|
def num2chinese(num, big=False, simp=False, o=False, twoalt=True) -> str:
    """
    Converts numbers to Chinese representations.

    https://gist.github.com/gumblex/0d65cad2ba607fd14de7

    `num`   : an int, float or numeric string; it is converted with `str()`
              and parsed character by character. A leading "+" or "-" is
              read as 正 / 負, and digits after "." are read one by one.
    `big`   : use financial characters.
    `simp`  : use simplified characters instead of traditional characters.
    `o`     : use 〇 for zero.
    `twoalt`: use 两/兩 for two when appropriate.

    Note that `o` and `twoalt` is ignored when `big` is used,
    and `twoalt` is ignored when `o` is used for formal representations.

    A missing integer part (".5") is read as zero, and surrounding
    whitespace is ignored.

    Raises ValueError when the text is not a number, when its magnitude is
    1e48 or more, or when it uses scientific notation.
    """
    # float() tolerates surrounding whitespace, so remove it before the
    # character-level parsing below.
    nd = str(num).strip()
    if abs(float(nd)) >= 1e48:
        raise ValueError("number out of range")
    # Case-insensitive so "1E5" gets the same message as "1e5".
    if "e" in nd.lower():
        raise ValueError("scientific notation is not supported")

    c_symbol = "正负点" if simp else "正負點"
    if o:
        twoalt = False
    if big:
        c_basic = "零壹贰叁肆伍陆柒捌玖" if simp else "零壹貳參肆伍陸柒捌玖"
        c_unit1 = "拾佰仟"
        c_twoalt = "贰" if simp else "貳"
    else:
        c_basic = "〇一二三四五六七八九" if o else "零一二三四五六七八九"
        c_unit1 = "十百千"
        if twoalt:
            c_twoalt = "两" if simp else "兩"
        else:
            c_twoalt = "二"
    c_unit2 = "万亿兆京垓秭穰沟涧正载" if simp else "萬億兆京垓秭穰溝澗正載"
    zero = c_basic[0]

    def revuniq(parts):
        # Pieces are collected least-significant first: put them back in
        # reading order and collapse runs of identical neighbours (repeated
        # zeros) into one.
        return "".join(k for k, _ in itertools.groupby(reversed(parts)))

    result = []
    if nd[0] == "+":
        result.append(c_symbol[0])
    elif nd[0] == "-":
        result.append(c_symbol[1])

    integer, _, remainder = nd.lstrip("+-").partition(".")
    # float() accepts ".5"; read the missing integer part as zero.
    integer = integer or "0"

    if int(integer):
        # Split into groups of four digits, least-significant group first.
        splitted = [integer[max(i - 4, 0) : i] for i in range(len(integer), 0, -4)]
        intresult = []
        for nu, unit in enumerate(splitted):
            if int(unit) == 0:
                # An all-zero group contributes a single placeholder zero.
                intresult.append(zero)
                continue
            if nu > 0 and int(unit) == 2:
                # Exactly two of a large unit: use the alternate two.
                intresult.append(c_twoalt + c_unit2[nu - 1])
                continue
            ulist = []
            unit = unit.zfill(4)
            for nc, ch in enumerate(reversed(unit)):
                if ch == "0":
                    # Trailing zeros of the group are silent; inner ones are read.
                    if ulist:
                        ulist.append(zero)
                elif nc == 0:
                    ulist.append(c_basic[int(ch)])
                elif nc == 1 and ch == "1" and unit[:2] == "00":
                    # 10-19 with nothing above: the bare ten character,
                    # without a leading "one".
                    ulist.append(c_unit1[0])
                elif nc > 1 and ch == "2":
                    ulist.append(c_twoalt + c_unit1[nc - 1])
                else:
                    ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])
            ustr = revuniq(ulist)
            if nu == 0:
                intresult.append(ustr)
            else:
                intresult.append(ustr + c_unit2[nu - 1])
        result.append(revuniq(intresult).strip(zero))
    else:
        result.append(zero)

    if remainder:
        result.append(c_symbol[2])
        result.append("".join(c_basic[int(ch)] for ch in remainder))
    return "".join(result)
|
|
|
|
|
if __name__ == "__main__":
    # Small demo: convert a sentence containing a phone number and print the
    # text before and after conversion.
    sample = "若手機仔幾多號?吾手機仔係0964-498042。"
    print("{} -> {}".format(sample, parse_num(sample)))
|
|