import re
from pathlib import Path
from unicodedata import normalize

import jieba
import opencc

# Keep jieba quiet on startup (20 == logging.INFO).
jieba.setLogLevel(20)

# Broaden jieba's default "han" pattern so CJK characters mixed with Latin
# letters, digits, and common in-word symbols stay in a single segment.
jieba.re_han_default = re.compile(r"([\u2e80-\U000e01efa-zA-Z0-9+#&._%\-']+)", re.U)

# Simplified-to-Traditional Chinese (Taiwan standard) converter.
s2tw_converter = opencc.OpenCC("s2tw.json")
def update_jieba_dict(
    lexicon: list,
    jieba_dict_path: Path,
    high_freq_words: list | None = None,
    high_freq_words_weight: int = 10,
) -> list:
    """Write the lexicon as a jieba user dictionary, weighting words by length
    and boosting designated high-frequency words."""
    lexicon = sorted(set(lexicon))
    high_freq_words = set(high_freq_words or [])

    # Drop any stale dictionary and jieba's on-disk cache so the rewritten
    # dictionary takes effect on the next segmentation call.
    jieba_dict_path.unlink(missing_ok=True)
    Path("/tmp/jieba.cache").unlink(missing_ok=True)

    with jieba_dict_path.open("w", encoding="utf-8") as file:
        for word in lexicon:
            if word in high_freq_words:
                file.write(f"{word} {len(word) * high_freq_words_weight}\n")
            else:
                file.write(f"{word} {len(word)}\n")

    # Force jieba to lazily re-initialize with the rewritten dictionary.
    jieba.dt.initialized = False

    return lexicon
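# Illustrative call (hypothetical words and path, not executed on import):
# update_jieba_dict(["台北", "語音辨識"], Path("lexicon.txt"),
#                   high_freq_words=["語音辨識"])
# would write a dictionary file containing:
#     台北 2
#     語音辨識 40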
def run_jieba(line: str) -> list:
    """Segment a line in precise mode; HMM is off, so out-of-vocabulary spans
    fall back to single characters instead of being guessed."""
    return list(jieba.cut(line, cut_all=False, HMM=False))
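# For instance, with a user dictionary containing 今天 and 天氣 (purely
# illustrative; the real output depends on the dictionary written by
# update_jieba_dict):
#     run_jieba("今天天氣很好") -> ["今天", "天氣", "很", "好"]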
def normalize_text(text: str, replace_dict: dict, replace_regex: str) -> str:
    """Strip control characters, apply NFKC normalization and the caller's
    replacement table, then collapse whitespace and uppercase."""

    def replace_match(match):
        return replace_dict[match.group(0)]

    # Remove stray control characters: backspace, BOM / zero-width no-break
    # space, and the DLE control code.
    text = re.sub("\x08", "", text)
    text = re.sub("\ufeff", "", text)
    text = re.sub("\u0010", "", text)
    # NFKC folds compatibility forms, e.g. full-width Latin letters and the
    # ideographic space, into their canonical half-width equivalents.
    text = normalize("NFKC", text)
    if replace_regex:
        text = re.sub(replace_regex, replace_match, text)
    # Collapse whitespace runs to single spaces and uppercase the result.
    text = " ".join(text.split()).upper()

    return text
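# Illustrative behavior with an assumed replacement table {"你好": "您好"}
# and its pattern from prep_regex:
#     normalize_text("ｈｅｌｌｏ　你好\ufeff", {"你好": "您好"}, "你好")
# returns "HELLO 您好": the BOM is stripped, NFKC folds the full-width
# letters and the ideographic space, the mapping is applied, and the text
# is uppercased.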
def apply_v2f(word_list: list, v2f_dict: dict, v2f_regex: str) -> list:
    """Rewrite each word by mapping every v2f_regex match through v2f_dict."""
    if not v2f_regex:
        return list(word_list)

    return [
        re.sub(v2f_regex, lambda x: v2f_dict[x.group(0)], word)
        for word in word_list
    ]
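# For example, assuming a v2f table that canonicalizes variant characters
# (the mapping below is an illustrative guess at what such a table holds):
#     apply_v2f(["台灣"], {"台": "臺"}, "台") -> ["臺灣"]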
def prep_regex(
    delimiter_list: list,
    replace_dict: dict | None = None,
    v2f_dict: dict | None = None,
) -> tuple[str, str, str]:
    """Build alternation regexes for delimiters, the replacement table, and
    the v2f mapping; empty tables yield empty pattern strings."""
    replace_dict = replace_dict or {}
    v2f_dict = v2f_dict or {}

    delimiter_regex = "|".join(map(re.escape, delimiter_list))

    # Sort replacement keys longest-first so a longer key wins over any of
    # its own prefixes when the alternation is matched.
    replace_regex = ""
    if replace_dict:
        sorted_keys = sorted(replace_dict.keys(), key=len, reverse=True)
        replace_regex = "|".join(map(re.escape, sorted_keys))

    v2f_regex = ""
    if v2f_dict:
        v2f_regex = "|".join(map(re.escape, v2f_dict.keys()))

    return delimiter_regex, replace_regex, v2f_regex
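if __name__ == "__main__":
    # Minimal end-to-end sketch. The lexicon, mappings, and path below are
    # illustrative assumptions, not the pipeline's real configuration.
    dict_path = Path("/tmp/jieba_user.dict")
    update_jieba_dict(["今天", "天氣", "台北", "您好"], dict_path)
    # Resetting jieba.dt.initialized only helps if jieba points at this
    # dictionary, so wire that up explicitly here.
    jieba.set_dictionary(str(dict_path))

    _, replace_regex, v2f_regex = prep_regex(
        delimiter_list=[",", "。"],
        replace_dict={"你好": "您好"},
        v2f_dict={"台": "臺"},
    )
    text = normalize_text("今天天氣 你好 台北", {"你好": "您好"}, replace_regex)
    words = run_jieba(text)
    print(apply_v2f(words, {"台": "臺"}, v2f_regex))
    # Indicative output: ['今天', '天氣', ' ', '您好', ' ', '臺北']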