import re
from pathlib import Path
from unicodedata import normalize

import jieba
import opencc

# Keep jieba quiet on startup (20 == logging.INFO).
jieba.setLogLevel(20)

# Broaden jieba's default "han" pattern so CJK characters mixed with Latin
# letters, digits, and common in-word symbols stay in a single segment.
jieba.re_han_default = re.compile(r"([\u2e80-\U000e01efa-zA-Z0-9+#&._%\-']+)", re.U)

# Simplified-to-Traditional Chinese (Taiwan standard) converter.
s2tw_converter = opencc.OpenCC("s2tw.json")
def update_jieba_dict(
    lexicon: list,
    jieba_dict_path: Path,
    high_freq_words: list | None = None,
    high_freq_words_weight: int = 10,
) -> list:
    """Write the lexicon as a jieba user dictionary, weighting words by length
    and boosting designated high-frequency words."""
    lexicon = sorted(set(lexicon))
    high_freq_words = set(high_freq_words or [])

    # Drop any stale dictionary and jieba's on-disk cache so the rewritten
    # dictionary takes effect on the next segmentation call.
    jieba_dict_path.unlink(missing_ok=True)
    Path("/tmp/jieba.cache").unlink(missing_ok=True)

    with jieba_dict_path.open("w", encoding="utf-8") as file:
        for word in lexicon:
            if word in high_freq_words:
                file.write(f"{word} {len(word) * high_freq_words_weight}\n")
            else:
                file.write(f"{word} {len(word)}\n")

    # Force jieba to lazily re-initialize with the rewritten dictionary.
    jieba.dt.initialized = False

    return lexicon
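# Illustrative call (hypothetical words and path, not executed on import):
# update_jieba_dict(["台北", "語音辨識"], Path("lexicon.txt"),
#                   high_freq_words=["語音辨識"])
# would write a dictionary file containing:
#     台北 2
#     語音辨識 40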
def run_jieba(line: str) -> list:
    """Segment a line in precise mode; HMM is off, so out-of-vocabulary spans
    fall back to single characters instead of being guessed."""
    return list(jieba.cut(line, cut_all=False, HMM=False))
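# For instance, with a user dictionary containing 今天 and 天氣 (purely
# illustrative; the real output depends on the dictionary written by
# update_jieba_dict):
#     run_jieba("今天天氣很好") -> ["今天", "天氣", "很", "好"]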
def normalize_text(text: str, replace_dict: dict, replace_regex: str) -> str:
    """Strip control characters, apply NFKC normalization and the caller's
    replacement table, then collapse whitespace and uppercase."""

    def replace_match(match):
        return replace_dict[match.group(0)]

    # Remove stray control characters: backspace, BOM / zero-width no-break
    # space, and the DLE control code.
    text = re.sub("\x08", "", text)
    text = re.sub("\ufeff", "", text)
    text = re.sub("\u0010", "", text)
    # NFKC folds compatibility forms, e.g. full-width Latin letters and the
    # ideographic space, into their canonical half-width equivalents.
    text = normalize("NFKC", text)
    if replace_regex:
        text = re.sub(replace_regex, replace_match, text)
    # Collapse whitespace runs to single spaces and uppercase the result.
    text = " ".join(text.split()).upper()

    return text
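# Illustrative behavior with an assumed replacement table {"你好": "您好"}
# and its pattern from prep_regex:
#     normalize_text("ｈｅｌｌｏ　你好\ufeff", {"你好": "您好"}, "你好")
# returns "HELLO 您好": the BOM is stripped, NFKC folds the full-width
# letters and the ideographic space, the mapping is applied, and the text
# is uppercased.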
def apply_v2f(word_list: list, v2f_dict: dict, v2f_regex: str) -> list:
    """Rewrite each word by mapping every v2f_regex match through v2f_dict."""
    if not v2f_regex:
        return list(word_list)

    return [
        re.sub(v2f_regex, lambda x: v2f_dict[x.group(0)], word)
        for word in word_list
    ]
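# For example, assuming a v2f table that canonicalizes variant characters
# (the mapping below is an illustrative guess at what such a table holds):
#     apply_v2f(["台灣"], {"台": "臺"}, "台") -> ["臺灣"]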
def prep_regex(
    delimiter_list: list,
    replace_dict: dict | None = None,
    v2f_dict: dict | None = None,
) -> tuple[str, str, str]:
    """Build alternation regexes for delimiters, the replacement table, and
    the v2f mapping; empty tables yield empty pattern strings."""
    replace_dict = replace_dict or {}
    v2f_dict = v2f_dict or {}

    delimiter_regex = "|".join(map(re.escape, delimiter_list))

    # Sort replacement keys longest-first so a longer key wins over any of
    # its own prefixes when the alternation is matched.
    replace_regex = ""
    if replace_dict:
        sorted_keys = sorted(replace_dict.keys(), key=len, reverse=True)
        replace_regex = "|".join(map(re.escape, sorted_keys))

    v2f_regex = ""
    if v2f_dict:
        v2f_regex = "|".join(map(re.escape, v2f_dict.keys()))

    return delimiter_regex, replace_regex, v2f_regex
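if __name__ == "__main__":
    # Minimal end-to-end sketch. The lexicon, mappings, and path below are
    # illustrative assumptions, not the pipeline's real configuration.
    dict_path = Path("/tmp/jieba_user.dict")
    update_jieba_dict(["今天", "天氣", "台北", "您好"], dict_path)
    # Resetting jieba.dt.initialized only helps if jieba points at this
    # dictionary, so wire that up explicitly here.
    jieba.set_dictionary(str(dict_path))

    _, replace_regex, v2f_regex = prep_regex(
        delimiter_list=[",", "。"],
        replace_dict={"你好": "您好"},
        v2f_dict={"台": "臺"},
    )
    text = normalize_text("今天天氣 你好 台北", {"你好": "您好"}, replace_regex)
    words = run_jieba(text)
    print(apply_v2f(words, {"台": "臺"}, v2f_regex))
    # Indicative output: ['今天', '天氣', ' ', '您好', ' ', '臺北']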