GPT-SoVITS-v3 / GPT_SoVITS /TTS_infer_pack /text_segmentation_method.py

Upload folder using huggingface_hub

2c3577a verified 10 days ago

4.86 kB





	import re
	from typing import Callable

	punctuation = set(['!', '?', '…', ',', '.', '-'," "])
	METHODS = dict()

	def get_method(name:str)->Callable:
	method = METHODS.get(name, None)
	if method is None:
	raise ValueError(f"Method {name} not found")
	return method

	def get_method_names()->list:
	return list(METHODS.keys())

	def register_method(name):
	def decorator(func):
	METHODS[name] = func
	return func
	return decorator

	splits = {"，", "。", "？", "！", ",", ".", "?", "!", "~", ":", "：", "—", "…", }

	def split_big_text(text, max_len=510):
	# 定义全角和半角标点符号
	punctuation = "".join(splits)

	# 切割文本
	segments = re.split('([' + punctuation + '])', text)

	# 初始化结果列表和当前片段
	result = []
	current_segment = ''

	for segment in segments:
	# 如果当前片段加上新的片段长度超过max_len，就将当前片段加入结果列表，并重置当前片段
	if len(current_segment + segment) > max_len:
	result.append(current_segment)
	current_segment = segment
	else:
	current_segment += segment

	# 将最后一个片段加入结果列表
	if current_segment:
	result.append(current_segment)

	return result



	def split(todo_text):
	todo_text = todo_text.replace("……", "。").replace("——", "，")
	if todo_text[-1] not in splits:
	todo_text += "。"
	i_split_head = i_split_tail = 0
	len_text = len(todo_text)
	todo_texts = []
	while 1:
	if i_split_head >= len_text:
	break # 结尾一定有标点，所以直接跳出即可，最后一段在上次已加入
	if todo_text[i_split_head] in splits:
	i_split_head += 1
	todo_texts.append(todo_text[i_split_tail:i_split_head])
	i_split_tail = i_split_head
	else:
	i_split_head += 1
	return todo_texts


	# 不切
	@register_method("cut0")
	def cut0(inp):
	if not set(inp).issubset(punctuation):
	return inp
	else:
	return "/n"


	# 凑四句一切
	@register_method("cut1")
	def cut1(inp):
	inp = inp.strip("\n")
	inps = split(inp)
	split_idx = list(range(0, len(inps), 4))
	split_idx[-1] = None
	if len(split_idx) > 1:
	opts = []
	for idx in range(len(split_idx) - 1):
	opts.append("".join(inps[split_idx[idx]: split_idx[idx + 1]]))
	else:
	opts = [inp]
	opts = [item for item in opts if not set(item).issubset(punctuation)]
	return "\n".join(opts)


	# 凑50字一切
	@register_method("cut2")
	def cut2(inp):
	inp = inp.strip("\n")
	inps = split(inp)
	if len(inps) < 2:
	return inp
	opts = []
	summ = 0
	tmp_str = ""
	for i in range(len(inps)):
	summ += len(inps[i])
	tmp_str += inps[i]
	if summ > 50:
	summ = 0
	opts.append(tmp_str)
	tmp_str = ""
	if tmp_str != "":
	opts.append(tmp_str)
	# print(opts)
	if len(opts) > 1 and len(opts[-1]) < 50: ##如果最后一个太短了，和前一个合一起
	opts[-2] = opts[-2] + opts[-1]
	opts = opts[:-1]
	opts = [item for item in opts if not set(item).issubset(punctuation)]
	return "\n".join(opts)

	# 按中文句号。切
	@register_method("cut3")
	def cut3(inp):
	inp = inp.strip("\n")
	opts = ["%s" % item for item in inp.strip("。").split("。")]
	opts = [item for item in opts if not set(item).issubset(punctuation)]
	return "\n".join(opts)

	#按英文句号.切
	@register_method("cut4")
	def cut4(inp):
	inp = inp.strip("\n")
	opts = ["%s" % item for item in inp.strip(".").split(".")]
	opts = [item for item in opts if not set(item).issubset(punctuation)]
	return "\n".join(opts)

	# 按标点符号切
	# contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py
	@register_method("cut5")
	def cut5(inp):
	inp = inp.strip("\n")
	punds = {',', '.', ';', '?', '!', '、', '，', '。', '？', '！', ';', '：', '…'}
	mergeitems = []
	items = []

	for i, char in enumerate(inp):
	if char in punds:
	if char == '.' and i > 0 and i < len(inp) - 1 and inp[i - 1].isdigit() and inp[i + 1].isdigit():
	items.append(char)
	else:
	items.append(char)
	mergeitems.append("".join(items))
	items = []
	else:
	items.append(char)

	if items:
	mergeitems.append("".join(items))

	opt = [item for item in mergeitems if not set(item).issubset(punds)]
	return "\n".join(opt)



	if __name__ == '__main__':
	method = get_method("cut5")
	print(method("你好，我是小明。你好，我是小红。你好，我是小刚。你好，我是小张。"))