# Spark-TTS-0.5B / sparktts/utils/token_parser.py
# mrfakename's picture
# Upload 43 files
# d93aca0 verified
# raw
# history blame
# 5.18 kB
# Task name -> task special token. Most tokens embed the task name verbatim,
# but "caption" and "speech_edit" use the shortened forms "cap" and "edit".
TASK_TOKEN_MAP = dict(
    vc="<|task_vc|>",
    tts="<|task_tts|>",
    asr="<|task_asr|>",
    s2s="<|task_s2s|>",
    t2s="<|task_t2s|>",
    understand="<|task_understand|>",
    caption="<|task_cap|>",
    controllable_tts="<|task_controllable_tts|>",
    prompt_tts="<|task_prompt_tts|>",
    speech_edit="<|task_edit|>",
)
# Coarse level label -> zero-based rank, listed from lowest to highest.
LEVELS_MAP = {
    label: rank
    for rank, label in enumerate(
        ("very_low", "low", "moderate", "high", "very_high")
    )
}
# One-based level number -> level label (the inverse direction of a
# label -> rank table, and offset by one). Presumably the number comes from a
# UI control, going by the name -- confirm against the caller.
LEVELS_MAP_UI = dict(
    enumerate(("very_low", "low", "moderate", "high", "very_high"), start=1)
)
# Gender label -> integer id used inside the gender token.
GENDER_MAP = dict(female=0, male=1)
# Age-group label -> integer id, ordered from youngest to oldest group.
AGE_MAP = {
    group: index
    for index, group in enumerate(
        ("Child", "Teenager", "Youth-Adult", "Middle-aged", "Elderly")
    )
}
# Emotion label -> integer id. Ids are positional (0 for the first label), and
# the id is formatted straight into the emotion token, so only append new
# labels at the end: inserting or reordering would renumber existing ids.
EMO_MAP = {
    label: index
    for index, label in enumerate(
        (
            "UNKNOWN",
            "NEUTRAL",
            "ANGRY",
            "HAPPY",
            "SAD",
            "FEARFUL",
            "DISGUSTED",
            "SURPRISED",
            "SARCASTIC",
            "EXCITED",
            "SLEEPY",
            "CONFUSED",
            "EMPHASIS",
            "LAUGHING",
            "SINGING",
            "WORRIED",
            "WHISPER",
            "ANXIOUS",
            "NO-AGREEMENT",
            "APOLOGETIC",
            "CONCERNED",
            "ENUNCIATED",
            "ASSERTIVE",
            "ENCOURAGING",
            "CONTEMPT",
        )
    )
}
class TokenParser:
    """Turn attribute labels and numeric values into special-token strings.

    Every method is a static method, so the class can be used without an
    instance. Label-based methods look the label up in a module-level map and
    raise ``KeyError`` for an unknown label (``task`` raises
    ``AssertionError`` instead). Value-based methods truncate the input with
    ``int()`` and clamp it to the range documented on each method.

    Input checks are written as explicit ``raise AssertionError`` rather than
    ``assert`` statements so that they still run under ``python -O`` while
    keeping the exception type existing callers may rely on.
    """

    def __init__(self):
        pass

    @staticmethod
    def age(age: str) -> str:
        """Return the ``<|age_N|>`` token for an ``AGE_MAP`` label.

        Raises:
            KeyError: if ``age`` is not a key of ``AGE_MAP``.
        """
        age_id = AGE_MAP[age]
        return f"<|age_{age_id}|>"

    @staticmethod
    def gender(gender: str) -> str:
        """Return the ``<|gender_N|>`` token for a ``GENDER_MAP`` label.

        Raises:
            KeyError: if ``gender`` is not a key of ``GENDER_MAP``.
        """
        gender_id = GENDER_MAP[gender]
        return f"<|gender_{gender_id}|>"

    @staticmethod
    def mel_value(mel: int) -> str:
        """Return the ``<|pitch_value_N|>`` token for a mel-scale pitch.

        ``mel`` is truncated with ``int()`` and clamped to [0, 1000].
        """
        mel = min(1000, max(0, int(mel)))
        return f"<|pitch_value_{mel}|>"

    @staticmethod
    def mel_level(level: str) -> str:
        """Return the ``<|pitch_label_N|>`` token for a ``LEVELS_MAP`` label.

        Raises:
            KeyError: if ``level`` is not a key of ``LEVELS_MAP``.
        """
        level_tag = LEVELS_MAP[level]
        return f"<|pitch_label_{level_tag}|>"

    @staticmethod
    def pitch_var_value(pitch_std: int) -> str:
        """Return the ``<|pitch_var_value_N|>`` token for a pitch std value.

        Unlike the other value methods, this one rejects non-``int`` input
        instead of truncating it. The value is clamped to [0, 10].

        Raises:
            AssertionError: if ``pitch_std`` is not an ``int``.
        """
        if not isinstance(pitch_std, int):
            raise AssertionError(
                f"pitch_std must be an int, got {type(pitch_std).__name__}"
            )
        # int() keeps bool input rendering as 0/1 rather than False/True.
        pitch_std = min(10, max(0, int(pitch_std)))
        return f"<|pitch_var_value_{pitch_std}|>"

    @staticmethod
    def pitch_var_level(level: str) -> str:
        """Return the ``<|pitch_var_label_N|>`` token for a ``LEVELS_MAP`` label.

        Raises:
            KeyError: if ``level`` is not a key of ``LEVELS_MAP``.
        """
        level_tag = LEVELS_MAP[level]
        return f"<|pitch_var_label_{level_tag}|>"

    @staticmethod
    def loudness_value(loudness: int) -> str:
        """Return the ``<|loudness_value_N|>`` token for a loudness in [0, 30].

        Negative input is rejected rather than clamped; input above 30 is
        clamped to 30. The value is truncated with ``int()``.

        Raises:
            AssertionError: if ``loudness >= 0`` does not hold.
        """
        if not loudness >= 0:
            raise AssertionError(f"loudness must be >= 0, got {loudness!r}")
        # The lower bound is already guaranteed by the check above.
        loudness = min(30, int(loudness))
        return f"<|loudness_value_{loudness}|>"

    @staticmethod
    def loudness_level(level: str) -> str:
        """Return the ``<|loudness_label_N|>`` token for a ``LEVELS_MAP`` label.

        Raises:
            KeyError: if ``level`` is not a key of ``LEVELS_MAP``.
        """
        level_tag = LEVELS_MAP[level]
        return f"<|loudness_label_{level_tag}|>"

    @staticmethod
    def speed_value(speed: int) -> str:
        """Return the ``<|speed_value_N|>`` token for a speed value.

        ``speed`` is truncated with ``int()`` and clamped to [0, 10].
        """
        speed = min(10, max(0, int(speed)))
        return f"<|speed_value_{speed}|>"

    @staticmethod
    def speed_level(level: str) -> str:
        """Return the ``<|speed_label_N|>`` token for a ``LEVELS_MAP`` label.

        Raises:
            KeyError: if ``level`` is not a key of ``LEVELS_MAP``.
        """
        level_tag = LEVELS_MAP[level]
        return f"<|speed_label_{level_tag}|>"

    @staticmethod
    def task(task: str) -> str:
        """Return the special token registered for ``task`` in ``TASK_TOKEN_MAP``.

        Raises:
            AssertionError: if ``task`` is not a key of ``TASK_TOKEN_MAP``.
        """
        if task not in TASK_TOKEN_MAP:
            raise AssertionError(
                f"unknown task {task!r}; expected one of {sorted(TASK_TOKEN_MAP)}"
            )
        return TASK_TOKEN_MAP[task]

    @staticmethod
    def emotion(emotion: str) -> str:
        """Return the ``<|emotion_N|>`` token for an ``EMO_MAP`` label.

        Raises:
            KeyError: if ``emotion`` is not a key of ``EMO_MAP``.
        """
        emo_id = EMO_MAP[emotion]
        return f"<|emotion_{emo_id}|>"
# Smoke test: build one concatenated token string per sample, encode it with
# the tokenizer and print the ids plus their decoded form for inspection.
if __name__ == "__main__":
    import sys

    from transformers import AutoTokenizer

    # The default is the checkpoint path this script was written against; pass
    # another tokenizer path or id as the first command-line argument to run
    # the check on a machine where that path does not exist.
    default_tokenizer = "/aifs4su/xinshengwang/code/StyleCraft/tokenizer/stylecraft-bicodec-pitch-loudness-speed-emotion-tokenizer"
    tokenizer_path = sys.argv[1] if len(sys.argv) > 1 else default_tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    # Parallel sample lists: position k across all lists describes sample k.
    tasks = ["tts", "tts", "understand", "controllable_tts", "prompt_tts"]
    ages = ["Child", "Teenager", "Youth-Adult", "Middle-aged", "Elderly"]
    genders = ["female", "female", "female", "male", "male"]
    mels = [100, 200, 300, 400, 500]
    mel_levels = ["very_low", "low", "moderate", "high", "very_high"]
    loudnesses = [1, 10, 23, 19, 30]
    loudness_levels = ["very_low", "low", "moderate", "high", "very_high"]
    emotions = ["UNKNOWN", "NEUTRAL", "ANGRY", "HAPPY", "SAD"]

    samples = zip(
        tasks,
        ages,
        genders,
        mels,
        mel_levels,
        loudnesses,
        loudness_levels,
        emotions,
    )
    for task, age, gender, mel, mel_level, loudness, loudness_level, emotion in samples:
        inputs = "".join(
            [
                TokenParser.task(task),
                TokenParser.age(age),
                TokenParser.gender(gender),
                TokenParser.mel_value(mel),
                TokenParser.mel_level(mel_level),
                TokenParser.loudness_value(loudness),
                TokenParser.loudness_level(loudness_level),
                TokenParser.emotion(emotion),
            ]
        )
        # add_special_tokens=False is passed so the tokenizer does not add
        # its own special tokens around the string.
        ids = tokenizer.encode(inputs, add_special_tokens=False)
        print(ids)
        print("decode", tokenizer.decode(ids))