# NOTE: page-header residue captured alongside this file, not part of the code:
#   Spaces: Runtime error / Runtime error
# Task name -> special task token. Looked up by TokenParser.task().
TASK_TOKEN_MAP = {
    "vc": "<|task_vc|>",
    "tts": "<|task_tts|>",
    "asr": "<|task_asr|>",
    "s2s": "<|task_s2s|>",
    "t2s": "<|task_t2s|>",
    "understand": "<|task_understand|>",
    # Note the abbreviated token names for the next and the last entry.
    "caption": "<|task_cap|>",
    "controllable_tts": "<|task_controllable_tts|>",
    "prompt_tts": "<|task_prompt_tts|>",
    "speech_edit": "<|task_edit|>",
}
# Level name -> ordinal (0-4). The ordinal is embedded in the "*_label_*"
# tokens built by TokenParser (pitch, pitch variation, loudness, speed).
LEVELS_MAP = {
    "very_low": 0,
    "low": 1,
    "moderate": 2,
    "high": 3,
    "very_high": 4,
}
# 1-based level number -> level name; the names are the keys of LEVELS_MAP.
# NOTE(review): presumably indexed by a UI control (not used in this file);
# confirm against callers.
LEVELS_MAP_UI = {
    1: "very_low",
    2: "low",
    3: "moderate",
    4: "high",
    5: "very_high",
}
# Gender label -> id embedded in the "<|gender_{id}|>" token.
GENDER_MAP = {
    "female": 0,
    "male": 1,
}
# Age-group label -> id embedded in the "<|age_{id}|>" token.
AGE_MAP = {
    "Child": 0,
    "Teenager": 1,
    "Youth-Adult": 2,
    "Middle-aged": 3,
    "Elderly": 4,
}
# Emotion label (upper case) -> id embedded in the "<|emotion_{id}|>" token.
EMO_MAP = {
    "UNKNOWN": 0,
    "NEUTRAL": 1,
    "ANGRY": 2,
    "HAPPY": 3,
    "SAD": 4,
    "FEARFUL": 5,
    "DISGUSTED": 6,
    "SURPRISED": 7,
    "SARCASTIC": 8,
    "EXCITED": 9,
    "SLEEPY": 10,
    "CONFUSED": 11,
    "EMPHASIS": 12,
    "LAUGHING": 13,
    "SINGING": 14,
    "WORRIED": 15,
    "WHISPER": 16,
    "ANXIOUS": 17,
    "NO-AGREEMENT": 18,
    "APOLOGETIC": 19,
    "CONCERNED": 20,
    "ENUNCIATED": 21,
    "ASSERTIVE": 22,
    "ENCOURAGING": 23,
    "CONTEMPT": 24,
}
class TokenParser:
    """Turn labels and numeric attribute values into special-token strings.

    Covers task tokens and the attributes of a person/utterance: age,
    gender, pitch, pitch variation, loudness, speed and emotion.

    The class holds no state. Every helper is a static method, so it can be
    called on the class (``TokenParser.speed_value(3)``) or on an instance.
    """

    @staticmethod
    def age(age: str) -> str:
        """Return the ``<|age_{id}|>`` token for an age label.

        Raises:
            KeyError: if ``age`` is not a key of ``AGE_MAP``.
        """
        age_id = AGE_MAP[age]
        return f"<|age_{age_id}|>"

    @staticmethod
    def gender(gender: str) -> str:
        """Return the ``<|gender_{id}|>`` token for a gender label.

        Raises:
            KeyError: if ``gender`` is not a key of ``GENDER_MAP``.
        """
        gender_id = GENDER_MAP[gender]
        return f"<|gender_{gender_id}|>"

    @staticmethod
    def mel_value(mel: int) -> str:
        """Return the mel-scale pitch value token.

        ``mel`` is truncated with ``int()`` and clamped to [0, 1000].
        """
        mel = min(1000, max(0, int(mel)))
        return f"<|pitch_value_{mel}|>"

    @staticmethod
    def mel_level(level: str) -> str:
        """Return the ``<|pitch_label_{id}|>`` token for a level name.

        Raises:
            KeyError: if ``level`` is not a key of ``LEVELS_MAP``.
        """
        level_tag = LEVELS_MAP[level]
        return f"<|pitch_label_{level_tag}|>"

    @staticmethod
    def pitch_var_value(pitch_std: int) -> str:
        """Return the pitch-std value token, clamped to [0, 10].

        Raises:
            AssertionError: if ``pitch_std`` is not an ``int``.
        """
        # NOTE: ``assert`` is skipped when Python runs with -O; the int()
        # coercion and clamp below still apply in that case.
        assert isinstance(pitch_std, int)
        pitch_std = min(10, max(0, int(pitch_std)))
        return f"<|pitch_var_value_{pitch_std}|>"

    @staticmethod
    def pitch_var_level(level: str) -> str:
        """Return the ``<|pitch_var_label_{id}|>`` token for a level name.

        Raises:
            KeyError: if ``level`` is not a key of ``LEVELS_MAP``.
        """
        level_tag = LEVELS_MAP[level]
        return f"<|pitch_var_label_{level_tag}|>"

    @staticmethod
    def loudness_value(loudness: int) -> str:
        """Return the loudness value token; values above 30 are capped at 30.

        Raises:
            AssertionError: if ``loudness`` is negative.
        """
        # NOTE: ``assert`` is skipped when Python runs with -O; negative
        # input is then clamped to 0 by the line below instead of failing.
        assert loudness >= 0
        loudness = min(30, max(0, int(loudness)))
        return f"<|loudness_value_{loudness}|>"

    @staticmethod
    def loudness_level(level: str) -> str:
        """Return the ``<|loudness_label_{id}|>`` token for a level name.

        Raises:
            KeyError: if ``level`` is not a key of ``LEVELS_MAP``.
        """
        level_tag = LEVELS_MAP[level]
        return f"<|loudness_label_{level_tag}|>"

    @staticmethod
    def speed_value(speed: int) -> str:
        """Return the speed value token.

        ``speed`` is truncated with ``int()`` and clamped to [0, 10].
        """
        speed = min(10, max(0, int(speed)))
        return f"<|speed_value_{speed}|>"

    @staticmethod
    def speed_level(level: str) -> str:
        """Return the ``<|speed_label_{id}|>`` token for a level name.

        Raises:
            KeyError: if ``level`` is not a key of ``LEVELS_MAP``.
        """
        level_tag = LEVELS_MAP[level]
        return f"<|speed_label_{level_tag}|>"

    @staticmethod
    def task(task: str) -> str:
        """Return the special token registered for ``task`` in ``TASK_TOKEN_MAP``.

        Raises:
            AssertionError: if ``task`` is not a known task name.
        """
        # NOTE: ``assert`` is skipped when Python runs with -O; an unknown
        # task then surfaces as a KeyError from the lookup below.
        assert task in TASK_TOKEN_MAP
        return TASK_TOKEN_MAP[task]

    @staticmethod
    def emotion(emotion: str) -> str:
        """Return the ``<|emotion_{id}|>`` token for an emotion label.

        Raises:
            KeyError: if ``emotion`` is not a key of ``EMO_MAP``.
        """
        emo_id = EMO_MAP[emotion]
        return f"<|emotion_{emo_id}|>"
# Manual smoke test: build one token string per sample and round-trip it
# through the tokenizer (encode, then decode).
if __name__ == "__main__":
    import sys

    from transformers import AutoTokenizer

    # Default tokenizer location; pass another path or name as the first
    # command-line argument to override it.
    default_tokenizer_path = "/aifs4su/xinshengwang/code/StyleCraft/tokenizer/stylecraft-bicodec-pitch-loudness-speed-emotion-tokenizer"
    tokenizer_path = sys.argv[1] if len(sys.argv) > 1 else default_tokenizer_path
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    # Parallel sample lists: position i in each list describes sample i.
    tasks = ["tts", "tts", "understand", "controllable_tts", "prompt_tts"]
    ages = ["Child", "Teenager", "Youth-Adult", "Middle-aged", "Elderly"]
    genders = ["female", "female", "female", "male", "male"]
    mels = [100, 200, 300, 400, 500]
    mel_levels = ["very_low", "low", "moderate", "high", "very_high"]
    loudnesses = [1, 10, 23, 19, 30]
    loudness_levels = ["very_low", "low", "moderate", "high", "very_high"]
    emotions = ["UNKNOWN", "NEUTRAL", "ANGRY", "HAPPY", "SAD"]

    samples = zip(
        tasks,
        ages,
        genders,
        mels,
        mel_levels,
        loudnesses,
        loudness_levels,
        emotions,
    )
    for (
        task_name,
        age_label,
        gender_label,
        mel_val,
        mel_lvl,
        loudness_val,
        loudness_lvl,
        emotion_label,
    ) in samples:
        tokens = [
            TokenParser.task(task_name),
            TokenParser.age(age_label),
            TokenParser.gender(gender_label),
            TokenParser.mel_value(mel_val),
            TokenParser.mel_level(mel_lvl),
            TokenParser.loudness_value(loudness_val),
            TokenParser.loudness_level(loudness_lvl),
            TokenParser.emotion(emotion_label),
        ]
        inputs = "".join(tokens)
        ids = tokenizer.encode(inputs, add_special_tokens=False)
        print(ids)
        print("decode", tokenizer.decode(ids))