deploy-s2s-api / utils /g2p /symbols.py
3v324v23's picture
Add application file
ad48e75
raw
history blame
4.2 kB
'''
Defines the set of symbols used in text input to the model.
'''
# japanese_cleaners
# _pad = '_'
# _punctuation = ',.!?-'
# _letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧ↓↑ '
'''# japanese_cleaners2
_pad = '_'
_punctuation = ',.!?-~…'
_letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧʦ↓↑ '
'''
'''# korean_cleaners
_pad = '_'
_punctuation = ',.!?…~'
_letters = 'ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ '
'''
'''# chinese_cleaners
_pad = '_'
_punctuation = ',。!?—…'
_letters = 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩˉˊˇˋ˙ '
'''
# # zh_ja_mixture_cleaners
# _pad = '_'
# _punctuation = ',.!?-~…'
# _letters = 'AEINOQUabdefghijklmnoprstuvwyzʃʧʦɯɹəɥ⁼ʰ`→↓↑ '
'''# sanskrit_cleaners
_pad = '_'
_punctuation = '।'
_letters = 'ँंःअआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहऽािीुूृॄेैोौ्ॠॢ '
'''
'''# cjks_cleaners
_pad = '_'
_punctuation = ',.!?-~…'
_letters = 'NQabdefghijklmnopstuvwxyzʃʧʥʦɯɹəɥçɸɾβŋɦː⁼ʰ`^#*=→↓↑ '
'''
'''# thai_cleaners
_pad = '_'
_punctuation = '.!? '
_letters = 'กขฃคฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลวศษสหฬอฮฯะัาำิีึืุูเแโใไๅๆ็่้๊๋์'
'''
# cjke_cleaners2 -- the symbol set that is actually enabled in this module.
# The three strings below are split into single characters and appended to
# `symbols` by the export statement near the end of the file.
_pad = "_"
_punctuation = ",.!?-~…"
_letters = (
    "NQabdefghijklmnopstuvwxyz"  # plain ASCII letters
    "ɑæʃʑçɯɪɔɛɹðəɫɥɸʊɾʒθβŋɦ"  # non-ASCII phonetic letters
    "⁼ʰ`^#*=ˈˌ→↓↑ "  # modifier/marker symbols, arrows, and the trailing space
)
'''# shanghainese_cleaners
_pad = '_'
_punctuation = ',.!?…'
_letters = 'abdfghiklmnopstuvyzøŋȵɑɔɕəɤɦɪɿʑʔʰ̩̃ᴀᴇ15678 '
'''
'''# chinese_dialect_cleaners
_pad = '_'
_punctuation = ',.!?~…─'
_letters = '#Nabdefghijklmnoprstuvwxyzæçøŋœȵɐɑɒɓɔɕɗɘəɚɛɜɣɤɦɪɭɯɵɷɸɻɾɿʂʅʊʋʌʏʑʔʦʮʰʷˀː˥˦˧˨˩̥̩̃̚ᴀᴇ↑↓∅ⱼ '
'''
''' # arabic cleaners (disabled variant: same as the active list below, but with PADDING_TOKEN as the first symbol)
PADDING_TOKEN = '_pad_'
EOS_TOKEN = '_eos_'
DOUBLING_TOKEN = '_dbl_'
SEPARATOR_TOKEN = '_+_'
EOS_TOKENS = [SEPARATOR_TOKEN, EOS_TOKEN]
symbols = [
# special tokens
PADDING_TOKEN, # padding
EOS_TOKEN, # eos-token
'_sil_', # silence
DOUBLING_TOKEN, # doubling
SEPARATOR_TOKEN, # word separator
# consonants
'<', # hamza
'b', # baa'
't', # taa'
'^', # thaa'
'j', # jiim
'H', # Haa'
'x', # xaa'
'd', # daal
'*', # dhaal
'r', # raa'
'z', # zaay
's', # siin
'$', # shiin
'S', # Saad
'D', # Daad
'T', # Taa'
'Z', # Zhaa'
'E', # 3ayn
'g', # ghain
'f', # faa'
'q', # qaaf
'k', # kaaf
'l', # laam
'm', # miim
'n', # nuun
'h', # haa'
'w', # waaw
'y', # yaa'
'v', # /v/ for loanwords e.g. in u'fydyw': u'v i0 d y uu1',
# vowels
'a', # short
'u',
'i',
'aa', # long
'uu',
'ii',
]
'''
# Arabic phoneme vocabulary (active variant, without a padding token).
# Element order is significant: symbol ids elsewhere in this file are taken
# from positions in `symbols` (see SPACE_ID), so entries must not be reordered.
# NOTE(review): the consonant codes look like Buckwalter transliteration --
# confirm against the Arabic phonemiser that produces them.
EOS_TOKEN = "_eos_"
DOUBLING_TOKEN = "_dbl_"
SEPARATOR_TOKEN = "_+_"
EOS_TOKENS = [SEPARATOR_TOKEN, EOS_TOKEN]

symbols = (
    # special tokens: eos-token, silence, doubling, word separator
    [EOS_TOKEN, "_sil_", DOUBLING_TOKEN, SEPARATOR_TOKEN]
    # consonants
    + [
        "<", "b", "t", "^",  # hamza, baa', taa', thaa'
        "j", "H", "x",       # jiim, Haa', xaa'
        "d", "*", "r", "z",  # daal, dhaal, raa', zaay
        "s", "$", "S", "D",  # siin, shiin, Saad, Daad
        "T", "Z", "E", "g",  # Taa', Zhaa', 3ayn, ghain
        "f", "q", "k", "l",  # faa', qaaf, kaaf, laam
        "m", "n", "h",       # miim, nuun, haa'
        "w", "y",            # waaw, yaa'
        "v",  # /v/ for loanwords e.g. in u'fydyw': u'v i0 d y uu1'
    ]
    # vowels
    + ["a", "u", "i"]     # short
    + ["aa", "uu", "ii"]  # long
)
# Export all symbols: extend the Arabic list in place with the pad symbol,
# then every punctuation character, then every letter character.
# NOTE(review): several characters (e.g. 'b', '^', '*', 'a') are already in
# the Arabic list, so the combined list contains duplicates. Left as-is
# because removing them would shift symbol ids -- confirm this is intended.
symbols.append(_pad)
symbols.extend(_punctuation)  # iterating a str yields one character at a time
symbols.extend(_letters)

# Special symbol ids: index of the first " " entry in the combined list.
SPACE_ID = symbols.index(" ")