korean-gender-predictor / name_change.py
go00od's picture
변수 업데이트
69c7923 verified
from OldHangeul import text_to_jamo
import torch
import string
# Romanized equivalents of the Hangul jamo
# Initial consonants (choseong) -> romanization.
# Keys are the 19 conjoining jamo U+1100..U+1112 in code-point order. They are
# generated with chr() rather than typed as literals so the table cannot be
# damaged (or collapsed into duplicate keys) when the file is re-encoded.
initials = dict(zip(
    map(chr, range(0x1100, 0x1113)),
    (
        'K', 'KK', 'N', 'T', 'TT', 'R', 'M', 'P', 'PP', 'S', 'SS',
        'NG', 'C', 'CC', 'CH', 'KH', 'TH', 'PH', 'H',
    ),
))
# Vowels (jungseong) -> romanization.
# Keys are the 21 conjoining jamo U+1161..U+1175 in code-point order, generated
# with chr() so the table does not depend on how the file itself is encoded.
medials = dict(zip(
    map(chr, range(0x1161, 0x1176)),
    (
        'a', 'ae', 'ya', 'yae', 'eo', 'e', 'yeo', 'ye', 'o', 'wa',
        'wae', 'oe', 'yo', 'u', 'wo', 'we', 'wi', 'yu', 'eu', 'ui', 'i',
    ),
))
# Final consonants (jongseong) -> romanization.
# The empty string maps to itself; the remaining keys are the 27 conjoining
# jamo U+11A8..U+11C2 in code-point order, generated with chr() so the table
# does not depend on how the file itself is encoded.
finals = {'': ''}
finals.update(zip(
    map(chr, range(0x11A8, 0x11C3)),
    (
        'k', 'kk', 'ks', 'n', 'nj', 'nh', 't', 'r', 'rk', 'rm',
        'rb', 'rs', 'rt', 'rp', 'rh', 'm', 'p', 'ps', 's', 'ss', 'ng',
        'c', 'ch', 'kh', 'th', 'ph', 'h',
    ),
))
# Build the dictionaries used for the reverse (romanization -> jamo) mapping
# Inverted tables: romanization -> jamo. If two jamo share a romanization,
# the one that appears later in the forward table wins.
rev_initials = dict(zip(initials.values(), initials.keys()))
rev_medials = dict(zip(medials.values(), medials.keys()))
rev_finals = dict(zip(finals.values(), finals.keys()))
def hangul_to_roman(hangul):
    """Romanize every precomposed Hangul syllable in *hangul*.

    Each character inside the Hangul Syllables range (U+AC00..U+D7A3) is
    decomposed with ``text_to_jamo`` and replaced by the concatenation of its
    entries in ``initials``, ``medials`` and ``finals``. Every other character
    is copied to the output unchanged.

    Args:
        hangul: Text to convert; may mix Hangul with other characters.

    Returns:
        The converted string.

    Raises:
        KeyError: If a decomposed jamo has no entry in the lookup tables.
    """
    pieces = []
    for char in hangul:
        # The bounds are written as escapes so the range test does not depend
        # on the encoding of this source file.
        if not ('\uac00' <= char <= '\ud7a3'):
            pieces.append(char)
            continue
        jamos = text_to_jamo(char, compatibility=False, spacing=False)
        # The jamo are read from positions 0, 2 and 4 of the result.
        # NOTE(review): presumably text_to_jamo puts a separator between jamo
        # even with spacing=False; confirm against the OldHangeul package.
        initial = initials[jamos[0]]
        medial = medials[jamos[2]]
        # A result of length 5 is taken to mean the syllable has a final.
        final = finals[jamos[4]] if len(jamos) == 5 else ''
        pieces.append(initial + medial + final)
    return ''.join(pieces)
# Alphabet used for one-hot encoding: the ASCII letters followed by
# space, period, comma, semicolon and apostrophe.
all_letters = f"{string.ascii_letters} .,;'"
n_letters = len(all_letters)
# Find the index of a letter in all_letters, e.g. "a" = 0
def letterToIndex(letter):
    """Return the position of *letter* in ``all_letters``, or -1 if absent."""
    try:
        return all_letters.index(letter)
    except ValueError:
        # Same "not found" result that str.find reports.
        return -1
# Turn a line (a name) into a <line_length x 1 x n_letters> tensor,
# or an array of one-hot letter vectors
def lineToTensor(line):
    """Encode *line* as a stack of one-hot letter vectors.

    Args:
        line: Sequence of characters to encode.

    Returns:
        A zero-initialised tensor of shape ``(len(line), 1, n_letters)`` in
        which row ``i`` has a 1 at the index ``letterToIndex`` reports for
        ``line[i]``. Characters that ``letterToIndex`` does not know keep an
        all-zero row.
    """
    tensor = torch.zeros(len(line), 1, n_letters)
    for pos, letter in enumerate(line):
        idx = letterToIndex(letter)
        # letterToIndex returns -1 for an unknown character; indexing with -1
        # would set the LAST column and encode the character as the final
        # alphabet symbol, so skip negative indices.
        if idx >= 0:
            tensor[pos][0][idx] = 1
    return tensor