radiobee-aligner / radiobee /gen_vector.py
freemt
Update before sent-align
4c04f50
raw
history blame
802 Bytes
"""gen tokens for english or chinese text for a given model."""
# pylint: disable=
from typing import List, Union
from textacy.representations import Vectorizer
from radiobee.insert_spaces import insert_spaces
# from radiobee.gen_model import gen_model
def gen_vector(text: Union[str, List[str]], model: Vectorizer) -> List[float]:
    r"""Gen vector for a given model.

    Args:
        text: a string of Chinese chars or English words, or a list of
            tokens. A string is tokenized here with
            ``insert_spaces(text).split()``; a list is handed to the model
            unchanged.
        model: model used; only its ``transform`` method is called.

    Returns:
        The value returned by ``model.transform`` for the tokens.
        NOTE(review): the signature says ``List[float]``, but the result is
        returned exactly as ``transform`` produces it -- presumably a
        matrix-like object rather than a plain list; confirm against textacy.

    Example setup for ``model`` (from the original notes)::

        filename = r"data\test-dual.txt"
        text = loadtext(filename)
        list1, list2 = zip(*text2lists(text))
        model = gen_model(list1)
    """
    # A non-str input is taken to be already tokenized the same way as the
    # tokens used to gen_model, so it is passed through untouched.
    tokens = insert_spaces(text).split() if isinstance(text, str) else text
    return model.transform(tokens)