Spaces:
Build error
Build error
File size: 1,749 Bytes
3d38118 52771bf 3d38118 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
"""Generate align set (aset) based on pset (pair set), src_len and tgt_len."""
from typing import List, Tuple, Union
from itertools import zip_longest
# from logzero import logger
# fmt: off
def gen_aset(
    pset: List[Tuple[Union[str, float], Union[str, float], Union[str, float]]],
    src_len: int,  # n_rows
    tgt_len: int,  # n_cols
) -> List[Tuple[Union[str, float], Union[str, float], Union[str, float]]]:
    # fmt: on
    """Generate the align set (aset) from pset, src_len and tgt_len.

    The anchor rows of ``pset`` are copied into the result unchanged. Every
    index that lies between two consecutive anchors (and before the first /
    after the last one) is emitted as a filler row whose third item is ``""``.
    Inside a gap the two index runs are paired positionally; the shorter run
    is padded with ``""``.

    Original notes: ``src_len, tgt_len = cmat.shape``.

    Args:
        pset: anchor rows ``[x (lang2 zh), y (lang1 en), cos]``. The first two
            items of each row are converted with ``int`` to locate the anchor;
            anything after them is carried along untouched. An empty list
            ``[]`` or a single empty row ``[[]]`` means "no anchors".
        src_len: lang1 (en); upper bound (exclusive) of the second column.
        tgt_len: lang2 (zh); upper bound (exclusive) of the first column.

    Returns:
        aset, rows of three items covering ``0 .. tgt_len - 1`` in the first
        column and ``0 .. src_len - 1`` in the second::

            [0, 0, .]
            ...
            [tgt_len - 1, src_len - 1, .]

        Filler rows are ``(i, j, "")``, ``(i, "", "")`` or ``("", j, "")``.

    Raises:
        ValueError: if a row of a non-trivial ``pset`` has fewer than two
            items, or an index cannot be converted by ``int``.
    """
    # Both [] and [[]] carry no anchors. They go through the same gap filling
    # as everything else so that the rows are always three items wide (the
    # previous early return produced two-item rows for these inputs).
    if not pset or (len(pset) == 1 and not pset[0]):
        anchors = []
    else:
        anchors = pset

    aset = []
    pos0, pos1 = -1, -1  # indices of the most recent anchor, per column
    for elm in anchors:
        idx0, idx1, *_ = elm  # rows may or may not carry a trailing score
        idx0, idx1 = int(idx0), int(idx1)
        aset.extend(_fill_gap(pos0 + 1, idx0, pos1 + 1, idx1))
        aset.append(elm)  # keep the caller's row object as is
        pos0, pos1 = idx0, idx1

    # Trailing gap between the last anchor and the end of both texts, if any.
    aset.extend(_fill_gap(pos0 + 1, tgt_len, pos1 + 1, src_len))
    return aset


def _fill_gap(
    start0: int, stop0: int, start1: int, stop1: int
) -> List[Tuple[Union[int, str], Union[int, str], str]]:
    """Pair range(start0, stop0) with range(start1, stop1) as filler rows.

    The shorter range is padded with ``""`` and every row gets ``""`` as its
    third item. An empty or reversed range contributes nothing.
    """
    return [
        (left, right, "")
        for left, right in zip_longest(
            range(start0, stop0), range(start1, stop1), fillvalue=""
        )
    ]
|