Spaces · Tibetan Aligner (Sleeping)

Commit 1a3c007, committed by 10zinten

Duplicate from openpecha/tibetan-aligner-api
Browse files
- .gitattributes +34 -0
- .gitignore +165 -0
- README.md +14 -0
- app.py +115 -0
- flagged/file_urls/tmpfuhgfj7m.json +1 -0
- flagged/log.csv +2 -0
- import_tibetan_aligner_source.py +26 -0
- requirements.txt +5 -0
- tibetan-aligner/README.md +5 -0
- tibetan-aligner/align_tib_en.sh +43 -0
- tibetan-aligner/convert_to_wylie.py +17 -0
- tibetan-aligner/create_train.py +60 -0
- tibetan-aligner/create_train_clean.py +37 -0
- tibetan-aligner/dp_core.cpython-310-x86_64-linux-gnu.so.reload1 +0 -0
- tibetan-aligner/dp_core.cpython-39-darwin.so.reload1 +0 -0
- tibetan-aligner/dp_core.pyx +411 -0
- tibetan-aligner/dp_utils.py +668 -0
- tibetan-aligner/get_vectors.py +35 -0
- tibetan-aligner/ladder +11 -0
- tibetan-aligner/ladder2org.py +47 -0
- tibetan-aligner/model_to_hub.py +7 -0
- tibetan-aligner/requirements.txt +3 -0
- tibetan-aligner/score.py +170 -0
- tibetan-aligner/vecalign.py +148 -0
- tm.py +169 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,165 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

*/ladder
data
README.md
ADDED
@@ -0,0 +1,14 @@
---
title: Tibetan Aligner
emoji: 📖
colorFrom: blue
colorTo: purple
sdk: gradio
sdk_version: 3.34.0
app_file: app.py
pinned: true
license: mit
duplicated_from: openpecha/tibetan-aligner-api
---

DISCLAIMER: This space has been created solely for testing and educational purposes. We do not claim any ownership or copyright over the align-tibetan script, which remains the sole property of its original creator, Sebastian Nehrlich. We have created this space to facilitate the use and testing of the align-tibetan script for interested users. If you use the align-tibetan script for any commercial or production purposes, we strongly encourage you to obtain permission from the original creator and comply with any relevant licensing requirements.
app.py
ADDED
@@ -0,0 +1,115 @@
import logging
import os
import re
import shutil
import stat
import subprocess
import time
import uuid
from contextlib import contextmanager
from pathlib import Path

import gradio as gr
import requests

from tm import create_tm

logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)

GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

ALIGNER_SCRIPT_DIR = Path("./tibetan-aligner").resolve()
ALIGNER_SCRIPT_NAME = "align_tib_en.sh"
ALIGNER_SCRIPT_PATH = ALIGNER_SCRIPT_DIR / ALIGNER_SCRIPT_NAME
assert ALIGNER_SCRIPT_PATH.is_file()


def make_dir_executable(dir_path: Path):
    for fn in dir_path.iterdir():
        st = os.stat(fn)
        os.chmod(fn, st.st_mode | stat.S_IEXEC)
        st = os.stat(fn)
        os.chmod(fn, st.st_mode | stat.S_IXGRP)
        st = os.stat(fn)
        os.chmod(fn, st.st_mode | stat.S_IXOTH)


make_dir_executable(ALIGNER_SCRIPT_DIR)


@contextmanager
def TemporaryDirectory():
    tmpdir = Path("./output").resolve() / uuid.uuid4().hex[:8]
    tmpdir.mkdir(exist_ok=True, parents=True)
    try:
        yield tmpdir
    finally:
        shutil.rmtree(str(tmpdir))


def download_file(github_file_url: str, output_fn) -> Path:
    """Download file from github"""
    headers = {
        "Authorization": f"token {GITHUB_TOKEN}",
        "Accept": "application/vnd.github+json",
    }
    authenticated_file_url = f"{github_file_url}?token={GITHUB_TOKEN}"
    with requests.get(authenticated_file_url, headers=headers, stream=True) as r:
        r.raise_for_status()
        with open(output_fn, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return output_fn


def _run_align_script(bo_fn, en_fn, output_dir):
    start = time.time()
    cmd = [str(ALIGNER_SCRIPT_PATH), str(bo_fn), str(en_fn), str(output_dir)]
    output = subprocess.run(
        cmd,
        check=True,
        capture_output=True,
        text=True,
        cwd=str(ALIGNER_SCRIPT_DIR),
    )
    output_fn = re.search(r"\[OUTPUT\] (.*)", output.stdout).group(1)
    output_fn = "/" + output_fn.split("//")[-1]
    end = time.time()
    total_time = round((end - start) / 60, 2)
    logging.info(f"Total time taken for Aligning: {total_time} mins")
    return output_fn


def align(text_pair):
    logging.info(f"Running aligner for TM{text_pair['text_id']}...")
    with TemporaryDirectory() as tmpdir:
        output_dir = Path(tmpdir)
        bo_fn = download_file(text_pair["bo_file_url"], output_fn=output_dir / "bo.tx")
        en_fn = download_file(text_pair["en_file_url"], output_fn=output_dir / "en.tx")
        aligned_fn = _run_align_script(bo_fn, en_fn, output_dir)
        repo_url = create_tm(aligned_fn, text_pair=text_pair)
        return {"tm_repo_url": repo_url}


with gr.Blocks() as demo:
    gr.Markdown("## Tibetan-English Aligner API")
    gr.Markdown("Please use Via API")
    input = gr.JSON(
        # value={
        #     "text_id": f"{uuid.uuid4().hex[:4]}",
        #     "bo_file_url": "https://raw.githubusercontent.com/OpenPecha/tibetan-aligner/main/tests/data/text-bo.txt",
        #     "en_file_url": "https://raw.githubusercontent.com/OpenPecha/tibetan-aligner/main/tests/data/text-en.txt",
        # }
    )
    output = gr.JSON()
    align_btn = gr.Button("Align")
    align_btn.click(
        fn=align,
        inputs=input,
        outputs=output,
        api_name="align",
    )


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True, debug=True)
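Since app.py registers the align function under api_name="align" and the UI itself says "Please use Via API", the Space is meant to be called programmatically. Below is a minimal sketch of such a call, assuming the gradio_client package (not listed in requirements.txt here) and using the duplicated-from Space id purely as a placeholder; the payload keys mirror what align() reads, and the sample URLs come from the commented-out example in app.py.

# Sketch only: gradio_client is an assumption, the Space id is a placeholder.
from gradio_client import Client

client = Client("openpecha/tibetan-aligner-api")  # placeholder Space id
result = client.predict(
    {
        "text_id": "demo",
        "bo_file_url": "https://raw.githubusercontent.com/OpenPecha/tibetan-aligner/main/tests/data/text-bo.txt",
        "en_file_url": "https://raw.githubusercontent.com/OpenPecha/tibetan-aligner/main/tests/data/text-en.txt",
    },
    api_name="/align",
)
print(result)  # expected shape: {"tm_repo_url": "..."}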
flagged/file_urls/tmpfuhgfj7m.json
ADDED
@@ -0,0 +1 @@
{"bo_file_url": "https://raw.githubusercontent.com/OpenPecha/tibetan-aligner/main/tests/data/text-bo.txt", "en_file_url": "https://raw.githubusercontent.com/OpenPecha/tibetan-aligner/main/tests/data/text-en.txt"}
flagged/log.csv
ADDED
@@ -0,0 +1,2 @@
file_urls,output,flag,username,timestamp
/home/user/app/flagged/file_urls/tmpfuhgfj7m.json,,,,2023-04-10 11:44:49.529324
import_tibetan_aligner_source.py
ADDED
@@ -0,0 +1,26 @@
import shutil
import sys
from pathlib import Path

NON_SOURCES_FILES = [
    ".",
    "..",
    ".git",
    ".github",
    ".gitignore",
    ".venv",
    ".idea",
    "Dockerfile",
    "__pycache__",
    "tests",
]

if __name__ == "__main__":
    source_dir = Path(sys.argv[1])
    dest_dir = Path(__file__).parent / source_dir.name
    dest_dir.mkdir(exist_ok=True)
    for fn in source_dir.iterdir():
        if fn.name in NON_SOURCES_FILES:
            continue
        dest_fn = dest_dir / fn.name
        shutil.copy2(str(fn), str(dest_fn))
requirements.txt
ADDED
@@ -0,0 +1,5 @@
sentence-transformers==2.2.2
pyewts==0.2.0
Cython==0.29.34
gradio>=3.34.0, <4.0
requests==2.28.2
tibetan-aligner/README.md
ADDED
@@ -0,0 +1,5 @@
# align-tibetan
Tibetan-English sentence alignment.
Simply run `bash align_tib_en.sh <tib_file> <eng_file>`.
The Tibetan file should be in Tibetan Unicode; the English file should be plain-text English.
There are some tunable parameters; please look into align_tib_en.sh.
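The README describes a plain shell invocation; app.py drives the same script from Python. A minimal sketch of that pattern is shown below (the input file names are hypothetical; the subprocess call and the "[OUTPUT]" parsing mirror _run_align_script() in app.py).

# Sketch: invoking align_tib_en.sh from Python, mirroring _run_align_script() in app.py.
# "bo.txt" and "en.txt" are hypothetical files in Tibetan Unicode and plain English.
import re
import subprocess

proc = subprocess.run(
    ["./align_tib_en.sh", "bo.txt", "en.txt", "output"],
    check=True, capture_output=True, text=True, cwd="tibetan-aligner",
)
# The script prints the aligned file path on a line starting with "[OUTPUT]".
aligned_fn = re.search(r"\[OUTPUT\] (.*)", proc.stdout).group(1)
print(aligned_fn)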
tibetan-aligner/align_tib_en.sh
ADDED
@@ -0,0 +1,43 @@
#!/bin/bash
number_of_overlays=6 # the higher the number of overlays, the more precise alignment is going to be, but also slower
deletion=0.06 # higher = less precise
search_buffer_size=50

# Args:
# first parameter is a file in Tibetan unicode
# second parameter is a file with English in plain text.
# third parameter is output path

cp $1 $1.work
cp $2 $2.work
output_dir=${3:-"output"}
mkdir $output_dir

cp $2.work $2.work2

echo '[INFO] Getting Embedding...'
time python get_vectors.py $1.work $number_of_overlays
time python get_vectors.py $2.work $number_of_overlays

rm ladder
echo '[INFO] Running alignment...'
time ./vecalign.py -a $number_of_overlays -d $deletion --search_buffer_size $search_buffer_size --alignment_max_size $number_of_overlays --src $1.work --tgt $2.work \
    --src_embed $1.work_overlay $1.work_vectors.npy \
    --tgt_embed $2.work_overlay $2.work_vectors.npy >> ladder

rm $1.org
rm $1.train
python ladder2org.py $1.work $2.work ladder >> $1.org
python create_train.py $1.work $2.work ladder >> $1.train
python create_train_clean.py $1.work $2.work ladder >> $1.train_cleaned

# clean up
mv *.txt* $output_dir/
mv $output_dir/requirements.txt ./
rm $output_dir/$1.work
rm $output_dir/$2.work
rm $output_dir/$2.work2
rm $output_dir/$1.work_vectors.npy
rm $output_dir/$2.work_vectors.npy

echo "[OUTPUT] $output_dir/$1.train_cleaned"
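The script's final artifact, echoed after "[OUTPUT]", is the .train_cleaned file that create_train_clean.py writes: one aligned pair per line, source-side segments first, then a tab, then target-side segments. A minimal sketch for consuming it, with a hypothetical file name:

# Sketch: reading the tab-separated output of create_train_clean.py.
# "bo.txt.train_cleaned" is a hypothetical output file name.
pairs = []
with open("bo.txt.train_cleaned", encoding="utf-8") as f:
    for line in f:
        if "\t" not in line:
            continue  # skip blank or malformed lines
        src, tgt = line.rstrip("\n").split("\t", 1)
        pairs.append((src.strip(), tgt.strip()))
print(len(pairs), "aligned segment pairs")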
tibetan-aligner/convert_to_wylie.py
ADDED
@@ -0,0 +1,17 @@
import sys
import pyewts


converter = pyewts.pyewts()
path = sys.argv[1]
result = ""

for line in open(path, "r"):
    line = converter.toWylie(line)
    result += line


with open(path, "w") as outfile:
    outfile.write(result)
tibetan-aligner/create_train.py
ADDED
@@ -0,0 +1,60 @@
import sys
import re
import re
f1 = open(sys.argv[1],'r')
f2 = open(sys.argv[2],'r')
ladder_file = open(sys.argv[3],'r')


output = ""
ladder = []
sktfile = [line.rstrip('\n').strip() for line in f1]
tibfile = [line.rstrip('\n').strip() for line in f2]
last_score = 0.5

def clean_num(string):
    string = re.sub("[^0-9, ]","",string)
    return int(string.split(',')[0])


for line in ladder_file:
    if len(line.split("\t")) == 3:
        skt,tib,score = line.split('\t')
        if re.search("[0-9]",skt) and re.search("[0-9]",tib):
            skt_num = clean_num(skt)
            tib_num = clean_num(tib)
            ladder.append([skt_num,tib_num,score])


    if ";" in line:
        m = re.search("([0-9., ]+);([0-9., ]+).*=\"([0-9.,]+)", line)
        if m:
            skt_num = int(m.group(1).split()[0].replace(".","").replace(",",""))-1
            tib_num = int(m.group(2).split()[0].replace(".","").replace(",",""))-1
            score = float(m.group(3))
            ladder.append([skt_num,tib_num,score])



    if len(line.split(':')) == 3:
        skt,tib,score = line.split(':')
        if re.search("[0-9]",skt) and re.search("[0-9]",tib):
            skt_num = clean_num(skt)
            tib_num = clean_num(tib)
            ladder.append([skt_num,tib_num,score])
last_skt = 0
last_tib = 0
for entry in ladder:
    output = output + ' '.join(sktfile[last_skt:entry[0]]) + "\t"
    output = output + ' '.join(tibfile[last_tib:entry[1]]) + "\n"
    last_skt = entry[0]
    last_tib = entry[1]
output = output + ' '.join(sktfile[last_skt:-1]) + "\t"
output = output + ' '.join(tibfile[last_tib:-1]) + "\n" # + str(entry[2])

short_f1 = re.sub("\.tsv.*","",sys.argv[1])
short_f2 = re.sub(".*/","",sys.argv[2])
short_f2 = re.sub("\.tsv.*","",short_f2)
print(output)
# with open(short_f1 + "_" + short_f2 + ".train", 'w') as file:
#     file.write(output)
tibetan-aligner/create_train_clean.py
ADDED
@@ -0,0 +1,37 @@
import sys
import re
import re
f1 = open(sys.argv[1],'r')
f2 = open(sys.argv[2],'r')
ladder_file = open(sys.argv[3],'r')


output = ""
ladder = []
sktfile = [line.rstrip('\n').strip() for line in f1]
tibfile = [line.rstrip('\n').strip() for line in f2]
last_score = 0.5

def clean_num(string):
    string = re.sub("[^0-9, ]","",string)
    numbers = []
    for number in string.split(','):
        numbers.append(int(number))
    return numbers


for line in ladder_file:
    if len(line.split(':')) == 3:
        skt,tib,score = line.split(':')
        if re.search("[0-9]",skt) and re.search("[0-9]",tib):
            skt_nums = clean_num(skt)
            tib_nums = clean_num(tib)
            for num in skt_nums:
                output += sktfile[num] + " "
            output += "\t"
            for num in tib_nums:
                output += tibfile[num] + " "
            output += "\n"
print(output)
# with open(short_f1 + "_" + short_f2 + ".train", 'w') as file:
#     file.write(output)
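Both create_train scripts consume the colon-separated ladder lines that vecalign emits via print_alignments() in dp_utils.py ('%s:%s:%.6f' with the two index lists and a score). A small worked example of how clean_num() in create_train_clean.py digests such a line; the concrete indices are illustrative only:

# Illustrative only: a ladder line in the format printed by print_alignments().
line = "[14, 15]:[12]:0.052310"
skt, tib, score = line.split(":")   # three colon-separated fields
# clean_num() keeps digits, commas and spaces, then splits on commas:
#   "[14, 15]" -> "14, 15" -> [14, 15]
#   "[12]"     -> "12"     -> [12]
# so this line pairs source sentences 14 and 15 with target sentence 12.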
tibetan-aligner/dp_core.cpython-310-x86_64-linux-gnu.so.reload1
ADDED
Binary file (643 kB).
tibetan-aligner/dp_core.cpython-39-darwin.so.reload1
ADDED
Binary file (170 kB).
tibetan-aligner/dp_core.pyx
ADDED
@@ -0,0 +1,411 @@
# cython: language_level=3

"""
Copyright 2019 Brian Thompson

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import numpy as np

cimport numpy as np
cimport cython


def make_x_y_offsets(alignment_types):
    # alignment types for which we will precompute costs

    # deletion/insertion is added later
    for x, y in alignment_types:
        assert (x > 0)
        assert (y > 0)

    x_offsets = np.array([x for x, y in alignment_types], dtype=np.int32)  # MUST **NOT** INCLUDE (0,1), (1,0)
    y_offsets = np.array([y for x, y in alignment_types], dtype=np.int32)  # MUST **NOT** INCLUDE (0,1), (1,0)
    return x_offsets, y_offsets


def make_dense_costs(np.ndarray[float, ndim=3] vecs0,  # input
                     np.ndarray[float, ndim=3] vecs1,  # input
                     np.ndarray[float, ndim=2] norm0,  # input
                     np.ndarray[float, ndim=2] norm1,  # input
                     int offset0 = 0,  # index into vecs0/norms0
                     int offset1 = 0,  # index into vecs1/norms1
                     ):
    """
    Make a full N*M feature matrix. By default, makes 1-1 alignments,
    can build others by specifying offset0, offset1 to index into
    vecs0, norms0 and vecs1, norms1 respectively.
    """
    assert vecs0.shape[0] > offset0
    assert vecs1.shape[0] > offset1
    assert norm0.shape[0] > offset0
    assert norm1.shape[0] > offset1

    cdef int size0 = np.shape(vecs0)[1]
    assert norm0.shape[1] == size0

    cdef int size1 = np.shape(vecs1)[1]
    assert norm1.shape[1] == size1

    cdef int vecsize = np.shape(vecs0)[2]
    assert vecs1.shape[2] == vecsize

    cdef int xi, yi
    cdef float sumx

    cdef np.ndarray[float, ndim=2] costs = np.empty((size0, size1), dtype=np.float32)

    for xi in range(size0):
        for yi in range(size1):
            sumx = 0.0
            for jj in range(vecsize):
                sumx += vecs0[offset0, xi, jj] * vecs1[offset1, yi, jj]

            costs[xi, yi] = 2.0 * (1.0 - sumx) / (1e-6 + norm0[offset0, xi] + norm1[offset1, yi])
            # normalize by alignment type
            costs[xi, yi] = costs[xi, yi] * (offset0 + 1) * (offset1 + 1)

    return costs


def dense_dp(np.ndarray[float, ndim=2] alignment_cost, float pen):
    """
    Compute cost matrix (csum) and backpointers (bp)
    from full 2-D 1-1 alignment costs matrix (alignment_cost)
    """

    size0 = alignment_cost.shape[0]
    size1 = alignment_cost.shape[1]
    # csum and traceback matrix are both on nodes
    #   so they are +1 in each dimension compared to the jump costs matrix
    # For anything being used in accumulation, use float64
    cdef np.ndarray[double, ndim=2] csum = np.empty((size0 + 1, size1 + 1), dtype=np.float64)
    cdef np.ndarray[int, ndim=2] bp = np.empty((size0 + 1, size1 + 1), dtype=np.int32)

    # bp and csum are nodes,
    #   while alignment_cost is the cost of going between the nodes
    # Size of nodes should be one larger than alignment costs
    b0, b1 = np.shape(bp)
    c0, c1 = np.shape(csum)
    j0, j1 = np.shape(alignment_cost)
    assert (b0 == c0 == j0 + 1)
    assert (b1 == c1 == j1 + 1)

    cdef int cmax = np.shape(csum)[1]
    cdef int rmax = np.shape(csum)[0]
    cdef int c, r
    cdef double cost0, cost1, cost2

    # initialize the all c-direction deletion path
    for c in range(cmax):
        csum[0, c] = c * pen
        bp[0, c] = 1

    # initialize the all r-direction deletion path
    for r in range(rmax):
        csum[r, 0] = r * pen
        bp[r, 0] = 2

    # Initial cost is 0.0
    csum[0, 0] = 0.0  # noop
    bp[0, 0] = 4  # should not matter

    # Calculate the rest recursively
    for c in range(1, cmax):
        for r in range(1, rmax):

            # alignment_cost indexes are off by 1 wrt
            #   csum/bp, since csum/bp are nodes
            cost0 = csum[r - 1, c - 1] + alignment_cost[r - 1, c - 1]
            cost1 = csum[r, c - 1] + pen
            cost2 = csum[r - 1, c] + pen

            csum[r, c] = cost0
            bp[r, c] = 0

            if cost1 < csum[r, c]:
                csum[r, c] = cost1
                bp[r, c] = 1
            if cost2 < csum[r, c]:
                csum[r, c] = cost2
                bp[r, c] = 2

    return csum, bp


def score_path(np.ndarray[int, ndim=1] xx,
               np.ndarray[int, ndim=1] yy,
               np.ndarray[float, ndim=1] norm1,
               np.ndarray[float, ndim=1] norm2,
               np.ndarray[float, ndim=2] vecs1,
               np.ndarray[float, ndim=2] vecs2,
               np.ndarray[float, ndim=1] out):
    cdef int xi, yi, ii, jj
    cdef float outx
    cdef int lenxy = xx.shape[0]
    cdef int vecsize = vecs1.shape[1]

    for ii in range(lenxy):
        xi = xx[ii]
        yi = yy[ii]
        outx = 0.0
        for jj in range(vecsize):
            outx += vecs1[xi, jj] * vecs2[yi, jj]
        out[ii] = 2.0 * (1.0 - outx) / (norm1[xi] + norm2[yi])


# Bounds checking and wraparound slow things down by about 2x
# Division by 0 checking has minimal speed impact
@cython.boundscheck(False)  # turn off bounds-checking for entire function
@cython.wraparound(False)  # turn off negative index wrapping for entire function
@cython.cdivision(True)  # use c-style division (no division-by-zero check)
def make_sparse_costs(np.ndarray[float, ndim=3] vecs0,  # input: num aligns X num sents X dim
                      np.ndarray[float, ndim=3] vecs1,  # input
                      np.ndarray[float, ndim=2] norms0,  # input: num aligns X num sents
                      np.ndarray[float, ndim=2] norms1,  # input
                      x_y_path,
                      alignment_types,
                      int width_over2):
    """
    Make features for DP, *for lines running across approximate path*, *for each alignment type*
    x_offsets, y_offsets should not include (0,1), (1,0)

    Basically, we take the feature matrix, rotate it 45 degrees,
    and compute a "wavy" matrix for the features.
    It's like the diagonal but it moves around to hopefully always include the true path.
    """

    cdef np.ndarray[int, ndim=2] x_y_path_ = np.array(x_y_path).astype(np.int32)

    assert (vecs0.shape[0] == norms0.shape[0])
    assert (vecs1.shape[0] == norms1.shape[0])

    assert (vecs0.shape[1] == norms0.shape[1])
    assert (vecs1.shape[1] == norms1.shape[1])

    # check how many overlaps vectors were passed in
    num_overlaps_in_vecs0 = vecs0.shape[0]
    num_overlaps_in_vecs1 = vecs1.shape[0]

    # check how many overlaps were requested
    # edge case: alignment_types could be empty
    #   In that case, we should just return insertions/deletions
    #   and max_x_overlap == max_y_overlap == 0
    max_x_overlap = max([0] + [x for x, y in alignment_types])  # add [0] in case alignment_types is empty
    max_y_overlap = max([0] + [y for x, y in alignment_types])  # add [0] in case alignment_types is empty

    # note: alignment types are specified 1-based, but vectors are stored 0-based
    if max_x_overlap > num_overlaps_in_vecs0:
        raise Exception('%d x overlaps requested (via alignment_types), but vecs0 only has %d' % (
            max_x_overlap, num_overlaps_in_vecs0))
    if max_y_overlap > num_overlaps_in_vecs1:
        raise Exception('%d y overlaps requested (via alignment_types), but vecs1 only has %d' % (
            max_y_overlap, num_overlaps_in_vecs1))

    # number of sentences in each document
    cdef int xsize = vecs0.shape[1]
    cdef int ysize = vecs1.shape[1]

    # vector dimensions should match
    assert (vecs0.shape[2] == vecs1.shape[2])

    cdef np.ndarray[int, ndim=1] x_offsets, y_offsets
    x_offsets, y_offsets = make_x_y_offsets(alignment_types)

    # reserve outputs
    a_len = x_y_path_.shape[0]
    b_len = 2 * width_over2
    cdef np.ndarray[float, ndim=3] a_b_feats = np.empty((len(alignment_types), a_len, b_len), dtype=np.float32)
    cdef np.ndarray[int, ndim=1] b_offset = np.empty(a_len).astype(np.int32)

    cdef int x, y, aa, bb, xx, yy, a_idx, b_idx, bb2, x_offset, y_offset, ii_align, x_offset_idx, y_offset_idx
    cdef int vecsize = vecs0.shape[2]
    cdef int num_alignments = x_offsets.shape[0]

    cdef float sumx, feat
    cdef float inf = np.inf

    for ii in range(x_y_path_.shape[0]):
        x = x_y_path_[ii, 0]
        y = x_y_path_[ii, 1]

        # convert xy to ab coords
        aa = x + y
        bb = y

        a_idx = aa
        b_offset[aa] = bb - width_over2
        for b_idx, bb2 in enumerate(range(bb - width_over2, bb + width_over2)):
            # convert ab to xy coords
            xx = aa - bb2
            yy = bb2

            for ii_align in range(num_alignments):
                x_offset = x_offsets[ii_align]
                x_offset_idx = x_offset - 1  # overlaps start at 1, vectors stored 0-based
                y_offset = y_offsets[ii_align]
                y_offset_idx = y_offset - 1

                if 0 <= xx < xsize and 0 <= yy < ysize:
                    sumx = 0.0
                    for jj in range(vecsize):
                        sumx += vecs0[x_offset_idx, xx, jj] * vecs1[y_offset_idx, yy, jj]
                    feat = 2.0 * x_offset * y_offset * (1.0 - sumx) / (
                            1e-6 + norms0[x_offset_idx, xx] + norms1[y_offset_idx, yy])

                else:
                    feat = inf

                a_b_feats[ii_align, a_idx, b_idx] = feat

    return a_b_feats, b_offset


def sparse_dp(np.ndarray[float, ndim=3] a_b_costs,
              np.ndarray[int, ndim=1] b_offset_in,
              alignment_types,
              double del_penalty,
              int x_in_size,
              int y_in_size):
    """
    Do DP along a path, using features saved off along path.
    x_offsets, y_offsets should not include (0,1), (1,0)

    xsize, ysize refer to the costs a_b_csum, but in x/y space

    As in the simpler full-DP case,
    we compute cumulative costs and backpointers on nodes,
    and there are COSTS associated with moving between them.

    This means the size of the nodes is +1,+1 larger (in x,y) than the COSTS.

    So the size of a_b_csum, a_b_xp, a_b_yp are all one larger in x and y compared to the costs

    In order to save memory (and time, vs a sparse matrix with hashes to look up values), let:
       a = x + y
       b = x - y

    b_offsets tells us how far from the left edge the features are computed for.
    basically it's like we are computing along the diagonal,
    but we shift the diagonal around based on our belief
    about where the alignments are.

    b_offsets is used for both costs AND csum, backpointers, so it needs to be
    +2 longer (it is in the a-direction) than the costs (in the a direction)
    """
    cdef np.ndarray[int, ndim=1] x_offsets, y_offsets
    x_offsets, y_offsets = make_x_y_offsets(alignment_types)

    # make x/y offsets, including (0,1), (1,0), i.e. including deletion and insertion
    x_offsets = np.concatenate([x_offsets, np.array([0, 1], dtype=np.int32)])
    y_offsets = np.concatenate([y_offsets, np.array([1, 0], dtype=np.int32)])

    cdef int a_in_size = a_b_costs.shape[1]
    cdef int b_in_size = a_b_costs.shape[2]

    cdef int a_out_size = a_in_size + 2
    cdef int b_out_size = b_in_size

    cdef int x_out_size = x_in_size + 1
    cdef int y_out_size = y_in_size + 1

    # costs are the costs of going between nodes.
    #   in x,y for the nodes, we basically add a buffer
    #   at x=0 and y=0, and shift the cost by (x=+1,y=+1)
    # In a,b space, this means adding two points (for the buffer)
    #   at the beginning, and shifting by (a=+0,b=+1) since
    #   a=x+y and b=y
    # for the first two points, we can simply replicate the
    #   original b_offset, since it should be -width_over2
    #   i.e. b_offset_in[0] == -width_over2
    extra_two_points = np.array([b_offset_in[0], b_offset_in[0]], dtype=np.int32)
    cdef np.ndarray[int, ndim=1] b_offset_out = np.concatenate([extra_two_points, b_offset_in + 1])

    # outputs
    # For anything being used in accumulation, use float64
    cdef np.ndarray[double, ndim=2] a_b_csum = np.zeros((a_in_size + 2, b_in_size),
                                                        dtype=np.float64) + np.inf  # error cumulative sum
    cdef np.ndarray[int, ndim=2] a_b_xp = np.zeros((a_in_size + 2, b_in_size), dtype=np.int32) - 2  # backpointer for x
    cdef np.ndarray[int, ndim=2] a_b_yp = np.zeros((a_in_size + 2, b_in_size), dtype=np.int32) - 2  # backpointer for y

    cdef int num_alignments = x_offsets.shape[0]
    cdef double inf = np.inf
    cdef int xx_out, yy_out, ii_align, x_offset, y_offset
    cdef int aa_in_cost, bb_in_cost, aa_out, bb_out, aa_out_prev, bb_out_prev, xx_in_cost, yy_in_cost, xx_out_prev, yy_out_prev

    cdef double alignment_cost, total_cost, prev_cost

    # increasing in a is the same as going along diagonals in x/y, so DP order works
    #   (and any ordering is fine in b - nothing depends on values adjacent on diagonal in x/y)
    for aa_out in range(a_in_size + 2):
        for bb_out in range(b_in_size):
            # xx_out, yy_out = ab2xy_w_offset(aa_out, bb_out, b_offset_out)
            yy_out = bb_out + b_offset_out[aa_out]
            xx_out = aa_out - yy_out

            # edge case: all deletions in y-direction
            if xx_out == 0 and 0 <= yy_out < y_out_size:
                a_b_csum[aa_out, bb_out] = del_penalty * yy_out
                a_b_xp[aa_out, bb_out] = 0
                a_b_yp[aa_out, bb_out] = 1

            # edge case: all deletions in x-direction
            elif yy_out == 0 and 0 <= xx_out < x_out_size:
                a_b_csum[aa_out, bb_out] = del_penalty * xx_out
                a_b_xp[aa_out, bb_out] = 1
                a_b_yp[aa_out, bb_out] = 0

            else:
                # initialize output to inf
                a_b_csum[aa_out, bb_out] = inf
                a_b_xp[aa_out, bb_out] = -42
                a_b_yp[aa_out, bb_out] = -42

                for ii_align in range(num_alignments):
                    x_offset = x_offsets[ii_align]
                    y_offset = y_offsets[ii_align]

                    # coords of location of alignment cost, in input x/y space
                    xx_in_cost = xx_out - 1  # features were front padded,
                    yy_in_cost = yy_out - 1  # so offset is always 1

                    # the coords of location of previous cumsum cost, in input x/y space
                    xx_out_prev = xx_out - x_offset
                    yy_out_prev = yy_out - y_offset

                    if 0 <= xx_in_cost < x_in_size and 0 <= yy_in_cost < y_in_size and 0 <= xx_out_prev < x_out_size and 0 <= yy_out_prev < y_out_size:
                        # convert x,y to a,b
                        aa_in_cost = xx_in_cost + yy_in_cost
                        bb_in_cost = yy_in_cost - b_offset_in[aa_in_cost]

                        aa_out_prev = xx_out_prev + yy_out_prev
                        bb_out_prev = yy_out_prev - b_offset_out[aa_out_prev]

                        if 0 <= aa_in_cost < a_in_size and 0 <= bb_in_cost < b_in_size and 0 <= aa_out_prev < a_out_size and 0 <= bb_out_prev < b_out_size:
                            if x_offset == 0 or y_offset == 0:
                                alignment_cost = del_penalty
                            else:
                                alignment_cost = a_b_costs[ii_align, aa_in_cost, bb_in_cost]

                            prev_cost = a_b_csum[aa_out_prev, bb_out_prev]

                            total_cost = prev_cost + alignment_cost

                            if total_cost < a_b_csum[aa_out, bb_out]:
                                a_b_csum[aa_out, bb_out] = total_cost
                                a_b_xp[aa_out, bb_out] = x_offset
                                a_b_yp[aa_out, bb_out] = y_offset

    return a_b_csum, a_b_xp, a_b_yp, b_offset_out
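sparse_dp works in rotated (a, b) coordinates so that only a band around the approximate alignment path needs to be stored. dp_utils.py provides the matching helpers ab2xy_w_offset() and xy2ab_w_offset(), where a = x + y and b is measured from a per-diagonal offset. A small illustrative round trip with made-up numbers, mirroring those helpers:

# Illustrative numbers only; mirrors ab2xy_w_offset() / xy2ab_w_offset() in dp_utils.py.
import numpy as np

b_offset = np.array([0, 0, 0, 1, 1])  # hypothetical per-diagonal offsets (b_offset[a])

def xy2ab(xx, yy):
    aa = xx + yy                      # diagonal index
    return aa, yy - b_offset[aa]      # band position relative to the diagonal's offset

def ab2xy(aa, bb):
    yy = bb + b_offset[aa]
    return aa - yy, yy

aa, bb = xy2ab(3, 1)                  # point x=3, y=1 lies on diagonal a=4
assert ab2xy(aa, bb) == (3, 1)        # the mapping is invertible given the same offsets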
tibetan-aligner/dp_utils.py
ADDED
@@ -0,0 +1,668 @@
"""
Copyright 2019 Brian Thompson

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import logging
import sys
from ast import literal_eval
from collections import OrderedDict
from math import ceil
from time import time

import numpy as np

import pyximport
pyximport.install(setup_args={'include_dirs': np.get_include()}, inplace=True, reload_support=True)

from dp_core import make_dense_costs, score_path, sparse_dp, make_sparse_costs, dense_dp

logger = logging.getLogger('vecalign')  # set up in vecalign.py


def preprocess_line(line):
    line = line.strip()
    if len(line) == 0:
        line = 'BLANK_LINE'
    return line


def yield_overlaps(lines, num_overlaps):
    lines = [preprocess_line(line) for line in lines]
    for overlap in range(1, num_overlaps + 1):
        for out_line in layer(lines, overlap):
            # check must be here so all outputs are unique
            out_line2 = out_line[:10000]  # limit line so dont encode arbitrarily long sentences
            yield out_line2


def read_in_embeddings(text_file, embed_file):
    """
    Given a text file with candidate sentences and a corresponding embedding file,
    make a mapping from candidate sentence to embedding index,
    and a numpy array of the embeddings
    """
    sent2line = dict()
    with open(text_file, 'rt', encoding="utf-8") as fin:
        for ii, line in enumerate(fin):
            # don't know if it is a good idea to uncomment these two lines ###
            # if line.strip() in sent2line:
            #     raise Exception('got multiple embeddings for the same line:', line)
            sent2line[line.strip()] = ii

    line_embeddings = np.load(embed_file, allow_pickle=True)
    print("LINE EMBEDDINGS SHAPE", line_embeddings.shape)
    # line_embeddings = np.fromfile(embed_file, dtype=np.float32, count=-1)
    # if line_embeddings.size == 0:
    #     raise Exception('Got empty embedding file')
    # print("Line embeddings size", len(line_embeddings))
    # laser_embedding_size = line_embeddings.size // len(sent2line)  # currently hardcoded to 1024
    # if laser_embedding_size != 1024:
    #     logger.warning('expected an embedding size of 1024, got %s', laser_embedding_size)
    # logger.info('laser_embedding_size determined to be %d', laser_embedding_size)
    # line_embeddings.resize(line_embeddings.shape[0] // laser_embedding_size, laser_embedding_size)
    return sent2line, line_embeddings


def make_doc_embedding(sent2line, line_embeddings, lines, num_overlaps):
    """
    lines: sentences in input document to embed
    sent2line, line_embeddings: precomputed embeddings for lines (and overlaps of lines)
    """

    lines = [preprocess_line(line) for line in lines]

    vecsize = line_embeddings.shape[1]

    vecs0 = np.empty((num_overlaps, len(lines), vecsize), dtype=np.float32)

    for ii, overlap in enumerate(range(1, num_overlaps + 1)):
        for jj, out_line in enumerate(layer(lines, overlap)):
            try:
                line_id = sent2line[out_line]
            except KeyError:
                logger.warning('Failed to find overlap=%d line "%s". Will use random vector.', overlap, out_line)
                line_id = None

            if line_id is not None:
                vec = line_embeddings[line_id]
            else:
                vec = np.random.random(vecsize) - 0.5
                vec = vec / np.linalg.norm(vec)

            vecs0[ii, jj, :] = vec

    return vecs0


def make_norm1(vecs0):
    """
    make vectors norm==1 so that cosine distance can be computed via dot product
    """
    for ii in range(vecs0.shape[0]):
        for jj in range(vecs0.shape[1]):
            norm = np.sqrt(np.square(vecs0[ii, jj, :]).sum())
            vecs0[ii, jj, :] = vecs0[ii, jj, :] / (norm + 1e-5)


def layer(lines, num_overlaps, comb=' '):
    """
    make front-padded overlapping sentences
    """
    if num_overlaps < 1:
        raise Exception('num_overlaps must be >= 1')
    out = ['PAD', ] * min(num_overlaps - 1, len(lines))
    for ii in range(len(lines) - num_overlaps + 1):
        out.append(comb.join(lines[ii:ii + num_overlaps]))
    return out


def read_alignments(fin):
    alignments = []
    with open(fin, 'rt', encoding="utf-8") as infile:
        for line in infile:
            fields = [x.strip() for x in line.split(':') if len(x.strip())]
            if len(fields) < 2:
                raise Exception('Got line "%s", which does not have at least two ":" separated fields' % line.strip())
            try:
                src = literal_eval(fields[0])
                tgt = literal_eval(fields[1])
            except:
                raise Exception('Failed to parse line "%s"' % line.strip())
            alignments.append((src, tgt))

    # I know bluealign files have a few entries missing,
    #   but I don't fix them in order to be consistent with previously reported scores
    return alignments


def print_alignments(alignments, scores=None, file=sys.stdout):
    if scores is not None:
        for (x, y), s in zip(alignments, scores):
            print('%s:%s:%.6f' % (x, y, s), file=file)
    else:
        for x, y in alignments:
            print('%s:%s' % (x, y), file=file)


class DeletionKnob(object):
    """
    A good deletion penalty is dependent on normalization, and probably language, domain, etc, etc
    I want a way to control deletion penalty that generalizes well...
    Sampling costs and using a percentile seems to work fairly well.
    """

    def __init__(self, samp, res_min, res_max):

        self.res_min = res_min
        self.res_max = res_max

        if self.res_min >= self.res_max:
            logger.warning('res_max <= res_min, increasing it')
            self.res_max = self.res_min + 1e-4

        num_bins = 1000
        num_pts = 30

        self.hist, self.bin_edges = np.histogram(samp, bins=num_bins,
                                                 range=[self.res_min, self.res_max],
                                                 density=True)

        dx = self.bin_edges[1] - self.bin_edges[0]
        self.cdf = np.cumsum(self.hist) * dx

        interp_points = [(0, self.res_min), ]
        for knob_val in np.linspace(0, 1, num_pts - 1)[1:-1]:
            cdf_idx = np.searchsorted(self.cdf, knob_val)
            cdf_val = self.res_min + cdf_idx / float(num_bins) * (self.res_max - self.res_min)
            interp_points.append((knob_val, cdf_val))
        interp_points.append((1, self.res_max))
        self.x, self.y = zip(*interp_points)

    def percentile_frac_to_del_penalty(self, knob_val):
        del_pen = np.interp([knob_val], self.x, self.y)[0]
        return del_pen


def make_alignment_types(max_alignment_size):
    # return list of all (n,m) where n+m <= this
    alignment_types = []
    for x in range(1, max_alignment_size):
        for y in range(1, max_alignment_size):
            if x + y <= max_alignment_size:
                alignment_types.append((x, y))
    return alignment_types


def ab2xy_w_offset(aa, bb_idx, bb_offset):
    bb_from_side = bb_idx + bb_offset[aa]
    xx = aa - bb_from_side
    yy = bb_from_side
    return (xx, yy)


def xy2ab_w_offset(xx, yy, bb_offset):
    aa = xx + yy
    bb_from_side = yy
    bb = bb_from_side - bb_offset[aa]
    return aa, bb


def process_scores(scores, alignments):
    # floating point sometimes gives negative numbers, which is a little unnerving ...
    scores = np.clip(scores, a_min=0, a_max=None)

    for ii, (x_algn, y_algn) in enumerate(alignments):
        # deletion penalty is pretty arbitrary, just report 0
        if len(x_algn) == 0 or len(y_algn) == 0:
            scores[ii] = 0.0
        # report scores un-normalized by alignment sizes
        #   (still normalized with random vectors, though)
        else:
            scores[ii] = scores[ii] / len(x_algn) / len(y_algn)

    return scores


def sparse_traceback(a_b_csum, a_b_xp, a_b_yp, b_offset, xsize, ysize):
    alignments = []
    xx = xsize
    yy = ysize

    cum_costs = []

    while True:
        aa, bb = xy2ab_w_offset(xx, yy, b_offset)

        cum_costs.append(a_b_csum[aa, bb])

        xp = a_b_xp[aa, bb]
        yp = a_b_yp[aa, bb]

        if xx == yy == 0:
            break

        if xx < 0 or yy < 0:
            raise Exception('traceback bug')

        x_side = list(range(xx - xp, xx))
        y_side = list(range(yy - yp, yy))
        alignments.append((x_side, y_side))

        xx = xx - xp
        yy = yy - yp

    alignments.reverse()
    cum_costs.reverse()
    costs = np.array(cum_costs[1:]) - np.array(cum_costs[:-1])
    # "costs" are scaled by x_alignment_size * y_alignment_size
    #   and the cost of a deletion is del_penalty
    # "scores": 0 for deletion/insertion,
    #   and cosine distance, *not* scaled
    #   by len(x_alignment)*len(y_alignment)
    scores = process_scores(scores=costs, alignments=alignments)

    return alignments, scores


def dense_traceback(x_y_tb):
    xsize, ysize = x_y_tb.shape

    xx = xsize - 1
    yy = ysize - 1

    alignments = []
    while True:
        if xx == yy == 0:
            break
        bp = x_y_tb[xx, yy]
        if bp == 0:
            xp, yp = 1, 1
            alignments.append(([xx - 1], [yy - 1]))
        elif bp == 1:
            xp, yp = 0, 1
            alignments.append(([], [yy - 1]))
        elif bp == 2:
            xp, yp = 1, 0
            alignments.append(([xx - 1], []))
        else:
            raise Exception('got unknown value')

        xx = xx - xp
        yy = yy - yp

    alignments.reverse()

    return alignments


def append_slant(path, xwidth, ywidth):
    """
    Append quantized approximation to a straight line
    from current x,y to a point at (x+xwidth, y+ywidth)
    """
    NN = xwidth + ywidth
    xstart, ystart = path[-1]
    for ii in range(1, NN + 1):
        x = xstart + round(xwidth * ii / NN)
        y = ystart + round(ywidth * ii / NN)
        # In the case of ties we want them to round differently,
        #   so explicitly make sure we take a step of 1, not 0 or 2
        lastx, lasty = path[-1]
        delta = x + y - lastx - lasty
        if delta == 1:
            path.append((x, y))
        elif delta == 2:
            path.append((x - 1, y))
        elif delta == 0:
            path.append((x + 1, y))


def alignment_to_search_path(algn):
    """
    Given an alignment, make searchpath.
    Searchpath must step exactly one position in x XOR y at each time step.

    In the case of a block of deletions, the order found by DP is not meaningful.
    To make things consistent and to improve the probability of recovering
    from search errors, we search an approximately straight line
    through a block of deletions. We do the same through a many-many
    alignment, even though we currently don't refine a many-many alignment...
    """
    path = [(0, 0), ]
    xdel, ydel = 0, 0
    ydel = 0
    for x, y in algn:
        if len(x) and len(y):
            append_slant(path, xdel, ydel)
            xdel, ydel = 0, 0
            append_slant(path, len(x), len(y))
        elif len(x):
            xdel += len(x)
        elif len(y):
            ydel += len(y)

    append_slant(path, xdel, ydel)

    return path


def extend_alignments(course_alignments, size0, size1):
    """
    extend alignments to include new endpoints size0, size1
    if alignments are larger than size0/size1, raise exception
    """
    # could be a string of deletions or insertions at end, so cannot just grab last one
    xmax = 0  # maximum x value in course_alignments
    ymax = 0  # maximum y value in course_alignments
    for x, y in course_alignments:
        for xval in x:
            xmax = max(xmax, xval)
        for yval in y:
            ymax = max(ymax, yval)

    if xmax > size0 or ymax > size1:
        raise Exception('asked to extend alignments but already bigger than requested')

    # do not duplicate xmax/ymax, do include size0/size1
    extra_x = list(range(xmax + 1, size0 + 1))
    extra_y = list(range(ymax + 1, size1 + 1))

    logger.debug('extending alignments in x by %d and y by %d', len(extra_x), len(extra_y))

    if len(extra_x) == 0:
        for yval in extra_y:
            course_alignments.append(([], [yval]))
    elif len(extra_y) == 0:
        for xval in extra_x:
            course_alignments.append(([xval], []))
    else:
        course_alignments.append((extra_x, extra_y))


def upsample_alignment(algn):
    def upsample_one_alignment(xx):
        return list(range(min(xx) * 2, (max(xx) + 1) * 2))

    new_algn = []
    for xx, yy in algn:
        if len(xx) == 0:
            for yyy in upsample_one_alignment(yy):
                new_algn.append(([], [yyy]))
        elif len(yy) == 0:
            for xxx in upsample_one_alignment(xx):
                new_algn.append(([xxx], []))
        else:
            new_algn.append((upsample_one_alignment(xx), upsample_one_alignment(yy)))
    return new_algn


def make_del_knob(e_laser,
                  f_laser,
                  e_laser_norms,
                  f_laser_norms,
                  sample_size):
    e_size = e_laser.shape[0]
    f_size = f_laser.shape[0]

    if e_size > 0 and f_size > 0 and sample_size > 0:

        if e_size * f_size < sample_size:
            # dont sample, just compute full matrix
            sample_size = e_size * f_size
            x_idxs = np.zeros(sample_size, dtype=np.int32)
            y_idxs = np.zeros(sample_size, dtype=np.int32)
            c = 0
            for ii in range(e_size):
                for jj in range(f_size):
                    x_idxs[c] = ii
                    y_idxs[c] = jj
                    c += 1
        else:
            # get random samples
            x_idxs = np.random.choice(range(e_size), size=sample_size, replace=True).astype(np.int32)
            y_idxs = np.random.choice(range(f_size), size=sample_size, replace=True).astype(np.int32)

        # output
        random_scores = np.empty(sample_size, dtype=np.float32)

        score_path(x_idxs, y_idxs,
                   e_laser_norms, f_laser_norms,
                   e_laser, f_laser,
                   random_scores, )
|
443 |
+
|
444 |
+
min_score = 0
|
445 |
+
max_score = max(random_scores) # could bump this up... but its probably fine
|
446 |
+
|
447 |
+
else:
|
448 |
+
# Not much we can do here...
|
449 |
+
random_scores = np.array([0.0, 0.5, 1.0]) # ???
|
450 |
+
min_score = 0
|
451 |
+
max_score = 1 # ????
|
452 |
+
|
453 |
+
del_knob = DeletionKnob(random_scores, min_score, max_score)
|
454 |
+
|
455 |
+
return del_knob
|
456 |
+
|
457 |
+
|
458 |
+
def compute_norms(vecs0, vecs1, num_samples, overlaps_to_use=None):
|
459 |
+
# overlaps_to_use = 10 # 10 matches before
|
460 |
+
|
461 |
+
overlaps1, size1, dim = vecs1.shape
|
462 |
+
overlaps0, size0, dim0 = vecs0.shape
|
463 |
+
assert (dim == dim0)
|
464 |
+
|
465 |
+
if overlaps_to_use is not None:
|
466 |
+
if overlaps_to_use > overlaps1:
|
467 |
+
raise Exception('Cannot use more overlaps than provided. You may want to re-run make_verlaps.py with a larger -n value')
|
468 |
+
else:
|
469 |
+
overlaps_to_use = overlaps1
|
470 |
+
|
471 |
+
samps_per_overlap = ceil(num_samples / overlaps_to_use)
|
472 |
+
|
473 |
+
if size1 and samps_per_overlap:
|
474 |
+
# sample other size (from all overlaps) to compre to this side
|
475 |
+
vecs1_rand_sample = np.empty((samps_per_overlap * overlaps_to_use, dim), dtype=np.float32)
|
476 |
+
for overlap_ii in range(overlaps_to_use):
|
477 |
+
idxs = np.random.choice(range(size1), size=samps_per_overlap, replace=True)
|
478 |
+
random_vecs = vecs1[overlap_ii, idxs, :]
|
479 |
+
vecs1_rand_sample[overlap_ii * samps_per_overlap:(overlap_ii + 1) * samps_per_overlap, :] = random_vecs
|
480 |
+
|
481 |
+
norms0 = np.empty((overlaps0, size0), dtype=np.float32)
|
482 |
+
for overlap_ii in range(overlaps0):
|
483 |
+
e_laser = vecs0[overlap_ii, :, :]
|
484 |
+
sim = np.matmul(e_laser, vecs1_rand_sample.T)
|
485 |
+
norms0[overlap_ii, :] = 1.0 - sim.mean(axis=1)
|
486 |
+
|
487 |
+
else: # no samples, no normalization
|
488 |
+
norms0 = np.ones((overlaps0, size0)).astype(np.float32)
|
489 |
+
|
490 |
+
return norms0
|
491 |
+
|
492 |
+
|
493 |
+
def downsample_vectors(vecs1):
|
494 |
+
a, b, c = vecs1.shape
|
495 |
+
half = np.empty((a, b // 2, c), dtype=np.float32)
|
496 |
+
for ii in range(a):
|
497 |
+
# average consecutive vectors
|
498 |
+
for jj in range(0, b - b % 2, 2):
|
499 |
+
v1 = vecs1[ii, jj, :]
|
500 |
+
v2 = vecs1[ii, jj + 1, :]
|
501 |
+
half[ii, jj // 2, :] = v1 + v2
|
502 |
+
# compute mean for all vectors
|
503 |
+
mean = np.mean(half[ii, :, :], axis=0)
|
504 |
+
for jj in range(0, b - b % 2, 2):
|
505 |
+
# remove mean
|
506 |
+
half[ii, jj // 2, :] = half[ii, jj // 2, :] - mean
|
507 |
+
# make vectors norm==1 so dot product is cosine distance
|
508 |
+
make_norm1(half)
|
509 |
+
return half
|
510 |
+
|
511 |
+
|
512 |
+
def vecalign(vecs0,
|
513 |
+
vecs1,
|
514 |
+
final_alignment_types,
|
515 |
+
del_percentile_frac,
|
516 |
+
width_over2,
|
517 |
+
max_size_full_dp,
|
518 |
+
costs_sample_size,
|
519 |
+
num_samps_for_norm,
|
520 |
+
norms0=None,
|
521 |
+
norms1=None):
|
522 |
+
if width_over2 < 3:
|
523 |
+
logger.warning('width_over2 was set to %d, which does not make sense. increasing to 3.', width_over2)
|
524 |
+
width_over2 = 3
|
525 |
+
|
526 |
+
# make sure input embeddings are norm==1
|
527 |
+
make_norm1(vecs0)
|
528 |
+
make_norm1(vecs1)
|
529 |
+
|
530 |
+
# save off runtime stats for summary
|
531 |
+
runtimes = OrderedDict()
|
532 |
+
|
533 |
+
# Determine stack depth
|
534 |
+
s0, s1 = vecs0.shape[1], vecs1.shape[1]
|
535 |
+
max_depth = 0
|
536 |
+
while s0 * s1 > max_size_full_dp ** 2:
|
537 |
+
max_depth += 1
|
538 |
+
s0 = s0 // 2
|
539 |
+
s1 = s1 // 2
|
540 |
+
|
541 |
+
# init recursion stack
|
542 |
+
# depth is 0-based (full size is 0, 1 is half, 2 is quarter, etc)
|
543 |
+
stack = {0: {'v0': vecs0, 'v1': vecs1}}
|
544 |
+
|
545 |
+
# downsample sentence vectors
|
546 |
+
t0 = time()
|
547 |
+
for depth in range(1, max_depth + 1):
|
548 |
+
stack[depth] = {'v0': downsample_vectors(stack[depth - 1]['v0']),
|
549 |
+
'v1': downsample_vectors(stack[depth - 1]['v1'])}
|
550 |
+
runtimes['Downsample embeddings'] = time() - t0
|
551 |
+
|
552 |
+
# compute norms for all depths, add sizes, add alignment types
|
553 |
+
t0 = time()
|
554 |
+
for depth in stack:
|
555 |
+
stack[depth]['size0'] = stack[depth]['v0'].shape[1]
|
556 |
+
stack[depth]['size1'] = stack[depth]['v1'].shape[1]
|
557 |
+
stack[depth]['alignment_types'] = final_alignment_types if depth == 0 else [(1, 1)]
|
558 |
+
|
559 |
+
if depth == 0 and norms0 is not None:
|
560 |
+
if norms0.shape != vecs0.shape[:2]:
|
561 |
+
print('norms0.shape:', norms0.shape)
|
562 |
+
print('vecs0.shape[:2]:', vecs0.shape[:2])
|
563 |
+
raise Exception('norms0 wrong shape')
|
564 |
+
stack[depth]['n0'] = norms0
|
565 |
+
else:
|
566 |
+
stack[depth]['n0'] = compute_norms(stack[depth]['v0'], stack[depth]['v1'], num_samps_for_norm)
|
567 |
+
|
568 |
+
if depth == 0 and norms1 is not None:
|
569 |
+
if norms1.shape != vecs1.shape[:2]:
|
570 |
+
print('norms1.shape:', norms1.shape)
|
571 |
+
print('vecs1.shape[:2]:', vecs1.shape[:2])
|
572 |
+
raise Exception('norms1 wrong shape')
|
573 |
+
stack[depth]['n1'] = norms1
|
574 |
+
else:
|
575 |
+
stack[depth]['n1'] = compute_norms(stack[depth]['v1'], stack[depth]['v0'], num_samps_for_norm)
|
576 |
+
|
577 |
+
runtimes['Normalize embeddings'] = time() - t0
|
578 |
+
|
579 |
+
# Compute deletion penalty for all depths
|
580 |
+
t0 = time()
|
581 |
+
for depth in stack:
|
582 |
+
stack[depth]['del_knob'] = make_del_knob(e_laser=stack[depth]['v0'][0, :, :],
|
583 |
+
f_laser=stack[depth]['v1'][0, :, :],
|
584 |
+
e_laser_norms=stack[depth]['n0'][0, :],
|
585 |
+
f_laser_norms=stack[depth]['n1'][0, :],
|
586 |
+
sample_size=costs_sample_size)
|
587 |
+
stack[depth]['del_penalty'] = stack[depth]['del_knob'].percentile_frac_to_del_penalty(del_percentile_frac)
|
588 |
+
logger.debug('del_penalty at depth %d: %f', depth, stack[depth]['del_penalty'])
|
589 |
+
runtimes['Compute deletion penalties'] = time() - t0
|
590 |
+
tt = time() - t0
|
591 |
+
logger.debug('%d x %d full DP make features: %.6fs (%.3e per dot product)',
|
592 |
+
stack[max_depth]['size0'], stack[max_depth]['size1'], tt,
|
593 |
+
tt / (stack[max_depth]['size0'] + 1e-6) / (stack[max_depth]['size1'] + 1e-6))
|
594 |
+
# full DP at maximum recursion depth
|
595 |
+
t0 = time()
|
596 |
+
stack[max_depth]['costs_1to1'] = make_dense_costs(stack[max_depth]['v0'],
|
597 |
+
stack[max_depth]['v1'],
|
598 |
+
stack[max_depth]['n0'],
|
599 |
+
stack[max_depth]['n1'])
|
600 |
+
|
601 |
+
runtimes['Full DP make features'] = time() - t0
|
602 |
+
t0 = time()
|
603 |
+
_, stack[max_depth]['x_y_tb'] = dense_dp(stack[max_depth]['costs_1to1'], stack[max_depth]['del_penalty'])
|
604 |
+
stack[max_depth]['alignments'] = dense_traceback(stack[max_depth]['x_y_tb'])
|
605 |
+
runtimes['Full DP'] = time() - t0
|
606 |
+
|
607 |
+
# upsample the path up to the top resolution
|
608 |
+
compute_costs_times = []
|
609 |
+
dp_times = []
|
610 |
+
upsample_depths = [0, ] if max_depth == 0 else list(reversed(range(0, max_depth)))
|
611 |
+
for depth in upsample_depths:
|
612 |
+
if max_depth > 0: # upsample previoius alignment to current resolution
|
613 |
+
course_alignments = upsample_alignment(stack[depth + 1]['alignments'])
|
614 |
+
# features may have been truncated when downsampleing, so alignment may need extended
|
615 |
+
extend_alignments(course_alignments, stack[depth]['size0'], stack[depth]['size1']) # in-place
|
616 |
+
else: # We did a full size 1-1 search, so search same size with more alignment types
|
617 |
+
course_alignments = stack[0]['alignments']
|
618 |
+
|
619 |
+
# convert couse alignments to a searchpath
|
620 |
+
stack[depth]['searchpath'] = alignment_to_search_path(course_alignments)
|
621 |
+
|
622 |
+
# compute ccosts for sparse DP
|
623 |
+
t0 = time()
|
624 |
+
stack[depth]['a_b_costs'], stack[depth]['b_offset'] = make_sparse_costs(stack[depth]['v0'], stack[depth]['v1'],
|
625 |
+
stack[depth]['n0'], stack[depth]['n1'],
|
626 |
+
stack[depth]['searchpath'],
|
627 |
+
stack[depth]['alignment_types'],
|
628 |
+
width_over2)
|
629 |
+
|
630 |
+
tt = time() - t0
|
631 |
+
num_dot_products = len(stack[depth]['b_offset']) * len(stack[depth]['alignment_types']) * width_over2 * 2
|
632 |
+
logger.debug('%d x %d sparse DP (%d alignment types, %d window) make features: %.6fs (%.3e per dot product)',
|
633 |
+
stack[max_depth]['size0'], stack[max_depth]['size1'],
|
634 |
+
len(stack[depth]['alignment_types']), width_over2 * 2,
|
635 |
+
tt, tt / (num_dot_products + 1e6))
|
636 |
+
|
637 |
+
compute_costs_times.append(time() - t0)
|
638 |
+
t0 = time()
|
639 |
+
# perform sparse DP
|
640 |
+
stack[depth]['a_b_csum'], stack[depth]['a_b_xp'], stack[depth]['a_b_yp'], \
|
641 |
+
stack[depth]['new_b_offset'] = sparse_dp(stack[depth]['a_b_costs'], stack[depth]['b_offset'],
|
642 |
+
stack[depth]['alignment_types'], stack[depth]['del_penalty'],
|
643 |
+
stack[depth]['size0'], stack[depth]['size1'])
|
644 |
+
|
645 |
+
# performace traceback to get alignments and alignment scores
|
646 |
+
# for debugging, avoid overwriting stack[depth]['alignments']
|
647 |
+
akey = 'final_alignments' if depth == 0 else 'alignments'
|
648 |
+
stack[depth][akey], stack[depth]['alignment_scores'] = sparse_traceback(stack[depth]['a_b_csum'],
|
649 |
+
stack[depth]['a_b_xp'],
|
650 |
+
stack[depth]['a_b_yp'],
|
651 |
+
stack[depth]['new_b_offset'],
|
652 |
+
stack[depth]['size0'],
|
653 |
+
stack[depth]['size1'])
|
654 |
+
dp_times.append(time() - t0)
|
655 |
+
|
656 |
+
runtimes['Upsample DP compute costs'] = sum(compute_costs_times[:-1])
|
657 |
+
runtimes['Upsample DP'] = sum(dp_times[:-1])
|
658 |
+
|
659 |
+
runtimes['Final DP compute costs'] = compute_costs_times[-1]
|
660 |
+
runtimes['Final DP'] = dp_times[-1]
|
661 |
+
|
662 |
+
# log time stats
|
663 |
+
max_key_str_len = max([len(key) for key in runtimes])
|
664 |
+
for key in runtimes:
|
665 |
+
if runtimes[key] > 5e-5:
|
666 |
+
logger.info(key + ' took ' + '.' * (max_key_str_len + 5 - len(key)) + ('%.4fs' % runtimes[key]).rjust(7))
|
667 |
+
|
668 |
+
return stack
|
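The search-path construction above is easiest to see on a toy input. The sketch below is illustrative only: it assumes dp_utils.py is importable from the working directory, and the coarse alignment and expected output are invented values rather than data from this repository.

from dp_utils import alignment_to_search_path

# a 1-1 link followed by a 2-1 link over 3 source and 2 target segments
coarse = [([0], [0]), ([1, 2], [1])]
path = alignment_to_search_path(coarse)
# should give a monotone path such as
#   [(0, 0), (1, 0), (1, 1), (2, 1), (2, 2), (3, 2)]
# i.e. every step moves exactly one position in x or in y,
# which is the shape the sparse DP band is built around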
tibetan-aligner/get_vectors.py
ADDED
@@ -0,0 +1,35 @@
import sys

from sentence_transformers import SentenceTransformer
import numpy as np

filename = sys.argv[1]
number_of_overlays = int(sys.argv[2]) + 1 # +1 because we want to include the original sentence

def process_file(filename):
    model_path = "buddhist-nlp/bod-eng-similarity"
    model = SentenceTransformer(model_path)

    model.max_seq_length = 500
    file = open(filename,'r')

    sentences = [line.rstrip('\n').strip() for line in file]
    sentences_overlay = []

    for x in range(len(sentences)):
        val = number_of_overlays
        if (len(sentences) - x) < val:
            val = (len(sentences) - x) + 1
        for i in range(1,val):
            sentences_overlay.append(' '.join(sentences[x:x+i]))
    overlay_string = "\n".join(sentences_overlay)
    vectors = np.array(model.encode(sentences_overlay,show_progress_bar=False))
    print("LEN SENTENCES",len(sentences_overlay))
    print("LEN VECTORS",len(vectors))
    with open(sys.argv[1] + "_overlay", "w") as text_file:
        text_file.write(overlay_string)

    np.save(sys.argv[1] + "_vectors",vectors)

process_file(filename)
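get_vectors.py produces the "_overlay" text file and the matching "_vectors" matrix that the aligner consumes: for each sentence it also encodes the concatenation of up to N consecutive sentences, so that 1-many and many-1 links can be scored later. A minimal sketch of just the expansion step (the function name build_overlays is hypothetical; the loop mirrors the script above):

def build_overlays(sentences, num_overlays):
    # same logic as get_vectors.py, where number_of_overlays = int(sys.argv[2]) + 1
    number_of_overlays = num_overlays + 1
    overlays = []
    for x in range(len(sentences)):
        val = number_of_overlays
        if (len(sentences) - x) < val:
            val = (len(sentences) - x) + 1
        for i in range(1, val):
            overlays.append(" ".join(sentences[x:x + i]))
    return overlays

# build_overlays(["sent A", "sent B", "sent C"], 2)
# -> ['sent A', 'sent A sent B', 'sent B', 'sent B sent C', 'sent C']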
tibetan-aligner/ladder
ADDED
@@ -0,0 +1,11 @@
LINE EMBEDDINGS SHAPE (15, 768)
LINE EMBEDDINGS SHAPE (87, 768)
[0]:[0]:0.264225
[1, 2]:[1]:0.354184
[]:[2]:0.000000
[]:[3]:0.000000
[]:[4]:0.000000
[]:[5]:0.000000
[3]:[6, 7, 8, 9, 10]:0.404515
[]:[11]:0.000000
[4]:[12, 13, 14, 15, 16]:0.280724
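The ladder file above is a sample of the aligner's raw output: the first two lines log the shapes of the source and target overlap embeddings, and every following line has the form [source segment indices]:[target segment indices]:score, with an empty list marking an insertion or deletion. A minimal parsing sketch (ladder2org.py below does the same job with extra cleanup):

import re

def parse_ladder_line(line):
    """Turn a line like '[1, 2]:[1]:0.354184' into ([1, 2], [1], 0.354184)."""
    src, tgt, score = line.strip().split(":")
    src_ids = [int(n) for n in re.findall(r"\d+", src)]
    tgt_ids = [int(n) for n in re.findall(r"\d+", tgt)]
    return src_ids, tgt_ids, float(score)

# parse_ladder_line("[3]:[6, 7, 8, 9, 10]:0.404515")
# -> ([3], [6, 7, 8, 9, 10], 0.404515)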
tibetan-aligner/ladder2org.py
ADDED
@@ -0,0 +1,47 @@
import sys
import re
import re
f1 = open(sys.argv[1],'r')
f2 = open(sys.argv[2],'r')
ladder_file = open(sys.argv[3],'r')


output = ""
ladder = []
sktfile = [line.rstrip('\n').strip() for line in f1]
tibfile = [line.rstrip('\n').strip() for line in f2]
last_score = 0.5

def clean_num(string):
    string = re.sub("[^0-9, ]","",string)
    return int(string.split(',')[0])


for line in ladder_file:
    if len(line.split(':')) == 3:
        skt,tib,score = line.split(':')
        if re.search("[0-9]",skt) and re.search("[0-9]",tib):
            skt_num = clean_num(skt)
            tib_num = clean_num(tib)
            score = float(score)
            if score > 0.0:
                ladder.append([skt_num,tib_num,score])
last_skt = 0
last_tib = 0
for entry in ladder:
    output = output + ' +$+ '.join(sktfile[last_skt:entry[0]]) + "\n"
    output = output + "# " + ' +!+ '.join(tibfile[last_tib:entry[1]]) + "\n" #+ "\t" + " SCORE: " + str(entry[2]) + "\n"
    last_skt = entry[0]
    last_tib = entry[1]
output = output + ' / '.join(sktfile[last_skt:-1]) + "\n"
output = output + "# " + ' / '.join(tibfile[last_tib:-1]) + "\n"

short_f1 = re.sub("\.tsv.*","",sys.argv[1])
short_f2 = re.sub(".*/","",sys.argv[2])
short_f2 = re.sub("\.tsv.*","",short_f2)

with open(short_f1 + "_" + short_f2 + ".org", 'w') as file:
    file.write(output)
tibetan-aligner/model_to_hub.py
ADDED
@@ -0,0 +1,7 @@
from transformers import AutoModelForSequenceClassification, AutoTokenizer
model_path = "model"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# push model and tokenizer to huggingface hub
model.push_to_hub("buddhist-nlp/bod-eng-similarity")
tokenizer.push_to_hub("buddhist-nlp/bod-eng-similarity")
tibetan-aligner/requirements.txt
ADDED
@@ -0,0 +1,3 @@
sentence-transformers==2.2.2
pyewts==0.2.0
Cython==0.29.34
tibetan-aligner/score.py
ADDED
@@ -0,0 +1,170 @@
#!/usr/bin/env python3

"""
Copyright 2019 Brian Thompson

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

"""

import argparse
import sys
from collections import defaultdict

import numpy as np

from dp_utils import read_alignments

"""
Faster implementation of lax and strict precision and recall, based on
    https://www.aclweb.org/anthology/W11-4624/.

"""


def _precision(goldalign, testalign):
    """
    Computes tpstrict, fpstrict, tplax, fplax for gold/test alignments
    """
    tpstrict = 0  # true positive strict counter
    tplax = 0     # true positive lax counter
    fpstrict = 0  # false positive strict counter
    fplax = 0     # false positive lax counter

    # convert to sets, remove alignments empty on both sides
    testalign = set([(tuple(x), tuple(y)) for x, y in testalign if len(x) or len(y)])
    goldalign = set([(tuple(x), tuple(y)) for x, y in goldalign if len(x) or len(y)])

    # mappings from source test sentence idxs to
    #    target gold sentence idxs for which the source test sentence
    #    was found in corresponding source gold alignment
    src_id_to_gold_tgt_ids = defaultdict(set)
    for gold_src, gold_tgt in goldalign:
        for gold_src_id in gold_src:
            for gold_tgt_id in gold_tgt:
                src_id_to_gold_tgt_ids[gold_src_id].add(gold_tgt_id)

    for (test_src, test_target) in testalign:
        if (test_src, test_target) == ((), ()):
            continue
        if (test_src, test_target) in goldalign:
            # strict match
            tpstrict += 1
            tplax += 1
        else:
            # For anything with partial gold/test overlap on the source,
            #   see if there is also partial overlap on the gold/test target
            #   If so, its a lax match
            target_ids = set()
            for src_test_id in test_src:
                for tgt_id in src_id_to_gold_tgt_ids[src_test_id]:
                    target_ids.add(tgt_id)
            if set(test_target).intersection(target_ids):
                fpstrict += 1
                tplax += 1
            else:
                fpstrict += 1
                fplax += 1

    return np.array([tpstrict, fpstrict, tplax, fplax], dtype=np.int32)


def score_multiple(gold_list, test_list, value_for_div_by_0=0.0):
    # accumulate counts for all gold/test files
    pcounts = np.array([0, 0, 0, 0], dtype=np.int32)
    rcounts = np.array([0, 0, 0, 0], dtype=np.int32)
    for goldalign, testalign in zip(gold_list, test_list):
        pcounts += _precision(goldalign=goldalign, testalign=testalign)
        # recall is precision with no insertion/deletion and swap args
        test_no_del = [(x, y) for x, y in testalign if len(x) and len(y)]
        gold_no_del = [(x, y) for x, y in goldalign if len(x) and len(y)]
        rcounts += _precision(goldalign=test_no_del, testalign=gold_no_del)

    # Compute results
    # pcounts: tpstrict,fnstrict,tplax,fnlax
    # rcounts: tpstrict,fpstrict,tplax,fplax

    if pcounts[0] + pcounts[1] == 0:
        pstrict = value_for_div_by_0
    else:
        pstrict = pcounts[0] / float(pcounts[0] + pcounts[1])

    if pcounts[2] + pcounts[3] == 0:
        plax = value_for_div_by_0
    else:
        plax = pcounts[2] / float(pcounts[2] + pcounts[3])

    if rcounts[0] + rcounts[1] == 0:
        rstrict = value_for_div_by_0
    else:
        rstrict = rcounts[0] / float(rcounts[0] + rcounts[1])

    if rcounts[2] + rcounts[3] == 0:
        rlax = value_for_div_by_0
    else:
        rlax = rcounts[2] / float(rcounts[2] + rcounts[3])

    if (pstrict + rstrict) == 0:
        fstrict = value_for_div_by_0
    else:
        fstrict = 2 * (pstrict * rstrict) / (pstrict + rstrict)

    if (plax + rlax) == 0:
        flax = value_for_div_by_0
    else:
        flax = 2 * (plax * rlax) / (plax + rlax)

    result = dict(recall_strict=rstrict,
                  recall_lax=rlax,
                  precision_strict=pstrict,
                  precision_lax=plax,
                  f1_strict=fstrict,
                  f1_lax=flax)

    return result


def log_final_scores(res):
    print(' ---------------------------------', file=sys.stderr)
    print('|             |  Strict |    Lax  |', file=sys.stderr)
    print('| Precision   |   {precision_strict:.3f} |   {precision_lax:.3f} |'.format(**res), file=sys.stderr)
    print('| Recall      |   {recall_strict:.3f} |   {recall_lax:.3f} |'.format(**res), file=sys.stderr)
    print('| F1          |   {f1_strict:.3f} |   {f1_lax:.3f} |'.format(**res), file=sys.stderr)
    print(' ---------------------------------', file=sys.stderr)


def main():
    parser = argparse.ArgumentParser(
        'Compute strict/lax precision and recall for one or more pairs of gold/test alignments',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-t', '--test', type=str, nargs='+', required=True,
                        help='one or more test alignment files')

    parser.add_argument('-g', '--gold', type=str, nargs='+', required=True,
                        help='one or more gold alignment files')

    args = parser.parse_args()

    if len(args.test) != len(args.gold):
        raise Exception('number of gold/test files must be the same')

    gold_list = [read_alignments(x) for x in args.gold]
    test_list = [read_alignments(x) for x in args.test]

    res = score_multiple(gold_list=gold_list, test_list=test_list)
    log_final_scores(res)


if __name__ == '__main__':
    main()
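score.py computes the strict/lax precision, recall and F1 described in its header: strict scoring only credits exact gold/test matches, while lax scoring also credits partial overlaps. A small illustrative call (the toy gold/test alignments below are invented for this example):

from score import score_multiple, log_final_scores

gold = [([0], [0]), ([1, 2], [1])]          # gold merges source segments 1 and 2
test = [([0], [0]), ([1], [1]), ([2], [])]  # system splits them instead
res = score_multiple(gold_list=[gold], test_list=[test])
log_final_scores(res)
# the split links overlap the gold 2-1 link, so lax precision
# should come out higher than strict precision (roughly 2/3 vs 1/3 here)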
tibetan-aligner/vecalign.py
ADDED
@@ -0,0 +1,148 @@
#!/usr/bin/env python3

"""
Copyright 2019 Brian Thompson

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import argparse
import logging
import pickle
from math import ceil
from random import seed as seed

import numpy as np

logger = logging.getLogger('vecalign')
logger.setLevel(logging.WARNING)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-5.5s %(message)s")
consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(logFormatter)
logger.addHandler(consoleHandler)

from dp_utils import make_alignment_types, print_alignments, read_alignments, \
    read_in_embeddings, make_doc_embedding, vecalign

from score import score_multiple, log_final_scores


def _main():
    # make runs consistent
    seed(42)
    np.random.seed(42)

    parser = argparse.ArgumentParser('Sentence alignment using sentence embeddings and FastDTW',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-s', '--src', type=str, nargs='+', required=True,
                        help='preprocessed source file to align')

    parser.add_argument('-t', '--tgt', type=str, nargs='+', required=True,
                        help='preprocessed target file to align')

    parser.add_argument('-g', '--gold_alignment', type=str, nargs='+', required=False,
                        help='preprocessed target file to align')

    parser.add_argument('--src_embed', type=str, nargs=2, required=True,
                        help='Source embeddings. Requires two arguments: first is a text file, sencond is a binary embeddings file. ')

    parser.add_argument('--tgt_embed', type=str, nargs=2, required=True,
                        help='Target embeddings. Requires two arguments: first is a text file, sencond is a binary embeddings file. ')

    parser.add_argument('-a', '--alignment_max_size', type=int, default=4,
                        help='Searches for alignments up to size N-M, where N+M <= this value. Note that the the embeddings must support the requested number of overlaps')

    parser.add_argument('-d', '--del_percentile_frac', type=float, default=0.2,
                        help='Deletion penalty is set to this percentile (as a fraction) of the cost matrix distribution. Should be between 0 and 1.')

    parser.add_argument('-v', '--verbose', help='sets consle to logging.DEBUG instead of logging.WARN',
                        action='store_true')

    parser.add_argument('--max_size_full_dp', type=int, default=300,  # org: 300
                        help='Maximum size N for which is is acceptable to run full N^2 dynamic programming.')

    parser.add_argument('--costs_sample_size', type=int, default=20000,
                        help='Sample size to estimate costs distribution, used to set deletion penalty in conjunction with deletion_percentile.')

    parser.add_argument('--num_samps_for_norm', type=int, default=100,  # org 100
                        help='Number of samples used for normalizing embeddings')

    parser.add_argument('--search_buffer_size', type=int, default=5,
                        help='Width (one side) of search buffer. Larger values makes search more likely to recover from errors but increases runtime.')

    parser.add_argument('--debug_save_stack', type=str,
                        help='Write stack to pickle file for debug purposes')

    args = parser.parse_args()

    if len(args.src) != len(args.tgt):
        raise Exception('number of source files must match number of target files')

    if args.gold_alignment is not None:
        if len(args.gold_alignment) != len(args.src):
            raise Exception('number of gold alignment files, if provided, must match number of source and target files')

    if args.verbose:
        import logging
        logger.setLevel(logging.INFO)

    if args.alignment_max_size < 2:
        logger.warning('Alignment_max_size < 2. Increasing to 2 so that 1-1 alignments will be considered')
        args.alignment_max_size = 2

    src_sent2line, src_line_embeddings = read_in_embeddings(args.src_embed[0], args.src_embed[1])
    tgt_sent2line, tgt_line_embeddings = read_in_embeddings(args.tgt_embed[0], args.tgt_embed[1])

    width_over2 = ceil(args.alignment_max_size / 2.0) + args.search_buffer_size

    test_alignments = []
    stack_list = []
    for src_file, tgt_file in zip(args.src, args.tgt):
        logger.info('Aligning src="%s" to tgt="%s"', src_file, tgt_file)

        src_lines = open(src_file, 'rt', encoding="utf-8").readlines()
        vecs0 = make_doc_embedding(src_sent2line, src_line_embeddings, src_lines, args.alignment_max_size)

        tgt_lines = open(tgt_file, 'rt', encoding="utf-8").readlines()
        vecs1 = make_doc_embedding(tgt_sent2line, tgt_line_embeddings, tgt_lines, args.alignment_max_size)

        final_alignment_types = make_alignment_types(args.alignment_max_size)
        logger.debug('Considering alignment types %s', final_alignment_types)

        stack = vecalign(vecs0=vecs0,
                         vecs1=vecs1,
                         final_alignment_types=final_alignment_types,
                         del_percentile_frac=args.del_percentile_frac,
                         width_over2=width_over2,
                         max_size_full_dp=args.max_size_full_dp,
                         costs_sample_size=args.costs_sample_size,
                         num_samps_for_norm=args.num_samps_for_norm)

        # write final alignments to stdout
        print_alignments(stack[0]['final_alignments'], stack[0]['alignment_scores'])

        test_alignments.append(stack[0]['final_alignments'])
        stack_list.append(stack)

    if args.gold_alignment is not None:
        gold_list = [read_alignments(x) for x in args.gold_alignment]
        res = score_multiple(gold_list=gold_list, test_list=test_alignments)
        log_final_scores(res)

    if args.debug_save_stack:
        pickle.dump(stack_list, open(args.debug_save_stack, 'wb'))


if __name__ == '__main__':
    _main()
tm.py
ADDED
@@ -0,0 +1,169 @@
import json
import logging
import os
import subprocess
import sys
import tempfile
import time
from pathlib import Path
from typing import Dict

import requests

GITHUB_USERNAME = os.getenv("GITHUB_USERNAME")
GITHUB_ACCESS_TOKEN = os.getenv("GITHUB_TOKEN")
GITHUB_EMAIL = os.getenv("GITHUB_EMAIL")
GITHUB_ORG = os.getenv("MAI_GITHUB_ORG")
MAI_TM_PUBLISH_TODO_REPO = os.environ["MAI_TMS_PUBLISH_TODO_REPO"]
GITHUB_API_ENDPOINT = f"https://api.github.com/orgs/{GITHUB_ORG}/repos"

DEBUG = os.getenv("DEBUG", False)

quiet = "-q" if DEBUG else ""


def create_github_repo(repo_path: Path, repo_name: str):
    logging.info("[INFO] Creating GitHub repo...")

    # configure git users
    subprocess.run(f"git config --global user.name {GITHUB_USERNAME}".split())
    subprocess.run(f"git config --global user.email {GITHUB_EMAIL}".split())

    # Initialize a Git repository
    subprocess.run(f"git init {quiet}".split(), cwd=str(repo_path))

    # Commit the changes
    subprocess.run("git add . ".split(), cwd=str(repo_path))
    subprocess.run(
        f"git commit {quiet} -m".split() + ["Initial commit"], cwd=str(repo_path)
    )

    # Create a new repository on GitHub
    response = requests.post(
        GITHUB_API_ENDPOINT,
        json={
            "name": repo_name,
            "private": True,
        },
        auth=(GITHUB_USERNAME, GITHUB_ACCESS_TOKEN),
    )
    response.raise_for_status()

    time.sleep(3)

    # Add the GitHub remote to the local Git repository and push the changes
    remote_url = f"https://{GITHUB_ORG}:{GITHUB_ACCESS_TOKEN}@github.com/{GITHUB_ORG}/{repo_name}.git"
    subprocess.run(
        f"git remote add origin {remote_url}", cwd=str(repo_path), shell=True
    )
    # rename default branch to main
    subprocess.run("git branch -M main".split(), cwd=str(repo_path))
    subprocess.run(f"git push {quiet} -u origin main".split(), cwd=str(repo_path))

    return response.json()["html_url"]


def convert_raw_align_to_tm(align_fn: Path, tm_path: Path):
    if DEBUG:
        logging.debug("[INFO] Conerting raw alignment to TM repo...")

    def load_alignment(fn: Path):
        content = fn.read_text()
        if not content:
            return []

        for seg_pair in content.splitlines():
            if not seg_pair:
                continue

            if "\t" in seg_pair:
                try:
                    bo_seg, en_seg = seg_pair.split("\t", 1)
                except Exception as e:
                    logging.error(f"{e} in {fn}")
                    raise

            else:
                bo_seg = seg_pair
                en_seg = "\n"
            yield bo_seg, en_seg

    text_bo_fn = tm_path / f"{tm_path.name}-bo.txt"
    text_en_fn = tm_path / f"{tm_path.name}-en.txt"

    with open(text_bo_fn, "w", encoding="utf-8") as bo_file, open(
        text_en_fn, "w", encoding="utf-8"
    ) as en_file:
        for bo_seg, en_seg in load_alignment(align_fn):
            bo_file.write(bo_seg + "\n")
            en_file.write(en_seg + "\n")

    return tm_path


def get_github_dev_url(raw_github_url: str) -> str:
    base_url = "https://github.dev"
    _, file_path = raw_github_url.split(".com")
    blob_file_path = file_path.replace("main", "blob/main")
    return base_url + blob_file_path


def add_input_in_readme(input_dict: Dict[str, str], path: Path) -> Path:
    input_readme_fn = path / "README.md"
    text_id = input_dict["text_id"]
    bo_file_url = get_github_dev_url(input_dict["bo_file_url"])
    en_file_url = get_github_dev_url(input_dict["en_file_url"])
    input_string = "## Input\n- [BO{}]({})\n- [EN{}]({})".format(
        text_id, bo_file_url, text_id, en_file_url
    )

    input_readme_fn.write_text(input_string)

    return path

def add_to_publish_todo_repo(org, repo_name, file_path, access_token):
    base_url = f"https://api.github.com/repos/{org}/{repo_name}/contents/"

    headers = {
        "Authorization": f"Bearer {access_token}",
        "Accept": "application/vnd.github.v3+json",
    }

    url = base_url + file_path

    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        print(f"[INFO] '{file_path}' already added.")
        return

    payload = {"message": f"Add {file_path}", "content": ""}

    response = requests.put(url, headers=headers, json=payload)

    if response.status_code == 201:
        print(f"[INFO] '{file_path}' added to publish todo")
    else:
        print(f"[ERROR] Failed to add '{file_path}'.")
        print(f"[ERROR] Response: {response.text}")


def create_tm(align_fn: Path, text_pair: Dict[str, str]):
    align_fn = Path(align_fn)
    text_id = text_pair["text_id"]
    with tempfile.TemporaryDirectory() as tmp_dir:
        output_dir = Path(tmp_dir)
        repo_name = f"TM{text_id}"
        tm_path = output_dir / repo_name
        tm_path.mkdir(exist_ok=True, parents=True)
        repo_path = convert_raw_align_to_tm(align_fn, tm_path)
        repo_path = add_input_in_readme(text_pair, tm_path)
        repo_url = create_github_repo(repo_path, repo_name)
        logging.info(f"TM repo created: {repo_url}")
        add_to_publish_todo_repo(GITHUB_ORG, MAI_TM_PUBLISH_TODO_REPO, repo_name, GITHUB_ACCESS_TOKEN)
        return repo_url


if __name__ == "__main__":
    align_fn = Path(sys.argv[1])
    create_tm(align_fn)