10zinten committed on
Commit
1a3c007
0 Parent(s):

Duplicate from openpecha/tibetan-aligner-api

.gitattributes ADDED
@@ -0,0 +1,34 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,165 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ .idea/
161
+
162
+ */ladder
163
+ data
164
+
165
+
README.md ADDED
@@ -0,0 +1,14 @@
1
+ ---
2
+ title: Tibetan Aligner
3
+ emoji: 📖
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 3.34.0
8
+ app_file: app.py
9
+ pinned: true
10
+ license: mit
11
+ duplicated_from: openpecha/tibetan-aligner-api
12
+ ---
13
+
14
+ DISCLAIMER: This space has been created solely for testing and educational purposes. We do not claim any ownership or copyright over the align-tibetan script, which remains the sole property of its original creator, Sebastian Nehrlich. We have created this space to facilitate the use and testing of the align-tibetan script for interested users. If you use the align-tibetan script for any commercial or production purposes, we strongly encourage you to obtain permission from the original creator and comply with any relevant licensing requirements.
app.py ADDED
@@ -0,0 +1,115 @@
1
+ import logging
2
+ import os
3
+ import re
4
+ import shutil
5
+ import stat
6
+ import subprocess
7
+ import time
8
+ import uuid
9
+ from contextlib import contextmanager
10
+ from pathlib import Path
11
+
12
+ import gradio as gr
13
+ import requests
14
+
15
+ from tm import create_tm
16
+
17
+ logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)
18
+
19
+ GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
20
+
21
+ ALIGNER_SCRIPT_DIR = Path("./tibetan-aligner").resolve()
22
+ ALIGNER_SCRIPT_NAME = "align_tib_en.sh"
23
+ ALIGNER_SCRIPT_PATH = ALIGNER_SCRIPT_DIR / ALIGNER_SCRIPT_NAME
24
+ assert ALIGNER_SCRIPT_PATH.is_file()
25
+
26
+
27
+ def make_dir_executable(dir_path: Path):
28
+ for fn in dir_path.iterdir():
29
+ st = os.stat(fn)
30
+ os.chmod(fn, st.st_mode | stat.S_IEXEC)
31
+ st = os.stat(fn)
32
+ os.chmod(fn, st.st_mode | stat.S_IXGRP)
33
+ st = os.stat(fn)
34
+ os.chmod(fn, st.st_mode | stat.S_IXOTH)
35
+
36
+
37
+ make_dir_executable(ALIGNER_SCRIPT_DIR)
38
+
39
+
40
+ @contextmanager
41
+ def TemporaryDirectory():
42
+ tmpdir = Path("./output").resolve() / uuid.uuid4().hex[:8]
43
+ tmpdir.mkdir(exist_ok=True, parents=True)
44
+ try:
45
+ yield tmpdir
46
+ finally:
47
+ shutil.rmtree(str(tmpdir))
48
+
49
+
50
+ def download_file(github_file_url: str, output_fn) -> Path:
51
+ """Download file from github"""
52
+ headers = {
53
+ "Authorization": f"token {GITHUB_TOKEN}",
54
+ "Accept": "application/vnd.github+json",
55
+ }
56
+ authenticated_file_url = f"{github_file_url}?token={GITHUB_TOKEN}"
57
+ with requests.get(authenticated_file_url, headers=headers, stream=True) as r:
58
+ r.raise_for_status()
59
+ with open(output_fn, "wb") as f:
60
+ for chunk in r.iter_content(chunk_size=8192):
61
+ f.write(chunk)
62
+ return output_fn
63
+
64
+
65
+ def _run_align_script(bo_fn, en_fn, output_dir):
66
+ start = time.time()
67
+ cmd = [str(ALIGNER_SCRIPT_PATH), str(bo_fn), str(en_fn), str(output_dir)]
68
+ output = subprocess.run(
69
+ cmd,
70
+ check=True,
71
+ capture_output=True,
72
+ text=True,
73
+ cwd=str(ALIGNER_SCRIPT_DIR),
74
+ )
75
+ output_fn = re.search(r"\[OUTPUT\] (.*)", output.stdout).group(1)
76
+ output_fn = "/" + output_fn.split("//")[-1]
77
+ end = time.time()
78
+ total_time = round((end - start) / 60, 2)
79
+ logging.info(f"Total time taken for Aligning: {total_time} mins")
80
+ return output_fn
81
+
82
+
83
+ def align(text_pair):
84
+ logging.info(f"Running aligner for TM{text_pair['text_id']}...")
85
+ with TemporaryDirectory() as tmpdir:
86
+ output_dir = Path(tmpdir)
87
+ bo_fn = download_file(text_pair["bo_file_url"], output_fn=output_dir / "bo.tx")
88
+ en_fn = download_file(text_pair["en_file_url"], output_fn=output_dir / "en.tx")
89
+ aligned_fn = _run_align_script(bo_fn, en_fn, output_dir)
90
+ repo_url = create_tm(aligned_fn, text_pair=text_pair)
91
+ return {"tm_repo_url": repo_url}
92
+
93
+
94
+ with gr.Blocks() as demo:
95
+ gr.Markdown("## Tibetan-English Aligner API")
96
+ gr.Markdown("Please use Via API")
97
+ input = gr.JSON(
98
+ # value={
99
+ # "text_id": f"{uuid.uuid4().hex[:4]}",
100
+ # "bo_file_url": "https://raw.githubusercontent.com/OpenPecha/tibetan-aligner/main/tests/data/text-bo.txt",
101
+ # "en_file_url": "https://raw.githubusercontent.com/OpenPecha/tibetan-aligner/main/tests/data/text-en.txt",
102
+ # }
103
+ )
104
+ output = gr.JSON()
105
+ align_btn = gr.Button("Align")
106
+ align_btn.click(
107
+ fn=align,
108
+ inputs=input,
109
+ outputs=output,
110
+ api_name="align",
111
+ )
112
+
113
+
114
+ if __name__ == "__main__":
115
+ demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True, debug=True)
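`app.py` registers `align` as a named Gradio endpoint (`api_name="align"`) and the UI itself only points users at the API, so the Space is meant to be driven programmatically. A minimal client sketch, assuming `gradio_client` is installed and using the upstream Space id from `duplicated_from`; the payload mirrors the commented-out sample value of the JSON input, and depending on the `gradio_client` version the JSON payload may need to be passed as a string instead of a dict.

```python
# Hedged sketch: calling the /align endpoint from Python.
# Assumptions: gradio_client is installed, the Space id below (taken from
# `duplicated_from`) is reachable, and the sample URLs from the
# tibetan-aligner test data are still valid.
from gradio_client import Client

client = Client("openpecha/tibetan-aligner-api")  # assumed Space id
result = client.predict(
    {
        "text_id": "demo",
        "bo_file_url": "https://raw.githubusercontent.com/OpenPecha/tibetan-aligner/main/tests/data/text-bo.txt",
        "en_file_url": "https://raw.githubusercontent.com/OpenPecha/tibetan-aligner/main/tests/data/text-en.txt",
    },
    api_name="/align",
)
print(result)  # expected to look like {"tm_repo_url": "..."}
```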
flagged/file_urls/tmpfuhgfj7m.json ADDED
@@ -0,0 +1 @@
1
+ {"bo_file_url": "https://raw.githubusercontent.com/OpenPecha/tibetan-aligner/main/tests/data/text-bo.txt", "en_file_url": "https://raw.githubusercontent.com/OpenPecha/tibetan-aligner/main/tests/data/text-en.txt"}
flagged/log.csv ADDED
@@ -0,0 +1,2 @@
1
+ file_urls,output,flag,username,timestamp
2
+ /home/user/app/flagged/file_urls/tmpfuhgfj7m.json,,,,2023-04-10 11:44:49.529324
import_tibetan_aligner_source.py ADDED
@@ -0,0 +1,26 @@
1
+ import shutil
2
+ import sys
3
+ from pathlib import Path
4
+
5
+ NON_SOURCES_FILES = [
6
+ ".",
7
+ "..",
8
+ ".git",
9
+ ".github",
10
+ ".gitignore",
11
+ ".venv",
12
+ ".idea",
13
+ "Dockerfile",
14
+ "__pycache__",
15
+ "tests",
16
+ ]
17
+
18
+ if __name__ == "__main__":
19
+ source_dir = Path(sys.argv[1])
20
+ dest_dir = Path(__file__).parent / source_dir.name
21
+ dest_dir.mkdir(exist_ok=True)
22
+ for fn in source_dir.iterdir():
23
+ if fn.name in NON_SOURCES_FILES:
24
+ continue
25
+ dest_fn = dest_dir / fn.name
26
+ shutil.copy2(str(fn), str(dest_fn))
requirements.txt ADDED
@@ -0,0 +1,5 @@
1
+ sentence-transformers==2.2.2
2
+ pyewts==0.2.0
3
+ Cython==0.29.34
4
+ gradio>=3.34.0, <4.0
5
+ requests==2.28.2
tibetan-aligner/README.md ADDED
@@ -0,0 +1,5 @@
1
+ # align-tibetan
2
+ Tibetan English sentence alignment
3
+ Simply run `bash align_tib_en.sh <tib_file> <eng_file> [output_dir]`.
4
+ The Tibetan file should be in Tibetan Unicode; the English file should be plain-text English.
5
+ There are some tunable parameters; see align_tib_en.sh.
tibetan-aligner/align_tib_en.sh ADDED
@@ -0,0 +1,43 @@
1
+ #!/bin/bash
2
+ number_of_overlays=6 # the higher the number of overlays, the more precise alignment is going to be, but also slower
3
+ deletion=0.06 # higher = less precise
4
+ search_buffer_size=50
5
+
6
+ # Args:
7
+ # first parameter is a file in Tibetan unicode
8
+ # second parameter is a file with English in plain text.
9
+ # third parameter is output path
10
+
11
+ cp $1 $1.work
12
+ cp $2 $2.work
13
+ output_dir=${3:-"output"}
14
+ mkdir $output_dir
15
+
16
+ cp $2.work $2.work2
17
+
18
+ echo '[INFO] Getting Embedding...'
19
+ time python get_vectors.py $1.work $number_of_overlays
20
+ time python get_vectors.py $2.work $number_of_overlays
21
+
22
+ rm ladder
23
+ echo '[INFO] Running alignment...'
24
+ time ./vecalign.py -a $number_of_overlays -d $deletion --search_buffer_size $search_buffer_size --alignment_max_size $number_of_overlays --src $1.work --tgt $2.work \
25
+ --src_embed $1.work_overlay $1.work_vectors.npy \
26
+ --tgt_embed $2.work_overlay $2.work_vectors.npy >> ladder
27
+
28
+ rm $1.org
29
+ rm $1.train
30
+ python ladder2org.py $1.work $2.work ladder >> $1.org
31
+ python create_train.py $1.work $2.work ladder >> $1.train
32
+ python create_train_clean.py $1.work $2.work ladder >> $1.train_cleaned
33
+
34
+ # clean up
35
+ mv *.txt* $output_dir/
36
+ mv $output_dir/requirements.txt ./
37
+ rm $output_dir/$1.work
38
+ rm $output_dir/$2.work
39
+ rm $output_dir/$2.work2
40
+ rm $output_dir/$1.work_vectors.npy
41
+ rm $output_dir/$2.work_vectors.npy
42
+
43
+ echo "[OUTPUT] $output_dir/$1.train_cleaned"
tibetan-aligner/convert_to_wylie.py ADDED
@@ -0,0 +1,17 @@
1
+ import sys
2
+ import pyewts
3
+
4
+
5
+ converter = pyewts.pyewts()
6
+ path = sys.argv[1]
7
+ result = ""
8
+
9
+ for line in open(path, "r"):
10
+ line = converter.toWylie(line)
11
+ result += line
12
+
13
+
14
+ with open(path,"w") as outfile:
15
+ outfile.write(result)
16
+
17
+
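`convert_to_wylie.py` rewrites a Tibetan Unicode file in place as Wylie (EWTS) transliteration. A small sketch of the underlying `pyewts` calls, assuming `pyewts==0.2.0` from `requirements.txt`; the exact transliteration output may differ slightly by version.

```python
# Hedged sketch of the pyewts conversion the script wraps.
import pyewts

converter = pyewts.pyewts()
wylie = converter.toWylie("བཀྲ་ཤིས་བདེ་ལེགས།")   # roughly "bkra shis bde legs/"
print(wylie)
print(converter.toUnicode(wylie))                  # converts back to Tibetan Unicode
```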
tibetan-aligner/create_train.py ADDED
@@ -0,0 +1,60 @@
1
+ import sys
2
+ import re
3
+ import re
4
+ f1 = open(sys.argv[1],'r')
5
+ f2 = open(sys.argv[2],'r')
6
+ ladder_file = open(sys.argv[3],'r')
7
+
8
+
9
+ output = ""
10
+ ladder = []
11
+ sktfile = [line.rstrip('\n').strip() for line in f1]
12
+ tibfile = [line.rstrip('\n').strip() for line in f2]
13
+ last_score = 0.5
14
+
15
+ def clean_num(string):
16
+ string = re.sub("[^0-9, ]","",string)
17
+ return int(string.split(',')[0])
18
+
19
+
20
+ for line in ladder_file:
21
+ if len(line.split("\t")) == 3:
22
+ skt,tib,score = line.split('\t')
23
+ if re.search("[0-9]",skt) and re.search("[0-9]",tib):
24
+ skt_num = clean_num(skt)
25
+ tib_num = clean_num(tib)
26
+ ladder.append([skt_num,tib_num,score])
27
+
28
+
29
+ if ";" in line:
30
+ m = re.search("([0-9., ]+);([0-9., ]+).*=\"([0-9.,]+)", line)
31
+ if m:
32
+ skt_num = int(m.group(1).split()[0].replace(".","").replace(",",""))-1
33
+ tib_num = int(m.group(2).split()[0].replace(".","").replace(",",""))-1
34
+ score = float(m.group(3))
35
+ ladder.append([skt_num,tib_num,score])
36
+
37
+
38
+
39
+ if len(line.split(':')) == 3:
40
+ skt,tib,score = line.split(':')
41
+ if re.search("[0-9]",skt) and re.search("[0-9]",tib):
42
+ skt_num = clean_num(skt)
43
+ tib_num = clean_num(tib)
44
+ ladder.append([skt_num,tib_num,score])
45
+ last_skt = 0
46
+ last_tib = 0
47
+ for entry in ladder:
48
+ output = output + ' '.join(sktfile[last_skt:entry[0]]) + "\t"
49
+ output = output + ' '.join(tibfile[last_tib:entry[1]]) + "\n"
50
+ last_skt = entry[0]
51
+ last_tib = entry[1]
52
+ output = output + ' '.join(sktfile[last_skt:-1]) + "\t"
53
+ output = output + ' '.join(tibfile[last_tib:-1]) + "\n" # + str(entry[2])
54
+
55
+ short_f1 = re.sub("\.tsv.*","",sys.argv[1])
56
+ short_f2 = re.sub(".*/","",sys.argv[2])
57
+ short_f2 = re.sub("\.tsv.*","",short_f2)
58
+ print(output)
59
+ # with open(short_f1 + "_" + short_f2 + ".train", 'w') as file:
60
+ # file.write(output)
tibetan-aligner/create_train_clean.py ADDED
@@ -0,0 +1,37 @@
1
+ import sys
2
+ import re
3
+ import re
4
+ f1 = open(sys.argv[1],'r')
5
+ f2 = open(sys.argv[2],'r')
6
+ ladder_file = open(sys.argv[3],'r')
7
+
8
+
9
+ output = ""
10
+ ladder = []
11
+ sktfile = [line.rstrip('\n').strip() for line in f1]
12
+ tibfile = [line.rstrip('\n').strip() for line in f2]
13
+ last_score = 0.5
14
+
15
+ def clean_num(string):
16
+ string = re.sub("[^0-9, ]","",string)
17
+ numbers = []
18
+ for number in string.split(','):
19
+ numbers.append(int(number))
20
+ return numbers
21
+
22
+
23
+ for line in ladder_file:
24
+ if len(line.split(':')) == 3:
25
+ skt,tib,score = line.split(':')
26
+ if re.search("[0-9]",skt) and re.search("[0-9]",tib):
27
+ skt_nums = clean_num(skt)
28
+ tib_nums = clean_num(tib)
29
+ for num in skt_nums:
30
+ output += sktfile[num] + " "
31
+ output += "\t"
32
+ for num in tib_nums:
33
+ output += tibfile[num] + " "
34
+ output += "\n"
35
+ print(output)
36
+ # with open(short_f1 + "_" + short_f2 + ".train", 'w') as file:
37
+ # file.write(output)
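Both `create_train.py` and `create_train_clean.py` consume the `ladder` file written by `vecalign.py`, whose `print_alignments` emits one alignment per line as `<src index list>:<tgt index list>:<score>` (e.g. `[0, 1]:[0]:0.103442`; the score here is made up for illustration). A short sketch of how `clean_num` in `create_train_clean.py` turns such a line into index lists:

```python
# Hedged illustration of the ladder format parsed by create_train_clean.py.
import re

def clean_num(string):
    # same logic as the script: strip everything but digits, commas and spaces
    string = re.sub("[^0-9, ]", "", string)
    return [int(number) for number in string.split(',')]

line = "[0, 1]:[0]:0.103442"        # hypothetical ladder line
skt, tib, score = line.split(':')
print(clean_num(skt))                # [0, 1] -> source sentence indices
print(clean_num(tib))                # [0]    -> target sentence indices
print(float(score))                  # alignment score
```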
tibetan-aligner/dp_core.cpython-310-x86_64-linux-gnu.so.reload1 ADDED
Binary file (643 kB). View file
 
tibetan-aligner/dp_core.cpython-39-darwin.so.reload1 ADDED
Binary file (170 kB). View file
 
tibetan-aligner/dp_core.pyx ADDED
@@ -0,0 +1,411 @@
1
+ # cython: language_level=3
2
+
3
+ """
4
+ Copyright 2019 Brian Thompson
5
+
6
+ Licensed under the Apache License, Version 2.0 (the "License");
7
+ you may not use this file except in compliance with the License.
8
+ You may obtain a copy of the License at
9
+
10
+ https://www.apache.org/licenses/LICENSE-2.0
11
+
12
+ Unless required by applicable law or agreed to in writing, software
13
+ distributed under the License is distributed on an "AS IS" BASIS,
14
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ See the License for the specific language governing permissions and
16
+ limitations under the License.
17
+ """
18
+
19
+ import numpy as np
20
+
21
+ cimport numpy as np
22
+ cimport cython
23
+
24
+
25
+ def make_x_y_offsets(alignment_types):
26
+ # alignment types for which we will precompute costs
27
+
28
+ # deletion/insertion is added later
29
+ for x, y in alignment_types:
30
+ assert (x > 0)
31
+ assert (y > 0)
32
+
33
+ x_offsets = np.array([x for x, y in alignment_types], dtype=np.int32) # MUST **NOT** INCLUDE (0,1), (1,0)
34
+ y_offsets = np.array([y for x, y in alignment_types], dtype=np.int32) # MUST **NOT** INCLUDE (0,1), (1,0)
35
+ return x_offsets, y_offsets
36
+
37
+
38
+ def make_dense_costs(np.ndarray[float, ndim=3] vecs0, # input
39
+ np.ndarray[float, ndim=3] vecs1, # input
40
+ np.ndarray[float, ndim=2] norm0, # input
41
+ np.ndarray[float, ndim=2] norm1, # input
42
+ int offset0 = 0, # index into vecs0/norms0
43
+ int offset1 = 0, # index into vecs1/norms1
44
+ ):
45
+ """
46
+ Make a full N*M feature matrix. By default, makes 1-1 alignments,
47
+ can build others by specifying offset0, offset1 to index into
48
+ vecs0, norms0 and vecs1, norms1 respectively.
49
+ """
50
+ assert vecs0.shape[0] > offset0
51
+ assert vecs1.shape[0] > offset1
52
+ assert norm0.shape[0] > offset0
53
+ assert norm1.shape[0] > offset1
54
+
55
+ cdef int size0 = np.shape(vecs0)[1]
56
+ assert norm0.shape[1] == size0
57
+
58
+ cdef int size1 = np.shape(vecs1)[1]
59
+ assert norm1.shape[1] == size1
60
+
61
+ cdef int vecsize = np.shape(vecs0)[2]
62
+ assert vecs1.shape[2] == vecsize
63
+
64
+ cdef int xi, yi
65
+ cdef float sumx
66
+
67
+ cdef np.ndarray[float, ndim=2] costs = np.empty((size0, size1), dtype=np.float32)
68
+
69
+ for xi in range(size0):
70
+ for yi in range(size1):
71
+ sumx = 0.0
72
+ for jj in range(vecsize):
73
+ sumx += vecs0[offset0, xi, jj] * vecs1[offset1, yi, jj]
74
+
75
+ costs[xi, yi] = 2.0 * (1.0 - sumx) / (1e-6 + norm0[offset0, xi] + norm1[offset1, yi])
76
+ # normalize by alignment type
77
+ costs[xi, yi] = costs[xi, yi] * (offset0 + 1) * (offset1 + 1)
78
+
79
+ return costs
80
+
81
+
82
+ def dense_dp(np.ndarray[float, ndim=2] alignment_cost, float pen):
83
+ """
84
+ Compute cost matrix (csum) and backpointers (bp)
85
+ from full 2-D 1-1 alignment costs matrix (alignment_cost)
86
+ """
87
+
88
+ size0 = alignment_cost.shape[0]
89
+ size1 = alignment_cost.shape[1]
90
+ # csum and traceback matrix are both on nodes
91
+ # so they are +1 in each dimension compared to the jump costs matrix
92
+ # For anything being used in accumulation, use float64
93
+ cdef np.ndarray[double, ndim=2] csum = np.empty((size0 + 1, size1 + 1), dtype=np.float64)
94
+ cdef np.ndarray[int, ndim=2] bp = np.empty((size0 + 1, size1 + 1), dtype=np.int32)
95
+
96
+ # bp and csum are nodes,
97
+ # while alignment_cost is the cost of going between the nodes
98
+ # Size of nodes should be one larger than alignment costs
99
+ b0, b1 = np.shape(bp)
100
+ c0, c1 = np.shape(csum)
101
+ j0, j1 = np.shape(alignment_cost)
102
+ assert (b0 == c0 == j0 + 1)
103
+ assert (b1 == c1 == j1 + 1)
104
+
105
+ cdef int cmax = np.shape(csum)[1]
106
+ cdef int rmax = np.shape(csum)[0]
107
+ cdef int c, r
108
+ cdef double cost0, cost1, cost2
109
+
110
+ # initialize the all c-direction deletion path
111
+ for c in range(cmax):
112
+ csum[0, c] = c * pen
113
+ bp[0, c] = 1
114
+
115
+ # initialize the all r-direction deletion path
116
+ for r in range(rmax):
117
+ csum[r, 0] = r * pen
118
+ bp[r, 0] = 2
119
+
120
+ # Initial cost is 0.0
121
+ csum[0, 0] = 0.0 # noop
122
+ bp[0, 0] = 4 # should not matter
123
+
124
+ # Calculate the rest recursively
125
+ for c in range(1, cmax):
126
+ for r in range(1, rmax):
127
+
128
+ # alignment_cost indexes are off by 1 wrt
129
+ # csum/bp, since csum/bp are nodes
130
+ cost0 = csum[r - 1, c - 1] + alignment_cost[r - 1, c - 1]
131
+ cost1 = csum[r, c - 1] + pen
132
+ cost2 = csum[r - 1, c] + pen
133
+
134
+ csum[r, c] = cost0
135
+ bp[r, c] = 0
136
+
137
+ if cost1 < csum[r, c]:
138
+ csum[r, c] = cost1
139
+ bp[r, c] = 1
140
+ if cost2 < csum[r, c]:
141
+ csum[r, c] = cost2
142
+ bp[r, c] = 2
143
+
144
+ return csum, bp
145
+
146
+
147
+ def score_path(np.ndarray[int, ndim=1] xx,
148
+ np.ndarray[int, ndim=1] yy,
149
+ np.ndarray[float, ndim=1] norm1,
150
+ np.ndarray[float, ndim=1] norm2,
151
+ np.ndarray[float, ndim=2] vecs1,
152
+ np.ndarray[float, ndim=2] vecs2,
153
+ np.ndarray[float, ndim=1] out):
154
+ cdef int xi, yi, ii, jj
155
+ cdef float outx
156
+ cdef int lenxy = xx.shape[0]
157
+ cdef int vecsize = vecs1.shape[1]
158
+
159
+ for ii in range(lenxy):
160
+ xi = xx[ii]
161
+ yi = yy[ii]
162
+ outx = 0.0
163
+ for jj in range(vecsize):
164
+ outx += vecs1[xi, jj] * vecs2[yi, jj]
165
+ out[ii] = 2.0 * (1.0 - outx) / (norm1[xi] + norm2[yi])
166
+
167
+
168
+ # Bounds checking and wraparound slow things down by about 2x
169
+ # Division by 0 checking has minimal speed impact
170
+ @cython.boundscheck(False) # turn off bounds-checking for entire function
171
+ @cython.wraparound(False) # turn off negative index wrapping for entire function
172
+ @cython.cdivision(True) # use c-style division (no division-by-zero check)
173
+ def make_sparse_costs(np.ndarray[float, ndim=3] vecs0, # input: num aligns X num sents X dim
174
+ np.ndarray[float, ndim=3] vecs1, # input
175
+ np.ndarray[float, ndim=2] norms0, # input: num aligns X num sents
176
+ np.ndarray[float, ndim=2] norms1, # input
177
+ x_y_path,
178
+ alignment_types,
179
+ int width_over2):
180
+ """
181
+ Make features for DP, *for lines running across approximate path*, *for each alignment type*
182
+ x_offsets, y_offsets should not include (0,1), (1,0)
183
+
184
+ Basically, we take the feature matrix, rotate it 45 degrees,
185
+ and compute a "wavy" matrix for the features.
186
+ It's like the diagonal but it moves around to hopefully always include the true path.
187
+ """
188
+
189
+ cdef np.ndarray[int, ndim=2] x_y_path_ = np.array(x_y_path).astype(np.int32)
190
+
191
+ assert (vecs0.shape[0] == norms0.shape[0])
192
+ assert (vecs1.shape[0] == norms1.shape[0])
193
+
194
+ assert (vecs0.shape[1] == norms0.shape[1])
195
+ assert (vecs1.shape[1] == norms1.shape[1])
196
+
197
+ # check how many overlaps vectors were passed in
198
+ num_overlaps_in_vecs0 = vecs0.shape[0]
199
+ num_overlaps_in_vecs1 = vecs1.shape[0]
200
+
201
+ # check how many overlaps were requested
202
+ # edge case: alignment_types could be empty
203
+ # In that case, we should just return insertions/deletions
204
+ # and max_x_overlap == max_y_overlap == 0
205
+ max_x_overlap = max([0] + [x for x, y in alignment_types]) # add [0] in case alignment_types is empty
206
+ max_y_overlap = max([0] + [y for x, y in alignment_types]) # add [0] in case alignment_types is empty
207
+
208
+ # note: alignment types are specified 1-based, but vectors are stored 0-based
209
+ if max_x_overlap > num_overlaps_in_vecs0:
210
+ raise Exception('%d x overlaps requested (via alignment_types), but vecs0 only has %d' % (
211
+ max_x_overlap, num_overlaps_in_vecs0))
212
+ if max_y_overlap > num_overlaps_in_vecs1:
213
+ raise Exception('%d y overlaps requested (via alignment_types), but vecs1 only has %d' % (
214
+ max_y_overlap, num_overlaps_in_vecs1))
215
+
216
+ # number of sentences in each document
217
+ cdef int xsize = vecs0.shape[1]
218
+ cdef int ysize = vecs1.shape[1]
219
+
220
+ # vector dimensions should match
221
+ assert (vecs0.shape[2] == vecs1.shape[2])
222
+
223
+ cdef np.ndarray[int, ndim=1] x_offsets, y_offsets
224
+ x_offsets, y_offsets = make_x_y_offsets(alignment_types)
225
+
226
+ # reserve outputs
227
+ a_len = x_y_path_.shape[0]
228
+ b_len = 2 * width_over2
229
+ cdef np.ndarray[float, ndim=3] a_b_feats = np.empty((len(alignment_types), a_len, b_len), dtype=np.float32)
230
+ cdef np.ndarray[int, ndim=1] b_offset = np.empty(a_len).astype(np.int32)
231
+
232
+ cdef int x, y, aa, bb, xx, yy, a_idx, b_idx, bb2, x_offset, y_offset, ii_align, x_offset_idx, y_offset_idx
233
+ cdef int vecsize = vecs0.shape[2]
234
+ cdef int num_alignments = x_offsets.shape[0]
235
+
236
+ cdef float sumx, feat
237
+ cdef float inf = np.inf
238
+
239
+ for ii in range(x_y_path_.shape[0]):
240
+ x = x_y_path_[ii, 0]
241
+ y = x_y_path_[ii, 1]
242
+
243
+ # convert xy to ab cords
244
+ aa = x + y
245
+ bb = y
246
+
247
+ a_idx = aa
248
+ b_offset[aa] = bb - width_over2
249
+ for b_idx, bb2 in enumerate(range(bb - width_over2, bb + width_over2)):
250
+ # convert ab to xy cords
251
+ xx = aa - bb2
252
+ yy = bb2
253
+
254
+ for ii_align in range(num_alignments):
255
+ x_offset = x_offsets[ii_align]
256
+ x_offset_idx = x_offset - 1 # overlaps start at 1, vectors stored 0-based
257
+ y_offset = y_offsets[ii_align]
258
+ y_offset_idx = y_offset - 1
259
+
260
+ if 0 <= xx < xsize and 0 <= yy < ysize:
261
+ sumx = 0.0
262
+ for jj in range(vecsize):
263
+ sumx += vecs0[x_offset_idx, xx, jj] * vecs1[y_offset_idx, yy, jj]
264
+ feat = 2.0 * x_offset * y_offset * (1.0 - sumx) / (
265
+ 1e-6 + norms0[x_offset_idx, xx] + norms1[y_offset_idx, yy])
266
+
267
+ else:
268
+ feat = inf
269
+
270
+ a_b_feats[ii_align, a_idx, b_idx] = feat
271
+
272
+ return a_b_feats, b_offset
273
+
274
+
275
+ def sparse_dp(np.ndarray[float, ndim=3] a_b_costs,
276
+ np.ndarray[int, ndim=1] b_offset_in,
277
+ alignment_types,
278
+ double del_penalty,
279
+ int x_in_size,
280
+ int y_in_size):
281
+ """
282
+ Do DP along a path, using features saved off along path.
283
+ x_offsets, y_offsets should not include (0,1), (1,0)
284
+
285
+ xsize, ysize refer to the costs a_b_csum, but in x/y space
286
+
287
+ As in the simpler full-DP case,
288
+ we compute cumulative costs and backpointers on nodes,
289
+ and there are COSTS associated with moving between them.
290
+
291
+ This means the size of the nodes is +1,+1 larger (in x,y) than the COSTS.
292
+
293
+ So the size of a_b_csum, a_b_xp, a_b_yp are all one larger in x and y compared to the costs
294
+
295
+ In order to save memory (and time, vs a sparse matrix with hashes to look up values), let:
296
+ a = x + y
297
+ b = x - y
298
+
299
+ b_offsets tells us how far from the left edge the features are computed for.
300
+ basically it's like we are computing along the diagonal,
301
+ but we shift the diagonal around based on our belief
302
+ about where the alignments are.
303
+
304
+ b_offsets is used for both costs AND csum, backpointers, so it needs to be
305
+ +2 longer (it is in the a-direction) than the costs (in the a direction)
306
+
307
+ """
308
+ cdef np.ndarray[int, ndim=1] x_offsets, y_offsets
309
+ x_offsets, y_offsets = make_x_y_offsets(alignment_types)
310
+
311
+ # make x/y offsets, including (0,1), (1,0), i.e. including deletion and insertion
312
+ x_offsets = np.concatenate([x_offsets, np.array([0, 1], dtype=np.int32)])
313
+ y_offsets = np.concatenate([y_offsets, np.array([1, 0], dtype=np.int32)])
314
+
315
+ cdef int a_in_size = a_b_costs.shape[1]
316
+ cdef int b_in_size = a_b_costs.shape[2]
317
+
318
+ cdef int a_out_size = a_in_size + 2
319
+ cdef int b_out_size = b_in_size
320
+
321
+ cdef int x_out_size = x_in_size + 1
322
+ cdef int y_out_size = y_in_size + 1
323
+
324
+ # costs are the costs of going between nodes.
325
+ # in x,y for the nodes, we basically add a buffer
326
+ # at x=0 and y=0, and shift the cost by (x=+1,y=+1)
327
+ # In a,b space, this means adding two points (for the buffer)
328
+ # at the beginning, and shifting by (a=+0,b=+1) since
329
+ # a=x+y and b=y
330
+ # for the first two points, we can simply replicate the
331
+ # original b_offset, since it should be -width_over2
332
+ # i.e. b_offset_in[0] == -width_over2
333
+ extra_two_points = np.array([b_offset_in[0], b_offset_in[0]], dtype=np.int32)
334
+ cdef np.ndarray[int, ndim=1] b_offset_out = np.concatenate([extra_two_points, b_offset_in + 1])
335
+
336
+ # outputs
337
+ # For anything being used in accumulation, use float64
338
+ cdef np.ndarray[double, ndim=2] a_b_csum = np.zeros((a_in_size + 2, b_in_size),
339
+ dtype=np.float64) + np.inf # error cumulative sum
340
+ cdef np.ndarray[int, ndim=2] a_b_xp = np.zeros((a_in_size + 2, b_in_size), dtype=np.int32) - 2 # backpointer for x
341
+ cdef np.ndarray[int, ndim=2] a_b_yp = np.zeros((a_in_size + 2, b_in_size), dtype=np.int32) - 2 # backpointer for y
342
+
343
+ cdef int num_alignments = x_offsets.shape[0]
344
+ cdef double inf = np.inf
345
+ cdef int xx_out, yy_out, ii_align, x_offset, y_offset
346
+ cdef int aa_in_cost, bb_in_cost, aa_out, bb_out, aa_out_prev, bb_out_prev, xx_in_cost, yy_in_cost, xx_out_prev, yy_out_prev
347
+
348
+ cdef double alignment_cost, total_cost, prev_cost
349
+
350
+ # increasing in a is the same as going along diagonals in x/y, so DP order works
351
+ # (and any ordering is fine in b - nothing depends on values adjacent on diagonal in x/y)
352
+ for aa_out in range(a_in_size + 2):
353
+ for bb_out in range(b_in_size):
354
+ #xx_out, yy_out = ab2xy_w_offset(aa_out, bb_out, b_offset_out)
355
+ yy_out = bb_out + b_offset_out[aa_out]
356
+ xx_out = aa_out - yy_out
357
+
358
+ # edge case: all deletions in y-direction
359
+ if xx_out == 0 and 0 <= yy_out < y_out_size:
360
+ a_b_csum[aa_out, bb_out] = del_penalty * yy_out
361
+ a_b_xp[aa_out, bb_out] = 0
362
+ a_b_yp[aa_out, bb_out] = 1
363
+
364
+ # edge case: all deletions in x-direction
365
+ elif yy_out == 0 and 0 <= xx_out < x_out_size:
366
+ a_b_csum[aa_out, bb_out] = del_penalty * xx_out
367
+ a_b_xp[aa_out, bb_out] = 1
368
+ a_b_yp[aa_out, bb_out] = 0
369
+
370
+ else:
371
+ # initialize output to inf
372
+ a_b_csum[aa_out, bb_out] = inf
373
+ a_b_xp[aa_out, bb_out] = -42
374
+ a_b_yp[aa_out, bb_out] = -42
375
+
376
+ for ii_align in range(num_alignments):
377
+ x_offset = x_offsets[ii_align]
378
+ y_offset = y_offsets[ii_align]
379
+
380
+ # coords of location of alignment cost, in input x/y space
381
+ xx_in_cost = xx_out - 1 # features were front padded,
382
+ yy_in_cost = yy_out - 1 # so offset is always 1
383
+
384
+ # the coords of location of previous cumsum cost, in input x/y space
385
+ xx_out_prev = xx_out - x_offset
386
+ yy_out_prev = yy_out - y_offset
387
+
388
+ if 0 <= xx_in_cost < x_in_size and 0 <= yy_in_cost < y_in_size and 0 <= xx_out_prev < x_out_size and 0 <= yy_out_prev < y_out_size:
389
+ # convert x,y to a,b
390
+ aa_in_cost = xx_in_cost + yy_in_cost
391
+ bb_in_cost = yy_in_cost - b_offset_in[aa_in_cost]
392
+
393
+ aa_out_prev = xx_out_prev + yy_out_prev
394
+ bb_out_prev = yy_out_prev - b_offset_out[aa_out_prev]
395
+
396
+ if 0 <= aa_in_cost < a_in_size and 0 <= bb_in_cost < b_in_size and 0 <= aa_out_prev < a_out_size and 0 <= bb_out_prev < b_out_size:
397
+ if x_offset == 0 or y_offset == 0:
398
+ alignment_cost = del_penalty
399
+ else:
400
+ alignment_cost = a_b_costs[ii_align, aa_in_cost, bb_in_cost]
401
+
402
+ prev_cost = a_b_csum[aa_out_prev, bb_out_prev]
403
+
404
+ total_cost = prev_cost + alignment_cost
405
+
406
+ if total_cost < a_b_csum[aa_out, bb_out]:
407
+ a_b_csum[aa_out, bb_out] = total_cost
408
+ a_b_xp[aa_out, bb_out] = x_offset
409
+ a_b_yp[aa_out, bb_out] = y_offset
410
+
411
+ return a_b_csum, a_b_xp, a_b_yp, b_offset_out
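For reference, the 1-1 cost in `make_dense_costs` is a cosine distance normalized by the sampled-neighbour norms from `compute_norms`: `cost = 2 * (1 - dot) / (norm0 + norm1)`, then scaled by the alignment sizes. A NumPy sketch of the 1-1 case (a single overlap depth, unit-length vectors as produced by `make_norm1`), intended only as a readable reference for the Cython kernel, not a drop-in replacement:

```python
# Hedged NumPy reference for the 1-1 case of make_dense_costs.
import numpy as np

def dense_costs_1to1(vecs0, vecs1, norm0, norm1):
    """vecs*: (n, dim) unit-length vectors; norm*: (n,) normalizers."""
    sim = vecs0 @ vecs1.T                                   # cosine similarities
    return 2.0 * (1.0 - sim) / (1e-6 + norm0[:, None] + norm1[None, :])

rng = np.random.default_rng(0)
v0 = rng.normal(size=(4, 8)).astype(np.float32)
v0 /= np.linalg.norm(v0, axis=1, keepdims=True)
v1 = rng.normal(size=(5, 8)).astype(np.float32)
v1 /= np.linalg.norm(v1, axis=1, keepdims=True)

costs = dense_costs_1to1(v0, v1, np.ones(4, np.float32), np.ones(5, np.float32))
print(costs.shape)   # (4, 5); smaller cost = more similar sentence pair
```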
tibetan-aligner/dp_utils.py ADDED
@@ -0,0 +1,668 @@
1
+ """
2
+ Copyright 2019 Brian Thompson
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import logging
18
+ import sys
19
+ from ast import literal_eval
20
+ from collections import OrderedDict
21
+ from math import ceil
22
+ from time import time
23
+
24
+ import numpy as np
25
+
26
+ import pyximport
27
+ pyximport.install(setup_args={'include_dirs':np.get_include()}, inplace=True, reload_support=True)
28
+
29
+ from dp_core import make_dense_costs, score_path, sparse_dp, make_sparse_costs, dense_dp
30
+
31
+ logger = logging.getLogger('vecalign') # set up in vecalign.py
32
+
33
+
34
+ def preprocess_line(line):
35
+ line = line.strip()
36
+ if len(line) == 0:
37
+ line = 'BLANK_LINE'
38
+ return line
39
+
40
+
41
+ def yield_overlaps(lines, num_overlaps):
42
+ lines = [preprocess_line(line) for line in lines]
43
+ for overlap in range(1, num_overlaps + 1):
44
+ for out_line in layer(lines, overlap):
45
+ # check must be here so all outputs are unique
46
+ out_line2 = out_line[:10000] # limit line so dont encode arbitrarily long sentences
47
+ yield out_line2
48
+
49
+
50
+ def read_in_embeddings(text_file, embed_file):
51
+ """
52
+ Given a text file with candidate sentences and a corresponding embedding file,
53
+ make a mapping from candidate sentence to embedding index,
54
+ and a numpy array of the embeddings
55
+ """
56
+ sent2line = dict()
57
+ with open(text_file, 'rt', encoding="utf-8") as fin:
58
+ for ii, line in enumerate(fin):
59
+ # don't know if it is a good idea to uncomment these two lines ###
60
+ # if line.strip() in sent2line:
61
+ # raise Exception('got multiple embeddings for the same line:',line)
62
+ sent2line[line.strip()] = ii
63
+
64
+ line_embeddings = np.load(embed_file,allow_pickle=True)
65
+ print("LINE EMBEDDINGS SHAPE",line_embeddings.shape)
66
+ # line_embeddings = np.fromfile(embed_file, dtype=np.float32, count=-1)
67
+ # if line_embeddings.size == 0:
68
+ # raise Exception('Got empty embedding file')
69
+ # print("Line embeddings size",len(line_embeddings))
70
+ # laser_embedding_size = line_embeddings.size // len(sent2line) # currently hardcoded to 1024
71
+ # if laser_embedding_size != 1024:
72
+ # logger.warning('expected an embedding size of 1024, got %s', laser_embedding_size)
73
+ # logger.info('laser_embedding_size determined to be %d', laser_embedding_size)
74
+ # line_embeddings.resize(line_embeddings.shape[0] // laser_embedding_size, laser_embedding_size)
75
+ return sent2line, line_embeddings
76
+
77
+
78
+ def make_doc_embedding(sent2line, line_embeddings, lines, num_overlaps):
79
+ """
80
+ lines: sentences in input document to embed
81
+ sent2line, line_embeddings: precomputed embeddings for lines (and overlaps of lines)
82
+ """
83
+
84
+ lines = [preprocess_line(line) for line in lines]
85
+
86
+ vecsize = line_embeddings.shape[1]
87
+
88
+ vecs0 = np.empty((num_overlaps, len(lines), vecsize), dtype=np.float32)
89
+
90
+ for ii, overlap in enumerate(range(1, num_overlaps + 1)):
91
+ for jj, out_line in enumerate(layer(lines, overlap)):
92
+ try:
93
+ line_id = sent2line[out_line]
94
+ except KeyError:
95
+ logger.warning('Failed to find overlap=%d line "%s". Will use random vector.', overlap, out_line)
96
+ line_id = None
97
+
98
+ if line_id is not None:
99
+ vec = line_embeddings[line_id]
100
+ else:
101
+ vec = np.random.random(vecsize) - 0.5
102
+ vec = vec / np.linalg.norm(vec)
103
+
104
+ vecs0[ii, jj, :] = vec
105
+
106
+ return vecs0
107
+
108
+
109
+ def make_norm1(vecs0):
110
+ """
111
+ make vectors norm==1 so that cosine distance can be computed via dot product
112
+ """
113
+ for ii in range(vecs0.shape[0]):
114
+ for jj in range(vecs0.shape[1]):
115
+ norm = np.sqrt(np.square(vecs0[ii, jj, :]).sum())
116
+ vecs0[ii, jj, :] = vecs0[ii, jj, :] / (norm + 1e-5)
117
+
118
+
119
+ def layer(lines, num_overlaps, comb=' '):
120
+ """
121
+ make front-padded overlapping sentences
122
+ """
123
+ if num_overlaps < 1:
124
+ raise Exception('num_overlaps must be >= 1')
125
+ out = ['PAD', ] * min(num_overlaps - 1, len(lines))
126
+ for ii in range(len(lines) - num_overlaps + 1):
127
+ out.append(comb.join(lines[ii:ii + num_overlaps]))
128
+ return out
129
+
130
+
131
+ def read_alignments(fin):
132
+ alignments = []
133
+ with open(fin, 'rt', encoding="utf-8") as infile:
134
+ for line in infile:
135
+ fields = [x.strip() for x in line.split(':') if len(x.strip())]
136
+ if len(fields) < 2:
137
+ raise Exception('Got line "%s", which does not have at least two ":" separated fields' % line.strip())
138
+ try:
139
+ src = literal_eval(fields[0])
140
+ tgt = literal_eval(fields[1])
141
+ except:
142
+ raise Exception('Failed to parse line "%s"' % line.strip())
143
+ alignments.append((src, tgt))
144
+
145
+ # I know bluealign files have a few entries missing,
146
+ # but I don't fix them in order to be consistent with previously reported scores
147
+ return alignments
148
+
149
+
150
+ def print_alignments(alignments, scores=None, file=sys.stdout):
151
+ if scores is not None:
152
+ for (x, y), s in zip(alignments, scores):
153
+ print('%s:%s:%.6f' % (x, y, s), file=file)
154
+ else:
155
+ for x, y in alignments:
156
+ print('%s:%s' % (x, y), file=file)
157
+
158
+
159
+ class DeletionKnob(object):
160
+ """
161
+ A good deletion penalty is dependent on normalization, and probably language, domain, etc, etc
162
+ I want a way to control deletion penalty that generalizes well...
163
+ Sampling costs and use percentile seems to work fairly well.
164
+ """
165
+ def __init__(self, samp, res_min, res_max):
166
+
167
+ self.res_min = res_min
168
+ self.res_max = res_max
169
+
170
+ if self.res_min >= self.res_max:
171
+ logger.warning('res_max <= res_min, increasing it')
172
+ self.res_max = self.res_min + 1e-4
173
+
174
+ num_bins = 1000
175
+ num_pts = 30
176
+
177
+ self.hist, self.bin_edges = np.histogram(samp, bins=num_bins,
178
+ range=[self.res_min, self.res_max],
179
+ density=True)
180
+
181
+ dx = self.bin_edges[1] - self.bin_edges[0]
182
+ self.cdf = np.cumsum(self.hist) * dx
183
+
184
+ interp_points = [(0, self.res_min), ]
185
+ for knob_val in np.linspace(0, 1, num_pts - 1)[1:-1]:
186
+ cdf_idx = np.searchsorted(self.cdf, knob_val)
187
+ cdf_val = self.res_min + cdf_idx / float(num_bins) * (self.res_max - self.res_min)
188
+ interp_points.append((knob_val, cdf_val))
189
+ interp_points.append((1, self.res_max))
190
+ self.x, self.y = zip(*interp_points)
191
+
192
+ def percentile_frac_to_del_penalty(self, knob_val):
193
+ del_pen = np.interp([knob_val], self.x, self.y)[0]
194
+ return del_pen
195
+
196
+
197
+ def make_alignment_types(max_alignment_size):
198
+ # return list of all (n,m) where n+m <= this
199
+ alignment_types = []
200
+ for x in range(1, max_alignment_size):
201
+ for y in range(1, max_alignment_size):
202
+ if x + y <= max_alignment_size:
203
+ alignment_types.append((x, y))
204
+ return alignment_types
205
+
206
+
207
+ def ab2xy_w_offset(aa, bb_idx, bb_offset):
208
+ bb_from_side = bb_idx + bb_offset[aa]
209
+ xx = aa - bb_from_side
210
+ yy = bb_from_side
211
+ return (xx, yy)
212
+
213
+
214
+ def xy2ab_w_offset(xx, yy, bb_offset):
215
+ aa = xx + yy
216
+ bb_from_side = yy
217
+ bb = bb_from_side - bb_offset[aa]
218
+ return aa, bb
219
+
220
+
221
+ def process_scores(scores, alignments):
222
+ # floating point sometimes gives negative numbers, which is a little unnerving ...
223
+ scores = np.clip(scores, a_min=0, a_max=None)
224
+
225
+ for ii, (x_algn, y_algn) in enumerate(alignments):
226
+ # deletion penalty is pretty arbitrary, just report 0
227
+ if len(x_algn) == 0 or len(y_algn) == 0:
228
+ scores[ii] = 0.0
229
+ # report scores un-normalized by alignment sizes
230
+ # (still normalized with random vectors, though)
231
+ else:
232
+ scores[ii] = scores[ii] / len(x_algn) / len(y_algn)
233
+
234
+ return scores
235
+
236
+
237
+ def sparse_traceback(a_b_csum, a_b_xp, a_b_yp, b_offset, xsize, ysize):
238
+ alignments = []
239
+ xx = xsize
240
+ yy = ysize
241
+
242
+ cum_costs = []
243
+
244
+ while True:
245
+ aa, bb = xy2ab_w_offset(xx, yy, b_offset)
246
+
247
+ cum_costs.append(a_b_csum[aa, bb])
248
+
249
+ xp = a_b_xp[aa, bb]
250
+ yp = a_b_yp[aa, bb]
251
+
252
+ if xx == yy == 0:
253
+ break
254
+
255
+ if xx < 0 or yy < 0:
256
+ raise Exception('traceback bug')
257
+
258
+ x_side = list(range(xx - xp, xx))
259
+ y_side = list(range(yy - yp, yy))
260
+ alignments.append((x_side, y_side))
261
+
262
+ xx = xx - xp
263
+ yy = yy - yp
264
+
265
+ alignments.reverse()
266
+ cum_costs.reverse()
267
+ costs = np.array(cum_costs[1:]) - np.array(cum_costs[:-1])
268
+ # "costs" are scaled by x_alignment_size * y_alignment_size
269
+ # and the cost of a deletion is del_penalty
270
+ # "scores": 0 for deletion/insertion,
271
+ # and cosine distance, *not* scaled
272
+ # by len(x_alignment)*len(y_alignment)
273
+ scores = process_scores(scores=costs, alignments=alignments)
274
+
275
+ return alignments, scores
276
+
277
+
278
+ def dense_traceback(x_y_tb):
279
+ xsize, ysize = x_y_tb.shape
280
+
281
+ xx = xsize - 1
282
+ yy = ysize - 1
283
+
284
+ alignments = []
285
+ while True:
286
+ if xx == yy == 0:
287
+ break
288
+ bp = x_y_tb[xx, yy]
289
+ if bp == 0:
290
+ xp, yp = 1, 1
291
+ alignments.append(([xx - 1], [yy - 1]))
292
+ elif bp == 1:
293
+ xp, yp = 0, 1
294
+ alignments.append(([], [yy - 1]))
295
+ elif bp == 2:
296
+ xp, yp = 1, 0
297
+ alignments.append(([xx - 1], []))
298
+ else:
299
+ raise Exception('got unknown value')
300
+
301
+ xx = xx - xp
302
+ yy = yy - yp
303
+
304
+ alignments.reverse()
305
+
306
+ return alignments
307
+
308
+
309
+ def append_slant(path, xwidth, ywidth):
310
+ """
311
+ Append quantized approximation to a straight line
312
+ from current x,y to a point at (x+xwidth, y+ywidth)
313
+ """
314
+ NN = xwidth + ywidth
315
+ xstart, ystart = path[-1]
316
+ for ii in range(1, NN + 1):
317
+ x = xstart + round(xwidth * ii / NN)
318
+ y = ystart + round(ywidth * ii / NN)
319
+ # In the case of ties we want them to round differently,
320
+ # so explicitly make sure we take a step of 1, not 0 or 2
321
+ lastx, lasty = path[-1]
322
+ delta = x + y - lastx - lasty
323
+ if delta == 1:
324
+ path.append((x, y))
325
+ elif delta == 2:
326
+ path.append((x - 1, y))
327
+ elif delta == 0:
328
+ path.append((x + 1, y))
329
+
330
+
331
+ def alignment_to_search_path(algn):
332
+ """
333
+ Given an alignment, make searchpath.
334
+ Searchpath must step exactly one position in x XOR y at each time step.
335
+
336
+ In the case of a block of deletions, the order found by DP is not meaningful.
337
+ To make things consistent and to improve the probability of recovering
338
+ from search errors, we search an approximately straight line
339
+ through a block of deletions. We do the same through a many-many
340
+ alignment, even though we currently don't refine a many-many alignment...
341
+ """
342
+ path = [(0, 0), ]
343
+ xdel, ydel = 0, 0
344
+ ydel = 0
345
+ for x, y in algn:
346
+ if len(x) and len(y):
347
+ append_slant(path, xdel, ydel)
348
+ xdel, ydel = 0, 0
349
+ append_slant(path, len(x), len(y))
350
+ elif len(x):
351
+ xdel += len(x)
352
+ elif len(y):
353
+ ydel += len(y)
354
+
355
+ append_slant(path, xdel, ydel)
356
+
357
+ return path
358
+
359
+
360
+ def extend_alignments(course_alignments, size0, size1):
361
+ """
362
+ extend alignments to include new endpoints size0, size1
363
+ if alignments are larger than size0/size1, raise exception
364
+ """
365
+ # could be a string of deletions or insertions at end, so cannot just grab last one
366
+ xmax = 0 # maximum x value in course_alignments
367
+ ymax = 0 # maximum y value in course_alignments
368
+ for x, y in course_alignments:
369
+ for xval in x:
370
+ xmax = max(xmax, xval)
371
+ for yval in y:
372
+ ymax = max(ymax, yval)
373
+
374
+ if xmax > size0 or ymax > size1:
375
+ raise Exception('asked to extend alignments but already bigger than requested')
376
+
377
+ # do not duplicate xmax/ymax, do include size0/size1
378
+ extra_x = list(range(xmax + 1, size0 + 1))
379
+ extra_y = list(range(ymax + 1, size1 + 1))
380
+
381
+ logger.debug('extending alignments in x by %d and y by %d', len(extra_x), len(extra_y))
382
+
383
+ if len(extra_x) == 0:
384
+ for yval in extra_y:
385
+ course_alignments.append(([], [yval]))
386
+ elif len(extra_y) == 0:
387
+ for xval in extra_x:
388
+ course_alignments.append(([xval], []))
389
+ else:
390
+ course_alignments.append((extra_x, extra_y))
391
+
392
+
393
+ def upsample_alignment(algn):
394
+ def upsample_one_alignment(xx):
395
+ return list(range(min(xx) * 2, (max(xx) + 1) * 2))
396
+
397
+ new_algn = []
398
+ for xx, yy in algn:
399
+ if len(xx) == 0:
400
+ for yyy in upsample_one_alignment(yy):
401
+ new_algn.append(([], [yyy]))
402
+ elif len(yy) == 0:
403
+ for xxx in upsample_one_alignment(xx):
404
+ new_algn.append(([xxx], []))
405
+ else:
406
+ new_algn.append((upsample_one_alignment(xx), upsample_one_alignment(yy)))
407
+ return new_algn
408
+
409
+
410
+ def make_del_knob(e_laser,
411
+ f_laser,
412
+ e_laser_norms,
413
+ f_laser_norms,
414
+ sample_size):
415
+ e_size = e_laser.shape[0]
416
+ f_size = f_laser.shape[0]
417
+
418
+ if e_size > 0 and f_size > 0 and sample_size > 0:
419
+
420
+ if e_size * f_size < sample_size:
421
+ # dont sample, just compute full matrix
422
+ sample_size = e_size * f_size
423
+ x_idxs = np.zeros(sample_size, dtype=np.int32)
424
+ y_idxs = np.zeros(sample_size, dtype=np.int32)
425
+ c = 0
426
+ for ii in range(e_size):
427
+ for jj in range(f_size):
428
+ x_idxs[c] = ii
429
+ y_idxs[c] = jj
430
+ c += 1
431
+ else:
432
+ # get random samples
433
+ x_idxs = np.random.choice(range(e_size), size=sample_size, replace=True).astype(np.int32)
434
+ y_idxs = np.random.choice(range(f_size), size=sample_size, replace=True).astype(np.int32)
435
+
436
+ # output
437
+ random_scores = np.empty(sample_size, dtype=np.float32)
438
+
439
+ score_path(x_idxs, y_idxs,
440
+ e_laser_norms, f_laser_norms,
441
+ e_laser, f_laser,
442
+ random_scores, )
443
+
444
+ min_score = 0
445
+ max_score = max(random_scores) # could bump this up... but its probably fine
446
+
447
+ else:
448
+ # Not much we can do here...
449
+ random_scores = np.array([0.0, 0.5, 1.0]) # ???
450
+ min_score = 0
451
+ max_score = 1 # ????
452
+
453
+ del_knob = DeletionKnob(random_scores, min_score, max_score)
454
+
455
+ return del_knob
456
+
457
+
458
+ def compute_norms(vecs0, vecs1, num_samples, overlaps_to_use=None):
459
+ # overlaps_to_use = 10 # 10 matches before
460
+
461
+ overlaps1, size1, dim = vecs1.shape
462
+ overlaps0, size0, dim0 = vecs0.shape
463
+ assert (dim == dim0)
464
+
465
+ if overlaps_to_use is not None:
466
+ if overlaps_to_use > overlaps1:
467
+ raise Exception('Cannot use more overlaps than provided. You may want to re-run make_verlaps.py with a larger -n value')
468
+ else:
469
+ overlaps_to_use = overlaps1
470
+
471
+ samps_per_overlap = ceil(num_samples / overlaps_to_use)
472
+
473
+ if size1 and samps_per_overlap:
474
+ # sample the other side (from all overlaps) to compare to this side
475
+ vecs1_rand_sample = np.empty((samps_per_overlap * overlaps_to_use, dim), dtype=np.float32)
476
+ for overlap_ii in range(overlaps_to_use):
477
+ idxs = np.random.choice(range(size1), size=samps_per_overlap, replace=True)
478
+ random_vecs = vecs1[overlap_ii, idxs, :]
479
+ vecs1_rand_sample[overlap_ii * samps_per_overlap:(overlap_ii + 1) * samps_per_overlap, :] = random_vecs
480
+
481
+ norms0 = np.empty((overlaps0, size0), dtype=np.float32)
482
+ for overlap_ii in range(overlaps0):
483
+ e_laser = vecs0[overlap_ii, :, :]
484
+ sim = np.matmul(e_laser, vecs1_rand_sample.T)
485
+ norms0[overlap_ii, :] = 1.0 - sim.mean(axis=1)
486
+
487
+ else: # no samples, no normalization
488
+ norms0 = np.ones((overlaps0, size0)).astype(np.float32)
489
+
490
+ return norms0
491
+
492
+
493
+ def downsample_vectors(vecs1):
494
+ a, b, c = vecs1.shape
495
+ half = np.empty((a, b // 2, c), dtype=np.float32)
496
+ for ii in range(a):
497
+ # average consecutive vectors
498
+ for jj in range(0, b - b % 2, 2):
499
+ v1 = vecs1[ii, jj, :]
500
+ v2 = vecs1[ii, jj + 1, :]
501
+ half[ii, jj // 2, :] = v1 + v2
502
+ # compute mean for all vectors
503
+ mean = np.mean(half[ii, :, :], axis=0)
504
+ for jj in range(0, b - b % 2, 2):
505
+ # remove mean
506
+ half[ii, jj // 2, :] = half[ii, jj // 2, :] - mean
507
+ # make vectors norm==1 so dot product is cosine distance
508
+ make_norm1(half)
509
+ return half
510
+
511
+
512
+ def vecalign(vecs0,
513
+ vecs1,
514
+ final_alignment_types,
515
+ del_percentile_frac,
516
+ width_over2,
517
+ max_size_full_dp,
518
+ costs_sample_size,
519
+ num_samps_for_norm,
520
+ norms0=None,
521
+ norms1=None):
522
+ if width_over2 < 3:
523
+ logger.warning('width_over2 was set to %d, which does not make sense. increasing to 3.', width_over2)
524
+ width_over2 = 3
525
+
526
+ # make sure input embeddings are norm==1
527
+ make_norm1(vecs0)
528
+ make_norm1(vecs1)
529
+
530
+ # save off runtime stats for summary
531
+ runtimes = OrderedDict()
532
+
533
+ # Determine stack depth
534
+ s0, s1 = vecs0.shape[1], vecs1.shape[1]
535
+ max_depth = 0
536
+ while s0 * s1 > max_size_full_dp ** 2:
537
+ max_depth += 1
538
+ s0 = s0 // 2
539
+ s1 = s1 // 2
540
+
541
+ # init recursion stack
542
+ # depth is 0-based (full size is 0, 1 is half, 2 is quarter, etc)
543
+ stack = {0: {'v0': vecs0, 'v1': vecs1}}
544
+
545
+ # downsample sentence vectors
546
+ t0 = time()
547
+ for depth in range(1, max_depth + 1):
548
+ stack[depth] = {'v0': downsample_vectors(stack[depth - 1]['v0']),
549
+ 'v1': downsample_vectors(stack[depth - 1]['v1'])}
550
+ runtimes['Downsample embeddings'] = time() - t0
551
+
552
+ # compute norms for all depths, add sizes, add alignment types
553
+ t0 = time()
554
+ for depth in stack:
555
+ stack[depth]['size0'] = stack[depth]['v0'].shape[1]
556
+ stack[depth]['size1'] = stack[depth]['v1'].shape[1]
557
+ stack[depth]['alignment_types'] = final_alignment_types if depth == 0 else [(1, 1)]
558
+
559
+ if depth == 0 and norms0 is not None:
560
+ if norms0.shape != vecs0.shape[:2]:
561
+ print('norms0.shape:', norms0.shape)
562
+ print('vecs0.shape[:2]:', vecs0.shape[:2])
563
+ raise Exception('norms0 wrong shape')
564
+ stack[depth]['n0'] = norms0
565
+ else:
566
+ stack[depth]['n0'] = compute_norms(stack[depth]['v0'], stack[depth]['v1'], num_samps_for_norm)
567
+
568
+ if depth == 0 and norms1 is not None:
569
+ if norms1.shape != vecs1.shape[:2]:
570
+ print('norms1.shape:', norms1.shape)
571
+ print('vecs1.shape[:2]:', vecs1.shape[:2])
572
+ raise Exception('norms1 wrong shape')
573
+ stack[depth]['n1'] = norms1
574
+ else:
575
+ stack[depth]['n1'] = compute_norms(stack[depth]['v1'], stack[depth]['v0'], num_samps_for_norm)
576
+
577
+ runtimes['Normalize embeddings'] = time() - t0
578
+
579
+ # Compute deletion penalty for all depths
580
+ t0 = time()
581
+ for depth in stack:
582
+ stack[depth]['del_knob'] = make_del_knob(e_laser=stack[depth]['v0'][0, :, :],
583
+ f_laser=stack[depth]['v1'][0, :, :],
584
+ e_laser_norms=stack[depth]['n0'][0, :],
585
+ f_laser_norms=stack[depth]['n1'][0, :],
586
+ sample_size=costs_sample_size)
587
+ stack[depth]['del_penalty'] = stack[depth]['del_knob'].percentile_frac_to_del_penalty(del_percentile_frac)
588
+ logger.debug('del_penalty at depth %d: %f', depth, stack[depth]['del_penalty'])
589
+ runtimes['Compute deletion penalties'] = time() - t0
590
+ tt = time() - t0
591
+ logger.debug('%d x %d full DP make features: %.6fs (%.3e per dot product)',
592
+ stack[max_depth]['size0'], stack[max_depth]['size1'], tt,
593
+ tt / (stack[max_depth]['size0'] + 1e-6) / (stack[max_depth]['size1'] + 1e-6))
594
+ # full DP at maximum recursion depth
595
+ t0 = time()
596
+ stack[max_depth]['costs_1to1'] = make_dense_costs(stack[max_depth]['v0'],
597
+ stack[max_depth]['v1'],
598
+ stack[max_depth]['n0'],
599
+ stack[max_depth]['n1'])
600
+
601
+ runtimes['Full DP make features'] = time() - t0
602
+ t0 = time()
603
+ _, stack[max_depth]['x_y_tb'] = dense_dp(stack[max_depth]['costs_1to1'], stack[max_depth]['del_penalty'])
604
+ stack[max_depth]['alignments'] = dense_traceback(stack[max_depth]['x_y_tb'])
605
+ runtimes['Full DP'] = time() - t0
606
+
607
+ # upsample the path up to the top resolution
608
+ compute_costs_times = []
609
+ dp_times = []
610
+ upsample_depths = [0, ] if max_depth == 0 else list(reversed(range(0, max_depth)))
611
+ for depth in upsample_depths:
612
+ if max_depth > 0: # upsample previous alignment to current resolution
613
+ course_alignments = upsample_alignment(stack[depth + 1]['alignments'])
614
+ # features may have been truncated when downsampling, so the alignment may need to be extended
615
+ extend_alignments(course_alignments, stack[depth]['size0'], stack[depth]['size1']) # in-place
616
+ else: # We did a full size 1-1 search, so search same size with more alignment types
617
+ course_alignments = stack[0]['alignments']
618
+
619
+ # convert coarse alignments to a search path
620
+ stack[depth]['searchpath'] = alignment_to_search_path(course_alignments)
621
+
622
+ # compute costs for sparse DP
623
+ t0 = time()
624
+ stack[depth]['a_b_costs'], stack[depth]['b_offset'] = make_sparse_costs(stack[depth]['v0'], stack[depth]['v1'],
625
+ stack[depth]['n0'], stack[depth]['n1'],
626
+ stack[depth]['searchpath'],
627
+ stack[depth]['alignment_types'],
628
+ width_over2)
629
+
630
+ tt = time() - t0
631
+ num_dot_products = len(stack[depth]['b_offset']) * len(stack[depth]['alignment_types']) * width_over2 * 2
632
+ logger.debug('%d x %d sparse DP (%d alignment types, %d window) make features: %.6fs (%.3e per dot product)',
633
+ stack[max_depth]['size0'], stack[max_depth]['size1'],
634
+ len(stack[depth]['alignment_types']), width_over2 * 2,
635
+ tt, tt / (num_dot_products + 1e-6))
636
+
637
+ compute_costs_times.append(time() - t0)
638
+ t0 = time()
639
+ # perform sparse DP
640
+ stack[depth]['a_b_csum'], stack[depth]['a_b_xp'], stack[depth]['a_b_yp'], \
641
+ stack[depth]['new_b_offset'] = sparse_dp(stack[depth]['a_b_costs'], stack[depth]['b_offset'],
642
+ stack[depth]['alignment_types'], stack[depth]['del_penalty'],
643
+ stack[depth]['size0'], stack[depth]['size1'])
644
+
645
+ # perform traceback to get alignments and alignment scores
646
+ # for debugging, avoid overwriting stack[depth]['alignments']
647
+ akey = 'final_alignments' if depth == 0 else 'alignments'
648
+ stack[depth][akey], stack[depth]['alignment_scores'] = sparse_traceback(stack[depth]['a_b_csum'],
649
+ stack[depth]['a_b_xp'],
650
+ stack[depth]['a_b_yp'],
651
+ stack[depth]['new_b_offset'],
652
+ stack[depth]['size0'],
653
+ stack[depth]['size1'])
654
+ dp_times.append(time() - t0)
655
+
656
+ runtimes['Upsample DP compute costs'] = sum(compute_costs_times[:-1])
657
+ runtimes['Upsample DP'] = sum(dp_times[:-1])
658
+
659
+ runtimes['Final DP compute costs'] = compute_costs_times[-1]
660
+ runtimes['Final DP'] = dp_times[-1]
661
+
662
+ # log time stats
663
+ max_key_str_len = max([len(key) for key in runtimes])
664
+ for key in runtimes:
665
+ if runtimes[key] > 5e-5:
666
+ logger.info(key + ' took ' + '.' * (max_key_str_len + 5 - len(key)) + ('%.4fs' % runtimes[key]).rjust(7))
667
+
668
+ return stack
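For orientation, the coarse-to-fine recursion above first halves both documents until a full dynamic program over the downsampled sizes fits under max_size_full_dp, then refines the path back up. A minimal sketch (illustrative only, not part of the repository) of the depth calculation, using the default limit of 300:

    # choose_max_depth is a hypothetical helper mirroring the while-loop in vecalign()
    def choose_max_depth(size0, size1, max_size_full_dp=300):
        depth = 0
        while size0 * size1 > max_size_full_dp ** 2:
            depth += 1
            size0 //= 2
            size1 //= 2
        return depth

    # Two documents of 5000 and 4000 sentences are halved four times,
    # so the full DP runs on roughly 312 x 250 downsampled vectors.
    print(choose_max_depth(5000, 4000))  # 4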
tibetan-aligner/get_vectors.py ADDED
@@ -0,0 +1,35 @@
1
+ import sys
2
+
3
+ from sentence_transformers import SentenceTransformer
4
+ import numpy as np
5
+
6
+ filename = sys.argv[1]
7
+ number_of_overlays = int(sys.argv[2]) + 1 # +1 because we want to include the original sentence
8
+
9
+ def process_file(filename):
10
+ model_path = "buddhist-nlp/bod-eng-similarity"
11
+ model = SentenceTransformer(model_path)
12
+
13
+ model.max_seq_length = 500
14
+ file = open(filename,'r')
15
+
16
+ sentences = [line.rstrip('\n').strip() for line in file]
17
+ sentences_overlay = []
18
+
19
+ for x in range(len(sentences)):
20
+ val = number_of_overlays
21
+ if (len(sentences) - x) < val:
22
+ val = (len(sentences) - x) + 1
23
+ for i in range(1,val):
24
+ sentences_overlay.append(' '.join(sentences[x:x+i]))
25
+ overlay_string = "\n".join(sentences_overlay)
26
+ vectors = np.array(model.encode(sentences_overlay,show_progress_bar=False))
27
+ print("LEN SENTENCES",len(sentences_overlay))
28
+ print("LEN VECTORS",len(vectors))
29
+ with open(sys.argv[1] + "_overlay", "w") as text_file:
30
+ text_file.write(overlay_string)
31
+
32
+ np.save(sys.argv[1] + "_vectors",vectors)
33
+
34
+ process_file(filename)
35
+
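get_vectors.py concatenates each sentence with up to N-1 of its successors so that the aligner can score 1-many and many-1 groupings. A small worked example (illustrative only) with number_of_overlays = 3, i.e. the script invoked with a second argument of 2:

    sentences = ["ka", "kha", "ga", "nga"]
    overlays = []
    for x in range(len(sentences)):
        val = 3
        if (len(sentences) - x) < val:
            val = (len(sentences) - x) + 1
        for i in range(1, val):
            overlays.append(' '.join(sentences[x:x + i]))
    print(overlays)
    # ['ka', 'ka kha', 'kha', 'kha ga', 'ga', 'ga nga', 'nga']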
tibetan-aligner/ladder ADDED
@@ -0,0 +1,11 @@
1
+ LINE EMBEDDINGS SHAPE (15, 768)
2
+ LINE EMBEDDINGS SHAPE (87, 768)
3
+ [0]:[0]:0.264225
4
+ [1, 2]:[1]:0.354184
5
+ []:[2]:0.000000
6
+ []:[3]:0.000000
7
+ []:[4]:0.000000
8
+ []:[5]:0.000000
9
+ [3]:[6, 7, 8, 9, 10]:0.404515
10
+ []:[11]:0.000000
11
+ [4]:[12, 13, 14, 15, 16]:0.280724
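Each ladder line maps a list of source line indices to a list of target line indices, followed by the alignment score; empty brackets mark lines left unaligned, and a score of 0.000000 marks a deletion/insertion. A quick illustrative parse of one entry:

    line = "[1, 2]:[1]:0.354184"
    src, tgt, score = line.split(':')
    # src  -> '[1, 2]'   (source lines 1 and 2 align to target line 1)
    # float(score) -> 0.354184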
tibetan-aligner/ladder2org.py ADDED
@@ -0,0 +1,47 @@
1
+ import sys
2
+ import re
4
+ f1 = open(sys.argv[1],'r')
5
+ f2 = open(sys.argv[2],'r')
6
+ ladder_file = open(sys.argv[3],'r')
7
+
8
+
9
+ output = ""
10
+ ladder = []
11
+ sktfile = [line.rstrip('\n').strip() for line in f1]
12
+ tibfile = [line.rstrip('\n').strip() for line in f2]
13
+ last_score = 0.5
14
+
15
+ def clean_num(string):
16
+ string = re.sub("[^0-9, ]","",string)
17
+ return int(string.split(',')[0])
18
+
19
+
20
+ for line in ladder_file:
21
+ if len(line.split(':')) == 3:
22
+ skt,tib,score = line.split(':')
23
+ if re.search("[0-9]",skt) and re.search("[0-9]",tib):
24
+ skt_num = clean_num(skt)
25
+ tib_num = clean_num(tib)
26
+ score = float(score)
27
+ if score > 0.0:
28
+ ladder.append([skt_num,tib_num,score])
29
+ last_skt = 0
30
+ last_tib = 0
31
+ for entry in ladder:
32
+ output = output + ' +$+ '.join(sktfile[last_skt:entry[0]]) + "\n"
33
+ output = output + "# " + ' +!+ '.join(tibfile[last_tib:entry[1]]) + "\n" #+ "\t" + " SCORE: " + str(entry[2]) + "\n"
34
+ last_skt = entry[0]
35
+ last_tib = entry[1]
36
+ output = output + ' / '.join(sktfile[last_skt:-1]) + "\n"
37
+ output = output + "# " + ' / '.join(tibfile[last_tib:-1]) + "\n"
38
+
39
+ short_f1 = re.sub("\.tsv.*","",sys.argv[1])
40
+ short_f2 = re.sub(".*/","",sys.argv[2])
41
+ short_f2 = re.sub("\.tsv.*","",short_f2)
42
+
43
+ with open(short_f1 + "_" + short_f2 + ".org", 'w') as file:
44
+ file.write(output)
45
+
46
+
47
+
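A hypothetical invocation of ladder2org.py (file names are placeholders): "python ladder2org.py text-skt.tsv text-bo.tsv ladder" would write text-skt_text-bo.org, with each block of source lines joined by ' +$+ ' and the corresponding target lines, prefixed by '# ', joined by ' +!+ ' on the following line.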
tibetan-aligner/model_to_hub.py ADDED
@@ -0,0 +1,7 @@
1
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
2
+ model_path = "model"
3
+ model = AutoModelForSequenceClassification.from_pretrained(model_path)
4
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
5
+ # push model and tokenizer to huggingface hub
6
+ model.push_to_hub("buddhist-nlp/bod-eng-similarity")
7
+ tokenizer.push_to_hub("buddhist-nlp/bod-eng-similarity")
tibetan-aligner/requirements.txt ADDED
@@ -0,0 +1,3 @@
1
+ sentence-transformers==2.2.2
2
+ pyewts==0.2.0
3
+ Cython==0.29.34
tibetan-aligner/score.py ADDED
@@ -0,0 +1,170 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+ Copyright 2019 Brian Thompson
5
+
6
+ Licensed under the Apache License, Version 2.0 (the "License");
7
+ you may not use this file except in compliance with the License.
8
+ You may obtain a copy of the License at
9
+
10
+ https://www.apache.org/licenses/LICENSE-2.0
11
+
12
+ Unless required by applicable law or agreed to in writing, software
13
+ distributed under the License is distributed on an "AS IS" BASIS,
14
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ See the License for the specific language governing permissions and
16
+ limitations under the License.
17
+
18
+ """
19
+
20
+ import argparse
21
+ import sys
22
+ from collections import defaultdict
23
+
24
+ import numpy as np
25
+
26
+ from dp_utils import read_alignments
27
+
28
+ """
29
+ Faster implementation of lax and strict precision and recall, based on
30
+ https://www.aclweb.org/anthology/W11-4624/.
31
+
32
+ """
33
+
34
+
35
+ def _precision(goldalign, testalign):
36
+ """
37
+ Computes tpstrict, fpstrict, tplax, fplax for gold/test alignments
38
+ """
39
+ tpstrict = 0 # true positive strict counter
40
+ tplax = 0 # true positive lax counter
41
+ fpstrict = 0 # false positive strict counter
42
+ fplax = 0 # false positive lax counter
43
+
44
+ # convert to sets, remove alignments empty on both sides
45
+ testalign = set([(tuple(x), tuple(y)) for x, y in testalign if len(x) or len(y)])
46
+ goldalign = set([(tuple(x), tuple(y)) for x, y in goldalign if len(x) or len(y)])
47
+
48
+ # mappings from source test sentence idxs to
49
+ # target gold sentence idxs for which the source test sentence
50
+ # was found in corresponding source gold alignment
51
+ src_id_to_gold_tgt_ids = defaultdict(set)
52
+ for gold_src, gold_tgt in goldalign:
53
+ for gold_src_id in gold_src:
54
+ for gold_tgt_id in gold_tgt:
55
+ src_id_to_gold_tgt_ids[gold_src_id].add(gold_tgt_id)
56
+
57
+ for (test_src, test_target) in testalign:
58
+ if (test_src, test_target) == ((), ()):
59
+ continue
60
+ if (test_src, test_target) in goldalign:
61
+ # strict match
62
+ tpstrict += 1
63
+ tplax += 1
64
+ else:
65
+ # For anything with partial gold/test overlap on the source,
66
+ # see if there is also partial overlap on the gold/test target
67
+ # If so, it's a lax match
68
+ target_ids = set()
69
+ for src_test_id in test_src:
70
+ for tgt_id in src_id_to_gold_tgt_ids[src_test_id]:
71
+ target_ids.add(tgt_id)
72
+ if set(test_target).intersection(target_ids):
73
+ fpstrict += 1
74
+ tplax += 1
75
+ else:
76
+ fpstrict += 1
77
+ fplax += 1
78
+
79
+ return np.array([tpstrict, fpstrict, tplax, fplax], dtype=np.int32)
80
+
81
+
82
+ def score_multiple(gold_list, test_list, value_for_div_by_0=0.0):
83
+ # accumulate counts for all gold/test files
84
+ pcounts = np.array([0, 0, 0, 0], dtype=np.int32)
85
+ rcounts = np.array([0, 0, 0, 0], dtype=np.int32)
86
+ for goldalign, testalign in zip(gold_list, test_list):
87
+ pcounts += _precision(goldalign=goldalign, testalign=testalign)
88
+ # recall is precision with no insertion/deletion and swap args
89
+ test_no_del = [(x, y) for x, y in testalign if len(x) and len(y)]
90
+ gold_no_del = [(x, y) for x, y in goldalign if len(x) and len(y)]
91
+ rcounts += _precision(goldalign=test_no_del, testalign=gold_no_del)
92
+
93
+ # Compute results
94
+ # pcounts: tpstrict, fpstrict, tplax, fplax
95
+ # rcounts: tpstrict, fnstrict, tplax, fnlax
96
+
97
+ if pcounts[0] + pcounts[1] == 0:
98
+ pstrict = value_for_div_by_0
99
+ else:
100
+ pstrict = pcounts[0] / float(pcounts[0] + pcounts[1])
101
+
102
+ if pcounts[2] + pcounts[3] == 0:
103
+ plax = value_for_div_by_0
104
+ else:
105
+ plax = pcounts[2] / float(pcounts[2] + pcounts[3])
106
+
107
+ if rcounts[0] + rcounts[1] == 0:
108
+ rstrict = value_for_div_by_0
109
+ else:
110
+ rstrict = rcounts[0] / float(rcounts[0] + rcounts[1])
111
+
112
+ if rcounts[2] + rcounts[3] == 0:
113
+ rlax = value_for_div_by_0
114
+ else:
115
+ rlax = rcounts[2] / float(rcounts[2] + rcounts[3])
116
+
117
+ if (pstrict + rstrict) == 0:
118
+ fstrict = value_for_div_by_0
119
+ else:
120
+ fstrict = 2 * (pstrict * rstrict) / (pstrict + rstrict)
121
+
122
+ if (plax + rlax) == 0:
123
+ flax = value_for_div_by_0
124
+ else:
125
+ flax = 2 * (plax * rlax) / (plax + rlax)
126
+
127
+ result = dict(recall_strict=rstrict,
128
+ recall_lax=rlax,
129
+ precision_strict=pstrict,
130
+ precision_lax=plax,
131
+ f1_strict=fstrict,
132
+ f1_lax=flax)
133
+
134
+ return result
135
+
136
+
137
+ def log_final_scores(res):
138
+ print(' ---------------------------------', file=sys.stderr)
139
+ print('| | Strict | Lax |', file=sys.stderr)
140
+ print('| Precision | {precision_strict:.3f} | {precision_lax:.3f} |'.format(**res), file=sys.stderr)
141
+ print('| Recall | {recall_strict:.3f} | {recall_lax:.3f} |'.format(**res), file=sys.stderr)
142
+ print('| F1 | {f1_strict:.3f} | {f1_lax:.3f} |'.format(**res), file=sys.stderr)
143
+ print(' ---------------------------------', file=sys.stderr)
144
+
145
+
146
+ def main():
147
+ parser = argparse.ArgumentParser(
148
+ 'Compute strict/lax precision and recall for one or more pairs of gold/test alignments',
149
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
150
+
151
+ parser.add_argument('-t', '--test', type=str, nargs='+', required=True,
152
+ help='one or more test alignment files')
153
+
154
+ parser.add_argument('-g', '--gold', type=str, nargs='+', required=True,
155
+ help='one or more gold alignment files')
156
+
157
+ args = parser.parse_args()
158
+
159
+ if len(args.test) != len(args.gold):
160
+ raise Exception('number of gold/test files must be the same')
161
+
162
+ gold_list = [read_alignments(x) for x in args.gold]
163
+ test_list = [read_alignments(x) for x in args.test]
164
+
165
+ res = score_multiple(gold_list=gold_list, test_list=test_list)
166
+ log_final_scores(res)
167
+
168
+
169
+ if __name__ == '__main__':
170
+ main()
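To make the strict/lax distinction concrete, here is a small worked example (illustrative only, assuming score.py is importable); alignments are (source indices, target indices) pairs as produced by read_alignments:

    from score import score_multiple

    gold = [[([0], [0, 1]), ([1], [2])]]   # one document's gold alignment
    test = [[([0], [0]),    ([1], [2])]]   # system output for the same document

    # ([1], [2]) matches gold exactly         -> strict and lax true positive
    # ([0], [0]) only overlaps ([0], [0, 1])  -> strict false positive, lax true positive
    res = score_multiple(gold_list=gold, test_list=test)
    print(res['precision_strict'], res['precision_lax'])  # 0.5 1.0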
tibetan-aligner/vecalign.py ADDED
@@ -0,0 +1,148 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+ Copyright 2019 Brian Thompson
5
+
6
+ Licensed under the Apache License, Version 2.0 (the "License");
7
+ you may not use this file except in compliance with the License.
8
+ You may obtain a copy of the License at
9
+
10
+ https://www.apache.org/licenses/LICENSE-2.0
11
+
12
+ Unless required by applicable law or agreed to in writing, software
13
+ distributed under the License is distributed on an "AS IS" BASIS,
14
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ See the License for the specific language governing permissions and
16
+ limitations under the License.
17
+ """
18
+
19
+ import argparse
20
+ import logging
21
+ import pickle
22
+ from math import ceil
23
+ from random import seed as seed
24
+
25
+ import numpy as np
26
+
27
+ logger = logging.getLogger('vecalign')
28
+ logger.setLevel(logging.WARNING)
29
+ logFormatter = logging.Formatter("%(asctime)s %(levelname)-5.5s %(message)s")
30
+ consoleHandler = logging.StreamHandler()
31
+ consoleHandler.setFormatter(logFormatter)
32
+ logger.addHandler(consoleHandler)
33
+
34
+ from dp_utils import make_alignment_types, print_alignments, read_alignments, \
35
+ read_in_embeddings, make_doc_embedding, vecalign
36
+
37
+ from score import score_multiple, log_final_scores
38
+
39
+
40
+ def _main():
41
+ # make runs consistent
42
+ seed(42)
43
+ np.random.seed(42)
44
+
45
+ parser = argparse.ArgumentParser('Sentence alignment using sentence embeddings and FastDTW',
46
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
47
+
48
+ parser.add_argument('-s', '--src', type=str, nargs='+', required=True,
49
+ help='preprocessed source file to align')
50
+
51
+ parser.add_argument('-t', '--tgt', type=str, nargs='+', required=True,
52
+ help='preprocessed target file to align')
53
+
54
+ parser.add_argument('-g', '--gold_alignment', type=str, nargs='+', required=False,
55
+ help='gold alignment file(s) for scoring the output (optional)')
56
+
57
+ parser.add_argument('--src_embed', type=str, nargs=2, required=True,
58
+ help='Source embeddings. Requires two arguments: the first is a text file, the second is a binary embeddings file.')
59
+
60
+ parser.add_argument('--tgt_embed', type=str, nargs=2, required=True,
61
+ help='Target embeddings. Requires two arguments: the first is a text file, the second is a binary embeddings file.')
62
+
63
+ parser.add_argument('-a', '--alignment_max_size', type=int, default=4,
64
+ help='Searches for alignments up to size N-M, where N+M <= this value. Note that the embeddings must support the requested number of overlaps.')
65
+
66
+ parser.add_argument('-d', '--del_percentile_frac', type=float, default=0.2,
67
+ help='Deletion penalty is set to this percentile (as a fraction) of the cost matrix distribution. Should be between 0 and 1.')
68
+
69
+ parser.add_argument('-v', '--verbose', help='sets console logging to INFO instead of WARNING',
70
+ action='store_true')
71
+
72
+ parser.add_argument('--max_size_full_dp', type=int, default=300, # org: 300
73
+ help='Maximum size N for which it is acceptable to run full N^2 dynamic programming.')
74
+
75
+ parser.add_argument('--costs_sample_size', type=int, default=20000,
76
+ help='Sample size to estimate the costs distribution, used to set the deletion penalty in conjunction with del_percentile_frac.')
77
+
78
+ parser.add_argument('--num_samps_for_norm', type=int, default=100, # org 100
79
+ help='Number of samples used for normalizing embeddings')
80
+
81
+ parser.add_argument('--search_buffer_size', type=int, default=5,
82
+ help='Width (one side) of search buffer. Larger values make the search more likely to recover from errors but increase runtime.')
83
+
84
+ parser.add_argument('--debug_save_stack', type=str,
85
+ help='Write stack to pickle file for debug purposes')
86
+
87
+ args = parser.parse_args()
88
+
89
+ if len(args.src) != len(args.tgt):
90
+ raise Exception('number of source files must match number of target files')
91
+
92
+ if args.gold_alignment is not None:
93
+ if len(args.gold_alignment) != len(args.src):
94
+ raise Exception('number of gold alignment files, if provided, must match number of source and target files')
95
+
96
+ if args.verbose:
97
+ logger.setLevel(logging.INFO)
99
+
100
+ if args.alignment_max_size < 2:
101
+ logger.warning('Alignment_max_size < 2. Increasing to 2 so that 1-1 alignments will be considered')
102
+ args.alignment_max_size = 2
103
+
104
+ src_sent2line, src_line_embeddings = read_in_embeddings(args.src_embed[0], args.src_embed[1])
105
+ tgt_sent2line, tgt_line_embeddings = read_in_embeddings(args.tgt_embed[0], args.tgt_embed[1])
106
+
107
+ width_over2 = ceil(args.alignment_max_size / 2.0) + args.search_buffer_size
108
+
109
+ test_alignments = []
110
+ stack_list = []
111
+ for src_file, tgt_file in zip(args.src, args.tgt):
112
+ logger.info('Aligning src="%s" to tgt="%s"', src_file, tgt_file)
113
+
114
+ src_lines = open(src_file, 'rt', encoding="utf-8").readlines()
115
+ vecs0 = make_doc_embedding(src_sent2line, src_line_embeddings, src_lines, args.alignment_max_size)
116
+
117
+ tgt_lines = open(tgt_file, 'rt', encoding="utf-8").readlines()
118
+ vecs1 = make_doc_embedding(tgt_sent2line, tgt_line_embeddings, tgt_lines, args.alignment_max_size)
119
+
120
+ final_alignment_types = make_alignment_types(args.alignment_max_size)
121
+ logger.debug('Considering alignment types %s', final_alignment_types)
122
+
123
+ stack = vecalign(vecs0=vecs0,
124
+ vecs1=vecs1,
125
+ final_alignment_types=final_alignment_types,
126
+ del_percentile_frac=args.del_percentile_frac,
127
+ width_over2=width_over2,
128
+ max_size_full_dp=args.max_size_full_dp,
129
+ costs_sample_size=args.costs_sample_size,
130
+ num_samps_for_norm=args.num_samps_for_norm)
131
+
132
+ # write final alignments to stdout
133
+ print_alignments(stack[0]['final_alignments'], stack[0]['alignment_scores'])
134
+
135
+ test_alignments.append(stack[0]['final_alignments'])
136
+ stack_list.append(stack)
137
+
138
+ if args.gold_alignment is not None:
139
+ gold_list = [read_alignments(x) for x in args.gold_alignment]
140
+ res = score_multiple(gold_list=gold_list, test_list=test_alignments)
141
+ log_final_scores(res)
142
+
143
+ if args.debug_save_stack:
144
+ pickle.dump(stack_list, open(args.debug_save_stack, 'wb'))
145
+
146
+
147
+ if __name__ == '__main__':
148
+ _main()
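For reference, a hypothetical end-to-end invocation (paths are placeholders); the *_overlay and *_vectors.npy files are the ones written by get_vectors.py, the overlap count passed to get_vectors.py must cover the requested --alignment_max_size, and the ladder output is redirected from stdout:

    python get_vectors.py text-bo.txt 4
    python get_vectors.py text-en.txt 4
    python vecalign.py --src text-bo.txt --tgt text-en.txt \
        --src_embed text-bo.txt_overlay text-bo.txt_vectors.npy \
        --tgt_embed text-en.txt_overlay text-en.txt_vectors.npy \
        --alignment_max_size 4 > ladder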
tm.py ADDED
@@ -0,0 +1,169 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ import subprocess
5
+ import sys
6
+ import tempfile
7
+ import time
8
+ from pathlib import Path
9
+ from typing import Dict
10
+
11
+ import requests
12
+
13
+ GITHUB_USERNAME = os.getenv("GITHUB_USERNAME")
14
+ GITHUB_ACCESS_TOKEN = os.getenv("GITHUB_TOKEN")
15
+ GITHUB_EMAIL = os.getenv("GITHUB_EMAIL")
16
+ GITHUB_ORG = os.getenv("MAI_GITHUB_ORG")
17
+ MAI_TM_PUBLISH_TODO_REPO = os.environ["MAI_TMS_PUBLISH_TODO_REPO"]
18
+ GITHUB_API_ENDPOINT = f"https://api.github.com/orgs/{GITHUB_ORG}/repos"
19
+
20
+ DEBUG = os.getenv("DEBUG", False)
21
+
22
+ quiet = "-q" if DEBUG else ""
23
+
24
+
25
+ def create_github_repo(repo_path: Path, repo_name: str):
26
+ logging.info("[INFO] Creating GitHub repo...")
27
+
28
+ # configure git users
29
+ subprocess.run(f"git config --global user.name {GITHUB_USERNAME}".split())
30
+ subprocess.run(f"git config --global user.email {GITHUB_EMAIL}".split())
31
+
32
+ # Initialize a Git repository
33
+ subprocess.run(f"git init {quiet}".split(), cwd=str(repo_path))
34
+
35
+ # Commit the changes
36
+ subprocess.run("git add . ".split(), cwd=str(repo_path))
37
+ subprocess.run(
38
+ f"git commit {quiet} -m".split() + ["Initial commit"], cwd=str(repo_path)
39
+ )
40
+
41
+ # Create a new repository on GitHub
42
+ response = requests.post(
43
+ GITHUB_API_ENDPOINT,
44
+ json={
45
+ "name": repo_name,
46
+ "private": True,
47
+ },
48
+ auth=(GITHUB_USERNAME, GITHUB_ACCESS_TOKEN),
49
+ )
50
+ response.raise_for_status()
51
+
52
+ time.sleep(3)
53
+
54
+ # Add the GitHub remote to the local Git repository and push the changes
55
+ remote_url = f"https://{GITHUB_ORG}:{GITHUB_ACCESS_TOKEN}@github.com/{GITHUB_ORG}/{repo_name}.git"
56
+ subprocess.run(
57
+ f"git remote add origin {remote_url}", cwd=str(repo_path), shell=True
58
+ )
59
+ # rename default branch to main
60
+ subprocess.run("git branch -M main".split(), cwd=str(repo_path))
61
+ subprocess.run(f"git push {quiet} -u origin main".split(), cwd=str(repo_path))
62
+
63
+ return response.json()["html_url"]
64
+
65
+
66
+ def convert_raw_align_to_tm(align_fn: Path, tm_path: Path):
67
+ if DEBUG:
68
+ logging.debug("[INFO] Conerting raw alignment to TM repo...")
69
+
70
+ def load_alignment(fn: Path):
71
+ content = fn.read_text()
72
+ if not content:
73
+ return []
74
+
75
+ for seg_pair in content.splitlines():
76
+ if not seg_pair:
77
+ continue
78
+
79
+ if "\t" in seg_pair:
80
+ try:
81
+ bo_seg, en_seg = seg_pair.split("\t", 1)
82
+ except Exception as e:
83
+ logging.error(f"{e} in {fn}")
84
+ raise
85
+
86
+ else:
87
+ bo_seg = seg_pair
88
+ en_seg = "\n"
89
+ yield bo_seg, en_seg
90
+
91
+ text_bo_fn = tm_path / f"{tm_path.name}-bo.txt"
92
+ text_en_fn = tm_path / f"{tm_path.name}-en.txt"
93
+
94
+ with open(text_bo_fn, "w", encoding="utf-8") as bo_file, open(
95
+ text_en_fn, "w", encoding="utf-8"
96
+ ) as en_file:
97
+ for bo_seg, en_seg in load_alignment(align_fn):
98
+ bo_file.write(bo_seg + "\n")
99
+ en_file.write(en_seg + "\n")
100
+
101
+ return tm_path
102
+
103
+
104
+ def get_github_dev_url(raw_github_url: str) -> str:
105
+ base_url = "https://github.dev"
106
+ _, file_path = raw_github_url.split(".com")
107
+ blob_file_path = file_path.replace("main", "blob/main")
108
+ return base_url + blob_file_path
109
+
110
+
111
+ def add_input_in_readme(input_dict: Dict[str, str], path: Path) -> Path:
112
+ input_readme_fn = path / "README.md"
113
+ text_id = input_dict["text_id"]
114
+ bo_file_url = get_github_dev_url(input_dict["bo_file_url"])
115
+ en_file_url = get_github_dev_url(input_dict["en_file_url"])
116
+ input_string = "## Input\n- [BO{}]({})\n- [EN{}]({})".format(
117
+ text_id, bo_file_url, text_id, en_file_url
118
+ )
119
+
120
+ input_readme_fn.write_text(input_string)
121
+
122
+ return path
123
+
124
+ def add_to_publish_todo_repo(org, repo_name, file_path, access_token):
125
+ base_url = f"https://api.github.com/repos/{org}/{repo_name}/contents/"
126
+
127
+ headers = {
128
+ "Authorization": f"Bearer {access_token}",
129
+ "Accept": "application/vnd.github.v3+json",
130
+ }
131
+
132
+ url = base_url + file_path
133
+
134
+ response = requests.get(url, headers=headers)
135
+
136
+ if response.status_code == 200:
137
+ print(f"[INFO] '{file_path}' already added.")
138
+ return
139
+
140
+ payload = {"message": f"Add {file_path}", "content": ""}
141
+
142
+ response = requests.put(url, headers=headers, json=payload)
143
+
144
+ if response.status_code == 201:
145
+ print(f"[INFO] '{file_path}' added to publish todo")
146
+ else:
147
+ print(f"[ERROR] Failed to add '{file_path}'.")
148
+ print(f"[ERROR] Response: {response.text}")
149
+
150
+
151
+ def create_tm(align_fn: Path, text_pair: Dict[str, str]):
152
+ align_fn = Path(align_fn)
153
+ text_id = text_pair["text_id"]
154
+ with tempfile.TemporaryDirectory() as tmp_dir:
155
+ output_dir = Path(tmp_dir)
156
+ repo_name = f"TM{text_id}"
157
+ tm_path = output_dir / repo_name
158
+ tm_path.mkdir(exist_ok=True, parents=True)
159
+ repo_path = convert_raw_align_to_tm(align_fn, tm_path)
160
+ repo_path = add_input_in_readme(text_pair, tm_path)
161
+ repo_url = create_github_repo(repo_path, repo_name)
162
+ logging.info(f"TM repo created: {repo_url}")
163
+ add_to_publish_todo_repo(GITHUB_ORG, MAI_TM_PUBLISH_TODO_REPO, repo_name, GITHUB_ACCESS_TOKEN)
164
+ return repo_url
165
+
166
+
167
+ if __name__ == "__main__":
168
+ align_fn = Path(sys.argv[1])
+ # create_tm() also needs the text_pair metadata; it is assumed to arrive as a
+ # JSON string in the second argument (the otherwise-unused json import above
+ # supports this reading).
+ text_pair = json.loads(sys.argv[2])
+ create_tm(align_fn, text_pair)
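Finally, a hypothetical direct call to create_tm (all values are placeholders); the text_pair dict needs the keys consumed by add_input_in_readme, and the environment variables read at the top of tm.py (GITHUB_USERNAME, GITHUB_TOKEN, GITHUB_EMAIL, MAI_GITHUB_ORG, MAI_TMS_PUBLISH_TODO_REPO) must be set:

    from tm import create_tm

    text_pair = {
        "text_id": "0001",
        "bo_file_url": "https://github.com/<org>/<repo>/main/BO0001.txt",
        "en_file_url": "https://github.com/<org>/<repo>/main/EN0001.txt",
    }
    # the alignment file is expected to hold tab-separated "bo<TAB>en" segment pairs
    create_tm("BO0001-EN0001-alignment.txt", text_pair)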