Spaces · Tibetan Aligner (Sleeping)

Commit 1a3c007, committed by 10zinten

Duplicate from openpecha/tibetan-aligner-api
Browse files
- .gitattributes +34 -0
- .gitignore +165 -0
- README.md +14 -0
- app.py +115 -0
- flagged/file_urls/tmpfuhgfj7m.json +1 -0
- flagged/log.csv +2 -0
- import_tibetan_aligner_source.py +26 -0
- requirements.txt +5 -0
- tibetan-aligner/README.md +5 -0
- tibetan-aligner/align_tib_en.sh +43 -0
- tibetan-aligner/convert_to_wylie.py +17 -0
- tibetan-aligner/create_train.py +60 -0
- tibetan-aligner/create_train_clean.py +37 -0
- tibetan-aligner/dp_core.cpython-310-x86_64-linux-gnu.so.reload1 +0 -0
- tibetan-aligner/dp_core.cpython-39-darwin.so.reload1 +0 -0
- tibetan-aligner/dp_core.pyx +411 -0
- tibetan-aligner/dp_utils.py +668 -0
- tibetan-aligner/get_vectors.py +35 -0
- tibetan-aligner/ladder +11 -0
- tibetan-aligner/ladder2org.py +47 -0
- tibetan-aligner/model_to_hub.py +7 -0
- tibetan-aligner/requirements.txt +3 -0
- tibetan-aligner/score.py +170 -0
- tibetan-aligner/vecalign.py +148 -0
- tm.py +169 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,165 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

*/ladder
data
README.md
ADDED
@@ -0,0 +1,14 @@
---
title: Tibetan Aligner
emoji: 📖
colorFrom: blue
colorTo: purple
sdk: gradio
sdk_version: 3.34.0
app_file: app.py
pinned: true
license: mit
duplicated_from: openpecha/tibetan-aligner-api
---

DISCLAIMER: This space has been created solely for testing and educational purposes. We do not claim any ownership or copyright over the align-tibetan script, which remains the sole property of its original creator, Sebastian Nehrlich. We have created this space to facilitate the use and testing of the align-tibetan script for interested users. If you use the align-tibetan script for any commercial or production purposes, we strongly encourage you to obtain permission from the original creator and comply with any relevant licensing requirements.
app.py
ADDED
@@ -0,0 +1,115 @@
import logging
import os
import re
import shutil
import stat
import subprocess
import time
import uuid
from contextlib import contextmanager
from pathlib import Path

import gradio as gr
import requests

from tm import create_tm

logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)

GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

ALIGNER_SCRIPT_DIR = Path("./tibetan-aligner").resolve()
ALIGNER_SCRIPT_NAME = "align_tib_en.sh"
ALIGNER_SCRIPT_PATH = ALIGNER_SCRIPT_DIR / ALIGNER_SCRIPT_NAME
assert ALIGNER_SCRIPT_PATH.is_file()


def make_dir_executable(dir_path: Path):
    for fn in dir_path.iterdir():
        st = os.stat(fn)
        os.chmod(fn, st.st_mode | stat.S_IEXEC)
        st = os.stat(fn)
        os.chmod(fn, st.st_mode | stat.S_IXGRP)
        st = os.stat(fn)
        os.chmod(fn, st.st_mode | stat.S_IXOTH)


make_dir_executable(ALIGNER_SCRIPT_DIR)


@contextmanager
def TemporaryDirectory():
    tmpdir = Path("./output").resolve() / uuid.uuid4().hex[:8]
    tmpdir.mkdir(exist_ok=True, parents=True)
    try:
        yield tmpdir
    finally:
        shutil.rmtree(str(tmpdir))


def download_file(github_file_url: str, output_fn) -> Path:
    """Download file from github"""
    headers = {
        "Authorization": f"token {GITHUB_TOKEN}",
        "Accept": "application/vnd.github+json",
    }
    authenticated_file_url = f"{github_file_url}?token={GITHUB_TOKEN}"
    with requests.get(authenticated_file_url, headers=headers, stream=True) as r:
        r.raise_for_status()
        with open(output_fn, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return output_fn


def _run_align_script(bo_fn, en_fn, output_dir):
    start = time.time()
    cmd = [str(ALIGNER_SCRIPT_PATH), str(bo_fn), str(en_fn), str(output_dir)]
    output = subprocess.run(
        cmd,
        check=True,
        capture_output=True,
        text=True,
        cwd=str(ALIGNER_SCRIPT_DIR),
    )
    output_fn = re.search(r"\[OUTPUT\] (.*)", output.stdout).group(1)
    output_fn = "/" + output_fn.split("//")[-1]
    end = time.time()
    total_time = round((end - start) / 60, 2)
    logging.info(f"Total time taken for Aligning: {total_time} mins")
    return output_fn


def align(text_pair):
    logging.info(f"Running aligner for TM{text_pair['text_id']}...")
    with TemporaryDirectory() as tmpdir:
        output_dir = Path(tmpdir)
        bo_fn = download_file(text_pair["bo_file_url"], output_fn=output_dir / "bo.tx")
        en_fn = download_file(text_pair["en_file_url"], output_fn=output_dir / "en.tx")
        aligned_fn = _run_align_script(bo_fn, en_fn, output_dir)
        repo_url = create_tm(aligned_fn, text_pair=text_pair)
        return {"tm_repo_url": repo_url}


with gr.Blocks() as demo:
    gr.Markdown("## Tibetan-English Aligner API")
    gr.Markdown("Please use Via API")
    input = gr.JSON(
        # value={
        #     "text_id": f"{uuid.uuid4().hex[:4]}",
        #     "bo_file_url": "https://raw.githubusercontent.com/OpenPecha/tibetan-aligner/main/tests/data/text-bo.txt",
        #     "en_file_url": "https://raw.githubusercontent.com/OpenPecha/tibetan-aligner/main/tests/data/text-en.txt",
        # }
    )
    output = gr.JSON()
    align_btn = gr.Button("Align")
    align_btn.click(
        fn=align,
        inputs=input,
        outputs=output,
        api_name="align",
    )


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True, debug=True)
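Since app.py registers the align function under api_name="align" and the UI itself says "Please use Via API", the Space is meant to be called programmatically. Below is a minimal sketch of such a call, assuming the gradio_client package (not listed in requirements.txt here) and using the duplicated-from Space id purely as a placeholder; the payload keys mirror what align() reads, and the sample URLs come from the commented-out example in app.py.

# Sketch only: gradio_client is an assumption, the Space id is a placeholder.
from gradio_client import Client

client = Client("openpecha/tibetan-aligner-api")  # placeholder Space id
result = client.predict(
    {
        "text_id": "demo",
        "bo_file_url": "https://raw.githubusercontent.com/OpenPecha/tibetan-aligner/main/tests/data/text-bo.txt",
        "en_file_url": "https://raw.githubusercontent.com/OpenPecha/tibetan-aligner/main/tests/data/text-en.txt",
    },
    api_name="/align",
)
print(result)  # expected shape: {"tm_repo_url": "..."}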
flagged/file_urls/tmpfuhgfj7m.json
ADDED
@@ -0,0 +1 @@
{"bo_file_url": "https://raw.githubusercontent.com/OpenPecha/tibetan-aligner/main/tests/data/text-bo.txt", "en_file_url": "https://raw.githubusercontent.com/OpenPecha/tibetan-aligner/main/tests/data/text-en.txt"}
flagged/log.csv
ADDED
@@ -0,0 +1,2 @@
file_urls,output,flag,username,timestamp
/home/user/app/flagged/file_urls/tmpfuhgfj7m.json,,,,2023-04-10 11:44:49.529324
import_tibetan_aligner_source.py
ADDED
@@ -0,0 +1,26 @@
import shutil
import sys
from pathlib import Path

NON_SOURCES_FILES = [
    ".",
    "..",
    ".git",
    ".github",
    ".gitignore",
    ".venv",
    ".idea",
    "Dockerfile",
    "__pycache__",
    "tests",
]

if __name__ == "__main__":
    source_dir = Path(sys.argv[1])
    dest_dir = Path(__file__).parent / source_dir.name
    dest_dir.mkdir(exist_ok=True)
    for fn in source_dir.iterdir():
        if fn.name in NON_SOURCES_FILES:
            continue
        dest_fn = dest_dir / fn.name
        shutil.copy2(str(fn), str(dest_fn))
requirements.txt
ADDED
@@ -0,0 +1,5 @@
sentence-transformers==2.2.2
pyewts==0.2.0
Cython==0.29.34
gradio>=3.34.0, <4.0
requests==2.28.2
tibetan-aligner/README.md
ADDED
@@ -0,0 +1,5 @@
# align-tibetan
Tibetan-English sentence alignment.
Simply run `bash align_tib_en.sh <tib_file> <eng_file>`.
The Tibetan file should be in Tibetan Unicode; the English file should be plain-text English.
There are some tunable parameters; please look into align_tib_en.sh.
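The README describes a plain shell invocation; app.py drives the same script from Python. A minimal sketch of that pattern is shown below (the input file names are hypothetical; the subprocess call and the "[OUTPUT]" parsing mirror _run_align_script() in app.py).

# Sketch: invoking align_tib_en.sh from Python, mirroring _run_align_script() in app.py.
# "bo.txt" and "en.txt" are hypothetical files in Tibetan Unicode and plain English.
import re
import subprocess

proc = subprocess.run(
    ["./align_tib_en.sh", "bo.txt", "en.txt", "output"],
    check=True, capture_output=True, text=True, cwd="tibetan-aligner",
)
# The script prints the aligned file path on a line starting with "[OUTPUT]".
aligned_fn = re.search(r"\[OUTPUT\] (.*)", proc.stdout).group(1)
print(aligned_fn)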
tibetan-aligner/align_tib_en.sh
ADDED
@@ -0,0 +1,43 @@
#!/bin/bash
number_of_overlays=6 # the higher the number of overlays, the more precise alignment is going to be, but also slower
deletion=0.06 # higher = less precise
search_buffer_size=50

# Args:
# first parameter is a file in Tibetan unicode
# second parameter is a file with English in plain text.
# third parameter is output path

cp $1 $1.work
cp $2 $2.work
output_dir=${3:-"output"}
mkdir $output_dir

cp $2.work $2.work2

echo '[INFO] Getting Embedding...'
time python get_vectors.py $1.work $number_of_overlays
time python get_vectors.py $2.work $number_of_overlays

rm ladder
echo '[INFO] Running alignment...'
time ./vecalign.py -a $number_of_overlays -d $deletion --search_buffer_size $search_buffer_size --alignment_max_size $number_of_overlays --src $1.work --tgt $2.work \
    --src_embed $1.work_overlay $1.work_vectors.npy \
    --tgt_embed $2.work_overlay $2.work_vectors.npy >> ladder

rm $1.org
rm $1.train
python ladder2org.py $1.work $2.work ladder >> $1.org
python create_train.py $1.work $2.work ladder >> $1.train
python create_train_clean.py $1.work $2.work ladder >> $1.train_cleaned

# clean up
mv *.txt* $output_dir/
mv $output_dir/requirements.txt ./
rm $output_dir/$1.work
rm $output_dir/$2.work
rm $output_dir/$2.work2
rm $output_dir/$1.work_vectors.npy
rm $output_dir/$2.work_vectors.npy

echo "[OUTPUT] $output_dir/$1.train_cleaned"
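The script's final artifact, echoed after "[OUTPUT]", is the .train_cleaned file that create_train_clean.py writes: one aligned pair per line, source-side segments first, then a tab, then target-side segments. A minimal sketch for consuming it, with a hypothetical file name:

# Sketch: reading the tab-separated output of create_train_clean.py.
# "bo.txt.train_cleaned" is a hypothetical output file name.
pairs = []
with open("bo.txt.train_cleaned", encoding="utf-8") as f:
    for line in f:
        if "\t" not in line:
            continue  # skip blank or malformed lines
        src, tgt = line.rstrip("\n").split("\t", 1)
        pairs.append((src.strip(), tgt.strip()))
print(len(pairs), "aligned segment pairs")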
tibetan-aligner/convert_to_wylie.py
ADDED
@@ -0,0 +1,17 @@
import sys
import pyewts


converter = pyewts.pyewts()
path = sys.argv[1]
result = ""

for line in open(path, "r"):
    line = converter.toWylie(line)
    result += line


with open(path, "w") as outfile:
    outfile.write(result)
tibetan-aligner/create_train.py
ADDED
@@ -0,0 +1,60 @@
import sys
import re
import re
f1 = open(sys.argv[1],'r')
f2 = open(sys.argv[2],'r')
ladder_file = open(sys.argv[3],'r')


output = ""
ladder = []
sktfile = [line.rstrip('\n').strip() for line in f1]
tibfile = [line.rstrip('\n').strip() for line in f2]
last_score = 0.5

def clean_num(string):
    string = re.sub("[^0-9, ]","",string)
    return int(string.split(',')[0])


for line in ladder_file:
    if len(line.split("\t")) == 3:
        skt,tib,score = line.split('\t')
        if re.search("[0-9]",skt) and re.search("[0-9]",tib):
            skt_num = clean_num(skt)
            tib_num = clean_num(tib)
            ladder.append([skt_num,tib_num,score])


    if ";" in line:
        m = re.search("([0-9., ]+);([0-9., ]+).*=\"([0-9.,]+)", line)
        if m:
            skt_num = int(m.group(1).split()[0].replace(".","").replace(",",""))-1
            tib_num = int(m.group(2).split()[0].replace(".","").replace(",",""))-1
            score = float(m.group(3))
            ladder.append([skt_num,tib_num,score])



    if len(line.split(':')) == 3:
        skt,tib,score = line.split(':')
        if re.search("[0-9]",skt) and re.search("[0-9]",tib):
            skt_num = clean_num(skt)
            tib_num = clean_num(tib)
            ladder.append([skt_num,tib_num,score])
last_skt = 0
last_tib = 0
for entry in ladder:
    output = output + ' '.join(sktfile[last_skt:entry[0]]) + "\t"
    output = output + ' '.join(tibfile[last_tib:entry[1]]) + "\n"
    last_skt = entry[0]
    last_tib = entry[1]
output = output + ' '.join(sktfile[last_skt:-1]) + "\t"
output = output + ' '.join(tibfile[last_tib:-1]) + "\n" # + str(entry[2])

short_f1 = re.sub("\.tsv.*","",sys.argv[1])
short_f2 = re.sub(".*/","",sys.argv[2])
short_f2 = re.sub("\.tsv.*","",short_f2)
print(output)
# with open(short_f1 + "_" + short_f2 + ".train", 'w') as file:
#     file.write(output)
tibetan-aligner/create_train_clean.py
ADDED
@@ -0,0 +1,37 @@
import sys
import re
import re
f1 = open(sys.argv[1],'r')
f2 = open(sys.argv[2],'r')
ladder_file = open(sys.argv[3],'r')


output = ""
ladder = []
sktfile = [line.rstrip('\n').strip() for line in f1]
tibfile = [line.rstrip('\n').strip() for line in f2]
last_score = 0.5

def clean_num(string):
    string = re.sub("[^0-9, ]","",string)
    numbers = []
    for number in string.split(','):
        numbers.append(int(number))
    return numbers


for line in ladder_file:
    if len(line.split(':')) == 3:
        skt,tib,score = line.split(':')
        if re.search("[0-9]",skt) and re.search("[0-9]",tib):
            skt_nums = clean_num(skt)
            tib_nums = clean_num(tib)
            for num in skt_nums:
                output += sktfile[num] + " "
            output += "\t"
            for num in tib_nums:
                output += tibfile[num] + " "
            output += "\n"
print(output)
# with open(short_f1 + "_" + short_f2 + ".train", 'w') as file:
#     file.write(output)
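Both create_train scripts consume the colon-separated ladder lines that vecalign emits via print_alignments() in dp_utils.py ('%s:%s:%.6f' with the two index lists and a score). A small worked example of how clean_num() in create_train_clean.py digests such a line; the concrete indices are illustrative only:

# Illustrative only: a ladder line in the format printed by print_alignments().
line = "[14, 15]:[12]:0.052310"
skt, tib, score = line.split(":")   # three colon-separated fields
# clean_num() keeps digits, commas and spaces, then splits on commas:
#   "[14, 15]" -> "14, 15" -> [14, 15]
#   "[12]"     -> "12"     -> [12]
# so this line pairs source sentences 14 and 15 with target sentence 12.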
tibetan-aligner/dp_core.cpython-310-x86_64-linux-gnu.so.reload1
ADDED
Binary file (643 kB).
tibetan-aligner/dp_core.cpython-39-darwin.so.reload1
ADDED
Binary file (170 kB).
tibetan-aligner/dp_core.pyx
ADDED
@@ -0,0 +1,411 @@
# cython: language_level=3

"""
Copyright 2019 Brian Thompson

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import numpy as np

cimport numpy as np
cimport cython


def make_x_y_offsets(alignment_types):
    # alignment types for which we will precompute costs

    # deletion/insertion is added later
    for x, y in alignment_types:
        assert (x > 0)
        assert (y > 0)

    x_offsets = np.array([x for x, y in alignment_types], dtype=np.int32)  # MUST **NOT** INCLUDE (0,1), (1,0)
    y_offsets = np.array([y for x, y in alignment_types], dtype=np.int32)  # MUST **NOT** INCLUDE (0,1), (1,0)
    return x_offsets, y_offsets


def make_dense_costs(np.ndarray[float, ndim=3] vecs0,  # input
                     np.ndarray[float, ndim=3] vecs1,  # input
                     np.ndarray[float, ndim=2] norm0,  # input
                     np.ndarray[float, ndim=2] norm1,  # input
                     int offset0 = 0,  # index into vecs0/norms0
                     int offset1 = 0,  # index into vecs1/norms1
                     ):
    """
    Make a full N*M feature matrix. By default, makes 1-1 alignments,
    can build others by specifying offset0, offset1 to index into
    vecs0, norms0 and vecs1, norms1 respectively.
    """
    assert vecs0.shape[0] > offset0
    assert vecs1.shape[0] > offset1
    assert norm0.shape[0] > offset0
    assert norm1.shape[0] > offset1

    cdef int size0 = np.shape(vecs0)[1]
    assert norm0.shape[1] == size0

    cdef int size1 = np.shape(vecs1)[1]
    assert norm1.shape[1] == size1

    cdef int vecsize = np.shape(vecs0)[2]
    assert vecs1.shape[2] == vecsize

    cdef int xi, yi
    cdef float sumx

    cdef np.ndarray[float, ndim=2] costs = np.empty((size0, size1), dtype=np.float32)

    for xi in range(size0):
        for yi in range(size1):
            sumx = 0.0
            for jj in range(vecsize):
                sumx += vecs0[offset0, xi, jj] * vecs1[offset1, yi, jj]

            costs[xi, yi] = 2.0 * (1.0 - sumx) / (1e-6 + norm0[offset0, xi] + norm1[offset1, yi])
            # normalize by alignment type
            costs[xi, yi] = costs[xi, yi] * (offset0 + 1) * (offset1 + 1)

    return costs


def dense_dp(np.ndarray[float, ndim=2] alignment_cost, float pen):
    """
    Compute cost matrix (csum) and backpointers (bp)
    from full 2-D 1-1 alignment costs matrix (alignment_cost)
    """

    size0 = alignment_cost.shape[0]
    size1 = alignment_cost.shape[1]
    # csum and traceback matrix are both on nodes
    #   so they are +1 in each dimension compared to the jump costs matrix
    # For anything being used in accumulation, use float64
    cdef np.ndarray[double, ndim=2] csum = np.empty((size0 + 1, size1 + 1), dtype=np.float64)
    cdef np.ndarray[int, ndim=2] bp = np.empty((size0 + 1, size1 + 1), dtype=np.int32)

    # bp and csum are nodes,
    #   while alignment_cost is the cost of going between the nodes
    # Size of nodes should be one larger than alignment costs
    b0, b1 = np.shape(bp)
    c0, c1 = np.shape(csum)
    j0, j1 = np.shape(alignment_cost)
    assert (b0 == c0 == j0 + 1)
    assert (b1 == c1 == j1 + 1)

    cdef int cmax = np.shape(csum)[1]
    cdef int rmax = np.shape(csum)[0]
    cdef int c, r
    cdef double cost0, cost1, cost2

    # initialize the all c-direction deletion path
    for c in range(cmax):
        csum[0, c] = c * pen
        bp[0, c] = 1

    # initialize the all r-direction deletion path
    for r in range(rmax):
        csum[r, 0] = r * pen
        bp[r, 0] = 2

    # Initial cost is 0.0
    csum[0, 0] = 0.0  # noop
    bp[0, 0] = 4  # should not matter

    # Calculate the rest recursively
    for c in range(1, cmax):
        for r in range(1, rmax):

            # alignment_cost indexes are off by 1 wrt
            #   csum/bp, since csum/bp are nodes
            cost0 = csum[r - 1, c - 1] + alignment_cost[r - 1, c - 1]
            cost1 = csum[r, c - 1] + pen
            cost2 = csum[r - 1, c] + pen

            csum[r, c] = cost0
            bp[r, c] = 0

            if cost1 < csum[r, c]:
                csum[r, c] = cost1
                bp[r, c] = 1
            if cost2 < csum[r, c]:
                csum[r, c] = cost2
                bp[r, c] = 2

    return csum, bp


def score_path(np.ndarray[int, ndim=1] xx,
               np.ndarray[int, ndim=1] yy,
               np.ndarray[float, ndim=1] norm1,
               np.ndarray[float, ndim=1] norm2,
               np.ndarray[float, ndim=2] vecs1,
               np.ndarray[float, ndim=2] vecs2,
               np.ndarray[float, ndim=1] out):
    cdef int xi, yi, ii, jj
    cdef float outx
    cdef int lenxy = xx.shape[0]
    cdef int vecsize = vecs1.shape[1]

    for ii in range(lenxy):
        xi = xx[ii]
        yi = yy[ii]
        outx = 0.0
        for jj in range(vecsize):
            outx += vecs1[xi, jj] * vecs2[yi, jj]
        out[ii] = 2.0 * (1.0 - outx) / (norm1[xi] + norm2[yi])


# Bounds checking and wraparound slow things down by about 2x
# Division by 0 checking has minimal speed impact
@cython.boundscheck(False)  # turn off bounds-checking for entire function
@cython.wraparound(False)  # turn off negative index wrapping for entire function
@cython.cdivision(True)  # use c-style division (no division-by-zero check)
def make_sparse_costs(np.ndarray[float, ndim=3] vecs0,  # input: num aligns X num sents X dim
                      np.ndarray[float, ndim=3] vecs1,  # input
                      np.ndarray[float, ndim=2] norms0,  # input: num aligns X num sents
                      np.ndarray[float, ndim=2] norms1,  # input
                      x_y_path,
                      alignment_types,
                      int width_over2):
    """
    Make features for DP, *for lines running across approximate path*, *for each alignment type*
    x_offsets, y_offsets should not include (0,1), (1,0)

    Basically, we take the feature matrix, rotate it 45 degrees,
    and compute a "wavy" matrix for the features.
    It's like the diagonal but it moves around to hopefully always include the true path.
    """

    cdef np.ndarray[int, ndim=2] x_y_path_ = np.array(x_y_path).astype(np.int32)

    assert (vecs0.shape[0] == norms0.shape[0])
    assert (vecs1.shape[0] == norms1.shape[0])

    assert (vecs0.shape[1] == norms0.shape[1])
    assert (vecs1.shape[1] == norms1.shape[1])

    # check how many overlaps vectors were passed in
    num_overlaps_in_vecs0 = vecs0.shape[0]
    num_overlaps_in_vecs1 = vecs1.shape[0]

    # check how many overlaps were requested
    # edge case: alignment_types could be empty
    #   In that case, we should just return insertions/deletions
    #   and max_x_overlap == max_y_overlap == 0
    max_x_overlap = max([0] + [x for x, y in alignment_types])  # add [0] in case alignment_types is empty
    max_y_overlap = max([0] + [y for x, y in alignment_types])  # add [0] in case alignment_types is empty

    # note: alignment types are specified 1-based, but vectors are stored 0-based
    if max_x_overlap > num_overlaps_in_vecs0:
        raise Exception('%d x overlaps requested (via alignment_types), but vecs0 only has %d' % (
            max_x_overlap, num_overlaps_in_vecs0))
    if max_y_overlap > num_overlaps_in_vecs1:
        raise Exception('%d y overlaps requested (via alignment_types), but vecs1 only has %d' % (
            max_y_overlap, num_overlaps_in_vecs1))

    # number of sentences in each document
    cdef int xsize = vecs0.shape[1]
    cdef int ysize = vecs1.shape[1]

    # vector dimensions should match
    assert (vecs0.shape[2] == vecs1.shape[2])

    cdef np.ndarray[int, ndim=1] x_offsets, y_offsets
    x_offsets, y_offsets = make_x_y_offsets(alignment_types)

    # reserve outputs
    a_len = x_y_path_.shape[0]
    b_len = 2 * width_over2
    cdef np.ndarray[float, ndim=3] a_b_feats = np.empty((len(alignment_types), a_len, b_len), dtype=np.float32)
    cdef np.ndarray[int, ndim=1] b_offset = np.empty(a_len).astype(np.int32)

    cdef int x, y, aa, bb, xx, yy, a_idx, b_idx, bb2, x_offset, y_offset, ii_align, x_offset_idx, y_offset_idx
    cdef int vecsize = vecs0.shape[2]
    cdef int num_alignments = x_offsets.shape[0]

    cdef float sumx, feat
    cdef float inf = np.inf

    for ii in range(x_y_path_.shape[0]):
        x = x_y_path_[ii, 0]
        y = x_y_path_[ii, 1]

        # convert xy to ab coords
        aa = x + y
        bb = y

        a_idx = aa
        b_offset[aa] = bb - width_over2
        for b_idx, bb2 in enumerate(range(bb - width_over2, bb + width_over2)):
            # convert ab to xy coords
            xx = aa - bb2
            yy = bb2

            for ii_align in range(num_alignments):
                x_offset = x_offsets[ii_align]
                x_offset_idx = x_offset - 1  # overlaps start at 1, vectors stored 0-based
                y_offset = y_offsets[ii_align]
                y_offset_idx = y_offset - 1

                if 0 <= xx < xsize and 0 <= yy < ysize:
                    sumx = 0.0
                    for jj in range(vecsize):
                        sumx += vecs0[x_offset_idx, xx, jj] * vecs1[y_offset_idx, yy, jj]
                    feat = 2.0 * x_offset * y_offset * (1.0 - sumx) / (
                            1e-6 + norms0[x_offset_idx, xx] + norms1[y_offset_idx, yy])

                else:
                    feat = inf

                a_b_feats[ii_align, a_idx, b_idx] = feat

    return a_b_feats, b_offset


def sparse_dp(np.ndarray[float, ndim=3] a_b_costs,
              np.ndarray[int, ndim=1] b_offset_in,
              alignment_types,
              double del_penalty,
              int x_in_size,
              int y_in_size):
    """
    Do DP along a path, using features saved off along path.
    x_offsets, y_offsets should not include (0,1), (1,0)

    xsize, ysize refer to the costs a_b_csum, but in x/y space

    As in the simpler full-DP case,
    we compute cumulative costs and backpointers on nodes,
    and there are COSTS associated with moving between them.

    This means the size of the nodes is +1,+1 larger (in x,y) than the COSTS.

    So the size of a_b_csum, a_b_xp, a_b_yp are all one larger in x and y compared to the costs

    In order to save memory (and time, vs a sparse matrix with hashes to look up values), let:
       a = x + y
       b = x - y

    b_offsets tells us how far from the left edge the features are computed for.
    basically it's like we are computing along the diagonal,
    but we shift the diagonal around based on our belief
    about where the alignments are.

    b_offsets is used for both costs AND csum, backpointers, so it needs to be
    +2 longer (it is in the a-direction) than the costs (in the a direction)
    """
    cdef np.ndarray[int, ndim=1] x_offsets, y_offsets
    x_offsets, y_offsets = make_x_y_offsets(alignment_types)

    # make x/y offsets, including (0,1), (1,0), i.e. including deletion and insertion
    x_offsets = np.concatenate([x_offsets, np.array([0, 1], dtype=np.int32)])
    y_offsets = np.concatenate([y_offsets, np.array([1, 0], dtype=np.int32)])

    cdef int a_in_size = a_b_costs.shape[1]
    cdef int b_in_size = a_b_costs.shape[2]

    cdef int a_out_size = a_in_size + 2
    cdef int b_out_size = b_in_size

    cdef int x_out_size = x_in_size + 1
    cdef int y_out_size = y_in_size + 1

    # costs are the costs of going between nodes.
    #   in x,y for the nodes, we basically add a buffer
    #   at x=0 and y=0, and shift the cost by (x=+1,y=+1)
    # In a,b space, this means adding two points (for the buffer)
    #   at the beginning, and shifting by (a=+0,b=+1) since
    #   a=x+y and b=y
    # for the first two points, we can simply replicate the
    #   original b_offset, since it should be -width_over2
    #   i.e. b_offset_in[0] == -width_over2
    extra_two_points = np.array([b_offset_in[0], b_offset_in[0]], dtype=np.int32)
    cdef np.ndarray[int, ndim=1] b_offset_out = np.concatenate([extra_two_points, b_offset_in + 1])

    # outputs
    # For anything being used in accumulation, use float64
    cdef np.ndarray[double, ndim=2] a_b_csum = np.zeros((a_in_size + 2, b_in_size),
                                                        dtype=np.float64) + np.inf  # error cumulative sum
    cdef np.ndarray[int, ndim=2] a_b_xp = np.zeros((a_in_size + 2, b_in_size), dtype=np.int32) - 2  # backpointer for x
    cdef np.ndarray[int, ndim=2] a_b_yp = np.zeros((a_in_size + 2, b_in_size), dtype=np.int32) - 2  # backpointer for y

    cdef int num_alignments = x_offsets.shape[0]
    cdef double inf = np.inf
    cdef int xx_out, yy_out, ii_align, x_offset, y_offset
    cdef int aa_in_cost, bb_in_cost, aa_out, bb_out, aa_out_prev, bb_out_prev, xx_in_cost, yy_in_cost, xx_out_prev, yy_out_prev

    cdef double alignment_cost, total_cost, prev_cost

    # increasing in a is the same as going along diagonals in x/y, so DP order works
    #   (and any ordering is fine in b - nothing depends on values adjacent on diagonal in x/y)
    for aa_out in range(a_in_size + 2):
        for bb_out in range(b_in_size):
            # xx_out, yy_out = ab2xy_w_offset(aa_out, bb_out, b_offset_out)
            yy_out = bb_out + b_offset_out[aa_out]
            xx_out = aa_out - yy_out

            # edge case: all deletions in y-direction
            if xx_out == 0 and 0 <= yy_out < y_out_size:
                a_b_csum[aa_out, bb_out] = del_penalty * yy_out
                a_b_xp[aa_out, bb_out] = 0
                a_b_yp[aa_out, bb_out] = 1

            # edge case: all deletions in x-direction
            elif yy_out == 0 and 0 <= xx_out < x_out_size:
                a_b_csum[aa_out, bb_out] = del_penalty * xx_out
                a_b_xp[aa_out, bb_out] = 1
                a_b_yp[aa_out, bb_out] = 0

            else:
                # initialize output to inf
                a_b_csum[aa_out, bb_out] = inf
                a_b_xp[aa_out, bb_out] = -42
                a_b_yp[aa_out, bb_out] = -42

                for ii_align in range(num_alignments):
                    x_offset = x_offsets[ii_align]
                    y_offset = y_offsets[ii_align]

                    # coords of location of alignment cost, in input x/y space
                    xx_in_cost = xx_out - 1  # features were front padded,
                    yy_in_cost = yy_out - 1  # so offset is always 1

                    # the coords of location of previous cumsum cost, in input x/y space
                    xx_out_prev = xx_out - x_offset
                    yy_out_prev = yy_out - y_offset

                    if 0 <= xx_in_cost < x_in_size and 0 <= yy_in_cost < y_in_size and 0 <= xx_out_prev < x_out_size and 0 <= yy_out_prev < y_out_size:
                        # convert x,y to a,b
                        aa_in_cost = xx_in_cost + yy_in_cost
                        bb_in_cost = yy_in_cost - b_offset_in[aa_in_cost]

                        aa_out_prev = xx_out_prev + yy_out_prev
                        bb_out_prev = yy_out_prev - b_offset_out[aa_out_prev]

                        if 0 <= aa_in_cost < a_in_size and 0 <= bb_in_cost < b_in_size and 0 <= aa_out_prev < a_out_size and 0 <= bb_out_prev < b_out_size:
                            if x_offset == 0 or y_offset == 0:
                                alignment_cost = del_penalty
                            else:
                                alignment_cost = a_b_costs[ii_align, aa_in_cost, bb_in_cost]

                            prev_cost = a_b_csum[aa_out_prev, bb_out_prev]

                            total_cost = prev_cost + alignment_cost

                            if total_cost < a_b_csum[aa_out, bb_out]:
                                a_b_csum[aa_out, bb_out] = total_cost
                                a_b_xp[aa_out, bb_out] = x_offset
                                a_b_yp[aa_out, bb_out] = y_offset

    return a_b_csum, a_b_xp, a_b_yp, b_offset_out
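sparse_dp works in rotated (a, b) coordinates so that only a band around the approximate alignment path needs to be stored. dp_utils.py provides the matching helpers ab2xy_w_offset() and xy2ab_w_offset(), where a = x + y and b is measured from a per-diagonal offset. A small illustrative round trip with made-up numbers, mirroring those helpers:

# Illustrative numbers only; mirrors ab2xy_w_offset() / xy2ab_w_offset() in dp_utils.py.
import numpy as np

b_offset = np.array([0, 0, 0, 1, 1])  # hypothetical per-diagonal offsets (b_offset[a])

def xy2ab(xx, yy):
    aa = xx + yy                      # diagonal index
    return aa, yy - b_offset[aa]      # band position relative to the diagonal's offset

def ab2xy(aa, bb):
    yy = bb + b_offset[aa]
    return aa - yy, yy

aa, bb = xy2ab(3, 1)                  # point x=3, y=1 lies on diagonal a=4
assert ab2xy(aa, bb) == (3, 1)        # the mapping is invertible given the same offsets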
tibetan-aligner/dp_utils.py
ADDED
@@ -0,0 +1,668 @@
"""
Copyright 2019 Brian Thompson

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import logging
import sys
from ast import literal_eval
from collections import OrderedDict
from math import ceil
from time import time

import numpy as np

import pyximport
pyximport.install(setup_args={'include_dirs': np.get_include()}, inplace=True, reload_support=True)

from dp_core import make_dense_costs, score_path, sparse_dp, make_sparse_costs, dense_dp

logger = logging.getLogger('vecalign')  # set up in vecalign.py


def preprocess_line(line):
    line = line.strip()
    if len(line) == 0:
        line = 'BLANK_LINE'
    return line


def yield_overlaps(lines, num_overlaps):
    lines = [preprocess_line(line) for line in lines]
    for overlap in range(1, num_overlaps + 1):
        for out_line in layer(lines, overlap):
            # check must be here so all outputs are unique
            out_line2 = out_line[:10000]  # limit line so dont encode arbitrarily long sentences
            yield out_line2


def read_in_embeddings(text_file, embed_file):
    """
    Given a text file with candidate sentences and a corresponding embedding file,
    make a mapping from candidate sentence to embedding index,
    and a numpy array of the embeddings
    """
    sent2line = dict()
    with open(text_file, 'rt', encoding="utf-8") as fin:
        for ii, line in enumerate(fin):
            # don't know if it is a good idea to uncomment these two lines ###
            # if line.strip() in sent2line:
            #     raise Exception('got multiple embeddings for the same line:', line)
            sent2line[line.strip()] = ii

    line_embeddings = np.load(embed_file, allow_pickle=True)
    print("LINE EMBEDDINGS SHAPE", line_embeddings.shape)
    # line_embeddings = np.fromfile(embed_file, dtype=np.float32, count=-1)
    # if line_embeddings.size == 0:
    #     raise Exception('Got empty embedding file')
    # print("Line embeddings size", len(line_embeddings))
    # laser_embedding_size = line_embeddings.size // len(sent2line)  # currently hardcoded to 1024
    # if laser_embedding_size != 1024:
    #     logger.warning('expected an embedding size of 1024, got %s', laser_embedding_size)
    # logger.info('laser_embedding_size determined to be %d', laser_embedding_size)
    # line_embeddings.resize(line_embeddings.shape[0] // laser_embedding_size, laser_embedding_size)
    return sent2line, line_embeddings


def make_doc_embedding(sent2line, line_embeddings, lines, num_overlaps):
    """
    lines: sentences in input document to embed
    sent2line, line_embeddings: precomputed embeddings for lines (and overlaps of lines)
    """

    lines = [preprocess_line(line) for line in lines]

    vecsize = line_embeddings.shape[1]

    vecs0 = np.empty((num_overlaps, len(lines), vecsize), dtype=np.float32)

    for ii, overlap in enumerate(range(1, num_overlaps + 1)):
        for jj, out_line in enumerate(layer(lines, overlap)):
            try:
                line_id = sent2line[out_line]
            except KeyError:
                logger.warning('Failed to find overlap=%d line "%s". Will use random vector.', overlap, out_line)
                line_id = None

            if line_id is not None:
                vec = line_embeddings[line_id]
            else:
                vec = np.random.random(vecsize) - 0.5
                vec = vec / np.linalg.norm(vec)

            vecs0[ii, jj, :] = vec

    return vecs0


def make_norm1(vecs0):
    """
    make vectors norm==1 so that cosine distance can be computed via dot product
    """
    for ii in range(vecs0.shape[0]):
        for jj in range(vecs0.shape[1]):
            norm = np.sqrt(np.square(vecs0[ii, jj, :]).sum())
            vecs0[ii, jj, :] = vecs0[ii, jj, :] / (norm + 1e-5)


def layer(lines, num_overlaps, comb=' '):
    """
    make front-padded overlapping sentences
    """
    if num_overlaps < 1:
        raise Exception('num_overlaps must be >= 1')
    out = ['PAD', ] * min(num_overlaps - 1, len(lines))
    for ii in range(len(lines) - num_overlaps + 1):
        out.append(comb.join(lines[ii:ii + num_overlaps]))
    return out


def read_alignments(fin):
    alignments = []
    with open(fin, 'rt', encoding="utf-8") as infile:
        for line in infile:
            fields = [x.strip() for x in line.split(':') if len(x.strip())]
            if len(fields) < 2:
                raise Exception('Got line "%s", which does not have at least two ":" separated fields' % line.strip())
            try:
                src = literal_eval(fields[0])
                tgt = literal_eval(fields[1])
            except:
                raise Exception('Failed to parse line "%s"' % line.strip())
            alignments.append((src, tgt))

    # I know bluealign files have a few entries missing,
    #   but I don't fix them in order to be consistent with previously reported scores
    return alignments


def print_alignments(alignments, scores=None, file=sys.stdout):
    if scores is not None:
        for (x, y), s in zip(alignments, scores):
            print('%s:%s:%.6f' % (x, y, s), file=file)
    else:
        for x, y in alignments:
            print('%s:%s' % (x, y), file=file)


class DeletionKnob(object):
    """
    A good deletion penalty is dependent on normalization, and probably language, domain, etc, etc
    I want a way to control deletion penalty that generalizes well...
    Sampling costs and using a percentile seems to work fairly well.
    """

    def __init__(self, samp, res_min, res_max):

        self.res_min = res_min
        self.res_max = res_max

        if self.res_min >= self.res_max:
            logger.warning('res_max <= res_min, increasing it')
            self.res_max = self.res_min + 1e-4

        num_bins = 1000
        num_pts = 30

        self.hist, self.bin_edges = np.histogram(samp, bins=num_bins,
                                                 range=[self.res_min, self.res_max],
                                                 density=True)

        dx = self.bin_edges[1] - self.bin_edges[0]
        self.cdf = np.cumsum(self.hist) * dx

        interp_points = [(0, self.res_min), ]
        for knob_val in np.linspace(0, 1, num_pts - 1)[1:-1]:
            cdf_idx = np.searchsorted(self.cdf, knob_val)
            cdf_val = self.res_min + cdf_idx / float(num_bins) * (self.res_max - self.res_min)
            interp_points.append((knob_val, cdf_val))
        interp_points.append((1, self.res_max))
        self.x, self.y = zip(*interp_points)

    def percentile_frac_to_del_penalty(self, knob_val):
        del_pen = np.interp([knob_val], self.x, self.y)[0]
        return del_pen


def make_alignment_types(max_alignment_size):
    # return list of all (n,m) where n+m <= this
    alignment_types = []
    for x in range(1, max_alignment_size):
        for y in range(1, max_alignment_size):
            if x + y <= max_alignment_size:
                alignment_types.append((x, y))
    return alignment_types


def ab2xy_w_offset(aa, bb_idx, bb_offset):
    bb_from_side = bb_idx + bb_offset[aa]
    xx = aa - bb_from_side
    yy = bb_from_side
    return (xx, yy)


def xy2ab_w_offset(xx, yy, bb_offset):
    aa = xx + yy
    bb_from_side = yy
    bb = bb_from_side - bb_offset[aa]
    return aa, bb


def process_scores(scores, alignments):
    # floating point sometimes gives negative numbers, which is a little unnerving ...
    scores = np.clip(scores, a_min=0, a_max=None)

    for ii, (x_algn, y_algn) in enumerate(alignments):
        # deletion penalty is pretty arbitrary, just report 0
        if len(x_algn) == 0 or len(y_algn) == 0:
            scores[ii] = 0.0
        # report scores un-normalized by alignment sizes
        #   (still normalized with random vectors, though)
        else:
            scores[ii] = scores[ii] / len(x_algn) / len(y_algn)

    return scores


def sparse_traceback(a_b_csum, a_b_xp, a_b_yp, b_offset, xsize, ysize):
    alignments = []
    xx = xsize
    yy = ysize

    cum_costs = []

    while True:
        aa, bb = xy2ab_w_offset(xx, yy, b_offset)

        cum_costs.append(a_b_csum[aa, bb])

        xp = a_b_xp[aa, bb]
        yp = a_b_yp[aa, bb]

        if xx == yy == 0:
            break

        if xx < 0 or yy < 0:
            raise Exception('traceback bug')

        x_side = list(range(xx - xp, xx))
        y_side = list(range(yy - yp, yy))
        alignments.append((x_side, y_side))

        xx = xx - xp
        yy = yy - yp

    alignments.reverse()
    cum_costs.reverse()
    costs = np.array(cum_costs[1:]) - np.array(cum_costs[:-1])
    # "costs" are scaled by x_alignment_size * y_alignment_size
    #   and the cost of a deletion is del_penalty
    # "scores": 0 for deletion/insertion,
    #   and cosine distance, *not* scaled
    #   by len(x_alignment)*len(y_alignment)
    scores = process_scores(scores=costs, alignments=alignments)

    return alignments, scores


def dense_traceback(x_y_tb):
    xsize, ysize = x_y_tb.shape

    xx = xsize - 1
    yy = ysize - 1

    alignments = []
    while True:
        if xx == yy == 0:
            break
        bp = x_y_tb[xx, yy]
        if bp == 0:
            xp, yp = 1, 1
            alignments.append(([xx - 1], [yy - 1]))
        elif bp == 1:
            xp, yp = 0, 1
            alignments.append(([], [yy - 1]))
        elif bp == 2:
            xp, yp = 1, 0
            alignments.append(([xx - 1], []))
        else:
            raise Exception('got unknown value')

        xx = xx - xp
        yy = yy - yp

    alignments.reverse()

    return alignments


def append_slant(path, xwidth, ywidth):
    """
    Append quantized approximation to a straight line
    from current x,y to a point at (x+xwidth, y+ywidth)
    """
    NN = xwidth + ywidth
    xstart, ystart = path[-1]
    for ii in range(1, NN + 1):
        x = xstart + round(xwidth * ii / NN)
        y = ystart + round(ywidth * ii / NN)
        # In the case of ties we want them to round differently,
        #   so explicitly make sure we take a step of 1, not 0 or 2
        lastx, lasty = path[-1]
        delta = x + y - lastx - lasty
        if delta == 1:
            path.append((x, y))
        elif delta == 2:
            path.append((x - 1, y))
        elif delta == 0:
            path.append((x + 1, y))


def alignment_to_search_path(algn):
    """
    Given an alignment, make searchpath.
    Searchpath must step exactly one position in x XOR y at each time step.

    In the case of a block of deletions, the order found by DP is not meaningful.
    To make things consistent and to improve the probability of recovering
    from search errors, we search an approximately straight line
    through a block of deletions. We do the same through a many-many
    alignment, even though we currently don't refine a many-many alignment...
    """
    path = [(0, 0), ]
    xdel, ydel = 0, 0
    ydel = 0
    for x, y in algn:
        if len(x) and len(y):
            append_slant(path, xdel, ydel)
            xdel, ydel = 0, 0
            append_slant(path, len(x), len(y))
        elif len(x):
            xdel += len(x)
        elif len(y):
            ydel += len(y)

    append_slant(path, xdel, ydel)

    return path


def extend_alignments(course_alignments, size0, size1):
    """
    extend alignments to include new endpoints size0, size1
    if alignments are larger than size0/size1, raise exception
    """
    # could be a string of deletions or insertions at end, so cannot just grab last one
    xmax = 0  # maximum x value in course_alignments
    ymax = 0  # maximum y value in course_alignments
    for x, y in course_alignments:
        for xval in x:
            xmax = max(xmax, xval)
        for yval in y:
            ymax = max(ymax, yval)

    if xmax > size0 or ymax > size1:
        raise Exception('asked to extend alignments but already bigger than requested')

    # do not duplicate xmax/ymax, do include size0/size1
    extra_x = list(range(xmax + 1, size0 + 1))
    extra_y = list(range(ymax + 1, size1 + 1))

    logger.debug('extending alignments in x by %d and y by %d', len(extra_x), len(extra_y))

    if len(extra_x) == 0:
        for yval in extra_y:
            course_alignments.append(([], [yval]))
    elif len(extra_y) == 0:
        for xval in extra_x:
            course_alignments.append(([xval], []))
    else:
        course_alignments.append((extra_x, extra_y))


def upsample_alignment(algn):
    def upsample_one_alignment(xx):
        return list(range(min(xx) * 2, (max(xx) + 1) * 2))

    new_algn = []
    for xx, yy in algn:
        if len(xx) == 0:
            for yyy in upsample_one_alignment(yy):
                new_algn.append(([], [yyy]))
        elif len(yy) == 0:
            for xxx in upsample_one_alignment(xx):
                new_algn.append(([xxx], []))
        else:
            new_algn.append((upsample_one_alignment(xx), upsample_one_alignment(yy)))
    return new_algn


def make_del_knob(e_laser,
                  f_laser,
                  e_laser_norms,
                  f_laser_norms,
                  sample_size):
    e_size = e_laser.shape[0]
    f_size = f_laser.shape[0]

    if e_size > 0 and f_size > 0 and sample_size > 0:

        if e_size * f_size < sample_size:
            # dont sample, just compute full matrix
            sample_size = e_size * f_size
            x_idxs = np.zeros(sample_size, dtype=np.int32)
            y_idxs = np.zeros(sample_size, dtype=np.int32)
            c = 0
            for ii in range(e_size):
                for jj in range(f_size):
                    x_idxs[c] = ii
                    y_idxs[c] = jj
                    c += 1
        else:
            # get random samples
            x_idxs = np.random.choice(range(e_size), size=sample_size, replace=True).astype(np.int32)
            y_idxs = np.random.choice(range(f_size), size=sample_size, replace=True).astype(np.int32)

        # output
        random_scores = np.empty(sample_size, dtype=np.float32)

        score_path(x_idxs, y_idxs,
                   e_laser_norms, f_laser_norms,
                   e_laser, f_laser,
                   random_scores, )
|
443 |
+
|
444 |
+
min_score = 0
|
445 |
+
max_score = max(random_scores) # could bump this up... but its probably fine
|
446 |
+
|
447 |
+
else:
|
448 |
+
# Not much we can do here...
|
449 |
+
random_scores = np.array([0.0, 0.5, 1.0]) # ???
|
450 |
+
min_score = 0
|
451 |
+
max_score = 1 # ????
|
452 |
+
|
453 |
+
del_knob = DeletionKnob(random_scores, min_score, max_score)
|
454 |
+
|
455 |
+
return del_knob
|
456 |
+
|
457 |
+
|
458 |
+
def compute_norms(vecs0, vecs1, num_samples, overlaps_to_use=None):
|
459 |
+
# overlaps_to_use = 10 # 10 matches before
|
460 |
+
|
461 |
+
overlaps1, size1, dim = vecs1.shape
|
462 |
+
overlaps0, size0, dim0 = vecs0.shape
|
463 |
+
assert (dim == dim0)
|
464 |
+
|
465 |
+
if overlaps_to_use is not None:
|
466 |
+
if overlaps_to_use > overlaps1:
|
467 |
+
raise Exception('Cannot use more overlaps than provided. You may want to re-run make_verlaps.py with a larger -n value')
|
468 |
+
else:
|
469 |
+
overlaps_to_use = overlaps1
|
470 |
+
|
471 |
+
samps_per_overlap = ceil(num_samples / overlaps_to_use)
|
472 |
+
|
473 |
+
if size1 and samps_per_overlap:
|
474 |
+
# sample other size (from all overlaps) to compre to this side
|
475 |
+
vecs1_rand_sample = np.empty((samps_per_overlap * overlaps_to_use, dim), dtype=np.float32)
|
476 |
+
for overlap_ii in range(overlaps_to_use):
|
477 |
+
idxs = np.random.choice(range(size1), size=samps_per_overlap, replace=True)
|
478 |
+
random_vecs = vecs1[overlap_ii, idxs, :]
|
479 |
+
vecs1_rand_sample[overlap_ii * samps_per_overlap:(overlap_ii + 1) * samps_per_overlap, :] = random_vecs
|
480 |
+
|
481 |
+
norms0 = np.empty((overlaps0, size0), dtype=np.float32)
|
482 |
+
for overlap_ii in range(overlaps0):
|
483 |
+
e_laser = vecs0[overlap_ii, :, :]
|
484 |
+
sim = np.matmul(e_laser, vecs1_rand_sample.T)
|
485 |
+
norms0[overlap_ii, :] = 1.0 - sim.mean(axis=1)
|
486 |
+
|
487 |
+
else: # no samples, no normalization
|
488 |
+
norms0 = np.ones((overlaps0, size0)).astype(np.float32)
|
489 |
+
|
490 |
+
return norms0
|
491 |
+
|
492 |
+
|
493 |
+
def downsample_vectors(vecs1):
|
494 |
+
a, b, c = vecs1.shape
|
495 |
+
half = np.empty((a, b // 2, c), dtype=np.float32)
|
496 |
+
for ii in range(a):
|
497 |
+
# average consecutive vectors
|
498 |
+
for jj in range(0, b - b % 2, 2):
|
499 |
+
v1 = vecs1[ii, jj, :]
|
500 |
+
v2 = vecs1[ii, jj + 1, :]
|
501 |
+
half[ii, jj // 2, :] = v1 + v2
|
502 |
+
# compute mean for all vectors
|
503 |
+
mean = np.mean(half[ii, :, :], axis=0)
|
504 |
+
for jj in range(0, b - b % 2, 2):
|
505 |
+
# remove mean
|
506 |
+
half[ii, jj // 2, :] = half[ii, jj // 2, :] - mean
|
507 |
+
# make vectors norm==1 so dot product is cosine distance
|
508 |
+
make_norm1(half)
|
509 |
+
return half
|
510 |
+
|
511 |
+
|
512 |
+
def vecalign(vecs0,
|
513 |
+
vecs1,
|
514 |
+
final_alignment_types,
|
515 |
+
del_percentile_frac,
|
516 |
+
width_over2,
|
517 |
+
max_size_full_dp,
|
518 |
+
costs_sample_size,
|
519 |
+
num_samps_for_norm,
|
520 |
+
norms0=None,
|
521 |
+
norms1=None):
|
522 |
+
if width_over2 < 3:
|
523 |
+
logger.warning('width_over2 was set to %d, which does not make sense. increasing to 3.', width_over2)
|
524 |
+
width_over2 = 3
|
525 |
+
|
526 |
+
# make sure input embeddings are norm==1
|
527 |
+
make_norm1(vecs0)
|
528 |
+
make_norm1(vecs1)
|
529 |
+
|
530 |
+
# save off runtime stats for summary
|
531 |
+
runtimes = OrderedDict()
|
532 |
+
|
533 |
+
# Determine stack depth
|
534 |
+
s0, s1 = vecs0.shape[1], vecs1.shape[1]
|
535 |
+
max_depth = 0
|
536 |
+
while s0 * s1 > max_size_full_dp ** 2:
|
537 |
+
max_depth += 1
|
538 |
+
s0 = s0 // 2
|
539 |
+
s1 = s1 // 2
|
540 |
+
|
541 |
+
# init recursion stack
|
542 |
+
# depth is 0-based (full size is 0, 1 is half, 2 is quarter, etc)
|
543 |
+
stack = {0: {'v0': vecs0, 'v1': vecs1}}
|
544 |
+
|
545 |
+
# downsample sentence vectors
|
546 |
+
t0 = time()
|
547 |
+
for depth in range(1, max_depth + 1):
|
548 |
+
stack[depth] = {'v0': downsample_vectors(stack[depth - 1]['v0']),
|
549 |
+
'v1': downsample_vectors(stack[depth - 1]['v1'])}
|
550 |
+
runtimes['Downsample embeddings'] = time() - t0
|
551 |
+
|
552 |
+
# compute norms for all depths, add sizes, add alignment types
|
553 |
+
t0 = time()
|
554 |
+
for depth in stack:
|
555 |
+
stack[depth]['size0'] = stack[depth]['v0'].shape[1]
|
556 |
+
stack[depth]['size1'] = stack[depth]['v1'].shape[1]
|
557 |
+
stack[depth]['alignment_types'] = final_alignment_types if depth == 0 else [(1, 1)]
|
558 |
+
|
559 |
+
if depth == 0 and norms0 is not None:
|
560 |
+
if norms0.shape != vecs0.shape[:2]:
|
561 |
+
print('norms0.shape:', norms0.shape)
|
562 |
+
print('vecs0.shape[:2]:', vecs0.shape[:2])
|
563 |
+
raise Exception('norms0 wrong shape')
|
564 |
+
stack[depth]['n0'] = norms0
|
565 |
+
else:
|
566 |
+
stack[depth]['n0'] = compute_norms(stack[depth]['v0'], stack[depth]['v1'], num_samps_for_norm)
|
567 |
+
|
568 |
+
if depth == 0 and norms1 is not None:
|
569 |
+
if norms1.shape != vecs1.shape[:2]:
|
570 |
+
print('norms1.shape:', norms1.shape)
|
571 |
+
print('vecs1.shape[:2]:', vecs1.shape[:2])
|
572 |
+
raise Exception('norms1 wrong shape')
|
573 |
+
stack[depth]['n1'] = norms1
|
574 |
+
else:
|
575 |
+
stack[depth]['n1'] = compute_norms(stack[depth]['v1'], stack[depth]['v0'], num_samps_for_norm)
|
576 |
+
|
577 |
+
runtimes['Normalize embeddings'] = time() - t0
|
578 |
+
|
579 |
+
# Compute deletion penalty for all depths
|
580 |
+
t0 = time()
|
581 |
+
for depth in stack:
|
582 |
+
stack[depth]['del_knob'] = make_del_knob(e_laser=stack[depth]['v0'][0, :, :],
|
583 |
+
f_laser=stack[depth]['v1'][0, :, :],
|
584 |
+
e_laser_norms=stack[depth]['n0'][0, :],
|
585 |
+
f_laser_norms=stack[depth]['n1'][0, :],
|
586 |
+
sample_size=costs_sample_size)
|
587 |
+
stack[depth]['del_penalty'] = stack[depth]['del_knob'].percentile_frac_to_del_penalty(del_percentile_frac)
|
588 |
+
logger.debug('del_penalty at depth %d: %f', depth, stack[depth]['del_penalty'])
|
589 |
+
runtimes['Compute deletion penalties'] = time() - t0
|
590 |
+
tt = time() - t0
|
591 |
+
logger.debug('%d x %d full DP make features: %.6fs (%.3e per dot product)',
|
592 |
+
stack[max_depth]['size0'], stack[max_depth]['size1'], tt,
|
593 |
+
tt / (stack[max_depth]['size0'] + 1e-6) / (stack[max_depth]['size1'] + 1e-6))
|
594 |
+
# full DP at maximum recursion depth
|
595 |
+
t0 = time()
|
596 |
+
stack[max_depth]['costs_1to1'] = make_dense_costs(stack[max_depth]['v0'],
|
597 |
+
stack[max_depth]['v1'],
|
598 |
+
stack[max_depth]['n0'],
|
599 |
+
stack[max_depth]['n1'])
|
600 |
+
|
601 |
+
runtimes['Full DP make features'] = time() - t0
|
602 |
+
t0 = time()
|
603 |
+
_, stack[max_depth]['x_y_tb'] = dense_dp(stack[max_depth]['costs_1to1'], stack[max_depth]['del_penalty'])
|
604 |
+
stack[max_depth]['alignments'] = dense_traceback(stack[max_depth]['x_y_tb'])
|
605 |
+
runtimes['Full DP'] = time() - t0
|
606 |
+
|
607 |
+
# upsample the path up to the top resolution
|
608 |
+
compute_costs_times = []
|
609 |
+
dp_times = []
|
610 |
+
upsample_depths = [0, ] if max_depth == 0 else list(reversed(range(0, max_depth)))
|
611 |
+
for depth in upsample_depths:
|
612 |
+
if max_depth > 0: # upsample previoius alignment to current resolution
|
613 |
+
course_alignments = upsample_alignment(stack[depth + 1]['alignments'])
|
614 |
+
# features may have been truncated when downsampleing, so alignment may need extended
|
615 |
+
extend_alignments(course_alignments, stack[depth]['size0'], stack[depth]['size1']) # in-place
|
616 |
+
else: # We did a full size 1-1 search, so search same size with more alignment types
|
617 |
+
course_alignments = stack[0]['alignments']
|
618 |
+
|
619 |
+
# convert couse alignments to a searchpath
|
620 |
+
stack[depth]['searchpath'] = alignment_to_search_path(course_alignments)
|
621 |
+
|
622 |
+
# compute ccosts for sparse DP
|
623 |
+
t0 = time()
|
624 |
+
stack[depth]['a_b_costs'], stack[depth]['b_offset'] = make_sparse_costs(stack[depth]['v0'], stack[depth]['v1'],
|
625 |
+
stack[depth]['n0'], stack[depth]['n1'],
|
626 |
+
stack[depth]['searchpath'],
|
627 |
+
stack[depth]['alignment_types'],
|
628 |
+
width_over2)
|
629 |
+
|
630 |
+
tt = time() - t0
|
631 |
+
num_dot_products = len(stack[depth]['b_offset']) * len(stack[depth]['alignment_types']) * width_over2 * 2
|
632 |
+
logger.debug('%d x %d sparse DP (%d alignment types, %d window) make features: %.6fs (%.3e per dot product)',
|
633 |
+
stack[max_depth]['size0'], stack[max_depth]['size1'],
|
634 |
+
len(stack[depth]['alignment_types']), width_over2 * 2,
|
635 |
+
tt, tt / (num_dot_products + 1e6))
|
636 |
+
|
637 |
+
compute_costs_times.append(time() - t0)
|
638 |
+
t0 = time()
|
639 |
+
# perform sparse DP
|
640 |
+
stack[depth]['a_b_csum'], stack[depth]['a_b_xp'], stack[depth]['a_b_yp'], \
|
641 |
+
stack[depth]['new_b_offset'] = sparse_dp(stack[depth]['a_b_costs'], stack[depth]['b_offset'],
|
642 |
+
stack[depth]['alignment_types'], stack[depth]['del_penalty'],
|
643 |
+
stack[depth]['size0'], stack[depth]['size1'])
|
644 |
+
|
645 |
+
# performace traceback to get alignments and alignment scores
|
646 |
+
# for debugging, avoid overwriting stack[depth]['alignments']
|
647 |
+
akey = 'final_alignments' if depth == 0 else 'alignments'
|
648 |
+
stack[depth][akey], stack[depth]['alignment_scores'] = sparse_traceback(stack[depth]['a_b_csum'],
|
649 |
+
stack[depth]['a_b_xp'],
|
650 |
+
stack[depth]['a_b_yp'],
|
651 |
+
stack[depth]['new_b_offset'],
|
652 |
+
stack[depth]['size0'],
|
653 |
+
stack[depth]['size1'])
|
654 |
+
dp_times.append(time() - t0)
|
655 |
+
|
656 |
+
runtimes['Upsample DP compute costs'] = sum(compute_costs_times[:-1])
|
657 |
+
runtimes['Upsample DP'] = sum(dp_times[:-1])
|
658 |
+
|
659 |
+
runtimes['Final DP compute costs'] = compute_costs_times[-1]
|
660 |
+
runtimes['Final DP'] = dp_times[-1]
|
661 |
+
|
662 |
+
# log time stats
|
663 |
+
max_key_str_len = max([len(key) for key in runtimes])
|
664 |
+
for key in runtimes:
|
665 |
+
if runtimes[key] > 5e-5:
|
666 |
+
logger.info(key + ' took ' + '.' * (max_key_str_len + 5 - len(key)) + ('%.4fs' % runtimes[key]).rjust(7))
|
667 |
+
|
668 |
+
return stack
|
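The search-path construction above is easiest to see on a toy input. The sketch below is illustrative only: it assumes dp_utils.py is importable from the working directory, and the coarse alignment and expected output are invented values rather than data from this repository.

from dp_utils import alignment_to_search_path

# a 1-1 link followed by a 2-1 link over 3 source and 2 target segments
coarse = [([0], [0]), ([1, 2], [1])]
path = alignment_to_search_path(coarse)
# should give a monotone path such as
#   [(0, 0), (1, 0), (1, 1), (2, 1), (2, 2), (3, 2)]
# i.e. every step moves exactly one position in x or in y,
# which is the shape the sparse DP band is built around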
tibetan-aligner/get_vectors.py
ADDED
@@ -0,0 +1,35 @@
import sys

from sentence_transformers import SentenceTransformer
import numpy as np

filename = sys.argv[1]
number_of_overlays = int(sys.argv[2]) + 1 # +1 because we want to include the original sentence

def process_file(filename):
    model_path = "buddhist-nlp/bod-eng-similarity"
    model = SentenceTransformer(model_path)

    model.max_seq_length = 500
    file = open(filename,'r')

    sentences = [line.rstrip('\n').strip() for line in file]
    sentences_overlay = []

    for x in range(len(sentences)):
        val = number_of_overlays
        if (len(sentences) - x) < val:
            val = (len(sentences) - x) + 1
        for i in range(1,val):
            sentences_overlay.append(' '.join(sentences[x:x+i]))
    overlay_string = "\n".join(sentences_overlay)
    vectors = np.array(model.encode(sentences_overlay,show_progress_bar=False))
    print("LEN SENTENCES",len(sentences_overlay))
    print("LEN VECTORS",len(vectors))
    with open(sys.argv[1] + "_overlay", "w") as text_file:
        text_file.write(overlay_string)

    np.save(sys.argv[1] + "_vectors",vectors)

process_file(filename)
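get_vectors.py produces the "_overlay" text file and the matching "_vectors" matrix that the aligner consumes: for each sentence it also encodes the concatenation of up to N consecutive sentences, so that 1-many and many-1 links can be scored later. A minimal sketch of just the expansion step (the function name build_overlays is hypothetical; the loop mirrors the script above):

def build_overlays(sentences, num_overlays):
    # same logic as get_vectors.py, where number_of_overlays = int(sys.argv[2]) + 1
    number_of_overlays = num_overlays + 1
    overlays = []
    for x in range(len(sentences)):
        val = number_of_overlays
        if (len(sentences) - x) < val:
            val = (len(sentences) - x) + 1
        for i in range(1, val):
            overlays.append(" ".join(sentences[x:x + i]))
    return overlays

# build_overlays(["sent A", "sent B", "sent C"], 2)
# -> ['sent A', 'sent A sent B', 'sent B', 'sent B sent C', 'sent C']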
tibetan-aligner/ladder
ADDED
@@ -0,0 +1,11 @@
LINE EMBEDDINGS SHAPE (15, 768)
LINE EMBEDDINGS SHAPE (87, 768)
[0]:[0]:0.264225
[1, 2]:[1]:0.354184
[]:[2]:0.000000
[]:[3]:0.000000
[]:[4]:0.000000
[]:[5]:0.000000
[3]:[6, 7, 8, 9, 10]:0.404515
[]:[11]:0.000000
[4]:[12, 13, 14, 15, 16]:0.280724
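The ladder file above is a sample of the aligner's raw output: the first two lines log the shapes of the source and target overlap embeddings, and every following line has the form [source segment indices]:[target segment indices]:score, with an empty list marking an insertion or deletion. A minimal parsing sketch (ladder2org.py below does the same job with extra cleanup):

import re

def parse_ladder_line(line):
    """Turn a line like '[1, 2]:[1]:0.354184' into ([1, 2], [1], 0.354184)."""
    src, tgt, score = line.strip().split(":")
    src_ids = [int(n) for n in re.findall(r"\d+", src)]
    tgt_ids = [int(n) for n in re.findall(r"\d+", tgt)]
    return src_ids, tgt_ids, float(score)

# parse_ladder_line("[3]:[6, 7, 8, 9, 10]:0.404515")
# -> ([3], [6, 7, 8, 9, 10], 0.404515)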
tibetan-aligner/ladder2org.py
ADDED
@@ -0,0 +1,47 @@
import sys
import re
import re
f1 = open(sys.argv[1],'r')
f2 = open(sys.argv[2],'r')
ladder_file = open(sys.argv[3],'r')


output = ""
ladder = []
sktfile = [line.rstrip('\n').strip() for line in f1]
tibfile = [line.rstrip('\n').strip() for line in f2]
last_score = 0.5

def clean_num(string):
    string = re.sub("[^0-9, ]","",string)
    return int(string.split(',')[0])


for line in ladder_file:
    if len(line.split(':')) == 3:
        skt,tib,score = line.split(':')
        if re.search("[0-9]",skt) and re.search("[0-9]",tib):
            skt_num = clean_num(skt)
            tib_num = clean_num(tib)
            score = float(score)
            if score > 0.0:
                ladder.append([skt_num,tib_num,score])
last_skt = 0
last_tib = 0
for entry in ladder:
    output = output + ' +$+ '.join(sktfile[last_skt:entry[0]]) + "\n"
    output = output + "# " + ' +!+ '.join(tibfile[last_tib:entry[1]]) + "\n" #+ "\t" + " SCORE: " + str(entry[2]) + "\n"
    last_skt = entry[0]
    last_tib = entry[1]
output = output + ' / '.join(sktfile[last_skt:-1]) + "\n"
output = output + "# " + ' / '.join(tibfile[last_tib:-1]) + "\n"

short_f1 = re.sub("\.tsv.*","",sys.argv[1])
short_f2 = re.sub(".*/","",sys.argv[2])
short_f2 = re.sub("\.tsv.*","",short_f2)

with open(short_f1 + "_" + short_f2 + ".org", 'w') as file:
    file.write(output)
tibetan-aligner/model_to_hub.py
ADDED
@@ -0,0 +1,7 @@
from transformers import AutoModelForSequenceClassification, AutoTokenizer
model_path = "model"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# push model and tokenizer to huggingface hub
model.push_to_hub("buddhist-nlp/bod-eng-similarity")
tokenizer.push_to_hub("buddhist-nlp/bod-eng-similarity")
tibetan-aligner/requirements.txt
ADDED
@@ -0,0 +1,3 @@
sentence-transformers==2.2.2
pyewts==0.2.0
Cython==0.29.34
tibetan-aligner/score.py
ADDED
@@ -0,0 +1,170 @@
#!/usr/bin/env python3

"""
Copyright 2019 Brian Thompson

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

"""

import argparse
import sys
from collections import defaultdict

import numpy as np

from dp_utils import read_alignments

"""
Faster implementation of lax and strict precision and recall, based on
    https://www.aclweb.org/anthology/W11-4624/.

"""


def _precision(goldalign, testalign):
    """
    Computes tpstrict, fpstrict, tplax, fplax for gold/test alignments
    """
    tpstrict = 0  # true positive strict counter
    tplax = 0     # true positive lax counter
    fpstrict = 0  # false positive strict counter
    fplax = 0     # false positive lax counter

    # convert to sets, remove alignments empty on both sides
    testalign = set([(tuple(x), tuple(y)) for x, y in testalign if len(x) or len(y)])
    goldalign = set([(tuple(x), tuple(y)) for x, y in goldalign if len(x) or len(y)])

    # mappings from source test sentence idxs to
    #    target gold sentence idxs for which the source test sentence
    #    was found in corresponding source gold alignment
    src_id_to_gold_tgt_ids = defaultdict(set)
    for gold_src, gold_tgt in goldalign:
        for gold_src_id in gold_src:
            for gold_tgt_id in gold_tgt:
                src_id_to_gold_tgt_ids[gold_src_id].add(gold_tgt_id)

    for (test_src, test_target) in testalign:
        if (test_src, test_target) == ((), ()):
            continue
        if (test_src, test_target) in goldalign:
            # strict match
            tpstrict += 1
            tplax += 1
        else:
            # For anything with partial gold/test overlap on the source,
            #   see if there is also partial overlap on the gold/test target
            #   If so, its a lax match
            target_ids = set()
            for src_test_id in test_src:
                for tgt_id in src_id_to_gold_tgt_ids[src_test_id]:
                    target_ids.add(tgt_id)
            if set(test_target).intersection(target_ids):
                fpstrict += 1
                tplax += 1
            else:
                fpstrict += 1
                fplax += 1

    return np.array([tpstrict, fpstrict, tplax, fplax], dtype=np.int32)


def score_multiple(gold_list, test_list, value_for_div_by_0=0.0):
    # accumulate counts for all gold/test files
    pcounts = np.array([0, 0, 0, 0], dtype=np.int32)
    rcounts = np.array([0, 0, 0, 0], dtype=np.int32)
    for goldalign, testalign in zip(gold_list, test_list):
        pcounts += _precision(goldalign=goldalign, testalign=testalign)
        # recall is precision with no insertion/deletion and swap args
        test_no_del = [(x, y) for x, y in testalign if len(x) and len(y)]
        gold_no_del = [(x, y) for x, y in goldalign if len(x) and len(y)]
        rcounts += _precision(goldalign=test_no_del, testalign=gold_no_del)

    # Compute results
    # pcounts: tpstrict,fnstrict,tplax,fnlax
    # rcounts: tpstrict,fpstrict,tplax,fplax

    if pcounts[0] + pcounts[1] == 0:
        pstrict = value_for_div_by_0
    else:
        pstrict = pcounts[0] / float(pcounts[0] + pcounts[1])

    if pcounts[2] + pcounts[3] == 0:
        plax = value_for_div_by_0
    else:
        plax = pcounts[2] / float(pcounts[2] + pcounts[3])

    if rcounts[0] + rcounts[1] == 0:
        rstrict = value_for_div_by_0
    else:
        rstrict = rcounts[0] / float(rcounts[0] + rcounts[1])

    if rcounts[2] + rcounts[3] == 0:
        rlax = value_for_div_by_0
    else:
        rlax = rcounts[2] / float(rcounts[2] + rcounts[3])

    if (pstrict + rstrict) == 0:
        fstrict = value_for_div_by_0
    else:
        fstrict = 2 * (pstrict * rstrict) / (pstrict + rstrict)

    if (plax + rlax) == 0:
        flax = value_for_div_by_0
    else:
        flax = 2 * (plax * rlax) / (plax + rlax)

    result = dict(recall_strict=rstrict,
                  recall_lax=rlax,
                  precision_strict=pstrict,
                  precision_lax=plax,
                  f1_strict=fstrict,
                  f1_lax=flax)

    return result


def log_final_scores(res):
    print(' ---------------------------------', file=sys.stderr)
    print('|             |  Strict |    Lax  |', file=sys.stderr)
    print('| Precision   |   {precision_strict:.3f} |   {precision_lax:.3f} |'.format(**res), file=sys.stderr)
    print('| Recall      |   {recall_strict:.3f} |   {recall_lax:.3f} |'.format(**res), file=sys.stderr)
    print('| F1          |   {f1_strict:.3f} |   {f1_lax:.3f} |'.format(**res), file=sys.stderr)
    print(' ---------------------------------', file=sys.stderr)


def main():
    parser = argparse.ArgumentParser(
        'Compute strict/lax precision and recall for one or more pairs of gold/test alignments',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-t', '--test', type=str, nargs='+', required=True,
                        help='one or more test alignment files')

    parser.add_argument('-g', '--gold', type=str, nargs='+', required=True,
                        help='one or more gold alignment files')

    args = parser.parse_args()

    if len(args.test) != len(args.gold):
        raise Exception('number of gold/test files must be the same')

    gold_list = [read_alignments(x) for x in args.gold]
    test_list = [read_alignments(x) for x in args.test]

    res = score_multiple(gold_list=gold_list, test_list=test_list)
    log_final_scores(res)


if __name__ == '__main__':
    main()
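score.py computes the strict/lax precision, recall and F1 described in its header: strict scoring only credits exact gold/test matches, while lax scoring also credits partial overlaps. A small illustrative call (the toy gold/test alignments below are invented for this example):

from score import score_multiple, log_final_scores

gold = [([0], [0]), ([1, 2], [1])]          # gold merges source segments 1 and 2
test = [([0], [0]), ([1], [1]), ([2], [])]  # system splits them instead
res = score_multiple(gold_list=[gold], test_list=[test])
log_final_scores(res)
# the split links overlap the gold 2-1 link, so lax precision
# should come out higher than strict precision (roughly 2/3 vs 1/3 here)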
tibetan-aligner/vecalign.py
ADDED
@@ -0,0 +1,148 @@
#!/usr/bin/env python3

"""
Copyright 2019 Brian Thompson

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import argparse
import logging
import pickle
from math import ceil
from random import seed as seed

import numpy as np

logger = logging.getLogger('vecalign')
logger.setLevel(logging.WARNING)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-5.5s %(message)s")
consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(logFormatter)
logger.addHandler(consoleHandler)

from dp_utils import make_alignment_types, print_alignments, read_alignments, \
    read_in_embeddings, make_doc_embedding, vecalign

from score import score_multiple, log_final_scores


def _main():
    # make runs consistent
    seed(42)
    np.random.seed(42)

    parser = argparse.ArgumentParser('Sentence alignment using sentence embeddings and FastDTW',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-s', '--src', type=str, nargs='+', required=True,
                        help='preprocessed source file to align')

    parser.add_argument('-t', '--tgt', type=str, nargs='+', required=True,
                        help='preprocessed target file to align')

    parser.add_argument('-g', '--gold_alignment', type=str, nargs='+', required=False,
                        help='preprocessed target file to align')

    parser.add_argument('--src_embed', type=str, nargs=2, required=True,
                        help='Source embeddings. Requires two arguments: first is a text file, sencond is a binary embeddings file. ')

    parser.add_argument('--tgt_embed', type=str, nargs=2, required=True,
                        help='Target embeddings. Requires two arguments: first is a text file, sencond is a binary embeddings file. ')

    parser.add_argument('-a', '--alignment_max_size', type=int, default=4,
                        help='Searches for alignments up to size N-M, where N+M <= this value. Note that the the embeddings must support the requested number of overlaps')

    parser.add_argument('-d', '--del_percentile_frac', type=float, default=0.2,
                        help='Deletion penalty is set to this percentile (as a fraction) of the cost matrix distribution. Should be between 0 and 1.')

    parser.add_argument('-v', '--verbose', help='sets consle to logging.DEBUG instead of logging.WARN',
                        action='store_true')

    parser.add_argument('--max_size_full_dp', type=int, default=300,  # org: 300
                        help='Maximum size N for which is is acceptable to run full N^2 dynamic programming.')

    parser.add_argument('--costs_sample_size', type=int, default=20000,
                        help='Sample size to estimate costs distribution, used to set deletion penalty in conjunction with deletion_percentile.')

    parser.add_argument('--num_samps_for_norm', type=int, default=100,  # org 100
                        help='Number of samples used for normalizing embeddings')

    parser.add_argument('--search_buffer_size', type=int, default=5,
                        help='Width (one side) of search buffer. Larger values makes search more likely to recover from errors but increases runtime.')

    parser.add_argument('--debug_save_stack', type=str,
                        help='Write stack to pickle file for debug purposes')

    args = parser.parse_args()

    if len(args.src) != len(args.tgt):
        raise Exception('number of source files must match number of target files')

    if args.gold_alignment is not None:
        if len(args.gold_alignment) != len(args.src):
            raise Exception('number of gold alignment files, if provided, must match number of source and target files')

    if args.verbose:
        import logging
        logger.setLevel(logging.INFO)

    if args.alignment_max_size < 2:
        logger.warning('Alignment_max_size < 2. Increasing to 2 so that 1-1 alignments will be considered')
        args.alignment_max_size = 2

    src_sent2line, src_line_embeddings = read_in_embeddings(args.src_embed[0], args.src_embed[1])
    tgt_sent2line, tgt_line_embeddings = read_in_embeddings(args.tgt_embed[0], args.tgt_embed[1])

    width_over2 = ceil(args.alignment_max_size / 2.0) + args.search_buffer_size

    test_alignments = []
    stack_list = []
    for src_file, tgt_file in zip(args.src, args.tgt):
        logger.info('Aligning src="%s" to tgt="%s"', src_file, tgt_file)

        src_lines = open(src_file, 'rt', encoding="utf-8").readlines()
        vecs0 = make_doc_embedding(src_sent2line, src_line_embeddings, src_lines, args.alignment_max_size)

        tgt_lines = open(tgt_file, 'rt', encoding="utf-8").readlines()
        vecs1 = make_doc_embedding(tgt_sent2line, tgt_line_embeddings, tgt_lines, args.alignment_max_size)

        final_alignment_types = make_alignment_types(args.alignment_max_size)
        logger.debug('Considering alignment types %s', final_alignment_types)

        stack = vecalign(vecs0=vecs0,
                         vecs1=vecs1,
                         final_alignment_types=final_alignment_types,
                         del_percentile_frac=args.del_percentile_frac,
                         width_over2=width_over2,
                         max_size_full_dp=args.max_size_full_dp,
                         costs_sample_size=args.costs_sample_size,
                         num_samps_for_norm=args.num_samps_for_norm)

        # write final alignments to stdout
        print_alignments(stack[0]['final_alignments'], stack[0]['alignment_scores'])

        test_alignments.append(stack[0]['final_alignments'])
        stack_list.append(stack)

    if args.gold_alignment is not None:
        gold_list = [read_alignments(x) for x in args.gold_alignment]
        res = score_multiple(gold_list=gold_list, test_list=test_alignments)
        log_final_scores(res)

    if args.debug_save_stack:
        pickle.dump(stack_list, open(args.debug_save_stack, 'wb'))


if __name__ == '__main__':
    _main()
tm.py
ADDED
@@ -0,0 +1,169 @@
import json
import logging
import os
import subprocess
import sys
import tempfile
import time
from pathlib import Path
from typing import Dict

import requests

GITHUB_USERNAME = os.getenv("GITHUB_USERNAME")
GITHUB_ACCESS_TOKEN = os.getenv("GITHUB_TOKEN")
GITHUB_EMAIL = os.getenv("GITHUB_EMAIL")
GITHUB_ORG = os.getenv("MAI_GITHUB_ORG")
MAI_TM_PUBLISH_TODO_REPO = os.environ["MAI_TMS_PUBLISH_TODO_REPO"]
GITHUB_API_ENDPOINT = f"https://api.github.com/orgs/{GITHUB_ORG}/repos"

DEBUG = os.getenv("DEBUG", False)

quiet = "-q" if DEBUG else ""


def create_github_repo(repo_path: Path, repo_name: str):
    logging.info("[INFO] Creating GitHub repo...")

    # configure git users
    subprocess.run(f"git config --global user.name {GITHUB_USERNAME}".split())
    subprocess.run(f"git config --global user.email {GITHUB_EMAIL}".split())

    # Initialize a Git repository
    subprocess.run(f"git init {quiet}".split(), cwd=str(repo_path))

    # Commit the changes
    subprocess.run("git add . ".split(), cwd=str(repo_path))
    subprocess.run(
        f"git commit {quiet} -m".split() + ["Initial commit"], cwd=str(repo_path)
    )

    # Create a new repository on GitHub
    response = requests.post(
        GITHUB_API_ENDPOINT,
        json={
            "name": repo_name,
            "private": True,
        },
        auth=(GITHUB_USERNAME, GITHUB_ACCESS_TOKEN),
    )
    response.raise_for_status()

    time.sleep(3)

    # Add the GitHub remote to the local Git repository and push the changes
    remote_url = f"https://{GITHUB_ORG}:{GITHUB_ACCESS_TOKEN}@github.com/{GITHUB_ORG}/{repo_name}.git"
    subprocess.run(
        f"git remote add origin {remote_url}", cwd=str(repo_path), shell=True
    )
    # rename default branch to main
    subprocess.run("git branch -M main".split(), cwd=str(repo_path))
    subprocess.run(f"git push {quiet} -u origin main".split(), cwd=str(repo_path))

    return response.json()["html_url"]


def convert_raw_align_to_tm(align_fn: Path, tm_path: Path):
    if DEBUG:
        logging.debug("[INFO] Conerting raw alignment to TM repo...")

    def load_alignment(fn: Path):
        content = fn.read_text()
        if not content:
            return []

        for seg_pair in content.splitlines():
            if not seg_pair:
                continue

            if "\t" in seg_pair:
                try:
                    bo_seg, en_seg = seg_pair.split("\t", 1)
                except Exception as e:
                    logging.error(f"{e} in {fn}")
                    raise

            else:
                bo_seg = seg_pair
                en_seg = "\n"
            yield bo_seg, en_seg

    text_bo_fn = tm_path / f"{tm_path.name}-bo.txt"
    text_en_fn = tm_path / f"{tm_path.name}-en.txt"

    with open(text_bo_fn, "w", encoding="utf-8") as bo_file, open(
        text_en_fn, "w", encoding="utf-8"
    ) as en_file:
        for bo_seg, en_seg in load_alignment(align_fn):
            bo_file.write(bo_seg + "\n")
            en_file.write(en_seg + "\n")

    return tm_path


def get_github_dev_url(raw_github_url: str) -> str:
    base_url = "https://github.dev"
    _, file_path = raw_github_url.split(".com")
    blob_file_path = file_path.replace("main", "blob/main")
    return base_url + blob_file_path


def add_input_in_readme(input_dict: Dict[str, str], path: Path) -> Path:
    input_readme_fn = path / "README.md"
    text_id = input_dict["text_id"]
    bo_file_url = get_github_dev_url(input_dict["bo_file_url"])
    en_file_url = get_github_dev_url(input_dict["en_file_url"])
    input_string = "## Input\n- [BO{}]({})\n- [EN{}]({})".format(
        text_id, bo_file_url, text_id, en_file_url
    )

    input_readme_fn.write_text(input_string)

    return path

def add_to_publish_todo_repo(org, repo_name, file_path, access_token):
    base_url = f"https://api.github.com/repos/{org}/{repo_name}/contents/"

    headers = {
        "Authorization": f"Bearer {access_token}",
        "Accept": "application/vnd.github.v3+json",
    }

    url = base_url + file_path

    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        print(f"[INFO] '{file_path}' already added.")
        return

    payload = {"message": f"Add {file_path}", "content": ""}

    response = requests.put(url, headers=headers, json=payload)

    if response.status_code == 201:
        print(f"[INFO] '{file_path}' added to publish todo")
    else:
        print(f"[ERROR] Failed to add '{file_path}'.")
        print(f"[ERROR] Response: {response.text}")


def create_tm(align_fn: Path, text_pair: Dict[str, str]):
    align_fn = Path(align_fn)
    text_id = text_pair["text_id"]
    with tempfile.TemporaryDirectory() as tmp_dir:
        output_dir = Path(tmp_dir)
        repo_name = f"TM{text_id}"
        tm_path = output_dir / repo_name
        tm_path.mkdir(exist_ok=True, parents=True)
        repo_path = convert_raw_align_to_tm(align_fn, tm_path)
        repo_path = add_input_in_readme(text_pair, tm_path)
        repo_url = create_github_repo(repo_path, repo_name)
        logging.info(f"TM repo created: {repo_url}")
        add_to_publish_todo_repo(GITHUB_ORG, MAI_TM_PUBLISH_TODO_REPO, repo_name, GITHUB_ACCESS_TOKEN)
        return repo_url


if __name__ == "__main__":
    align_fn = Path(sys.argv[1])
    create_tm(align_fn)