File size: 4,415 Bytes
52937eb
 
10fa35d
 
 
 
a385dbe
 
 
 
 
52937eb
66f6ae1
 
 
77ddca8
a969267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5d3ca7a
 
 
 
a969267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
---
license: mit
language:
- ce
- ru
- en
metrics:
- codeparrot/apps_metric
- accuracy
tags:
- code
---
The model uses only the sign **ӏ** (the small Cyrillic palochka letter) for explosive consonants!


The model was trained by following David Dale's instructions for the Erzya language (https://arxiv.org/abs/2209.09368) and using code from his repository. Comments written in Russian, where present, were left untouched.

```python
import torch
from transformers import BertTokenizer, AutoModel
import numpy as np
import pandas as pd
import razdel
import matplotlib.pyplot as plt
from tqdm.auto import tqdm, trange
```
Download the model from the Hugging Face repository:
```python
# Hub identifier of the checkpoint; the tokenizer and the model weights are
# both loaded from this same repository name.
model_name = 'NM-development/labse-en-ru-ce-prototype'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
```
Assign files with the texts you want to split into parallel sentences:
```python
# Placeholders: set these to the paths of the two text files before running
# this cell (open() fails while they are still None).
file_ru = None
file_nm = None


# Decode explicitly as UTF-8 so Cyrillic text is read the same way on every
# platform instead of depending on the locale's default encoding.
# NOTE(review): assumes the input files are UTF-8 — adjust if yours are not.
with open(file_nm, 'r', encoding='utf-8') as f1, open(file_ru, 'r', encoding='utf-8') as f2:
    nm_text = f1.read()
    ru_text = f2.read()
```
In the following section, define auxiliary functions for parallel sentence comparison:
```python
def embed(text):
    """Return the normalized pooled embedding of ``text`` as a NumPy vector.

    The text is tokenized with padding and truncation (at most 128 tokens),
    moved to the model's device and run through the model in inference mode.
    The pooler output is normalized with ``torch.nn.functional.normalize``
    and only the first row of the batch is returned.
    """
    batch = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt',
    ).to(model.device)
    with torch.inference_mode():
        output = model(**batch)
    pooled = torch.nn.functional.normalize(output.pooler_output)
    first_row = pooled[0]
    return first_row.cpu().numpy()

def get_top_mean_by_row(x, k=5):
    """Return, for every row of the 2-D array ``x``, the mean of its largest values.

    At most ``k`` values per row are averaged; when a row has fewer than
    ``k`` columns, all of its values are used.
    """
    _, n_cols = x.shape
    top = min(k, n_cols)
    # argpartition puts the indices of the `top` largest entries of each row
    # into the last `top` positions (in no particular order).
    largest_idx = np.argpartition(x, -top, axis=1)[:, -top:]
    largest_vals = np.take_along_axis(x, largest_idx, axis=1)
    return largest_vals.mean(axis=1)

def align3(sims):
    """Find a monotonic alignment between the rows and columns of ``sims``.

    Dynamic programming over the 2-D score matrix ``sims``: any number of
    rows or columns may be skipped, as long as the chosen pairs are strictly
    increasing in both indices. The selected pairs maximize the sum of their
    scores.

    Unlike a backtrace that stops at index 1, this version can also align
    the first row and the first column, and it never selects a border pair
    whose score is negative (skipping is worth 0).

    Args:
        sims: 2-D NumPy array of pair scores, ``sims[i, j]`` for row ``i``
            and column ``j``.

    Returns:
        A list of ``[i, j]`` pairs in increasing order; empty when nothing
        is worth aligning or when ``sims`` has no rows or columns.
    """
    n_rows, n_cols = sims.shape
    # Padded table: rewards[i + 1, j + 1] is the best total for the prefix
    # ending at cell (i, j). Row 0 and column 0 stay at zero and represent
    # "nothing aligned yet", which makes the borders behave like any other cell.
    rewards = np.zeros((n_rows + 1, n_cols + 1), dtype=sims.dtype)
    # 1: choose this pair, 2: decrease i, 3: decrease j
    choices = np.zeros((n_rows, n_cols), dtype=int)

    # The algorithm allows skipping any number of pairs, as long as the
    # alignment stays monotonic.
    for i in range(n_rows):
        for j in range(n_cols):
            # Option one: align the i-th row with the j-th column, on top of
            # the best alignment of everything before both of them.
            best = sims[i, j] + rewards[i, j]
            choice = 1
            # Strict comparisons keep the pair on ties, then prefer skipping
            # the row over skipping the column.
            if rewards[i, j + 1] > best:
                best = rewards[i, j + 1]
                choice = 2
            if rewards[i + 1, j] > best:
                best = rewards[i + 1, j]
                choice = 3
            rewards[i + 1, j + 1] = best
            choices[i, j] = choice

    # Walk back from the bottom-right corner; run down to index 0 inclusive
    # so that pairs on the first row/column are recovered too.
    alignment = []
    i = n_rows - 1
    j = n_cols - 1
    while i >= 0 and j >= 0:
        if choices[i, j] == 1:
            alignment.append([i, j])
            i -= 1
            j -= 1
        elif choices[i, j] == 2:
            i -= 1
        else:
            j -= 1
    return alignment[::-1]

def make_sents(text):
    """Split ``text`` into a flat list of cleaned, non-empty sentences.

    The text is cut into paragraphs on blank lines (``'\\n\\n'``), each
    paragraph is split with ``razdel.sentenize``, and in every sentence the
    remaining newlines are replaced by spaces before stripping whitespace.
    Sentences that end up empty are dropped.
    """
    sentences = []
    for paragraph in text.split('\n\n'):
        for piece in razdel.sentenize(paragraph):
            cleaned = piece.text.replace('\n', ' ').strip()
            if cleaned:
                sentences.append(cleaned)
    return sentences
```
First, split your texts into sentences:
```python
# Turn each raw text into a flat list of cleaned, non-empty sentences.
sents_nm = make_sents(nm_text)
sents_ru = make_sents(ru_text)
```
Then embed all the chunks:
```python
# Embed the sentences one at a time (tqdm shows progress) and stack the
# resulting vectors into one matrix per text, one row per sentence.
emb_ru = np.stack([embed(s) for s in tqdm(sents_ru)])
emb_nm = np.stack([embed(s) for s in tqdm(sents_nm)])
```
Now compare the sentences' semantic vectors and build a correlation heatmap:
```python
# Length penalty for every (ru, nm) sentence pair: the shorter length divided
# by the longer one (in characters), so pairs of very different length score lower.
pen = np.array([[min(len(x), len(y)) / max(len(x), len(y)) for x in sents_nm] for y in sents_ru])
# Dot products of the embeddings, clipped at zero and scaled by the penalty.
# The exponent 1 changes nothing; presumably a knob for sharpening the scores.
sims = np.maximum(0, np.dot(emb_ru, emb_nm.T)) ** 1 * pen

alpha = 0.2
penalty = 0.2
# Relative scores: subtract a share (alpha) of each row's and each column's
# mean top similarity, plus a constant penalty, so that only pairs which
# stand out against their competitors remain positive.
sims_rel = (sims.T - get_top_mean_by_row(sims) * alpha).T - get_top_mean_by_row(sims.T) * alpha - penalty

alignment = align3(sims_rel)

# Sum of raw similarities over the aligned pairs, divided by the sentence
# count of the shorter text.
print(sum(sims[i, j] for i, j in alignment) / min(sims.shape))
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.imshow(sims_rel)
plt.subplot(1, 2, 2)
# scatter() needs both coordinate lists; unpacking an empty alignment would
# pass no arguments at all and raise TypeError, so skip the plot in that case.
if alignment:
    plt.scatter(*zip(*alignment), s=5)
```
Finally, save the parallel corpus into a json file:
```python
# Each alignment entry is [ru_index, nm_index]; pick the matching sentences
# from both lists to build the two columns of the parallel corpus.
nm_ru_parallel_corpus = pd.DataFrame({'nm_text' : [sents_nm[x[1]] for x in alignment], 'ru_text' : [sents_ru[x[0]] for x in alignment]})
corpus_filename = 'nm_ru_corpus.json'
# force_ascii=False writes the Cyrillic characters as-is, so the file must be
# opened as UTF-8 explicitly rather than with the platform's default encoding.
with open(corpus_filename, 'w', encoding='utf-8') as f:
    nm_ru_parallel_corpus.to_json(f, force_ascii=False, indent=4)
```