File size: 755 Bytes
751936e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27

import difflib
import webbrowser
from transformers import AutoTokenizer
from data_sample.oov_base import space_tokens, jd_vocab_tokens, docs


# Tokenizer under test, loaded once at import time from the "tokenizer"
# name/path (presumably a local directory next to this script — confirm).
tokenizer = AutoTokenizer.from_pretrained("tokenizer")

def test_oov():
    """Write an HTML diff of the samples that fail an encode/decode round trip.

    Every string in ``space_tokens + jd_vocab_tokens + docs`` is encoded with
    the module-level ``tokenizer`` and decoded again. Strings whose decoded
    form differs from the input are collected, rendered side by side with
    ``difflib.HtmlDiff`` into ``diff.html`` in the current working directory,
    and the report is then opened in the default web browser.
    """
    # Function-scope import so the block stays self-contained.
    from pathlib import Path

    d = difflib.HtmlDiff(wrapcolumn=50)

    raw_lines = []
    decode_lines = []
    for line in space_tokens + jd_vocab_tokens + docs:
        # NOTE(review): encode() is called with its defaults; if the loaded
        # tokenizer adds special tokens by default, they would show up in the
        # decoded text and every sample would be reported — confirm.
        tokens = tokenizer.encode(line)
        decode_line = tokenizer.decode(tokens)
        if line != decode_line:
            raw_lines.append(line)
            decode_lines.append(decode_line)

    q = d.make_file(raw_lines, decode_lines)
    with open('diff.html', 'w', encoding="utf-8") as f_new:
        f_new.write(q)

    # webbrowser.open() expects a URL; handing it a bare relative filename is
    # not portable. Resolve the report (after it exists) to an absolute path
    # and open it through its file:// URI instead.
    webbrowser.open(Path('diff.html').resolve().as_uri())
# Script entry point: run the round-trip check when executed directly.
if __name__ == "__main__":
    test_oov()