|
|
|
import difflib |
|
import webbrowser |
|
from transformers import AutoTokenizer |
|
from data_sample.oov_base import space_tokens, jd_vocab_tokens, docs |
|
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained("tokenizer") |
|
|
|
def test_oov(output_path: str = "diff.html") -> None:
    """Round-trip every sample line through the tokenizer and report OOV loss.

    Each line from the sample data is encoded and decoded again; lines that
    do not survive the round trip unchanged are collected and rendered as a
    side-by-side HTML diff, which is written to *output_path* and opened in
    the default browser.

    Args:
        output_path: Destination for the generated HTML diff report.
            Defaults to ``diff.html`` in the current working directory.

    NOTE(review): the ``test_`` prefix means pytest will collect this and
    open a browser during a test run — confirm that is intended.
    """
    differ = difflib.HtmlDiff(wrapcolumn=50)

    raw_lines = []
    decoded_lines = []
    for line in space_tokens + jd_vocab_tokens + docs:
        # NOTE(review): encode() may add special tokens depending on the
        # tokenizer config, which would make every line differ — confirm.
        decoded = tokenizer.decode(tokenizer.encode(line))
        if line != decoded:
            raw_lines.append(line)
            decoded_lines.append(decoded)

    report = differ.make_file(raw_lines, decoded_lines)
    with open(output_path, 'w', encoding="utf-8") as f_new:
        f_new.write(report)
    webbrowser.open(output_path)
|
# Script entry point: run the round-trip check when executed directly.
if __name__ == "__main__":

    test_oov()