Spaces:
Sleeping
Sleeping
from math import log | |
class BPE(object): | |
def __init__(self, vocab_file): | |
with open(vocab_file, encoding="utf8") as f: | |
self.words = [l.split()[0] for l in f] | |
log_len = log(len(self.words)) | |
self.wordcost = { | |
k: log((i+1) * log_len) | |
for i, k in enumerate(self.words)} | |
self.maxword = max(len(x) for x in self.words) | |
def encode(self, s): | |
"""Uses dynamic programming to infer the location of spaces in a string | |
without spaces.""" | |
s = s.replace(" ", "▁") | |
# Find the best match for the i first characters, assuming cost has | |
# been built for the i-1 first characters. | |
# Returns a pair (match_cost, match_length). | |
def best_match(i): | |
candidates = enumerate(reversed(cost[max(0, i - self.maxword):i])) | |
return min( | |
(c + self.wordcost.get(s[i-k-1:i], 9e999), k+1) | |
for k, c in candidates) | |
# Build the cost array. | |
cost = [0] | |
for i in range(1, len(s) + 1): | |
c, k = best_match(i) | |
cost.append(c) | |
# Backtrack to recover the minimal-cost string. | |
out = [] | |
i = len(s) | |
while i > 0: | |
c, k = best_match(i) | |
assert c == cost[i] | |
out.append(s[i-k:i]) | |
i -= k | |
return " ".join(reversed(out)) | |
if __name__ == "__main__": | |
bpe = BPE("en.wiki.bpe.op25000.vocab") | |
print(bpe.encode(' this is our house in boomchakalaka')) | |
# >>> ▁this ▁is ▁our ▁house ▁in ▁boom ch ak al aka |