import gradio as gr
import regex as re
from tqdm import tqdm
import pickle
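
# A minimal byte-pair-encoding (BPE) tokenizer for Urdu text, served
# through a small Gradio demo. Token ids 0-255 are raw bytes; every id
# from 256 upward is a learned merge of two earlier tokens.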
class Tokenizer:
    def __init__(self):
        # Base vocabulary: one token per byte value (ids 0-255).
        self.vocab = {idx: bytes([idx]) for idx in range(256)}
        # GPT-4-style split pattern (needs the `regex` module for \p{...}
        # classes and possessive quantifiers); merges never cross chunks.
        self.pattern = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
        # Learned BPE merges: (token_a, token_b) -> new token id.
        self.merges = {}
    def merge(self, tokens, target, new_token):
        # Replace every occurrence of the adjacent pair `target` with `new_token`.
        new_tokens = []
        i = 0
        while i < len(tokens):
            if i + 1 < len(tokens) and (tokens[i], tokens[i + 1]) == target:
                new_tokens.append(new_token)
                i += 2
            else:
                new_tokens.append(tokens[i])
                i += 1
        return new_tokens
    def get_stats(self, idsList):
        # Count how often each adjacent token pair occurs.
        pairs = {}
        if not isinstance(idsList[0], list):
            idsList = [idsList]  # also accept a single flat token list
        for tokens in idsList:
            for pair in zip(tokens, tokens[1:]):
                pairs[pair] = pairs.get(pair, 0) + 1
        return pairs
    def get_max_pair(self, idsList):
        # Most frequent adjacent pair across all chunks.
        pairs = self.get_stats(idsList)
        return max(pairs, key=pairs.get)
    def get_min(self, idsList):
        # Pair with the lowest merge rank, i.e. the merge learned earliest.
        stats = self.get_stats(idsList)
        return min(stats, key=lambda p: self.merges.get(p, float("inf")))
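
    # Note the asymmetry: training greedily merges the *most frequent* pair
    # (get_max_pair), while encoding replays merges in the order they were
    # learned by always collapsing the *lowest-ranked* pair (get_min).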
    def train(self, epochs, text):
        pat = re.compile(self.pattern)
        textList = re.findall(pat, text)
        # One byte sequence per regex chunk, so merges stay within chunks.
        idsList = [list(chunk.encode('utf-8')) for chunk in textList]
        for epoch in tqdm(range(epochs)):
            max_pair = self.get_max_pair(idsList)
            new_token = 256 + epoch
            self.merges[max_pair] = new_token
            idsList = [self.merge(tokens, max_pair, new_token) for tokens in idsList]
            self.vocab[new_token] = self.vocab[max_pair[0]] + self.vocab[max_pair[1]]
        return [x for xs in idsList for x in xs]
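
    # Usage sketch (`urdu_corpus` is a hypothetical string, not defined here):
    #
    #   tok = Tokenizer()
    #   ids = tok.train(1000, urdu_corpus)  # vocab grows to 256 + 1000 entries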
    def encode(self, text):
        tokens = list(text.encode('utf-8'))
        while len(tokens) >= 2:
            pair = self.get_min(tokens)
            if pair not in self.merges:
                break  # no learned merge applies any more
            tokens = self.merge(tokens, pair, self.merges[pair])
        return tokens
    def decode(self, tokens):
        # Join the byte strings for each token; invalid UTF-8 becomes U+FFFD.
        data = b"".join(self.vocab[token] for token in tokens)
        return data.decode('utf-8', errors='replace')
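
# Sketch of how vocab.pkl and merges.pkl (loaded below) could have been
# produced; an assumed offline step, not part of this app:
#
#   tok = Tokenizer()
#   tok.train(1000, corpus_text)
#   with open('vocab.pkl', 'wb') as f:
#       pickle.dump(tok.vocab, f)
#   with open('merges.pkl', 'wb') as f:
#       pickle.dump(tok.merges, f)
#
# Round trip: decode(encode(s)) == s for any valid UTF-8 string s, since
# each merge id maps back to exactly the bytes it replaced.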
title = "Ghalib doing tiktok"
description = "A simple Gradio interface for an Urdu BPE tokenizer"
tokenizer = Tokenizer()
# `temp` stays untrained, so temp.encode() returns raw UTF-8 byte tokens;
# inference() uses it to compute the compression ratio.
temp = Tokenizer()
# vocab.pkl and merges.pkl are expected next to this script, pickled from a
# trained Tokenizer's .vocab and .merges dicts.
with open('vocab.pkl', 'rb') as f:
    tokenizer.vocab = pickle.load(f)
with open('merges.pkl', 'rb') as f:
    tokenizer.merges = pickle.load(f)
def inference(text):
    tokens = tokenizer.encode(text)
    # Compression ratio: raw UTF-8 bytes per merged token. Returned as the
    # second output so it matches the two output components declared below.
    ratio = len(temp.encode(text)) / len(tokens)
    return str(tokens), f"{ratio:.2f} bytes/token"
iface = gr.Interface(
    inference,
    inputs=["text"],
    outputs=["text", "text"],
    examples=["سفید رنگ ہیں آخر سیاہ مو کرتے لٹاتے دولت دنیا کو میکدے میں ہم طلائی ساغر مے نقرئی سبو کرتے ہمیشہ میں نے گریباں کو چاک چاک کیا",
              " دل کہ آتے ہیں جس کو دھیان بہت خود بھی آتا ہے اپنے دھیان میں کیاوہ ملے تو یہ پوچھنا ہے مجھےاب بھی ہوں میں تری امان میں کیا"],
    title=title,
    description=description,
)
iface.launch()