File size: 2,131 Bytes
9cd37dc 718d914 9cd37dc b1b0223 9cd37dc 86239a7 9cd37dc a632f65 9cd37dc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
import gradio as gr
def tokenize(buffer: str):
    """Scan *buffer* (Python-like source text) into a flat list of tokens.

    A hand-written DFA drives the scan: state 0 is the start state, states
    17/18 are accepting (17 additionally consumes the delimiter character as
    its own token), and state -1 is the dead state.  Quoted strings (single,
    double, triple-single, and f-prefixed) are kept as one token including
    their quotes.

    Raises RuntimeError if the automaton halts in a non-accepting state
    (e.g. an unterminated double-quoted string at end of input).
    """
    # One-character delimiters; each is emitted as a standalone token.
    split = {'(', ')', '{', '}', '[', ']', ',', ':', '+', '-', '*', '/', '%', '=', '\n'}
    # Transition table: state -> {input class: next state}.  Input classes are
    # a literal character, 'split' (any delimiter above), or 'any' (fallback).
    DFA_table = {
        -1: {'any': -1},
        0: {' ': 1, 'any': 5, 'split': 17, 'f': 7, '"': 8, "'": 9},
        1: {' ': 2, 'f': 7, 'any': 5},
        2: {' ': 3, 'f': 7, 'any': 5},
        3: {' ': 4, 'f': 7, 'any': 5},
        4: {'any': 18},
        5: {' ': 6, 'any': 5, 'split': 17},
        6: {' ': 6, 'any': 18, 'split': 17},
        7: {'any': 5, '"': 8, "'": 9},
        8: {'"': 16, 'any': 8},
        9: {"'": 11, 'any': 10},
        10: {"'": 16, 'any': 10},
        11: {' ': 16, "'": 12, 'any': -1, 'split': 17},
        12: {"'": 13, 'any': 12},
        13: {"'": 14, 'any': -1},
        14: {"'": 15, 'any': -1},
        15: {' ': 16, 'split': 17, 'any': -1},
        16: {' ': 16, 'any': -1, 'split': 17, '"': 18, "'": 18},
        17: {'any': -1},  # accept; the delimiter itself becomes a token
        18: {'any': -1},  # accept; the current character is NOT consumed
    }
    finals = (17, 18)

    tokens = []
    pos = 0
    end = len(buffer)
    while pos < end:
        # Restart the automaton for the next token.
        state = 0
        lexeme = ''
        while pos < end:
            symbol = buffer[pos]
            if symbol in split:
                symbol = 'split'
            row = DFA_table[state]
            state = row[symbol] if symbol in row else row['any']
            if state in finals:
                break  # accepting state: stop without consuming this char here
            lexeme += buffer[pos]
            pos += 1
        # End of input inside state 5 (a plain identifier) is fine; any other
        # non-accepting halt means the input was rejected.
        if state not in finals and state != 5:
            raise RuntimeError(f"Rejected at state {state}")
        if lexeme:
            # A lone-space lexeme is kept verbatim; everything else is trimmed.
            tokens.append(lexeme if lexeme == ' ' else lexeme.strip())
        if state == finals[0]:
            # State 17: the delimiter that ended the token is a token too.
            tokens.append(buffer[pos])
            pos += 1
    return tokens
# Wire the tokenizer into a minimal Gradio web UI: a code text area in,
# a text area echoing the resulting token list out.
interface = gr.Interface(
    fn=tokenize,
    inputs=gr.TextArea(label="Python code", value="print('Hello World!!')"),
    outputs=gr.TextArea(label="Tokenize output"),
    title="Tokenizer",
    description="Tokenize the python code",
    theme="compact",
)
interface.launch()