import gradio as gr
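
# A hand-rolled DFA tokenizer for a small subset of Python source,
# served through a Gradio text-in / text-out interface.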

def tokenize(buffer: str):
    """Tokenize Python source with a hand-written DFA.

    Input symbols are the literal character, 'split' (one of the delimiter
    characters below), or 'any' (everything else). States 17 and 18 accept:
    17 also emits the delimiter as its own token, 18 leaves it unconsumed.
    """
    split = {'(', ')', '{', '}', '[', ']', ',', ':', '+', '-', '*', '/', '%', '=', '\n'}
    DFA_table = {
        -1: {'any': -1},  # reject state
        0: {' ': 1, 'any': 5, 'split': 17, 'f': 7, '"': 8, "'": 9},  # start of a token
        1: {' ': 2, 'f': 7, 'any': 5},  # one leading space
        2: {' ': 3, 'f': 7, 'any': 5},  # two leading spaces
        3: {' ': 4, 'f': 7, 'any': 5},  # three leading spaces
        4: {'any': 18},  # four spaces: emit a 4-space indent token
        5: {' ': 6, 'any': 5, 'split': 17},  # inside an ordinary token
        6: {' ': 6, 'any': 18, 'split': 17},  # trailing spaces after a token
        7: {'any': 5, '"': 8, "'": 9},  # 'f' seen: possible f-string prefix
        8: {'"': 16, 'any': 8},  # inside a double-quoted string
        9: {"'": 11, 'any': 10},  # opening single quote seen
        10: {"'": 16, 'any': 10},  # inside a single-quoted string
        11: {' ': 16, "'": 12, 'any': -1, 'split': 17},  # '' seen: empty string or start of triple quote
        12: {"'": 13, 'any': 12},  # inside a triple-quoted string
        13: {"'": 14, 'any': -1},
        14: {"'": 15, 'any': -1},
        15: {' ': 16, 'split': 17, 'any': -1},  # triple-quoted string closed
        16: {' ': 16, 'any': -1, 'split': 17, '"': 18, "'": 18},  # string literal closed
        17: {'any': -1},  # final: the delimiter is consumed and emitted as its own token
        18: {'any': -1},  # final: the delimiter is left for the next scan
    }
    finals = (17, 18)
    tokens = []
    cursor = 0
    while cursor < len(buffer):
        state = 0
        temp = ''
        # Run the DFA from the start state until it accepts or input ends.
        while cursor < len(buffer):
            ch = buffer[cursor]
            if ch in split:
                ch = 'split'
            if ch not in DFA_table[state]:
                ch = 'any'
            state = DFA_table[state][ch]
            if state not in finals:
                temp += buffer[cursor]
            else:
                break
            cursor += 1

        # At end of input, a few non-final states still mark a complete token:
        # 5/6 (an ordinary token, possibly with trailing spaces) and
        # 15/16 (a closed string literal).
        if state not in finals and state not in (5, 6, 15, 16):
            raise RuntimeError(f"Rejected at state {state}")
        if temp != '':
            # Preserve a bare 4-space indent token; strip everything else.
            tokens.append(temp if temp == '    ' else temp.strip())
        if state == finals[0]:  # state 17: emit the delimiter itself
            tokens.append(buffer[cursor])
            cursor += 1
    return tokens
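
# A quick sanity check, hand-traced through the DFA above (this is also
# the interface's default input):
#   tokenize("print('Hello World!!')")
#   -> ['print', '(', "'Hello World!!'", ')']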


interface = gr.Interface(
    fn=tokenize,
    title="Tokenizer",
    description="Tokenize Python code with a hand-written DFA.",
    theme="compact",
    inputs=gr.TextArea(label="Python code", value="print('Hello World!!')"),
    outputs=gr.TextArea(label="Tokenized output"),
)

interface.launch()  # serves locally; launch(share=True) would expose a temporary public link